From 44fb5f91161af5a0ca798d237271e15b3eb13fe3 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 5 Mar 2021 20:46:24 +0530 Subject: [PATCH 1/2] fix(v2/number): cast to int directly if no decimal point is found to avoid wrong values because of precision loss --- .../numeral/number/standard_number_detector.py | 9 +++++++-- ner_v2/detectors/numeral/utils.py | 1 + .../numeral/number/en/number_ner_tests.yaml | 16 +++++++++++++++- .../numeral/number/en/test_number_detection.py | 3 +++ 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 0c454f64e..0b971a834 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -3,6 +3,7 @@ import collections import os +import decimal import pandas as pd from six.moves import zip @@ -302,8 +303,12 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): scale = 1 if number: - number = float(number) * scale - number = int(number) if number.is_integer() else number + if '.' not in number: + number = int(number) * scale + else: + number = float(number) * scale + # FIXME: this conversion from float -> int is lossy, consider using Decimal class + number = int(number) if number.is_integer() else number unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, processed_text) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 573605b05..7a748d314 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -26,6 +26,7 @@ def get_number_from_number_word(text, number_word_dict): [In] >> _get_number_from_numerals('two hundred three four hundred three', number_word_dict) [Out] >> (['103', '403'], ['one hundred three', 'four hundred three']) """ + # FIXME: conversion from float -> int is lossy, consider using Decimal class detected_number_list = [] detected_original_text_list = [] diff --git a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml index 24cfa727c..89ba0d73d 100644 --- a/ner_v2/tests/numeral/number/en/number_ner_tests.yaml +++ b/ner_v2/tests/numeral/number/en/number_ner_tests.yaml @@ -175,4 +175,18 @@ tests: - original_text: "50000 rupees" output_id: 3 unit: rupees - value: "50000" \ No newline at end of file + value: "50000" + - id: en_14 + message: "here are some super large numbers 12345678901234567890 12345678 987654321012345678901 98765432101234567890123 " + unit_type: null + min_digit: 20 + max_digit: 22 + outputs: + - original_text: "12345678901234567890" + output_id: 1 + unit: null + value: "12345678901234567890" + - original_text: "987654321012345678901" + output_id: 2 + unit: null + value: "987654321012345678901" \ No newline at end of file diff --git a/ner_v2/tests/numeral/number/en/test_number_detection.py b/ner_v2/tests/numeral/number/en/test_number_detection.py index bcd834583..79508ae2c 100644 --- a/ner_v2/tests/numeral/number/en/test_number_detection.py +++ b/ner_v2/tests/numeral/number/en/test_number_detection.py @@ -194,6 +194,9 @@ def run_test(self): message = testcase["message"] unit_type = testcase.get("unit_type", None) number_detector_object = NumberDetector(entity_name="number", language=language, unit_type=unit_type) + number_detector_object.set_min_max_digits( + min_digit=testcase.get('min_digit', number_detector_object.min_digit), + max_digit=testcase.get('max_digit', number_detector_object.max_digit)) number_dicts, spans = number_detector_object.detect_entity(message) expected_number_dicts, expected_spans = parse_expected_outputs(testcase["outputs"]) From c12bf84a545763c3e6ae1e5d0a7e659602f0f10f Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 5 Mar 2021 20:51:21 +0530 Subject: [PATCH 2/2] style: remove unused decimal import --- ner_v2/detectors/numeral/number/standard_number_detector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 0b971a834..2df38a2f9 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -3,7 +3,6 @@ import collections import os -import decimal import pandas as pd from six.moves import zip