From 297eb9baea57e85e99d64ec2f0843ca863759122 Mon Sep 17 00:00:00 2001 From: harjinder7 Date: Tue, 8 Nov 2022 05:46:30 +0000 Subject: [PATCH] removed the logs and cleaned the code --- language_utilities/constant.py | 2 +- ner_v2/api.py | 5 +- ner_v2/detectors/base_detector.py | 12 ---- .../numeral/number/number_detection.py | 7 +-- .../number/standard_number_detector.py | 1 + .../numeral/number/zh-TW/number_detection.py | 30 ++++------ .../phone_number/phone_number_detection.py | 59 ++++++------------- 7 files changed, 34 insertions(+), 82 deletions(-) diff --git a/language_utilities/constant.py b/language_utilities/constant.py index e1fda55a..5e91a061 100644 --- a/language_utilities/constant.py +++ b/language_utilities/constant.py @@ -30,7 +30,7 @@ PORTUGUESE_LANG = 'pt' TURKISH_LANG = 'tr' -CHINESE_LANG = 'zh-TW' +CHINESE_TRADITIONAL_LANG = 'zh-TW' # language translation status TRANSLATED_TEXT = 'translated_text' diff --git a/ner_v2/api.py b/ner_v2/api.py index bfb0e0f7..f80485f5 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -10,7 +10,7 @@ from chatbot_ner.config import ner_logger from datastore.exceptions import DataStoreRequestException -from language_utilities.constant import ENGLISH_LANG +from language_utilities.constant import ENGLISH_LANG, CHINESE_TRADITIONAL_LANG from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ @@ -634,7 +634,7 @@ def phone_number(request): ner_logger.debug('Entity Name %s' % entity_name) ner_logger.debug('Source Language %s' % language) - if language == 'zh-TW': + if language == CHINESE_TRADITIONAL_LANG: phone_number_detection = ChinesePhoneDetector(entity_name=entity_name, language=language, locale=parameters_dict[PARAMETER_LOCALE]) else: @@ -644,7 +644,6 @@ def phone_number(request): ner_logger.debug(parameters_dict) if isinstance(message, six.string_types): - ner_logger.debug(f'++ API msg : {message}') entity_output = phone_number_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], diff --git a/ner_v2/detectors/base_detector.py b/ner_v2/detectors/base_detector.py index 7bd5ae8b..8fc9c6ba 100644 --- a/ner_v2/detectors/base_detector.py +++ b/ner_v2/detectors/base_detector.py @@ -10,7 +10,6 @@ from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) -from chatbot_ner.config import ner_logger class BaseDetector(object): """ @@ -32,9 +31,7 @@ def __init__(self, language=ENGLISH_LANG, translation_enabled=False): language (str): ISO 639 language code of language of original query translation_enabled (bool): Decides to either enable or disable translation API """ - ner_logger.debug(f'-= BASE : {language}') self._language = language - ner_logger.debug(f'-= PHONE : {self._language}') self._processing_language = ENGLISH_LANG self._translation_enabled = translation_enabled self._set_language_processing_script() @@ -59,7 +56,6 @@ def detect_entity(self, text, **kwargs): tuple: Two lists of same length containing detected values and original substring from text which is used to derive the detected value respectively """ - ner_logger.debug(f'>>> base detector detect entity') return [], [] def _set_language_processing_script(self): @@ -67,7 +63,6 @@ def _set_language_processing_script(self): This method is used to decide the language in which detector should run it's logic based on supported language and query language for which subclass is initialized """ - ner_logger.debug(f'-+-+ {self._language} , {self.supported_languages}') if self._language in self.supported_languages: self._processing_language = self._language elif ENGLISH_LANG in self.supported_languages and self._translation_enabled: @@ -135,8 +130,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] """ - - ner_logger.debug(f'==== M :{message}') if self._language != self._processing_language and self._translation_enabled: if structured_value: translation_output = translate_text(structured_value, self._language, @@ -148,23 +141,18 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa message = translation_output[TRANSLATED_TEXT] if translation_output['status'] else None text = structured_value if structured_value else message - ner_logger.debug(f'==== M :{message}') entity_list, original_text_list = self.detect_entity(text=text, **kwargs) if structured_value: - ner_logger.debug(f'structured ==== {entity_list}, {original_text_list}') if entity_list: value, method, original_text = entity_list, FROM_STRUCTURE_VALUE_VERIFIED, original_text_list else: value, method, original_text = [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED, \ [structured_value] elif entity_list: - ner_logger.debug(f'entity list ==== {entity_list}, {original_text_list}') value, method, original_text = entity_list, FROM_MESSAGE, original_text_list elif fallback_value: - ner_logger.debug(f'fallback value ==== {entity_list}, {original_text_list}') value, method, original_text = [fallback_value], FROM_FALLBACK_VALUE, [fallback_value] else: - ner_logger.debug(f'None ==== {entity_list}, {original_text_list}') return None return self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py index bc3b2eb7..ee97925d 100644 --- a/ner_v2/detectors/numeral/number/number_detection.py +++ b/ner_v2/detectors/numeral/number/number_detection.py @@ -23,8 +23,6 @@ from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT from ner_v2.detectors.utils import get_lang_data_path -from chatbot_ner.config import ner_logger - COMMON_NON_NUMERIC_PUNCTUATIONS = re.escape('!"#%&\'()*/;<=>?@[\\]^_`{|}~।') @@ -113,12 +111,10 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi self.detect_without_unit = detect_without_unit self.punctuations_to_filter = re.compile(f'[{COMMON_NON_NUMERIC_PUNCTUATIONS}]') try: - ner_logger.debug(f'MODEL LOADING FOR : {self.language}') number_detector_module = importlib.import_module( 'ner_v2.detectors.numeral.number.{0}.number_detection'.format(self.language)) self.language_number_detector = number_detector_module.NumberDetector(entity_name=self.entity_name, unit_type=self.unit_type) - ner_logger.debug(f'MODEL LOADED FOR : {self.language}') except ImportError: standard_number_regex = importlib.import_module( 'ner_v2.detectors.numeral.number.standard_number_detector' @@ -131,6 +127,9 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi ) def get_language_number_detector(self): + """ + To get the language number detector being used by current Number detector + """ return self.language_number_detector @property diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 4dae7f91..32fc1774 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -318,6 +318,7 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): start_span = 0 end_span = -1 spanned_text = self.processed_text + regex_numeric_patterns = re.compile(r'(([\d,]+\.?[\d]*)\s?(' + self.scale_map_choices + r'))[\s\-\:]' + r'|([\d,]+\.?[\d]*)', re.UNICODE) patterns = regex_numeric_patterns.findall(processed_text) diff --git a/ner_v2/detectors/numeral/number/zh-TW/number_detection.py b/ner_v2/detectors/numeral/number/zh-TW/number_detection.py index 4ab4a6a3..a1404833 100644 --- a/ner_v2/detectors/numeral/number/zh-TW/number_detection.py +++ b/ner_v2/detectors/numeral/number/zh-TW/number_detection.py @@ -15,18 +15,6 @@ NUMBER_DETECTION_RETURN_DICT_UNIT, NUMBER_DETECTION_RETURN_DICT_VALUE from ner_v2.detectors.numeral.number.standard_number_detector import BaseNumberDetector -from chatbot_ner.config import ner_logger - - -""" -mapping some special character for chinese (traditional) -use to replace in text string -""" -special_chars_mapping = { - ',' : '、', # comma character - '.' : '點' #dian ( period ) -} - class NumberDetector(BaseNumberDetector): """ @@ -51,7 +39,11 @@ def __init__(self, entity_name='number', unit_type=None): self.detector_preferences = [ self._detect_number_from_text ] - ner_logger.debug(f'-=-= CHINESE NUMBER DETECTOR') + + self.special_chars_mapping = { + ',': '、', # comma character + '.': '點' #dian ( period ) + } def _get_base_map_choices(self, base_map): number_set = set() @@ -86,7 +78,7 @@ def _have_digits_only(self, text=None, scale_map=None): def replace_special_chars(self, text=None): text = text or '' - for _char, _native_char in special_chars_mapping.items(): + for _char, _native_char in self.special_chars_mapping.items(): text = text.replace(_native_char, _char) return text @@ -109,13 +101,13 @@ def _detect_number_from_text(self, number_list=None, original_list=None): rgx_pattern = r'([{}]+)({}?([{}]*))'.format( self.base_numbers_map_full, - special_chars_mapping.get('.', '\.'), + self.special_chars_mapping.get('.', '\.'), self.base_numbers_map_full ) regex_digit_patterns = re.compile(rgx_pattern) patterns = regex_digit_patterns.findall(self.processed_text) for pattern in patterns: - full_number = number, after_decimal, original_text = None, None, None + full_number, number, original_text = None, None, None if pattern[0].strip(): original_text = pattern[0].strip() span = re.search(original_text, spanned_text).span() @@ -129,7 +121,6 @@ def _detect_number_from_text(self, number_list=None, original_list=None): if number.isnumeric(): full_number = number - if full_number: _pattern = re.compile(re.escape(original_text), flags=_re_flags) @@ -144,9 +135,8 @@ def _detect_number_from_text(self, number_list=None, original_list=None): return number_list, original_list def extract_digits_only(self, text, with_scale=False): - ner_logger.debug(f'++++ extracting') text = text or '' - rgx_pattern = r'[\s-.+{}]+' + rgx_pattern = r'[-,.+\s{}]+' if not with_scale: rgx_pattern = re.compile(rgx_pattern.format(self.base_numbers_map_choices)) else: @@ -158,5 +148,5 @@ def get_number_digit_by_digit(self, text=''): def get_number_with_digit_scaling(self, text=''): # change the below logic to work with scaling - return '' + return text \ No newline at end of file diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index ba8f90af..b26aa0c5 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -13,12 +13,10 @@ import phonenumbers from six.moves import zip -from language_utilities.constant import ENGLISH_LANG, CHINESE_LANG +from language_utilities.constant import ENGLISH_LANG, CHINESE_TRADITIONAL_LANG from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector -from chatbot_ner.config import ner_logger - class PhoneDetector(BaseDetector): """ @@ -40,10 +38,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): locale(str, optional): locale of the country from which you are dialing. Ex: 'en-IN' """ self._supported_languages = NumberDetector.get_supported_languages() - ner_logger.debug(f'-= PHONE : {language}') super(PhoneDetector, self).__init__(language, locale) self.language = language - ner_logger.debug(f'-= PHONE : {self.language}') self.locale = locale or 'en-IN' if _regex_available: # This will replace all types of dashes(em or en) by hyphen. @@ -105,7 +101,6 @@ def detect_entity(self, text, **kwargs): """ self.text = " " + text.lower().strip() + " " self.phone, self.original_phone_text = [], [] - ner_logger.debug(f'### PH :{self.text} {self.country_code}') for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): self.phone.append(self.check_for_country_code(str(match.number.national_number))) @@ -115,7 +110,6 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), "value": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - ner_logger.info(f'### {self.phone} {self.original_phone_text}') self.phone, self.original_phone_text = self.check_for_alphas() return self.phone, self.original_phone_text @@ -164,7 +158,7 @@ class ChinesePhoneDetector(PhoneDetector): This method is used to detect phone numbers present in chinese text. """ - def __init__(self, entity_name, language=CHINESE_LANG, locale=None): + def __init__(self, entity_name, language=CHINESE_TRADITIONAL_LANG, locale=None): """ Args: entity_name (str): A string by which the detected numbers would be replaced with @@ -173,20 +167,7 @@ def __init__(self, entity_name, language=CHINESE_LANG, locale=None): locale(str, optional): locale of the country from which you are dialing. Ex: 'en-IN' """ self._supported_languages = NumberDetector.get_supported_languages() - ner_logger.debug(f'-= CHINESE : {language}') super(ChinesePhoneDetector, self).__init__(entity_name, language, locale) - self.language = language - ner_logger.debug(f'-= CHINESE : {self.language}') - self.locale = locale or CHINESE_LANG - if _regex_available: - # This will replace all types of dashes(em or en) by hyphen. - self.locale = regex.sub('\\p{Pd}', '-', self.locale) - - self.text = '' - self.phone, self.original_phone_text = [], [] - self.country_code = self.get_country_code_from_locale() - self.entity_name = entity_name - self.tag = '__' + self.entity_name + '__' # Using Chinese number detector here self.number_detector = NumberDetector(self.entity_name, language=self.language) @@ -201,31 +182,25 @@ def _text_list_for_detection(self, text=None): return : list[string] """ text = text or '' - ner_logger.debug(f'<<< Sanitizeing text : {text}') matches = self.language_number_detector.extract_digits_only(text) return matches def detect_entity(self, text, **kwargs): - ner_logger.debug(f'<<< chinese phone number detect entity') - + """ + This is to detect phone numbers from text by mapping chinese digits to numeric values + """ number_matches = self._text_list_for_detection(text) self.phone, self.original_phone_text = [], [] - try: - for _text in number_matches: - original_text = " " + _text.lower().strip() + " " - sanitized_text = self.language_number_detector.get_number_digit_by_digit(original_text) - - ner_logger.debug(f'### PH : {sanitized_text} {self.country_code} {original_text}') - for match in phonenumbers.PhoneNumberMatcher(sanitized_text, self.country_code, leniency=0): - if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): - self.phone.append(self.check_for_country_code(str(match.number.national_number))) - self.original_phone_text.append(original_text[match.start:match.end]) - else: - # This means our detector has detected some other country code. - self.phone.append({"country_calling_code": str(match.number.country_code), - "value": str(match.number.national_number)}) - self.original_phone_text.append(original_text[match.start:match.end]) - except Exception as exp: - ner_logger.error(f'Exception in detect_entity for ChinesePhoneDetector, {str(exp)}') - ner_logger.debug(f'==== {self.phone}, {self.original_phone_text}') + for _text in number_matches: + original_text = " " + _text.lower().strip() + " " + sanitized_text = self.language_number_detector.get_number_digit_by_digit(original_text) + for match in phonenumbers.PhoneNumberMatcher(sanitized_text, self.country_code, leniency=0): + if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): + self.phone.append(self.check_for_country_code(str(match.number.national_number))) + self.original_phone_text.append(original_text[match.start:match.end]) + else: + # This means our detector has detected some other country code. + self.phone.append({"country_calling_code": str(match.number.country_code), + "value": str(match.number.national_number)}) + self.original_phone_text.append(original_text[match.start:match.end]) return self.phone, self.original_phone_text \ No newline at end of file