diff --git a/language_utilities/constant.py b/language_utilities/constant.py index 450893c5..5e91a061 100644 --- a/language_utilities/constant.py +++ b/language_utilities/constant.py @@ -30,5 +30,7 @@ PORTUGUESE_LANG = 'pt' TURKISH_LANG = 'tr' +CHINESE_TRADITIONAL_LANG = 'zh-TW' + # language translation status TRANSLATED_TEXT = 'translated_text' diff --git a/ner_v2/api.py b/ner_v2/api.py index ecfe154e..965852a6 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -10,7 +10,7 @@ from chatbot_ner.config import ner_logger from datastore.exceptions import DataStoreRequestException -from language_utilities.constant import ENGLISH_LANG +from language_utilities.constant import ENGLISH_LANG, CHINESE_TRADITIONAL_LANG from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ @@ -18,7 +18,7 @@ PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED from ner_v2.detectors.numeral.number.number_detection import NumberDetector from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector -from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector +from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector, ChinesePhoneDetector from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector from ner_v2.detectors.temporal.time.time_detection import TimeDetector from ner_v2.detectors.textual.utils import get_text_entity_detection_data, validate_text_request, InvalidTextRequest @@ -634,8 +634,12 @@ def phone_number(request): ner_logger.debug('Entity Name %s' % entity_name) ner_logger.debug('Source Language %s' % language) - phone_number_detection = PhoneDetector(entity_name=entity_name, language=language, - locale=parameters_dict[PARAMETER_LOCALE]) + if language == CHINESE_TRADITIONAL_LANG: + phone_number_detection = 
ChinesePhoneDetector(entity_name=entity_name, language=language, + locale=parameters_dict[PARAMETER_LOCALE]) + else: + phone_number_detection = PhoneDetector(entity_name=entity_name, language=language, + locale=parameters_dict[PARAMETER_LOCALE]) message = parameters_dict[PARAMETER_MESSAGE] ner_logger.debug(parameters_dict) diff --git a/ner_v2/detectors/base_detector.py b/ner_v2/detectors/base_detector.py index d9d0127c..8fc9c6ba 100644 --- a/ner_v2/detectors/base_detector.py +++ b/ner_v2/detectors/base_detector.py @@ -11,7 +11,6 @@ FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) - class BaseDetector(object): """ This class is the base class from which will be inherited by individual detectors. It primarily contains the @@ -143,7 +142,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa text = structured_value if structured_value else message entity_list, original_text_list = self.detect_entity(text=text, **kwargs) - if structured_value: if entity_list: value, method, original_text = entity_list, FROM_STRUCTURE_VALUE_VERIFIED, original_text_list diff --git a/ner_v2/detectors/numeral/constant.py b/ner_v2/detectors/numeral/constant.py index 036ae4cf..037c39f0 100644 --- a/ner_v2/detectors/numeral/constant.py +++ b/ner_v2/detectors/numeral/constant.py @@ -4,6 +4,7 @@ NUMBER_NUMERAL_CONSTANT_FILE_NAME = 'numerals_constant.csv' NUMBER_NUMERAL_FILE_VARIANTS_COLUMN_NAME = 'name_variants' NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME = 'number_value' +NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME = 'number' NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME = 'number_type' # type value of number in numeral_constant data file diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py index b1d8f232..ee97925d 100644 --- a/ner_v2/detectors/numeral/number/number_detection.py +++ b/ner_v2/detectors/numeral/number/number_detection.py @@ -81,7 +81,7 @@ def 
get_supported_languages(): cwd = os.path.dirname(os.path.abspath(__file__)) cwd_dirs = [x for x in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, x))] for _dir in cwd_dirs: - if len(_dir.rstrip(os.sep)) == 2: + if len(_dir.rstrip(os.sep)) in [2, 5]: supported_languages.append(_dir) return supported_languages @@ -115,7 +115,6 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi 'ner_v2.detectors.numeral.number.{0}.number_detection'.format(self.language)) self.language_number_detector = number_detector_module.NumberDetector(entity_name=self.entity_name, unit_type=self.unit_type) - except ImportError: standard_number_regex = importlib.import_module( 'ner_v2.detectors.numeral.number.standard_number_detector' @@ -127,6 +126,12 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi lang_code=self.language) ) + def get_language_number_detector(self): + """ + To get the language number detector being used by current Number detector + """ + return self.language_number_detector + @property def supported_languages(self): return self._supported_languages diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 0a5759da..32fc1774 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -19,7 +19,8 @@ NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME, NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME, NUMBER_TYPE_UNIT, \ NUMBER_NUMERAL_CONSTANT_FILE_NAME, NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_SPAN, \ NUMBER_DETECTION_RETURN_DICT_UNIT, NUMBER_UNITS_FILE_NAME, NUMBER_DATA_FILE_UNIT_VARIANTS_COLUMN_NAME, \ - NUMBER_DATA_FILE_UNIT_VALUE_COLUMN_NAME, NUMBER_TYPE_SCALE, NUMBER_DATA_FILE_UNIT_TYPE_COLUMN_NAME + NUMBER_DATA_FILE_UNIT_VALUE_COLUMN_NAME, NUMBER_TYPE_SCALE, NUMBER_DATA_FILE_UNIT_TYPE_COLUMN_NAME, \ + NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME 
from ner_v2.detectors.numeral.utils import get_number_from_number_word, get_list_from_pipe_sep_string NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) @@ -46,6 +47,7 @@ def __init__(self, entity_name, data_directory_path, unit_type=None): self.entity_name = entity_name self.tag = '__' + entity_name + '__' + self.base_numbers_map = {} self.numbers_word_map = {} self.scale_map = {} self.units_map = {} @@ -122,6 +124,8 @@ def init_regex_and_parser(self, data_directory_path): for numeral in name_variants: # tuple values to corresponds to (scale, increment), for unit type, scale will always be 1. self.numbers_word_map[numeral] = NumberVariant(scale=1, increment=value) + # map the name of number to latin numeric value + self.base_numbers_map[numeral] = value elif number_type == NUMBER_TYPE_SCALE: for numeral in name_variants: @@ -129,6 +133,11 @@ def init_regex_and_parser(self, data_directory_path): self.numbers_word_map[numeral] = NumberVariant(scale=value, increment=0) # Dict map to store scale and their values self.scale_map[numeral] = value + # map the name of number to latin numeric value + self.base_numbers_map[numeral] = value + + number_text = row[NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME] + self.base_numbers_map[number_text] = value # create units_dict having unit variants and their corresponding value unit_file_path = os.path.join(data_directory_path, NUMBER_UNITS_FILE_NAME) diff --git a/ner_v2/detectors/numeral/number/zh-TW/__init__.py b/ner_v2/detectors/numeral/number/zh-TW/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ner_v2/detectors/numeral/number/zh-TW/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/zh-TW/data/numerals_constant.csv new file mode 100644 index 00000000..4abbc1e9 --- /dev/null +++ b/ner_v2/detectors/numeral/number/zh-TW/data/numerals_constant.csv @@ -0,0 +1,17 @@ +number,name_variants,number_value,number_type +零,零|líng,0,unit +〇,零|líng,0,unit +一,一|壹|yī|yāo,1,unit 
+二,二|貳|èr|liǎng,2,unit +三,三|叁|sān,3,unit +四,四|肆|sì,4,unit +五,五|伍|wǔ,5,unit +六,六|陸|lìu,6,unit +七,七|柒|qī,7,unit +八,八|捌|bā,8,unit +九,九|玖|jiǔ,9,unit +十,十|拾|shí,10,scale +百,百|佰,100,scale +千,千|仟,1000,scale +萬,萬|萬,10000,scale +億,億|億,100000000,scale \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/zh-TW/data/units.csv b/ner_v2/detectors/numeral/number/zh-TW/data/units.csv new file mode 100644 index 00000000..bd2dc845 --- /dev/null +++ b/ner_v2/detectors/numeral/number/zh-TW/data/units.csv @@ -0,0 +1,2 @@ +unit_type,unit_value,unit_variants +currency,dollar,Dollar | usd | डॉलर | $ \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/zh-TW/number_detection.py b/ner_v2/detectors/numeral/number/zh-TW/number_detection.py new file mode 100644 index 00000000..18c53050 --- /dev/null +++ b/ner_v2/detectors/numeral/number/zh-TW/number_detection.py @@ -0,0 +1,152 @@ +from __future__ import absolute_import +import os + +try: + import regex as re + + _re_flags = re.UNICODE | re.V1 | re.WORD +except ImportError: + import re + + _re_flags = re.UNICODE + +from ner_v2.constant import LANGUAGE_DATA_DIRECTORY +from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_SPAN, \ + NUMBER_DETECTION_RETURN_DICT_UNIT, NUMBER_DETECTION_RETURN_DICT_VALUE +from ner_v2.detectors.numeral.number.standard_number_detector import BaseNumberDetector + + +class NumberDetector(BaseNumberDetector): + """ + Number detector to detect numbers in chinese text + It map the chinese to latin character as per data file and extract the numeric values + """ + + data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), + LANGUAGE_DATA_DIRECTORY) + + def __init__(self, entity_name='number', unit_type=None): + super(NumberDetector, self).__init__(entity_name=entity_name, + data_directory_path=NumberDetector.data_directory_path, + unit_type=unit_type) + + self.base_numbers_map_full = self.base_numbers_map.copy() + 
self.base_numbers_map_choices_full = self._get_base_map_choices(self.base_numbers_map_full) + + self._filter_base_numbers_map() + self.base_numbers_map_choices = self._get_base_map_choices(self.base_numbers_map) + + self.detector_preferences = [ + self._detect_number_from_text + ] + + self.special_chars_mapping = { + ',': '、', + '.': '點', + '+': '加' + } + + def _get_base_map_choices(self, base_map): + number_set = set() + for key, val in base_map.items(): + number_set.add(str(key)) + number_set.add(str(val)) + + sorted_len_base_number_key_vals = sorted(list(number_set)) + return "|".join([re.escape(x) for x in sorted_len_base_number_key_vals]) + + def _filter_base_numbers_map(self): + """ + Only require the chinese digits mapping for digit from 0 to 9 + """ + new_base_numbers_map = {} + for k, v in self.base_numbers_map.items(): + if 0 <= v <= 9: + new_base_numbers_map[k] = v + self.base_numbers_map = new_base_numbers_map + + def _have_digits_only(self, text=None, scale_map=None): + text = text or '' + scale_map = scale_map or {} + + scaling_digits = set(list(scale_map.keys())) + only_digits = True + for _digit in text: + if _digit in scaling_digits: + only_digits = False + break + return only_digits + + def replace_special_chars(self, text=None): + text = text or '' + for _char, _native_char in self.special_chars_mapping.items(): + text = text.replace(_native_char, _char) + return text + + def _detect_number_from_text(self, number_list=None, original_list=None): + """ + extract out the numbers from chinese text ( roman as well as chinese ) + """ + number_list = number_list or [] + original_list = original_list or [] + start_span = 0 + end_span = -1 + + # removing hyphen + self.processed_text = re.sub(r'[-]+', '', self.processed_text) + + spanned_text = self.processed_text + processed_text = self.processed_text + + # need to handle decimal points as well + rgx_pattern = r'([{}]+)({}?([{}]*))'.format( + self.base_numbers_map_full, + 
self.special_chars_mapping.get('.', '.'), + self.base_numbers_map_full + ) + regex_digit_patterns = re.compile(rgx_pattern) + patterns = regex_digit_patterns.findall(self.processed_text) + for pattern in patterns: + full_number, number, original_text = None, None, None + if pattern[0].strip(): + original_text = pattern[0].strip() + span = re.search(original_text, spanned_text).span() + start_span = end_span + span[0] + end_span += span[1] + spanned_text = spanned_text[span[1]:] + number = self.get_number(original_text) + if number.isnumeric(): + full_number = number + + if full_number: + _pattern = re.compile(re.escape(original_text), flags=_re_flags) + if _pattern.search(processed_text): + processed_text = _pattern.sub(self.tag, processed_text, 1) + number_list.append({ + NUMBER_DETECTION_RETURN_DICT_VALUE: int(full_number), + NUMBER_DETECTION_RETURN_DICT_UNIT: None, + NUMBER_DETECTION_RETURN_DICT_SPAN: (start_span, end_span) + }) + original_list.append(original_text) + return number_list, original_list + + def get_number(self, original_text): + if self._have_digits_only(original_text, self.scale_map): + return self.get_number_digit_by_digit(original_text) + return self.get_number_with_digit_scaling(original_text) + + def extract_digits_only(self, text, rgx_pattern=None, with_special_chars=False): + text = text or '' + rgx_pattern = rgx_pattern or r'[-,.+\s{}]+' + digit_choices = self.base_numbers_map_choices + if with_special_chars: + digit_choices += '|'.join(self.special_chars_mapping.values()) + rgx_pattern = re.compile(rgx_pattern.format(digit_choices)) + return rgx_pattern.findall(text) + + def get_number_digit_by_digit(self, text=''): + return ''.join([str(self.base_numbers_map.get(_t, _t)) for _t in text]) + + def get_number_with_digit_scaling(self, text=''): + # change the below logic to work with scaling + return text \ No newline at end of file diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py 
class ChinesePhoneDetector(PhoneDetector):
    """
    Phone number detector for chinese text.

    Chinese digits and special characters are first converted to their latin counterpart with
    the language specific number detector, phone numbers are then looked up in the converted
    text with the phonenumbers library.
    """

    def __init__(self, entity_name, language=CHINESE_TRADITIONAL_LANG, locale=None):
        """
        Args:
            entity_name (str): A string by which the detected numbers would be replaced with
                               on calling detect_entity()
            language (str, optional): language code of number text, defaults to
                                      CHINESE_TRADITIONAL_LANG
            locale(str, optional): locale of the country from which you are dialing. Ex: 'en-IN'
        """
        self._supported_languages = NumberDetector.get_supported_languages()
        super(ChinesePhoneDetector, self).__init__(entity_name=entity_name, language=language, locale=locale)

        # Number detector of the given language, its language specific detector does the
        # conversion of chinese digits
        self.number_detector = NumberDetector(self.entity_name, language=self.language)
        self.language_number_detector = self.number_detector.get_language_number_detector()

    def _text_list_for_detection(self, text=None):
        """
        Preprocess text before detecting phone numbers.

        Args:
            text (str, optional): text to look phone numbers in

        Returns:
            list: parts of the text made of digits and phone number formatting characters,
                  as extracted by the language number detector
        """
        candidate_regex = r'[-(),.+\s{}]+'
        return self.language_number_detector.extract_digits_only(text or '', candidate_regex, True)

    def _sanitize_text(self, text=None):
        """
        Convert native special characters and then chinese digits of text to latin ones.

        Args:
            text (str, optional): text to convert

        Returns:
            str: converted text
        """
        converter = self.language_number_detector
        return converter.get_number_digit_by_digit(converter.replace_special_chars(text or ''))

    def detect_entity(self, text, **kwargs):
        """
        This is to detect phone numbers from text by mapping chinese digits to numeric values

        Args:
            text (str): text to detect phone numbers from

        Returns:
            tuple: (list of detected phone numbers, list of their corresponding texts).
                   The lists are also stored in self.phone and self.original_phone_text
        """
        self.phone, self.original_phone_text = [], []
        for candidate in self._text_list_for_detection(text):
            padded_text = " " + candidate.lower().strip() + " "
            latin_text = self._sanitize_text(padded_text)
            # leniency 0 is the most permissive level of phonenumbers (Leniency.POSSIBLE)
            for found in phonenumbers.PhoneNumberMatcher(latin_text, self.country_code, leniency=0):
                national_number = str(found.number.national_number)
                if found.number.country_code == phonenumbers.country_code_for_region(self.country_code):
                    phone_value = self.check_for_country_code(national_number)
                else:
                    # This means our detector has detected some other country code.
                    phone_value = {"country_calling_code": str(found.number.country_code),
                                   "value": national_number}
                self.phone.append(phone_value)
                # match offsets are computed on the converted text and applied to the non
                # converted one - assumes the conversion keeps the length of the text
                self.original_phone_text.append(padded_text[found.start:found.end])
        return self.phone, self.original_phone_text