Skip to content

Commit

Permalink
Merge pull request #511 from hellohaptik/ML-3334/chinese_number_mappi…
Browse files Browse the repository at this point in the history
…ng_of_digits

Ml 3334/chinese number mapping of digits
  • Loading branch information
harjinder7 authored Nov 10, 2022
2 parents 36ba916 + f8eb9ad commit 348afc3
Show file tree
Hide file tree
Showing 11 changed files with 260 additions and 10 deletions.
2 changes: 2 additions & 0 deletions language_utilities/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,7 @@
PORTUGUESE_LANG = 'pt'
TURKISH_LANG = 'tr'

CHINESE_TRADITIONAL_LANG = 'zh-TW'

# language translation status
TRANSLATED_TEXT = 'translated_text'
12 changes: 8 additions & 4 deletions ner_v2/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@

from chatbot_ner.config import ner_logger
from datastore.exceptions import DataStoreRequestException
from language_utilities.constant import ENGLISH_LANG
from language_utilities.constant import ENGLISH_LANG, CHINESE_TRADITIONAL_LANG
from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \
PARAMETER_FALLBACK_VALUE, \
PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \
PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \
PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED
from ner_v2.detectors.numeral.number.number_detection import NumberDetector
from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector
from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector
from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector, ChinesePhoneDetector
from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector
from ner_v2.detectors.temporal.time.time_detection import TimeDetector
from ner_v2.detectors.textual.utils import get_text_entity_detection_data, validate_text_request, InvalidTextRequest
Expand Down Expand Up @@ -634,8 +634,12 @@ def phone_number(request):
ner_logger.debug('Entity Name %s' % entity_name)
ner_logger.debug('Source Language %s' % language)

phone_number_detection = PhoneDetector(entity_name=entity_name, language=language,
locale=parameters_dict[PARAMETER_LOCALE])
if language == CHINESE_TRADITIONAL_LANG:
phone_number_detection = ChinesePhoneDetector(entity_name=entity_name, language=language,
locale=parameters_dict[PARAMETER_LOCALE])
else:
phone_number_detection = PhoneDetector(entity_name=entity_name, language=language,
locale=parameters_dict[PARAMETER_LOCALE])
message = parameters_dict[PARAMETER_MESSAGE]

ner_logger.debug(parameters_dict)
Expand Down
2 changes: 0 additions & 2 deletions ner_v2/detectors/base_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD,
DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY)


class BaseDetector(object):
"""
This class is the base class from which will be inherited by individual detectors. It primarily contains the
Expand Down Expand Up @@ -143,7 +142,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa

text = structured_value if structured_value else message
entity_list, original_text_list = self.detect_entity(text=text, **kwargs)

if structured_value:
if entity_list:
value, method, original_text = entity_list, FROM_STRUCTURE_VALUE_VERIFIED, original_text_list
Expand Down
1 change: 1 addition & 0 deletions ner_v2/detectors/numeral/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
NUMBER_NUMERAL_CONSTANT_FILE_NAME = 'numerals_constant.csv'
NUMBER_NUMERAL_FILE_VARIANTS_COLUMN_NAME = 'name_variants'
NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME = 'number_value'
NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME = 'number'
NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME = 'number_type'

# type value of number in numeral_constant data file
Expand Down
9 changes: 7 additions & 2 deletions ner_v2/detectors/numeral/number/number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get_supported_languages():
cwd = os.path.dirname(os.path.abspath(__file__))
cwd_dirs = [x for x in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, x))]
for _dir in cwd_dirs:
if len(_dir.rstrip(os.sep)) == 2:
if len(_dir.rstrip(os.sep)) in [2, 5]:
supported_languages.append(_dir)
return supported_languages

Expand Down Expand Up @@ -115,7 +115,6 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi
'ner_v2.detectors.numeral.number.{0}.number_detection'.format(self.language))
self.language_number_detector = number_detector_module.NumberDetector(entity_name=self.entity_name,
unit_type=self.unit_type)

except ImportError:
standard_number_regex = importlib.import_module(
'ner_v2.detectors.numeral.number.standard_number_detector'
Expand All @@ -127,6 +126,12 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_wi
lang_code=self.language)
)

def get_language_number_detector(self):
    """
    Return the language specific number detector held in `self.language_number_detector`.

    Returns:
        object: the detector instance currently stored on this Number detector
    """
    language_detector = self.language_number_detector
    return language_detector

@property
def supported_languages(self):
    """Return the value stored in `self._supported_languages` (read-only view of that attribute)."""
    languages = self._supported_languages
    return languages
Expand Down
11 changes: 10 additions & 1 deletion ner_v2/detectors/numeral/number/standard_number_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME, NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME, NUMBER_TYPE_UNIT, \
NUMBER_NUMERAL_CONSTANT_FILE_NAME, NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_SPAN, \
NUMBER_DETECTION_RETURN_DICT_UNIT, NUMBER_UNITS_FILE_NAME, NUMBER_DATA_FILE_UNIT_VARIANTS_COLUMN_NAME, \
NUMBER_DATA_FILE_UNIT_VALUE_COLUMN_NAME, NUMBER_TYPE_SCALE, NUMBER_DATA_FILE_UNIT_TYPE_COLUMN_NAME
NUMBER_DATA_FILE_UNIT_VALUE_COLUMN_NAME, NUMBER_TYPE_SCALE, NUMBER_DATA_FILE_UNIT_TYPE_COLUMN_NAME, \
NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME
from ner_v2.detectors.numeral.utils import get_number_from_number_word, get_list_from_pipe_sep_string

NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment'])
Expand All @@ -46,6 +47,7 @@ def __init__(self, entity_name, data_directory_path, unit_type=None):
self.entity_name = entity_name
self.tag = '__' + entity_name + '__'

self.base_numbers_map = {}
self.numbers_word_map = {}
self.scale_map = {}
self.units_map = {}
Expand Down Expand Up @@ -122,13 +124,20 @@ def init_regex_and_parser(self, data_directory_path):
for numeral in name_variants:
# tuple values to corresponds to (scale, increment), for unit type, scale will always be 1.
self.numbers_word_map[numeral] = NumberVariant(scale=1, increment=value)
# map the name of number to latin numeric value
self.base_numbers_map[numeral] = value

elif number_type == NUMBER_TYPE_SCALE:
for numeral in name_variants:
# tuple values to corresponds to (scale, increment), for scale type, increment will always be 0.
self.numbers_word_map[numeral] = NumberVariant(scale=value, increment=0)
# Dict map to store scale and their values
self.scale_map[numeral] = value
# map the name of number to latin numeric value
self.base_numbers_map[numeral] = value

number_text = row[NUMBER_NUMERAL_FILE_NUMBER_COLUMN_NAME]
self.base_numbers_map[number_text] = value

# create units_dict having unit variants and their corresponding value
unit_file_path = os.path.join(data_directory_path, NUMBER_UNITS_FILE_NAME)
Expand Down
Empty file.
17 changes: 17 additions & 0 deletions ner_v2/detectors/numeral/number/zh-TW/data/numerals_constant.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
number,name_variants,number_value,number_type
零,零|líng,0,unit
〇,零|líng,0,unit
一,一|壹|yī|yāo,1,unit
二,二|貳|èr|liǎng,2,unit
三,三|叁|sān,3,unit
四,四|肆|sì,4,unit
五,五|伍|wǔ,5,unit
六,六|陸|liù,6,unit
七,七|柒|qī,7,unit
八,八|捌|bā,8,unit
九,九|玖|jiǔ,9,unit
十,十|拾|shí,10,scale
百,百|佰,100,scale
千,千|仟,1000,scale
萬,萬|萬,10000,scale
億,億|億,100000000,scale
2 changes: 2 additions & 0 deletions ner_v2/detectors/numeral/number/zh-TW/data/units.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
unit_type,unit_value,unit_variants
currency,dollar,Dollar | usd | डॉलर | $
152 changes: 152 additions & 0 deletions ner_v2/detectors/numeral/number/zh-TW/number_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
from __future__ import absolute_import
import os

try:
import regex as re

_re_flags = re.UNICODE | re.V1 | re.WORD
except ImportError:
import re

_re_flags = re.UNICODE

from ner_v2.constant import LANGUAGE_DATA_DIRECTORY
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_SPAN, \
NUMBER_DETECTION_RETURN_DICT_UNIT, NUMBER_DETECTION_RETURN_DICT_VALUE
from ner_v2.detectors.numeral.number.standard_number_detector import BaseNumberDetector


class NumberDetector(BaseNumberDetector):
    """
    Number detector to detect numbers in chinese text.

    It maps chinese digit characters to latin digits as per the numerals data file and extracts the
    integer values. Only numbers written digit by digit are converted; text containing scale
    characters (keys of `self.scale_map`) is left unconverted by `get_number_with_digit_scaling`
    and is therefore not reported.
    """

    data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)),
                                       LANGUAGE_DATA_DIRECTORY)

    def __init__(self, entity_name='number', unit_type=None):
        """
        Args:
            entity_name (str): name of the entity, forwarded to the base detector
            unit_type (str, optional): unit type, forwarded to the base detector
        """
        super(NumberDetector, self).__init__(entity_name=entity_name,
                                             data_directory_path=NumberDetector.data_directory_path,
                                             unit_type=unit_type)

        # Full map (digits and scales) and the regex choices built from it
        self.base_numbers_map_full = self.base_numbers_map.copy()
        self.base_numbers_map_choices_full = self._get_base_map_choices(self.base_numbers_map_full)

        # From here on `self.base_numbers_map` only holds the entries whose value is between 0 and 9
        self._filter_base_numbers_map()
        self.base_numbers_map_choices = self._get_base_map_choices(self.base_numbers_map)

        self.detector_preferences = [
            self._detect_number_from_text
        ]

        # latin character -> native character
        self.special_chars_mapping = {
            ',': '、',
            '.': '點',
            '+': '加'
        }

    def _get_base_map_choices(self, base_map):
        """
        Build a '|' separated string of the regex-escaped keys and values of `base_map`.

        Args:
            base_map (dict): mapping of number text to its numeric value

        Returns:
            str: sorted, de-duplicated and escaped keys and values joined by '|'
        """
        tokens = set()
        for key, val in base_map.items():
            tokens.add(str(key))
            tokens.add(str(val))
        return "|".join(re.escape(token) for token in sorted(tokens))

    def _filter_base_numbers_map(self):
        """
        Only require the chinese digits mapping for digit from 0 to 9
        """
        self.base_numbers_map = {
            name: value for name, value in self.base_numbers_map.items() if 0 <= value <= 9
        }

    def _have_digits_only(self, text=None, scale_map=None):
        """
        Tell whether `text` is free of scale characters.

        Args:
            text (str, optional): text to inspect, character by character
            scale_map (dict, optional): mapping whose keys are the scale texts

        Returns:
            bool: False if any single character of `text` is a key of `scale_map`, True otherwise
        """
        text = text or ''
        scale_map = scale_map or {}
        return not any(char in scale_map for char in text)

    def replace_special_chars(self, text=None):
        """
        Replace every native special character of `special_chars_mapping` with its latin counterpart.

        Args:
            text (str, optional): text to convert

        Returns:
            str: converted text ('' when `text` is empty or None)
        """
        text = text or ''
        for latin_char, native_char in self.special_chars_mapping.items():
            text = text.replace(native_char, latin_char)
        return text

    def _detect_number_from_text(self, number_list=None, original_list=None):
        """
        extract out the numbers from chinese text ( roman as well as chinese )

        Works on `self.processed_text`, from which hyphens are removed first.

        Args:
            number_list (list, optional): already detected number dicts, extended in place
            original_list (list, optional): already detected original texts, extended in place

        Returns:
            tuple: (number_list, original_list) where each number is a dict holding the integer value,
                a None unit and the (start, end) span, and each original text is the matched substring
        """
        number_list = number_list or []
        original_list = original_list or []
        start_span = 0
        # NOTE(review): spans are shifted by -1; presumably the caller pads the text with one leading
        # character and this maps spans back to the unpadded text - confirm against the caller.
        end_span = -1

        # removing hyphen
        self.processed_text = re.sub(r'[-]+', '', self.processed_text)

        spanned_text = self.processed_text
        processed_text = self.processed_text

        digit_choices = self.base_numbers_map_choices_full
        if not digit_choices:
            # nothing loaded from the data file, an empty character class would not even compile
            return number_list, original_list

        # need to handle decimal points as well
        # The character class is filled with the escaped choices string (same approach as in
        # `extract_digits_only`), not with the dict itself whose repr would add quotes, braces,
        # colons and spaces to the set of "digit" characters.
        rgx_pattern = r'([{}]+)({}?([{}]*))'.format(
            digit_choices,
            re.escape(self.special_chars_mapping.get('.', '.')),
            digit_choices
        )
        regex_digit_patterns = re.compile(rgx_pattern)
        for groups in regex_digit_patterns.findall(self.processed_text):
            original_text = groups[0].strip()
            if not original_text:
                continue

            # the matched text is a literal, it must be escaped before being searched as a pattern
            span = re.search(re.escape(original_text), spanned_text).span()
            start_span = end_span + span[0]
            end_span += span[1]
            spanned_text = spanned_text[span[1]:]

            number = self.get_number(original_text)
            # `isdecimal` and not `isnumeric`: chinese numerals are "numeric" for python but int()
            # only accepts decimal digits, so unconverted text would raise ValueError below
            if not number.isdecimal():
                continue

            _pattern = re.compile(re.escape(original_text), flags=_re_flags)
            if _pattern.search(processed_text):
                processed_text = _pattern.sub(self.tag, processed_text, 1)
                number_list.append({
                    NUMBER_DETECTION_RETURN_DICT_VALUE: int(number),
                    NUMBER_DETECTION_RETURN_DICT_UNIT: None,
                    NUMBER_DETECTION_RETURN_DICT_SPAN: (start_span, end_span)
                })
                original_list.append(original_text)
        return number_list, original_list

    def get_number(self, original_text):
        """
        Convert `original_text` to its latin form.

        Args:
            original_text (str): text matched as a number

        Returns:
            str: digit by digit conversion when the text has no scale character, otherwise the result
                of `get_number_with_digit_scaling`
        """
        if self._have_digits_only(original_text, self.scale_map):
            return self.get_number_digit_by_digit(original_text)
        return self.get_number_with_digit_scaling(original_text)

    def extract_digits_only(self, text, rgx_pattern=None, with_special_chars=False):
        """
        Find all the parts of `text` matching `rgx_pattern` once filled with the digit choices.

        Args:
            text (str): text to search in
            rgx_pattern (str, optional): pattern with one '{}' placeholder receiving the choices,
                defaults to r'[-,.+\\s{}]+'
            with_special_chars (bool, optional): also add the native special characters to the choices

        Returns:
            list: result of `findall` of the compiled pattern on `text`
        """
        text = text or ''
        rgx_pattern = rgx_pattern or r'[-,.+\s{}]+'
        choices = [self.base_numbers_map_choices]
        if with_special_chars:
            # joined with '|' like the other choices so that the last digit choice and the first
            # special character are not fused together
            choices.extend(re.escape(char) for char in self.special_chars_mapping.values())
        compiled_pattern = re.compile(rgx_pattern.format('|'.join(choices)))
        return compiled_pattern.findall(text)

    def get_number_digit_by_digit(self, text=''):
        """
        Replace each character of `text` found in `self.base_numbers_map` with its value.

        Args:
            text (str): text to convert

        Returns:
            str: converted text, characters missing from the map are kept as they are
        """
        return ''.join([str(self.base_numbers_map.get(char, char)) for char in text])

    def get_number_with_digit_scaling(self, text=''):
        """
        Placeholder for the conversion of numbers written with scale characters.

        Returns:
            str: `text` unchanged, scaling is not implemented yet
        """
        # change the below logic to work with scaling
        return text
62 changes: 61 additions & 1 deletion ner_v2/detectors/pattern/phone_number/phone_number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import phonenumbers
from six.moves import zip

from language_utilities.constant import ENGLISH_LANG
from language_utilities.constant import ENGLISH_LANG, CHINESE_TRADITIONAL_LANG
from ner_v2.detectors.base_detector import BaseDetector
from ner_v2.detectors.numeral.number.number_detection import NumberDetector

Expand Down Expand Up @@ -152,3 +152,63 @@ def check_for_country_code(self, phone_num):
phone_dict['value'] = phone_num

return phone_dict


class ChinesePhoneDetector(PhoneDetector):
    """
    This class is used to detect phone numbers present in chinese text.

    The text is converted with the language specific number detector (special characters and digits)
    and the converted text is then scanned with `phonenumbers.PhoneNumberMatcher`.
    """

    def __init__(self, entity_name, language=CHINESE_TRADITIONAL_LANG, locale=None):
        """
        Args:
            entity_name (str): A string by which the detected numbers would be replaced with
                on calling detect_entity()
            language (str, optional): language code of number text, defaults to CHINESE_TRADITIONAL_LANG
            locale(str, optional): locale of the country from which you are dialing. Ex: 'en-IN'
        """
        self._supported_languages = NumberDetector.get_supported_languages()
        super(ChinesePhoneDetector, self).__init__(entity_name, language, locale)

        # Using Chinese number detector here
        number_detector = NumberDetector(self.entity_name, language=self.language)
        self.number_detector = number_detector
        self.language_number_detector = number_detector.get_language_number_detector()

    def _text_list_for_detection(self, text=None):
        """
        Preprocess the text before detecting phone numbers.

        Args:
            text (str, optional): text to look into

        Returns:
            list[str]: parts of the text on which phone number detection needs to be made
        """
        return self.language_number_detector.extract_digits_only(text or '', r'[-(),.+\s{}]+', True)

    def _sanitize_text(self, text=None):
        """
        Convert the special characters, then the digits, of `text` with the language number detector.

        Args:
            text (str, optional): text to convert

        Returns:
            str: converted text
        """
        detector = self.language_number_detector
        return detector.get_number_digit_by_digit(detector.replace_special_chars(text or ''))

    def detect_entity(self, text, **kwargs):
        """
        This is to detect phone numbers from text by mapping chinese digits to numeric values

        Args:
            text (str): text to detect phone numbers from

        Returns:
            tuple: (list of phone dicts, list of the corresponding original texts)
        """
        candidates = self._text_list_for_detection(text)
        self.phone = []
        self.original_phone_text = []
        for candidate in candidates:
            padded_text = " " + candidate.lower().strip() + " "
            latin_text = self._sanitize_text(padded_text)
            matcher = phonenumbers.PhoneNumberMatcher(latin_text, self.country_code, leniency=0)
            for match in matcher:
                parsed_number = match.number
                if parsed_number.country_code == phonenumbers.country_code_for_region(self.country_code):
                    phone_value = self.check_for_country_code(str(parsed_number.national_number))
                else:
                    # This means our detector has detected some other country code.
                    phone_value = {"country_calling_code": str(parsed_number.country_code),
                                   "value": str(parsed_number.national_number)}
                self.phone.append(phone_value)
                # NOTE(review): match offsets come from the converted text but are applied to the
                # padded original; presumably the conversion keeps the text length - confirm.
                self.original_phone_text.append(padded_text[match.start:match.end])
        return self.phone, self.original_phone_text

0 comments on commit 348afc3

Please sign in to comment.