Skip to content

Commit

Permalink
handling brackets and + symbol in phone number
Browse files Browse the repository at this point in the history
  • Loading branch information
harjinder7 committed Nov 9, 2022
1 parent 414600d commit f8eb9ad
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
15 changes: 8 additions & 7 deletions ner_v2/detectors/numeral/number/zh-TW/number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def __init__(self, entity_name='number', unit_type=None):

self.special_chars_mapping = {
',': '、',
'.': '點'
'.': '點',
'+': '加'
}

def _get_base_map_choices(self, base_map):
Expand Down Expand Up @@ -134,13 +135,13 @@ def get_number(self, original_text):
return self.get_number_digit_by_digit(original_text)
return self.get_number_with_digit_scaling(original_text)

def extract_digits_only(self, text, rgx_pattern=None, with_special_chars=False):
    """
    Find every run of number-like characters in ``text``.

    Args:
        text (str): text to scan; ``None`` is treated as an empty string.
        rgx_pattern (str): optional regex template containing a single ``{}``
            placeholder, which is filled with the accepted digit characters.
            The placeholder is expected to sit inside a character class, as it
            does in the default ``[-,.+\\s{}]+``.
        with_special_chars (bool): when True, the values of
            ``self.special_chars_mapping`` are accepted in addition to
            ``self.base_numbers_map_choices``.

    Returns:
        list: result of ``findall`` for the filled-in pattern over ``text``.
    """
    text = text or ''
    rgx_pattern = rgx_pattern or r'[-,.+\s{}]+'
    digit_choices = self.base_numbers_map_choices
    if with_special_chars:
        # The choices land inside a character class, where '|' is an ordinary
        # character rather than alternation. Joining with '|' would make a
        # literal pipe in the input count as part of a number, so the values
        # are concatenated directly and escaped so that none of them can
        # close or alter the class.
        digit_choices += ''.join(
            re.escape(char) for char in self.special_chars_mapping.values()
        )
    compiled_pattern = re.compile(rgx_pattern.format(digit_choices))
    return compiled_pattern.findall(text)

def get_number_digit_by_digit(self, text=''):
Expand Down
11 changes: 9 additions & 2 deletions ner_v2/detectors/pattern/phone_number/phone_number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,16 @@ def _text_list_for_detection(self, text=None):
return : list[string]
"""
text = text or ''
matches = self.language_number_detector.extract_digits_only(text)
phone_number_format_regex = r'[-(),.+\s{}]+'
matches = self.language_number_detector.extract_digits_only(text, phone_number_format_regex, True)
return matches

def _sanitize_text(self, text=None):
text = text or ''
sanitized_text = self.language_number_detector.replace_special_chars(text)
sanitized_text = self.language_number_detector.get_number_digit_by_digit(sanitized_text)
return sanitized_text

def detect_entity(self, text, **kwargs):
"""
This is to detect phone numbers from text by mapping chinese digits to numeric values
Expand All @@ -194,7 +201,7 @@ def detect_entity(self, text, **kwargs):
self.phone, self.original_phone_text = [], []
for _text in number_matches:
original_text = " " + _text.lower().strip() + " "
sanitized_text = self.language_number_detector.get_number_digit_by_digit(original_text)
sanitized_text = self._sanitize_text(original_text)
for match in phonenumbers.PhoneNumberMatcher(sanitized_text, self.country_code, leniency=0):
if match.number.country_code == phonenumbers.country_code_for_region(self.country_code):
self.phone.append(self.check_for_country_code(str(match.number.national_number)))
Expand Down

0 comments on commit f8eb9ad

Please sign in to comment.