Value will be retrieved from ENVKEY or .env for dev
+ where each part is in one of the formats given against them
+ # to catch dates which are not possible like "31/11" (november 31st)
Subject: [PATCH 03/83] fix detector pref
past_date_reference self._detect_weekday_ref_month_2, self._detect_weekday_diff, self._detect_weekday, - self.custom_christmas_date_detector, - self._gregorian_day_month_year_format + self.custom_christmas_date_detector ] def custom_christmas_date_detector(self, date_list=None, original_list=None): diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index 4e19dbb0d..e85a5e1f7 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -60,7 +60,8 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC' self.init_regex_and_parser(data_directory_path) # Variable to define default order in which these regex will work - self.detector_preferences = [self._detect_relative_date, + self.detector_preferences = [self._gregorian_day_month_year_format, + self._detect_relative_date, self._detect_date_month, self._detect_date_ref_month_1, self._detect_date_ref_month_2, From 726565a17a72fd97533a879592b9b96d91d82764 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 31 Oct 2019 16:44:15 +0530 Subject: [PATCH 04/83] add bot_message in detect_date --- ner_v2/detectors/temporal/date/date_detection.py | 3 ++- ner_v2/detectors/temporal/date/en/date_detection.py | 4 +++- .../detectors/temporal/date/standard_date_regex.py | 13 ++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index d8c92f105..a2c4e111b 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -832,7 +832,8 @@ def detect_entity(self, text, **kwargs): self.processed_text = self.text self.tagged_text = self.text if self.language_date_detector: - self.date, self.original_date_text = self.language_date_detector.detect_date(self.processed_text) + self.date, 
self.original_date_text = self.language_date_detector.detect_date(self.processed_text, + self.bot_message) validated_date_list, validated_original_list = [], [] diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 5c1423fac..1adc9bd9b 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -149,7 +149,7 @@ def get_country_code_from_locale(self): else: return None - def detect_date(self, text): + def detect_date(self, text, bot_message=None): """ Detects exact date for complete date information - day, month, year are available in text and possible dates for if there are missing parts of date - day, month, year assuming sensible defaults. Also @@ -164,6 +164,8 @@ def detect_date(self, text): self.text = " " + text.strip().lower() + " " self.processed_text = self.text self.tagged_text = self.text + if bot_message: + self.set_bot_message(bot_message) if self.locale: self.country_code = self.get_country_code_from_locale() date_list = [] diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index e85a5e1f7..a2e90c981 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -74,10 +74,12 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC' self._detect_weekday ] - def detect_date(self, text): + def detect_date(self, text, bot_message=None): self.text = text self.processed_text = text self.tagged_text = text + if bot_message: + self.set_bot_message(bot_message) date_list, original_list = None, None for detector in self.detector_preferences: @@ -626,6 +628,15 @@ def normalize_year(self, year): return year + def set_bot_message(self, bot_message): + """ + Sets the object's bot_message attribute + + Args: + bot_message: is the previous message that is sent by the bot 
+ """ + self.bot_message = bot_message + def _update_processed_text(self, original_date_list): """ Replaces detected date with tag generated from entity_name used to initialize the object with From fdf0a2928b4537fa07fe1ee690d2a749e54e90ae Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 10:41:47 +0530 Subject: [PATCH 05/83] enhance normalize_year --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- ner_v2/detectors/temporal/date/standard_date_regex.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 1adc9bd9b..5fbf7f463 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -2001,7 +2001,7 @@ def normalize_year(self, year): this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if past_regex and past_regex.search(self.bot_message): + if past_regex and past_regex.search(self.bot_message) and int(year) > int(str(self.now_date.year)[2:]): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index a2e90c981..e5eaee0d9 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -615,7 +615,7 @@ def normalize_year(self, year): this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if past_regex and past_regex.search(self.bot_message): + if past_regex and past_regex.search(self.bot_message) and int(year) > int(str(self.now_date.year)[2:]): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year From bdbd742cb7e7829eb0a9ef9d75c4e54bf4276266 
Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 11:51:57 +0530 Subject: [PATCH 06/83] add , flags=re.UNICODE --- ner_v2/detectors/temporal/date/standard_date_regex.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index e5eaee0d9..aeda1d3ba 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -609,7 +609,8 @@ def normalize_year(self, year): Returns: str: year in four digits """ - past_regex = re.compile(r'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన') + past_regex = re.compile(r'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE) + # Todo: Add more language variations of birthday. present_regex = None future_regex = None this_century = int(str(self.now_date.year)[:2]) From 48c1252de636cba01f33396c12c7752fadc50c56 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 11:58:53 +0530 Subject: [PATCH 07/83] add ur in regex --- ner_v2/detectors/temporal/date/standard_date_regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index aeda1d3ba..07311fbf4 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -609,7 +609,7 @@ def normalize_year(self, year): Returns: str: year in four digits """ - past_regex = re.compile(r'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE) + past_regex = re.compile(ur'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE) # Todo: Add more language variations of birthday. 
present_regex = None future_regex = None From 0ef14ed2ea7c89722cc9608f85104e2e3cbc96e6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 15:16:43 +0530 Subject: [PATCH 08/83] add convert_numbers --- .../detectors/temporal/date/standard_date_regex.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index 07311fbf4..5959c89c9 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -572,7 +572,8 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): date_list = [] regex_pattern = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W') - patterns = regex_pattern.findall(self.processed_text.lower()) + translate_number = self.convert_numbers(self.processed_text.lower()) + patterns = regex_pattern.findall(translate_number) for pattern in patterns: original = pattern[0] dd = int(pattern[1]) @@ -597,13 +598,22 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): original_list.append(original) return date_list, original_list + @staticmethod + def convert_numbers(text): + result = text + digit = re.compile(r'(\d)', re.U) + groups = digit.findall(result) + for group in groups: + result = result.replace(group, str(int(group))) + return result + def normalize_year(self, year): """ Normalize two digit year to four digits by taking into consideration the bot message. Useful in cases like date of birth where past century is preferred than current. 
If no bot message is given it falls back to current century - Args: + Args:[{"key":"message","value":"१/३/६६","description":""}] year (str): Year string to normalize Returns: From be79225a7af114bd82ed41fc13ebc4499315149b Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 15:32:27 +0530 Subject: [PATCH 09/83] add convert_numbers --- ner_v2/detectors/temporal/date/standard_date_regex.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index 5959c89c9..e96559cb8 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -595,7 +595,11 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): } date_list.append(date) # original = self.regx_to_process.text_substitute(original) - original_list.append(original) + if translate_number != self.processed_text.lower(): + match = re.search(original, translate_number) + original_list.append(self.processed_text[(match.span()[0]):(match.span()[1])]) + else: + original_list.append(original) return date_list, original_list @staticmethod From 9fda2172069f779615f33d64e604378cbe40669f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:27:39 +0530 Subject: [PATCH 10/83] add test cases --- .../temporal/date/en/test_date_detection.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 93d95fa1c..2c1d5ac5b 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -244,4 +244,30 @@ def test_en_gregorian_year_day_month_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) + self.assertEqual(original_texts.count(message.lower()), 1) + + def 
test_hi_gregorian_dd_mm_yy_format(self): + """ + Date detection for pattern '१/३/६६' + """ + message = u'१/३/६६' + locale = 'hi-in' + # If we run + day1 = 1 + month = 3 + year1 = 1966 + bot_message = u'जन्मदिन' + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message, bot_message=bot_message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + self.assertEqual(original_texts.count(message.lower()), 1) \ No newline at end of file From 3b0f2e474066414facac7bbbced7959db50817eb Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:29:45 +0530 Subject: [PATCH 11/83] add test cases --- ner_v2/tests/temporal/date/en/test_date_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 2c1d5ac5b..cb118822a 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -1,3 +1,5 @@ +# coding=utf-8 + from __future__ import absolute_import import datetime From a7c59150c6c0422831b1ced8a2320a6e55da0486 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:31:34 +0530 Subject: [PATCH 12/83] add test cases --- ner_v2/tests/temporal/date/en/test_date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index cb118822a..26408e0a1 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -260,7 +260,7 @@ def test_hi_gregorian_dd_mm_yy_format(self): year1 = 1966 bot_message = u'जन्मदिन' - date_detector_object 
= DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) date_dicts, original_texts = date_detector_object.detect_entity(message, bot_message=bot_message) self.assertIn({ From b1e116f46b45500afa0774e8cb047e8f79defcc3 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:46:25 +0530 Subject: [PATCH 13/83] add test cases --- ner_v2/detectors/temporal/date/date_detection.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index a2c4e111b..7d4d25c8d 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -92,7 +92,7 @@ def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timez def supported_languages(self): return self._supported_languages - def detect_entity(self, text, run_model=False, **kwargs): + def detect_entity(self, text, run_model=False, bot_message=None, **kwargs): """ Detects all date strings in text and returns two lists of detected date entities and their corresponding original substrings in text respectively. @@ -132,10 +132,15 @@ def detect_entity(self, text, run_model=False, **kwargs): Additionally this function assigns these lists to self.date and self.original_date_text attributes respectively. 
+ :param text: text + :param run_model: run_model + :param bot_message: bot_message """ self.text = ' ' + text.lower() + ' ' self.processed_text = self.text self.tagged_text = self.text + if bot_message: + self.bot_message = bot_message date_data = [] if run_model: date_data = self._date_model_detection() From 4bec1501d59e7eff2dc0ce9e729df3c92f0bc7d1 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:54:31 +0530 Subject: [PATCH 14/83] add test cases --- ner_v2/detectors/temporal/date/date_detection.py | 6 ++++-- ner_v2/tests/temporal/date/en/test_date_detection.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 7d4d25c8d..6f12337ac 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -509,7 +509,7 @@ def _date_value(self, text): (['friday'], ['friday']) """ - date_list, original_list = self.date_detector_object.detect_entity(text) + date_list, original_list = self.date_detector_object.detect_entity(text, self.bot_message) return date_list, original_list def unzip_convert_date_dictionaries(self, entity_dict_list): @@ -808,7 +808,7 @@ def __init__(self, entity_name, locale=None, language=ENGLISH_LANG, timezone='UT locale=self.locale ) - def detect_entity(self, text, **kwargs): + def detect_entity(self, text, bot_message=None, **kwargs): """ Detects all date strings in text and returns two lists of detected date entities and their corresponding original substrings in text respectively. @@ -830,6 +830,8 @@ def detect_entity(self, text, **kwargs): Additionally this function assigns these lists to self.date and self.original_date_text attributes respectively. 
+ :param text: text + :param bot_message: bot message """ diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 26408e0a1..b999a1ccf 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -261,6 +261,7 @@ def test_hi_gregorian_dd_mm_yy_format(self): bot_message = u'जन्मदिन' date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) + date_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) date_dicts, original_texts = date_detector_object.detect_entity(message, bot_message=bot_message) self.assertIn({ From c9a2ae6275abe17ca94b6b7360038a3ee21915ec Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:55:26 +0530 Subject: [PATCH 15/83] add test cases --- ner_v2/tests/temporal/date/en/test_date_detection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index b999a1ccf..26408e0a1 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -261,7 +261,6 @@ def test_hi_gregorian_dd_mm_yy_format(self): bot_message = u'जन्मदिन' date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) - date_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) date_dicts, original_texts = date_detector_object.detect_entity(message, bot_message=bot_message) self.assertIn({ From 1844e3f028a10d53cae78c6ac30f6f25ed2aa057 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 16:57:11 +0530 Subject: [PATCH 16/83] add test cases --- ner_v2/detectors/temporal/date/date_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/temporal/date/date_detection.py 
b/ner_v2/detectors/temporal/date/date_detection.py index 6f12337ac..00b702c05 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -838,6 +838,8 @@ def detect_entity(self, text, bot_message=None, **kwargs): self.text = ' ' + text.strip().lower() + ' ' self.processed_text = self.text self.tagged_text = self.text + if bot_message: + self.bot_message = bot_message if self.language_date_detector: self.date, self.original_date_text = self.language_date_detector.detect_date(self.processed_text, self.bot_message) From 1f764cfb3e3e4299a13ccf8525fe1f96b7b58d3e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 19:09:56 +0530 Subject: [PATCH 17/83] add test cases --- .../detectors/temporal/date/date_detection.py | 31 +++++++------------ .../temporal/date/standard_date_regex.py | 4 +-- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 00b702c05..92bcca72b 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -59,7 +59,7 @@ def get_supported_languages(): return supported_languages def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timezone='UTC', - past_date_referenced=False): + past_date_referenced=False, bot_message=None): """ Initializes the DateDetector object with given entity_name and pytz timezone object @@ -87,12 +87,14 @@ def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timez past_date_referenced=past_date_referenced, locale=locale) self.bot_message = None + if bot_message: + self.set_bot_message(bot_message) @property def supported_languages(self): return self._supported_languages - def detect_entity(self, text, run_model=False, bot_message=None, **kwargs): + def detect_entity(self, text, run_model=False, **kwargs): """ Detects all date strings in text and 
returns two lists of detected date entities and their corresponding original substrings in text respectively. @@ -134,13 +136,10 @@ def detect_entity(self, text, run_model=False, bot_message=None, **kwargs): respectively. :param text: text :param run_model: run_model - :param bot_message: bot_message """ self.text = ' ' + text.lower() + ' ' self.processed_text = self.text self.tagged_text = self.text - if bot_message: - self.bot_message = bot_message date_data = [] if run_model: date_data = self._date_model_detection() @@ -509,7 +508,7 @@ def _date_value(self, text): (['friday'], ['friday']) """ - date_list, original_list = self.date_detector_object.detect_entity(text, self.bot_message) + date_list, original_list = self.date_detector_object.detect_entity(text) return date_list, original_list def unzip_convert_date_dictionaries(self, entity_dict_list): @@ -663,7 +662,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa (For example, UI elements like form, payload, etc) fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. - bot_message (str): previous message from a bot/agent. 
Returns: dict or None: dictionary containing entity_value, original_text and detection; @@ -675,9 +673,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa message = 'i want to order chinese from mainland china and pizza from domminos' structured_value = None fallback_value = None - bot_message = None output = detect(message=message, structured_value=structured_value, - fallback_value=fallback_value, bot_message=bot_message) + fallback_value=fallback_value) print output >> [{'detection': 'message', 'original_text': 'mainland china', 'entity_value': @@ -690,9 +687,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa entity_name = 'movie' structured_value = 'inferno' fallback_value = None - bot_message = None output = get_text(message=message, entity_name=entity_name, structured_value=structured_value, - fallback_value=fallback_value, bot_message=bot_message) + fallback_value=fallback_value) print output >> [{'detection': 'structure_value_verified', 'original_text': 'inferno', 'entity_value': @@ -703,9 +699,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa entity_name = 'movie' structured_value = 'delhi' fallback_value = None - bot_message = None output = get_text(message=message, entity_name=entity_name, structured_value=structured_value, - fallback_value=fallback_value, bot_message=bot_message) + fallback_value=fallback_value) print output >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] @@ -808,7 +803,7 @@ def __init__(self, entity_name, locale=None, language=ENGLISH_LANG, timezone='UT locale=self.locale ) - def detect_entity(self, text, bot_message=None, **kwargs): + def detect_entity(self, text, **kwargs): """ Detects all date strings in text and returns two lists of detected date entities and their corresponding original substrings in text respectively. 
@@ -831,18 +826,13 @@ def detect_entity(self, text, bot_message=None, **kwargs): Additionally this function assigns these lists to self.date and self.original_date_text attributes respectively. :param text: text - :param bot_message: bot message - """ self.text = ' ' + text.strip().lower() + ' ' self.processed_text = self.text self.tagged_text = self.text - if bot_message: - self.bot_message = bot_message if self.language_date_detector: - self.date, self.original_date_text = self.language_date_detector.detect_date(self.processed_text, - self.bot_message) + self.date, self.original_date_text = self.language_date_detector.detect_date(self.processed_text) validated_date_list, validated_original_list = [], [] @@ -865,6 +855,7 @@ def set_bot_message(self, bot_message): bot_message: is the previous message that is sent by the bot """ self.bot_message = bot_message + self.language_date_detector.set_bot_message(bot_message) def to_datetime_object(self, base_date_value_dict): """ diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index e96559cb8..aa90c9c67 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -74,12 +74,10 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC' self._detect_weekday ] - def detect_date(self, text, bot_message=None): + def detect_date(self, text): self.text = text self.processed_text = text self.tagged_text = text - if bot_message: - self.set_bot_message(bot_message) date_list, original_list = None, None for detector in self.detector_preferences: From b0e37642e4fc63da40e9a6e719e21d18ad0f76ea Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 19:13:58 +0530 Subject: [PATCH 18/83] add test cases --- ner_v2/tests/temporal/date/en/test_date_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 26408e0a1..c41547335 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -261,7 +261,8 @@ def test_hi_gregorian_dd_mm_yy_format(self): bot_message = u'जन्मदिन' date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) - date_dicts, original_texts = date_detector_object.detect_entity(message, bot_message=bot_message) + date_dicts, original_texts = date_detector_object.detect_entity(message) + date_detector_object.set_bot_message(bot_message) self.assertIn({ 'normal': True, From bc21134119c7d09126a3deae851193d9ecb8c112 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 1 Nov 2019 19:15:06 +0530 Subject: [PATCH 19/83] add test cases --- ner_v2/tests/temporal/date/en/test_date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index c41547335..7e2050255 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -261,8 +261,8 @@ def test_hi_gregorian_dd_mm_yy_format(self): bot_message = u'जन्मदिन' date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) - date_dicts, original_texts = date_detector_object.detect_entity(message) date_detector_object.set_bot_message(bot_message) + date_dicts, original_texts = date_detector_object.detect_entity(message) self.assertIn({ 'normal': True, From 7aa15704c9a92dfdfd4049cdbc998de932f70740 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 4 Nov 2019 13:23:14 +0530 Subject: [PATCH 20/83] update views and base_detector for crf results --- ner_constants.py | 2 ++ ner_v1/api.py | 14 +++++++++----- ner_v1/chatbot/entity_detection.py | 5 ++++- 
ner_v1/detectors/base_detector.py | 8 ++++++-- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ner_constants.py b/ner_constants.py index aab49c427..f1f77a7ad 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -56,3 +56,5 @@ PARAMETER_MIN_DIGITS = 'min_number_digits' PARAMETER_MAX_DIGITS = 'max_number_digits' PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' + +PARAMETER_PRIOR_CRF_RESULTS = "crf_results" diff --git a/ner_v1/api.py b/ner_v1/api.py index 316df8b81..6dd097c92 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -10,8 +10,8 @@ from language_utilities.constant import ENGLISH_LANG from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX, - PARAMETER_LANGUAGE_SCRIPT, - PARAMETER_SOURCE_LANGUAGE) + PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_CRF_RESULTS) + from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr, get_number, get_passenger_count, get_shopping_size, get_time, @@ -69,7 +69,8 @@ def get_parameters_dictionary(request): PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), - PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path') + PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), + PARAMETER_PRIOR_CRF_RESULTS: request.GET.get("crf_results", "") } return parameters_dict @@ -103,7 +104,8 @@ def parse_post_request(request): PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: 
to_bool(request_data.get('read_model_from_s3')), - PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path') + PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), + PARAMETER_PRIOR_CRF_RESULTS: request_data.get("crf_results", []) } return parameters_dict @@ -247,6 +249,7 @@ def text(request): live_crf_model_path=parameters_dict[PARAMETER_LIVE_CRF_MODEL_PATH], read_model_from_s3=parameters_dict[PARAMETER_READ_MODEL_FROM_S3], read_embeddings_from_remote_url=parameters_dict[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL], + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS] ) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -361,7 +364,8 @@ def person_name(request): structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], - language=parameters_dict[PARAMETER_SOURCE_LANGUAGE]) + language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for person_name: %s ' % e) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 94aad4b06..0be0c9d6d 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -233,6 +233,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message read_model_from_s3 = kwargs.get('read_model_from_s3', False) read_embeddings_from_remote_url = kwargs.get('read_embeddings_from_remote_url', False) + free_text_detection_results = kwargs.get("free_text_detection_results", []) + text_model_detector = TextModelDetector(entity_name=entity_name, language=language, live_crf_model_path=live_crf_model_path, @@ -519,7 
+521,7 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message def get_person_name(message, entity_name, structured_value, fallback_value, bot_message, - language=ENGLISH_LANG): + language=ENGLISH_LANG, **kwargs): """Use NameDetector to detect names Args: @@ -552,6 +554,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ 'entity_value': {'first_name': yash, 'middle_name': None, 'last_name': doshi}}] """ # TODO refactor NameDetector to make this easy to read and use + free_text_detection_results = kwargs.get("free_text_detection_results", []) name_detection = NameDetector(entity_name=entity_name, language=language) text, detection_method, fallback_text, fallback_method = (structured_value, FROM_STRUCTURE_VALUE_VERIFIED, diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 3c28db1ca..82e8a152c 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -104,7 +104,9 @@ def detect_bulk(self, messages=None, **kwargs): messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') texts = messages - entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) + free_text_detection_results = kwargs.get("free_text_detection_results", []) + entities_list, original_texts_list = self.detect_entity_bulk( + texts=texts, free_text_detection_results=free_text_detection_results) if entities_list: values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list @@ -169,6 +171,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] """ + free_text_detection_results = kwargs.get("free_text_detection_results", []) if self._source_language_script != self._target_language_script and self._translation_enabled: if structured_value: translation_output = 
translate_text(structured_value, self._source_language_script, @@ -180,7 +183,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa message = translation_output[TRANSLATED_TEXT] if translation_output['status'] else None text = structured_value if structured_value else message - entity_list, original_text_list = self.detect_entity(text=text) + entity_list, original_text_list = self.detect_entity(text=text, + free_text_detection_results=free_text_detection_results) if structured_value: if entity_list: From affc2b42e02ef344e08e2a9a3bbd9ec0a8c25e48 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 4 Nov 2019 13:27:48 +0530 Subject: [PATCH 21/83] update views and base_detector for crf results --- ner_v1/detectors/base_detector.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 82e8a152c..65e9ec3dd 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -104,6 +104,8 @@ def detect_bulk(self, messages=None, **kwargs): messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') texts = messages + + # Prior results from detection using CRF models free_text_detection_results = kwargs.get("free_text_detection_results", []) entities_list, original_texts_list = self.detect_entity_bulk( texts=texts, free_text_detection_results=free_text_detection_results) @@ -171,7 +173,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] """ - free_text_detection_results = kwargs.get("free_text_detection_results", []) + if self._source_language_script != self._target_language_script and self._translation_enabled: if structured_value: translation_output = translate_text(structured_value, self._source_language_script, @@ -183,6 +185,9 @@ def detect(self, 
message=None, structured_value=None, fallback_value=None, **kwa message = translation_output[TRANSLATED_TEXT] if translation_output['status'] else None text = structured_value if structured_value else message + + # Prior results from detection using CRF models + free_text_detection_results = kwargs.get("free_text_detection_results", []) entity_list, original_text_list = self.detect_entity(text=text, free_text_detection_results=free_text_detection_results) From 7bb96fcf7511be502d4253c834ee4db08ddf771b Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 4 Nov 2019 13:38:43 +0530 Subject: [PATCH 22/83] update views and base_detector for crf results --- ner_constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_constants.py b/ner_constants.py index f1f77a7ad..b7a0c9d82 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -57,4 +57,5 @@ PARAMETER_MAX_DIGITS = 'max_number_digits' PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' -PARAMETER_PRIOR_CRF_RESULTS = "crf_results" +# Prior detection results from CRF models. +PARAMETER_PRIOR_CRF_RESULTS = "free_text_detection_results" From 68a3c8f208fe473b05d2ab10ccc489b61e2f1848 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 4 Nov 2019 14:04:42 +0530 Subject: [PATCH 23/83] update_TextModelDetector_for_crf_results --- .../textual/text/text_detection_model.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 0356c4295..29b415a74 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -89,6 +89,13 @@ def detect_entity(self, text, **kwargs): crf_original_texts = crf_model.detect_entity(text=text) + # Access free_text_detection_results(list of str). + # If present replace crf_original_texts with free_text_detection_results. 
+ # Call combine results to .combine_results() from dictionary detection and free_text_detection_results. + free_text_detection_results = kwargs.get("free_text_detection_results") + if free_text_detection_results: + crf_original_texts = free_text_detection_results + values, original_texts = super(TextModelDetector, self).detect_entity(text, **kwargs) text_entity_verified_values, original_texts = self.combine_results(values=values, @@ -161,12 +168,22 @@ def detect_entity_bulk(self, texts, **kwargs): crf_original_texts = [] + # Access free_text_detection_results(list of lists). + # If present replace crf_original_texts with free_text_detection_results. + # Call .combine_results() to combine results from dictionary detection and free_text_detection_results. + free_text_detection_results = kwargs.get("free_text_detection_results") + if free_text_detection_results: + crf_original_texts = free_text_detection_results + values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk(texts, **kwargs) text_entity_values_list, original_texts_detected_list = [], [] - for inner_values, inner_original_texts in six.moves.zip(values_list, original_texts_list): + + for inner_crf_original_texts, inner_values, inner_original_texts in six.moves.zip_longest(crf_original_texts, + values_list, + original_texts_list): text_entity_verified_values, original_texts = \ self.combine_results(values=inner_values, original_texts=inner_original_texts, - crf_original_texts=crf_original_texts) + crf_original_texts=inner_crf_original_texts if inner_crf_original_texts else []) text_entity_values_list.append(text_entity_verified_values) original_texts_detected_list.append(original_texts) return text_entity_values_list, original_texts_detected_list From 430ae55b3f9887439150f70c871fdb6a5b073365 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 4 Nov 2019 16:09:38 +0530 Subject: [PATCH 24/83] update_NameDetector_for_crf_results --- 
.../detectors/textual/name/name_detection.py | 62 ++++++++++++++++--- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 07ee1121e..c33106d9f 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -143,7 +143,7 @@ def get_name_using_pos_tagger(self, text): return entity_value, original_text - def detect_entity(self, text, bot_message=None): + def detect_entity(self, text, bot_message=None, **kwargs): """ Takes text as input and returns two lists 1.entity_value in the form of first, middle and last names @@ -154,7 +154,7 @@ def detect_entity(self, text, bot_message=None): Example: text=my name is yash doshi - Returns: + Returns: [{first_name: "yash", middle_name: None, last_name: "modi"}], [ yash modi"] """ if bot_message: @@ -163,16 +163,24 @@ def detect_entity(self, text, bot_message=None): self.text = text self.tagged_text = self.text + free_text_detection_results = kwargs.get("free_text_detection_results") entity_value, original_text = ([], []) - if self.language == ENGLISH_LANG: - entity_value, original_text = self.detect_english_name() - elif self.language == HINDI_LANG: - entity_value, original_text = self.detect_hindi_name() + if not free_text_detection_results: + if self.language == ENGLISH_LANG: + entity_value, original_text = self.detect_english_name() + elif self.language == HINDI_LANG: + entity_value, original_text = self.detect_hindi_name() + + else: + replaced_text = self.replace_free_text_detection_text(free_text_detection_results, + text=text) + entity_value, original_text = self.detect_person_name_entity(replaced_text) self._update_processed_text(person_name_list=original_text) + return entity_value, original_text def detect_english_name(self, text=None): @@ -239,6 +247,46 @@ def detect_hindi_name(self): return entity_value, original_text + def 
replace_free_text_detection_text(self, free_text_detection_results, text): + """ + Replace detected names from the text according to replace_detected_text. + Separate method for replacing free_text_detection_results because it these results are not at token level. + For example - + text = "my name is yash doshi" + free_text_detection_results = ["yash doshi"] + while, text_detection_original_texts = ["yash", "doshi"] + + + Args: + free_text_detection_results(list): list containing free_text_entity_results + text(str): original to run detection on + + Returns: + replaced_text(str): text with marked tokens + + Example: + >> text = "my name is yash doshi" + >> free_text_detection_results = ["yash doshi"] + >> replace_free_text_detection_text(free_text_detection_results, text) + 'my name is _yash_ _doshi_' + + """ + if self.language == ENGLISH_LANG: + replaced_text = nltk_tokenizer.tokenize(text.lower()) + # TODO: Add postprocessing after tokenization to handle titles like dr., mr. etc + # TODO: Tokenization issue where trailing '.'s are considered a separate token + else: + replaced_text = text.lower().strip().split() + + for name in free_text_detection_results: + name_tokens = name.split() + for token in name_tokens: + for j in range(len(replaced_text)): + replaced_text[j] = replaced_text[j].replace(token, "_" + token + "_") + + return replaced_text + + def replace_detected_text(self, text_detection_result, text): """ Replaces the detected name from text_detection_result by __ @@ -252,7 +300,7 @@ def replace_detected_text(self, text_detection_result, text): Example: text_detection_result= ([u'dosh', u'yash'], ['doshi', 'yash']) Returns: - ['my', 'name', 'is', 'yash', 'doshi'] + ['my', 'name', 'is', '_yash_', '_doshi_'] """ replaced_text = [] From 2010c54835c80f3d8084e1b8bf5486f5ffde630f Mon Sep 17 00:00:00 2001 From: Ameya Date: Wed, 6 Nov 2019 17:37:06 +0530 Subject: [PATCH 25/83] updates in text detectors --- ner_v1/chatbot/entity_detection.py | 3 ++- 
ner_v1/detectors/textual/text/text_detection_model.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 0be0c9d6d..846442c32 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -253,7 +253,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message entity_output = text_model_detector.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message) + bot_message=bot_message, + free_text_detection_results=free_text_detection_results) elif isinstance(message, (list, tuple)): entity_output = text_model_detector.detect_bulk(messages=message) diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 29b415a74..77c09dc82 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -92,7 +92,7 @@ def detect_entity(self, text, **kwargs): # Access free_text_detection_results(list of str). # If present replace crf_original_texts with free_text_detection_results. # Call combine results to .combine_results() from dictionary detection and free_text_detection_results. 
- free_text_detection_results = kwargs.get("free_text_detection_results") + free_text_detection_results = kwargs.get("free_text_detection_results", []) if free_text_detection_results: crf_original_texts = free_text_detection_results From 9c89d6ddcbfc1db99ee5995d853d938bfa0cf231 Mon Sep 17 00:00:00 2001 From: Ameya Date: Wed, 6 Nov 2019 17:43:35 +0530 Subject: [PATCH 26/83] updates in api.py --- ner_v1/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 6dd097c92..0aefbe627 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -70,9 +70,8 @@ def get_parameters_dictionary(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: request.GET.get("crf_results", "") + PARAMETER_PRIOR_CRF_RESULTS: json.loads(request.GET.get("crf_results", "[]")) } - return parameters_dict From 68edf76df062e4a1a34ba74bf696409020443bbe Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 17:49:25 +0530 Subject: [PATCH 27/83] add past_date_referenced --- ner_v2/api.py | 7 ++++--- ner_v2/detectors/temporal/date/en/date_detection.py | 12 ++++++------ .../detectors/temporal/date/standard_date_regex.py | 6 ++++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index c7d0975c1..2e6cbb5f8 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -37,7 +37,7 @@ def get_parameters_dictionary(request): PARAMETER_TIMEZONE: request.GET.get('timezone'), PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG), PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG), - PARAMETER_PAST_DATE_REFERENCED: request.GET.get('date_past_reference', 'False'), + PARAMETER_PAST_DATE_REFERENCED: request.GET.get('past_date_referenced', 'False'), 
PARAMETER_MIN_DIGITS: request.GET.get('min_number_digits'), PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request.GET.get('unit_type'), @@ -132,8 +132,9 @@ def date(request): ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' - date_past_reference = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, "false") - past_date_referenced = date_past_reference == 'true' or date_past_reference == 'True' + past_date_referenced = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, False) + past_date_referenced = True if (past_date_referenced == 'true' or past_date_referenced == 'True') else False + date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 5fbf7f463..92a102c55 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -99,6 +99,7 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self.bot_message = None self.locale = locale self.country_code = None + self.past_date_referenced = past_date_referenced self.default_detector_preferences = [self._gregorian_day_month_year_format, self._gregorian_month_day_year_format, self._gregorian_year_month_day_format, @@ -149,7 +150,7 @@ def get_country_code_from_locale(self): else: return None - def detect_date(self, text, bot_message=None): + def detect_date(self, text): """ Detects exact date for complete date information - day, month, year are available in text and possible dates for if there are missing parts of date - day, month, year assuming sensible defaults. 
Also @@ -164,8 +165,6 @@ def detect_date(self, text, bot_message=None): self.text = " " + text.strip().lower() + " " self.processed_text = self.text self.tagged_text = self.text - if bot_message: - self.set_bot_message(bot_message) if self.locale: self.country_code = self.get_country_code_from_locale() date_list = [] @@ -316,7 +315,7 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year try: # to catch dates which are not possible like "31/11" (october 31st) - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \ < self.now_date: yy += 1 except: @@ -373,7 +372,7 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None): yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year try: # to catch dates which are not possible like "11/31" (october 31st) - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \ < self.now_date: yy += 1 except: @@ -2001,7 +2000,8 @@ def normalize_year(self, year): this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if past_regex and past_regex.search(self.bot_message) and int(year) > int(str(self.now_date.year)[2:]): + if (self.past_date_referenced is True) or (past_regex and past_regex.search(self.bot_message) + and int(year) > int(str(self.now_date.year)[2:])): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index aa90c9c67..1102634bb 100644 --- 
a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -621,14 +621,16 @@ def normalize_year(self, year): Returns: str: year in four digits """ - past_regex = re.compile(ur'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE) + # past_regex = re.compile(ur'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE) + past_regex = None # Todo: Add more language variations of birthday. present_regex = None future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if past_regex and past_regex.search(self.bot_message) and int(year) > int(str(self.now_date.year)[2:]): + if self.is_past_referenced or (past_regex and past_regex.search(self.bot_message) + and int(year) > int(str(self.now_date.year)[2:])): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year From d8ea47a00d4f79a9545ab429057720f00941ea85 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 17:55:18 +0530 Subject: [PATCH 28/83] fix tests --- ner_v2/tests/temporal/date/en/test_date_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 7e2050255..b65fea43f 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -257,11 +257,11 @@ def test_hi_gregorian_dd_mm_yy_format(self): # If we run day1 = 1 month = 3 - year1 = 1966 - bot_message = u'जन्मदिन' + year1 = 2066 + past_date_referenced = True - date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale) - date_detector_object.set_bot_message(bot_message) + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale, + 
past_date_referenced= past_date_referenced) date_dicts, original_texts = date_detector_object.detect_entity(message) self.assertIn({ From cd26b90e773082b7dabf8c8455ee364359e520dd Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 18:26:13 +0530 Subject: [PATCH 29/83] add past_date_referenced --- .../detectors/temporal/date/en/date_detection.py | 4 ++-- .../temporal/date/standard_date_regex.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 92a102c55..4f87f4262 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -2000,8 +2000,8 @@ def normalize_year(self, year): this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if (self.past_date_referenced is True) or (past_regex and past_regex.search(self.bot_message) - and int(year) > int(str(self.now_date.year)[2:])): + if ((self.past_date_referenced is True) or (past_regex and past_regex.search(self.bot_message))) and \ + int(year) > int(str(self.now_date.year)[2:]): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index 1102634bb..c4817eebf 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -36,7 +36,7 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC' self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None - self.is_past_referenced = past_date_referenced + self.past_date_referenced = past_date_referenced # dict to store words for date, numerals and words which comes in reference to some date self.date_constant_dict = {} 
@@ -198,7 +198,7 @@ def _detect_relative_date(self, date_list=None, original_list=None): date_rel_match = self.regex_relative_date.findall(self.processed_text) for date_match in date_rel_match: original = date_match[0] - if not self.is_past_referenced: + if not self.past_date_referenced: req_date = self.now_date + datetime.timedelta(days=self.date_constant_dict[date_match[1]][0]) else: req_date = self.now_date - datetime.timedelta(days=self.date_constant_dict[date_match[1]][0]) @@ -242,7 +242,7 @@ def _detect_date_month(self, date_list, original_list): yymmdd = str(self.now_date.year + 1) + mmdd yy = self.now_date.year + 1 - if self.is_past_referenced: + if self.past_date_referenced: if int(today_yymmdd) < int(yymmdd): yy -= 1 date = { @@ -345,11 +345,11 @@ def _detect_date_ref_month_3(self, date_list, original_list): for date_match in date_ref_month_match: original = date_match[0] dd = self._get_int_from_numeral(date_match[1]) - if (self.now_date.day > dd and self.is_past_referenced) or \ - (self.now_date.day <= dd and not self.is_past_referenced): + if (self.now_date.day > dd and self.past_date_referenced) or\ + (self.now_date.day <= dd and not self.past_date_referenced): mm = self.now_date.month yy = self.now_date.year - elif self.now_date.day <= dd and self.is_past_referenced: + elif self.now_date.day <= dd and self.past_date_referenced: req_date = self.now_date - relativedelta(months=1) mm = req_date.month yy = req_date.year @@ -629,8 +629,8 @@ def normalize_year(self, year): this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: if self.bot_message: - if self.is_past_referenced or (past_regex and past_regex.search(self.bot_message) - and int(year) > int(str(self.now_date.year)[2:])): + if self.past_date_referenced or (past_regex and past_regex.search(self.bot_message) + and int(year) > int(str(self.now_date.year)[2:])): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + 
year From bc11053de72b22f5c7b263c2f49a2f2216c9beb7 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 18:46:44 +0530 Subject: [PATCH 30/83] fix normalize_year --- .../detectors/temporal/date/en/date_detection.py | 15 +++++++-------- .../temporal/date/standard_date_regex.py | 15 +++++++-------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 4f87f4262..d3726c126 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1999,14 +1999,13 @@ def normalize_year(self, year): future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: - if self.bot_message: - if ((self.past_date_referenced is True) or (past_regex and past_regex.search(self.bot_message))) and \ - int(year) > int(str(self.now_date.year)[2:]): - return str(this_century - 1) + year - elif present_regex and present_regex.search(self.bot_message): - return str(this_century) + year - elif future_regex and future_regex.search(self.bot_message): - return str(this_century + 1) + year + if (((self.bot_message and past_regex.search(self.bot_message)) or (self.past_date_referenced is True)) + and (int(year) > int(str(self.now_date.year)[2:]))): + return str(this_century - 1) + year + elif present_regex and present_regex.search(self.bot_message): + return str(this_century) + year + elif future_regex and future_regex.search(self.bot_message): + return str(this_century + 1) + year # if patterns didn't match or no bot message set, fallback to current century if len(year) == 2: diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index c4817eebf..42328ab03 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -628,14 +628,13 @@ def 
normalize_year(self, year): future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: - if self.bot_message: - if self.past_date_referenced or (past_regex and past_regex.search(self.bot_message) - and int(year) > int(str(self.now_date.year)[2:])): - return str(this_century - 1) + year - elif present_regex and present_regex.search(self.bot_message): - return str(this_century) + year - elif future_regex and future_regex.search(self.bot_message): - return str(this_century + 1) + year + if (((self.bot_message and past_regex and past_regex.search(self.bot_message)) or + (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): + return str(this_century - 1) + year + elif present_regex and present_regex.search(self.bot_message): + return str(this_century) + year + elif future_regex and future_regex.search(self.bot_message): + return str(this_century + 1) + year # if patterns didn't match or no bot message set, fallback to current century if len(year) == 2: From f9faef0f8b2c87ac6f61ed8677c8d2c3299efcd5 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 18:49:36 +0530 Subject: [PATCH 31/83] fix tests --- ner_v2/tests/temporal/date/en/test_date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index b65fea43f..c22fb895c 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -257,7 +257,7 @@ def test_hi_gregorian_dd_mm_yy_format(self): # If we run day1 = 1 month = 3 - year1 = 2066 + year1 = 1966 past_date_referenced = True date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale, From 7fba38545a432de1c54d7563513a30aade28a5eb Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 6 Nov 2019 18:56:03 +0530 Subject: [PATCH 32/83] fix lint --- 
ner_v2/detectors/temporal/date/en/date_detection.py | 4 ++-- ner_v2/tests/temporal/date/en/test_date_detection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index d3726c126..3aa7c7e4c 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1999,8 +1999,8 @@ def normalize_year(self, year): future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: - if (((self.bot_message and past_regex.search(self.bot_message)) or (self.past_date_referenced is True)) - and (int(year) > int(str(self.now_date.year)[2:]))): + if (((self.bot_message and past_regex.search(self.bot_message)) or + (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index c22fb895c..9697606ce 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -261,7 +261,7 @@ def test_hi_gregorian_dd_mm_yy_format(self): past_date_referenced = True date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale, - past_date_referenced= past_date_referenced) + past_date_referenced=past_date_referenced) date_dicts, original_texts = date_detector_object.detect_entity(message) self.assertIn({ From 99c6a2532d1cd3b74141b388348af785d9f5c23f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 7 Nov 2019 16:04:25 +0530 Subject: [PATCH 33/83] fix date_past_reference in api.py --- ner_v2/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index 
2e6cbb5f8..69f12ac6b 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -37,7 +37,7 @@ def get_parameters_dictionary(request): PARAMETER_TIMEZONE: request.GET.get('timezone'), PARAMETER_LANGUAGE_SCRIPT: request.GET.get('language_script', ENGLISH_LANG), PARAMETER_SOURCE_LANGUAGE: request.GET.get('source_language', ENGLISH_LANG), - PARAMETER_PAST_DATE_REFERENCED: request.GET.get('past_date_referenced', 'False'), + PARAMETER_PAST_DATE_REFERENCED: request.GET.get('date_past_reference', 'False'), PARAMETER_MIN_DIGITS: request.GET.get('min_number_digits'), PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request.GET.get('unit_type'), From 2611034ef198930f2d848fe7e427c8bd13babf77 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 02:00:25 +0530 Subject: [PATCH 34/83] pass free text results in get_person_name --- ner_v1/chatbot/entity_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 846442c32..2785c45e1 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -569,7 +569,8 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = [], [] if text: - entity_list, original_text_list = name_detection.detect_entity(text=text, bot_message=bot_message) + entity_list, original_text_list = name_detection.detect_entity(text=text, bot_message=bot_message, + free_text_detection_results=free_text_detection_results) if not entity_list and fallback_text: entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split()) From 7183e6ef65a7b53da741d127cf9fd44cab7524cf Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 02:16:53 +0530 Subject: [PATCH 35/83] add post tokenization processing for person_name --- ner_v1/detectors/textual/name/name_detection.py | 14 ++++++++++---- 1 file changed, 10 
insertions(+), 4 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index c33106d9f..8c6ed5d7d 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -272,9 +272,16 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): """ if self.language == ENGLISH_LANG: - replaced_text = nltk_tokenizer.tokenize(text.lower()) - # TODO: Add postprocessing after tokenization to handle titles like dr., mr. etc - # TODO: Tokenization issue where trailing '.'s are considered a separate token + replaced_text_ = nltk_tokenizer.tokenize(text.lower()) + replaced_text = [] + for index, token in enumerate(replaced_text_): + if token == "." and 0 < index < len(replaced_text_) - 1 and replaced_text[-1] + "." in text.lower(): + replaced_text[-1] = replaced_text[-1] + "." + else: + _token = token.strip('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~') + if not _token: + _token = token + replaced_text.append(token) else: replaced_text = text.lower().strip().split() @@ -286,7 +293,6 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): return replaced_text - def replace_detected_text(self, text_detection_result, text): """ Replaces the detected name from text_detection_result by __ From dfa7c8253c7aad7b0d7e444669f1a89211a1665c Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 12:13:18 +0530 Subject: [PATCH 36/83] msg --- ner_v1/detectors/base_detector.py | 70 +++++++++++++++++++ .../detectors/textual/text/text_detection.py | 20 ++++++ .../textual/text/text_detection_model.py | 68 ------------------ 3 files changed, 90 insertions(+), 68 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 65e9ec3dd..f59a43916 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -10,6 +10,8 @@ from ner_constants import 
(FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) +from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED + class BaseDetector(object): @@ -119,6 +121,74 @@ def detect_bulk(self, messages=None, **kwargs): detection_method=method, detection_language=self._target_language_script) + def _add_verification_source(self, values, verification_source_dict): + """ + Add the verification source for the detected entities + Args: + values (list): List of detected text type entities + verification_source_dict (dict): Dict consisting of the verification source and value. + Returns: + text_entity_verified_values (list): List of dicts consisting of the key and values for the keys + value and verification source + Example: + values = [u'Chennai', u'New Delhi', u'chennai'] + verification_source_dict = {"datastore_verified": True} + + >> add_verification_source(values, verification_source_dict) + [{'datastore_verified': True, 'value': u'Chennai'}, + {'datastore_verified': True, 'value': u'New Delhi'}, + {'datastore_verified': True, 'value': u'chennai'}] + """ + text_entity_verified_values = [] + for text_entity_value in values: + text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value} + text_entity_dict.update(verification_source_dict) + text_entity_verified_values.append(text_entity_dict) + return text_entity_verified_values + + def combine_results(self, values, original_texts, crf_original_texts): + """ + This method is used to combine the results provided by the datastore search and the + crf_model if trained. 
+ Args: + values (list): List of values detected by datastore + original_texts (list): List of original texts present in the texts for which value shave been + detected + crf_original_texts (list): Entities detected by the Crf Model + Returns: + combined_values (list): List of dicts each dict consisting of the entity value and additionally + the keys for the datastore and crf model detection + combined_original_texts (list): List of original texts detected by the datastore and the crf model. + """ + unprocessed_crf_original_texts = [] + + combined_values = self._add_verification_source(values=values, + verification_source_dict={ + DATASTORE_VERIFIED: True, + CRF_MODEL_VERIFIED: False + }) + combined_original_texts = original_texts + for i in range(len(crf_original_texts)): + match = False + for j in range(len(original_texts)): + if crf_original_texts[i] == original_texts[j]: + combined_values[j][CRF_MODEL_VERIFIED] = True + match = True + elif re.findall(r'\b%s\b' % crf_original_texts[i], original_texts[j]): + match = True + if not match: + unprocessed_crf_original_texts.append(crf_original_texts[i]) + + unprocessed_crf_original_texts_verified = self._add_verification_source(values=unprocessed_crf_original_texts, + verification_source_dict= + {DATASTORE_VERIFIED: False, + CRF_MODEL_VERIFIED: True} + ) + combined_values.extend(unprocessed_crf_original_texts_verified) + combined_original_texts.extend(unprocessed_crf_original_texts) + + return combined_values, combined_original_texts + def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs): """ Use detector to detect entities from text. 
It also translates query to language compatible to detector diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 364485cce..c9c439459 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -3,6 +3,7 @@ import string from six import iteritems +import six import language_utilities.constant as lang_constant from chatbot_ner.config import ner_logger @@ -321,8 +322,18 @@ def detect_entity_bulk(self, texts, **kwargs): ] """ + free_text_detection_results = kwargs.get("free_text_detection_results", []) self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() + + for i, (values, original_texts, free_text_detection_results_) in enumerate( + six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): + if free_text_detection_results_: + text_entity_values_list[i], original_texts_list[i] = self.combine_results( + values=values, + original_texts=original_texts, + crf_original_texts=free_text_detection_results_) + return text_entity_values_list, original_texts_list def detect_entity(self, text, **kwargs): @@ -361,9 +372,18 @@ def detect_entity(self, text, **kwargs): self._process_text([text]) text_entity_values, original_texts = self._text_detection_with_variants() + free_text_detection_results = kwargs.get("free_text_detection_results", []) + if len(text_entity_values) > 0 and len(original_texts) > 0: self.tagged_text = self.__tagged_texts[0] self.processed_text = self.__processed_texts[0] + + if free_text_detection_results: + text_entity_verified_values, original_texts = self.combine_results( + values=text_entity_values[0], + original_texts=original_texts[0], + crf_original_texts=free_text_detection_results) + return text_entity_verified_values[0], original_texts[0] return text_entity_values[0], original_texts[0] return [], [] diff --git 
a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 77c09dc82..6d6825205 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -187,71 +187,3 @@ def detect_entity_bulk(self, texts, **kwargs): text_entity_values_list.append(text_entity_verified_values) original_texts_detected_list.append(original_texts) return text_entity_values_list, original_texts_detected_list - - def _add_verification_source(self, values, verification_source_dict): - """ - Add the verification source for the detected entities - Args: - values (list): List of detected text type entities - verification_source_dict (dict): Dict consisting of the verification source and value. - Returns: - text_entity_verified_values (list): List of dicts consisting of the key and values for the keys - value and verification source - Example: - values = [u'Chennai', u'New Delhi', u'chennai'] - verification_source_dict = {"datastore_verified": True} - - >> add_verification_source(values, verification_source_dict) - [{'datastore_verified': True, 'value': u'Chennai'}, - {'datastore_verified': True, 'value': u'New Delhi'}, - {'datastore_verified': True, 'value': u'chennai'}] - """ - text_entity_verified_values = [] - for text_entity_value in values: - text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value} - text_entity_dict.update(verification_source_dict) - text_entity_verified_values.append(text_entity_dict) - return text_entity_verified_values - - def combine_results(self, values, original_texts, crf_original_texts): - """ - This method is used to combine the results provided by the datastore search and the - crf_model if trained. 
- Args: - values (list): List of values detected by datastore - original_texts (list): List of original texts present in the texts for which value shave been - detected - crf_original_texts (list): Entities detected by the Crf Model - Returns: - combined_values (list): List of dicts each dict consisting of the entity value and additionally - the keys for the datastore and crf model detection - combined_original_texts (list): List of original texts detected by the datastore and the crf model. - """ - unprocessed_crf_original_texts = [] - - combined_values = self._add_verification_source(values=values, - verification_source_dict={ - DATASTORE_VERIFIED: True, - CRF_MODEL_VERIFIED: False - }) - combined_original_texts = original_texts - for i in range(len(crf_original_texts)): - match = False - for j in range(len(original_texts)): - if crf_original_texts[i] == original_texts[j]: - combined_values[j][CRF_MODEL_VERIFIED] = True - match = True - elif re.findall(r'\b%s\b' % crf_original_texts[i], original_texts[j]): - match = True - if not match: - unprocessed_crf_original_texts.append(crf_original_texts[i]) - - unprocessed_crf_original_texts_verified = self._add_verification_source(values=unprocessed_crf_original_texts, - verification_source_dict= - {DATASTORE_VERIFIED: False, - CRF_MODEL_VERIFIED: True} - ) - combined_values.extend(unprocessed_crf_original_texts_verified) - combined_original_texts.extend(unprocessed_crf_original_texts) - - return combined_values, combined_original_texts From 5bbc3f7c76eca162ed732160b208860eae4d166e Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 12:35:40 +0530 Subject: [PATCH 37/83] add combine_method to base detector and updates in text_detection.py --- ner_v1/api.py | 3 ++- ner_v1/chatbot/entity_detection.py | 9 +++++---- ner_v1/detectors/textual/text/text_detection.py | 5 +++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 0aefbe627..394ab6829 100644 --- 
a/ner_v1/api.py +++ b/ner_v1/api.py @@ -270,7 +270,8 @@ def location(request): entity_output = get_location(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME], parameters_dict[PARAMETER_STRUCTURED_VALUE], parameters_dict[PARAMETER_FALLBACK_VALUE], - parameters_dict[PARAMETER_BOT_MESSAGE]) + parameters_dict[PARAMETER_BOT_MESSAGE], + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for location: %s ' % e) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 2785c45e1..4ee95e8da 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -261,7 +261,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message return entity_output -def get_location(message, entity_name, structured_value, fallback_value, bot_message): +def get_location(message, entity_name, structured_value, fallback_value, bot_message, **kwargs): """"Use TextDetector (elasticsearch) to detect location TODO: We can improve this by creating separate for location detection instead of using TextDetector @@ -297,10 +297,10 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes >> [{'detection': 'message', 'entity_value': {'value': 'Andheri West'}, 'language': 'en', 'original_text': 'andheri west'}] """ - + free_text_detection_results = kwargs.get("free_text_detection_results", []) text_detection = TextDetector(entity_name=entity_name) return text_detection.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message) + bot_message=bot_message, free_text_detection_results=free_text_detection_results) def get_phone_number(message, entity_name, structured_value, fallback_value, bot_message): @@ -410,7 +410,7 @@ def get_email(message, 
entity_name, structured_value, fallback_value, bot_messag bot_message=bot_message) -def get_city(message, entity_name, structured_value, fallback_value, bot_message, language): +def get_city(message, entity_name, structured_value, fallback_value, bot_message, language, **kwargs): """Use CityDetector to detect cities Args: @@ -493,6 +493,7 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message """ + free_text_detection_results = kwargs.get("free_text_detection_results", []) city_detection = CityDetector(entity_name=entity_name, language=language) city_detection.set_bot_message(bot_message=bot_message) if structured_value: diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index c9c439459..047d90c87 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -322,10 +322,13 @@ def detect_entity_bulk(self, texts, **kwargs): ] """ + # For bulk detection free_text_detection_results will be a list of list of str free_text_detection_results = kwargs.get("free_text_detection_results", []) self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() + # itertate over text_entity_values_list, original_texts_list and if free_text_detection_results has any entry + # for that index use combine_results to merge the results. for i, (values, original_texts, free_text_detection_results_) in enumerate( six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): if free_text_detection_results_: @@ -372,6 +375,8 @@ def detect_entity(self, text, **kwargs): self._process_text([text]) text_entity_values, original_texts = self._text_detection_with_variants() + # For single message detection free_text_detection_results will be a list of str + # if present use combine_results to merge the results. 
free_text_detection_results = kwargs.get("free_text_detection_results", []) if len(text_entity_values) > 0 and len(original_texts) > 0: From 55466d2dccf61561d5a316f5f1ef63a96397c357 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 13:00:41 +0530 Subject: [PATCH 38/83] update detect_entity method in text_detection.py --- .../detectors/textual/text/text_detection.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 047d90c87..c78b023e2 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -379,18 +379,19 @@ def detect_entity(self, text, **kwargs): # if present use combine_results to merge the results. free_text_detection_results = kwargs.get("free_text_detection_results", []) + values, texts = [], [] if len(text_entity_values) > 0 and len(original_texts) > 0: self.tagged_text = self.__tagged_texts[0] self.processed_text = self.__processed_texts[0] - - if free_text_detection_results: - text_entity_verified_values, original_texts = self.combine_results( - values=text_entity_values[0], - original_texts=original_texts[0], - crf_original_texts=free_text_detection_results) - return text_entity_verified_values[0], original_texts[0] - return text_entity_values[0], original_texts[0] - return [], [] + values, texts = text_entity_values[0], original_texts[0] + + if free_text_detection_results: + text_entity_verified_values, original_texts = self.combine_results( + values=values, + original_texts=texts, + crf_original_texts=free_text_detection_results) + return text_entity_verified_values, original_texts + return values, texts def _text_detection_with_variants(self): """ From 43904192d24aef3ea01b91818f7212711b85c06a Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 13:19:15 +0530 Subject: [PATCH 39/83] updates in base_detector.py --- 
ner_v1/detectors/base_detector.py | 7 +++++++ ner_v1/detectors/textual/text/text_detection.py | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index f59a43916..4667eb12f 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -12,6 +12,13 @@ DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD +except ImportError: + import re + _re_flags = re.UNICODE + class BaseDetector(object): diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index c78b023e2..a634a4ef0 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,5 +1,4 @@ import collections -import re import string from six import iteritems @@ -12,6 +11,13 @@ from lib.nlp.levenshtein_distance import edit_distance from ner_v1.detectors.base_detector import BaseDetector +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD +except ImportError: + import re + _re_flags = re.UNICODE + class TextDetector(BaseDetector): """ From 32414f7b1e4de86d6c500a4c51dea5694301b029 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 13:39:36 +0530 Subject: [PATCH 40/83] updates_to_incorporate_free_text_detection --- ner_v1/detectors/textual/text/text_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index a634a4ef0..27e263405 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -334,7 +334,7 @@ def detect_entity_bulk(self, texts, **kwargs): text_entity_values_list, original_texts_list = self._text_detection_with_variants() # itertate over 
text_entity_values_list, original_texts_list and if free_text_detection_results has any entry - # for that index use combine_results to merge the results. + # for that index use combine_results to merge the results from free_text and detection. for i, (values, original_texts, free_text_detection_results_) in enumerate( six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): if free_text_detection_results_: From 4e369a2dc6c35ffae6d24a90e050a5a8f569d793 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 14:51:25 +0530 Subject: [PATCH 41/83] fix lint errors --- ner_v1/chatbot/entity_detection.py | 8 +++++--- ner_v1/detectors/base_detector.py | 9 ++++----- ner_v1/detectors/textual/name/name_detection.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 4ee95e8da..420d11e96 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -493,7 +493,7 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message """ - free_text_detection_results = kwargs.get("free_text_detection_results", []) + # free_text_detection_results = kwargs.get("free_text_detection_results", []) city_detection = CityDetector(entity_name=entity_name, language=language) city_detection.set_bot_message(bot_message=bot_message) if structured_value: @@ -570,8 +570,10 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = [], [] if text: - entity_list, original_text_list = name_detection.detect_entity(text=text, bot_message=bot_message, - free_text_detection_results=free_text_detection_results) + entity_list, original_text_list = name_detection.detect_entity( + text=text, + bot_message=bot_message, + free_text_detection_results=free_text_detection_results) if not entity_list and fallback_text: entity_list, original_text_list = 
NameDetector.get_format_name(fallback_text.split()) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 4667eb12f..7507534d4 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -186,11 +186,10 @@ def combine_results(self, values, original_texts, crf_original_texts): if not match: unprocessed_crf_original_texts.append(crf_original_texts[i]) - unprocessed_crf_original_texts_verified = self._add_verification_source(values=unprocessed_crf_original_texts, - verification_source_dict= - {DATASTORE_VERIFIED: False, - CRF_MODEL_VERIFIED: True} - ) + unprocessed_crf_original_texts_verified = self._add_verification_source( + values=unprocessed_crf_original_texts, + verification_source_dict={DATASTORE_VERIFIED: False, CRF_MODEL_VERIFIED: True}) + combined_values.extend(unprocessed_crf_original_texts_verified) combined_original_texts.extend(unprocessed_crf_original_texts) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 8c6ed5d7d..7146a0e22 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -175,7 +175,7 @@ def detect_entity(self, text, bot_message=None, **kwargs): else: replaced_text = self.replace_free_text_detection_text(free_text_detection_results, - text=text) + text=text) entity_value, original_text = self.detect_person_name_entity(replaced_text) self._update_processed_text(person_name_list=original_text) From ea5d844e12a283d8d7ee3d9fdecb6b1beb01ca0f Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 8 Nov 2019 14:59:18 +0530 Subject: [PATCH 42/83] pass free_text_entity_results to .detect_bulk in get_text --- ner_v1/chatbot/entity_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index fdddda051..5b1cce35a 100644 --- a/ner_v1/chatbot/entity_detection.py 
+++ b/ner_v1/chatbot/entity_detection.py @@ -256,7 +256,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message bot_message=bot_message, free_text_detection_results=free_text_detection_results) elif isinstance(message, (list, tuple)): - entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value) + entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value, + free_text_detection_results=free_text_detection_results) return entity_output From 32bf0cf1979e84fed42d6c4df79621bf3ed91448 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 18 Nov 2019 22:13:57 +0530 Subject: [PATCH 43/83] use request.GET.get_list() to take crf results --- ner_v1/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 394ab6829..7238fb26c 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -70,7 +70,7 @@ def get_parameters_dictionary(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: json.loads(request.GET.get("crf_results", "[]")) + PARAMETER_PRIOR_CRF_RESULTS: request.GET.get_list("crf_results", []) } return parameters_dict From 21562af146c5a81c4e33819a744c62ffad8bdf56 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 18 Nov 2019 23:10:23 +0530 Subject: [PATCH 44/83] add logs and minor refactoring in detect_entity_bulk of TextDetector --- ner_v1/api.py | 2 ++ ner_v1/chatbot/entity_detection.py | 2 ++ ner_v1/detectors/base_detector.py | 2 +- ner_v1/detectors/textual/text/text_detection.py | 11 ++++++++++- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 7238fb26c..5f1423252 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -23,6 
+23,7 @@ PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL, PARAMETER_LIVE_CRF_MODEL_PATH) from django.views.decorators.csrf import csrf_exempt +from chatbot_ner.config import ner_logger def to_bool(value): @@ -72,6 +73,7 @@ def get_parameters_dictionary(request): PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), PARAMETER_PRIOR_CRF_RESULTS: request.GET.get_list("crf_results", []) } + ner_logger.info("parameters dict - {}".format(parameters_dict)) return parameters_dict diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 5b1cce35a..9b47dc621 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -17,6 +17,7 @@ from ner_v1.detectors.textual.name.name_detection import NameDetector from ner_v1.detectors.textual.text.text_detection import TextDetector from ner_v1.detectors.textual.text.text_detection_model import TextModelDetector +from chatbot_ner.config import ner_logger import six """ @@ -249,6 +250,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message min_token_len_fuzziness = int(min_token_len_fuzziness) text_model_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness) + ner_logger.info("free text detection results: {}".format(free_text_detection_results)) if isinstance(message, six.string_types): entity_output = text_model_detector.detect(message=message, structured_value=structured_value, diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 8e7d5a7bf..5e394eecf 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -276,7 +276,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa text = structured_value if structured_value else message - # Prior results from detection using CRF models + # Prior results from detection. 
free_text_detection_results = kwargs.get("free_text_detection_results", []) entity_list, original_text_list = self.detect_entity(text=text, free_text_detection_results=free_text_detection_results) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 866d2519b..e5cafb694 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -336,13 +336,20 @@ def detect_entity_bulk(self, texts, **kwargs): # itertate over text_entity_values_list, original_texts_list and if free_text_detection_results has any entry # for that index use combine_results to merge the results from free_text and detection. + + combined_entity_values, combined_original_texts = [], [] for i, (values, original_texts, free_text_detection_results_) in enumerate( six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): if free_text_detection_results_: - text_entity_values_list[i], original_texts_list[i] = self.combine_results( + combined_entity_values_, combined_original_texts_ = self.combine_results( values=values, original_texts=original_texts, crf_original_texts=free_text_detection_results_) + combined_entity_values.append(combined_entity_values_) + combined_original_texts.append(combined_original_texts_) + else: + combined_entity_values.append(values) + combined_original_texts.append(original_texts) return text_entity_values_list, original_texts_list @@ -392,7 +399,9 @@ def detect_entity(self, text, **kwargs): self.processed_text = self.__processed_texts[0] values, texts = text_entity_values[0], original_texts[0] + ner_logger.info("prior detection results - {}".format(free_text_detection_results)) if free_text_detection_results: + ner_logger.info("combining results") text_entity_verified_values, original_texts = self.combine_results( values=values, original_texts=texts, From 19f851d39d9638fdeaf51f9fa671d0823b5f8733 Mon Sep 17 00:00:00 2001 
From: ashutoshsingh0223 Date: Mon, 18 Nov 2019 23:37:22 +0530 Subject: [PATCH 45/83] revert to json.loads() for getting list of crf results --- ner_v1/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 5f1423252..94ee1e3d6 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -106,9 +106,9 @@ def parse_post_request(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request_data.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: request_data.get("crf_results", []) + PARAMETER_PRIOR_CRF_RESULTS: json.loads(request_data.get("crf_results", '[]')) } - + ner_logger.info(parameters_dict) return parameters_dict From 7ee5ce353308df8d94ef370b7d7614f3ca8d521f Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 18 Nov 2019 23:44:07 +0530 Subject: [PATCH 46/83] revert to json.loads() for getting list of crf results --- ner_v1/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 94ee1e3d6..763e010d5 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -71,7 +71,7 @@ def get_parameters_dictionary(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: request.GET.get_list("crf_results", []) + PARAMETER_PRIOR_CRF_RESULTS: json.loads(request.GET.get("crf_results", '[]')) } ner_logger.info("parameters dict - {}".format(parameters_dict)) return parameters_dict @@ -106,9 +106,9 @@ def parse_post_request(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: 
to_bool(request_data.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: json.loads(request_data.get("crf_results", '[]')) + PARAMETER_PRIOR_CRF_RESULTS: request_data.get("crf_results", []) } - ner_logger.info(parameters_dict) + return parameters_dict From 36c43340a92c74692744dbd9b9e58d119dcdd9f9 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Tue, 19 Nov 2019 13:28:07 +0530 Subject: [PATCH 47/83] fix in detect_bulk in base_detector.py --- ner_v1/detectors/base_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 5e394eecf..47954f6b3 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -116,7 +116,7 @@ def detect_bulk(self, messages=None, **kwargs): # Prior results from detection using CRF models free_text_detection_results = kwargs.get("free_text_detection_results", []) - entities_list, original_texts_list = self.detect_entity_bulk( + entities_list, original_texts_list_ = self.detect_entity_bulk( texts=texts, free_text_detection_results=free_text_detection_results) fallback_values = kwargs.get('fallback_values') @@ -126,7 +126,7 @@ def detect_bulk(self, messages=None, **kwargs): if entities_list[i]: values_list.append(entities_list[i]) detection_method_list.append(FROM_MESSAGE) - original_texts_list.append(original_list[i]) + original_texts_list.append(original_texts_list_[i]) elif fallback_values and fallback_values[i]: values_list.append([fallback_values[i]]) From 8031923af855cd765eae078977653a00a40b47e2 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 21 Nov 2019 18:53:56 +0530 Subject: [PATCH 48/83] check for previous bot message only if fte results absent --- ner_v1/detectors/textual/name/name_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py 
b/ner_v1/detectors/textual/name/name_detection.py index fbb154905..d6694fc72 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -161,9 +161,6 @@ def detect_entity(self, text, bot_message=None, **kwargs): Returns: [{first_name: "yash", middle_name: None, last_name: "modi"}], [ yash modi"] """ - if bot_message: - if not self.context_check_botmessage(bot_message): - return [], [] self.text = text self.tagged_text = self.text @@ -172,6 +169,9 @@ def detect_entity(self, text, bot_message=None, **kwargs): entity_value, original_text = ([], []) if not free_text_detection_results: + if bot_message: + if not self.context_check_botmessage(bot_message): + return [], [] if self.language == ENGLISH_LANG: entity_value, original_text = self.detect_english_name() elif self.language == HINDI_LANG: From 8ab115f0551387c9b1f3d16a1346af4e4d082173 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 13:50:48 +0530 Subject: [PATCH 49/83] review changes --- ner_v1/constant.py | 2 +- ner_v1/detectors/base_detector.py | 43 +++++++++++-------- .../detectors/textual/name/name_detection.py | 21 +++++++-- .../detectors/textual/text/text_detection.py | 28 ++++++------ .../textual/text/text_detection_model.py | 39 ++++++++--------- 5 files changed, 78 insertions(+), 55 deletions(-) diff --git a/ner_v1/constant.py b/ner_v1/constant.py index 8e7cbf39b..55a707f4c 100644 --- a/ner_v1/constant.py +++ b/ner_v1/constant.py @@ -112,7 +112,7 @@ PARAMETER_FUZZINESS = 'fuzziness' PARAMETER_MIN_TOKEN_LEN_FUZZINESS = 'min_token_len_fuzziness' DATASTORE_VERIFIED = 'datastore_verified' -CRF_MODEL_VERIFIED = 'crf_model_verified' +MODEL_VERIFIED = 'model_verified' # **********************constants used for text detection************************************ diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 47954f6b3..03fb3298c 100644 --- a/ner_v1/detectors/base_detector.py +++ 
b/ner_v1/detectors/base_detector.py @@ -10,7 +10,7 @@ from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) -from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED try: import regex as re @@ -20,7 +20,6 @@ _re_flags = re.UNICODE - class BaseDetector(object): """ This class is the base class from which will be inherited by individual detectors. It primarily contains the @@ -56,11 +55,12 @@ def supported_languages(self): return [] @abc.abstractmethod - def detect_entity(self, text, **kwargs): + def detect_entity(self, text, free_text_detection_results=None, **kwargs): """ This method runs the core entity detection logic defined inside entity detectors Args: text: text snippet from which entities needs to be detected + free_text_detection_results: prior detection results **kwargs: values specific to different detectors such as 'last bot message', custom configs, etc. Return: tuple: Two lists of same length containing detected values and original substring from text which is used @@ -81,12 +81,13 @@ def _set_language_processing_script(self): raise NotImplementedError('Please enable translation or extend language support' 'for %s' % self._source_language_script) - def detect_bulk(self, messages=None, **kwargs): + def detect_bulk(self, messages=None, free_text_detection_results=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: messages (list of strings): list of natural text(s) on which detection logic is to be run. 
+ free_text_detection_results(list of list of str): prior detection results Returns: dict or None: dictionary containing entity_value, original_text and detection; entity_value is in itself a dict with its keys varying from entity to entity @@ -114,9 +115,10 @@ def detect_bulk(self, messages=None, **kwargs): texts = messages - # Prior results from detection using CRF models - free_text_detection_results = kwargs.get("free_text_detection_results", []) - entities_list, original_texts_list_ = self.detect_entity_bulk( + # Prior results from entity detection using methods like CRF etc. + if free_text_detection_results is None: + free_text_detection_results = [] + entities_list, original_list = self.detect_entity_bulk( texts=texts, free_text_detection_results=free_text_detection_results) fallback_values = kwargs.get('fallback_values') @@ -126,7 +128,7 @@ def detect_bulk(self, messages=None, **kwargs): if entities_list[i]: values_list.append(entities_list[i]) detection_method_list.append(FROM_MESSAGE) - original_texts_list.append(original_texts_list_[i]) + original_texts_list.append(original_list[i]) elif fallback_values and fallback_values[i]: values_list.append([fallback_values[i]]) @@ -167,7 +169,7 @@ def _add_verification_source(self, values, verification_source_dict): text_entity_verified_values.append(text_entity_dict) return text_entity_verified_values - def combine_results(self, values, original_texts, crf_original_texts): + def combine_results(self, values, original_texts, free_text_detection_results): """ This method is used to combine the results provided by the datastore search and the crf_model if trained. 
@@ -175,7 +177,7 @@ def combine_results(self, values, original_texts, crf_original_texts): values (list): List of values detected by datastore original_texts (list): List of original texts present in the texts for which value shave been detected - crf_original_texts (list): Entities detected by the Crf Model + free_text_detection_results (list): Entities detected by the models like crf etc. Returns: combined_values (list): List of dicts each dict consisting of the entity value and additionally the keys for the datastore and crf model detection @@ -186,30 +188,31 @@ def combine_results(self, values, original_texts, crf_original_texts): combined_values = self._add_verification_source(values=values, verification_source_dict={ DATASTORE_VERIFIED: True, - CRF_MODEL_VERIFIED: False + MODEL_VERIFIED: False }) combined_original_texts = original_texts - for i in range(len(crf_original_texts)): + for i in range(len(free_text_detection_results)): match = False for j in range(len(original_texts)): - if crf_original_texts[i] == original_texts[j]: - combined_values[j][CRF_MODEL_VERIFIED] = True + if free_text_detection_results[i] == original_texts[j]: + combined_values[j][MODEL_VERIFIED] = True match = True - elif re.findall(r'\b%s\b' % crf_original_texts[i], original_texts[j]): + elif re.findall(r'\b%s\b' % free_text_detection_results[i], original_texts[j]): match = True if not match: - unprocessed_crf_original_texts.append(crf_original_texts[i]) + unprocessed_crf_original_texts.append(free_text_detection_results[i]) unprocessed_crf_original_texts_verified = self._add_verification_source( values=unprocessed_crf_original_texts, - verification_source_dict={DATASTORE_VERIFIED: False, CRF_MODEL_VERIFIED: True}) + verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) combined_values.extend(unprocessed_crf_original_texts_verified) combined_original_texts.extend(unprocessed_crf_original_texts) return combined_values, combined_original_texts - def detect(self, 
message=None, structured_value=None, fallback_value=None, **kwargs): + def detect(self, message=None, structured_value=None, fallback_value=None, + free_text_detection_results=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: @@ -220,6 +223,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa (For example, UI elements like form, payload, etc) fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. + free_text_detection_results(list of str): prior detection results from models like CRF etc. bot_message (str): previous message from a bot/agent. Returns: dict or None: dictionary containing entity_value, original_text and detection; @@ -277,7 +281,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa text = structured_value if structured_value else message # Prior results from detection. 
- free_text_detection_results = kwargs.get("free_text_detection_results", []) + if free_text_detection_results: + free_text_detection_results = [] entity_list, original_text_list = self.detect_entity(text=text, free_text_detection_results=free_text_detection_results) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index d6694fc72..181de07c4 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -118,12 +118,14 @@ def get_name_using_pos_tagger(self, text): pattern1 = re.compile(r"name\s*(is|)\s*([\w\s]+)") pattern2 = re.compile(r"myself\s+([\w\s]+)") pattern3 = re.compile(r"call\s+me\s+([\w\s]+)") + pattern4 = re.compile(r"i\s+am\s+([\w\s]+)") name_tokens = text.split() # Passing empty tokens to tag will cause IndexError tagged_names = pos_tagger_object.tag(name_tokens) pattern1_match = pattern1.findall(text) pattern2_match = pattern2.findall(text) pattern3_match = pattern3.findall(text) + pattern4_match = pattern4.findall(text) is_question = [word[0] for word in tagged_names if word[1].startswith('WR') or word[1].startswith('WP') or word[1].startswith('CD')] @@ -139,6 +141,9 @@ def get_name_using_pos_tagger(self, text): elif pattern3_match: entity_value, original_text = self.get_format_name(pattern3_match[0].split(), self.text) + elif pattern4_match: + entity_value, original_text = self.get_format_name(pattern4_match[0].split(), self.text) + elif len(name_tokens) < 4: pos_words = [word[0] for word in tagged_names if word[1].startswith('NN') or word[1].startswith('JJ')] @@ -147,7 +152,7 @@ def get_name_using_pos_tagger(self, text): return entity_value, original_text - def detect_entity(self, text, bot_message=None, **kwargs): + def detect_entity(self, text, bot_message=None, free_text_detection_results=None, **kwargs): """ Takes text as input and returns two lists 1.entity_value in the form of first, middle and last names @@ -155,6 +160,7 @@ 
def detect_entity(self, text, bot_message=None, **kwargs): Args: text(string): the original text bot_message(string): previous bot message + free_text_detection_results(list of str): detected values from prior detection Example: text=my name is yash doshi @@ -164,7 +170,6 @@ def detect_entity(self, text, bot_message=None, **kwargs): self.text = text self.tagged_text = self.text - free_text_detection_results = kwargs.get("free_text_detection_results") entity_value, original_text = ([], []) @@ -279,13 +284,23 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): replaced_text_ = nltk_tokenizer.tokenize(text.lower()) replaced_text = [] for index, token in enumerate(replaced_text_): + # Fix to handle tokenizer error for tokens with trailing `.`. For eg. + # >> text = "my name is v.k. singh" + # >> tokens = tokenize(text) + # >> tokens + # ["my", "name", "is", "v.k", ".", "singh"] + # this extra `.` token causes problem while training. if token == "." and 0 < index < len(replaced_text_) - 1 and replaced_text[-1] + "." in text.lower(): replaced_text[-1] = replaced_text[-1] + "." else: + # fix to handle examples like `miami,21st street` where tokenizer gives ["miami,21st", "street"]. + # This causes problems while tagging entities according indices. + # For eg is miami is an entity and its indices are (0,5) then due to this extra `,` tagging will be + # problem because now length of token will become 6 not 5. 
_token = token.strip('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~') if not _token: _token = token - replaced_text.append(token) + replaced_text.append(_token) else: replaced_text = text.lower().strip().split() diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index e5cafb694..f6df44a72 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -277,7 +277,7 @@ def _get_tokens_and_indices(txt): return u' '.join(matched_tokens) - def detect_entity_bulk(self, texts, **kwargs): + def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of list of detected text entities and their corresponding original substrings @@ -288,6 +288,7 @@ def detect_entity_bulk(self, texts, **kwargs): Args: texts (list): list of strings(bulk detect) to extract textual entities from + free_text_detection_results(list of list of str): results from prior detection. **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: @@ -330,7 +331,8 @@ def detect_entity_bulk(self, texts, **kwargs): """ # For bulk detection free_text_detection_results will be a list of list of str - free_text_detection_results = kwargs.get("free_text_detection_results", []) + if free_text_detection_results: + free_text_detection_results = [] self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() @@ -338,22 +340,22 @@ def detect_entity_bulk(self, texts, **kwargs): # for that index use combine_results to merge the results from free_text and detection. 
combined_entity_values, combined_original_texts = [], [] - for i, (values, original_texts, free_text_detection_results_) in enumerate( + for i, (values, original_texts, inner_free_text_detection_results_) in enumerate( six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): - if free_text_detection_results_: - combined_entity_values_, combined_original_texts_ = self.combine_results( + if inner_free_text_detection_results_: + inner_combined_entity_values, inner_combined_original_texts = self.combine_results( values=values, original_texts=original_texts, - crf_original_texts=free_text_detection_results_) - combined_entity_values.append(combined_entity_values_) - combined_original_texts.append(combined_original_texts_) + free_text_detection_results=inner_free_text_detection_results_) + combined_entity_values.append(inner_combined_entity_values) + combined_original_texts.append(inner_combined_original_texts) else: combined_entity_values.append(values) combined_original_texts.append(original_texts) return text_entity_values_list, original_texts_list - def detect_entity(self, text, **kwargs): + def detect_entity(self, text, free_text_detection_results=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of detected text entities and their corresponding original substrings in text respectively. @@ -362,6 +364,7 @@ def detect_entity(self, text, **kwargs): is returned. For more information on how data is stored, see Datastore docs. Args: text (unicode): string to extract textual entities from + free_text_detection_results(list of str): prior detection results **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. 
Returns: tuple: @@ -391,7 +394,8 @@ def detect_entity(self, text, **kwargs): # For single message detection free_text_detection_results will be a list of str # if present use combine_results to merge the results. - free_text_detection_results = kwargs.get("free_text_detection_results", []) + if free_text_detection_results is None: + free_text_detection_results = [] values, texts = [], [] if len(text_entity_values) > 0 and len(original_texts) > 0: @@ -401,11 +405,11 @@ def detect_entity(self, text, **kwargs): ner_logger.info("prior detection results - {}".format(free_text_detection_results)) if free_text_detection_results: - ner_logger.info("combining results") + ner_logger.info("combining results for {0}, {1}, {2}".format(values, texts, free_text_detection_results)) text_entity_verified_values, original_texts = self.combine_results( values=values, original_texts=texts, - crf_original_texts=free_text_detection_results) + free_text_detection_results=free_text_detection_results) return text_entity_verified_values, original_texts return values, texts diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 6d6825205..96a67c2c7 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -1,9 +1,5 @@ -import re - from language_utilities.constant import ENGLISH_LANG from models.crf_v2.crf_detect_entity import CrfDetection -from ner_constants import ENTITY_VALUE_DICT_KEY -from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED from ner_v1.detectors.textual.text.text_detection import TextDetector import six @@ -39,7 +35,7 @@ def __init__(self, self.read_embeddings_from_remote_url = read_embeddings_from_remote_url self.live_crf_model_path = live_crf_model_path - def detect_entity(self, text, **kwargs): + def detect_entity(self, text, free_text_detection_results=None, **kwargs): """ Detects all textual entities in text that 
are similar to variants of 'entity_name' stored in the datastore and returns two lists of detected text entities and their corresponding original substrings in text respectively. @@ -50,6 +46,7 @@ def detect_entity(self, text, **kwargs): In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. Args: text (str or unicode): string to extract textual entities from + free_text_detection_results(list of str): list of previous detected values **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: @@ -92,20 +89,20 @@ def detect_entity(self, text, **kwargs): # Access free_text_detection_results(list of str). # If present replace crf_original_texts with free_text_detection_results. # Call combine results to .combine_results() from dictionary detection and free_text_detection_results. - free_text_detection_results = kwargs.get("free_text_detection_results", []) - if free_text_detection_results: - crf_original_texts = free_text_detection_results + if free_text_detection_results is None: + free_text_detection_results = [] values, original_texts = super(TextModelDetector, self).detect_entity(text, **kwargs) - text_entity_verified_values, original_texts = self.combine_results(values=values, - original_texts=original_texts, - crf_original_texts=crf_original_texts) + text_entity_verified_values, original_texts = \ + self.combine_results(values=values, + original_texts=original_texts, + free_text_detection_results=free_text_detection_results) self.text_entity_values, self.original_texts = text_entity_verified_values, original_texts return self.text_entity_values, self.original_texts - def detect_entity_bulk(self, texts, **kwargs): + def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of list of detected 
text entities and their corresponding original substrings @@ -118,6 +115,7 @@ def detect_entity_bulk(self, texts, **kwargs): Args: texts (list of strings): natural language sentence(s) to extract entities from + free_text_detection_results(list of lists of str): values from previous detection **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: @@ -171,19 +169,20 @@ def detect_entity_bulk(self, texts, **kwargs): # Access free_text_detection_results(list of lists). # If present replace crf_original_texts with free_text_detection_results. # Call .combine_results() to combine results from dictionary detection and free_text_detection_results. - free_text_detection_results = kwargs.get("free_text_detection_results") - if free_text_detection_results: - crf_original_texts = free_text_detection_results + if free_text_detection_results is None: + free_text_detection_results = [] values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk(texts, **kwargs) text_entity_values_list, original_texts_detected_list = [], [] - for inner_crf_original_texts, inner_values, inner_original_texts in six.moves.zip_longest(crf_original_texts, - values_list, - original_texts_list): + for inner_free_text_detection_results, inner_values, inner_original_texts in six.moves.zip_longest( + free_text_detection_results, + values_list, + original_texts_list): text_entity_verified_values, original_texts = \ - self.combine_results(values=inner_values, original_texts=inner_original_texts, - crf_original_texts=inner_crf_original_texts if inner_crf_original_texts else []) + self.combine_results( + values=inner_values, original_texts=inner_original_texts, + free_text_detection_results=inner_free_text_detection_results if inner_free_text_detection_results else []) text_entity_values_list.append(text_entity_verified_values) original_texts_detected_list.append(original_texts) return text_entity_values_list, 
original_texts_detected_list From 96feda22ca35b3ce9df43bb86241a71767fa06c5 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 15:03:48 +0530 Subject: [PATCH 50/83] review changes --- ner_v1/chatbot/entity_detection.py | 6 ++++-- ner_v1/detectors/textual/text/text_detection_model.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 9b47dc621..405b6d976 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -92,7 +92,8 @@ """ -def get_text(message, entity_name, structured_value, fallback_value, bot_message, language=ENGLISH_LANG, **kwargs): +def get_text(message, entity_name, structured_value, fallback_value, bot_message, language=ENGLISH_LANG, + free_text_detection_results=None, **kwargs): """Use TextDetector (datastore/elasticsearch) to detect textual entities Args: @@ -234,7 +235,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message read_model_from_s3 = kwargs.get('read_model_from_s3', False) read_embeddings_from_remote_url = kwargs.get('read_embeddings_from_remote_url', False) - free_text_detection_results = kwargs.get("free_text_detection_results", []) + if free_text_detection_results is None: + free_text_detection_results = [] text_model_detector = TextModelDetector(entity_name=entity_name, language=language, diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 96a67c2c7..04196e75f 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -92,7 +92,8 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): if free_text_detection_results is None: free_text_detection_results = [] - values, original_texts = super(TextModelDetector, self).detect_entity(text, **kwargs) + values, original_texts = 
super(TextModelDetector, self).detect_entity( + text, free_text_detection_results=free_text_detection_results, **kwargs) text_entity_verified_values, original_texts = \ self.combine_results(values=values, @@ -182,7 +183,8 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): text_entity_verified_values, original_texts = \ self.combine_results( values=inner_values, original_texts=inner_original_texts, - free_text_detection_results=inner_free_text_detection_results if inner_free_text_detection_results else []) + free_text_detection_results=inner_free_text_detection_results if + inner_free_text_detection_results else []) text_entity_values_list.append(text_entity_verified_values) original_texts_detected_list.append(original_texts) return text_entity_values_list, original_texts_detected_list From 7b28bb7330c177bbd5a4eda9c7c83c715a418d07 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 15:07:03 +0530 Subject: [PATCH 51/83] review changes --- ner_v1/detectors/textual/text/text_detection_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index 04196e75f..c9e7295ae 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -173,7 +173,8 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): if free_text_detection_results is None: free_text_detection_results = [] - values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk(texts, **kwargs) + values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk( + texts, free_text_detection_results=free_text_detection_results, **kwargs) text_entity_values_list, original_texts_detected_list = [], [] for inner_free_text_detection_results, inner_values, inner_original_texts in six.moves.zip_longest( From 
c050e3e3ed4e908642283f718dccc11b52086ce0 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 15:11:23 +0530 Subject: [PATCH 52/83] fix in .detect in BaseDetector --- ner_v1/detectors/base_detector.py | 2 +- ner_v1/detectors/textual/text/text_detection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 03fb3298c..3dc0f94e0 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -281,7 +281,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, text = structured_value if structured_value else message # Prior results from detection. - if free_text_detection_results: + if free_text_detection_results is None: free_text_detection_results = [] entity_list, original_text_list = self.detect_entity(text=text, free_text_detection_results=free_text_detection_results) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index f6df44a72..6adab9b6a 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -331,7 +331,7 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): """ # For bulk detection free_text_detection_results will be a list of list of str - if free_text_detection_results: + if free_text_detection_results is None: free_text_detection_results = [] self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() From 02149d1e7a1085a81a350eb11fdfd273dc228a2e Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 15:19:45 +0530 Subject: [PATCH 53/83] review changes- rename replace_text with replaced_text_tokens --- ner_v1/detectors/textual/name/name_detection.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py 
b/ner_v1/detectors/textual/name/name_detection.py index 181de07c4..76a68feac 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -328,17 +328,17 @@ def replace_detected_text(self, text_detection_result, text): ['my', 'name', 'is', '_yash_', '_doshi_'] """ - replaced_text = [] + replaced_text_tokens = [] if self.language == ENGLISH_LANG: - replaced_text = nltk_tokenizer.tokenize(text.lower()) + replaced_text_tokens = nltk_tokenizer.tokenize(text.lower()) elif self.language == HINDI_LANG: - replaced_text = text.lower().strip().split() + replaced_text_tokens = text.lower().strip().split() for detected_original_text in (text_detection_result[1]): - for j in range(len(replaced_text)): - replaced_text[j] = replaced_text[j].replace(detected_original_text, "_" + detected_original_text + "_") + for j in range(len(replaced_text_tokens)): + replaced_text_tokens[j] = replaced_text_tokens[j].replace(detected_original_text, "_" + detected_original_text + "_") - return replaced_text + return replaced_text_tokens def detect_person_name_entity(self, replaced_text): """ From 5c0e3b33d87f36f0bc1103e96481b504af625d7d Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 15:22:46 +0530 Subject: [PATCH 54/83] lint error fix --- .../detectors/textual/name/name_detection.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 76a68feac..a6bf9e821 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -281,17 +281,18 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): """ if self.language == ENGLISH_LANG: - replaced_text_ = nltk_tokenizer.tokenize(text.lower()) - replaced_text = [] - for index, token in enumerate(replaced_text_): + replaced_original_text_tokens = 
nltk_tokenizer.tokenize(text.lower()) + replaced_text_tokens = [] + for index, token in enumerate(replaced_original_text_tokens): # Fix to handle tokenizer error for tokens with trailing `.`. For eg. # >> text = "my name is v.k. singh" # >> tokens = tokenize(text) # >> tokens # ["my", "name", "is", "v.k", ".", "singh"] # this extra `.` token causes problem while training. - if token == "." and 0 < index < len(replaced_text_) - 1 and replaced_text[-1] + "." in text.lower(): - replaced_text[-1] = replaced_text[-1] + "." + if token == "." and 0 < index < len(replaced_original_text_tokens) - 1 \ + and replaced_text_tokens[-1] + "." in text.lower(): + replaced_text_tokens[-1] = replaced_text_tokens[-1] + "." else: # fix to handle examples like `miami,21st street` where tokenizer gives ["miami,21st", "street"]. # This causes problems while tagging entities according indices. @@ -300,17 +301,17 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): _token = token.strip('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~') if not _token: _token = token - replaced_text.append(_token) + replaced_text_tokens.append(_token) else: - replaced_text = text.lower().strip().split() + replaced_text_tokens = text.lower().strip().split() for name in free_text_detection_results: name_tokens = name.split() for token in name_tokens: - for j in range(len(replaced_text)): - replaced_text[j] = replaced_text[j].replace(token, "_" + token + "_") + for j in range(len(replaced_text_tokens)): + replaced_text_tokens[j] = replaced_text_tokens[j].replace(token, "_" + token + "_") - return replaced_text + return replaced_text_tokens def replace_detected_text(self, text_detection_result, text): """ From f19fd62bc85fcb28ca68883d824e3b30bc3a7467 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 22 Nov 2019 16:07:37 +0530 Subject: [PATCH 55/83] change parameter name PARAMETER_PRIOR_CRF_RESULTS to PARAMETER_PRIOR_RESULTS --- ner_constants.py | 2 +- ner_v1/api.py | 12 ++++++------ 
ner_v1/chatbot/entity_detection.py | 13 ++++++++----- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/ner_constants.py b/ner_constants.py index d620c03d3..ca5effd20 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -58,7 +58,7 @@ PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' # Prior detection results from CRF models. -PARAMETER_PRIOR_CRF_RESULTS = "free_text_detection_results" +PARAMETER_PRIOR_RESULTS = "free_text_detection_results" # Locale for Date and Phone Number detection PARAMETER_LOCALE = 'locale' diff --git a/ner_v1/api.py b/ner_v1/api.py index 763e010d5..fb12cf041 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -10,7 +10,7 @@ from language_utilities.constant import ENGLISH_LANG from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX, - PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_CRF_RESULTS) + PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS) from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr, @@ -71,7 +71,7 @@ def get_parameters_dictionary(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: json.loads(request.GET.get("crf_results", '[]')) + PARAMETER_PRIOR_RESULTS: json.loads(request.GET.get("crf_results", '[]')) } ner_logger.info("parameters dict - {}".format(parameters_dict)) return parameters_dict @@ -106,7 +106,7 @@ def parse_post_request(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), 
PARAMETER_READ_MODEL_FROM_S3: to_bool(request_data.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), - PARAMETER_PRIOR_CRF_RESULTS: request_data.get("crf_results", []) + PARAMETER_PRIOR_RESULTS: request_data.get("crf_results", []) } return parameters_dict @@ -250,7 +250,7 @@ def text(request): live_crf_model_path=parameters_dict[PARAMETER_LIVE_CRF_MODEL_PATH], read_model_from_s3=parameters_dict[PARAMETER_READ_MODEL_FROM_S3], read_embeddings_from_remote_url=parameters_dict[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS] + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS] ) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -273,7 +273,7 @@ def location(request): parameters_dict[PARAMETER_STRUCTURED_VALUE], parameters_dict[PARAMETER_FALLBACK_VALUE], parameters_dict[PARAMETER_BOT_MESSAGE], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS]) + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for location: %s ' % e) @@ -367,7 +367,7 @@ def person_name(request): fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_CRF_RESULTS]) + free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for person_name: %s ' % e) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 405b6d976..6e6f483c4 100644 --- 
a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -266,7 +266,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message return entity_output -def get_location(message, entity_name, structured_value, fallback_value, bot_message, **kwargs): +def get_location(message, entity_name, structured_value, fallback_value, bot_message, + free_text_detection_results=None, **kwargs): """"Use TextDetector (elasticsearch) to detect location TODO: We can improve this by creating separate for location detection instead of using TextDetector @@ -282,6 +283,7 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. + free_text_detection_results(list of str): prior detection results from models like crf etc. Returns: @@ -302,7 +304,7 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes >> [{'detection': 'message', 'entity_value': {'value': 'Andheri West'}, 'language': 'en', 'original_text': 'andheri west'}] """ - free_text_detection_results = kwargs.get("free_text_detection_results", []) + free_text_detection_results = free_text_detection_results or [] text_detection = TextDetector(entity_name=entity_name) return text_detection.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message, free_text_detection_results=free_text_detection_results) @@ -498,7 +500,6 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message """ - # free_text_detection_results = kwargs.get("free_text_detection_results", []) city_detection = CityDetector(entity_name=entity_name, language=language) city_detection.set_bot_message(bot_message=bot_message) if structured_value: @@ -528,7 +529,7 @@ def 
get_city(message, entity_name, structured_value, fallback_value, bot_message def get_person_name(message, entity_name, structured_value, fallback_value, bot_message, - language=ENGLISH_LANG, **kwargs): + language=ENGLISH_LANG, free_text_detection_results=None, **kwargs): """Use NameDetector to detect names Args: @@ -543,6 +544,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. language (str): ISO 639-1 code of language of message + free_text_detection_results(list of str): prior detection results from models like crf etc. Returns: @@ -561,7 +563,8 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ 'entity_value': {'first_name': yash, 'middle_name': None, 'last_name': doshi}}] """ # TODO refactor NameDetector to make this easy to read and use - free_text_detection_results = kwargs.get("free_text_detection_results", []) + free_text_detection_results = free_text_detection_results or [] + name_detection = NameDetector(entity_name=entity_name, language=language) text, detection_method, fallback_text, fallback_method = (structured_value, FROM_STRUCTURE_VALUE_VERIFIED, From fe439ac0798023c717c73b9eed419d53243f91dd Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 12 Dec 2019 16:37:01 +0530 Subject: [PATCH 56/83] add detection method in person_name for free text entities --- ner_v1/chatbot/entity_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 6e6f483c4..e54be5c83 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -235,8 +235,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message read_model_from_s3 = kwargs.get('read_model_from_s3', False) read_embeddings_from_remote_url = 
kwargs.get('read_embeddings_from_remote_url', False) - if free_text_detection_results is None: - free_text_detection_results = [] + free_text_detection_results = free_text_detection_results or [] text_model_detector = TextModelDetector(entity_name=entity_name, language=language, @@ -588,6 +587,8 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ detection_method = fallback_method if entity_list and original_text_list: + if free_text_detection_results: + detection_method = ["free text entity"] return output_entity_dict_list(entity_list, original_text_list, detection_method) return None From b0edf8f130ce98382b1a05bd3b12314f28b3f4fc Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 12 Dec 2019 16:43:31 +0530 Subject: [PATCH 57/83] add detection method in person_name for free text entities --- ner_v1/chatbot/entity_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index e54be5c83..389ac4a2b 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -588,7 +588,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ if entity_list and original_text_list: if free_text_detection_results: - detection_method = ["free text entity"] + detection_method = "free text entity" return output_entity_dict_list(entity_list, original_text_list, detection_method) return None From 55781ae72e26963979abacfe86f78c5e7231750f Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Tue, 17 Dec 2019 14:50:21 +0530 Subject: [PATCH 58/83] add previous message sub-strings for NAME_VARIATIONS --- ner_v1/detectors/textual/name/hindi_const.py | 25 +++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py index 7651d42ff..f3c066c9b 100644 --- 
a/ner_v1/detectors/textual/name/hindi_const.py +++ b/ner_v1/detectors/textual/name/hindi_const.py @@ -300,7 +300,30 @@ HINDI_QUESTIONWORDS = [u'क्या', u'कब', u'कहा', u'क्यों', u'कौन', u'कौन', u'जिसे', u'जिसका', u'कैसे', u'कितने'] # Variants in "name" to check for previous context flag -NAME_VARIATIONS = ['name', u'नाम'] +NAME_VARIATIONS = ["enter your full name again", + "what's your name", + "mention your name ", + "provide your name ", + "help me with your name", + "What's your full name?", + "Here is the menu!", + "forgot to mention your name", + "please help me with your full name", + "please let me know your full name.", + "please enter your name", + "help me with your full name", + "looks like you forgot to mention your name", + "enter your name", + "share your name", + "know your name", + "tell me your name", + "tell your name", + "what should in call you", + "say your name", + "call you", + "address you", + "your name", + "your full name"] # Common hindi words occuring in context to a name COMMON_HINDI_WORDS_OCCURING_WITH_NAME = {u"मुझे", From b4cbec29063637f18b433934ad1cce52e4799f0b Mon Sep 17 00:00:00 2001 From: ranvijayj Date: Tue, 17 Dec 2019 16:40:46 +0530 Subject: [PATCH 59/83] Update requirements for security Vulnerability Update requirements for security Vulnerability High CVE Security --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0e474b44a..d0d1f37fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,12 @@ phonenumberslite==8.10.18 six==1.11.0 gunicorn==19.6.0 pytz==2014.2 -nltk==3.2.5 +nltk==3.4.5 numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -Django==1.11.22 +Django==1.11.26 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 From 286a1cf96149edf857d6385ff9de964db73a983c Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Wed, 18 Dec 2019 12:10:25 +0530 Subject: [PATCH 60/83] Update Dockerfile, env example 
file and documentation lines --- chatbot_ner/config.py | 26 +++++++++++--------------- config.example | 34 +++++++++++++--------------------- docker/Dockerfile | 5 ++--- docs/install.md | 2 ++ 4 files changed, 28 insertions(+), 39 deletions(-) diff --git a/chatbot_ner/config.py b/chatbot_ner/config.py index 6eb89cd80..45454bf40 100644 --- a/chatbot_ner/config.py +++ b/chatbot_ner/config.py @@ -104,7 +104,6 @@ WORD_EMBEDDING_REMOTE_URL = os.environ.get('WORD_EMBEDDING_REMOTE_URL') - GOOGLE_TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY') if not GOOGLE_TRANSLATE_API_KEY: @@ -144,26 +143,23 @@ else: CHATBOT_NER_DATASTORE['elasticsearch']['doc_type'] = 'data_dictionary' -ES_AWS_SECRET_ACCESS_KEY = os.environ.get('ES_AWS_SECRET_ACCESS_KEY') -ES_AWS_ACCESS_KEY_ID = os.environ.get('ES_AWS_ACCESS_KEY_ID') -ES_AWS_REGION = os.environ.get('ES_AWS_REGION') ES_AWS_SERVICE = os.environ.get('ES_AWS_SERVICE') +ES_AWS_REGION = os.environ.get('ES_AWS_REGION') +ES_AWS_ACCESS_KEY_ID = os.environ.get('ES_AWS_ACCESS_KEY_ID') +ES_AWS_SECRET_ACCESS_KEY = os.environ.get('ES_AWS_SECRET_ACCESS_KEY') -if not ES_AWS_SERVICE: - ES_AWS_SERVICE = 'es' - -if ES_AWS_ACCESS_KEY_ID and ES_AWS_SECRET_ACCESS_KEY and ES_AWS_REGION and ES_AWS_SERVICE: - CHATBOT_NER_DATASTORE['elasticsearch']['http_auth'] = AWS4Auth(ES_AWS_ACCESS_KEY_ID, ES_AWS_SECRET_ACCESS_KEY, - ES_AWS_REGION, ES_AWS_SERVICE) - CHATBOT_NER_DATASTORE['elasticsearch']['use_ssl'] = True - CHATBOT_NER_DATASTORE['elasticsearch']['verify_certs'] = True - CHATBOT_NER_DATASTORE['elasticsearch']['connection_class'] = RequestsHttpConnection -elif ES_AWS_REGION and ES_AWS_SERVICE: +if ES_AWS_SERVICE and ES_AWS_REGION: + ner_logger.info('`ES_AWS_SERVICE` and `ES_AWS_REGION` are set. 
Using AWS Elasticsearch settings ') CHATBOT_NER_DATASTORE['elasticsearch']['use_ssl'] = True CHATBOT_NER_DATASTORE['elasticsearch']['verify_certs'] = True CHATBOT_NER_DATASTORE['elasticsearch']['connection_class'] = RequestsHttpConnection + if ES_AWS_ACCESS_KEY_ID and ES_AWS_SECRET_ACCESS_KEY: + CHATBOT_NER_DATASTORE['elasticsearch']['http_auth'] = AWS4Auth(ES_AWS_ACCESS_KEY_ID, + ES_AWS_SECRET_ACCESS_KEY, + ES_AWS_REGION, ES_AWS_SERVICE) else: - ner_logger.warning('Elasticsearch: Some or all AWS settings missing from environment, this will skip AWS auth!') + ner_logger.warning('`ES_AWS_SERVICE` and `ES_AWS_REGION` are not set. ' + 'This is not a problem if you are using self hosted ES') # Model Vars if os.path.exists(MODEL_CONFIG_PATH): diff --git a/config.example b/config.example index 1c315a9d6..ae5d04a04 100644 --- a/config.example +++ b/config.example @@ -1,16 +1,13 @@ # This is config.example file for chatbot_ner module similar to .env.example file to hold settings # Copy it to a file named config and fill in all the values. # Never push your personal keys and passwords to any public repository! -# Make sure the variables in this file are in the environment. Example: -# $ source chatbot_ner/config # Please don't add spaces around '=' -# This is the primary engine to use. Valid values are one of the following: -# elasticsearch +# This is the primary engine to use. Valid values are one of the following: ['elasticsearch'] ENGINE=elasticsearch -# ES prefixed values correspond to settings for elasticsearch. +# ES prefixed variables correspond to settings for elasticsearch. # ES_URL is the complete url with auth name and password required to connect. 
If provided, this will override ES_HOST, # ES_PORT, ES_AUTH_NAME, ES_AUTH_PASSWORD # ES_HOST by default is host for ES that comes up with compose @@ -22,6 +19,15 @@ ES_URL= ES_PORT=9200 ES_INDEX_NAME=entity_data ES_DOC_TYPE=data_dictionary +# ES_BULK_MSG_SIZE is an integer value +ES_BULK_MSG_SIZE=1000 +# ES_SEARCH_SIZE is an integer value +ES_SEARCH_SIZE=10000 +# Provide the following values if you need AWS authentication +ES_AWS_SERVICE= +ES_AWS_REGION= +ES_AWS_ACCESS_KEY_ID= +ES_AWS_SECRET_ACCESS_KEY= NAME=chatbot_ner DJANGODIR=/app @@ -31,25 +37,11 @@ DJANGO_SETTINGS_MODULE=chatbot_ner.settings DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py DJANGO_LOG_LEVEL=debug DJANGO_DEBUG=False +# Important: Change the value of SECRET_KEY to something else and keep it secret +SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c PORT=8081 TIMEOUT=600 - CITY_MODEL_TYPE=crf CITY_MODEL_PATH= -GOOGLE_TRANSLATE_API_KEY= - - -# ES_BULK_MSG_SIZE is an integer value -ES_BULK_MSG_SIZE=1000 - -# ES_SEARCH_SIZE is an integer value -ES_SEARCH_SIZE=10000 - -# Provide the following values if you need AWS authentication -ES_AWS_SECRET_ACCESS_KEY= -ES_AWS_ACCESS_KEY_ID= -ES_AWS_REGION= -ES_AWS_SERVICE= - # In order to enable entity detection for multiple languages, we use google translate. 
Please enter the key(optional) GOOGLE_TRANSLATE_API_KEY= diff --git a/docker/Dockerfile b/docker/Dockerfile index c99bf3a1d..0e21311c0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -31,9 +31,8 @@ ENV PORT=8081 ENV TIMEOUT=600 ENV DEBIAN_FRONTEND=noninteractive - -#ENV DATE_MODEL_TYPE=crf -#ENV DATE_MODEL_PATH=/root/models/models_live/date/crf/model.crf +# Important change this via .env (the file copied from config.example) +ENV SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c EXPOSE 8081 diff --git a/docs/install.md b/docs/install.md index 3c9a5f822..46604f530 100644 --- a/docs/install.md +++ b/docs/install.md @@ -56,6 +56,8 @@ cd docker docker-compose up --build -d ``` +Open `docker/.env` file and edit the environment variables if needed. (You should change the SECRET_KEY). + The above will also mount local repo root directory inside the containers /app directory. Please wait 5 seconds to run the first curl or do an API call to chatbot_ner. > **NOTE**: make sure that nothing is running on port 8081 on your server or your local environment. 
From 1d734afddeb0bdc89d58b220f19920ae0dbc2354 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Wed, 18 Dec 2019 12:25:13 +0530 Subject: [PATCH 61/83] Cleanup some conditions in config.py --- chatbot_ner/config.py | 32 +++++++++----------------------- chatbot_ner/settings.py | 2 +- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/chatbot_ner/config.py b/chatbot_ner/config.py index 45454bf40..f1f0e9d0a 100644 --- a/chatbot_ner/config.py +++ b/chatbot_ner/config.py @@ -6,16 +6,15 @@ from requests_aws4auth import AWS4Auth BASE_DIR = os.path.dirname(os.path.dirname(__file__)) -CONFIG_PATH = os.path.join(BASE_DIR, 'config') MODEL_CONFIG_PATH = os.path.join(BASE_DIR, 'model_config') LOG_PATH = os.path.join(BASE_DIR, 'logs') + +# TODO: Set this up via Django LOGGING # SET UP NER LOGGING if not os.path.exists(LOG_PATH): os.makedirs(LOG_PATH) -# LOGGING -# TODO - Make this much generic & simpler in the future LOG_LEVEL = os.environ.get('DJANGO_LOG_LEVEL', 'error').upper() # Common formatter @@ -48,25 +47,18 @@ nlp_logger.addHandler(handler) nlp_logger.addHandler(handler_stdout) -if os.path.exists(CONFIG_PATH): - dotenv.read_dotenv(CONFIG_PATH) -else: - ner_logger.debug('Warning: no file named "config" found at %s. This is not a problem if your ' - 'datastore(elasticsearch) connection settings are already available in the environment', - CONFIG_PATH) - -# TODO Consider prefixing everything config with NER_ because these names are in the environment and so are -# TODO lot of others too which may conflict in name. 
Example user is already using some another instance of -# TODO Elasticsearch for other purposes ENGINE = os.environ.get('ENGINE') if ENGINE: ENGINE = ENGINE.lower() +else: + ner_logger.warning("`ENGINE` variable is not set, Text type entities won't work without it") + # ES settings (Mandatory to use Text type entities) ES_URL = os.environ.get('ES_URL') ES_HOST = os.environ.get('ES_HOST') ES_PORT = os.environ.get('ES_PORT') ES_INDEX_NAME = os.environ.get('ES_INDEX_NAME') -ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE') +ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE', 'data_dictionary') ES_AUTH_NAME = os.environ.get('ES_AUTH_NAME') ES_AUTH_PASSWORD = os.environ.get('ES_AUTH_PASSWORD') ES_BULK_MSG_SIZE = os.environ.get('ES_BULK_MSG_SIZE', '10000') @@ -81,8 +73,8 @@ ES_BULK_MSG_SIZE = int(ES_BULK_MSG_SIZE) ES_SEARCH_SIZE = int(ES_SEARCH_SIZE) except ValueError: - ES_BULK_MSG_SIZE = 10000 - ES_SEARCH_SIZE = 10000 + ES_BULK_MSG_SIZE = 1000 + ES_SEARCH_SIZE = 1000 # Optional Vars ES_INDEX_1 = os.environ.get('ES_INDEX_1') @@ -101,9 +93,7 @@ # Crf Model Specific with additional AWS storage (optional) CRF_MODEL_S3_BUCKET_NAME = os.environ.get('CRF_MODEL_S3_BUCKET_NAME') CRF_MODEL_S3_BUCKET_REGION = os.environ.get('CRF_MODEL_S3_BUCKET_REGION') - WORD_EMBEDDING_REMOTE_URL = os.environ.get('WORD_EMBEDDING_REMOTE_URL') - GOOGLE_TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY') if not GOOGLE_TRANSLATE_API_KEY: @@ -115,6 +105,7 @@ 'elasticsearch': { 'connection_url': ES_URL, # Elastic Search URL 'name': ES_INDEX_NAME, # Index name used + 'doc_type': ES_DOC_TYPE, # Index's doc type 'host': ES_HOST, # Elastic Search Host 'port': ES_PORT, # Port of elastic search 'user': ES_AUTH_NAME, @@ -138,11 +129,6 @@ } } -if ES_DOC_TYPE: - CHATBOT_NER_DATASTORE['elasticsearch']['doc_type'] = ES_DOC_TYPE -else: - CHATBOT_NER_DATASTORE['elasticsearch']['doc_type'] = 'data_dictionary' - ES_AWS_SERVICE = os.environ.get('ES_AWS_SERVICE') ES_AWS_REGION = os.environ.get('ES_AWS_REGION') 
ES_AWS_ACCESS_KEY_ID = os.environ.get('ES_AWS_ACCESS_KEY_ID') diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py index 603e1a384..de8e91654 100755 --- a/chatbot_ner/settings.py +++ b/chatbot_ner/settings.py @@ -21,7 +21,7 @@ SECRET_KEY = os.environ.get('SECRET_KEY') # SECURITY WARNING: don't run with debug turned on in production! -_dj_debug = os.environ.get('DJANGO_DEBUG') +_dj_debug = os.environ.get('DJANGO_DEBUG', 'false') DEBUG = (_dj_debug and _dj_debug.lower() == 'true') TEMPLATE_DEBUG = False From 0283b45fe99f2794c7b84b0533a60b6491992e8a Mon Sep 17 00:00:00 2001 From: Pratik Date: Thu, 19 Dec 2019 12:12:16 +0530 Subject: [PATCH 62/83] Update install.md Changes to docker compose install command --- docs/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install.md b/docs/install.md index 3c9a5f822..fbd602c7d 100644 --- a/docs/install.md +++ b/docs/install.md @@ -39,7 +39,7 @@ Following are the steps to create the Docker image and run NER with Docker. 
Docker Compose ```shell - sudo curl -L https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose + sudo curl -L "https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose ``` - MacOS: From fc7cc37c3a138ee919c21ed8bcc9e17f41617130 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 23 Dec 2019 12:02:00 +0530 Subject: [PATCH 63/83] add hindi variant in --- ner_v1/detectors/textual/name/hindi_const.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py index f3c066c9b..b4401c685 100644 --- a/ner_v1/detectors/textual/name/hindi_const.py +++ b/ner_v1/detectors/textual/name/hindi_const.py @@ -323,7 +323,8 @@ "call you", "address you", "your name", - "your full name"] + "your full name", + u"नाम"] # Common hindi words occuring in context to a name COMMON_HINDI_WORDS_OCCURING_WITH_NAME = {u"मुझे", From c87ea7b9cefe301b148b50a1fc597fd38a3501e3 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 2 Jan 2020 16:31:41 +0530 Subject: [PATCH 64/83] add variant in previous bot messages --- ner_v1/detectors/textual/name/hindi_const.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py index b4401c685..03cafb0f2 100644 --- a/ner_v1/detectors/textual/name/hindi_const.py +++ b/ner_v1/detectors/textual/name/hindi_const.py @@ -324,6 +324,8 @@ "address you", "your name", "your full name", + "what is your name", + "is your name", u"नाम"] # Common hindi words occuring in context to a name From f35e69764fe24454ecc3e6c0a40a3c5d544b28e5 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 6 Jan 2020 12:05:43 +0530 Subject: [PATCH 65/83] review changes - free_text_detection_results -> 
predetected_value --- ner_constants.py | 2 +- ner_v1/api.py | 10 ++--- ner_v1/chatbot/entity_detection.py | 30 ++++++------- ner_v1/detectors/base_detector.py | 36 ++++++++-------- ner_v1/detectors/textual/name/hindi_const.py | 3 +- .../detectors/textual/name/name_detection.py | 31 ++++++++------ .../detectors/textual/text/text_detection.py | 40 +++++++++--------- .../textual/text/text_detection_model.py | 42 +++++++++---------- 8 files changed, 99 insertions(+), 95 deletions(-) diff --git a/ner_constants.py b/ner_constants.py index ca5effd20..f535c14b0 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -58,7 +58,7 @@ PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' # Prior detection results from CRF models. -PARAMETER_PRIOR_RESULTS = "free_text_detection_results" +PARAMETER_PRIOR_RESULTS = "predetected_values" # Locale for Date and Phone Number detection PARAMETER_LOCALE = 'locale' diff --git a/ner_v1/api.py b/ner_v1/api.py index fb12cf041..66ed52f1a 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -71,7 +71,7 @@ def get_parameters_dictionary(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), - PARAMETER_PRIOR_RESULTS: json.loads(request.GET.get("crf_results", '[]')) + PARAMETER_PRIOR_RESULTS: json.loads(request.GET.get("predetected_values", '[]')) } ner_logger.info("parameters dict - {}".format(parameters_dict)) return parameters_dict @@ -106,7 +106,7 @@ def parse_post_request(request): PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request_data.get('read_model_from_s3')), PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), - PARAMETER_PRIOR_RESULTS: request_data.get("crf_results", []) + PARAMETER_PRIOR_RESULTS: 
request_data.get("predetected_values", []) } return parameters_dict @@ -250,7 +250,7 @@ def text(request): live_crf_model_path=parameters_dict[PARAMETER_LIVE_CRF_MODEL_PATH], read_model_from_s3=parameters_dict[PARAMETER_READ_MODEL_FROM_S3], read_embeddings_from_remote_url=parameters_dict[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS] + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS] ) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -273,7 +273,7 @@ def location(request): parameters_dict[PARAMETER_STRUCTURED_VALUE], parameters_dict[PARAMETER_FALLBACK_VALUE], parameters_dict[PARAMETER_BOT_MESSAGE], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS]) + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for location: %s ' % e) @@ -367,7 +367,7 @@ def person_name(request): fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], - free_text_detection_results=parameters_dict[PARAMETER_PRIOR_RESULTS]) + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for person_name: %s ' % e) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 389ac4a2b..073c3e9e3 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -93,7 +93,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message, language=ENGLISH_LANG, - free_text_detection_results=None, **kwargs): + predetected_values=None, **kwargs): 
"""Use TextDetector (datastore/elasticsearch) to detect textual entities Args: @@ -235,7 +235,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message read_model_from_s3 = kwargs.get('read_model_from_s3', False) read_embeddings_from_remote_url = kwargs.get('read_embeddings_from_remote_url', False) - free_text_detection_results = free_text_detection_results or [] + predetected_values = predetected_values or [] text_model_detector = TextModelDetector(entity_name=entity_name, language=language, @@ -251,22 +251,22 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message min_token_len_fuzziness = int(min_token_len_fuzziness) text_model_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness) - ner_logger.info("free text detection results: {}".format(free_text_detection_results)) + ner_logger.info("free text detection results: {}".format(predetected_values)) if isinstance(message, six.string_types): entity_output = text_model_detector.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) elif isinstance(message, (list, tuple)): entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) return entity_output def get_location(message, entity_name, structured_value, fallback_value, bot_message, - free_text_detection_results=None, **kwargs): + predetected_values=None, **kwargs): """"Use TextDetector (elasticsearch) to detect location TODO: We can improve this by creating separate for location detection instead of using TextDetector @@ -282,7 +282,7 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes fallback_value (str): If the detection logic fails to detect any value 
either from structured_value or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. - free_text_detection_results(list of str): prior detection results from models like crf etc. + predetected_values(list of str): prior detection results from models like crf etc. Returns: @@ -303,10 +303,10 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes >> [{'detection': 'message', 'entity_value': {'value': 'Andheri West'}, 'language': 'en', 'original_text': 'andheri west'}] """ - free_text_detection_results = free_text_detection_results or [] + predetected_values = predetected_values or [] text_detection = TextDetector(entity_name=entity_name) return text_detection.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message, free_text_detection_results=free_text_detection_results) + bot_message=bot_message, predetected_values=predetected_values) def get_phone_number(message, entity_name, structured_value, fallback_value, bot_message): @@ -528,7 +528,7 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message def get_person_name(message, entity_name, structured_value, fallback_value, bot_message, - language=ENGLISH_LANG, free_text_detection_results=None, **kwargs): + language=ENGLISH_LANG, predetected_values=None, **kwargs): """Use NameDetector to detect names Args: @@ -543,7 +543,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. language (str): ISO 639-1 code of language of message - free_text_detection_results(list of str): prior detection results from models like crf etc. + predetected_values(list of str): prior detection results from models like crf etc. 
Returns: @@ -562,7 +562,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ 'entity_value': {'first_name': yash, 'middle_name': None, 'last_name': doshi}}] """ # TODO refactor NameDetector to make this easy to read and use - free_text_detection_results = free_text_detection_results or [] + predetected_values = predetected_values or [] name_detection = NameDetector(entity_name=entity_name, language=language) text, detection_method, fallback_text, fallback_method = (structured_value, @@ -580,15 +580,15 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = name_detection.detect_entity( text=text, bot_message=bot_message, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) if not entity_list and fallback_text: entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split(), fallback_text) detection_method = fallback_method if entity_list and original_text_list: - if free_text_detection_results: - detection_method = "free text entity" + # if predetected_values: + # detection_method = "free text entity" return output_entity_dict_list(entity_list, original_text_list, detection_method) return None diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 3dc0f94e0..c2d268954 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -55,12 +55,12 @@ def supported_languages(self): return [] @abc.abstractmethod - def detect_entity(self, text, free_text_detection_results=None, **kwargs): + def detect_entity(self, text, predetected_values=None, **kwargs): """ This method runs the core entity detection logic defined inside entity detectors Args: text: text snippet from which entities needs to be detected - free_text_detection_results: prior detection results + predetected_values: prior detection results **kwargs: values specific to different detectors 
such as 'last bot message', custom configs, etc. Return: tuple: Two lists of same length containing detected values and original substring from text which is used @@ -81,13 +81,13 @@ def _set_language_processing_script(self): raise NotImplementedError('Please enable translation or extend language support' 'for %s' % self._source_language_script) - def detect_bulk(self, messages=None, free_text_detection_results=None, **kwargs): + def detect_bulk(self, messages=None, predetected_values=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: messages (list of strings): list of natural text(s) on which detection logic is to be run. - free_text_detection_results(list of list of str): prior detection results + predetected_values(list of list of str): prior detection results Returns: dict or None: dictionary containing entity_value, original_text and detection; entity_value is in itself a dict with its keys varying from entity to entity @@ -116,10 +116,10 @@ def detect_bulk(self, messages=None, free_text_detection_results=None, **kwargs) texts = messages # Prior results from entity detection using methods like CRF etc. 
- if free_text_detection_results is None: - free_text_detection_results = [] + if predetected_values is None: + predetected_values = [] entities_list, original_list = self.detect_entity_bulk( - texts=texts, free_text_detection_results=free_text_detection_results) + texts=texts, predetected_values=predetected_values) fallback_values = kwargs.get('fallback_values') values_list, detection_method_list, original_texts_list = [], [], [] @@ -169,7 +169,7 @@ def _add_verification_source(self, values, verification_source_dict): text_entity_verified_values.append(text_entity_dict) return text_entity_verified_values - def combine_results(self, values, original_texts, free_text_detection_results): + def combine_results(self, values, original_texts, predetected_values): """ This method is used to combine the results provided by the datastore search and the crf_model if trained. @@ -177,7 +177,7 @@ def combine_results(self, values, original_texts, free_text_detection_results): values (list): List of values detected by datastore original_texts (list): List of original texts present in the texts for which value shave been detected - free_text_detection_results (list): Entities detected by the models like crf etc. + predetected_values (list): Entities detected by the models like crf etc. 
Returns: combined_values (list): List of dicts each dict consisting of the entity value and additionally the keys for the datastore and crf model detection @@ -191,16 +191,16 @@ def combine_results(self, values, original_texts, free_text_detection_results): MODEL_VERIFIED: False }) combined_original_texts = original_texts - for i in range(len(free_text_detection_results)): + for i in range(len(predetected_values)): match = False for j in range(len(original_texts)): - if free_text_detection_results[i] == original_texts[j]: + if predetected_values[i] == original_texts[j]: combined_values[j][MODEL_VERIFIED] = True match = True - elif re.findall(r'\b%s\b' % free_text_detection_results[i], original_texts[j]): + elif re.findall(r'\b%s\b' % predetected_values[i], original_texts[j]): match = True if not match: - unprocessed_crf_original_texts.append(free_text_detection_results[i]) + unprocessed_crf_original_texts.append(predetected_values[i]) unprocessed_crf_original_texts_verified = self._add_verification_source( values=unprocessed_crf_original_texts, @@ -212,7 +212,7 @@ def combine_results(self, values, original_texts, free_text_detection_results): return combined_values, combined_original_texts def detect(self, message=None, structured_value=None, fallback_value=None, - free_text_detection_results=None, **kwargs): + predetected_values=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: @@ -223,7 +223,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, (For example, UI elements like form, payload, etc) fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. - free_text_detection_results(list of str): prior detection results from models like CRF etc. + predetected_values(list of str): prior detection results from models like CRF etc. 
bot_message (str): previous message from a bot/agent. Returns: dict or None: dictionary containing entity_value, original_text and detection; @@ -281,10 +281,10 @@ def detect(self, message=None, structured_value=None, fallback_value=None, text = structured_value if structured_value else message # Prior results from detection. - if free_text_detection_results is None: - free_text_detection_results = [] + if predetected_values is None: + predetected_values = [] entity_list, original_text_list = self.detect_entity(text=text, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) if structured_value: if entity_list: diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py index 03cafb0f2..488dedf03 100644 --- a/ner_v1/detectors/textual/name/hindi_const.py +++ b/ner_v1/detectors/textual/name/hindi_const.py @@ -305,8 +305,7 @@ "mention your name ", "provide your name ", "help me with your name", - "What's your full name?", - "Here is the menu!", + "what's your full name?", "forgot to mention your name", "please help me with your full name", "please let me know your full name.", diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index a6bf9e821..79664d84b 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -10,6 +10,7 @@ HINDI_STOPWORDS, NAME_VARIATIONS, COMMON_HINDI_WORDS_OCCURING_WITH_NAME) from ner_v1.detectors.textual.text.text_detection import TextDetector +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED # TODO: Refactor this module for readability and useability. 
Remove any hacks @@ -152,7 +153,7 @@ def get_name_using_pos_tagger(self, text): return entity_value, original_text - def detect_entity(self, text, bot_message=None, free_text_detection_results=None, **kwargs): + def detect_entity(self, text, bot_message=None, predetected_values=None, **kwargs): """ Takes text as input and returns two lists 1.entity_value in the form of first, middle and last names @@ -160,7 +161,7 @@ def detect_entity(self, text, bot_message=None, free_text_detection_results=None Args: text(string): the original text bot_message(string): previous bot message - free_text_detection_results(list of str): detected values from prior detection + predetected_values(list of str): detected values from prior detection Example: text=my name is yash doshi @@ -173,7 +174,7 @@ def detect_entity(self, text, bot_message=None, free_text_detection_results=None entity_value, original_text = ([], []) - if not free_text_detection_results: + if not predetected_values: if bot_message: if not self.context_check_botmessage(bot_message): return [], [] @@ -181,11 +182,15 @@ def detect_entity(self, text, bot_message=None, free_text_detection_results=None entity_value, original_text = self.detect_english_name() elif self.language == HINDI_LANG: entity_value, original_text = self.detect_hindi_name() + for entity_value_dict in entity_value: + entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) else: - replaced_text = self.replace_free_text_detection_text(free_text_detection_results, - text=text) + replaced_text = self.replace_predetected_text(predetected_values, + text=text) entity_value, original_text = self.detect_person_name_entity(replaced_text) + for entity_value_dict in entity_value: + entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False}) self._update_processed_text(person_name_list=original_text) @@ -256,18 +261,18 @@ def detect_hindi_name(self): return entity_value, original_text - def replace_free_text_detection_text(self, 
free_text_detection_results, text): + def replace_predetected_text(self, predetected_values, text): """ Replace detected names from the text according to replace_detected_text. - Separate method for replacing free_text_detection_results because it these results are not at token level. + Separate method for replacing predetected_values because it these results are not at token level. For example - text = "my name is yash doshi" - free_text_detection_results = ["yash doshi"] + predetected_values = ["yash doshi"] while, text_detection_original_texts = ["yash", "doshi"] Args: - free_text_detection_results(list): list containing free_text_entity_results + predetected_values(list): list containing predetected_values text(str): original to run detection on Returns: @@ -275,8 +280,8 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): Example: >> text = "my name is yash doshi" - >> free_text_detection_results = ["yash doshi"] - >> replace_free_text_detection_text(free_text_detection_results, text) + >> predetected_values = ["yash doshi"] + >> replace_predetected_text(predetected_values, text) 'my name is _yash_ _doshi_' """ @@ -294,7 +299,7 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): and replaced_text_tokens[-1] + "." in text.lower(): replaced_text_tokens[-1] = replaced_text_tokens[-1] + "." else: - # fix to handle examples like `miami,21st street` where tokenizer gives ["miami,21st", "street"]. + # fix to handle examples like `miami,21st street` where tokenizer gives ["miami,", "21st", "street"] # This causes problems while tagging entities according indices. # For eg is miami is an entity and its indices are (0,5) then due to this extra `,` tagging will be # problem because now length of token will become 6 not 5. 
@@ -305,7 +310,7 @@ def replace_free_text_detection_text(self, free_text_detection_results, text): else: replaced_text_tokens = text.lower().strip().split() - for name in free_text_detection_results: + for name in predetected_values: name_tokens = name.split() for token in name_tokens: for j in range(len(replaced_text_tokens)): diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 6adab9b6a..6ac3c3f9c 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -277,7 +277,7 @@ def _get_tokens_and_indices(txt): return u' '.join(matched_tokens) - def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): + def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of list of detected text entities and their corresponding original substrings @@ -288,7 +288,7 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): Args: texts (list): list of strings(bulk detect) to extract textual entities from - free_text_detection_results(list of list of str): results from prior detection. + predetected_values(list of list of str): results from prior detection. **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. 
Returns: tuple: @@ -330,23 +330,23 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): ] """ - # For bulk detection free_text_detection_results will be a list of list of str - if free_text_detection_results is None: - free_text_detection_results = [] + # For bulk detection predetected_values will be a list of list of str + if predetected_values is None: + predetected_values = [] self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() - # itertate over text_entity_values_list, original_texts_list and if free_text_detection_results has any entry - # for that index use combine_results to merge the results from free_text and detection. + # itertate over text_entity_values_list, original_texts_list and if predetected_values has any entry + # for that index use combine_results to merge the results from predetected_values and dictionary detection. combined_entity_values, combined_original_texts = [], [] - for i, (values, original_texts, inner_free_text_detection_results_) in enumerate( - six.moves.zip_longest(text_entity_values_list, original_texts_list, free_text_detection_results)): - if inner_free_text_detection_results_: + for i, (values, original_texts, inner_predetected_values) in enumerate( + six.moves.zip_longest(text_entity_values_list, original_texts_list, predetected_values)): + if inner_predetected_values: inner_combined_entity_values, inner_combined_original_texts = self.combine_results( values=values, original_texts=original_texts, - free_text_detection_results=inner_free_text_detection_results_) + predetected_values=inner_predetected_values) combined_entity_values.append(inner_combined_entity_values) combined_original_texts.append(inner_combined_original_texts) else: @@ -355,7 +355,7 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): return text_entity_values_list, original_texts_list - def detect_entity(self, text, 
free_text_detection_results=None, **kwargs): + def detect_entity(self, text, predetected_values=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of detected text entities and their corresponding original substrings in text respectively. @@ -364,7 +364,7 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): is returned. For more information on how data is stored, see Datastore docs. Args: text (unicode): string to extract textual entities from - free_text_detection_results(list of str): prior detection results + predetected_values(list of str): prior detection results **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: @@ -392,10 +392,10 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): self._process_text([text]) text_entity_values, original_texts = self._text_detection_with_variants() - # For single message detection free_text_detection_results will be a list of str + # For single message detection predetected_values will be a list of str # if present use combine_results to merge the results. 
- if free_text_detection_results is None: - free_text_detection_results = [] + if predetected_values is None: + predetected_values = [] values, texts = [], [] if len(text_entity_values) > 0 and len(original_texts) > 0: @@ -403,13 +403,13 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): self.processed_text = self.__processed_texts[0] values, texts = text_entity_values[0], original_texts[0] - ner_logger.info("prior detection results - {}".format(free_text_detection_results)) - if free_text_detection_results: - ner_logger.info("combining results for {0}, {1}, {2}".format(values, texts, free_text_detection_results)) + ner_logger.info("prior detection results - {}".format(predetected_values)) + if predetected_values: + ner_logger.info("combining results for {0}, {1}, {2}".format(values, texts, predetected_values)) text_entity_verified_values, original_texts = self.combine_results( values=values, original_texts=texts, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) return text_entity_verified_values, original_texts return values, texts diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py index c9e7295ae..dae72fe43 100644 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ b/ner_v1/detectors/textual/text/text_detection_model.py @@ -35,7 +35,7 @@ def __init__(self, self.read_embeddings_from_remote_url = read_embeddings_from_remote_url self.live_crf_model_path = live_crf_model_path - def detect_entity(self, text, free_text_detection_results=None, **kwargs): + def detect_entity(self, text, predetected_values=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of detected text entities and their corresponding original substrings in text respectively. 
@@ -46,7 +46,7 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. Args: text (str or unicode): string to extract textual entities from - free_text_detection_results(list of str): list of previous detected values + predetected_values(list of str): list of previous detected values **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: @@ -86,24 +86,24 @@ def detect_entity(self, text, free_text_detection_results=None, **kwargs): crf_original_texts = crf_model.detect_entity(text=text) - # Access free_text_detection_results(list of str). - # If present replace crf_original_texts with free_text_detection_results. - # Call combine results to .combine_results() from dictionary detection and free_text_detection_results. - if free_text_detection_results is None: - free_text_detection_results = [] + # Access predetected_values(list of str). + # If present replace crf_original_texts with predetected_values. + # Call combine results to .combine_results() from dictionary detection and predetected_values. 
+ if predetected_values is None: + predetected_values = [] values, original_texts = super(TextModelDetector, self).detect_entity( - text, free_text_detection_results=free_text_detection_results, **kwargs) + text, predetected_values=predetected_values, **kwargs) text_entity_verified_values, original_texts = \ self.combine_results(values=values, original_texts=original_texts, - free_text_detection_results=free_text_detection_results) + predetected_values=predetected_values) self.text_entity_values, self.original_texts = text_entity_verified_values, original_texts return self.text_entity_values, self.original_texts - def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): + def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of list of detected text entities and their corresponding original substrings @@ -116,7 +116,7 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): Args: texts (list of strings): natural language sentence(s) to extract entities from - free_text_detection_results(list of lists of str): values from previous detection + predetected_values(list of lists of str): values from previous detection **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: @@ -167,25 +167,25 @@ def detect_entity_bulk(self, texts, free_text_detection_results=None, **kwargs): crf_original_texts = [] - # Access free_text_detection_results(list of lists). - # If present replace crf_original_texts with free_text_detection_results. - # Call .combine_results() to combine results from dictionary detection and free_text_detection_results. - if free_text_detection_results is None: - free_text_detection_results = [] + # Access predetected_values(list of lists). 
+ # If present replace crf_original_texts with predetected_values. + # Call .combine_results() to combine results from dictionary detection and predetected_values. + if predetected_values is None: + predetected_values = [] values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk( - texts, free_text_detection_results=free_text_detection_results, **kwargs) + texts, predetected_values=predetected_values, **kwargs) text_entity_values_list, original_texts_detected_list = [], [] - for inner_free_text_detection_results, inner_values, inner_original_texts in six.moves.zip_longest( - free_text_detection_results, + for inner_predetected_values, inner_values, inner_original_texts in six.moves.zip_longest( + predetected_values, values_list, original_texts_list): text_entity_verified_values, original_texts = \ self.combine_results( values=inner_values, original_texts=inner_original_texts, - free_text_detection_results=inner_free_text_detection_results if - inner_free_text_detection_results else []) + predetected_values=inner_predetected_values if + inner_predetected_values else []) text_entity_values_list.append(text_entity_verified_values) original_texts_detected_list.append(original_texts) return text_entity_values_list, original_texts_detected_list From bdd86857e9babda374f6def81ff16a3c1ef3ccad Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Mon, 6 Jan 2020 12:47:54 +0530 Subject: [PATCH 66/83] Fix lint error in name_detection.py --- ner_v1/detectors/textual/name/name_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 79664d84b..4a35fa032 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -299,7 +299,8 @@ def replace_predetected_text(self, predetected_values, text): and replaced_text_tokens[-1] + "." 
in text.lower(): replaced_text_tokens[-1] = replaced_text_tokens[-1] + "." else: - # fix to handle examples like `miami,21st street` where tokenizer gives ["miami,", "21st", "street"] + # Fix to handle examples like `miami,21st street` + # where tokenizer gives ["miami,", "21st", "street"]. # This causes problems while tagging entities according indices. # For eg is miami is an entity and its indices are (0,5) then due to this extra `,` tagging will be # problem because now length of token will become 6 not 5. From dcfdf8723e74624ee9da54a9ea0e1a649092a2c3 Mon Sep 17 00:00:00 2001 From: Raza Sayed Date: Thu, 9 Jan 2020 10:13:40 +0000 Subject: [PATCH 67/83] Add .vscode to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5a675421f..b6be485bf 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,5 @@ ENV/ sftp-config.json .DS_Store logs/ + +.vscode From 067ce6ef028dc771e851e85e69c7eab0e846a995 Mon Sep 17 00:00:00 2001 From: ranvijayj Date: Fri, 10 Jan 2020 19:08:57 +0530 Subject: [PATCH 68/83] add new version of django upgrade for security add new version of django upgrade for security - 1.11.26 to 1.11.27 (affects) application packages ADO-3054 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d0d1f37fe..ab2dbcf31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -Django==1.11.26 +Django==1.11.27 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 From c49d879aeb0138baa8aaf765223cfed0875652cd Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 13:48:47 +0530 Subject: [PATCH 69/83] Fix in name_detection.detect_entity --- ner_v1/detectors/textual/name/name_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py 
b/ner_v1/detectors/textual/name/name_detection.py index 4a35fa032..394766c4c 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -182,8 +182,9 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg entity_value, original_text = self.detect_english_name() elif self.language == HINDI_LANG: entity_value, original_text = self.detect_hindi_name() - for entity_value_dict in entity_value: - entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) + + for entity_value_dict in entity_value: + entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) else: replaced_text = self.replace_predetected_text(predetected_values, From 10b35aeba3af4c9644faa51e9529caf00c0e556f Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 13:54:55 +0530 Subject: [PATCH 70/83] Fix lint error --- ner_v1/api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v1/api.py b/ner_v1/api.py index 66ed52f1a..93c4b55e8 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -23,7 +23,6 @@ PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL, PARAMETER_LIVE_CRF_MODEL_PATH) from django.views.decorators.csrf import csrf_exempt -from chatbot_ner.config import ner_logger def to_bool(value): From f6889f8dbf5d93cb678b4d2578428423a9647e20 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 16:28:05 +0530 Subject: [PATCH 71/83] Add mock_values for name_detection and mock elastic call --- .../name/tests/test_cases_person_name.csv | 28 +++++++++---------- .../textual/name/tests/test_name_detection.py | 15 ++++++++-- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv index 94eaa35ee..56a0329e5 100644 --- a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv +++ 
b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv @@ -1,14 +1,14 @@ -language,message,first_name,middle_name,last_name,original_entities -en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao -en,my name is pratik,pratik,None,None,pratik -en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,call me pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao -en,Pratik,Pratik,None,None,Pratik -hi,मेरा नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ -hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ -hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao +language,message,first_name,middle_name,last_name,original_entities,mocked_values +en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' +en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' +en,my name is pratik,pratik,None,None,pratik,'[["pratik"], ["pratik"]]' +en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' +en,call me pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' +en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' +en,Pratik,Pratik,None,None,Pratik,'[["pratik"], ["pratik"]]' +hi,मेरा नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,'[["प्रतिक", "श्रीदत्त", 
"जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' +hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' +hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' +hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' +hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' +hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 4a93d7c78..3ffe4dcc9 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -4,6 +4,8 @@ import pandas as pd from django.test import TestCase +import mock +import json from ner_v1.detectors.textual.name.name_detection import NameDetector @@ -20,14 +22,16 @@ def preprocess_data(self): 'language': [], 'message': [], 'expected_value': [], + 'mocked_values':[] } - for (language, message, first_name, middle_name, last_name, original_entity) in zip( + for (language, message, first_name, middle_name, last_name, original_entity, mocked_values) in zip( self.data['language'], self.data['message'], self.data['first_name'], self.data['middle_name'], self.data['last_name'], - self.data['original_entities']): + self.data['original_entities'], + self.data['mocked_values']): fn = [] mn = [] ln = [] @@ -48,13 +52,18 @@ def preprocess_data(self): test_dict['language'].append(language) test_dict['message'].append(message) test_dict['expected_value'].append(temp) + test_dict['mocked_values'].append(mocked_values) return test_dict - def test_person_name_detection(self): + 
@mock.patch.object(NameDetector, "text_detection_name") + def test_person_name_detection(self, mock_text_detection_name): for i in range(len(self.data)): message = self.test_dict['message'][i] expected_value = self.test_dict['expected_value'][i] + + mock_text_detection_name.return_value = json.loads(self.test_dict['mocked_values'][i]) + name_detector = NameDetector(language=self.test_dict['language'][i], entity_name='person_name') detected_texts, original_texts = name_detector.detect_entity(text=message) zipped = zip(detected_texts, original_texts) From eb45b052a6d409f44c94dce22533c8805b1604f2 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 16:30:11 +0530 Subject: [PATCH 72/83] Add mock_values for name_detection and mock elastic call --- ner_v1/detectors/textual/name/tests/test_name_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 3ffe4dcc9..8c067b236 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -22,7 +22,7 @@ def preprocess_data(self): 'language': [], 'message': [], 'expected_value': [], - 'mocked_values':[] + 'mocked_values': [], } for (language, message, first_name, middle_name, last_name, original_entity, mocked_values) in zip( self.data['language'], From 20aff6c9a945ea96d0f2586a70559edab7ba4403 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 16:34:08 +0530 Subject: [PATCH 73/83] Forced str conversion in test_name_detection.py --- .../detectors/textual/name/tests/test_name_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 8c067b236..bfacc0d74 100644 --- 
a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -37,10 +37,10 @@ def preprocess_data(self): ln = [] oe = [] - fn.extend(first_name.split('|')) - mn.extend(middle_name.split('|')) - ln.extend(last_name.split('|')) - oe.extend(original_entity.split('|')) + fn.extend(str(first_name).split('|')) + mn.extend(str(middle_name).split('|')) + ln.extend(str(last_name).split('|')) + oe.extend(str(original_entity).split('|')) temp = [] for f, m, l, o in zip(fn, mn, ln, oe): From d6c13f298bd6d3bd573e837de3a3a2dd8314cc4e Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 16:50:14 +0530 Subject: [PATCH 74/83] Reverse forced str conversion in test_name_detection.py --- .../detectors/textual/name/tests/test_name_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index bfacc0d74..8c067b236 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -37,10 +37,10 @@ def preprocess_data(self): ln = [] oe = [] - fn.extend(str(first_name).split('|')) - mn.extend(str(middle_name).split('|')) - ln.extend(str(last_name).split('|')) - oe.extend(str(original_entity).split('|')) + fn.extend(first_name.split('|')) + mn.extend(middle_name.split('|')) + ln.extend(last_name.split('|')) + oe.extend(original_entity.split('|')) temp = [] for f, m, l, o in zip(fn, mn, ln, oe): From c26c287eff58c464da82d0a2f0d66115dff9cb62 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 22 Jan 2020 18:04:16 +0530 Subject: [PATCH 75/83] Remove extra keys from NameDetector output --- .../detectors/textual/name/name_detection.py | 5 ++-- .../name/tests/test_cases_person_name.csv | 26 +++++++++---------- .../textual/name/tests/test_name_detection.py | 4 +++ 3 files changed, 20 
insertions(+), 15 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 394766c4c..a2dd27f3b 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -184,14 +184,15 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg entity_value, original_text = self.detect_hindi_name() for entity_value_dict in entity_value: - entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) + entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False}) else: replaced_text = self.replace_predetected_text(predetected_values, text=text) entity_value, original_text = self.detect_person_name_entity(replaced_text) + for entity_value_dict in entity_value: - entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False}) + entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) self._update_processed_text(person_name_list=original_text) diff --git a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv index 56a0329e5..ff0fe28cb 100644 --- a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv +++ b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv @@ -1,14 +1,14 @@ language,message,first_name,middle_name,last_name,original_entities,mocked_values -en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' -en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' -en,my name is pratik,pratik,None,None,pratik,'[["pratik"], ["pratik"]]' -en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' -en,call me pratik sridatt 
jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,'[["jayarao", "pratik", "shridatt"], ["jayarao", "pratik", "sridatt"]]' -en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' -en,Pratik,Pratik,None,None,Pratik,'[["pratik"], ["pratik"]]' -hi,मेरा नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' -hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' -hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' -hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' -hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ,'[["प्रतिक", "श्रीदत्त", "जयराओ"], ["प्रतिक", "श्रीदत्त", "जयराओ"]]' -hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao,'[["jayarao", "pratik"], ["jayarao", "pratik"]]' +en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao,"[[""jayarao"", ""pratik""], [""jayarao"", ""pratik""]]" +en,my name is pratik,pratik,None,None,pratik,"[[""pratik""], [""pratik""]]" +en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,call me pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao,"[[""jayarao"", ""pratik""], [""jayarao"", ""pratik""]]" +en,Pratik,Pratik,None,None,Pratik,"[[""pratik""], [""pratik""]]" +hi,मेरा नाम प्रतिक श्रीदत्त जयराओ 
है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 8c067b236..9bd63c262 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -7,6 +7,7 @@ import mock import json +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from ner_v1.detectors.textual.name.name_detection import NameDetector @@ -66,6 +67,9 @@ def test_person_name_detection(self, mock_text_detection_name): name_detector = NameDetector(language=self.test_dict['language'][i], entity_name='person_name') detected_texts, original_texts = name_detector.detect_entity(text=message) + for d in detected_texts: + d.pop(MODEL_VERIFIED) + d.pop(DATASTORE_VERIFIED) zipped = zip(detected_texts, original_texts) self.assertEqual(expected_value, zipped) From 01af891949c7f99c70ed60731588b669881997e9 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Mon, 27 Jan 2020 17:52:40 +0530 Subject: [PATCH 76/83] Remove redundant TextModelDetector class --- ner_v1/chatbot/entity_detection.py | 34 ++-- 
.../detectors/textual/text/text_detection.py | 2 +- .../textual/text/text_detection_model.py | 191 ------------------ 3 files changed, 14 insertions(+), 213 deletions(-) delete mode 100644 ner_v1/detectors/textual/text/text_detection_model.py diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 073c3e9e3..af57863ff 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -16,7 +16,6 @@ from ner_v1.detectors.textual.city.city_detection import CityDetector from ner_v1.detectors.textual.name.name_detection import NameDetector from ner_v1.detectors.textual.text.text_detection import TextDetector -from ner_v1.detectors.textual.text.text_detection_model import TextModelDetector from chatbot_ner.config import ner_logger import six @@ -231,36 +230,29 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message """ fuzziness = kwargs.get('fuzziness', None) min_token_len_fuzziness = kwargs.get('min_token_len_fuzziness', None) - live_crf_model_path = kwargs.get('live_crf_model_path', None) - read_model_from_s3 = kwargs.get('read_model_from_s3', False) - read_embeddings_from_remote_url = kwargs.get('read_embeddings_from_remote_url', False) - predetected_values = predetected_values or [] - text_model_detector = TextModelDetector(entity_name=entity_name, - language=language, - live_crf_model_path=live_crf_model_path, - read_model_from_s3=read_model_from_s3, - read_embeddings_from_remote_url=read_embeddings_from_remote_url) - + text_detector = TextDetector(entity_name=entity_name, source_language_script=language) if fuzziness: fuzziness = parse_fuzziness_parameter(fuzziness) - text_model_detector.set_fuzziness_threshold(fuzziness) + text_detector.set_fuzziness_threshold(fuzziness) if min_token_len_fuzziness: min_token_len_fuzziness = int(min_token_len_fuzziness) - text_model_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness) + 
text_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness) - ner_logger.info("free text detection results: {}".format(predetected_values)) + ner_logger.info("Predetected values: {}".format(predetected_values)) if isinstance(message, six.string_types): - entity_output = text_model_detector.detect(message=message, - structured_value=structured_value, - fallback_value=fallback_value, - bot_message=bot_message, - predetected_values=predetected_values) + entity_output = text_detector.detect(message=message, + structured_value=structured_value, + fallback_value=fallback_value, + bot_message=bot_message, + predetected_values=predetected_values) elif isinstance(message, (list, tuple)): - entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value, - predetected_values=predetected_values) + entity_output = text_detector.detect_bulk(messages=message, fallback_values=fallback_value, + predetected_values=predetected_values) + else: + raise TypeError('`message` argument must be either of type `str`, `unicode`, `list` or `tuple`.') return entity_output diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 6ac3c3f9c..f195bc065 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -353,7 +353,7 @@ def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): combined_entity_values.append(values) combined_original_texts.append(original_texts) - return text_entity_values_list, original_texts_list + return combined_entity_values, combined_original_texts def detect_entity(self, text, predetected_values=None, **kwargs): """ diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py deleted file mode 100644 index dae72fe43..000000000 --- a/ner_v1/detectors/textual/text/text_detection_model.py +++ /dev/null @@ -1,191 +0,0 @@ -from 
language_utilities.constant import ENGLISH_LANG -from models.crf_v2.crf_detect_entity import CrfDetection -from ner_v1.detectors.textual.text.text_detection import TextDetector -import six - - -class TextModelDetector(TextDetector): - """ - This class is inherited from the TextDetector class. - This class is primarily used to detect text type entities using the datastore as well as the the CRF - model if trained. - """ - - def __init__(self, - entity_name, - language=ENGLISH_LANG, - live_crf_model_path=None, - read_embeddings_from_remote_url=False, - read_model_from_s3=False, - **kwargs): - """ - Args: - entity_name (str): name of the entity. Same as the entity name under which data is indexed in DataStore - language (str): ISO 639 code for the language to detect entities in - live_crf_model_path (str): path to the crf model, either on disk or s3 - read_embeddings_from_remote_url (bool, optional): if True, read embeddings from remote url configured in - chatbot_ner.config. Defaults to False - read_model_from_s3 (bool, optional): if True, use live_crf_model_path to read model from s3 instead - of local disk. Defaults to False - """ - super(TextModelDetector, self).__init__(entity_name=entity_name, - source_language_script=language, - translation_enabled=False) - self.read_model_from_s3 = read_model_from_s3 - self.read_embeddings_from_remote_url = read_embeddings_from_remote_url - self.live_crf_model_path = live_crf_model_path - - def detect_entity(self, text, predetected_values=None, **kwargs): - """ - Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and - returns two lists of detected text entities and their corresponding original substrings in text respectively. - The first list being a list of dicts with the verification source and the values. 
- Note that datastore stores number of values under a entity_name and each entity_value has its own list of - variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to, - is returned. For more information on how data is stored, see Datastore docs. - In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. - Args: - text (str or unicode): string to extract textual entities from - predetected_values(list of str): list of previous detected values - **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. - Returns: - tuple: - list: containing list of dicts with the source of detection for the entity value and - entity value as defined into datastore - list: containing corresponding original substrings in text - Example: - DataStore().get_entity_dictionary('city') - Output: - { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], - ... - u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], - u'hyderabad': [u'hyderabad'], - u'koramangala': [u'koramangala'] - } - text_detection = TextModelDetector('city') - text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year') - Output: - ([{'datastore_verified': True,'crf_model_verified': True, 'value': u'Chennai'}, - {'datastore_verified': True,'crf_model_verified': False, 'value': u'New Delhi'}, - {'datastore_verified': False,'crf_model_verified': True, 'value': u'chennai'}] - , ['chennai', 'delhi', 'tamilnadu']) - text_detection.tagged_text - Output: - ' come to __city__, __city__, i will visit __city__ next year ' - Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes - respectively. 
- """ - crf_original_texts = [] - if self.live_crf_model_path: - crf_model = CrfDetection(entity_name=self.entity_name, - read_model_from_s3=self.read_model_from_s3, - read_embeddings_from_remote_url=self.read_embeddings_from_remote_url, - live_crf_model_path=self.live_crf_model_path) - - crf_original_texts = crf_model.detect_entity(text=text) - - # Access predetected_values(list of str). - # If present replace crf_original_texts with predetected_values. - # Call combine results to .combine_results() from dictionary detection and predetected_values. - if predetected_values is None: - predetected_values = [] - - values, original_texts = super(TextModelDetector, self).detect_entity( - text, predetected_values=predetected_values, **kwargs) - - text_entity_verified_values, original_texts = \ - self.combine_results(values=values, - original_texts=original_texts, - predetected_values=predetected_values) - self.text_entity_values, self.original_texts = text_entity_verified_values, original_texts - - return self.text_entity_values, self.original_texts - - def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): - """ - Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and - returns two lists of list of detected text entities and their corresponding original substrings - for each sentence in text respectively. - The first list being a list of list of dicts with the verification source and the values. - Note that datastore stores number of values under a entity_name and each entity_value has its own list of - variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to, - is returned. For more information on how data is stored, see Datastore docs. - In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. 
- - Args: - texts (list of strings): natural language sentence(s) to extract entities from - predetected_values(list of lists of str): values from previous detection - **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. - - Returns: - tuple: - list of lists(bulk detect): containing list of dicts with the source of detection - for the entity value and entity value as defined into datastore - - list of lists(bulk detect): containing corresponding original substrings in text - - Example: - DataStore().get_entity_dictionary('city') - - Output: - { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], - ... - u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], - u'hyderabad': [u'hyderabad'], - u'koramangala': [u'koramangala'] - } - text_detection = TextDetector('city') - list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year', - 'I live in Delhi] - - text_detection.detect_entity(list_of_sentences) - Output: - ( [ - [u'Chennai', u'New Delhi', u'chennai'], - [u'New Delhi'] - ], - [ - ['chennai', 'delhi', 'tamilnadu'], - [delhi] - ] - ) - - text_detection.tagged_text - Output: - [ - ' come to __city__, __city__, i will visit __city__ next year ', - ' i live in __city__ ' - ] - - Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes - respectively. - """ - - crf_original_texts = [] - - # Access predetected_values(list of lists). - # If present replace crf_original_texts with predetected_values. - # Call .combine_results() to combine results from dictionary detection and predetected_values. 
- if predetected_values is None: - predetected_values = [] - - values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk( - texts, predetected_values=predetected_values, **kwargs) - text_entity_values_list, original_texts_detected_list = [], [] - - for inner_predetected_values, inner_values, inner_original_texts in six.moves.zip_longest( - predetected_values, - values_list, - original_texts_list): - text_entity_verified_values, original_texts = \ - self.combine_results( - values=inner_values, original_texts=inner_original_texts, - predetected_values=inner_predetected_values if - inner_predetected_values else []) - text_entity_values_list.append(text_entity_verified_values) - original_texts_detected_list.append(original_texts) - return text_entity_values_list, original_texts_detected_list From 578d2dc3ca64eeff6ed8acdb5e10645aab91a9ac Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Wed, 29 Jan 2020 16:00:25 +0530 Subject: [PATCH 77/83] Remove text_detection call in NameDetector --- ner_v1/detectors/textual/name/name_detection.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index a2dd27f3b..da0b9c86b 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -216,11 +216,6 @@ def detect_english_name(self, text=None): if text is None: text = self.text entity_value, original_text = self.get_name_using_pos_tagger(text) - if not entity_value: - text_detection_result = self.text_detection_name(text) - replaced_text = self.replace_detected_text(text_detection_result, text=text) - entity_value, original_text = self.detect_person_name_entity(replaced_text) - return entity_value, original_text def detect_hindi_name(self): From b50325145978f99421c0b4379b8d8dba4edf2e4b Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 30 Jan 2020 18:02:21 +0530 Subject: [PATCH 78/83] Always use 
combine logic to make output values consistent everywhere --- ner_v1/detectors/base_detector.py | 16 +-- .../detectors/textual/name/name_detection.py | 11 ++- .../detectors/textual/text/text_detection.py | 99 +++++++++---------- 3 files changed, 65 insertions(+), 61 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index c2d268954..3fca68e8e 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -185,11 +185,9 @@ def combine_results(self, values, original_texts, predetected_values): """ unprocessed_crf_original_texts = [] - combined_values = self._add_verification_source(values=values, - verification_source_dict={ - DATASTORE_VERIFIED: True, - MODEL_VERIFIED: False - }) + combined_values = self._add_verification_source( + values=values, verification_source_dict={DATASTORE_VERIFIED: True, MODEL_VERIFIED: False} + ) combined_original_texts = original_texts for i in range(len(predetected_values)): match = False @@ -197,14 +195,18 @@ def combine_results(self, values, original_texts, predetected_values): if predetected_values[i] == original_texts[j]: combined_values[j][MODEL_VERIFIED] = True match = True - elif re.findall(r'\b%s\b' % predetected_values[i], original_texts[j]): + break + elif re.findall(r'\b%s\b' % re.escape(predetected_values[i]), original_texts[j]): + # If predetected value is a substring of some value detected by datastore, skip it from output match = True + break if not match: unprocessed_crf_original_texts.append(predetected_values[i]) unprocessed_crf_original_texts_verified = self._add_verification_source( values=unprocessed_crf_original_texts, - verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True}) + verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True} + ) combined_values.extend(unprocessed_crf_original_texts_verified) combined_original_texts.extend(unprocessed_crf_original_texts) diff --git 
a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index da0b9c86b..bd75fb0bb 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -83,7 +83,13 @@ def get_format_name(name_tokens, text): last_name = name_tokens[-1] middle_name = " ".join(name_tokens[1:-1]) or None - entity_value.append({FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name}) + entity_value.append({ + FIRST_NAME: first_name, + MIDDLE_NAME: middle_name, + LAST_NAME: last_name, + DATASTORE_VERIFIED: False, + MODEL_VERIFIED: False, + }) original_text.append(name_text) return entity_value, original_text @@ -187,8 +193,7 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False}) else: - replaced_text = self.replace_predetected_text(predetected_values, - text=text) + replaced_text = self.replace_predetected_text(predetected_values, text=text) entity_value, original_text = self.detect_person_name_entity(replaced_text) for entity_value_dict in entity_value: diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index f195bc065..887471549 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,8 +1,8 @@ import collections import string -from six import iteritems import six +from six import iteritems import language_utilities.constant as lang_constant from chatbot_ner.config import ner_logger @@ -13,10 +13,12 @@ try: import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD except ImportError: import re + _re_flags = re.UNICODE @@ -287,22 +289,24 @@ def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): is returned. For more information on how data is stored, see Datastore docs. 
Args: - texts (list): list of strings(bulk detect) to extract textual entities from - predetected_values(list of list of str): results from prior detection. + texts (list): list of str to extract textual entities from + predetected_values (list of list of str): results from prior detection. **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: - list or list of lists(bulk detect): containing entity value as defined into datastore - list or list of lists(bulk detect): containing corresponding original substrings in text + list or list of dicts: ith item is a list of values output as dictionary with structure + {'value': , ...} which were detected as entity values in texts[i] + list or list of str: ith item contains corresponding original substrings in texts[i] that were + detected as entity values Example: DataStore().get_entity_dictionary('city') Output: { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], + u'Agartala': [u'Agartala'], + u'Barnala': [u'Barnala'], ... 
- u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], + u'chennai': [u'chennai', u'tamilnadu', u'madras'], u'hyderabad': [u'hyderabad'], u'koramangala': [u'koramangala'] } @@ -310,14 +314,20 @@ def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year', 'I live in Delhi] - text_detection.detect_entity(list_of_sentences) + text_detection.detect_entity_bulk(list_of_sentences) Output: ( [ - [u'Chennai', u'New Delhi', u'chennai'], - [u'New Delhi'] + [ + {'value': u'chennai'}, + {'value': u'tamilnadu'}, + {value': u'new delhi'}, + ], + [ + {'value': u'new delhi'}, + ] ], [ - ['chennai', 'delhi', 'tamilnadu'], + ['chennai', 'tamilnadu', 'delhi'], [delhi] ] ) @@ -331,27 +341,21 @@ def detect_entity_bulk(self, texts, predetected_values=None, **kwargs): """ # For bulk detection predetected_values will be a list of list of str - if predetected_values is None: - predetected_values = [] + predetected_values = predetected_values or [] self._process_text(texts) text_entity_values_list, original_texts_list = self._text_detection_with_variants() # itertate over text_entity_values_list, original_texts_list and if predetected_values has any entry # for that index use combine_results to merge the results from predetected_values and dictionary detection. 
- combined_entity_values, combined_original_texts = [], [] - for i, (values, original_texts, inner_predetected_values) in enumerate( - six.moves.zip_longest(text_entity_values_list, original_texts_list, predetected_values)): - if inner_predetected_values: - inner_combined_entity_values, inner_combined_original_texts = self.combine_results( - values=values, - original_texts=original_texts, - predetected_values=inner_predetected_values) - combined_entity_values.append(inner_combined_entity_values) - combined_original_texts.append(inner_combined_original_texts) - else: - combined_entity_values.append(values) - combined_original_texts.append(original_texts) + zipped_iter = six.moves.zip_longest(text_entity_values_list, original_texts_list, predetected_values) + for i, (values, original_texts, inner_predetected_values) in enumerate(zipped_iter): + inner_combined_entity_values, inner_combined_original_texts = self.combine_results( + values=values, + original_texts=original_texts, + predetected_values=inner_predetected_values if inner_predetected_values else []) + combined_entity_values.append(inner_combined_entity_values) + combined_original_texts.append(inner_combined_original_texts) return combined_entity_values, combined_original_texts @@ -360,57 +364,50 @@ def detect_entity(self, text, predetected_values=None, **kwargs): Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and returns two lists of detected text entities and their corresponding original substrings in text respectively. Note that datastore stores number of values under a entity_name and each entity_value has its own list of - variants, whenever a variant is matched sucessfully, the entity_value whose list the variant belongs to, + variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to, is returned. For more information on how data is stored, see Datastore docs. 
Args: - text (unicode): string to extract textual entities from - predetected_values(list of str): prior detection results + text (str): string to extract textual entities from + predetected_values (list of str): prior detection results **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. Returns: tuple: - list: containing entity value as defined into datastore - list: containing corresponding original substrings in text + list: list of dict with detected value against 'value' + list: list of str containing corresponding original substrings in text Example: DataStore().get_entity_dictionary('city') Output: { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], + u'Agartala': [u'Agartala'], + u'Barnala': [u'Barnala'], ... - u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], + u'chennai': [u'chennai', u'tamilnadu', u'madras'], u'hyderabad': [u'hyderabad'], u'koramangala': [u'koramangala'] } text_detection = TextDetector('city') text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year') Output: - ([u'Chennai', u'New Delhi', u'chennai'], ['chennai', 'delhi', 'tamilnadu']) + ([{'value': u'chennai'}, {'value': u'tamilnadu'}, {value': u'new delhi'}], + ['chennai', 'tamilnadu', 'delhi']) text_detection.tagged_text Output: ' come to __city__, __city__, i will visit __city__ next year ' """ + values, texts = [], [] + predetected_values = predetected_values or [] + self._process_text([text]) text_entity_values, original_texts = self._text_detection_with_variants() - # For single message detection predetected_values will be a list of str - # if present use combine_results to merge the results. 
- if predetected_values is None: - predetected_values = [] - - values, texts = [], [] - if len(text_entity_values) > 0 and len(original_texts) > 0: + if text_entity_values and original_texts: self.tagged_text = self.__tagged_texts[0] self.processed_text = self.__processed_texts[0] values, texts = text_entity_values[0], original_texts[0] - ner_logger.info("prior detection results - {}".format(predetected_values)) - if predetected_values: - ner_logger.info("combining results for {0}, {1}, {2}".format(values, texts, predetected_values)) - text_entity_verified_values, original_texts = self.combine_results( - values=values, - original_texts=texts, - predetected_values=predetected_values) - return text_entity_verified_values, original_texts + values, texts = self.combine_results(values=values, original_texts=texts, + predetected_values=predetected_values) + return values, texts def _text_detection_with_variants(self): From 34962673086bd8634facfcb6959a9b310c226d0c Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 30 Jan 2020 22:57:20 +0530 Subject: [PATCH 79/83] Fix few lint errors --- ner_v1/detectors/textual/name/name_detection.py | 6 +++--- ner_v2/detectors/temporal/date/en/date_detection.py | 4 ++-- ner_v2/detectors/temporal/date/standard_date_regex.py | 11 +++++------ 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index bd75fb0bb..07c4e15c1 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -5,12 +5,12 @@ from language_utilities.constant import ENGLISH_LANG, HINDI_LANG from lib.nlp.const import nltk_tokenizer from lib.nlp.pos import POS +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from ner_v1.constant import EMOJI_RANGES, FIRST_NAME, MIDDLE_NAME, LAST_NAME from ner_v1.detectors.textual.name.hindi_const import (HINDI_BADWORDS, HINDI_QUESTIONWORDS, HINDI_STOPWORDS, 
NAME_VARIATIONS, COMMON_HINDI_WORDS_OCCURING_WITH_NAME) from ner_v1.detectors.textual.text.text_detection import TextDetector -from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED # TODO: Refactor this module for readability and useability. Remove any hacks @@ -201,7 +201,6 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg self._update_processed_text(person_name_list=original_text) - return entity_value, original_text def detect_english_name(self, text=None): @@ -345,7 +344,8 @@ def replace_detected_text(self, text_detection_result, text): for detected_original_text in (text_detection_result[1]): for j in range(len(replaced_text_tokens)): - replaced_text_tokens[j] = replaced_text_tokens[j].replace(detected_original_text, "_" + detected_original_text + "_") + replaced_text_tokens[j] = replaced_text_tokens[j].replace( + detected_original_text, "_" + detected_original_text + "_") return replaced_text_tokens diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 3aa7c7e4c..d9d0ec781 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1999,8 +1999,8 @@ def normalize_year(self, year): future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: - if (((self.bot_message and past_regex.search(self.bot_message)) or - (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): + if (((self.bot_message and past_regex.search(self.bot_message)) + or (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index 42328ab03..44adbd7c6 100644 --- 
a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -2,7 +2,6 @@ import datetime import re - from dateutil.relativedelta import relativedelta from ner_v2.detectors.temporal.constant import (DATE_CONSTANT_FILE, DATETIME_CONSTANT_FILE, @@ -345,7 +344,7 @@ def _detect_date_ref_month_3(self, date_list, original_list): for date_match in date_ref_month_match: original = date_match[0] dd = self._get_int_from_numeral(date_match[1]) - if (self.now_date.day > dd and self.past_date_referenced) or\ + if (self.now_date.day > dd and self.past_date_referenced) or \ (self.now_date.day <= dd and not self.past_date_referenced): mm = self.now_date.month yy = self.now_date.year @@ -579,10 +578,10 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year try: # to catch dates which are not possible like "31/11" (october 31st) - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \ < self.now_date: yy += 1 - except: + except Exception: return date_list, original_list date = { @@ -628,8 +627,8 @@ def normalize_year(self, year): future_regex = None this_century = int(str(self.now_date.year)[:2]) if len(year) == 2: - if (((self.bot_message and past_regex and past_regex.search(self.bot_message)) or - (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): + if (((self.bot_message and past_regex and past_regex.search(self.bot_message)) + or (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))): return str(this_century - 1) + year elif present_regex and present_regex.search(self.bot_message): return str(this_century) + year From e71d8a8867a4b5624995a0557fb69444909aca8d Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: 
Fri, 31 Jan 2020 12:25:03 +0530 Subject: [PATCH 80/83] Add hindi detection tags for name detection --- ner_v1/detectors/textual/name/hindi_const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py index 488dedf03..84886e890 100644 --- a/ner_v1/detectors/textual/name/hindi_const.py +++ b/ner_v1/detectors/textual/name/hindi_const.py @@ -325,7 +325,7 @@ "your full name", "what is your name", "is your name", - u"नाम"] + u"नाम", u'नेम', u'*नाम*', u'*नेम*'] # Common hindi words occuring in context to a name COMMON_HINDI_WORDS_OCCURING_WITH_NAME = {u"मुझे", From 0f6d5f48a2648c93a073e0c3ef287f3a7edd5195 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Fri, 31 Jan 2020 19:48:37 +0530 Subject: [PATCH 81/83] NameDetector - Standalone name will only be detected if bot_mesaage is non null --- .../detectors/textual/name/name_detection.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 07c4e15c1..6f88b85d8 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -28,7 +28,7 @@ class NameDetector(object): on calling detect_entity() tagged_text: string with city entities replaced with tag defined by entity_name processed_text: string with detected time entities removed - text_detection_object: the object which is used to call the TextDetector + bot_message: previous message """ def __init__(self, entity_name, language=ENGLISH_LANG): @@ -47,7 +47,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG): self.processed_text = '' self.original_name_text = [] self.tag = '_' + entity_name + '_' - self.text_detection_object = TextDetector(entity_name=entity_name) + + self.bot_message = None @staticmethod def get_format_name(name_tokens, text): @@ -93,19 +94,6 @@ def 
get_format_name(name_tokens, text): original_text.append(name_text) return entity_value, original_text - def text_detection_name(self, text=None): - """ - Makes a call to TextDetection and return the person_name detected from the elastic search. - Returns: - Tuple with list of names detected in TextDetection in the form of variants detected and original_text - - Example : my name is yash doshi - - ([u'dosh', u'yash'], ['doshi', 'yash']) - """ - if text is None: - text = self.text - return self.text_detection_object.detect_entity(text=text) def get_name_using_pos_tagger(self, text): """ @@ -151,7 +139,7 @@ def get_name_using_pos_tagger(self, text): elif pattern4_match: entity_value, original_text = self.get_format_name(pattern4_match[0].split(), self.text) - elif len(name_tokens) < 4: + elif len(name_tokens) < 4 and self.bot_message: pos_words = [word[0] for word in tagged_names if word[1].startswith('NN') or word[1].startswith('JJ')] if pos_words: @@ -177,12 +165,13 @@ def detect_entity(self, text, bot_message=None, predetected_values=None, **kwarg self.text = text self.tagged_text = self.text + self.bot_message = bot_message entity_value, original_text = ([], []) if not predetected_values: - if bot_message: - if not self.context_check_botmessage(bot_message): + if self.bot_message: + if not self.context_check_botmessage(self.bot_message): return [], [] if self.language == ENGLISH_LANG: entity_value, original_text = self.detect_english_name() From 0a4394a282df0c1a06f1248299605f8a2261e73a Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 31 Jan 2020 21:07:36 +0530 Subject: [PATCH 82/83] Fix name detection test case --- .../detectors/textual/name/tests/test_name_detection.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 9bd63c262..a575ebadb 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py 
+++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -57,16 +57,13 @@ def preprocess_data(self): return test_dict - @mock.patch.object(NameDetector, "text_detection_name") - def test_person_name_detection(self, mock_text_detection_name): + def test_person_name_detection(self): for i in range(len(self.data)): message = self.test_dict['message'][i] expected_value = self.test_dict['expected_value'][i] - - mock_text_detection_name.return_value = json.loads(self.test_dict['mocked_values'][i]) - name_detector = NameDetector(language=self.test_dict['language'][i], entity_name='person_name') - detected_texts, original_texts = name_detector.detect_entity(text=message) + detected_texts, original_texts = name_detector.detect_entity(text=message, + bot_message='what is your name') for d in detected_texts: d.pop(MODEL_VERIFIED) d.pop(DATASTORE_VERIFIED) From 61ac48422416a085d199bce64971fc6939aa9443 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 31 Jan 2020 21:14:38 +0530 Subject: [PATCH 83/83] Fix lint errors --- ner_v1/detectors/textual/name/tests/test_name_detection.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index a575ebadb..5ed1a0794 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -4,8 +4,6 @@ import pandas as pd from django.test import TestCase -import mock -import json from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from ner_v1.detectors.textual.name.name_detection import NameDetector