From 9d96c72d7bd3f3b5543922c5004e4ffb9a3cd1f2 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 12 Apr 2019 02:28:39 +0530 Subject: [PATCH 01/12] Add a get_timezone utils functions to set timezone correctly when passed onto another constructor --- .../detectors/temporal/date/date_detection.py | 19 +++------- .../temporal/date/en/date_detection.py | 24 +++++------- .../temporal/date/standard_date_regex.py | 19 +++------- .../temporal/time/en/time_detection.py | 3 +- .../temporal/time/standard_time_regex.py | 11 +----- .../detectors/temporal/time/time_detection.py | 3 +- ner_v2/detectors/temporal/utils.py | 38 ++++++++++++++++++- 7 files changed, 65 insertions(+), 52 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 3f219d06e..04e52a210 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -5,19 +5,17 @@ import os import re -import pytz - import models.crf.constant as model_constant import ner_v2.detectors.temporal.constant as temporal_constant -from chatbot_ner.config import ner_logger from language_utilities.constant import ENGLISH_LANG, TRANSLATED_TEXT from language_utilities.utils import translate_text from models.crf.models import Models -from ner_constants import FROM_MESSAGE, FROM_MODEL_VERIFIED, FROM_MODEL_NOT_VERIFIED, FROM_STRUCTURE_VALUE_VERIFIED, \ - FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_FALLBACK_VALUE +from ner_constants import (FROM_MESSAGE, FROM_MODEL_VERIFIED, FROM_MODEL_NOT_VERIFIED, FROM_STRUCTURE_VALUE_VERIFIED, + FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_FALLBACK_VALUE) from ner_v2.detectors.base_detector import BaseDetector -from ner_v2.detectors.temporal.constant import TYPE_EXACT, TYPE_EVERYDAY, TYPE_PAST, \ - TYPE_NEXT_DAY, TYPE_REPEAT_DAY +from ner_v2.detectors.temporal.constant import (TYPE_EXACT, TYPE_EVERYDAY, TYPE_PAST, + TYPE_NEXT_DAY, TYPE_REPEAT_DAY) +from ner_v2.detectors.temporal.utils 
import get_timezone from ner_v2.detectors.utils import get_lang_data_path @@ -765,12 +763,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date self.original_date_text = [] self.entity_name = entity_name self.tag = '__' + entity_name + '__' - try: - self.timezone = pytz.timezone(timezone) - except Exception as e: - ner_logger.debug('Timezone error: %s ' % e) - self.timezone = pytz.timezone('UTC') - ner_logger.debug('Default timezone passed as "UTC"') + self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None self.language = language diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 12036b6f0..3ce428f67 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -2,13 +2,13 @@ import datetime import re -import pytz - -from chatbot_ner.config import ner_logger -from ner_v2.detectors.temporal.constant import TYPE_EXACT, TYPE_EVERYDAY, TYPE_TODAY, TYPE_TOMORROW, TYPE_YESTERDAY, \ - TYPE_DAY_AFTER, TYPE_DAY_BEFORE, TYPE_N_DAYS_AFTER, TYPE_NEXT_DAY, TYPE_THIS_DAY, TYPE_POSSIBLE_DAY, WEEKDAYS, \ - REPEAT_WEEKDAYS, WEEKENDS, REPEAT_WEEKENDS, TYPE_REPEAT_DAY, MONTH_DICT, DAY_DICT, ORDINALS_MAP -from ner_v2.detectors.temporal.utils import get_weekdays_for_month, get_next_date_with_dd, get_previous_date_with_dd +from ner_v2.detectors.temporal.constant import (TYPE_EXACT, TYPE_EVERYDAY, TYPE_TODAY, TYPE_TOMORROW, TYPE_YESTERDAY, + TYPE_DAY_AFTER, TYPE_DAY_BEFORE, TYPE_N_DAYS_AFTER, TYPE_NEXT_DAY, + TYPE_THIS_DAY, TYPE_POSSIBLE_DAY, WEEKDAYS, + REPEAT_WEEKDAYS, WEEKENDS, REPEAT_WEEKENDS, TYPE_REPEAT_DAY, + MONTH_DICT, DAY_DICT, ORDINALS_MAP) +from ner_v2.detectors.temporal.utils import (get_weekdays_for_month, get_next_date_with_dd, get_previous_date_with_dd, + get_timezone) class DateDetector(object): @@ -90,12 +90,7 @@ def __init__(self, 
entity_name, timezone='UTC', past_date_referenced=False): self.day_dictionary = {} self.entity_name = entity_name self.tag = '__' + entity_name + '__' - try: - self.timezone = pytz.timezone(timezone) - except Exception as e: - ner_logger.debug('Timezone error: %s ' % e) - self.timezone = pytz.timezone('UTC') - ner_logger.debug('Default timezone passed as "UTC"') + self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(tz=self.timezone) self.month_dictionary = MONTH_DICT self.day_dictionary = DAY_DICT @@ -875,7 +870,8 @@ def _yesterdays_date(self, date_list=None, original_list=None): original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b((yesterday|sterday|yesterdy|yestrdy|yestrday|previous day|prev day|prevday))\b') + regex_pattern = re.compile( + r'\b((yesterday|sterday|yesterdy|yestrdy|yestrday|previous day|prev day|prevday))\b') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index c4dceb475..efcea1aa4 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -3,15 +3,13 @@ import datetime import re -import pytz from dateutil.relativedelta import relativedelta -from chatbot_ner.config import ner_logger -from ner_v2.detectors.temporal.constant import TYPE_EXACT -from ner_v2.detectors.temporal.constant import DATE_CONSTANT_FILE, DATETIME_CONSTANT_FILE, \ - RELATIVE_DATE, DATE_LITERAL_TYPE, MONTH_LITERAL_TYPE, WEEKDAY_TYPE, \ - MONTH_TYPE, ADD_DIFF_DATETIME_TYPE, MONTH_DATE_REF_TYPE, NUMERALS_CONSTANT_FILE -from ner_v2.detectors.temporal.utils import next_weekday, nth_weekday, get_tuple_dict +from ner_v2.detectors.temporal.constant import (DATE_CONSTANT_FILE, DATETIME_CONSTANT_FILE, + RELATIVE_DATE, DATE_LITERAL_TYPE, MONTH_LITERAL_TYPE, WEEKDAY_TYPE, + 
MONTH_TYPE, ADD_DIFF_DATETIME_TYPE, MONTH_DATE_REF_TYPE, + NUMERALS_CONSTANT_FILE, TYPE_EXACT) +from ner_v2.detectors.temporal.utils import next_weekday, nth_weekday, get_tuple_dict, get_timezone class BaseRegexDate(object): @@ -32,12 +30,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC', past_date_r self.original_date_text = [] self.entity_name = entity_name self.tag = '__' + entity_name + '__' - try: - self.timezone = pytz.timezone(timezone) - except Exception as e: - ner_logger.debug('Timezone error: %s ' % e) - self.timezone = pytz.timezone('UTC') - ner_logger.debug('Default timezone passed as "UTC"') + self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index f1463bd50..707199065 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -4,6 +4,7 @@ import pytz from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE +from ner_v2.detectors.temporal.utils import get_timezone class TimeDetector(object): @@ -79,7 +80,7 @@ def __init__(self, entity_name, timezone='UTC'): self.original_time_text = [] self.tag = '__' + entity_name + '__' self.bot_message = None - self.timezone = timezone or 'UTC' + self.timezone = get_timezone(timezone) def set_bot_message(self, bot_message): """ diff --git a/ner_v2/detectors/temporal/time/standard_time_regex.py b/ner_v2/detectors/temporal/time/standard_time_regex.py index 9b94c0e80..c9644ba7d 100644 --- a/ner_v2/detectors/temporal/time/standard_time_regex.py +++ b/ner_v2/detectors/temporal/time/standard_time_regex.py @@ -5,14 +5,12 @@ import os import re -import pytz - from chatbot_ner.config import ner_logger from ner_v2.detectors.temporal.constant import (DATETIME_CONSTANT_FILE, ADD_DIFF_DATETIME_TYPE, 
NUMERALS_CONSTANT_FILE, TIME_CONSTANT_FILE, REF_DATETIME_TYPE, HOUR_TIME_TYPE, MINUTE_TIME_TYPE, DAYTIME_MERIDIEM, AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR) -from ner_v2.detectors.temporal.utils import get_tuple_dict, get_hour_min_diff +from ner_v2.detectors.temporal.utils import get_tuple_dict, get_hour_min_diff, get_timezone class BaseRegexTime(object): @@ -29,12 +27,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'): self.processed_text = '' self.entity_name = entity_name self.tag = '__' + entity_name + '__' - try: - self.timezone = pytz.timezone(timezone) - except Exception as e: - ner_logger.debug('Timezone error: %s ' % e) - self.timezone = pytz.timezone('UTC') - ner_logger.debug('Default timezone passed as "UTC"') + self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None diff --git a/ner_v2/detectors/temporal/time/time_detection.py b/ner_v2/detectors/temporal/time/time_detection.py index 509c14407..e7df5c386 100644 --- a/ner_v2/detectors/temporal/time/time_detection.py +++ b/ner_v2/detectors/temporal/time/time_detection.py @@ -4,6 +4,7 @@ from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.base_detector import BaseDetector +from ner_v2.detectors.temporal.utils import get_timezone from ner_v2.detectors.utils import get_lang_data_path @@ -60,7 +61,7 @@ def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG): self.time = [] self.original_time_text = [] self.tag = '__' + entity_name + '__' - self.timezone = timezone or 'UTC' + self.timezone = get_timezone(timezone) self.language = language try: diff --git a/ner_v2/detectors/temporal/utils.py b/ner_v2/detectors/temporal/utils.py index bb224cc18..264d5f8fb 100644 --- a/ner_v2/detectors/temporal/utils.py +++ b/ner_v2/detectors/temporal/utils.py @@ -1,8 +1,11 @@ import calendar -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo # FIXME: Change import 
to `import datetime` import pandas as pd +import pytz +import six +from chatbot_ner.config import ner_logger from ner_v2.detectors.temporal.constant import POSITIVE_TIME_DIFF, NEGATIVE_TIME_DIFF, CONSTANT_FILE_KEY @@ -260,3 +263,36 @@ def get_next_date_with_dd(dd, after_datetime): mm, yy = get_next_month_number(mm=mm, yy=yy) return None, None, None + + +def get_timezone(timezone, ignore_errors=True): + # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo + """ + Return a datetime.tzinfo (pytz timezone object). If `timezone` is a str, try constructing a pytz + timezone object with it. If an invalid timezone is mentioned and `ignore_errors` is True, an UTC timezone object + will be returned. If `timezone` is already a datetime.tzinfo object it will be returned as is + + Args: + timezone (str or datetime.tzinfo): Either a valid timezone string or datetime.tzinfo object + ignore_errors (bool, optional): when set to True, ignore errors and return a pytz.UTC when error occurs. When + set to False, raise exception when invalid timezone is given. Defaults to True. 
+ + Returns: + datetime.tzinfo: A pytz timezone object + + """ + if (not isinstance(timezone, six.string_types) and + isinstance(timezone, tzinfo) and + hasattr(timezone, 'localize')): + return timezone + + try: + timezone = pytz.timezone(timezone) + except Exception as e: + if ignore_errors: + ner_logger.debug('Timezone error: %s ' % e) + timezone = pytz.timezone('UTC') + ner_logger.debug('Using "UTC" as default timezone') + else: + raise + return timezone From e55564672e87373ae7a32d7657ab22e6b7aefb29 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 12 Apr 2019 02:41:51 +0530 Subject: [PATCH 02/12] fix a date initialisation error --- ner_v2/detectors/temporal/time/en/time_detection.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 707199065..ae7291ce7 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1,8 +1,5 @@ import re -from datetime import datetime - -import pytz - +import datetime from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE from ner_v2.detectors.temporal.utils import get_timezone @@ -81,6 +78,7 @@ def __init__(self, entity_name, timezone='UTC'): self.tag = '__' + entity_name + '__' self.bot_message = None self.timezone = get_timezone(timezone) + self.now_date = datetime.datetime.now(self.timezone) def set_bot_message(self, bot_message): """ @@ -1135,7 +1133,7 @@ def _get_meridiem(self, hours, mins): Returns meridiem type (str): returns the meridiem type whether its am and pm """ - current_datetime = datetime.now(pytz.timezone(self.timezone)) + current_datetime = self.now_date current_hour = current_datetime.hour current_min = current_datetime.minute if hours == 0 or hours >= TWELVE_HOUR: From 784e22bf352c481b56a9321bfcf9fcd4be867c93 Mon Sep 17 00:00:00 2001 From: Prathmesh Ghadge Date: 
Tue, 16 Apr 2019 10:54:24 +0530 Subject: [PATCH 03/12] Exclude "miscellaneous" pull requests from release notes Exclude "miscellaneous" pull requests from release notes --- .github/release-drafter.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 5026bfca6..047479fbd 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -19,3 +19,5 @@ categories: label: packages-updated - title: 👺 Miscellaneous label: miscellaneous +exclude-labels: + - miscellaneous From e973adf8edb218a9ba33e52236fc7958bb85ac65 Mon Sep 17 00:00:00 2001 From: viraj Date: Tue, 16 Apr 2019 15:11:52 +0530 Subject: [PATCH 04/12] add shopping quantity units for hindi add shopping quantity units for hindi --- ner_v2/detectors/numeral/number/hi/data/units.csv | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ner_v2/detectors/numeral/number/hi/data/units.csv b/ner_v2/detectors/numeral/number/hi/data/units.csv index e7bc47fed..53b9c82ca 100644 --- a/ner_v2/detectors/numeral/number/hi/data/units.csv +++ b/ner_v2/detectors/numeral/number/hi/data/units.csv @@ -1,3 +1,9 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | रूपीस | रुपया | रूपए| पैसा| पैसे| ₹ currency,dollar,Dollar | usd | डॉलर | $ +package_metric_unit,mg,mg | milligram | milligrams | mgs | मिलीग्राम | मिलिग्राम | मिल्लीग्राम | मिलीग्राम्स | मिल्लीग्राम्स | मिलिग्रामस +package_metric_unit,gms,gms | grams | gram | gm | g | ग्राम | ग्राम्स +package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग्राम | किलोग्राम्स | किलो +package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | 
मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर +package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर +package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा From 92d82a00649ac8b2f9c0cce88fc114eaea425fc5 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Thu, 18 Apr 2019 13:26:36 +0530 Subject: [PATCH 05/12] Skip next from-to pair if a pair of dates is detected in current pair --- ner_v2/detectors/temporal/date/date_detection.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 04e52a210..d1de987d3 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -1,4 +1,5 @@ # coding=utf-8 +import six import copy import datetime import importlib @@ -215,9 +216,13 @@ def _detect_range(self): date_dicts[1][temporal_constant.DATE_END_RANGE_PROPERTY] = True date_dict_list.extend(date_dicts) else: - parts = iter(re.split(r'\s+(?:\-|to|till|se)\s+', self.processed_text)) + parts = re.split(r'\s+(?:\-|to|till|se)\s+', self.processed_text) + skip_next_pair = False _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY] - for start_part, end_part in zip(parts, parts): # Consumes 2 items at a time from parts + for start_part, end_part in six.moves.zip(parts, parts[1:]): + if skip_next_pair: + continue + skip_next_pair = False start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True) end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True) if start_date_list and end_date_list: @@ -235,6 +240,7 @@ def _detect_range(self): end_date_list = [possible_end_date] date_dict_list.extend(start_date_list) date_dict_list.extend(end_date_list) + skip_next_pair
= True return date_dict_list def _fix_day_range(self, start_date_dict, end_date_dict): From 1dbe3f8ccaaf7d32e12ba1f7fb0fd057b0f4a13b Mon Sep 17 00:00:00 2001 From: chiragjn Date: Thu, 18 Apr 2019 13:38:58 +0530 Subject: [PATCH 06/12] Split by conjunctions first before finding date pairs --- .../detectors/temporal/date/date_detection.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index d1de987d3..815d0f3fd 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -1,4 +1,6 @@ # coding=utf-8 +import itertools + import six import copy import datetime @@ -216,31 +218,32 @@ def _detect_range(self): date_dicts[1][temporal_constant.DATE_END_RANGE_PROPERTY] = True date_dict_list.extend(date_dicts) else: - parts = re.split(r'\s+(?:\-|to|till|se)\s+', self.processed_text) - skip_next_pair = False - _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY] - for start_part, end_part in six.moves.zip(parts, parts[1:]): - if skip_next_pair: - continue + for sentence_part in re.split(r'\s+(?:and|aur|&|or)\s+', self.processed_text): + parts = re.split(r'\s+(?:\-|to|till|se)\s+', sentence_part) skip_next_pair = False - start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True) - end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True) - if start_date_list and end_date_list: - possible_start_date = start_date_list[0] - possible_end_date = end_date_list[-1] - start_date_type = possible_start_date[temporal_constant.DATE_VALUE]['type'] - end_date_type = possible_end_date[temporal_constant.DATE_VALUE]['type'] - if start_date_type in _day_of_week_types and end_date_type in _day_of_week_types: - start_date_list, end_date_list = self._fix_day_range(start_date_dict=possible_start_date, - 
end_date_dict=possible_end_date) - else: - # FIXME: Assumes end_date > start_date. Also can return dates in past when date detector - # returns dates in the past - start_date_list = [possible_start_date] - end_date_list = [possible_end_date] - date_dict_list.extend(start_date_list) - date_dict_list.extend(end_date_list) - skip_next_pair = True + _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY] + for start_part, end_part in six.moves.zip(parts, parts[1:]): + if skip_next_pair: + skip_next_pair = False + continue + start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True) + end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True) + if start_date_list and end_date_list: + possible_start_date = start_date_list[0] + possible_end_date = end_date_list[-1] + start_date_type = possible_start_date[temporal_constant.DATE_VALUE]['type'] + end_date_type = possible_end_date[temporal_constant.DATE_VALUE]['type'] + if start_date_type in _day_of_week_types and end_date_type in _day_of_week_types: + start_date_list, end_date_list = self._fix_day_range(start_date_dict=possible_start_date, + end_date_dict=possible_end_date) + else: + # FIXME: Assumes end_date > start_date. 
Also can return dates in past when date detector + # returns dates in the past + start_date_list = [possible_start_date] + end_date_list = [possible_end_date] + date_dict_list.extend(start_date_list) + date_dict_list.extend(end_date_list) + skip_next_pair = True return date_dict_list def _fix_day_range(self, start_date_dict, end_date_dict): From 20d4af0f8283aef721f52c8fb48cdb86c8e8826b Mon Sep 17 00:00:00 2001 From: chiragjn Date: Thu, 18 Apr 2019 13:45:14 +0530 Subject: [PATCH 07/12] Fix lint errors --- ner_v2/detectors/temporal/date/date_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 815d0f3fd..20214e041 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -1,13 +1,14 @@ # coding=utf-8 -import itertools +from __future__ import absolute_import -import six import copy import datetime import importlib import os import re +import six + import models.crf.constant as model_constant import ner_v2.detectors.temporal.constant as temporal_constant from language_utilities.constant import ENGLISH_LANG, TRANSLATED_TEXT From a6fb328897e86035de5fc445c7dde35d61dd723f Mon Sep 17 00:00:00 2001 From: chiragjn Date: Thu, 25 Apr 2019 13:15:46 +0530 Subject: [PATCH 08/12] Consider only the number of digits in integer part of floating point number when validating --- .../numeral/number/number_detection.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py index d2aec851b..cbfb0ebb8 100644 --- a/ner_v2/detectors/numeral/number/number_detection.py +++ b/ner_v2/detectors/numeral/number/number_detection.py @@ -1,8 +1,9 @@ import importlib +import math import os -from ner_v2.detectors.base_detector import BaseDetector from 
language_utilities.constant import ENGLISH_LANG +from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT from ner_v2.detectors.utils import get_lang_data_path @@ -50,6 +51,7 @@ class NumberDetector(BaseDetector): max_digit: maximum digit that a number can take """ + @staticmethod def get_supported_languages(): """ @@ -136,7 +138,7 @@ def detect_entity(self, text, **kwargs): for number_value_dict, original_text in zip(number_data[0], number_data[1]): number_value = number_value_dict[NUMBER_DETECTION_RETURN_DICT_VALUE] number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT] - if self.min_digit <= len(number_value) <= self.max_digit: + if self.min_digit <= self._num_digits(number_value) <= self.max_digit: if self.unit_type and (number_unit is None or self.language_number_detector.units_map[number_unit].type != self.unit_type): continue @@ -165,3 +167,20 @@ def set_min_max_digits(self, min_digit, max_digit): """ self.min_digit = min_digit self.max_digit = max_digit + + @staticmethod + def _num_digits(value): + """ + Calculate the number of digits in given number + + Args: + value (str or float or int): + + Returns: + int: number of digits in given number + + Raises: + ValueError: if the given string cannot be cast to float + """ + v = abs(float(value)) + return 1 if int(v) == 0 else (1 + int(math.log10(v))) From 07953e34b11c9e588712323cf899ad01abe6ddd8 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 26 Apr 2019 15:24:33 +0530 Subject: [PATCH 09/12] Fetch value only in source_ and add deprecation warnings --- datastore/constants.py | 1 + datastore/datastore.py | 8 +++ datastore/elastic_search/query.py | 95 +++++++++++++++++++------------ 3 files changed, 68 insertions(+), 36 deletions(-) diff --git a/datastore/constants.py b/datastore/constants.py index 41e656c36..f7f76da19 100644 --- a/datastore/constants.py +++ b/datastore/constants.py @@ 
-7,6 +7,7 @@ ELASTICSEARCH = 'elasticsearch' ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE +ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000 # settings dictionary key constants ENGINE = 'engine' diff --git a/datastore/datastore.py b/datastore/datastore.py index 7799f383b..6f5680400 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -1,3 +1,5 @@ +import warnings + import elastic_search from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from lib.singleton import Singleton @@ -120,6 +122,7 @@ def create(self, **kwargs): **kwargs ) + # FIXME: repopulate does not consider language of the variants def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): """ Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and @@ -181,6 +184,7 @@ def delete(self, **kwargs): ignore=[400, 404], **kwargs) + # FIXME: Deprecated, remove def get_entity_dictionary(self, entity_name, **kwargs): """ Args: @@ -214,6 +218,7 @@ def get_entity_dictionary(self, entity_name, **kwargs): ... 
u'koramangala': [u'koramangala']} """ + warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning) if self._client_or_connection is None: self._connect() results_dictionary = {} @@ -308,6 +313,7 @@ def delete_entity(self, entity_name, **kwargs): ignore=[400, 404], **kwargs) + # FIXME: repopulate does not consider language of the variants def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): """ Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by @@ -378,6 +384,7 @@ def exists(self): return False + # FIXME: Deprecated, remove def update_entity_data(self, entity_name, entity_data, language_script, **kwargs): """ This method is used to populate the the entity dictionary @@ -389,6 +396,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs For Elasticsearch: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk """ + warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning) if self._client_or_connection is None: self._connect() diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index c26569e5d..da49d982c 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -1,21 +1,24 @@ from __future__ import absolute_import +import collections # std imports import copy -from six import string_types +import json import re -import collections +import warnings + +from six import string_types # Local imports from datastore import constants from external_api.constants import SENTENCE_LIST, ENTITY_LIST from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER -import json log_prefix = 'datastore.elastic_search.query' +# Deprecated def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs): """ Get 
all variants data for a entity stored in the index as a dictionary @@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs): dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing synonyms/variants of the key """ + warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning) results_dictionary = {} data = { 'query': { @@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu "unique_values": { "terms": { "field": "value.keyword", - "size": 300000 + "size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE, } } }, @@ -283,12 +287,14 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu u'mumbai': u'mumbai', u'pune': u'pune'} """ - index = {'index': index_name, 'type': doc_type} + index_header = json.dumps({'index': index_name, 'type': doc_type}) data = [] - for sentence_ in sentences: - query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold, + for sentence in sentences: + query = _generate_es_search_dictionary(entity_name=entity_name, + text=sentence, + fuzziness_threshold=fuzziness_threshold, language_script=search_language_script) - data.extend([json.dumps(index), json.dumps(query)]) + data.extend([index_header, json.dumps(query)]) data = '\n'.join(data) kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name) @@ -359,17 +365,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting): return fuzzy_setting -def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None): +def _generate_es_search_dictionary(entity_name, text, + fuzziness_threshold=1, + language_script=ENGLISH_LANG, + size=constants.ELASTICSEARCH_SEARCH_SIZE, + as_json=False): """ Generates compound elasticsearch boolean search query dictionary for the sentence. 
The query generated searches for entity_name in the index and returns search results for the matched word (of sentence) only if entity_name is found. Args: - entity_name: name of the entity to perform a 'term' query on - text: The text on which we need to identify the enitites. - fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter - language_script: language of documents to be searched, optional, defaults to None + entity_name (str): name of the entity to perform a 'term' query on + text (str): The text on which we need to identify the enitites. + fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter. + Defaults to 1 + language_script (str, optional): language of documents to be searched, optional, defaults to 'en' + size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE` + as_json (bool, optional): Return the generated query as json string. useful for debug purposes. 
+ Defaults to False Returns: dictionary, the search query for the text @@ -386,24 +400,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu must_terms.append(term_dict_entity_name) # search on language_script, add english as default search - if language_script is not None: - term_dict_language = { - 'terms': { - 'language_script': [language_script, ENGLISH_LANG] - } + term_dict_language = { + 'terms': { + 'language_script': [ENGLISH_LANG] } - must_terms.append(term_dict_language) - - data = { - 'query': { - 'bool': { - 'must': must_terms, - 'should': [], - 'minimum_should_match': 1 - } - }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE } - query_should_data = [] + + if language_script != ENGLISH_LANG: + term_dict_language['terms']['language_script'].append(language_script) + + must_terms.append(term_dict_language) + + should_terms = [] query = { 'match': { 'variants': { @@ -413,15 +421,30 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu } } } - query_should_data.append(query) - data['query']['bool']['should'] = query_should_data - data['highlight'] = { - 'fields': { - 'variants': {} + should_terms.append(query) + + data = { + '_source': ['value'], + 'query': { + 'bool': { + 'must': must_terms, + 'should': should_terms, + 'minimum_should_match': 1 + }, + 'highlight': { + 'fields': { + 'variants': {} + }, + 'order': 'score', + 'number_of_fragments': 20 + } }, - 'order': 'score', - 'number_of_fragments': 20 + 'size': size } + + if as_json: + data = json.dumps(data) + return data From c9592b13b1f17b07efdcbe9a5de31f3b35b98f92 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 26 Apr 2019 15:30:18 +0530 Subject: [PATCH 10/12] Fix lint errors --- datastore/datastore.py | 13 ++++++++----- datastore/elastic_search/__init__.py | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 6f5680400..33e7f59d3 100644 --- a/datastore/datastore.py +++ 
b/datastore/datastore.py @@ -1,12 +1,15 @@ +from __future__ import absolute_import + import warnings -import elastic_search from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE +from datastore import elastic_search +from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY, + ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, + ELASTICSEARCH_CRF_DATA_DOC_TYPE) +from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException, + EngineConnectionException, NonESEngineTransferException, IndexNotFoundException) from lib.singleton import Singleton -from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY, - ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE) -from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException, - EngineConnectionException, NonESEngineTransferException, IndexNotFoundException) class DataStore(object): diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py index 20b6d27e0..34654dbfb 100644 --- a/datastore/elastic_search/__init__.py +++ b/datastore/elastic_search/__init__.py @@ -2,4 +2,4 @@ import create import populate import query -import transfer \ No newline at end of file +import transfer From 243f621e4025bbaf1715bf54a7da22fd7306edde Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 26 Apr 2019 15:41:27 +0530 Subject: [PATCH 11/12] Add highlight at the correct level in ES query --- datastore/elastic_search/query.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index da49d982c..aaf69bb8d 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -294,7 +294,8 @@ def full_text_query(connection, index_name, doc_type, entity_name, 
sentences, fu text=sentence, fuzziness_threshold=fuzziness_threshold, language_script=search_language_script) - data.extend([index_header, json.dumps(query)]) + data.append(index_header) + data.append(json.dumps(query)) data = '\n'.join(data) kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name) @@ -431,13 +432,13 @@ def _generate_es_search_dictionary(entity_name, text, 'should': should_terms, 'minimum_should_match': 1 }, - 'highlight': { - 'fields': { - 'variants': {} - }, - 'order': 'score', - 'number_of_fragments': 20 - } + }, + 'highlight': { + 'fields': { + 'variants': {} + }, + 'order': 'score', + 'number_of_fragments': 20 }, 'size': size } From e4a4dae519532d85fe8349d5f08bbbfa5f796603 Mon Sep 17 00:00:00 2001 From: chiragjn Date: Fri, 26 Apr 2019 16:29:32 +0530 Subject: [PATCH 12/12] Switch to unified highlighter for faster search on larger documents --- datastore/elastic_search/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index aaf69bb8d..b6cbbac14 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -435,7 +435,9 @@ def _generate_es_search_dictionary(entity_name, text, }, 'highlight': { 'fields': { - 'variants': {} + 'variants': { + 'type': 'unified' # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain' + } }, 'order': 'score', 'number_of_fragments': 20