diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
index 5026bfca6..047479fbd 100644
--- a/.github/release-drafter.yml
+++ b/.github/release-drafter.yml
@@ -19,3 +19,5 @@ categories:
     label: packages-updated
   - title: 👺 Miscellaneous
     label: miscellaneous
+exclude-labels:
+  - miscellaneous
diff --git a/datastore/constants.py b/datastore/constants.py
index 41e656c36..f7f76da19 100644
--- a/datastore/constants.py
+++ b/datastore/constants.py
@@ -7,6 +7,7 @@
 ELASTICSEARCH = 'elasticsearch'
 ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
 ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
+ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000
 
 # settings dictionary key constants
 ENGINE = 'engine'
diff --git a/datastore/datastore.py b/datastore/datastore.py
index 7799f383b..33e7f59d3 100644
--- a/datastore/datastore.py
+++ b/datastore/datastore.py
@@ -1,10 +1,15 @@
-import elastic_search
+from __future__ import absolute_import
+
+import warnings
+
 from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
+from datastore import elastic_search
+from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
+                                 ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
+                                 ELASTICSEARCH_CRF_DATA_DOC_TYPE)
+from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
+                                  EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 from lib.singleton import Singleton
-from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
-                        ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
-from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
-                         EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 
 
 class DataStore(object):
@@ -120,6 +125,7 @@ def create(self, **kwargs):
             **kwargs
         )
 
+    # FIXME: repopulate does not consider language of the variants
     def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
@@ -181,6 +187,7 @@ def delete(self, **kwargs):
                                            ignore=[400, 404],
                                            **kwargs)
 
+    # FIXME: Deprecated, remove
     def get_entity_dictionary(self, entity_name, **kwargs):
         """
         Args:
@@ -214,6 +221,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
             ...
             u'koramangala': [u'koramangala']}
         """
+        warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning)
        if self._client_or_connection is None:
            self._connect()
        results_dictionary = {}
@@ -308,6 +316,7 @@ def delete_entity(self, entity_name, **kwargs):
                                            ignore=[400, 404],
                                            **kwargs)
 
+    # FIXME: repopulate does not consider language of the variants
     def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
@@ -378,6 +387,7 @@ def exists(self):
 
         return False
 
+    # FIXME: Deprecated, remove
     def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
         """
         This method is used to populate the the entity dictionary
@@ -389,6 +399,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs
             For Elasticsearch:
                 Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
         """
+        warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
 
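# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: the DataStore methods above are kept but now emit
# DeprecationWarning, pointing callers at get_entity_data()/add_entity_data(). DeprecationWarning is
# usually hidden by default, so callers opt in to seeing it (or promote it to an error in test runs).
# deprecated_fetch() below is a hypothetical stand-in for the deprecated methods, not a chatbot_ner API.
import warnings


def deprecated_fetch():
    # mirrors the warnings.warn(...) pattern used in get_entity_dictionary()/update_entity_data()
    warnings.warn("deprecated_fetch() is deprecated; please use the newer accessor", DeprecationWarning)
    return {}


if __name__ == '__main__':
    with warnings.catch_warnings():
        warnings.simplefilter('always', DeprecationWarning)  # surface the warning on every call
        deprecated_fetch()
    with warnings.catch_warnings():
        warnings.simplefilter('error', DeprecationWarning)   # turn it into an exception, e.g. in CI
        try:
            deprecated_fetch()
        except DeprecationWarning as exc:
            print('caught: %s' % exc)
# --------------------------------------------------------------------------------------------------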
diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py
index 20b6d27e0..34654dbfb 100644
--- a/datastore/elastic_search/__init__.py
+++ b/datastore/elastic_search/__init__.py
@@ -2,4 +2,4 @@
 import create
 import populate
 import query
-import transfer
\ No newline at end of file
+import transfer
diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
index c26569e5d..b6cbbac14 100644
--- a/datastore/elastic_search/query.py
+++ b/datastore/elastic_search/query.py
@@ -1,21 +1,24 @@
 from __future__ import absolute_import
 
+import collections
 # std imports
 import copy
-from six import string_types
+import json
 import re
-import collections
+import warnings
+
+from six import string_types
 
 # Local imports
 from datastore import constants
 from external_api.constants import SENTENCE_LIST, ENTITY_LIST
 from language_utilities.constant import ENGLISH_LANG
 from lib.nlp.const import TOKENIZER
-import json
 
 log_prefix = 'datastore.elastic_search.query'
 
 
+# Deprecated
 def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
     """
     Get all variants data for a entity stored in the index as a dictionary
@@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
         dictionary, search results of the 'term' query on entity_name, mapping keys to
         lists containing synonyms/variants of the key
     """
+    warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning)
     results_dictionary = {}
     data = {
         'query': {
@@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
             "unique_values": {
                 "terms": {
                     "field": "value.keyword",
-                    "size": 300000
+                    "size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
                 }
             }
         },
@@ -283,12 +287,15 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
         u'mumbai': u'mumbai',
         u'pune': u'pune'}
     """
-    index = {'index': index_name, 'type': doc_type}
+    index_header = json.dumps({'index': index_name, 'type': doc_type})
     data = []
-    for sentence_ in sentences:
-        query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
+    for sentence in sentences:
+        query = _generate_es_search_dictionary(entity_name=entity_name,
+                                               text=sentence,
+                                               fuzziness_threshold=fuzziness_threshold,
                                                language_script=search_language_script)
-        data.extend([json.dumps(index), json.dumps(query)])
+        data.append(index_header)
+        data.append(json.dumps(query))
 
     data = '\n'.join(data)
 
     kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
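# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: full_text_query() above now assembles an Elasticsearch
# msearch body by hand -- newline-delimited JSON where each search is a header line (index/doc type)
# followed by its query line. The index/doc-type names and query bodies below are examples only; the
# REST _msearch endpoint also expects the body to end with a newline.
import json


def build_msearch_body(index_name, doc_type, queries):
    """Interleave one header line and one query line per search, NDJSON style."""
    header = json.dumps({'index': index_name, 'type': doc_type})
    lines = []
    for query in queries:
        lines.append(header)
        lines.append(json.dumps(query))
    return '\n'.join(lines) + '\n'


example_queries = [
    {'query': {'match': {'variants': {'query': 'mumbai to pune', 'fuzziness': 1}}}},
    {'query': {'match': {'variants': {'query': 'book a cab', 'fuzziness': 1}}}},
]
print(build_msearch_body('entity_data', 'data_dictionary', example_queries))
# --------------------------------------------------------------------------------------------------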
@@ -359,17 +366,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
     return fuzzy_setting
 
 
-def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
+def _generate_es_search_dictionary(entity_name, text,
+                                   fuzziness_threshold=1,
+                                   language_script=ENGLISH_LANG,
+                                   size=constants.ELASTICSEARCH_SEARCH_SIZE,
+                                   as_json=False):
     """
     Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
     searches for entity_name in the index and returns search results for the matched word (of sentence)
     only if entity_name is found.
 
     Args:
-        entity_name: name of the entity to perform a 'term' query on
-        text: The text on which we need to identify the enitites.
-        fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
-        language_script: language of documents to be searched, optional, defaults to None
+        entity_name (str): name of the entity to perform a 'term' query on
+        text (str): The text on which we need to identify the entities.
+        fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter.
+                                             Defaults to 1
+        language_script (str, optional): language of documents to be searched, defaults to 'en'
+        size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
+        as_json (bool, optional): Return the generated query as a json string. Useful for debug purposes.
+                                  Defaults to False
 
     Returns:
         dictionary, the search query for the text
@@ -386,24 +401,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
     must_terms.append(term_dict_entity_name)
 
     # search on language_script, add english as default search
-    if language_script is not None:
-        term_dict_language = {
-            'terms': {
-                'language_script': [language_script, ENGLISH_LANG]
-            }
+    term_dict_language = {
+        'terms': {
+            'language_script': [ENGLISH_LANG]
         }
-        must_terms.append(term_dict_language)
-
-    data = {
-        'query': {
-            'bool': {
-                'must': must_terms,
-                'should': [],
-                'minimum_should_match': 1
-            }
-        }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
     }
-    query_should_data = []
+
+    if language_script != ENGLISH_LANG:
+        term_dict_language['terms']['language_script'].append(language_script)
+
+    must_terms.append(term_dict_language)
+
+    should_terms = []
     query = {
         'match': {
             'variants': {
@@ -413,15 +422,32 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
             }
         }
     }
-    query_should_data.append(query)
-    data['query']['bool']['should'] = query_should_data
-    data['highlight'] = {
-        'fields': {
-            'variants': {}
+    should_terms.append(query)
+
+    data = {
+        '_source': ['value'],
+        'query': {
+            'bool': {
+                'must': must_terms,
+                'should': should_terms,
+                'minimum_should_match': 1
+            },
         },
-        'order': 'score',
-        'number_of_fragments': 20
+        'highlight': {
+            'fields': {
+                'variants': {
+                    'type': 'unified'  # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain'
+                }
+            },
+            'order': 'score',
+            'number_of_fragments': 20
+        },
+        'size': size
     }
+
+    if as_json:
+        data = json.dumps(data)
+
     return data
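# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: a standalone approximation of the search body the
# refactored _generate_es_search_dictionary() assembles -- a bool query with 'must' term filters,
# a fuzzy 'should' match on variants, '_source' trimmed to 'value', unified highlighting and a size
# cap. The term-filter field name ('entity_data') and the exact match options are assumptions for
# illustration; the hunks above do not show them.
import json


def sketch_search_body(entity_name, text, fuzziness_threshold=1, language_script='en', size=10000):
    language_scripts = ['en'] if language_script == 'en' else ['en', language_script]
    return {
        '_source': ['value'],
        'query': {
            'bool': {
                'must': [
                    {'term': {'entity_data': entity_name}},  # assumed field name
                    {'terms': {'language_script': language_scripts}},
                ],
                'should': [
                    {'match': {'variants': {'query': text, 'fuzziness': fuzziness_threshold}}},
                ],
                'minimum_should_match': 1,
            },
        },
        'highlight': {'fields': {'variants': {'type': 'unified'}}, 'order': 'score', 'number_of_fragments': 20},
        'size': size,
    }


print(json.dumps(sketch_search_body('city', 'mumbai to pune', size=100), indent=2))
# --------------------------------------------------------------------------------------------------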
diff --git a/ner_v2/detectors/numeral/number/hi/data/units.csv b/ner_v2/detectors/numeral/number/hi/data/units.csv
index e7bc47fed..53b9c82ca 100644
--- a/ner_v2/detectors/numeral/number/hi/data/units.csv
+++ b/ner_v2/detectors/numeral/number/hi/data/units.csv
@@ -1,3 +1,9 @@
 unit_type,unit_value,unit_variants
 currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | रूपीस | रुपया | रूपए| पैसा| पैसे| ₹
 currency,dollar,Dollar | usd | डॉलर | $
+package_metric_unit,mg,mg | milligram | milligrams | mgs | मिलीग्राम | मिलिग्राम | मिल्लीग्राम | मिलीग्राम्स | मिल्लीग्राम्स | मिलिग्रामस
+package_metric_unit,gms,gms | grams | gram | gm | g | ग्राम | ग्राम्स
+package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग्राम | किलोग्राम्स | किलो
+package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर
+package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर
+package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा
diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py
index d2aec851b..cbfb0ebb8 100644
--- a/ner_v2/detectors/numeral/number/number_detection.py
+++ b/ner_v2/detectors/numeral/number/number_detection.py
@@ -1,8 +1,9 @@
 import importlib
+import math
 import os
 
-from ner_v2.detectors.base_detector import BaseDetector
 from language_utilities.constant import ENGLISH_LANG
+from ner_v2.detectors.base_detector import BaseDetector
 from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT
 from ner_v2.detectors.utils import get_lang_data_path
 
@@ -50,6 +51,7 @@ class NumberDetector(BaseDetector):
 
         max_digit: maximum digit that a number can take
     """
+
     @staticmethod
     def get_supported_languages():
         """
@@ -136,7 +138,7 @@ def detect_entity(self, text, **kwargs):
         for number_value_dict, original_text in zip(number_data[0], number_data[1]):
             number_value = number_value_dict[NUMBER_DETECTION_RETURN_DICT_VALUE]
             number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT]
-            if self.min_digit <= len(number_value) <= self.max_digit:
+            if self.min_digit <= self._num_digits(number_value) <= self.max_digit:
                 if self.unit_type and (number_unit is None or
                                        self.language_number_detector.units_map[number_unit].type != self.unit_type):
                     continue
@@ -165,3 +167,20 @@ def set_min_max_digits(self, min_digit, max_digit):
         """
         self.min_digit = min_digit
         self.max_digit = max_digit
+
+    @staticmethod
+    def _num_digits(value):
+        """
+        Calculate the number of digits in the given number
+
+        Args:
+            value (str or float or int): number to count the digits of
+
+        Returns:
+            int: number of digits in the given number
+
+        Raises:
+            ValueError: if the given string cannot be cast to float
+        """
+        v = abs(float(value))
+        return 1 if int(v) == 0 else (1 + int(math.log10(v)))
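# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: detect_entity() now bounds-checks the digit count via
# _num_digits() instead of len(number_value). The helper below copies the body from the diff; the
# sample values are arbitrary and show how character length and digit count differ for decimals.
import math


def num_digits(value):
    """Count the digits of the integral part; raises ValueError for non-numeric strings."""
    v = abs(float(value))
    return 1 if int(v) == 0 else (1 + int(math.log10(v)))


for sample in ['0', '7', '42', '4200', '3.14159', '0.002']:
    # len() counts characters (dot and fractional digits included); num_digits() does not
    print('%s -> len=%d, num_digits=%d' % (sample, len(sample), num_digits(sample)))
# --------------------------------------------------------------------------------------------------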
diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py
index 3f219d06e..20214e041 100644
--- a/ner_v2/detectors/temporal/date/date_detection.py
+++ b/ner_v2/detectors/temporal/date/date_detection.py
@@ -1,23 +1,25 @@
 # coding=utf-8
+from __future__ import absolute_import
+
 import copy
 import datetime
 import importlib
 import os
 import re
 
-import pytz
+import six
 
 import models.crf.constant as model_constant
 import ner_v2.detectors.temporal.constant as temporal_constant
-from chatbot_ner.config import ner_logger
 from language_utilities.constant import ENGLISH_LANG, TRANSLATED_TEXT
 from language_utilities.utils import translate_text
 from models.crf.models import Models
-from ner_constants import FROM_MESSAGE, FROM_MODEL_VERIFIED, FROM_MODEL_NOT_VERIFIED, FROM_STRUCTURE_VALUE_VERIFIED, \
-    FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_FALLBACK_VALUE
+from ner_constants import (FROM_MESSAGE, FROM_MODEL_VERIFIED, FROM_MODEL_NOT_VERIFIED, FROM_STRUCTURE_VALUE_VERIFIED,
+                           FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_FALLBACK_VALUE)
 from ner_v2.detectors.base_detector import BaseDetector
-from ner_v2.detectors.temporal.constant import TYPE_EXACT, TYPE_EVERYDAY, TYPE_PAST, \
-    TYPE_NEXT_DAY, TYPE_REPEAT_DAY
+from ner_v2.detectors.temporal.constant import (TYPE_EXACT, TYPE_EVERYDAY, TYPE_PAST,
+                                                TYPE_NEXT_DAY, TYPE_REPEAT_DAY)
+from ner_v2.detectors.temporal.utils import get_timezone
 from ner_v2.detectors.utils import get_lang_data_path
 
 
@@ -217,26 +219,32 @@ def _detect_range(self):
                 date_dicts[1][temporal_constant.DATE_END_RANGE_PROPERTY] = True
                 date_dict_list.extend(date_dicts)
         else:
-            parts = iter(re.split(r'\s+(?:\-|to|till|se)\s+', self.processed_text))
-            _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY]
-            for start_part, end_part in zip(parts, parts):  # Consumes 2 items at a time from parts
-                start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True)
-                end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True)
-                if start_date_list and end_date_list:
-                    possible_start_date = start_date_list[0]
-                    possible_end_date = end_date_list[-1]
-                    start_date_type = possible_start_date[temporal_constant.DATE_VALUE]['type']
-                    end_date_type = possible_end_date[temporal_constant.DATE_VALUE]['type']
-                    if start_date_type in _day_of_week_types and end_date_type in _day_of_week_types:
-                        start_date_list, end_date_list = self._fix_day_range(start_date_dict=possible_start_date,
-                                                                             end_date_dict=possible_end_date)
-                    else:
-                        # FIXME: Assumes end_date > start_date. Also can return dates in past when date detector
-                        # returns dates in the past
-                        start_date_list = [possible_start_date]
-                        end_date_list = [possible_end_date]
-                    date_dict_list.extend(start_date_list)
-                    date_dict_list.extend(end_date_list)
+            for sentence_part in re.split(r'\s+(?:and|aur|&|or)\s+', self.processed_text):
+                parts = re.split(r'\s+(?:\-|to|till|se)\s+', sentence_part)
+                skip_next_pair = False
+                _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY]
+                for start_part, end_part in six.moves.zip(parts, parts[1:]):
+                    if skip_next_pair:
+                        skip_next_pair = False
+                        continue
+                    start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True)
+                    end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True)
+                    if start_date_list and end_date_list:
+                        possible_start_date = start_date_list[0]
+                        possible_end_date = end_date_list[-1]
+                        start_date_type = possible_start_date[temporal_constant.DATE_VALUE]['type']
+                        end_date_type = possible_end_date[temporal_constant.DATE_VALUE]['type']
+                        if start_date_type in _day_of_week_types and end_date_type in _day_of_week_types:
+                            start_date_list, end_date_list = self._fix_day_range(start_date_dict=possible_start_date,
+                                                                                 end_date_dict=possible_end_date)
+                        else:
+                            # FIXME: Assumes end_date > start_date. Also can return dates in past when date detector
+                            # returns dates in the past
+                            start_date_list = [possible_start_date]
+                            end_date_list = [possible_end_date]
+                        date_dict_list.extend(start_date_list)
+                        date_dict_list.extend(end_date_list)
+                        skip_next_pair = True
         return date_dict_list
 
     def _fix_day_range(self, start_date_dict, end_date_dict):
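# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: the rewritten _detect_range() above first splits the
# text on connectives (and/aur/&/or), then walks adjacent "to/till/se/-" separated parts pairwise,
# skipping the next pair once a range has been consumed. The simplified stand-in below treats every
# part as a valid date endpoint, which the real detector does not.
import re


def sketch_range_pairs(text):
    pairs = []
    for sentence_part in re.split(r'\s+(?:and|aur|&|or)\s+', text):
        parts = re.split(r'\s+(?:\-|to|till|se)\s+', sentence_part)
        skip_next_pair = False
        for start_part, end_part in zip(parts, parts[1:]):
            if skip_next_pair:
                # the previous iteration already used this element as the end of a range
                skip_next_pair = False
                continue
            pairs.append((start_part, end_part))
            skip_next_pair = True
    return pairs


print(sketch_range_pairs('21st Nov to 25th Nov and 3rd Dec till 5th Dec'))
# -> [('21st Nov', '25th Nov'), ('3rd Dec', '5th Dec')]
# --------------------------------------------------------------------------------------------------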
Also can return dates in past when date detector - # returns dates in the past - start_date_list = [possible_start_date] - end_date_list = [possible_end_date] - date_dict_list.extend(start_date_list) - date_dict_list.extend(end_date_list) + for sentence_part in re.split(r'\s+(?:and|aur|&|or)\s+', self.processed_text): + parts = re.split(r'\s+(?:\-|to|till|se)\s+', sentence_part) + skip_next_pair = False + _day_of_week_types = [temporal_constant.TYPE_THIS_DAY, temporal_constant.TYPE_NEXT_DAY] + for start_part, end_part in six.moves.zip(parts, parts[1:]): + if skip_next_pair: + skip_next_pair = False + continue + start_date_list = self._date_dict_from_text(text=start_part, start_range_property=True) + end_date_list = self._date_dict_from_text(text=end_part, end_range_property=True) + if start_date_list and end_date_list: + possible_start_date = start_date_list[0] + possible_end_date = end_date_list[-1] + start_date_type = possible_start_date[temporal_constant.DATE_VALUE]['type'] + end_date_type = possible_end_date[temporal_constant.DATE_VALUE]['type'] + if start_date_type in _day_of_week_types and end_date_type in _day_of_week_types: + start_date_list, end_date_list = self._fix_day_range(start_date_dict=possible_start_date, + end_date_dict=possible_end_date) + else: + # FIXME: Assumes end_date > start_date. Also can return dates in past when date detector + # returns dates in the past + start_date_list = [possible_start_date] + end_date_list = [possible_end_date] + date_dict_list.extend(start_date_list) + date_dict_list.extend(end_date_list) + skip_next_pair = True return date_dict_list def _fix_day_range(self, start_date_dict, end_date_dict): @@ -765,12 +773,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date self.original_date_text = [] self.entity_name = entity_name self.tag = '__' + entity_name + '__' - try: - self.timezone = pytz.timezone(timezone) - except Exception as e: - ner_logger.debug('Timezone error: %s ' % e) - self.timezone = pytz.timezone('UTC') - ner_logger.debug('Default timezone passed as "UTC"') + self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None self.language = language diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 12036b6f0..3ce428f67 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -2,13 +2,13 @@ import datetime import re -import pytz - -from chatbot_ner.config import ner_logger -from ner_v2.detectors.temporal.constant import TYPE_EXACT, TYPE_EVERYDAY, TYPE_TODAY, TYPE_TOMORROW, TYPE_YESTERDAY, \ - TYPE_DAY_AFTER, TYPE_DAY_BEFORE, TYPE_N_DAYS_AFTER, TYPE_NEXT_DAY, TYPE_THIS_DAY, TYPE_POSSIBLE_DAY, WEEKDAYS, \ - REPEAT_WEEKDAYS, WEEKENDS, REPEAT_WEEKENDS, TYPE_REPEAT_DAY, MONTH_DICT, DAY_DICT, ORDINALS_MAP -from ner_v2.detectors.temporal.utils import get_weekdays_for_month, get_next_date_with_dd, get_previous_date_with_dd +from ner_v2.detectors.temporal.constant import (TYPE_EXACT, TYPE_EVERYDAY, TYPE_TODAY, TYPE_TOMORROW, TYPE_YESTERDAY, + TYPE_DAY_AFTER, TYPE_DAY_BEFORE, TYPE_N_DAYS_AFTER, TYPE_NEXT_DAY, + TYPE_THIS_DAY, TYPE_POSSIBLE_DAY, WEEKDAYS, + REPEAT_WEEKDAYS, WEEKENDS, REPEAT_WEEKENDS, TYPE_REPEAT_DAY, + MONTH_DICT, DAY_DICT, ORDINALS_MAP) +from ner_v2.detectors.temporal.utils import (get_weekdays_for_month, get_next_date_with_dd, get_previous_date_with_dd, + get_timezone) class DateDetector(object): @@ 
@@ -90,12 +90,7 @@ def __init__(self, entity_name, timezone='UTC', past_date_referenced=False):
         self.day_dictionary = {}
         self.entity_name = entity_name
         self.tag = '__' + entity_name + '__'
-        try:
-            self.timezone = pytz.timezone(timezone)
-        except Exception as e:
-            ner_logger.debug('Timezone error: %s ' % e)
-            self.timezone = pytz.timezone('UTC')
-            ner_logger.debug('Default timezone passed as "UTC"')
+        self.timezone = get_timezone(timezone)
         self.now_date = datetime.datetime.now(tz=self.timezone)
         self.month_dictionary = MONTH_DICT
         self.day_dictionary = DAY_DICT
@@ -875,7 +870,8 @@ def _yesterdays_date(self, date_list=None, original_list=None):
             original_list = []
         if date_list is None:
             date_list = []
-        regex_pattern = re.compile(r'\b((yesterday|sterday|yesterdy|yestrdy|yestrday|previous day|prev day|prevday))\b')
+        regex_pattern = re.compile(
+            r'\b((yesterday|sterday|yesterdy|yestrdy|yestrday|previous day|prev day|prevday))\b')
         patterns = regex_pattern.findall(self.processed_text.lower())
         for pattern in patterns:
             original = pattern[0]
diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py
index c4dceb475..efcea1aa4 100644
--- a/ner_v2/detectors/temporal/date/standard_date_regex.py
+++ b/ner_v2/detectors/temporal/date/standard_date_regex.py
@@ -3,15 +3,13 @@
 import datetime
 import re
 
-import pytz
 from dateutil.relativedelta import relativedelta
 
-from chatbot_ner.config import ner_logger
-from ner_v2.detectors.temporal.constant import TYPE_EXACT
-from ner_v2.detectors.temporal.constant import DATE_CONSTANT_FILE, DATETIME_CONSTANT_FILE, \
-    RELATIVE_DATE, DATE_LITERAL_TYPE, MONTH_LITERAL_TYPE, WEEKDAY_TYPE, \
-    MONTH_TYPE, ADD_DIFF_DATETIME_TYPE, MONTH_DATE_REF_TYPE, NUMERALS_CONSTANT_FILE
-from ner_v2.detectors.temporal.utils import next_weekday, nth_weekday, get_tuple_dict
+from ner_v2.detectors.temporal.constant import (DATE_CONSTANT_FILE, DATETIME_CONSTANT_FILE,
+                                                RELATIVE_DATE, DATE_LITERAL_TYPE, MONTH_LITERAL_TYPE, WEEKDAY_TYPE,
+                                                MONTH_TYPE, ADD_DIFF_DATETIME_TYPE, MONTH_DATE_REF_TYPE,
+                                                NUMERALS_CONSTANT_FILE, TYPE_EXACT)
+from ner_v2.detectors.temporal.utils import next_weekday, nth_weekday, get_tuple_dict, get_timezone
 
 
 class BaseRegexDate(object):
@@ -32,12 +30,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC', past_date_r
         self.original_date_text = []
         self.entity_name = entity_name
         self.tag = '__' + entity_name + '__'
-        try:
-            self.timezone = pytz.timezone(timezone)
-        except Exception as e:
-            ner_logger.debug('Timezone error: %s ' % e)
-            self.timezone = pytz.timezone('UTC')
-            ner_logger.debug('Default timezone passed as "UTC"')
+        self.timezone = get_timezone(timezone)
         self.now_date = datetime.datetime.now(tz=self.timezone)
 
         self.bot_message = None
diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py
index f1463bd50..ae7291ce7 100644
--- a/ner_v2/detectors/temporal/time/en/time_detection.py
+++ b/ner_v2/detectors/temporal/time/en/time_detection.py
@@ -1,9 +1,7 @@
 import re
-from datetime import datetime
-
-import pytz
-
+import datetime
 from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE
+from ner_v2.detectors.temporal.utils import get_timezone
 
 
 class TimeDetector(object):
@@ -79,7 +77,8 @@ def __init__(self, entity_name, timezone='UTC'):
         self.original_time_text = []
         self.tag = '__' + entity_name + '__'
         self.bot_message = None
-        self.timezone = timezone or 'UTC'
+        self.timezone = get_timezone(timezone)
+        self.now_date = datetime.datetime.now(self.timezone)
 
     def set_bot_message(self, bot_message):
         """
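# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: the __init__ change above resolves the timezone once and
# caches a timezone-aware "now", which _get_meridiem() (next hunk) reads instead of re-parsing the
# timezone string on every call. A standalone illustration of that pattern with pytz; 'Asia/Kolkata'
# is just an example zone, and the class below is not part of chatbot_ner.
import datetime

import pytz


class MeridiemSketch(object):
    def __init__(self, timezone='UTC'):
        self.timezone = pytz.timezone(timezone)
        self.now_date = datetime.datetime.now(self.timezone)  # aware datetime, resolved once

    def current_hour_min(self):
        return self.now_date.hour, self.now_date.minute


print(MeridiemSketch('Asia/Kolkata').current_hour_min())
# --------------------------------------------------------------------------------------------------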
@@ -1134,7 +1133,7 @@ def _get_meridiem(self, hours, mins):
         Returns
             meridiem type (str): returns the meridiem type whether its am and pm
         """
-        current_datetime = datetime.now(pytz.timezone(self.timezone))
+        current_datetime = self.now_date
         current_hour = current_datetime.hour
         current_min = current_datetime.minute
         if hours == 0 or hours >= TWELVE_HOUR:
diff --git a/ner_v2/detectors/temporal/time/standard_time_regex.py b/ner_v2/detectors/temporal/time/standard_time_regex.py
index 9b94c0e80..c9644ba7d 100644
--- a/ner_v2/detectors/temporal/time/standard_time_regex.py
+++ b/ner_v2/detectors/temporal/time/standard_time_regex.py
@@ -5,14 +5,12 @@
 import os
 import re
 
-import pytz
-
 from chatbot_ner.config import ner_logger
 from ner_v2.detectors.temporal.constant import (DATETIME_CONSTANT_FILE, ADD_DIFF_DATETIME_TYPE,
                                                 NUMERALS_CONSTANT_FILE, TIME_CONSTANT_FILE, REF_DATETIME_TYPE,
                                                 HOUR_TIME_TYPE, MINUTE_TIME_TYPE, DAYTIME_MERIDIEM, AM_MERIDIEM,
                                                 PM_MERIDIEM, TWELVE_HOUR)
-from ner_v2.detectors.temporal.utils import get_tuple_dict, get_hour_min_diff
+from ner_v2.detectors.temporal.utils import get_tuple_dict, get_hour_min_diff, get_timezone
 
 
 class BaseRegexTime(object):
@@ -29,12 +27,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'):
         self.processed_text = ''
         self.entity_name = entity_name
         self.tag = '__' + entity_name + '__'
-        try:
-            self.timezone = pytz.timezone(timezone)
-        except Exception as e:
-            ner_logger.debug('Timezone error: %s ' % e)
-            self.timezone = pytz.timezone('UTC')
-            ner_logger.debug('Default timezone passed as "UTC"')
+        self.timezone = get_timezone(timezone)
         self.now_date = datetime.datetime.now(tz=self.timezone)
 
         self.bot_message = None
diff --git a/ner_v2/detectors/temporal/time/time_detection.py b/ner_v2/detectors/temporal/time/time_detection.py
index 509c14407..e7df5c386 100644
--- a/ner_v2/detectors/temporal/time/time_detection.py
+++ b/ner_v2/detectors/temporal/time/time_detection.py
@@ -4,6 +4,7 @@
 
 from language_utilities.constant import ENGLISH_LANG
 from ner_v2.detectors.base_detector import BaseDetector
+from ner_v2.detectors.temporal.utils import get_timezone
 from ner_v2.detectors.utils import get_lang_data_path
 
 
@@ -60,7 +61,7 @@ def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG):
         self.time = []
         self.original_time_text = []
         self.tag = '__' + entity_name + '__'
-        self.timezone = timezone or 'UTC'
+        self.timezone = get_timezone(timezone)
         self.language = language
 
         try:
diff --git a/ner_v2/detectors/temporal/utils.py b/ner_v2/detectors/temporal/utils.py
index bb224cc18..264d5f8fb 100644
--- a/ner_v2/detectors/temporal/utils.py
+++ b/ner_v2/detectors/temporal/utils.py
@@ -1,8 +1,11 @@
 import calendar
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, tzinfo  # FIXME: Change import to `import datetime`
 
 import pandas as pd
+import pytz
+import six
+
 from chatbot_ner.config import ner_logger
 from ner_v2.detectors.temporal.constant import POSITIVE_TIME_DIFF, NEGATIVE_TIME_DIFF, CONSTANT_FILE_KEY
 
 
@@ -260,3 +263,36 @@ def get_next_date_with_dd(dd, after_datetime):
             mm, yy = get_next_month_number(mm=mm, yy=yy)
 
     return None, None, None
+
+
+def get_timezone(timezone, ignore_errors=True):
+    # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo
+    """
+    Return a datetime.tzinfo (pytz timezone object). If `timezone` is a str, try constructing a pytz
+    timezone object with it. If an invalid timezone is mentioned and `ignore_errors` is True, a UTC timezone object
+    will be returned. If `timezone` is already a datetime.tzinfo object, it will be returned as is
+
+    Args:
+        timezone (str or datetime.tzinfo): Either a valid timezone string or a datetime.tzinfo object
+        ignore_errors (bool, optional): when set to True, ignore errors and return pytz.UTC when an error occurs. When
+                                        set to False, raise an exception when an invalid timezone is given.
+                                        Defaults to True.
+
+    Returns:
+        datetime.tzinfo: A pytz timezone object
+
+    """
+    if (not isinstance(timezone, six.string_types) and
+            isinstance(timezone, tzinfo) and
+            hasattr(timezone, 'localize')):
+        return timezone
+
+    try:
+        timezone = pytz.timezone(timezone)
+    except Exception as e:
+        if ignore_errors:
+            ner_logger.debug('Timezone error: %s ' % e)
+            timezone = pytz.timezone('UTC')
+            ner_logger.debug('Using "UTC" as default timezone')
+        else:
+            raise
+    return timezone
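# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: a standalone exercise of the get_timezone() contract
# described above -- a valid zone name, an already-constructed tzinfo, and the ignore_errors fallback.
# The helper is re-declared here without the ner_logger calls so the snippet runs outside the project;
# behaviour otherwise mirrors the function in the diff.
import datetime

import pytz
import six


def get_timezone(timezone, ignore_errors=True):
    if (not isinstance(timezone, six.string_types) and
            isinstance(timezone, datetime.tzinfo) and
            hasattr(timezone, 'localize')):
        return timezone
    try:
        timezone = pytz.timezone(timezone)
    except Exception:
        if ignore_errors:
            timezone = pytz.timezone('UTC')
        else:
            raise
    return timezone


print(get_timezone('Asia/Kolkata'))               # Asia/Kolkata
print(get_timezone(pytz.timezone('US/Eastern')))  # US/Eastern, returned as-is
print(get_timezone('Not/AZone'))                  # UTC fallback when ignore_errors is True
try:
    get_timezone('Not/AZone', ignore_errors=False)
except pytz.UnknownTimeZoneError as exc:
    print('raised: %s' % exc)
# --------------------------------------------------------------------------------------------------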