diff --git a/.gitignore b/.gitignore index 5a675421f..b6be485bf 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,5 @@ ENV/ sftp-config.json .DS_Store logs/ + +.vscode diff --git a/chatbot_ner/config.py b/chatbot_ner/config.py index 6eb89cd80..f1f0e9d0a 100644 --- a/chatbot_ner/config.py +++ b/chatbot_ner/config.py @@ -6,16 +6,15 @@ from requests_aws4auth import AWS4Auth BASE_DIR = os.path.dirname(os.path.dirname(__file__)) -CONFIG_PATH = os.path.join(BASE_DIR, 'config') MODEL_CONFIG_PATH = os.path.join(BASE_DIR, 'model_config') LOG_PATH = os.path.join(BASE_DIR, 'logs') + +# TODO: Set this up via Django LOGGING # SET UP NER LOGGING if not os.path.exists(LOG_PATH): os.makedirs(LOG_PATH) -# LOGGING -# TODO - Make this much generic & simpler in the future LOG_LEVEL = os.environ.get('DJANGO_LOG_LEVEL', 'error').upper() # Common formatter @@ -48,25 +47,18 @@ nlp_logger.addHandler(handler) nlp_logger.addHandler(handler_stdout) -if os.path.exists(CONFIG_PATH): - dotenv.read_dotenv(CONFIG_PATH) -else: - ner_logger.debug('Warning: no file named "config" found at %s. This is not a problem if your ' - 'datastore(elasticsearch) connection settings are already available in the environment', - CONFIG_PATH) - -# TODO Consider prefixing everything config with NER_ because these names are in the environment and so are -# TODO lot of others too which may conflict in name. Example user is already using some another instance of -# TODO Elasticsearch for other purposes ENGINE = os.environ.get('ENGINE') if ENGINE: ENGINE = ENGINE.lower() +else: + ner_logger.warning("`ENGINE` variable is not set, Text type entities won't work without it") + # ES settings (Mandatory to use Text type entities) ES_URL = os.environ.get('ES_URL') ES_HOST = os.environ.get('ES_HOST') ES_PORT = os.environ.get('ES_PORT') ES_INDEX_NAME = os.environ.get('ES_INDEX_NAME') -ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE') +ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE', 'data_dictionary') ES_AUTH_NAME = os.environ.get('ES_AUTH_NAME') ES_AUTH_PASSWORD = os.environ.get('ES_AUTH_PASSWORD') ES_BULK_MSG_SIZE = os.environ.get('ES_BULK_MSG_SIZE', '10000') @@ -81,8 +73,8 @@ ES_BULK_MSG_SIZE = int(ES_BULK_MSG_SIZE) ES_SEARCH_SIZE = int(ES_SEARCH_SIZE) except ValueError: - ES_BULK_MSG_SIZE = 10000 - ES_SEARCH_SIZE = 10000 + ES_BULK_MSG_SIZE = 1000 + ES_SEARCH_SIZE = 1000 # Optional Vars ES_INDEX_1 = os.environ.get('ES_INDEX_1') @@ -101,10 +93,7 @@ # Crf Model Specific with additional AWS storage (optional) CRF_MODEL_S3_BUCKET_NAME = os.environ.get('CRF_MODEL_S3_BUCKET_NAME') CRF_MODEL_S3_BUCKET_REGION = os.environ.get('CRF_MODEL_S3_BUCKET_REGION') - WORD_EMBEDDING_REMOTE_URL = os.environ.get('WORD_EMBEDDING_REMOTE_URL') - - GOOGLE_TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY') if not GOOGLE_TRANSLATE_API_KEY: @@ -116,6 +105,7 @@ 'elasticsearch': { 'connection_url': ES_URL, # Elastic Search URL 'name': ES_INDEX_NAME, # Index name used + 'doc_type': ES_DOC_TYPE, # Index's doc type 'host': ES_HOST, # Elastic Search Host 'port': ES_PORT, # Port of elastic search 'user': ES_AUTH_NAME, @@ -139,31 +129,23 @@ } } -if ES_DOC_TYPE: - CHATBOT_NER_DATASTORE['elasticsearch']['doc_type'] = ES_DOC_TYPE -else: - CHATBOT_NER_DATASTORE['elasticsearch']['doc_type'] = 'data_dictionary' - -ES_AWS_SECRET_ACCESS_KEY = os.environ.get('ES_AWS_SECRET_ACCESS_KEY') -ES_AWS_ACCESS_KEY_ID = os.environ.get('ES_AWS_ACCESS_KEY_ID') -ES_AWS_REGION = os.environ.get('ES_AWS_REGION') ES_AWS_SERVICE = os.environ.get('ES_AWS_SERVICE') +ES_AWS_REGION = 
os.environ.get('ES_AWS_REGION') +ES_AWS_ACCESS_KEY_ID = os.environ.get('ES_AWS_ACCESS_KEY_ID') +ES_AWS_SECRET_ACCESS_KEY = os.environ.get('ES_AWS_SECRET_ACCESS_KEY') -if not ES_AWS_SERVICE: - ES_AWS_SERVICE = 'es' - -if ES_AWS_ACCESS_KEY_ID and ES_AWS_SECRET_ACCESS_KEY and ES_AWS_REGION and ES_AWS_SERVICE: - CHATBOT_NER_DATASTORE['elasticsearch']['http_auth'] = AWS4Auth(ES_AWS_ACCESS_KEY_ID, ES_AWS_SECRET_ACCESS_KEY, - ES_AWS_REGION, ES_AWS_SERVICE) - CHATBOT_NER_DATASTORE['elasticsearch']['use_ssl'] = True - CHATBOT_NER_DATASTORE['elasticsearch']['verify_certs'] = True - CHATBOT_NER_DATASTORE['elasticsearch']['connection_class'] = RequestsHttpConnection -elif ES_AWS_REGION and ES_AWS_SERVICE: +if ES_AWS_SERVICE and ES_AWS_REGION: + ner_logger.info('`ES_AWS_SERVICE` and `ES_AWS_REGION` are set. Using AWS Elasticsearch settings ') CHATBOT_NER_DATASTORE['elasticsearch']['use_ssl'] = True CHATBOT_NER_DATASTORE['elasticsearch']['verify_certs'] = True CHATBOT_NER_DATASTORE['elasticsearch']['connection_class'] = RequestsHttpConnection + if ES_AWS_ACCESS_KEY_ID and ES_AWS_SECRET_ACCESS_KEY: + CHATBOT_NER_DATASTORE['elasticsearch']['http_auth'] = AWS4Auth(ES_AWS_ACCESS_KEY_ID, + ES_AWS_SECRET_ACCESS_KEY, + ES_AWS_REGION, ES_AWS_SERVICE) else: - ner_logger.warning('Elasticsearch: Some or all AWS settings missing from environment, this will skip AWS auth!') + ner_logger.warning('`ES_AWS_SERVICE` and `ES_AWS_REGION` are not set. ' + 'This is not a problem if you are using self hosted ES') # Model Vars if os.path.exists(MODEL_CONFIG_PATH): diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py index 1f565f11f..de8e91654 100755 --- a/chatbot_ner/settings.py +++ b/chatbot_ner/settings.py @@ -18,10 +18,10 @@ # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = '70vigjv=h)=p8d%e80(3ue2p70e^x96#n8u+7ia9+7o02iq*6k' +SECRET_KEY = os.environ.get('SECRET_KEY') # SECURITY WARNING: don't run with debug turned on in production! -_dj_debug = os.environ.get('DJANGO_DEBUG') +_dj_debug = os.environ.get('DJANGO_DEBUG', 'false') DEBUG = (_dj_debug and _dj_debug.lower() == 'true') TEMPLATE_DEBUG = False diff --git a/config.example b/config.example index 1c315a9d6..ae5d04a04 100644 --- a/config.example +++ b/config.example @@ -1,16 +1,13 @@ # This is config.example file for chatbot_ner module similar to .env.example file to hold settings # Copy it to a file named config and fill in all the values. # Never push your personal keys and passwords to any public repository! -# Make sure the variables in this file are in the environment. Example: -# $ source chatbot_ner/config # Please don't add spaces around '=' -# This is the primary engine to use. Valid values are one of the following: -# elasticsearch +# This is the primary engine to use. Valid values are one of the following: ['elasticsearch'] ENGINE=elasticsearch -# ES prefixed values correspond to settings for elasticsearch. +# ES prefixed variables correspond to settings for elasticsearch. # ES_URL is the complete url with auth name and password required to connect. 
If provided, this will override ES_HOST,
# ES_PORT, ES_AUTH_NAME, ES_AUTH_PASSWORD
# ES_HOST by default is host for ES that comes up with compose
@@ -22,6 +19,15 @@ ES_URL=
 ES_PORT=9200
 ES_INDEX_NAME=entity_data
 ES_DOC_TYPE=data_dictionary
+# ES_BULK_MSG_SIZE is an integer value
+ES_BULK_MSG_SIZE=1000
+# ES_SEARCH_SIZE is an integer value
+ES_SEARCH_SIZE=10000
+# Provide the following values if you need AWS authentication
+ES_AWS_SERVICE=
+ES_AWS_REGION=
+ES_AWS_ACCESS_KEY_ID=
+ES_AWS_SECRET_ACCESS_KEY=
 
 NAME=chatbot_ner
 DJANGODIR=/app
@@ -31,25 +37,11 @@ DJANGO_SETTINGS_MODULE=chatbot_ner.settings
 DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py
 DJANGO_LOG_LEVEL=debug
 DJANGO_DEBUG=False
+# Important: Change the value of SECRET_KEY to something else and keep it secret
+SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c
 PORT=8081
 TIMEOUT=600
-
 CITY_MODEL_TYPE=crf
 CITY_MODEL_PATH=
-GOOGLE_TRANSLATE_API_KEY=
-
-
-# ES_BULK_MSG_SIZE is an integer value
-ES_BULK_MSG_SIZE=1000
-
-# ES_SEARCH_SIZE is an integer value
-ES_SEARCH_SIZE=10000
-
-# Provide the following values if you need AWS authentication
-ES_AWS_SECRET_ACCESS_KEY=
-ES_AWS_ACCESS_KEY_ID=
-ES_AWS_REGION=
-ES_AWS_SERVICE=
-
 # In order to enable entity detection for multiple languages, we use google translate. Please enter the key(optional)
 GOOGLE_TRANSLATE_API_KEY=
diff --git a/docker/Dockerfile b/docker/Dockerfile
index c99bf3a1d..0e21311c0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -31,9 +31,8 @@ ENV PORT=8081
 ENV TIMEOUT=600
 ENV DEBIAN_FRONTEND=noninteractive
 
-
-#ENV DATE_MODEL_TYPE=crf
-#ENV DATE_MODEL_PATH=/root/models/models_live/date/crf/model.crf
+# IMPORTANT: change this via .env (the file copied from config.example)
+ENV SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c
 
 EXPOSE 8081
diff --git a/docs/install.md b/docs/install.md
index 3c9a5f822..025f03e88 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -39,7 +39,7 @@ Following are the steps to create the Docker image and run NER with Docker.
   Docker Compose
   ```shell
-  sudo curl -L https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
+  sudo curl -L "https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
   sudo chmod +x /usr/local/bin/docker-compose
   ```
- MacOS:
@@ -56,6 +56,8 @@
cd docker
docker-compose up --build -d
```
 
+Open the `docker/.env` file and edit the environment variables if needed. (You should change `SECRET_KEY`.)
+
 The above will also mount the local repo root directory inside the container's /app directory. Please wait 5 seconds to run the first curl or do an API call to chatbot_ner.
 
 > **NOTE**: make sure that nothing is running on port 8081 on your server or your local environment.
diff --git a/ner_constants.py b/ner_constants.py
index dd05aa701..f535c14b0 100644
--- a/ner_constants.py
+++ b/ner_constants.py
@@ -57,5 +57,8 @@
 PARAMETER_MAX_DIGITS = 'max_number_digits'
 PARAMETER_NUMBER_UNIT_TYPE = 'unit_type'
 
+# Prior detection results from CRF models.
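+# Illustrative shape (an assumption based on how the API views parse this parameter,
+# not an exhaustive contract): a list of detected values for a single message,
+# e.g. ["yash doshi"], and a list of such lists (one per message) for bulk detection.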
+PARAMETER_PRIOR_RESULTS = "predetected_values" + # Locale for Date and Phone Number detection -PARAMETER_LOCALE = 'locale' \ No newline at end of file +PARAMETER_LOCALE = 'locale' diff --git a/ner_v1/api.py b/ner_v1/api.py index 316df8b81..93c4b55e8 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -10,8 +10,8 @@ from language_utilities.constant import ENGLISH_LANG from ner_constants import (PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, PARAMETER_FALLBACK_VALUE, PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_REGEX, - PARAMETER_LANGUAGE_SCRIPT, - PARAMETER_SOURCE_LANGUAGE) + PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, PARAMETER_PRIOR_RESULTS) + from ner_v1.chatbot.combine_detection_logic import combine_output_of_detection_logic_and_tag from ner_v1.chatbot.entity_detection import (get_location, get_phone_number, get_email, get_city, get_pnr, get_number, get_passenger_count, get_shopping_size, get_time, @@ -69,9 +69,10 @@ def get_parameters_dictionary(request): PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request.GET.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request.GET.get('read_model_from_s3')), - PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path') + PARAMETER_LIVE_CRF_MODEL_PATH: request.GET.get('live_crf_model_path'), + PARAMETER_PRIOR_RESULTS: json.loads(request.GET.get("predetected_values", '[]')) } - + ner_logger.info("parameters dict - {}".format(parameters_dict)) return parameters_dict @@ -103,7 +104,8 @@ def parse_post_request(request): PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), PARAMETER_READ_MODEL_FROM_S3: to_bool(request_data.get('read_model_from_s3')), - PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path') + PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path'), + PARAMETER_PRIOR_RESULTS: request_data.get("predetected_values", []) } return parameters_dict @@ -247,6 +249,7 @@ def text(request): live_crf_model_path=parameters_dict[PARAMETER_LIVE_CRF_MODEL_PATH], read_model_from_s3=parameters_dict[PARAMETER_READ_MODEL_FROM_S3], read_embeddings_from_remote_url=parameters_dict[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL], + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS] ) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -268,7 +271,8 @@ def location(request): entity_output = get_location(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME], parameters_dict[PARAMETER_STRUCTURED_VALUE], parameters_dict[PARAMETER_FALLBACK_VALUE], - parameters_dict[PARAMETER_BOT_MESSAGE]) + parameters_dict[PARAMETER_BOT_MESSAGE], + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for location: %s ' % e) @@ -361,7 +365,8 @@ def person_name(request): structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], - language=parameters_dict[PARAMETER_SOURCE_LANGUAGE]) + language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], + predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS]) ner_logger.debug('Finished %s : %s ' % 
(parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
     except TypeError as e:
         ner_logger.exception('Exception for person_name: %s ' % e)
diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py
index 4a7ff7e31..af57863ff 100644
--- a/ner_v1/chatbot/entity_detection.py
+++ b/ner_v1/chatbot/entity_detection.py
@@ -16,7 +16,7 @@
 from ner_v1.detectors.textual.city.city_detection import CityDetector
 from ner_v1.detectors.textual.name.name_detection import NameDetector
 from ner_v1.detectors.textual.text.text_detection import TextDetector
-from ner_v1.detectors.textual.text.text_detection_model import TextModelDetector
+from chatbot_ner.config import ner_logger
 import six
 
 """
@@ -91,7 +91,8 @@
 """
 
 
-def get_text(message, entity_name, structured_value, fallback_value, bot_message, language=ENGLISH_LANG, **kwargs):
+def get_text(message, entity_name, structured_value, fallback_value, bot_message, language=ENGLISH_LANG,
+             predetected_values=None, **kwargs):
     """Use TextDetector (datastore/elasticsearch) to detect textual entities
 
     Args:
@@ -229,36 +230,35 @@
     """
     fuzziness = kwargs.get('fuzziness', None)
     min_token_len_fuzziness = kwargs.get('min_token_len_fuzziness', None)
-    live_crf_model_path = kwargs.get('live_crf_model_path', None)
-    read_model_from_s3 = kwargs.get('read_model_from_s3', False)
-    read_embeddings_from_remote_url = kwargs.get('read_embeddings_from_remote_url', False)
-
-    text_model_detector = TextModelDetector(entity_name=entity_name,
-                                            language=language,
-                                            live_crf_model_path=live_crf_model_path,
-                                            read_model_from_s3=read_model_from_s3,
-                                            read_embeddings_from_remote_url=read_embeddings_from_remote_url)
+    predetected_values = predetected_values or []
+    text_detector = TextDetector(entity_name=entity_name, source_language_script=language)
 
     if fuzziness:
         fuzziness = parse_fuzziness_parameter(fuzziness)
-        text_model_detector.set_fuzziness_threshold(fuzziness)
+        text_detector.set_fuzziness_threshold(fuzziness)
 
     if min_token_len_fuzziness:
         min_token_len_fuzziness = int(min_token_len_fuzziness)
-        text_model_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness)
+        text_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness)
 
+    ner_logger.info("Predetected values: {}".format(predetected_values))
     if isinstance(message, six.string_types):
-        entity_output = text_model_detector.detect(message=message,
-                                                   structured_value=structured_value,
-                                                   fallback_value=fallback_value,
-                                                   bot_message=bot_message)
+        entity_output = text_detector.detect(message=message,
+                                             structured_value=structured_value,
+                                             fallback_value=fallback_value,
+                                             bot_message=bot_message,
+                                             predetected_values=predetected_values)
     elif isinstance(message, (list, tuple)):
-        entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value)
+        entity_output = text_detector.detect_bulk(messages=message, fallback_values=fallback_value,
+                                                  predetected_values=predetected_values)
+    else:
+        raise TypeError('`message` argument must be either of type `str`, `unicode`, `list` or `tuple`.')
 
     return entity_output
 
 
-def get_location(message, entity_name, structured_value, fallback_value, bot_message):
+def get_location(message, entity_name, structured_value, fallback_value, bot_message,
+                 predetected_values=None, **kwargs):
     """Use TextDetector (elasticsearch) to detect location
     TODO: We can improve this by creating a separate detector for location detection instead of using TextDetector
@@ -274,6 +274,7 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. + predetected_values(list of str): prior detection results from models like crf etc. Returns: @@ -294,10 +295,10 @@ def get_location(message, entity_name, structured_value, fallback_value, bot_mes >> [{'detection': 'message', 'entity_value': {'value': 'Andheri West'}, 'language': 'en', 'original_text': 'andheri west'}] """ - + predetected_values = predetected_values or [] text_detection = TextDetector(entity_name=entity_name) return text_detection.detect(message=message, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message) + bot_message=bot_message, predetected_values=predetected_values) def get_phone_number(message, entity_name, structured_value, fallback_value, bot_message): @@ -407,7 +408,7 @@ def get_email(message, entity_name, structured_value, fallback_value, bot_messag bot_message=bot_message) -def get_city(message, entity_name, structured_value, fallback_value, bot_message, language): +def get_city(message, entity_name, structured_value, fallback_value, bot_message, language, **kwargs): """Use CityDetector to detect cities Args: @@ -519,7 +520,7 @@ def get_city(message, entity_name, structured_value, fallback_value, bot_message def get_person_name(message, entity_name, structured_value, fallback_value, bot_message, - language=ENGLISH_LANG): + language=ENGLISH_LANG, predetected_values=None, **kwargs): """Use NameDetector to detect names Args: @@ -534,6 +535,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. language (str): ISO 639-1 code of language of message + predetected_values(list of str): prior detection results from models like crf etc. 
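+                e.g. ["yash doshi"], as produced by an upstream model (illustrative)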
Returns: @@ -552,6 +554,8 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ 'entity_value': {'first_name': yash, 'middle_name': None, 'last_name': doshi}}] """ # TODO refactor NameDetector to make this easy to read and use + predetected_values = predetected_values or [] + name_detection = NameDetector(entity_name=entity_name, language=language) text, detection_method, fallback_text, fallback_method = (structured_value, FROM_STRUCTURE_VALUE_VERIFIED, @@ -565,13 +569,18 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = [], [] if text: - entity_list, original_text_list = name_detection.detect_entity(text=text, bot_message=bot_message) + entity_list, original_text_list = name_detection.detect_entity( + text=text, + bot_message=bot_message, + predetected_values=predetected_values) if not entity_list and fallback_text: entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split(), fallback_text) detection_method = fallback_method if entity_list and original_text_list: + # if predetected_values: + # detection_method = "free text entity" return output_entity_dict_list(entity_list, original_text_list, detection_method) return None diff --git a/ner_v1/constant.py b/ner_v1/constant.py index 8e7cbf39b..55a707f4c 100644 --- a/ner_v1/constant.py +++ b/ner_v1/constant.py @@ -112,7 +112,7 @@ PARAMETER_FUZZINESS = 'fuzziness' PARAMETER_MIN_TOKEN_LEN_FUZZINESS = 'min_token_len_fuzziness' DATASTORE_VERIFIED = 'datastore_verified' -CRF_MODEL_VERIFIED = 'crf_model_verified' +MODEL_VERIFIED = 'model_verified' # **********************constants used for text detection************************************ diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index c7dc6b4c3..3fca68e8e 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -10,6 +10,14 @@ from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED + +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD +except ImportError: + import re + _re_flags = re.UNICODE class BaseDetector(object): @@ -47,11 +55,12 @@ def supported_languages(self): return [] @abc.abstractmethod - def detect_entity(self, text, **kwargs): + def detect_entity(self, text, predetected_values=None, **kwargs): """ This method runs the core entity detection logic defined inside entity detectors Args: text: text snippet from which entities needs to be detected + predetected_values: prior detection results **kwargs: values specific to different detectors such as 'last bot message', custom configs, etc. Return: tuple: Two lists of same length containing detected values and original substring from text which is used @@ -72,12 +81,13 @@ def _set_language_processing_script(self): raise NotImplementedError('Please enable translation or extend language support' 'for %s' % self._source_language_script) - def detect_bulk(self, messages=None, **kwargs): + def detect_bulk(self, messages=None, predetected_values=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: messages (list of strings): list of natural text(s) on which detection logic is to be run. 
+            predetected_values(list of list of str): prior detection results
         Returns:
             dict or None: dictionary containing entity_value, original_text and detection;
                 entity_value is in itself a dict with its keys varying from entity to entity
 
@@ -104,7 +114,12 @@
             messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '')
         texts = messages
-        entities_list, original_list = self.detect_entity_bulk(texts=texts)
+
+        # Prior results from entity detection using methods like CRF etc.
+        if predetected_values is None:
+            predetected_values = []
+        entities_list, original_list = self.detect_entity_bulk(
+            texts=texts, predetected_values=predetected_values)
 
         fallback_values = kwargs.get('fallback_values')
         values_list, detection_method_list, original_texts_list = [], [], []
@@ -129,7 +144,77 @@
                            detection_method_list=detection_method_list,
                            detection_language=self._target_language_script)
 
-    def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs):
+    def _add_verification_source(self, values, verification_source_dict):
+        """
+        Add the verification source for the detected entities
+        Args:
+            values (list): List of detected text type entities
+            verification_source_dict (dict): Dict consisting of the verification source and value.
+        Returns:
+            text_entity_verified_values (list): List of dicts consisting of the key and values for the keys
+                value and verification source
+        Example:
+            values = [u'Chennai', u'New Delhi', u'chennai']
+            verification_source_dict = {"datastore_verified": True}
+
+            >> add_verification_source(values, verification_source_dict)
+            [{'datastore_verified': True, 'value': u'Chennai'},
+             {'datastore_verified': True, 'value': u'New Delhi'},
+             {'datastore_verified': True, 'value': u'chennai'}]
+        """
+        text_entity_verified_values = []
+        for text_entity_value in values:
+            text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value}
+            text_entity_dict.update(verification_source_dict)
+            text_entity_verified_values.append(text_entity_dict)
+        return text_entity_verified_values
+
+    def combine_results(self, values, original_texts, predetected_values):
+        """
+        This method is used to combine the results provided by the datastore search and the
+        models like CRF, if available.
+        Args:
+            values (list): List of values detected by datastore
+            original_texts (list): List of original texts present in the texts for which values have been
+                detected
+            predetected_values (list): Entities detected by models like CRF etc.
+        Returns:
+            combined_values (list): List of dicts, each consisting of the entity value and additionally
+                the keys for the datastore and model detection
+            combined_original_texts (list): List of original texts detected by the datastore and the models.
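+        Example:
+            Illustrative run, assuming the datastore detected 'Chennai' and a model
+            additionally predicted 'bangalore':
+
+            values = [u'Chennai']
+            original_texts = [u'chennai']
+            predetected_values = [u'bangalore']
+
+            >> combine_results(values, original_texts, predetected_values)
+            ([{'value': u'Chennai', 'datastore_verified': True, 'model_verified': False},
+              {'value': u'bangalore', 'datastore_verified': False, 'model_verified': True}],
+             [u'chennai', u'bangalore'])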
+ """ + unprocessed_crf_original_texts = [] + + combined_values = self._add_verification_source( + values=values, verification_source_dict={DATASTORE_VERIFIED: True, MODEL_VERIFIED: False} + ) + combined_original_texts = original_texts + for i in range(len(predetected_values)): + match = False + for j in range(len(original_texts)): + if predetected_values[i] == original_texts[j]: + combined_values[j][MODEL_VERIFIED] = True + match = True + break + elif re.findall(r'\b%s\b' % re.escape(predetected_values[i]), original_texts[j]): + # If predetected value is a substring of some value detected by datastore, skip it from output + match = True + break + if not match: + unprocessed_crf_original_texts.append(predetected_values[i]) + + unprocessed_crf_original_texts_verified = self._add_verification_source( + values=unprocessed_crf_original_texts, + verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True} + ) + + combined_values.extend(unprocessed_crf_original_texts_verified) + combined_original_texts.extend(unprocessed_crf_original_texts) + + return combined_values, combined_original_texts + + def detect(self, message=None, structured_value=None, fallback_value=None, + predetected_values=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector Args: @@ -140,6 +225,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa (For example, UI elements like form, payload, etc) fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. + predetected_values(list of str): prior detection results from models like CRF etc. bot_message (str): previous message from a bot/agent. Returns: dict or None: dictionary containing entity_value, original_text and detection; @@ -183,6 +269,7 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] """ + if self._source_language_script != self._target_language_script and self._translation_enabled: if structured_value: translation_output = translate_text(structured_value, self._source_language_script, @@ -194,7 +281,12 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa message = translation_output[TRANSLATED_TEXT] if translation_output['status'] else None text = structured_value if structured_value else message - entity_list, original_text_list = self.detect_entity(text=text) + + # Prior results from detection. 
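+        # e.g. predetected_values=["yash doshi"] coming from an upstream model (illustrative).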
+        if predetected_values is None:
+            predetected_values = []
+        entity_list, original_text_list = self.detect_entity(text=text,
+                                                             predetected_values=predetected_values)
 
         if structured_value:
             if entity_list:
diff --git a/ner_v1/detectors/textual/name/hindi_const.py b/ner_v1/detectors/textual/name/hindi_const.py
index 7651d42ff..84886e890 100644
--- a/ner_v1/detectors/textual/name/hindi_const.py
+++ b/ner_v1/detectors/textual/name/hindi_const.py
@@ -300,7 +300,32 @@
 HINDI_QUESTIONWORDS = [u'क्या', u'कब', u'कहा', u'क्यों', u'कौन', u'कौन', u'जिसे', u'जिसका',
                        u'कैसे', u'कितने']
 
 # Variants in "name" to check for previous context flag
-NAME_VARIATIONS = ['name', u'नाम']
+NAME_VARIATIONS = ["enter your full name again",
+                   "what's your name",
+                   "mention your name ",
+                   "provide your name ",
+                   "help me with your name",
+                   "what's your full name?",
+                   "forgot to mention your name",
+                   "please help me with your full name",
+                   "please let me know your full name.",
+                   "please enter your name",
+                   "help me with your full name",
+                   "looks like you forgot to mention your name",
+                   "enter your name",
+                   "share your name",
+                   "know your name",
+                   "tell me your name",
+                   "tell your name",
+                   "what should i call you",
+                   "say your name",
+                   "call you",
+                   "address you",
+                   "your name",
+                   "your full name",
+                   "what is your name",
+                   "is your name",
+                   u"नाम", u'नेम', u'*नाम*', u'*नेम*']
 
 # Common hindi words occuring in context to a name
 COMMON_HINDI_WORDS_OCCURING_WITH_NAME = {u"मुझे",
diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py
index 9cf07704e..6f88b85d8 100644
--- a/ner_v1/detectors/textual/name/name_detection.py
+++ b/ner_v1/detectors/textual/name/name_detection.py
@@ -5,6 +5,7 @@
 from language_utilities.constant import ENGLISH_LANG, HINDI_LANG
 from lib.nlp.const import nltk_tokenizer
 from lib.nlp.pos import POS
+from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED
 from ner_v1.constant import EMOJI_RANGES, FIRST_NAME, MIDDLE_NAME, LAST_NAME
 from ner_v1.detectors.textual.name.hindi_const import (HINDI_BADWORDS, HINDI_QUESTIONWORDS,
                                                        HINDI_STOPWORDS, NAME_VARIATIONS,
@@ -27,7 +28,7 @@ class NameDetector(object):
     on calling detect_entity()
         tagged_text: string with city entities replaced with tag defined by entity_name
         processed_text: string with detected time entities removed
-        text_detection_object: the object which is used to call the TextDetector
+        bot_message: previous message from the bot/agent
     """
 
     def __init__(self, entity_name, language=ENGLISH_LANG):
@@ -46,7 +47,8 @@
         self.processed_text = ''
         self.original_name_text = []
         self.tag = '_' + entity_name + '_'
-        self.text_detection_object = TextDetector(entity_name=entity_name)
+
+        self.bot_message = None
 
     @staticmethod
     def get_format_name(name_tokens, text):
@@ -82,23 +84,16 @@
         last_name = name_tokens[-1]
         middle_name = " ".join(name_tokens[1:-1]) or None
 
-        entity_value.append({FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name})
+        entity_value.append({
+            FIRST_NAME: first_name,
+            MIDDLE_NAME: middle_name,
+            LAST_NAME: last_name,
+            DATASTORE_VERIFIED: False,
+            MODEL_VERIFIED: False,
+        })
         original_text.append(name_text)
         return entity_value, original_text
 
-    def text_detection_name(self, text=None):
-        """
-        Makes a call to TextDetection and return the person_name detected from the elastic search.
-        Returns:
-            Tuple with list of names detected in TextDetection in the form of variants detected and original_text
-
-        Example :
-            my name is yash doshi
-
-            ([u'dosh', u'yash'], ['doshi', 'yash'])
-        """
-        if text is None:
-            text = self.text
-        return self.text_detection_object.detect_entity(text=text)
 
     def get_name_using_pos_tagger(self, text):
         """
@@ -118,12 +113,14 @@
         pattern1 = re.compile(r"name\s*(is|)\s*([\w\s]+)")
         pattern2 = re.compile(r"myself\s+([\w\s]+)")
         pattern3 = re.compile(r"call\s+me\s+([\w\s]+)")
+        pattern4 = re.compile(r"i\s+am\s+([\w\s]+)")
         name_tokens = text.split()
         # Passing empty tokens to tag will cause IndexError
         tagged_names = pos_tagger_object.tag(name_tokens)
         pattern1_match = pattern1.findall(text)
         pattern2_match = pattern2.findall(text)
         pattern3_match = pattern3.findall(text)
+        pattern4_match = pattern4.findall(text)
         is_question = [word[0] for word in tagged_names if word[1].startswith('WR') or
                        word[1].startswith('WP') or word[1].startswith('CD')]
@@ -139,7 +136,10 @@
         elif pattern3_match:
             entity_value, original_text = self.get_format_name(pattern3_match[0].split(), self.text)
 
-        elif len(name_tokens) < 4:
+        elif pattern4_match:
+            entity_value, original_text = self.get_format_name(pattern4_match[0].split(), self.text)
+
+        elif len(name_tokens) < 4 and self.bot_message:
             pos_words = [word[0] for word in tagged_names if word[1].startswith('NN') or
                          word[1].startswith('JJ')]
             if pos_words:
@@ -147,7 +147,7 @@
 
         return entity_value, original_text
 
-    def detect_entity(self, text, bot_message=None):
+    def detect_entity(self, text, bot_message=None, predetected_values=None, **kwargs):
         """
         Takes text as input and returns two lists
         1.entity_value in the form of first, middle and last names
@@ -155,25 +155,38 @@
         Args:
            text(string): the original text
            bot_message(string): previous bot message
+           predetected_values(list of str): detected values from prior detection
 
        Example:
                text=my name is yash doshi
 
-        Returns:
+        Returns:
               [{first_name: "yash", middle_name: None, last_name: "doshi"}], ["yash doshi"]
        """
-        if bot_message:
-            if not self.context_check_botmessage(bot_message):
-                return [], []
        self.text = text
        self.tagged_text = self.text
+        self.bot_message = bot_message
 
        entity_value, original_text = ([], [])
-        if self.language == ENGLISH_LANG:
-            entity_value, original_text = self.detect_english_name()
-        elif self.language == HINDI_LANG:
-            entity_value, original_text = self.detect_hindi_name()
+        if not predetected_values:
+            if self.bot_message:
+                if not self.context_check_botmessage(self.bot_message):
+                    return [], []
+            if self.language == ENGLISH_LANG:
+                entity_value, original_text = self.detect_english_name()
+            elif self.language == HINDI_LANG:
+                entity_value, original_text = self.detect_hindi_name()
+
+            for entity_value_dict in entity_value:
+                entity_value_dict.update({DATASTORE_VERIFIED: True, MODEL_VERIFIED: False})
+
+        else:
+            replaced_text = self.replace_predetected_text(predetected_values, text=text)
+            entity_value, original_text = self.detect_person_name_entity(replaced_text)
+
+            for entity_value_dict in entity_value:
+                entity_value_dict.update({DATASTORE_VERIFIED: False, MODEL_VERIFIED: True})
 
        self._update_processed_text(person_name_list=original_text)
 
@@ -196,11 +209,6 @@ def detect_english_name(self, text=None):
         if text is None:
             text = self.text
         entity_value, original_text = self.get_name_using_pos_tagger(text)
-        if not entity_value:
-            text_detection_result = self.text_detection_name(text)
-            replaced_text = self.replace_detected_text(text_detection_result, text=text)
-            entity_value, original_text = self.detect_person_name_entity(replaced_text)
-
         return entity_value, original_text
 
     def detect_hindi_name(self):
@@ -243,6 +251,64 @@
 
         return entity_value, original_text
 
+    def replace_predetected_text(self, predetected_values, text):
+        """
+        Replace detected names in the text, similar to replace_detected_text.
+        A separate method is needed for replacing predetected_values because these results
+        are not at the token level. For example -
+            text = "my name is yash doshi"
+            predetected_values = ["yash doshi"]
+            while, text_detection_original_texts = ["yash", "doshi"]
+
+
+        Args:
+            predetected_values(list): list containing predetected_values
+            text(str): original text to run detection on
+
+        Returns:
+            replaced_text_tokens(list): tokens with the detected names marked with surrounding underscores
+
+        Example:
+            >> text = "my name is yash doshi"
+            >> predetected_values = ["yash doshi"]
+            >> replace_predetected_text(predetected_values, text)
+            ['my', 'name', 'is', '_yash_', '_doshi_']
+
+        """
+        if self.language == ENGLISH_LANG:
+            replaced_original_text_tokens = nltk_tokenizer.tokenize(text.lower())
+            replaced_text_tokens = []
+            for index, token in enumerate(replaced_original_text_tokens):
+                # Fix to handle tokenizer error for tokens with a trailing `.`. For eg.
+                # >> text = "my name is v.k. singh"
+                # >> tokens = tokenize(text)
+                # >> tokens
+                # ["my", "name", "is", "v.k", ".", "singh"]
+                # this extra `.` token causes problems while training.
+                if token == "." and 0 < index < len(replaced_original_text_tokens) - 1 \
+                        and replaced_text_tokens[-1] + "." in text.lower():
+                    replaced_text_tokens[-1] = replaced_text_tokens[-1] + "."
+                else:
+                    # Fix to handle examples like `miami,21st street`
+                    # where the tokenizer gives ["miami,", "21st", "street"].
+                    # This causes problems while tagging entities according to indices.
+                    # For eg. if miami is an entity and its indices are (0, 5), then due to this extra `,`
+                    # tagging will be a problem because the token length will now be 6, not 5.
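+                    # e.g. (illustrative) the token "miami," becomes "miami" after stripping
+                    # the trailing punctuation below.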
+ _token = token.strip('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~') + if not _token: + _token = token + replaced_text_tokens.append(_token) + else: + replaced_text_tokens = text.lower().strip().split() + + for name in predetected_values: + name_tokens = name.split() + for token in name_tokens: + for j in range(len(replaced_text_tokens)): + replaced_text_tokens[j] = replaced_text_tokens[j].replace(token, "_" + token + "_") + + return replaced_text_tokens + def replace_detected_text(self, text_detection_result, text): """ Replaces the detected name from text_detection_result by __ @@ -256,20 +322,21 @@ def replace_detected_text(self, text_detection_result, text): Example: text_detection_result= ([u'dosh', u'yash'], ['doshi', 'yash']) Returns: - ['my', 'name', 'is', 'yash', 'doshi'] + ['my', 'name', 'is', '_yash_', '_doshi_'] """ - replaced_text = [] + replaced_text_tokens = [] if self.language == ENGLISH_LANG: - replaced_text = nltk_tokenizer.tokenize(text.lower()) + replaced_text_tokens = nltk_tokenizer.tokenize(text.lower()) elif self.language == HINDI_LANG: - replaced_text = text.lower().strip().split() + replaced_text_tokens = text.lower().strip().split() for detected_original_text in (text_detection_result[1]): - for j in range(len(replaced_text)): - replaced_text[j] = replaced_text[j].replace(detected_original_text, "_" + detected_original_text + "_") + for j in range(len(replaced_text_tokens)): + replaced_text_tokens[j] = replaced_text_tokens[j].replace( + detected_original_text, "_" + detected_original_text + "_") - return replaced_text + return replaced_text_tokens def detect_person_name_entity(self, replaced_text): """ diff --git a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv index 94eaa35ee..ff0fe28cb 100644 --- a/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv +++ b/ner_v1/detectors/textual/name/tests/test_cases_person_name.csv @@ -1,14 +1,14 @@ -language,message,first_name,middle_name,last_name,original_entities -en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao -en,my name is pratik,pratik,None,None,pratik -en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,call me pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao -en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao -en,Pratik,Pratik,None,None,Pratik -hi,मेरा नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ -hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ -hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ -hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao +language,message,first_name,middle_name,last_name,original_entities,mocked_values +en,my name is pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,my name is pratik jayarao,pratik,None,jayarao,pratik jayarao,"[[""jayarao"", ""pratik""], [""jayarao"", ""pratik""]]" +en,my name is pratik,pratik,None,None,pratik,"[[""pratik""], [""pratik""]]" +en,myself pratik sridatt jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,call me pratik sridatt 
jayarao,pratik,sridatt,jayarao,pratik sridatt jayarao,"[[""jayarao"", ""pratik"", ""shridatt""], [""jayarao"", ""pratik"", ""sridatt""]]" +en,Pratik Jayarao,Pratik,None,Jayarao,Pratik Jayarao,"[[""jayarao"", ""pratik""], [""jayarao"", ""pratik""]]" +en,Pratik,Pratik,None,None,Pratik,"[[""pratik""], [""pratik""]]" +hi,मेरा नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,नाम प्रतिक श्रीदत्त जयराओ है,प्रतिक,श्रीदत्त,जयराओ,प्रतिक श्रीदत्त जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,नाम प्रतिक जयराओ है,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,मुझे प्रतिक जयराओ बुलाते है,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,प्रतिक जयराओ,प्रतिक,None,जयराओ,प्रतिक जयराओ,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" +hi,मेरा नाम pratik jayarao है,pratik,None,jayarao,pratik jayarao,"[[""प्रतिक"", ""श्रीदत्त"", ""जयराओ""], [""प्रतिक"", ""श्रीदत्त"", ""जयराओ""]]" diff --git a/ner_v1/detectors/textual/name/tests/test_name_detection.py b/ner_v1/detectors/textual/name/tests/test_name_detection.py index 4a93d7c78..5ed1a0794 100644 --- a/ner_v1/detectors/textual/name/tests/test_name_detection.py +++ b/ner_v1/detectors/textual/name/tests/test_name_detection.py @@ -5,6 +5,7 @@ import pandas as pd from django.test import TestCase +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from ner_v1.detectors.textual.name.name_detection import NameDetector @@ -20,14 +21,16 @@ def preprocess_data(self): 'language': [], 'message': [], 'expected_value': [], + 'mocked_values': [], } - for (language, message, first_name, middle_name, last_name, original_entity) in zip( + for (language, message, first_name, middle_name, last_name, original_entity, mocked_values) in zip( self.data['language'], self.data['message'], self.data['first_name'], self.data['middle_name'], self.data['last_name'], - self.data['original_entities']): + self.data['original_entities'], + self.data['mocked_values']): fn = [] mn = [] ln = [] @@ -48,6 +51,7 @@ def preprocess_data(self): test_dict['language'].append(language) test_dict['message'].append(message) test_dict['expected_value'].append(temp) + test_dict['mocked_values'].append(mocked_values) return test_dict @@ -56,7 +60,11 @@ def test_person_name_detection(self): message = self.test_dict['message'][i] expected_value = self.test_dict['expected_value'][i] name_detector = NameDetector(language=self.test_dict['language'][i], entity_name='person_name') - detected_texts, original_texts = name_detector.detect_entity(text=message) + detected_texts, original_texts = name_detector.detect_entity(text=message, + bot_message='what is your name') + for d in detected_texts: + d.pop(MODEL_VERIFIED) + d.pop(DATASTORE_VERIFIED) zipped = zip(detected_texts, original_texts) self.assertEqual(expected_value, zipped) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 4ece23730..887471549 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,6 +1,7 @@ import collections import string +import six from six import iteritems import language_utilities.constant as lang_constant @@ -12,11 +13,12 @@ try: import regex as re + _re_flags = re.UNICODE | re.V1 | 
re.WORD
 except ImportError:
     import re
+    _re_flags = re.UNICODE
 
 
@@ -277,7 +279,7 @@
         return u' '.join(matched_tokens)
 
-    def detect_entity_bulk(self, texts, **kwargs):
+    def detect_entity_bulk(self, texts, predetected_values=None, **kwargs):
         """
         Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and
         returns two lists of list of detected text entities and their corresponding original substrings
@@ -287,21 +289,24 @@
         is returned. For more information on how data is stored, see Datastore docs.
         Args:
-            texts (list): list of strings(bulk detect) to extract textual entities from
+            texts (list): list of str to extract textual entities from
+            predetected_values (list of list of str): results from prior detection.
             **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context.
         Returns:
             tuple:
-                list or list of lists(bulk detect): containing entity value as defined into datastore
-                list or list of lists(bulk detect): containing corresponding original substrings in text
+                list or list of dicts: ith item is a list of values output as a dictionary with structure
+                    {'value': <entity value>, ...} which were detected as entity values in texts[i]
+                list or list of str: ith item contains corresponding original substrings in texts[i] that were
+                    detected as entity values
         Example:
             DataStore().get_entity_dictionary('city')
             Output:
                 {
-                    u'Agartala': [u'', u'Agartala'],
-                    u'Barnala': [u'', u'Barnala'],
+                    u'Agartala': [u'Agartala'],
+                    u'Barnala': [u'Barnala'],
                     ...
-                    u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'],
+                    u'chennai': [u'chennai', u'tamilnadu', u'madras'],
                     u'hyderabad': [u'hyderabad'],
                     u'koramangala': [u'koramangala']
                 }
@@ -309,14 +314,20 @@
             list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year',
                                  'I live in Delhi']
 
-            text_detection.detect_entity(list_of_sentences)
+            text_detection.detect_entity_bulk(list_of_sentences)
             Output:
                 (   [
-                        [u'Chennai', u'New Delhi', u'chennai'],
-                        [u'New Delhi']
+                        [
+                            {'value': u'chennai'},
+                            {'value': u'tamilnadu'},
+                            {'value': u'new delhi'},
+                        ],
+                        [
+                            {'value': u'new delhi'},
+                        ]
                     ],
                     [
-                        ['chennai', 'delhi', 'tamilnadu'],
+                        ['chennai', 'tamilnadu', 'delhi'],
                         ['delhi']
                     ]
                 )
@@ -329,51 +340,75 @@
             ]
         """
+        # For bulk detection predetected_values will be a list of list of str
+        predetected_values = predetected_values or []
         self._process_text(texts)
         text_entity_values_list, original_texts_list = self._text_detection_with_variants()
-        return text_entity_values_list, original_texts_list
 
-    def detect_entity(self, text, **kwargs):
+        # Iterate over text_entity_values_list, original_texts_list and if predetected_values has any entry
+        # for that index use combine_results to merge the results from predetected_values and dictionary detection.
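+        # Illustrative: for texts=['I live in chennai'] and predetected_values=[['chennai']],
+        # the model result for index 0 is merged with the datastore result for index 0.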
+        combined_entity_values, combined_original_texts = [], []
+        zipped_iter = six.moves.zip_longest(text_entity_values_list, original_texts_list, predetected_values)
+        for values, original_texts, inner_predetected_values in zipped_iter:
+            inner_combined_entity_values, inner_combined_original_texts = self.combine_results(
+                values=values,
+                original_texts=original_texts,
+                predetected_values=inner_predetected_values if inner_predetected_values else [])
+            combined_entity_values.append(inner_combined_entity_values)
+            combined_original_texts.append(inner_combined_original_texts)
+
+        return combined_entity_values, combined_original_texts
+
+    def detect_entity(self, text, predetected_values=None, **kwargs):
         """
         Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and
         returns two lists of detected text entities and their corresponding original substrings in text respectively.
         Note that datastore stores number of values under a entity_name and each entity_value has its own list of
-        variants, whenever a variant is matched sucessfully, the entity_value whose list the variant belongs to,
+        variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to,
         is returned. For more information on how data is stored, see Datastore docs.
         Args:
-            text (unicode): string to extract textual entities from
+            text (str): string to extract textual entities from
+            predetected_values (list of str): prior detection results
             **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context.
         Returns:
             tuple:
-                list: containing entity value as defined into datastore
-                list: containing corresponding original substrings in text
+                list: list of dicts with the detected value against 'value'
+                list: list of str containing corresponding original substrings in text
         Example:
             DataStore().get_entity_dictionary('city')
             Output:
                 {
-                    u'Agartala': [u'', u'Agartala'],
-                    u'Barnala': [u'', u'Barnala'],
+                    u'Agartala': [u'Agartala'],
+                    u'Barnala': [u'Barnala'],
                     ...
-                    u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'],
+                    u'chennai': [u'chennai', u'tamilnadu', u'madras'],
                     u'hyderabad': [u'hyderabad'],
                     u'koramangala': [u'koramangala']
                 }
             text_detection = TextDetector('city')
             text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year')
             Output:
-                ([u'Chennai', u'New Delhi', u'chennai'], ['chennai', 'delhi', 'tamilnadu'])
+                ([{'value': u'chennai'}, {'value': u'tamilnadu'}, {'value': u'new delhi'}],
+                 ['chennai', 'tamilnadu', 'delhi'])
 
             text_detection.tagged_text
             Output:
                 ' come to __city__, __city__, i will visit __city__ next year '
         """
+        values, texts = [], []
+        predetected_values = predetected_values or []
+
         self._process_text([text])
         text_entity_values, original_texts = self._text_detection_with_variants()
-        if len(text_entity_values) > 0 and len(original_texts) > 0:
+        if text_entity_values and original_texts:
             self.tagged_text = self.__tagged_texts[0]
             self.processed_text = self.__processed_texts[0]
-            return text_entity_values[0], original_texts[0]
-        return [], []
+            values, texts = text_entity_values[0], original_texts[0]
+
+        values, texts = self.combine_results(values=values, original_texts=texts,
+                                             predetected_values=predetected_values)
+
+        return values, texts
 
     def _text_detection_with_variants(self):
         """
diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py
deleted file mode 100644
index 0356c4295..000000000
--- a/ner_v1/detectors/textual/text/text_detection_model.py
+++ /dev/null
@@ -1,240 +0,0 @@
-import re
-
-from language_utilities.constant import ENGLISH_LANG
-from models.crf_v2.crf_detect_entity import CrfDetection
-from ner_constants import ENTITY_VALUE_DICT_KEY
-from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED
-from ner_v1.detectors.textual.text.text_detection import TextDetector
-import six
-
-
-class TextModelDetector(TextDetector):
-    """
-    This class is inherited from the TextDetector class.
-    This class is primarily used to detect text type entities using the datastore as well as the the CRF
-    model if trained.
-    """
-
-    def __init__(self,
-                 entity_name,
-                 language=ENGLISH_LANG,
-                 live_crf_model_path=None,
-                 read_embeddings_from_remote_url=False,
-                 read_model_from_s3=False,
-                 **kwargs):
-        """
-        Args:
-            entity_name (str): name of the entity. Same as the entity name under which data is indexed in DataStore
-            language (str): ISO 639 code for the language to detect entities in
-            live_crf_model_path (str): path to the crf model, either on disk or s3
-            read_embeddings_from_remote_url (bool, optional): if True, read embeddings from remote url configured in
-                chatbot_ner.config. Defaults to False
-            read_model_from_s3 (bool, optional): if True, use live_crf_model_path to read model from s3 instead
-                of local disk. Defaults to False
-        """
-        super(TextModelDetector, self).__init__(entity_name=entity_name,
-                                                source_language_script=language,
-                                                translation_enabled=False)
-        self.read_model_from_s3 = read_model_from_s3
-        self.read_embeddings_from_remote_url = read_embeddings_from_remote_url
-        self.live_crf_model_path = live_crf_model_path
-
-    def detect_entity(self, text, **kwargs):
-        """
-        Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and
-        returns two lists of detected text entities and their corresponding original substrings in text respectively.
-        The first list being a list of dicts with the verification source and the values.
- Note that datastore stores number of values under a entity_name and each entity_value has its own list of - variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to, - is returned. For more information on how data is stored, see Datastore docs. - In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. - Args: - text (str or unicode): string to extract textual entities from - **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. - Returns: - tuple: - list: containing list of dicts with the source of detection for the entity value and - entity value as defined into datastore - list: containing corresponding original substrings in text - Example: - DataStore().get_entity_dictionary('city') - Output: - { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], - ... - u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], - u'hyderabad': [u'hyderabad'], - u'koramangala': [u'koramangala'] - } - text_detection = TextModelDetector('city') - text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year') - Output: - ([{'datastore_verified': True,'crf_model_verified': True, 'value': u'Chennai'}, - {'datastore_verified': True,'crf_model_verified': False, 'value': u'New Delhi'}, - {'datastore_verified': False,'crf_model_verified': True, 'value': u'chennai'}] - , ['chennai', 'delhi', 'tamilnadu']) - text_detection.tagged_text - Output: - ' come to __city__, __city__, i will visit __city__ next year ' - Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes - respectively. - """ - crf_original_texts = [] - if self.live_crf_model_path: - crf_model = CrfDetection(entity_name=self.entity_name, - read_model_from_s3=self.read_model_from_s3, - read_embeddings_from_remote_url=self.read_embeddings_from_remote_url, - live_crf_model_path=self.live_crf_model_path) - - crf_original_texts = crf_model.detect_entity(text=text) - - values, original_texts = super(TextModelDetector, self).detect_entity(text, **kwargs) - - text_entity_verified_values, original_texts = self.combine_results(values=values, - original_texts=original_texts, - crf_original_texts=crf_original_texts) - self.text_entity_values, self.original_texts = text_entity_verified_values, original_texts - - return self.text_entity_values, self.original_texts - - def detect_entity_bulk(self, texts, **kwargs): - """ - Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and - returns two lists of list of detected text entities and their corresponding original substrings - for each sentence in text respectively. - The first list being a list of list of dicts with the verification source and the values. - Note that datastore stores number of values under a entity_name and each entity_value has its own list of - variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to, - is returned. For more information on how data is stored, see Datastore docs. - In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity. - - Args: - texts (list of strings): natural language sentence(s) to extract entities from - **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. 
- - Returns: - tuple: - list of lists(bulk detect): containing list of dicts with the source of detection - for the entity value and entity value as defined into datastore - - list of lists(bulk detect): containing corresponding original substrings in text - - Example: - DataStore().get_entity_dictionary('city') - - Output: - { - u'Agartala': [u'', u'Agartala'], - u'Barnala': [u'', u'Barnala'], - ... - u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], - u'hyderabad': [u'hyderabad'], - u'koramangala': [u'koramangala'] - } - text_detection = TextDetector('city') - list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year', - 'I live in Delhi] - - text_detection.detect_entity(list_of_sentences) - Output: - ( [ - [u'Chennai', u'New Delhi', u'chennai'], - [u'New Delhi'] - ], - [ - ['chennai', 'delhi', 'tamilnadu'], - [delhi] - ] - ) - - text_detection.tagged_text - Output: - [ - ' come to __city__, __city__, i will visit __city__ next year ', - ' i live in __city__ ' - ] - - Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes - respectively. - """ - - crf_original_texts = [] - - values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk(texts, **kwargs) - text_entity_values_list, original_texts_detected_list = [], [] - for inner_values, inner_original_texts in six.moves.zip(values_list, original_texts_list): - text_entity_verified_values, original_texts = \ - self.combine_results(values=inner_values, original_texts=inner_original_texts, - crf_original_texts=crf_original_texts) - text_entity_values_list.append(text_entity_verified_values) - original_texts_detected_list.append(original_texts) - return text_entity_values_list, original_texts_detected_list - - def _add_verification_source(self, values, verification_source_dict): - """ - Add the verification source for the detected entities - Args: - values (list): List of detected text type entities - verification_source_dict (dict): Dict consisting of the verification source and value. - Returns: - text_entity_verified_values (list): List of dicts consisting of the key and values for the keys - value and verification source - Example: - values = [u'Chennai', u'New Delhi', u'chennai'] - verification_source_dict = {"datastore_verified": True} - - >> add_verification_source(values, verification_source_dict) - [{'datastore_verified': True, 'value': u'Chennai'}, - {'datastore_verified': True, 'value': u'New Delhi'}, - {'datastore_verified': True, 'value': u'chennai'}] - """ - text_entity_verified_values = [] - for text_entity_value in values: - text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value} - text_entity_dict.update(verification_source_dict) - text_entity_verified_values.append(text_entity_dict) - return text_entity_verified_values - - def combine_results(self, values, original_texts, crf_original_texts): - """ - This method is used to combine the results provided by the datastore search and the - crf_model if trained. - Args: - values (list): List of values detected by datastore - original_texts (list): List of original texts present in the texts for which value shave been - detected - crf_original_texts (list): Entities detected by the Crf Model - Returns: - combined_values (list): List of dicts each dict consisting of the entity value and additionally - the keys for the datastore and crf model detection - combined_original_texts (list): List of original texts detected by the datastore and the crf model. 
- """ - unprocessed_crf_original_texts = [] - - combined_values = self._add_verification_source(values=values, - verification_source_dict={ - DATASTORE_VERIFIED: True, - CRF_MODEL_VERIFIED: False - }) - combined_original_texts = original_texts - for i in range(len(crf_original_texts)): - match = False - for j in range(len(original_texts)): - if crf_original_texts[i] == original_texts[j]: - combined_values[j][CRF_MODEL_VERIFIED] = True - match = True - elif re.findall(r'\b%s\b' % crf_original_texts[i], original_texts[j]): - match = True - if not match: - unprocessed_crf_original_texts.append(crf_original_texts[i]) - - unprocessed_crf_original_texts_verified = self._add_verification_source(values=unprocessed_crf_original_texts, - verification_source_dict= - {DATASTORE_VERIFIED: False, - CRF_MODEL_VERIFIED: True} - ) - combined_values.extend(unprocessed_crf_original_texts_verified) - combined_original_texts.extend(unprocessed_crf_original_texts) - - return combined_values, combined_original_texts diff --git a/ner_v2/api.py b/ner_v2/api.py index c7d0975c1..69f12ac6b 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -132,8 +132,9 @@ def date(request): ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' - date_past_reference = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, "false") - past_date_referenced = date_past_reference == 'true' or date_past_reference == 'True' + past_date_referenced = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, False) + past_date_referenced = True if (past_date_referenced == 'true' or past_date_referenced == 'True') else False + date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index d8c92f105..92bcca72b 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -59,7 +59,7 @@ def get_supported_languages(): return supported_languages def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timezone='UTC', - past_date_referenced=False): + past_date_referenced=False, bot_message=None): """ Initializes the DateDetector object with given entity_name and pytz timezone object @@ -87,6 +87,8 @@ def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timez past_date_referenced=past_date_referenced, locale=locale) self.bot_message = None + if bot_message: + self.set_bot_message(bot_message) @property def supported_languages(self): @@ -132,6 +134,8 @@ def detect_entity(self, text, run_model=False, **kwargs): Additionally this function assigns these lists to self.date and self.original_date_text attributes respectively. + :param text: text + :param run_model: run_model """ self.text = ' ' + text.lower() + ' ' self.processed_text = self.text @@ -658,7 +662,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa (For example, UI elements like form, payload, etc) fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. - bot_message (str): previous message from a bot/agent. 
diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py
index d8c92f105..92bcca72b 100644
--- a/ner_v2/detectors/temporal/date/date_detection.py
+++ b/ner_v2/detectors/temporal/date/date_detection.py
@@ -59,7 +59,7 @@ def get_supported_languages():
 return supported_languages
 def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timezone='UTC',
- past_date_referenced=False):
+ past_date_referenced=False, bot_message=None):
 """
 Initializes the DateDetector object with given entity_name and pytz timezone object
@@ -87,6 +87,8 @@ def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timez
 past_date_referenced=past_date_referenced, locale=locale)
 self.bot_message = None
+ if bot_message:
+ self.set_bot_message(bot_message)
@property
def supported_languages(self):
@@ -132,6 +134,8 @@ def detect_entity(self, text, run_model=False, **kwargs):
 Additionally this function assigns these lists to self.date and self.original_date_text attributes
 respectively.
+ :param text: text to detect dates in
+ :param run_model: whether to also run the model based detector
 """
 self.text = ' ' + text.lower() + ' '
 self.processed_text = self.text
@@ -658,7 +662,6 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa
 (For example, UI elements like form, payload, etc)
 fallback_value (str): If the detection logic fails to detect any value either from structured_value
 or message then we return a fallback_value as an output.
- bot_message (str): previous message from a bot/agent.
 Returns:
 dict or None: dictionary containing entity_value, original_text and detection;
@@ -670,9 +673,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa
 message = 'i want to order chinese from mainland china and pizza from domminos'
 structured_value = None
 fallback_value = None
- bot_message = None
 output = detect(message=message, structured_value=structured_value,
- fallback_value=fallback_value, bot_message=bot_message)
+ fallback_value=fallback_value)
 print output
 >> [{'detection': 'message', 'original_text': 'mainland china', 'entity_value':
@@ -685,9 +687,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa
 entity_name = 'movie'
 structured_value = 'inferno'
 fallback_value = None
- bot_message = None
 output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
- fallback_value=fallback_value, bot_message=bot_message)
+ fallback_value=fallback_value)
 print output
 >> [{'detection': 'structure_value_verified', 'original_text': 'inferno', 'entity_value':
@@ -698,9 +699,8 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa
 entity_name = 'movie'
 structured_value = 'delhi'
 fallback_value = None
- bot_message = None
 output = get_text(message=message, entity_name=entity_name, structured_value=structured_value,
- fallback_value=fallback_value, bot_message=bot_message)
+ fallback_value=fallback_value)
 print output
 >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}]
@@ -825,7 +825,7 @@ def detect_entity(self, text, **kwargs):
 Additionally this function assigns these lists to self.date and self.original_date_text attributes
 respectively.
-
+ :param text: text to detect dates in
 """
 self.text = ' ' + text.strip().lower() + ' '
@@ -855,6 +855,7 @@ def set_bot_message(self, bot_message):
 bot_message: is the previous message that is sent by the bot
 """
 self.bot_message = bot_message
+ self.language_date_detector.set_bot_message(bot_message)
 def to_datetime_object(self, base_date_value_dict):
 """
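The `date_detection.py` changes above thread `bot_message` through to the language-specific detector: it can now be passed at construction time, and `set_bot_message` delegates to `language_date_detector` so the inner detector can inspect it (for example, in `normalize_year`). A simplified sketch of that delegation (class names are stand-ins for the real detectors):

```python
# Minimal sketch of the delegation this patch adds; not the real classes.
class LanguageDateDetector(object):
    def __init__(self):
        self.bot_message = None

    def set_bot_message(self, bot_message):
        self.bot_message = bot_message


class DateDetector(object):
    def __init__(self, bot_message=None):
        self.language_date_detector = LanguageDateDetector()
        self.bot_message = None
        if bot_message:
            self.set_bot_message(bot_message)

    def set_bot_message(self, bot_message):
        self.bot_message = bot_message
        # Without this call-through, the inner detector never saw the message.
        self.language_date_detector.set_bot_message(bot_message)


d = DateDetector(bot_message='When were you born?')
assert d.language_date_detector.bot_message == 'When were you born?'
```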
diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py
index 5c1423fac..d9d0ec781 100644
--- a/ner_v2/detectors/temporal/date/en/date_detection.py
+++ b/ner_v2/detectors/temporal/date/en/date_detection.py
@@ -99,6 +99,7 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference
 self.bot_message = None
 self.locale = locale
 self.country_code = None
+ self.past_date_referenced = past_date_referenced
 self.default_detector_preferences = [self._gregorian_day_month_year_format,
 self._gregorian_month_day_year_format,
 self._gregorian_year_month_day_format,
@@ -314,7 +315,7 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None):
 yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year
 try:
 # to catch dates which are not possible like "31/11" (november 31st)
- if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\
+ if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \
 < self.now_date:
 yy += 1
 except:
@@ -371,7 +372,7 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None):
 yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year
 try:
 # to catch dates which are not possible like "11/31" (november 31st)
- if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\
+ if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \
 < self.now_date:
 yy += 1
 except:
@@ -1998,13 +1999,13 @@ def normalize_year(self, year):
 future_regex = None
 this_century = int(str(self.now_date.year)[:2])
 if len(year) == 2:
- if self.bot_message:
- if past_regex and past_regex.search(self.bot_message):
- return str(this_century - 1) + year
- elif present_regex and present_regex.search(self.bot_message):
- return str(this_century) + year
- elif future_regex and future_regex.search(self.bot_message):
- return str(this_century + 1) + year
+ if (((self.bot_message and past_regex.search(self.bot_message))
+ or (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))):
+ return str(this_century - 1) + year
+ elif present_regex and present_regex.search(self.bot_message):
+ return str(this_century) + year
+ elif future_regex and future_regex.search(self.bot_message):
+ return str(this_century + 1) + year
 # if patterns didn't match or no bot message set, fallback to current century
 if len(year) == 2:
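The rewritten `normalize_year` above resolves a two-digit year to the previous century when either the bot message hints at a past date or `past_date_referenced` is set, and only when the two-digit year would otherwise land in the future. A standalone sketch of that rule (regex checks elided; `now_year` stands in for `self.now_date.year`):

```python
# Reduction of the new normalize_year branch for two-digit years.
def normalize_year(year, now_year=2020, past_date_referenced=False):
    this_century = int(str(now_year)[:2])          # e.g. 20 for 2020
    if len(year) == 2:
        # '66' > '20' and a past date is referenced -> previous century (19xx)
        if past_date_referenced and int(year) > int(str(now_year)[2:]):
            return str(this_century - 1) + year
        return str(this_century) + year            # fallback: current century
    return year

assert normalize_year('66', past_date_referenced=True) == '1966'
assert normalize_year('19', past_date_referenced=True) == '2019'
assert normalize_year('66') == '2066'
```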
diff --git a/ner_v2/detectors/temporal/date/hi/date_detection.py b/ner_v2/detectors/temporal/date/hi/date_detection.py
index 289259a74..9bfbd7b0d 100644
--- a/ner_v2/detectors/temporal/date/hi/date_detection.py
+++ b/ner_v2/detectors/temporal/date/hi/date_detection.py
@@ -16,6 +16,7 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference
 timezone=timezone, past_date_referenced=past_date_referenced)
 self.detector_preferences = [
+ self._gregorian_day_month_year_format,
 self._detect_relative_date,
 self._detect_date_month,
 self._detect_date_ref_month_1,
@@ -27,7 +28,7 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference
 self._detect_weekday_ref_month_2,
 self._detect_weekday_diff,
 self._detect_weekday,
- self.custom_christmas_date_detector,
+ self.custom_christmas_date_detector
 ]
 def custom_christmas_date_detector(self, date_list=None, original_list=None):
diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py
index 5fb5815eb..44adbd7c6 100644
--- a/ner_v2/detectors/temporal/date/standard_date_regex.py
+++ b/ner_v2/detectors/temporal/date/standard_date_regex.py
@@ -2,7 +2,6 @@
 import datetime
 import re
-
 from dateutil.relativedelta import relativedelta
 from ner_v2.detectors.temporal.constant import (DATE_CONSTANT_FILE,
 DATETIME_CONSTANT_FILE,
@@ -36,7 +35,7 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC'
 self.now_date = datetime.datetime.now(tz=self.timezone)
 self.bot_message = None
- self.is_past_referenced = past_date_referenced
+ self.past_date_referenced = past_date_referenced
 # dict to store words for date, numerals and words which come in reference to some date
 self.date_constant_dict = {}
@@ -60,7 +59,8 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC'
 self.init_regex_and_parser(data_directory_path)
 # Variable to define default order in which these regex will work
- self.detector_preferences = [self._detect_relative_date,
+ self.detector_preferences = [self._gregorian_day_month_year_format,
+ self._detect_relative_date,
 self._detect_date_month,
 self._detect_date_ref_month_1,
 self._detect_date_ref_month_2,
@@ -197,7 +197,7 @@ def _detect_relative_date(self, date_list=None, original_list=None):
 date_rel_match = self.regex_relative_date.findall(self.processed_text)
 for date_match in date_rel_match:
 original = date_match[0]
- if not self.is_past_referenced:
+ if not self.past_date_referenced:
 req_date = self.now_date + datetime.timedelta(days=self.date_constant_dict[date_match[1]][0])
 else:
 req_date = self.now_date - datetime.timedelta(days=self.date_constant_dict[date_match[1]][0])
@@ -241,7 +241,7 @@ def _detect_date_month(self, date_list, original_list):
 yymmdd = str(self.now_date.year + 1) + mmdd
 yy = self.now_date.year + 1
- if self.is_past_referenced:
+ if self.past_date_referenced:
 if int(today_yymmdd) < int(yymmdd):
 yy -= 1
 date = {
@@ -344,11 +344,11 @@ def _detect_date_ref_month_3(self, date_list, original_list):
 for date_match in date_ref_month_match:
 original = date_match[0]
 dd = self._get_int_from_numeral(date_match[1])
- if (self.now_date.day > dd and self.is_past_referenced) or \
- (self.now_date.day <= dd and not self.is_past_referenced):
+ if (self.now_date.day > dd and self.past_date_referenced) or \
+ (self.now_date.day <= dd and not self.past_date_referenced):
 mm = self.now_date.month
 yy = self.now_date.year
- elif self.now_date.day <= dd and self.is_past_referenced:
+ elif self.now_date.day <= dd and self.past_date_referenced:
 req_date = self.now_date - relativedelta(months=1)
 mm = req_date.month
 yy = req_date.year
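The renamed `past_date_referenced` branches above all implement the same idea: a date given without a year is resolved to its nearest occurrence in the requested direction (past or future). An illustrative reduction of that rule (names and structure simplified; the real code compares yymmdd strings rather than date objects):

```python
import datetime

# Sketch of the year-resolution rule behind _detect_date_month above.
def resolve_year(day, month, today, past_date_referenced=False):
    candidate = datetime.date(today.year, month, day)
    if past_date_referenced and candidate > today:
        return today.year - 1      # most recent past occurrence
    if not past_date_referenced and candidate < today:
        return today.year + 1      # next future occurrence
    return today.year

today = datetime.date(2020, 1, 15)
assert resolve_year(1, 3, today, past_date_referenced=True) == 2019  # "1st March" in the past
assert resolve_year(1, 3, today) == 2020                             # upcoming 1st March
```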
@@ -537,6 +537,120 @@ def _detect_weekday(self, date_list, original_list):
 original_list.append(original)
 return date_list, original_list
+ def _gregorian_day_month_year_format(self, date_list=None, original_list=None):
+ """
+ Detects date in the following format
+
+ format: <day><separator><month><separator><year>
+ where each part is one of the formats given against them
+ day: d, dd
+ month: m, mm
+ year: yy, yyyy
+ separator: "/", "-", "."
+
+ Two character years are assumed to belong to the 21st century (20xx), unless a past date is referenced.
+ Only years between 1900 and 2099 are detected
+
+ Few valid examples:
+ "6/2/39", "7/01/1997", "28-12-2096"
+
+ Args:
+ date_list: Optional, list to store dictionaries of detected dates
+ original_list: Optional, list to store corresponding substrings of given text which were detected as
+ date entities
+ Returns:
+ A tuple of two lists with first list containing the detected date entities and second list containing their
+ corresponding substrings in the given text.
+
+ """
+ if original_list is None:
+ original_list = []
+ if date_list is None:
+ date_list = []
+ regex_pattern = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])'
+ r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W')
+ translate_number = self.convert_numbers(self.processed_text.lower())
+ patterns = regex_pattern.findall(translate_number)
+ for pattern in patterns:
+ original = pattern[0]
+ dd = int(pattern[1])
+ mm = int(pattern[2])
+ yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year
+ try:
+ # to catch dates which are not possible like "31/11" (november 31st)
+ if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \
+ < self.now_date:
+ yy += 1
+ except Exception:
+ return date_list, original_list
+
+ date = {
+ 'dd': int(dd),
+ 'mm': int(mm),
+ 'yy': int(yy),
+ 'type': TYPE_EXACT
+ }
+ date_list.append(date)
+ # original = self.regx_to_process.text_substitute(original)
+ if translate_number != self.processed_text.lower():
+ match = re.search(original, translate_number)
+ original_list.append(self.processed_text[(match.span()[0]):(match.span()[1])])
+ else:
+ original_list.append(original)
+ return date_list, original_list
+
+ @staticmethod
+ def convert_numbers(text):
+ """Replace unicode decimal digits (e.g. Devanagari १) with their ASCII equivalents."""
+ result = text
+ digit = re.compile(r'(\d)', re.U)
+ groups = digit.findall(result)
+ for group in groups:
+ result = result.replace(group, str(int(group)))
+ return result
+
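Together, `convert_numbers` and the `_gregorian_day_month_year_format` regex let the language-agnostic detector pick up dates written with non-ASCII digits. A quick illustration (the helper below is a simplified re-implementation for demonstration, not the method itself):

```python
import re

def convert_numbers(text):
    # int() understands unicode decimal digits, so १ -> 1, ३ -> 3, ६ -> 6
    return ''.join(str(int(ch)) if ch.isdigit() else ch for ch in text)

regex = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])'
                   r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W')

text = ' ' + convert_numbers(u'१/३/६६') + ' '  # the detectors pad text with spaces
print(regex.findall(text))                      # [('1/3/66', '1', '3', '66')]
```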
+ def normalize_year(self, year):
+ """
+ Normalize a two digit year to four digits by taking the bot message into consideration. Useful in cases
+ like date of birth where the past century is preferred over the current one. If no bot message is given,
+ it falls back to the current century
+
+ Args:
+ year (str): Year string to normalize
+
+ Returns:
+ str: year in four digits
+ """
+ # past_regex = re.compile(ur'birth|bday|dob|born|जन्म|जन्मदिन|పుట్టినరోజు|పుట్టిన', flags=re.UNICODE)
+ past_regex = None
+ # Todo: Add more language variations of birthday.
+ present_regex = None
+ future_regex = None
+ this_century = int(str(self.now_date.year)[:2])
+ if len(year) == 2:
+ if (((self.bot_message and past_regex and past_regex.search(self.bot_message))
+ or (self.past_date_referenced is True)) and (int(year) > int(str(self.now_date.year)[2:]))):
+ return str(this_century - 1) + year
+ elif present_regex and present_regex.search(self.bot_message):
+ return str(this_century) + year
+ elif future_regex and future_regex.search(self.bot_message):
+ return str(this_century + 1) + year
+
+ # if patterns didn't match or no bot message set, fallback to current century
+ if len(year) == 2:
+ return str(this_century) + year
+
+ return year
+
+ def set_bot_message(self, bot_message):
+ """
+ Sets the object's bot_message attribute
+
+ Args:
+ bot_message: the previous message that was sent by the bot
+ """
+ self.bot_message = bot_message
+
 def _update_processed_text(self, original_date_list):
 """
 Replaces detected date with tag generated from entity_name used to initialize the object with
diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py
index 93d95fa1c..9697606ce 100644
--- a/ner_v2/tests/temporal/date/en/test_date_detection.py
+++ b/ner_v2/tests/temporal/date/en/test_date_detection.py
@@ -1,3 +1,5 @@
+# coding=utf-8
+
 from __future__ import absolute_import
 import datetime
@@ -244,4 +246,31 @@ def test_en_gregorian_year_day_month_format(self):
 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'}
 }, date_dicts)
+ self.assertEqual(original_texts.count(message.lower()), 1)
+
+ def test_hi_gregorian_dd_mm_yy_format(self):
+ """
+ Date detection for pattern '१/३/६६'
+ """
+ message = u'१/३/६६'
+ locale = 'hi-in'
+ # With past_date_referenced=True, the two digit year '६६' resolves to 1966
+ day1 = 1
+ month = 3
+ year1 = 1966
+ past_date_referenced = True
+
+ date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='hi', locale=locale,
+ past_date_referenced=past_date_referenced)
+ date_dicts, original_texts = date_detector_object.detect_entity(message)
+
+ self.assertIn({
+ 'normal': True,
+ 'start_range': False,
+ 'end_range': False,
+ 'from': False,
+ 'to': False,
+ 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'}
+ }, date_dicts)
+ self.assertEqual(original_texts.count(message.lower()), 1)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0e474b44a..ab2dbcf31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,12 +2,12 @@ phonenumberslite==8.10.18
 six==1.11.0
 gunicorn==19.6.0
 pytz==2014.2
-nltk==3.2.5
+nltk==3.4.5
 numpy==1.10.4
 elasticsearch==5.5.0
 requests==2.20.0
 requests-aws4auth==0.9
-Django==1.11.22
+Django==1.11.27
 django-dotenv==1.4.2
 weighted-levenshtein==0.1
 regex==2018.7.11
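Taken together, the patch enables the usage exercised by the new test: Hindi-locale detection of a Devanagari-digit date with a past reference. A sketch of that end-to-end call (assumes the package and its language data files are installed):

```python
# Mirrors the new test_hi_gregorian_dd_mm_yy_format test case.
from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector

detector = DateAdvancedDetector(entity_name='date', language='hi', locale='hi-in',
                                past_date_referenced=True)
date_dicts, original_texts = detector.detect_entity(u'१/३/६६')
# Expected: one entity with value {'dd': 1, 'mm': 3, 'yy': 1966, 'type': 'date'}
print(date_dicts, original_texts)
```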