diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..ddfdf1a89 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,17 @@ +## JIRA Ticket Number + +JIRA TICKET: + +## Description of change +(REMOVE ME) Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. + +## Checklist (OPTIONAL): + +- [ ] My code follows the style guidelines of this project +- [ ] I have performed a self-review of my own code +- [ ] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] I have added tests that prove my fix is effective or that my feature works +- [ ] New and existing unit tests pass locally with my changes +- [ ] Any dependent changes have been merged and published in downstream modules diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py index fcc96376a..4c72caa3d 100755 --- a/chatbot_ner/urls.py +++ b/chatbot_ner/urls.py @@ -34,6 +34,7 @@ url(r'^v2/number/$', api_v2.number), url(r'^v2/phone_number/$', api_v2.phone_number), url(r'^v2/number_range/$', api_v2.number_range), + url(r'^v2/text/$', api_v2.text), # V2 bulk detectors url(r'^v2/date_bulk/$', api_v2.date), diff --git a/config.example b/config.example index 7fdba8a19..9ebf284c0 100644 --- a/config.example +++ b/config.example @@ -34,8 +34,8 @@ ES_ALIAS=entity_data ES_INDEX_1=entity_data_v1 ES_INDEX_2= ES_DOC_TYPE=data_dictionary -ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary ELASTICSEARCH_CRF_DATA_INDEX_NAME=entity_examples_data +ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary ES_BULK_MSG_SIZE=1000 ES_SEARCH_SIZE=10000 diff --git a/datastore/datastore.py b/datastore/datastore.py index fae8ad90e..4badd0f1b 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -151,7 +151,6 @@ def create(self, err_if_exists=True, **kwargs): if self._engine == ELASTICSEARCH: es_url = elastic_search.connect.get_es_url() - es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=None) create_map = [ # TODO: use namedtuples (True, ELASTICSEARCH_INDEX_1, ELASTICSEARCH_DOC_TYPE, self._store_name, self._check_doc_type_for_elasticsearch, elastic_search.create.create_entity_index), @@ -180,8 +179,10 @@ def create(self, err_if_exists=True, **kwargs): **kwargs ) if alias_name: - es_object.point_an_alias_to_index(es_url=es_url, alias_name=self._store_name, - index_name=index_name) + elastic_search.create.create_alias(connection=self._client_or_connection, + index_list=[index_name], + alias_name=alias_name, + logger=ner_logger) def delete(self, err_if_does_not_exist=True, **kwargs): """ @@ -208,15 +209,15 @@ def delete(self, err_if_does_not_exist=True, **kwargs): self._connect() if self._engine == ELASTICSEARCH: - for index_key in [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]: - if self._connection_settings.get(index_key): + delete_map = [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME] + for index_name_key in delete_map: + if self._connection_settings.get(index_name_key): + index_name = self._connection_settings.get(index_name_key) elastic_search.create.delete_index(connection=self._client_or_connection, - index_name=self._store_name, + index_name=index_name, logger=ner_logger, err_if_does_not_exist=err_if_does_not_exist, **kwargs) - # TODO: cleanup 
aliases ? - # === Incompatible or deprecated/duplicate APIs # FIXME: repopulate does not consider language of the variants diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index f06d7c77a..07a04e77f 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -1,9 +1,16 @@ +import logging +from typing import List, Dict, Any + +from elasticsearch import Elasticsearch +from elasticsearch.exceptions import NotFoundError + from .utils import filter_kwargs log_prefix = 'datastore.elastic_search.create' def exists(connection, index_name): + # type: (Elasticsearch, str) -> bool """ Checks if index_name exists @@ -18,6 +25,7 @@ def exists(connection, index_name): def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **kwargs): + # type: (Elasticsearch, str, logging.Logger, bool, **Any) -> None """ Deletes the index named index_name @@ -25,6 +33,7 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k connection: Elasticsearch client object index_name: The name of the index logger: logging object to log at debug and exception level + err_if_does_not_exist: if to raise error if index does not exist already, defaults to True kwargs: body: The configuration for the index (settings and mappings) master_timeout: Specify timeout for connection to master @@ -40,11 +49,17 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k else: return + try: + delete_alias(connection=connection, index_list=[index_name], alias_name='_all', logger=logger) + except NotFoundError: + logger.warning('No aliases found on on index %s', index_name) + connection.indices.delete(index=index_name, **kwargs) logger.debug('%s: Delete Index %s: Operation successfully completed', log_prefix, index_name) def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if_exists=True, **kwargs): + # type: (Elasticsearch, str, str, logging.Logger, Dict[str, Any], bool, **Any) -> None """ Creates an Elasticsearch index needed for similarity based searching Args: @@ -53,6 +68,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if doc_type: The type of the documents that will be indexed logger: logging object to log at debug and exception level mapping_body: dict, mappings to put on the index + err_if_exists: if to raise error if the index already exists, defaults to True kwargs: master_timeout: Specify timeout for connection to master timeout: Explicit operation timeout @@ -118,6 +134,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if def create_entity_index(connection, index_name, doc_type, logger, **kwargs): + # type: (Elasticsearch, str, str, logging.Logger, **Any) -> None """ Creates an mapping specific to entity storage in elasticsearch and makes a call to create_index to create the index with the given mapping body @@ -145,10 +162,32 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): mapping_body = { doc_type: { 'properties': { + 'language_script': { + 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, + }, + 'value': { + 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, + }, 'variants': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, 'analyzer': 'my_analyzer', 'norms': {'enabled': False}, # Needed if we want to give longer variants higher scores + }, + # other removed/unused fields, 
kept only for backward compatibility + 'dict_type': { + 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, + }, + 'entity_data': { + 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, + }, + 'source_language': { + 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, } } } @@ -158,6 +197,7 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): def create_crf_index(connection, index_name, doc_type, logger, **kwargs): + # type: (Elasticsearch, str, str, logging.Logger, **Any) -> None """ This method is used to create an index with mapping suited for story training_data Args: @@ -184,17 +224,17 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): mapping_body = { doc_type: { 'properties': { - "entity_data": { - "type": "text" + 'entity_data': { + 'type': 'text' }, - "sentence": { - "enabled": "false" + 'sentence': { + 'enabled': False }, - "entities": { - "enabled": "false" + 'entities': { + 'enabled': False }, - "language_script": { - "type": "text" + 'language_script': { + 'type': 'text' } } } @@ -204,10 +244,11 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): def create_alias(connection, index_list, alias_name, logger, **kwargs): + # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None """ This method is used to create alias for list of indices Args: - connection: + connection: Elasticsearch client object index_list (list): List of indices the alias has to point to alias_name (str): Name of the alias logger: logging object to log at debug and exception level @@ -215,6 +256,24 @@ def create_alias(connection, index_list, alias_name, logger, **kwargs): **kwargs: https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html """ - logger.debug('Alias creation %s started %s' % alias_name) + logger.debug('Putting alias %s to indices: %s', alias_name, str(index_list)) connection.indices.put_alias(index=index_list, name=alias_name, **kwargs) - logger.debug('Alias %s now points to indices %s' % (alias_name, str(index_list))) + logger.debug('Alias %s now points to indices %s', alias_name, str(index_list)) + + +def delete_alias(connection, index_list, alias_name, logger, **kwargs): + # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None + """ + Delete alias `alias_name` from list of indices in `index_list` + Args: + connection: Elasticsearch client object + index_list (list): List of indices the alias has to point to + alias_name (str): Name of the alias + logger: logging object to log at debug and exception level + + **kwargs: + https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html + """ + logger.debug('Removing alias %s from indices: %s', alias_name, str(index_list)) + connection.indices.delete_alias(index=index_list, name=alias_name, **kwargs) + logger.debug('Alias %s removed from indices %s', alias_name, str(index_list)) diff --git a/ner_constants.py b/ner_constants.py index f535c14b0..fc279bfa2 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -25,6 +25,11 @@ ENTITY_VALUE_DICT_KEY = 'value' +# datastore_verified a key to verify value from the datastore +DATASTORE_VERIFIED = 'datastore_verified' +# model_verified a key to verify value from the model +MODEL_VERIFIED = 'model_verified' + # ************************ constants tell us what to do with structured_value ************************ # This will execute entity detection on the 
structured_value.
STRUCTURED = 0
diff --git a/ner_v2/api.py b/ner_v2/api.py
index 611662a4c..2783d2609 100644
--- a/ner_v2/api.py
+++ b/ner_v2/api.py
@@ -11,6 +11,8 @@
 from ner_v2.detectors.temporal.time.time_detection import TimeDetector
 from ner_v2.detectors.numeral.number.number_detection import NumberDetector
 from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector
+
+from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request
 from language_utilities.constant import ENGLISH_LANG
 from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector
@@ -552,3 +554,148 @@ def phone_number(request):
         return HttpResponse(status=500)
     return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
+
+
+@csrf_exempt
+def text(request):
+    """
+    Uses TextDetector to get the values of multiple text entities. This is used
+    for both single-message and multi-message (bulk) detection.
+
+    Currently only the POST method is supported.
+
+    Args:
+        request: request for text detection
+
+    Request parameters
+
+        message (list of str): list of message strings on which detection logic needs to be run.
+
+        source_language (str): language in which the text entities have to be detected
+
+        bot_message (str): previous message from a bot/agent.
+
+        entities (dict): dictionary of entities to be detected; each entity dict will contain
+        the following details:
+
+            entity_name (str): name of the entity. Also acts as elastic-search dictionary name
+                               if entity uses elastic-search lookup
+            structured_value (str): [Optional] Value obtained from any structured elements.
+
+                Note: if a structured value is provided, detection is run on the structured value
+                instead of the message (for example, UI elements like form, payload, etc)
+
+            fallback_value (str): [Optional] If the detection logic fails to detect any value
+                either from structured_value or message then we return a fallback_value as an output.
+
+            use_fallback (bool): Defaults to False. If set for a single message,
+                the fallback value will be used.
+ + fuzziness (int): [Optional] Fuzziness value for each entity + + min_token_size_for_fuzziness (int): [Optional] minimum size for token match + + Returns: + response (django.http.response.HttpResponse): HttpResponse object + + + Examples: + + 1) For single message: + input request: + { + "message": ["I want to go to Jabalpur"], + "bot_message": null, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": "Delhi", + "fallback_value": null, + "predetected_values": ["Mumbai"], + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + }, + "restaurant": { + "structured_value": null, + "fallback_value": null, + "predetected_values": null, + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + } + } + } + output response: + { + "success": true, + "error": null, + "data": [ + { + "entities": { + "restaurant": [], + "city": [ + { + "entity_value": { + "value": "New Delhi", + "datastore_verified": true, + "model_verified": false + }, + "detection": "structure_value_verified", + "original_text": "delhi", + "language": "en" + }, + { + "entity_value": { + "value": "Mumbai", + "datastore_verified": false, + "model_verified": true + }, + "detection": "structure_value_verified", + "original_text": "Mumbai", + "language": "en" + } + ] + }, + "language": "en" + } + ] + } + """ + data = [] + + if request.method == "GET": + response = {"success": False, "error": "Get method is not allowed"} + return HttpResponse(json.dumps(response), status=501) + + elif request.method == "POST": + ner_logger.debug("Fetching result") + + try: + verify_text_request(request) + # if verify success get detection data + data = get_text_entity_detection_data(request) + + except KeyError as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) + except TypeError as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) + except Exception as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) + + if data: + response = {"success": True, "error": None, "data": data} + return HttpResponse(json.dumps(response), content_type='application/json', status=200) + else: + response = {"success": False, "error": "Some error while parsing"} + return HttpResponse(json.dumps(response), status=400) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 4fc0786fd..b3ff6fa24 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -679,7 +679,7 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?\s?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' + regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2}\s)?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' r'|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' 
r'(?:[\ \,\-]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) diff --git a/ner_v2/detectors/textual/__init__.py b/ner_v2/detectors/textual/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/textual/elastic_search.py b/ner_v2/detectors/textual/elastic_search.py new file mode 100644 index 000000000..a613c2833 --- /dev/null +++ b/ner_v2/detectors/textual/elastic_search.py @@ -0,0 +1,251 @@ +from __future__ import absolute_import + +import json +import six + +from itertools import chain +from elasticsearch import Elasticsearch + +from lib.singleton import Singleton +from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE +from datastore import constants +from datastore.exceptions import DataStoreSettingsImproperlyConfiguredException +from language_utilities.constant import ENGLISH_LANG + +from ner_v2.detectors.textual.queries import _generate_multi_entity_es_query, \ + _parse_multi_entity_es_results + + +class ElasticSearchDataStore(six.with_metaclass(Singleton, object)): + """ + Class responsible for holding connections and performing search in + ElasticSearch DB. + Used as a singleton in this module. + """ + + def __init__(self): + self._engine_name = constants.ELASTICSEARCH + self._kwargs = {} + self._conns = {} + self._connection_settings = {} + self._connection = None + self._index_name = None + + self.query_data = [] + + # configure variables and connection + self._configure_store() + + # define doc type + self.doc_type = self._connection_settings[ + constants.ELASTICSEARCH_DOC_TYPE] + + def _configure_store(self, **kwargs): + """ + Configure self variables and connection. + Also add default connection to registry with alias `default` + """ + self._connection_settings = CHATBOT_NER_DATASTORE. \ + get(self._engine_name) + + if self._connection_settings is None: + raise DataStoreSettingsImproperlyConfiguredException() + + self._index_name = self._connection_settings[constants.ELASTICSEARCH_ALIAS] + self._connection = self.connect(**self._connection_settings) + + self._conns['default'] = self._connection + + def add_new_connection(self, alias, conn): + """ + Add new connection object, which can be directly passed through as-is to + the connection registry. + """ + self._conns[alias] = conn + + def get_or_create_new_connection(self, alias="default", **kwargs): + """ + Retrieve a connection with given alias. + Construct it if necessary (only when configuration was passed to us). + + If some non-string alias has been passed through it assume a client instance + and will just return it as-is. + + Raises ``KeyError`` if no client (or its definition) is registered + under the alias. + """ + + if not isinstance(alias, six.string_types): + return alias + + # connection already established + try: + return self._conns[alias] + except KeyError: + pass + + # if not, try to create it a new connection + try: + conn = self.connect(**kwargs) + self._conns[alias] = conn + except KeyError: + # no connection and no kwargs to set one up + raise KeyError("There is no connection with alias %r." 
% alias) + + # check if this is necessary here + def _check_doc_type_for_elasticsearch(self): + """ + Checks if doc_type is present in connection settings, if not an exception is raised + + Raises: + DataStoreSettingsImproperlyConfiguredException if doc_type was not found in + connection settings + """ + # TODO: This check should be during init or boot + if constants.ELASTICSEARCH_DOC_TYPE not in self._connection_settings: + ner_logger.debug("No doc type is present") + raise DataStoreSettingsImproperlyConfiguredException( + 'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment') + + def generate_query_data(self, entities, texts, fuzziness_threshold=1, + search_language_script=ENGLISH_LANG): + + # check if text is string + if isinstance(texts, str): + texts = [texts] + + index_header = json.dumps({'index': self._index_name, 'type': self.doc_type}) + + data = list(chain.from_iterable([[index_header, + json.dumps(_generate_multi_entity_es_query( + entities=entities, + text=each, + fuzziness_threshold=fuzziness_threshold, + language_script=search_language_script))] + for each in texts])) + + return data + + def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, + search_language_script=ENGLISH_LANG, **kwargs): + """ + Returns: + list of collections.OrderedDict: dictionary mapping each entity for each text + with their value variants to entity value + + Example: + db = ElasticSearchDataStore() + entities = ['city', 'restaurant'] + texts = ['I want to go to mumbai and eat at dominoes pizza', + ' I want to go Jabalpur'] + + get_multi_entity_results(entities, texts) + + Output: + [ + { + 'restaurant': OrderedDict([ + ("Domino's Pizza", "Domino's Pizza"), + ('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), + ('Pizza Pizza Pizza', 'Pizza Pizza Pizza'), + ('Pizza', 'U S Pizza')]), + 'city': OrderedDict([ + ('Mumbai', 'Mumbai'), + ('mumbai', 'mumbai')])}, + { + 'city': OrderedDict([ + ('Jabalpur', 'Jabalpur'), + ('Jamalpur', 'Jamalpur'), + ('goa', 'goa')]), + 'restaurant': OrderedDict([ + ('TMOS', 'TMOS'), ('G.', 'G Pulla Reddy Sweets')])} + ] + """ + + self._check_doc_type_for_elasticsearch() + request_timeout = self._connection_settings.get('request_timeout', 20) + index_name = self._index_name + + data = [] + for entity_list, text_list in zip(entities, texts): + data.extend(self.generate_query_data(entity_list, text_list, fuzziness_threshold, + search_language_script)) + + # add `\n` for each index_header and query data text entry + query_data = '\n'.join(data) + + kwargs = dict(body=query_data, doc_type=self.doc_type, index=index_name, + request_timeout=request_timeout) + + results = self._run_es_search(self._connection, **kwargs) + results = _parse_multi_entity_es_results(results.get("responses")) + + return results + + @staticmethod + def connect(connection_url=None, host=None, port=None, user=None, password=None, **kwargs): + """ + Establishes connection to a single Elasticsearch Instance. + if connection_url is not None, then host, port, user, password are not used + Args: + connection_url: Elasticsearch connection url of the format https://user:secret@host:port/abc . + Optional if other parameters are provided. + host: nodes to connect to . e.g. localhost. Optional if connection_url is provided + port: port for elasticsearch connection. 
Optional if connection_url is provided + user: Optional, username for elasticsearch authentication + password: Optional, password for elasticsearch authentication + kwargs: any additional arguments will be passed on to the Transport class and, subsequently, + to the Connection instances. + Returns: + Elasticsearch client connection object + + """ + connection = None + if user and password: + kwargs = dict(kwargs, http_auth=(user, password)) + + if connection_url: + connection = Elasticsearch(hosts=[connection_url], **kwargs) + elif host and port: + connection = Elasticsearch(hosts=[{'host': host, 'port': int(port)}], **kwargs) + + if connection and not connection.ping(): + connection = None + + return connection + + @staticmethod + def _run_es_search(connection, **kwargs): + """ + Execute the elasticsearch.ElasticSearch.msearch() method and return all results + Args: + connection: Elasticsearch client object + kwargs: + Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + Returns: + dictionary, search results from elasticsearch.ElasticSearch.msearch + """ + + return connection.msearch(**kwargs) + + @staticmethod + def _get_dynamic_fuzziness_threshold(fuzzy_setting): + """ + Approximately emulate AUTO:[low],[high] functionality of elasticsearch 6.2+ on older versions + + Args: + fuzzy_setting (int or str): Can be int or "auto" or "auto:," + + Returns: + int or str: fuzziness as int when ES version < 6.2 + otherwise the input is returned as it is + """ + if isinstance(fuzzy_setting, six.string_types): + if constants.ELASTICSEARCH_VERSION_MAJOR > 6 \ + or (constants.ELASTICSEARCH_VERSION_MAJOR == 6 + and constants.ELASTICSEARCH_VERSION_MINOR >= 2): + return fuzzy_setting + return 'auto' + + return fuzzy_setting diff --git a/ner_v2/detectors/textual/queries.py b/ner_v2/detectors/textual/queries.py new file mode 100644 index 000000000..a6de5a5ab --- /dev/null +++ b/ner_v2/detectors/textual/queries.py @@ -0,0 +1,233 @@ +from __future__ import absolute_import + +import collections +import json +import re + +from six.moves import zip +from six import string_types + +from datastore import constants +from language_utilities.constant import ENGLISH_LANG +from lib.nlp.const import TOKENIZER + + +def _generate_multi_entity_es_query(entities, text, + fuzziness_threshold=1, + language_script=ENGLISH_LANG, + size=constants.ELASTICSEARCH_SEARCH_SIZE, + as_json=False): + """ + Generates compound elasticsearch boolean filter search query dictionary + for a text for multiple entity_data. + The query generated searches for entity_name in the index and returns search results for the + matched word (of sentence) only if entity_name is found. + + Args: + entities (list/str): list of the entity to perform a 'term' query on. + If str will converted to list internally. + text (str): The text on which we need to identify the entities. + fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter. + Defaults to 1 + language_script (str, optional): language of documents to be searched, + optional, defaults to 'en' + size (int, optional): number of records to return, + defaults to `ELASTICSEARCH_SEARCH_SIZE` + as_json (bool, optional): Return the generated query as json string. + useful for debug purposes. 
Defaults to False + + Returns: + dictionary, the search query for the text + + Examples Query generated: + _generate_multi_entity_es_query(['city', 'restaurant'], "I want to go to + mumbai") + + Outputs: + { + '_source': ['value', 'entity_data'], + 'query': {'bool': {'filter': + [{'terms': + {'entity_data': ['city', 'restaurant']}}, + {'terms': {'language_script': ['en']}}], + 'should': [{'match': + {'variants': {'query': 'I want to go to mumbai', + 'fuzziness': 1, 'prefix_length': 1}}}], + 'minimum_should_match': 1}}, + 'highlight': + {'fields': {'variants': {'type': 'unified'}}, + order': 'score', 'number_of_fragments': 20}, + 'size': 10000 + } + """ + + # if entities instance of string convert to list + if isinstance(entities, string_types): + entities = [entities] + + filter_terms = [] + term_dict_entity_name = { + 'terms': { + 'entity_data': entities + } + } + filter_terms.append(term_dict_entity_name) + + # search on language_script, add english as default search + term_dict_language = { + 'terms': { + 'language_script': [ENGLISH_LANG] + } + } + + if language_script != ENGLISH_LANG: + term_dict_language['terms']['language_script'].append(language_script) + + filter_terms.append(term_dict_language) + + should_terms = [] + query = { + 'match': { + 'variants': { + 'query': text, + 'fuzziness': fuzziness_threshold, + 'prefix_length': 1 + } + } + } + + should_terms.append(query) + + data = { + '_source': ['value', 'entity_data'], + 'query': { + 'bool': { + 'filter': filter_terms, + 'should': should_terms, + 'minimum_should_match': 1 + }, + }, + 'highlight': { + 'fields': { + 'variants': { + 'type': 'unified' + } + }, + 'order': 'score', + 'number_of_fragments': 20 + }, + 'size': size + } + + if as_json: + data = json.dumps(data) + + return data + + +def _parse_multi_entity_es_results(results_list): + """ + This will parse highlighted results returned from elasticsearch query and + generate a variants to values dictionary mapped to each entity for each + search text terms. + + Args: + results_list (list of dict): + search results list of dictionaries from elasticsearch including highlights + and scores + + Returns: + list of dict of collections.OrderedDict: + list containing dicts mapping each entity to matching variants to their entity + values based on the parsed results from highlighted search query results + + Example: + Parameter ngram_results has highlighted search results as follows: + + [ + {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, + u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn', + u'_index': u'doc_type_name', + u'_score': 11.501145, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'goa', + u'variants': [u'', u'goa']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'goa']}}, + {u'_id': u'AVrW02W99WNuMIY9vmcf', + u'_index': u'entity_data', + u'_score': 11.210829, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'Mumbai', + u'variants': [u'', u'Mumbai']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'Mumbai']}}, + ... 
+ u'max_score': 11.501145, + u'total': 17}, + u'timed_out': False, + u'took': 96} + ] + + After parsing highlighted results, this function returns + + [ + { + 'city': OrderedDict([ + ('Mumbai', 'Mumbai'), + ('mumbai', 'mumbai'), + ('goa', 'goa') + ]) + }, + { + 'city': OrderedDict([ + ('Jabalpur', 'Jabalpur'), + ('Jamalpur', 'Jamalpur'), + ('goa', 'goa') + ]) + } + ] + + + + """ + entity_variants_to_values_list = [] + + if results_list: + for results in results_list: + entity_dict = {} + entity_variants_to_values_dict = {} + + if results['hits']['total'] > 0: + for hit in results['hits']['hits']: + if 'highlight' not in hit: + continue + + value = hit['_source']['value'] + entity_name = hit['_source']['entity_data'] + + if entity_name not in entity_dict: + entity_dict[entity_name] = {'value': [], 'variant': []} + + entity_dict[entity_name]['value'].extend( + [value for _ in hit['highlight']['variants']]) + entity_dict[entity_name]['variant'].extend( + [variant for variant in hit['highlight']['variants']]) + + for each_entity in entity_dict.keys(): + entity_values = entity_dict[each_entity]['value'] + entity_variants = entity_dict[each_entity]['variant'] + entity_variants_to_values = collections.OrderedDict() + + for value, variant in zip(entity_values, entity_variants): + variant = re.sub(r'\s+', ' ', variant.strip()) + variant_no_highlight_tags = variant.replace('', '').replace('', '').strip() + if variant.count('') == len(TOKENIZER.tokenize(variant_no_highlight_tags)): + variant = variant_no_highlight_tags + if variant not in entity_variants_to_values: + entity_variants_to_values[variant] = value + entity_variants_to_values_dict[each_entity] = entity_variants_to_values + entity_variants_to_values_list.append(entity_variants_to_values_dict) + return entity_variants_to_values_list diff --git a/ner_v2/detectors/textual/tests/__init__.py b/ner_v2/detectors/textual/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/textual/tests/query_generate_output_entity_list_data.json b/ner_v2/detectors/textual/tests/query_generate_output_entity_list_data.json new file mode 100644 index 000000000..36e71657d --- /dev/null +++ b/ner_v2/detectors/textual/tests/query_generate_output_entity_list_data.json @@ -0,0 +1,49 @@ +{ + "_source": [ + "value", + "entity_data" + ], + "query": { + "bool": { + "filter": [ + { + "terms": { + "entity_data": [ + "city", + "restaurant" + ] + } + }, + { + "terms": { + "language_script": [ + "en" + ] + } + } + ], + "should": [ + { + "match": { + "variants": { + "query": "I want to go to mumbai", + "fuzziness": 1, + "prefix_length": 1 + } + } + } + ], + "minimum_should_match": 1 + } + }, + "highlight": { + "fields": { + "variants": { + "type": "unified" + } + }, + "order": "score", + "number_of_fragments": 20 + }, + "size": 10000 +} diff --git a/ner_v2/detectors/textual/tests/query_generate_output_entity_string_data.json b/ner_v2/detectors/textual/tests/query_generate_output_entity_string_data.json new file mode 100644 index 000000000..df5e350e1 --- /dev/null +++ b/ner_v2/detectors/textual/tests/query_generate_output_entity_string_data.json @@ -0,0 +1,48 @@ +{ + "_source": [ + "value", + "entity_data" + ], + "query": { + "bool": { + "filter": [ + { + "terms": { + "entity_data": [ + "city" + ] + } + }, + { + "terms": { + "language_script": [ + "en" + ] + } + } + ], + "should": [ + { + "match": { + "variants": { + "query": "I want to go to mumbai", + "fuzziness": 1, + "prefix_length": 1 + } + } + } + ], + "minimum_should_match": 1 + } + 
}, + "highlight": { + "fields": { + "variants": { + "type": "unified" + } + }, + "order": "score", + "number_of_fragments": 20 + }, + "size": 10000 +} diff --git a/ner_v2/detectors/textual/tests/query_parse_input_data.json b/ner_v2/detectors/textual/tests/query_parse_input_data.json new file mode 100644 index 000000000..8d30fc176 --- /dev/null +++ b/ner_v2/detectors/textual/tests/query_parse_input_data.json @@ -0,0 +1,171 @@ +[ + { + "took": 13, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 5, + "max_score": 6.4403224, + "hits": [ + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpst4hee1DSR1e2_7_", + "_score": 6.4403224, + "_source": { + "entity_data": "city", + "value": "Mumbai" + }, + "highlight": { + "variants": [ + "Mumbai" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpstzYee1DSR1e2_0I", + "_score": 6.3653703, + "_source": { + "entity_data": "city", + "value": "mumbai" + }, + "highlight": { + "variants": [ + "mumbai" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_sI", + "_score": 6.3596706, + "_source": { + "entity_data": "city", + "value": "Wani" + }, + "highlight": { + "variants": [ + "Wani" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_r9", + "_score": 4.420918, + "_source": { + "entity_data": "city", + "value": "East" + }, + "highlight": { + "variants": [ + "east" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_wp", + "_score": 3.8191679, + "_source": { + "entity_data": "city", + "value": "goa" + }, + "highlight": { + "variants": [ + "goa" + ] + } + } + ] + }, + "status": 200 + }, + { + "took": 12, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 4, + "max_score": 8.485634, + "hits": [ + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpstzZee1DSR1e2_4u", + "_score": 8.485634, + "_source": { + "entity_data": "city", + "value": "Jabalpur" + }, + "highlight": { + "variants": [ + "Jabalpur" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_yQ", + "_score": 7.418161, + "_source": { + "entity_data": "city", + "value": "Jamalpur" + }, + "highlight": { + "variants": [ + "Jamalpur" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_sI", + "_score": 6.3596706, + "_source": { + "entity_data": "city", + "value": "Wani" + }, + "highlight": { + "variants": [ + "Wani" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_wp", + "_score": 3.8191679, + "_source": { + "entity_data": "city", + "value": "goa" + }, + "highlight": { + "variants": [ + "goa" + ] + } + } + ] + }, + "status": 200 + } +] diff --git a/ner_v2/detectors/textual/tests/query_parse_output_data.json b/ner_v2/detectors/textual/tests/query_parse_output_data.json new file mode 100644 index 000000000..435a6bf64 --- /dev/null +++ b/ner_v2/detectors/textual/tests/query_parse_output_data.json @@ -0,0 +1,19 @@ +[ + { + "city": { + "Mumbai": "Mumbai", + "mumbai": "mumbai", + "Wani": "Wani", + "east": "East", + "goa": "goa" + } + }, + { + "city": { + "Jabalpur": "Jabalpur", + "Jamalpur": "Jamalpur", + "Wani": "Wani", + "goa": "goa" + } + } +] diff --git 
a/ner_v2/detectors/textual/tests/test_elastic_search.py b/ner_v2/detectors/textual/tests/test_elastic_search.py new file mode 100644 index 000000000..ad083771f --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_elastic_search.py @@ -0,0 +1,83 @@ +from __future__ import absolute_import + +from django.test import TestCase +from elasticsearch import Elasticsearch + +from ner_v2.detectors.textual.elastic_search import ElasticSearchDataStore +from chatbot_ner.config import CHATBOT_NER_DATASTORE + + +class TestESDataStore(TestCase): + def test_elasticsearch_connection(self): + c = ElasticSearchDataStore() + + connection = c.get_or_create_new_connection('default') + + self.assertIsInstance(connection, Elasticsearch) + + # :TODO: configure parameters here + def test_elasticsearch_connect(self): + kwargs = CHATBOT_NER_DATASTORE.get('elasticsearch') + + connection = ElasticSearchDataStore.connect(**kwargs) + + self.assertIsInstance(connection, Elasticsearch) + + def test_elasticsearch_get_connection(self): + c = ElasticSearchDataStore() + + conn = c.get_or_create_new_connection() + self.assertIsInstance(conn, Elasticsearch) + + def test_elasticsearch_add_connection(self): + kwargs = CHATBOT_NER_DATASTORE.get('elasticsearch') + c = Elasticsearch(**kwargs) + + es = ElasticSearchDataStore() + es.add_new_connection('new', c) + + conn = es.get_or_create_new_connection() + new_conn = es.get_or_create_new_connection('new') + + self.assertIsInstance(new_conn, Elasticsearch) + self.assertIsInstance(c, Elasticsearch) + self.assertIsInstance(conn, Elasticsearch) + + def test_elasticsearch_get_dynamic_fuzziness_threshold(self): + fuzzy = 1 + + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) + + self.assertEqual(fuzzy_threshold, fuzzy) + + fuzzy = '1' + + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) + + self.assertEqual(fuzzy_threshold, 'auto') + + # :TODO: Check if below is expected + fuzzy = 'some_string' + + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) + + self.assertEqual(fuzzy_threshold, 'auto') + + def test_add_query(self): + es = ElasticSearchDataStore() + + entity_list_1 = ['city', 'restaurant'] + text_1 = "I want to go to mumbai" + + query_data = es.generate_query_data(entities=entity_list_1, texts=text_1) + + assert_data = ['{"index": "entity_data", "type": "data_dictionary"}', + '{"_source": ["value", "entity_data"], ' + '"query": {"bool": {"filter": [{"terms": {"entity_data":' + ' ["city", "restaurant"]}}, {"terms": {"language_script": ["en"]}}],' + ' "should": [{"match": {"variants": {"query": "I want to go to mumbai",' + ' "fuzziness": 1, "prefix_length": 1}}}], "minimum_should_match": 1}},' + ' "highlight": {"fields": {"variants": {"type": "unified"}},' + ' "order": "score", "number_of_fragments": 20}, "size": 10000}'] + + self.assertListEqual(query_data, assert_data) diff --git a/ner_v2/detectors/textual/tests/test_queries.py b/ner_v2/detectors/textual/tests/test_queries.py new file mode 100644 index 000000000..0d93a29d0 --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_queries.py @@ -0,0 +1,68 @@ +from __future__ import absolute_import + +import json +import os +from django.test import TestCase + +from ner_v2.detectors.textual.queries import _parse_multi_entity_es_results, \ + _generate_multi_entity_es_query + + +es_tests_directory = os.path.dirname(os.path.abspath(__file__)) + + +class TestESDataStoreQueries(TestCase): + + def test_parse_multi_entity_es_results(self): + # get 
input data from file `query_parse_input_data.json` + + input_test_file = os.path.join(es_tests_directory, 'query_parse_input_data.json') + output_test_file = os.path.join(es_tests_directory, 'query_parse_output_data.json') + + with open(input_test_file, 'r') as f: + input_data = json.load(f) + + result = _parse_multi_entity_es_results(input_data) + + # get output data from file `query_parse_output_data.json` + with open(output_test_file, 'r') as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result[1], output_data[1]) + + def test_generate_multi_entity_es_query_list(self): + + entity_list = ['city', 'restaurant'] + text = "I want to go to mumbai" + output_test_file = os.path.join(es_tests_directory, + 'query_generate_output_entity_list_data.json') + + result = _generate_multi_entity_es_query(entity_list, text) + + with open(output_test_file, "r") as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result, output_data) + + def test_generate_multi_entity_es_query_string(self): + + entity_string = "city" + text = "I want to go to mumbai" + output_test_file = os.path.join(es_tests_directory, + 'query_generate_output_entity_string_data.json') + + result = _generate_multi_entity_es_query(entity_string, text) + + with open(output_test_file, "r") as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result, output_data) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py new file mode 100644 index 000000000..909cd2633 --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -0,0 +1,188 @@ +from __future__ import absolute_import + +import os + +from collections import OrderedDict +from mock import patch + +from django.test import TestCase + +from ner_v2.detectors.textual.text_detection import TextDetector + +tests_directory = os.path.dirname(os.path.abspath(__file__)) + + +class TestTextualUtils(TestCase): + + def test_text_detector_intialization(self): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': [[]], + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + self.assertIsInstance(text_detector, TextDetector) + + self.assertEqual(language, text_detector._source_language_script) + self.assertEqual(target_language_script, text_detector._target_language_script) + + self.assertDictEqual(entity_dict, text_detector.entities_dict) + + @patch('ner_v2.detectors.textual.elastic_search.' 
+ 'ElasticSearchDataStore.get_multi_entity_results') + def test_text_detection_detect_single_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + message = "I want to go to Mumbai to order Dominoes" + + mock_es_query.return_value = [{ + 'restaurant': OrderedDict([('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), ('TMOS', 'TMOS'), + ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Wani', 'Wani'), ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ('goa', 'goa')])}] + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + result = text_detector.detect(message=message) + + assert_output = [{'city': [{ + 'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], + 'restaurant': [ + {'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}] + + self.maxDiff = None + self.assertListEqual(result, assert_output) + + @patch('ner_v2.detectors.textual.elastic_search.' + 'ElasticSearchDataStore.get_multi_entity_results') + def test_text_detection_detect_bulk_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + message = ['I want to go to Mumbai to order Dominoes', + 'I want to go to Delhi'] + + mock_es_query.return_value = [{ + 'restaurant': OrderedDict([('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), + ('TMOS', 'TMOS'), ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Wani', 'Wani'), ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ('goa', 'goa')])}, + {'restaurant': OrderedDict([('TMOS', 'TMOS'), + ('Deli', 'Deli'), + ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Delhi', 'New Delhi'), ('Wani', 'Wani'), + ('goa', 'goa')])}] + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + result = text_detector.detect_bulk(messages=message) + + assert_output = [{'city': [{ + 'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], + 'restaurant': [{ + 'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, + {'city': [{'entity_value': {'value': 'New Delhi', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], + 'restaurant': [{'entity_value': {'value': 'Deli', 'datastore_verified': 
True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}]}] + + self.maxDiff = None + self.assertListEqual(result, assert_output) + + def test_text_detection_set_fuzziness_hi_lo_threshold(self): + + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': [[]], + 'fuzziness': "5,8", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + fuzziness = entity_dict['city']['fuzziness'] + + # assert for default fuzziness hi and low i.e. 4,7 + self.assertEqual(text_detector._fuzziness_lo, 4) + self.assertEqual(text_detector._fuzziness_hi, 7) + + # set new threshold and assert\ + text_detector.set_fuzziness_low_high_threshold(fuzziness) + self.assertEqual(text_detector._fuzziness_lo, 5) + self.assertEqual(text_detector._fuzziness_hi, 8) + + def test_text_detection_get_substring(self): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': [[]], + 'fuzziness': "2,4", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + substring = text_detector._get_entity_substring_from_text('Mmsbai', 'Mumbai', 'city') + + self.assertEqual(substring, 'Mmsbai') diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py new file mode 100644 index 000000000..a192b10cc --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -0,0 +1,225 @@ +from __future__ import absolute_import + +import json +import os + +from collections import OrderedDict +from mock import patch + +from django.test import TestCase +from django.http import HttpRequest + +from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request, \ + get_output_for_fallback_entities, get_detection + +tests_directory = os.path.dirname(os.path.abspath(__file__)) + + +class TestTextualUtils(TestCase): + + def test_get_output_for_fallback_entities(self): + input_data = {'city': {'fallback_value': 'Mumbai', 'ignore_message': True}, + 'restaurant': {'fallback_value': None, 'ignore_message': True}} + + assert_output_data = {'city': [{'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', 'language': 'en'}], + 'restaurant': []} + + result = get_output_for_fallback_entities(input_data) + + self.assertDictEqual(result, assert_output_data) + + def test_verify_text_request_ok(self): + request = HttpRequest() + + # test if everything is ok + request._body = b'{"messages":["something"], "entities":{"something":""}}' + verify_text_request(request) + + def test_verify_text_request_exceptions(self): + request = HttpRequest() + + # test if no message + request._body = b'{}' + self.assertRaises(KeyError, verify_text_request, request=request) + + # test if no entities + request._body = b'{"messages": "something"}' + self.assertRaises(KeyError, verify_text_request, request=request) + + # test if message not in proper format + request._body = b'{"messages":"something", "entities":"something"}' + self.assertRaises(TypeError, verify_text_request, request=request) + + # test if 
entities not in proper format + request._body = b'{"messages":["something"], "entities":"something"}' + self.assertRaises(TypeError, verify_text_request, request=request) + + @patch('ner_v2.detectors.textual.utils.get_detection') + def test_get_text_entity_detection_data(self, mock_get_detection): + input_data = { + "messages": ["I want to go to Mumbai"], + "bot_message": None, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": 4, + "min_token_len_fuzziness": 4, + "ignore_message": None + }, + + "restaurant": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": None, + "min_token_len_fuzziness": None, + "ignore_message": True + }, + } + + } + + request = HttpRequest() + + request._body = json.dumps(input_data) + + mock_get_detection.return_value = [{'entities': {'city': [ + {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', + 'language': 'en'}], 'restaurant': []}, + 'language': 'en'}] + + output = get_text_entity_detection_data(request) + + assert_output = [{ + 'entities': {'entities': {'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', + 'language': 'en'}], 'restaurant': []}, + 'language': 'en', 'restaurant': []}, 'language': 'en'}] + + self.assertListEqual(output, assert_output) + + @patch('ner_v2.detectors.textual.utils.get_detection') + def test_get_text_entity_detection_data_structured(self, mock_get_detection): + input_data = { + "messages": ["I want to go to Mumbai"], + "bot_message": None, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": 4, + "min_token_len_fuzziness": 4, + "ignore_message": None + }, + + "restaurant": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": None, + "min_token_len_fuzziness": None, + "ignore_message": True + }, + } + + } + + request = HttpRequest() + + request._body = json.dumps(input_data) + + mock_get_detection.return_value = [{'city': [ + {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'structure_value_verified', 'original_text': 'delhi', 'language': 'en'}]}] + + output = get_text_entity_detection_data(request) + + assert_output = [{'entities': {'city': [ + {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'structure_value_verified', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}, + 'language': 'en'}] + + self.assertListEqual(output, assert_output) + + @patch('ner_v2.detectors.textual.elastic_search.' 
+ 'ElasticSearchDataStore.get_multi_entity_results') + def test_get_text_detection_string_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'ignore_message': None}} + + message = "I want to go to Mumbai" + + mock_es_query.return_value = [ + {'city': OrderedDict([('Wani', 'Wani'), + ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), + ('goa', 'goa')]) + }] + + output = get_detection(message, entity_dict) + assert_output = [ + {'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, 'detection': 'message', + 'original_text': 'mumbai', + 'language': 'en'}]}] + + self.assertDictEqual(assert_output[0], output[0]) + + @patch('ner_v2.detectors.textual.elastic_search.' + 'ElasticSearchDataStore.get_multi_entity_results') + def test_get_text_detection_list_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': "4,7", + 'min_token_len_fuzziness': 4, + 'ignore_message': None}} + + message = ["I want to go to Mumbai", "I want to go to Delhi"] + + mock_es_query.return_value = [ + {'city': OrderedDict([('Wani', 'Wani'), + ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ( + 'goa', 'goa')])}, + {'city': OrderedDict([('Delhi', 'New Delhi'), + ('Wani', 'Wani'), + ('goa', 'goa')])}] + + output = get_detection(message, entity_dict) + assert_output = [{'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'mumbai', + 'language': 'en'}]}, + {'city': [ + {'entity_value': {'value': 'New Delhi', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'delhi', + 'language': 'en'}]}] + + self.assertListEqual(assert_output, output) diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py new file mode 100644 index 000000000..58d238a34 --- /dev/null +++ b/ner_v2/detectors/textual/text_detection.py @@ -0,0 +1,850 @@ +from __future__ import absolute_import + +import collections +import string + +import six +from six import iteritems + +import language_utilities.constant as lang_constant +from chatbot_ner.config import ner_logger + +from ner_v2.detectors.textual.elastic_search import ElasticSearchDataStore + +from lib.nlp.const import TOKENIZER, whitespace_tokenizer +from lib.nlp.levenshtein_distance import edit_distance +from six.moves import range + +from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, + FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, + DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) + +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED + +from language_utilities.constant import ENGLISH_LANG + +try: + import regex as re + + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + import re + + _re_flags = re.UNICODE + + +class TextDetector(object): + """ + TextDetector detects multiple custom entities in text string by performing similarity searches against a list + fetched from elasticsearch datastore. + + TextDetector detects text type custom entities that do not adhere to some strict/weak formats which other entities + like date, time, email, etc do. 
Examples of such types of entities can be city, food dish name, brand names etc
+
+    Attributes:
+        entities_dict (dict): dict with details of entities to be detected. Each entity will contain:
+            `value`: name of the entity
+
+            `_fuzziness` (str or int): If this parameter is str, elasticsearch's
+                auto is used with low and high term distances. Default low and high
+                term distances are 3 and 6 for elasticsearch. For this module they are
+                set to 4 and 7 respectively.
+
+                In auto mode, if length of term is less than low it must match exactly,
+                if it is between [low, high) one insert/delete/substitution is allowed,
+                for anything higher than equal to high, two inserts/deletes/substitutions
+                are allowed
+
+            `_min_token_size_for_fuzziness (int)`: minimum number of letters a word must
+                have to be considered for calculating edit distance with similar
+                ngrams from the datastore
+        processed_text (str): string with detected text entities removed
+    """
+
+    def __init__(self, entity_dict=None,
+                 source_language_script=lang_constant.ENGLISH_LANG,
+                 target_language_script=ENGLISH_LANG):
+
+        # define entities to detect
+        self.entities_dict = entity_dict
+
+        self.processed_text = None
+        self.__texts = []
+        self.__processed_texts = []
+
+        # defaults for auto mode
+        self._fuzziness = "4,7"
+
+        self._fuzziness_lo, self._fuzziness_hi = 4, 7
+        self._min_token_size_for_fuzziness = self._fuzziness_lo
+
+        # defaults for non-auto mode
+        self._min_token_size_for_fuzziness = 4
+
+        # define data store and target languages
+        self.esdb = ElasticSearchDataStore()
+
+        self._source_language_script = source_language_script
+        self._target_language_script = target_language_script
+
+        # set default ES query fuzziness as `auto`
+        self._es_fuzziness = "auto"
+
+    def _reset_state(self):
+        """
+        Reset all the intermediary states of detection class.
+        """
+        self.processed_text = None
+        self.__texts = []
+        self.__processed_texts = []
+
+    def set_fuzziness_low_high_threshold(self, fuzziness):
+        """
+        Sets the low and high fuzziness thresholds for similarity searches.
+        The fuzziness threshold corresponds to the maximum Levenshtein's distance
+        allowed during similarity matching
+        Args:
+
+            fuzziness (iterable or int): If this parameter is int, elasticsearch's auto is used with
+                low and high term distances.
+
+                Please make sure the iterable has only two integers like (4, 7).
+ This will generate "auto:4,7" + + Note that this also sets _min_token_size_for_fuzziness to first value of the iterable + If this argument is int, elasticsearch will set fuzziness as min(2, fuzziness) + """ + try: + iter(fuzziness) + if len(fuzziness) == 3: + lo, hi = fuzziness.split(",") + self._fuzziness_lo, self._fuzziness_hi = int(lo), int(hi) + self._min_token_size_for_fuzziness = self._fuzziness_lo + except TypeError: + ner_logger.exception(f"Fuzziness not in correct format, got {fuzziness}") + raise TypeError('Fuzziness has to be an iterable of length 2 ') + + def _get_fuzziness_threshold_for_token(self, token): + """ + Return dynamic fuzziness threshold for damerau-levenshtein check based on length of token if elasticsearch + fuzziness was set to auto mode + + Args: + token (str or unicode): the string to calculate fuzziness threshold for + fuzziness (int): fuzziness value provided + + Returns: + int: fuzziness threshold for ngram matching on elastic search results + """ + + if len(token) < self._fuzziness_lo: + return 0 # strict match + elif len(token) >= self._fuzziness_hi: + return 2 # Allow upto two inserts/deletes and one substitution + else: + return 1 # lo <= len < hi Allow only insert/delete + + def set_min_token_size_for_levenshtein(self, min_size): + """ + Sets the minimum number of letters a word must have to be considered for calculating edit + distance with similar ngrams from the datastore + + Args: + min_size: integer, maximum allowed Levenshtein's distance from the word/phrase being tested for + entity match + """ + self._min_token_size_for_fuzziness = min_size + + def _process_text(self, texts): + """ + This will pre-process texts for detection + Args: + texts: list of message strings + """ + self._reset_state() + for text in texts: + text = text.lower() + text = text.decode('utf-8') if isinstance(text, bytes) else text + self.__texts.append(text) + self.__processed_texts.append(u' ' + text + u' ') + + @staticmethod + def _get_substring_from_processed_text(text, matched_tokens): + """ + Get part of original text that was detected as some entity value. + + This method was written to tackle cases when original text contains special characters which are dropped + during tokenization + + Args: + matched_tokens (list): list of tokens (usually tokens from fuzzy match results from ES) + to find as a contiguous substring in the processed sentence considering the effects + of tokenizer + text (string or unicode): sentence from self.processed_text from where indices of given token will be + given + + Returns: + str or unicode: part of original text that corresponds to given tokens + + E.g. + self.processed_text = u'i want to order 1 pc hot & crispy' + tokens = [u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'] + indices = [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) + + In: matched_tokens = [u'1', u'pc', u'hot', u'crispy'] + Out: 1 pc hot & crispy + + Notice that & is dropped during tokenization but when finding original text, + we recover it from processed text + """ + + def _get_tokens_and_indices(txt): + """ + Args: + txt (str or unicode): text to get tokens from and indicies of those tokens in the given text + + Returns: + tuple: + list: containing tokens, direct results from tokenizer.tokenize + list: containing (int, int) indicating start and end position of ith token (of first list) + in given text + + E.g. 
+ In: text = u'i want to order 1 pc hot & crispy' + Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'], + [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) + + """ + txt = txt.rstrip() + ' __eos__' + processed_text_tokens = TOKENIZER.tokenize(txt) + processed_text_tokens_indices = [] + + offset = 0 + for token in processed_text_tokens: + st = txt.index(token) + en = st + len(token) + + # Small block to handle tricky cases like '(A B) C' + # It extends the previous token's end boundary if there are special characters except whitespace + # towards the end of previous token + prefix = txt[:en] + prefix_tokens = whitespace_tokenizer.tokenize(prefix) + if prefix and len(prefix_tokens) > 1 and prefix_tokens[0]: + if processed_text_tokens_indices: + s, e = processed_text_tokens_indices.pop() + e += len(prefix_tokens[0]) + processed_text_tokens_indices.append((s, e)) + + txt = txt[en:] + processed_text_tokens_indices.append((offset + st, offset + en)) + offset += en + + # remove eos parts + processed_text_tokens.pop() + processed_text_tokens_indices.pop() + + return processed_text_tokens, processed_text_tokens_indices + + try: + n = len(matched_tokens) + tokens, indices = _get_tokens_and_indices(text) + for i in range(len(tokens) - n + 1): + if tokens[i:i + n] == matched_tokens: + start = indices[i][0] + end = indices[i + n - 1][1] + return text[start:end] + except (ValueError, IndexError): + ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, text)) + + return u' '.join(matched_tokens) + + def _process_es_result(self, entity_result, entity_list, text, + processed_text): + """ + Process ElasticSearch results which will contain list of dictionary where for + each item key will be variant and value will be entity value this will be + processed to get the original text which has been identified and will + return the results dictionary for each entity detected + + Args: + entity_result: ES result for entity + entity_list: List of entity for which ES query ran + text: original text message + processed_text: processed text on which detection ran + + Returns: + result_dict: dictionary with detected text and original text for + each entity + + """ + result_dict = {} + + for each_key in entity_list: + original_final_list = [] + value_final_list = [] + variants_to_values = collections.OrderedDict() + original_final_list_ = [] + value_final_list_ = [] + _processed_text = processed_text + + _variants_to_values = entity_result.get(each_key, []) + + if not _variants_to_values: + result_dict[each_key] = ([], []) + continue + + for variant, value in iteritems(_variants_to_values): + variant = variant.lower() + if isinstance(variant, bytes): + variant = variant.decode('utf-8') + + variants_to_values[variant] = value + variants_list = list(variants_to_values.keys()) + + exact_matches, fuzzy_variants = [], [] + + for variant in variants_list: + if u' '.join(TOKENIZER.tokenize(variant)) in text: + exact_matches.append(variant) + else: + fuzzy_variants.append(variant) + + exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + + variants_list = exact_matches + fuzzy_variants + for variant in variants_list: + + original_text = self._get_entity_substring_from_text(_processed_text, + variant, each_key) + if original_text: + value_final_list.append(variants_to_values[variant]) + original_final_list.append(original_text) + boundary_punct_pattern = 
re.compile( + r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) + original_text_ = boundary_punct_pattern.sub("", original_text) + + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) + tag = '__' + each_key + '__' + _processed_text = _pattern.sub(tag, _processed_text) + + value_final_list_.append(value_final_list) + original_final_list_.append(original_final_list) + + result_dict[each_key] = (value_final_list_, original_final_list_) + + return result_dict + + def _get_single_text_detection_with_variants(self, message): + """ + This function will normalise the message by breaking it into trigrams, + bigrams and unigrams. + + The generated ngrams will be used to create query to retrieve search results from datastore. + + These results will contain list of dictionary where for each item key will be variant and + value will be entity value this will be further processed to get the original text which has + been identified and will return the results + + Returns: + list of dict: list of dict for each message with key as entity name + containing the detected text entities and original message. + """ + + entities_dict = self.entities_dict + es_entity_list = [] + structured_value_entities_list = [] + text_value_entities_list = [] + texts = [] + + for each_entity, value in entities_dict.items(): + structured_value = value.get('structured_value') + + if structured_value: + # add entity list and text for each structured entity + # for ES query + es_entity_list.append([each_entity]) + structured_value_entities_list.append(each_entity) + texts.append(structured_value) + else: + text_value_entities_list.append(each_entity) + + if text_value_entities_list: + # add entity list and text for all other textual + # entity for ES query + es_entity_list.append(text_value_entities_list) + texts.append(message) + + # pre-process text + self._process_text(texts) + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for + processed_text in self.__processed_texts] + + # fetch ES datastore search result + es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, + texts=texts, + fuzziness_threshold=self._es_fuzziness, + search_language_script=self._target_language_script + ) + + final_list = [] + result_dict = {} + + for index, entity_result in enumerate(es_results): + processed_text = self.__processed_texts[index] + text = texts[index] + entity_list = es_entity_list[index] + result_dict.update(self._process_es_result(entity_result=entity_result, + entity_list=entity_list, + text=text, processed_text=processed_text)) + + final_list.append(result_dict) + + return final_list + + def _get_bulk_text_detection_with_variants(self, messages): + """ + This function will normalise the message by breaking it into trigrams, bigrams and unigrams. + The generated ngrams will be used to create query to retrieve search results from datastore. + These results will contain list of dictionary where for each item key will be variant and + value will be entity value this will be further processed to get the original text which has + been identified and will return the results + + Args: + messages (list of str): list of message for which detection needs to be perform + + Returns: + tuple: + list of lists: list of dict for each message with key as entity name + containing the detected text entities and original message. 
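+
+        E.g. (illustrative sketch; assumes the datastore has 'mumbai' indexed as a
+        variant of the value 'Mumbai' for the entity 'city'):
+
+            messages = [u'I want to go to Mumbai']
+            # each dict maps entity -> (values, original texts), both as list of lists
+            # returns [{'city': ([[u'Mumbai']], [[u'mumbai']])}]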
+ """ + + self._process_text(messages) + + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for + processed_text in self.__processed_texts] + + entity_list = list(self.entities_dict) + + # entity list for ES search should be list of entities + # for all list of texts + es_entity_list = [entity_list] + es_texts = [texts] + + # fetch ES datastore search result + es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, + texts=es_texts, + fuzziness_threshold=self._es_fuzziness, + search_language_script=self._target_language_script + ) + + final_list = [] + + for index, entity_result in enumerate(es_results): + processed_text = self.__processed_texts[index] + text = texts[index] + result_list = self._process_es_result(entity_result=entity_result, + entity_list=entity_list, + text=text, processed_text=processed_text) + final_list.append(result_list) + + return final_list + + def _get_entity_substring_from_text(self, text, variant, entity_name): + """ + Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance + and return the closest substring in the text that matches the variant. + For each entity fuziness and min_token_size_for_fuzziness is used from the entity details. + Args: + variant(str or unicode): string, ngram of variant to fuzzy detect in the text using + Levenshtein distance + text(str or unicode): sentence from self.processed on which detection is being done + entity_name (str): name of the entity to get fuzziness and min_token_lenght value + Returns: + str or unicode or None: part of the given text that was detected as entity given the variant, + None otherwise + Example: + >>> text_detector = TextDetector(entity_dict={'city':{}) + >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower() + >>> text_detector._get_entity_substring_from_text(variant='chennai') + 'chennai' + >>> text_detector._get_entity_substring_from_text(variant='delhi') + 'delehi' + """ + variant_tokens = TOKENIZER.tokenize(variant) + text_tokens = TOKENIZER.tokenize(text) + original_text_tokens = [] + variant_token_i = 0 + for text_token in text_tokens: + variant_token = variant_tokens[variant_token_i] + same = variant_token == text_token + + # get fuzziness and min_token_size_for_fuziness value from entity dict + entity_dict = self.entities_dict.get(entity_name, {}) + + # get fuzziness from entity if not set default + fuzziness = entity_dict.get('fuzziness') or self._fuzziness + + self.set_fuzziness_low_high_threshold(fuzziness) + + min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness') + + if not min_token_size_for_fuzziness: + min_token_size_for_fuzziness = self._min_token_size_for_fuzziness + + ft = self._get_fuzziness_threshold_for_token(token=text_token) + + # set substitution cost to one + if same or (len(text_token) > min_token_size_for_fuzziness + and edit_distance(string1=variant_token, + string2=text_token, + substitution_cost=1, + max_distance=ft + 1) <= ft): + original_text_tokens.append(text_token) + variant_token_i += 1 + if variant_token_i == len(variant_tokens): + return self._get_substring_from_processed_text(text, original_text_tokens) + else: + original_text_tokens = [] + variant_token_i = 0 + return None + + @staticmethod + def _add_verification_source(values, verification_source_dict): + text_entity_verified_values = [] + for text_entity_value in values: + text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value} + text_entity_dict.update(verification_source_dict) + 
text_entity_verified_values.append(text_entity_dict) + return text_entity_verified_values + + def combine_results(self, values, original_texts, predetected_values): + """ + This method is used to combine the results provided by the datastore search and the + crf_model if trained. + Args: + values (list): List of values detected by datastore + original_texts (list): List of original texts present in the texts for which value shave been + detected + predetected_values (list): Entities detected by the models like crf etc. + Returns: + combined_values (list): List of dicts each dict consisting of the entity value and additionally + the keys for the datastore and crf model detection + combined_original_texts (list): List of original texts detected by the datastore and the crf model. + """ + unprocessed_crf_original_texts = [] + + combined_values = self._add_verification_source( + values=values, verification_source_dict={DATASTORE_VERIFIED: True, MODEL_VERIFIED: False} + ) + combined_original_texts = original_texts + + for i in range(len(predetected_values)): + match = False + for j in range(len(original_texts)): + if predetected_values[i].lower() == original_texts[j]: + combined_values[j][MODEL_VERIFIED] = True + match = True + break + elif re.findall(r'\b%s\b' % re.escape(predetected_values[i]), original_texts[j]): + # If predetected value is a substring of some value detected by datastore, + # skip it from output + match = True + break + if not match: + unprocessed_crf_original_texts.append(predetected_values[i]) + + unprocessed_crf_original_texts_verified = self._add_verification_source( + values=unprocessed_crf_original_texts, + verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True} + ) + + combined_values.extend(unprocessed_crf_original_texts_verified) + combined_original_texts.extend(unprocessed_crf_original_texts) + + return combined_values, combined_original_texts + + def detect(self, message=None, **kwargs): + """ + This method will detect all textual entities over the single message. + + If structured value is present for any given entity it will be preferred + over message and a new ES query is added with text as structured value. 
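+
+        E.g. (illustrative sketch): with entity_dict = {'brand': {'structured_value': 'Nike'}},
+        the datastore query for 'brand' runs on 'Nike' instead of on `message`; the detection
+        method then comes out as FROM_STRUCTURE_VALUE_VERIFIED if 'Nike' is found in the
+        datastore, FROM_STRUCTURE_VALUE_NOT_VERIFIED otherwise.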
+ + After detection it will combine the result and outputs list of dictionary + for all the entities detected over message + + Args: + message (str): message on which textual entities needs to be detected + **kwargs: other keyword arguments if required + + Returns: + List of dict of all the entities with detected values of textual entites + + Examples: + + entity_dict = { + 'city': {'structured_value': None, 'fallback_value': None}, + 'restaurant': {'structured_value': None, 'fallback_value': None}, + 'brand' : {'structured_value': 'Nike', 'fallback_value': None}, + } + + text_detection = TextDetector(entity_dict) + + text_detection.detect('Buy ticket to Chennai from Mumbai') + + output: + [ { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'Mumbai', + 'language': 'en'}, + + {'entity_value': {'value': 'Chennai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'Chennai', + 'language': 'en'} + ], + 'restaurant': [], + 'brand': [ + {'entity_value': {'value': 'Nike', + 'datastore_verified': True, + 'model_verified': False}, + "detection": "structure_value_verified", + 'original_text': 'Nike', + 'language': 'en'}] + }] + """ + + res_list = self._get_single_text_detection_with_variants(message) + data_list = [] + + for index, res in enumerate(res_list): + entities = {} + + for entity, value in res.items(): + entities[entity] = [] + values, texts = [], [] + text_entity_values, original_texts = value + entity_dict = self.entities_dict.get(entity, {}) + + # get structured value from entity dict + structured_value = entity_dict.get('structured_value') + + # get predetected value list from entity dict + predetected_values = entity_dict.get('predetected_values') + + # get predetected value for message from index + if predetected_values: + _predetected_value = predetected_values[index] + else: + _predetected_value = [] + + # get fallback value from entity dict + fallback_value = entity_dict.get('fallback_value') + + if text_entity_values and original_texts: + values, texts = text_entity_values[0], original_texts[0] + + entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, + predetected_values=_predetected_value) + + if structured_value: + if entity_list: + value, method, original_text = entity_list, FROM_STRUCTURE_VALUE_VERIFIED, original_text_list + else: + value, method, original_text = [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED, \ + [structured_value] + elif entity_list: + value, method, original_text = entity_list, FROM_MESSAGE, original_text_list + elif fallback_value: + value, method, original_text = [fallback_value], FROM_FALLBACK_VALUE, [fallback_value] + else: + continue + + out = self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, + detection_method=method, + detection_language=self._target_language_script) + + entities[entity] = out + data_list.append(entities) + + return data_list + + def detect_bulk(self, messages=None, **kwargs): + """ + This method will detect all textual entities over the multiple message. 
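+
+        E.g. (illustrative): detect_bulk(messages=['Buy ticket to Chennai', 'Book a table at Dominoes'])
+        runs every entity in `entities_dict` against each message and returns one result
+        dict per message, in the same order as the input messages.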
+ After detection it will combine the result and outputs list of dictionary + for all the entities detected over message + + Args: + + messages (list of str): list of message for which detection needs to be perform + **kwargs: other keyword arguments if required + + Returns: + List of dict of all the entities with detected values of textual entites + + + example: + + entity_dict = { + 'city': {'fallback_value': 'Mumbai', 'use_fallback': False}, + 'restaurant': {'fallback_value': None, 'use_fallback': False} + } + + text_detection = TextDetector(entity_dict) + text_detection.detect(['Buy ticket to Chennai from Mumbai', + 'I want to eat at dominoes']) + + output: + [ { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'Mumbai', + 'language': 'en'}, + {'entity_value': {'value': 'Chennai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'Chennai', + 'language': 'en'} + ], + 'restaurant': []}, + { + 'city': [], + 'restaurant': [ + {'entity_value': {'value': 'Domminoe's Pizza', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'dominoes', + 'language': 'en'} + ] + ] + """ + + res_list = self._get_bulk_text_detection_with_variants(messages) + data_list = [] + + for index, res in enumerate(res_list): + entities = {} + for entity, value in res.items(): + entities[entity] = [] + values, texts = [], [] + # get predetected value from entity dict + entity_dict = self.entities_dict.get(entity, {}) + predetected_values = entity_dict.get('predetected_values') + + # get predetected value for message from index + if predetected_values: + _predetected_value = predetected_values[index] + else: + _predetected_value = [] + + # get fallback value from entity dict + fallback_value = entity_dict.get('fallback_value') + + text_entity_values, original_texts = value + if text_entity_values and original_texts: + values, texts = text_entity_values[0], original_texts[0] + + entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, + predetected_values=_predetected_value) + + if entity_list: + value, method, original_text = entity_list, FROM_MESSAGE, original_text_list + elif fallback_value: + value, method, original_text = [fallback_value], FROM_FALLBACK_VALUE, [fallback_value] + else: + continue + + out = self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, + detection_method=method, + detection_language=self._target_language_script) + + entities[entity] = out + data_list.append(entities) + + return data_list + + @staticmethod + def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, + detection_method_list=None, detection_language=ENGLISH_LANG): + """ + Format detected entity values for bulk detection + + Args: + + entity_values_list (list of lists): containing list of entity values which are identified from given + detection logic + original_texts_list (list of lists): containing list original values or actual values from + messages which are identified + detection_method (str, optional): how the entity was detected + i.e. whether from message, structured_value + or fallback, verified from model or not. 
+ defaults to None + detection_method_list(list, optional): list containing how each entity was detected in the entity_value + list.If provided, this argument will be used over detection method + defaults to None + detection_language(str): ISO 639 code for language in which entity is detected + + Returns: + + list of lists of dict: list of lists containing dictionaries, each containing entity_value, + original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + Example Output: + [ + [ + { + "entity_value": entity_value_1, + "detection": detection_method, + "original_text": original_text_1 + }, + { + "entity_value": entity_value_2, + "detection": detection_method, + "original_text": original_text_2 + } + + ], + [ + { + "entity_value": entity_value, + "detection": detection_method, + "original_text": original_text + } + ] + ] + """ + if detection_method_list is None: + detection_method_list = [] + if entity_value_list is None: + entity_value_list = [] + + entity_list = [] + for i, entity_value in enumerate(entity_value_list): + if type(entity_value) in [str, six.text_type]: + entity_value = { + ENTITY_VALUE_DICT_KEY: entity_value + } + method = detection_method_list[i] if detection_method_list else detection_method + entity_list.append( + { + ENTITY_VALUE: entity_value, + DETECTION_METHOD: method, + ORIGINAL_TEXT: original_text_list[i], + DETECTION_LANGUAGE: detection_language + } + ) + return entity_list diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py new file mode 100644 index 000000000..be10aae7b --- /dev/null +++ b/ner_v2/detectors/textual/utils.py @@ -0,0 +1,285 @@ +from __future__ import absolute_import + +import json +import six + +from chatbot_ner.config import ner_logger +from language_utilities.constant import ENGLISH_LANG + +from ner_constants import (DATASTORE_VERIFIED, MODEL_VERIFIED, + FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, + DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) +from ner_v2.detectors.textual.text_detection import TextDetector + + +def verify_text_request(request): + """ + Check the request object + 1. If proper message or entity is present in required + format. + + 2. 
If length of message or entity is in allowed range + + Args: + request: API request object + + Returns: + Raises KeyError if message or entities are not present + Raises TypeError if message is not list or entities is not dict type + Else Return none + """ + + request_data = json.loads(request.body) + messages = request_data.get("messages") + entities = request_data.get("entities") + + if not messages: + ner_logger.exception("messages param is not passed") + raise KeyError("key messages is required") + + if not entities: + ner_logger.exception("Entities param is not passed") + raise KeyError("Entities dict is required") + + if not isinstance(messages, list): + ner_logger.exception("messages param is not in correct format") + raise TypeError("messages should be in format of list of string") + + if not isinstance(entities, dict): + ner_logger.exception("Entities param is not in correct format") + raise TypeError("Entities should be dict of entity details") + + if len(messages) > 100: + ner_logger.exception("Maximum number of message can be 100 for " + "bulk detection") + raise ValueError("Maximum number of message can be 100 for " + "bulk detection") + + if len(list(entities)) > 100: + ner_logger.exception("Maximum number of entities can be 100 for " + " detection") + raise ValueError("Maximum number of entities can be 100 for " + "bulk detection") + + +def get_detection(message, entity_dict, bot_message=None, + language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, + **kwargs): + """ + Get text detection for given message on given entities dict using + TextDetector module. + + If the message is string type call TextDetector.detect() mwthod, if it is list + call TextDetector.detect_bulk() method. Else, it wol raise an error. + Args: + message: message to detect text on + entity_dict: entity details dict + structured_value: structured value + bot_message: bot message + language: langugae for text detection + target_language_script: target language for detection default ENGLISH + **kwargs: other kwargs + + Returns: + + detected entity output + """ + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + if isinstance(message, six.string_types): + entity_output = text_detector.detect(message=message, + bot_message=bot_message) + elif isinstance(message, (list, tuple)): + entity_output = text_detector.detect_bulk(messages=message) + else: + raise TypeError('`message` argument must be either of type `str`, `unicode`, `list` or `tuple`.') + + return entity_output + + +def get_text_entity_detection_data(request): + """ + Get details of message and entities from request and call get_detection internally + to get the results. + + Messages to detect text can be of two format: + + 1) Single entry in the list of message, for this we use `text_detector.detect` method. + Also for this case we check if `ignore_message` flag is present. + + 2) For multiples message, underlying code will call `text_detector.detect_bulk` method. + In this case we ignore flag for ignore_message for all the entities. 
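+
+    E.g. (illustrative; the entity details shown here are hypothetical):
+
+        entities = {'city': {'structured_value': None, 'fallback_value': None,
+                             'predetected_values': None, 'fuzziness': None,
+                             'min_token_len_fuzziness': None, 'ignore_message': False}}
+        get_detection('I want to go to Mumbai', entities)           # str -> TextDetector.detect()
+        get_detection(['book a flight', 'find a hotel'], entities)  # list -> TextDetector.detect_bulk()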
+ + Args: + request: request object + Returns: + output data list for all the message + Examples: + Request Object: + { + "messages": ["I want to go to Jabalpur"], + "bot_message": null, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": "Delhi", + "fallback_value": null, + "predetected_values": ["Mumbai"], + "fuzziness": null, + "min_token_len_fuzziness": null, + "ignore_message": false + }, + "restaurant": { + "structured_value": null, + "fallback_value": null, + "predetected_values": null, + "fuzziness": null, + "min_token_len_fuzziness": null, + "ignore_message": false + } + } + } + output response: + [ + { + "entities": { + "restaurant": [], + "city": [ + { + "entity_value": { + "value": "New Delhi", + "datastore_verified": true, + "model_verified": false + }, + "detection": "structure_value_verified", + "original_text": "delhi", + "language": "en" + }, + { + "entity_value": { + "value": "Mumbai", + "datastore_verified": false, + "model_verified": true + }, + "detection": "structure_value_verified", + "original_text": "Mumbai", + "language": "en" + } + ] + """ + request_data = json.loads(request.body) + messages = request_data.get("messages", []) + bot_message = request_data.get("bot_message") + entities = request_data.get("entities", {}) + target_language_script = request_data.get('language_script') or ENGLISH_LANG + source_language = request_data.get('source_language') or ENGLISH_LANG + + data = [] + + message_len = len(messages) + + if message_len == 1: + + # get first message + message_str = messages[0] + + fallback_value_entities = {} + text_value_entities = {} + + data.append({"entities": {}, "language": source_language}) + + for each_entity, value in entities.items(): + ignore_message = value.get('ignore_message', False) + + if ignore_message: + fallback_value_entities[each_entity] = value + else: + text_value_entities[each_entity] = value + + # get detection for text entities which has ignore_message flag + if fallback_value_entities: + output = get_output_for_fallback_entities(fallback_value_entities, source_language) + data[0]["entities"].update(output) + + # get detection for text entities + if text_value_entities: + output = get_detection(message=message_str, entity_dict=text_value_entities, + structured_value=None, bot_message=bot_message, + language_script=source_language, + target_language_script=target_language_script) + data[0]["entities"].update(output[0]) + + # check if more than one message + elif len(messages) > 1: + text_detection_result = get_detection(message=messages, entity_dict=entities, + structured_value=None, bot_message=bot_message) + + data = [{"entities": x, "language": source_language} for x in text_detection_result] + + else: + ner_logger.debug("No valid message provided") + raise KeyError("Message is required") + + return data + + +def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): + """ + Generate default detection output for default fallback entities. + It will check if fallback_value is present if not it will return + empty list for that entity. 
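+
+    E.g. (illustrative): `get_text_entity_detection_data` calls this only for entities that
+    set `ignore_message=True`:
+
+        get_output_for_fallback_entities({'city': {'fallback_value': 'Mumbai', 'ignore_message': True}})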
+ + Args: + entities_dict: dict of entities details + language: language to run + + Returns: + TextDetection output (list of dict) for default fallback values + + Examples: + Input: + { + 'city': {'fallback_value': 'Mumbai', 'ignore_message': True}, + 'restaurant': {'fallback_value': None, 'ignore_message': True} + } + + Output: + + { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', + 'language': 'en'} + ], + 'restaurant': [] + } + + """ + output = {} + if not entities_dict: + return output + + for entity, value in entities_dict.items(): + fallback_value = value.get("fallback_value") + + if not fallback_value: + output[entity] = [] + + else: + output[entity] = [ + { + ENTITY_VALUE: { + ENTITY_VALUE_DICT_KEY: fallback_value, + DATASTORE_VERIFIED: False, + MODEL_VERIFIED: False + }, + DETECTION_METHOD: FROM_FALLBACK_VALUE, + ORIGINAL_TEXT: fallback_value, + DETECTION_LANGUAGE: language + } + ] + return output diff --git a/postman_tests/data/entities/date.json b/postman_tests/data/entities/date.json index a127f6b0d..6f504b49e 100644 --- a/postman_tests/data/entities/date.json +++ b/postman_tests/data/entities/date.json @@ -1,46 +1,46 @@ [ { "input": { - "message": "Set me a reminder for 23 December", + "message": "Set me a reminder for 23 December 2030", "entity_name": "date" }, "expected": [ { - "original_text": "23 december", + "original_text": "23 december 2030", "type": "date", "dd": 23, "mm": 12, - "yy": 2020 + "yy": 2030 } ] }, { "input": { - "message": "Set me a reminder for 2 May", + "message": "Set me a reminder for 2 May 2030", "entity_name": "date" }, "expected": [ { - "original_text": "2 may", + "original_text": "2 may 2030", "type": "date", "dd": 2, "mm": 5, - "yy": 2020 + "yy": 2030 } ] }, { "input": { - "message": "Set me a reminder for 3 June", + "message": "Set me a reminder for 3 June 2030", "entity_name": "date" }, "expected": [ { - "original_text": "3 june", + "original_text": "3 june 2030", "type": "date", "dd": 3, "mm": 6, - "yy": 2020 + "yy": 2030 } ] } diff --git a/postman_tests/data/entities/dateV2.json b/postman_tests/data/entities/dateV2.json index 068dfcd94..4c03823dc 100644 --- a/postman_tests/data/entities/dateV2.json +++ b/postman_tests/data/entities/dateV2.json @@ -2,240 +2,240 @@ { "expected": [ { - "original_text": "3/3/1992", - "end_range": false, - "from": false, - "mm": 3, - "dd": 3, - "yy": 1992, - "to": false, - "start_range": false, + "original_text": "3/3/1992", + "end_range": false, + "from": false, + "mm": 3, + "dd": 3, + "yy": 1992, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "my anniversary was on 3/3/1992", + "message": "my anniversary was on 3/3/1992", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "3rd aug 20", - "end_range": false, - "from": false, - "mm": 8, - "dd": 3, - "yy": 2020, - "to": false, - "start_range": false, + "original_text": "3rd aug 2027", + "end_range": false, + "from": false, + "mm": 8, + "dd": 3, + "yy": 2027, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "Coronoa Virus will end on 3rd Aug 20", + "message": "Coronoa Virus will end on 3rd Aug 2027", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12.03.2016", - "end_range": false, - "from": false, - "mm": 3, - "dd": 12, - "yy": 2016, - "to": false, - "start_range": false, + "original_text": "12.03.2016", + "end_range": 
false, + "from": false, + "mm": 3, + "dd": 12, + "yy": 2016, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12.03.2016 is my nephew's birthday", + "message": "12.03.2016 is my nephew's birthday", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12.4.2016", - "end_range": false, - "from": false, - "mm": 4, - "dd": 12, - "yy": 2016, - "to": false, - "start_range": false, + "original_text": "12.4.2016", + "end_range": false, + "from": false, + "mm": 4, + "dd": 12, + "yy": 2016, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12.4.2016 doesnt exist for me", + "message": "12.4.2016 doesnt exist for me", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "3.3.12", - "end_range": false, - "from": false, - "mm": 3, - "dd": 3, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "3.3.12", + "end_range": false, + "from": false, + "mm": 3, + "dd": 3, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "A date i wont forget is 3.3.12", + "message": "A date i wont forget is 3.3.12", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "3/2/17", - "end_range": false, - "from": false, - "mm": 2, - "dd": 3, - "yy": 2017, - "to": false, - "start_range": false, + "original_text": "3/2/17", + "end_range": false, + "from": false, + "mm": 2, + "dd": 3, + "yy": 2017, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "3/2/17 changed my life forever", + "message": "3/2/17 changed my life forever", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12/12/12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 12, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "12/12/12", + "end_range": false, + "from": false, + "mm": 12, + "dd": 12, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12/12/12 is a strange date isnt it ?", + "message": "12/12/12 is a strange date isnt it ?", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12-30-12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 30, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "12-30-12", + "end_range": false, + "from": false, + "mm": 12, + "dd": 30, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "we got married on 12-30-12", + "message": "we got married on 12-30-12", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12/12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 12, - "yy": 2020, - "to": false, - "start_range": false, + "original_text": "12/12/1943", + "end_range": false, + "from": false, + "mm": 12, + "dd": 12, + "yy": 1943, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12/12 is a bad day in american history", + "message": "12/12/1943 is a bad day in american history", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "october 2nd", - "end_range": false, - "from": false, - "mm": 10, - "dd": 2, - "yy": 2020, - "to": false, - "start_range": false, + "original_text": "october 2nd 1937", + "end_range": false, + "from": false, + "mm": 10, + "dd": 2, + "yy": 1937, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "Gandhi Jayanti is on October 
2nd", + "message": "Gandhi Jayanti is on October 2nd 1937", "entity_name": "date" } }, { "expected": [ { - "original_text": "2019 may 21st", - "end_range": false, - "from": false, - "mm": 5, - "dd": 21, - "yy": 2019, - "to": false, - "start_range": false, + "original_text": "2019 may 21st", + "end_range": false, + "from": false, + "mm": 5, + "dd": 21, + "yy": 2019, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "2019 May 21st", + "message": "2019 May 21st", "entity_name": "date" } }, { "expected": [ { - "original_text": "2/3/2020", - "end_range": false, - "from": false, - "mm": 3, - "dd": 2, - "yy": 2020, - "to": false, - "start_range": true, + "original_text": "2/3/2020", + "end_range": false, + "from": false, + "mm": 3, + "dd": 2, + "yy": 2020, + "to": false, + "start_range": true, "type": "date" - }, + }, { - "original_text": "5/6/2024", - "end_range": true, - "from": false, - "mm": 6, - "dd": 5, - "yy": 2024, - "to": false, - "start_range": false, + "original_text": "5/6/2024", + "end_range": true, + "from": false, + "mm": 6, + "dd": 5, + "yy": 2024, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "My meeting is 2/3/2020 to 5/6/2024", + "message": "My meeting is 2/3/2020 to 5/6/2024", "entity_name": "date" } } -] +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d27b3c7bd..f155161a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.16 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -Django==1.11.28 +Django==1.11.29 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11