From c8258ab126e53a67a8c086488d8513b117e81e80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2020 17:33:36 +0000 Subject: [PATCH 01/31] Bump django from 1.11.28 to 1.11.29 Bumps [django](https://github.com/django/django) from 1.11.28 to 1.11.29. - [Release notes](https://github.com/django/django/releases) - [Commits](https://github.com/django/django/compare/1.11.28...1.11.29) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d27b3c7bd..f155161a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.16 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -Django==1.11.28 +Django==1.11.29 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 From 6c8f35311aaad7c64202665eb9c471852e3edd45 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 19:24:08 +0530 Subject: [PATCH 02/31] Update Elasticsearch mappings for entity indices Some mappings were found to be manually put outside of code with creates a problem with new setups. --- datastore/elastic_search/create.py | 46 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index f06d7c77a..2718bba13 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -144,13 +144,29 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): """ mapping_body = { doc_type: { - 'properties': { - 'variants': { - 'type': 'text', - 'analyzer': 'my_analyzer', - 'norms': {'enabled': False}, # Needed if we want to give longer variants higher scores - } - } + 'language_script': { + 'type': 'text', + }, + 'value': { + 'type': 'text', + }, + 'variants': { + 'type': 'text', + 'analyzer': 'my_analyzer', + 'norms': { + 'enabled': False # Needed if we want to give longer variants higher scores + }, + }, + # other removed/unused fields, kept only for backward compatibility + 'dict_type': { + 'type': 'text', + }, + 'entity_data': { + 'type': 'text', + }, + 'source_language': { + 'type': 'text', + }, } } @@ -184,17 +200,17 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): mapping_body = { doc_type: { 'properties': { - "entity_data": { - "type": "text" + 'entity_data': { + 'type': 'text' }, - "sentence": { - "enabled": "false" + 'sentence': { + 'enabled': 'false' }, - "entities": { - "enabled": "false" + 'entities': { + 'enabled': 'false' }, - "language_script": { - "type": "text" + 'language_script': { + 'type': 'text' } } } From 53c2db7b7527f4bb760c67f988c9ec59fcb1264a Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 19:27:05 +0530 Subject: [PATCH 03/31] fix mapping - revert deletion of properties key nest --- datastore/elastic_search/create.py | 46 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index 2718bba13..84aaaf772 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -144,29 +144,31 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): """ mapping_body = { doc_type: { - 'language_script': { - 'type': 'text', - }, - 'value': { - 'type': 'text', - }, - 'variants': { - 'type': 'text', - 'analyzer': 'my_analyzer', - 'norms': { - 'enabled': False # Needed 
if we want to give longer variants higher scores + 'properties': { + 'language_script': { + 'type': 'text', + }, + 'value': { + 'type': 'text', + }, + 'variants': { + 'type': 'text', + 'analyzer': 'my_analyzer', + 'norms': { + 'enabled': False # Needed if we want to give longer variants higher scores + }, }, - }, - # other removed/unused fields, kept only for backward compatibility - 'dict_type': { - 'type': 'text', - }, - 'entity_data': { - 'type': 'text', - }, - 'source_language': { - 'type': 'text', - }, + # other removed/unused fields, kept only for backward compatibility + 'dict_type': { + 'type': 'text', + }, + 'entity_data': { + 'type': 'text', + }, + 'source_language': { + 'type': 'text', + } + } } } From 2e90bd74447f7181899351f466c630624d55a2da Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 19:28:16 +0530 Subject: [PATCH 04/31] Use native bool types instead of str in mapping body --- datastore/elastic_search/create.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index 84aaaf772..45fb988ed 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -206,10 +206,10 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): 'type': 'text' }, 'sentence': { - 'enabled': 'false' + 'enabled': False }, 'entities': { - 'enabled': 'false' + 'enabled': False }, 'language_script': { 'type': 'text' From 16a9722b1775a8e4b82eddbb9f9804c3ad84bdc9 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 19:50:31 +0530 Subject: [PATCH 05/31] Add length limits to fields entity index mapping --- datastore/elastic_search/create.py | 34 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index 45fb988ed..b261bf088 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -4,7 +4,7 @@ def exists(connection, index_name): - """ + ''' Checks if index_name exists Args: @@ -13,12 +13,12 @@ def exists(connection, index_name): Returns: boolean, True if index exists , False otherwise - """ + ''' return connection.indices.exists(index_name) def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **kwargs): - """ + ''' Deletes the index named index_name Args: @@ -33,7 +33,7 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k wait_for_active_shards: Set the number of active shards to wait for before the operation returns. Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.delete - """ + ''' if not exists(connection, index_name): if err_if_does_not_exist: raise Exception('Failed to delete index {}. 
It does not exist!'.format(index_name)) @@ -45,7 +45,7 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if_exists=True, **kwargs): - """ + ''' Creates an Elasticsearch index needed for similarity based searching Args: connection: Elasticsearch client object @@ -67,7 +67,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if (missing or closed) Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - """ + ''' if exists(connection=connection, index_name=index_name): if err_if_exists: raise Exception('Failed to create index {}. it already exists. Please check and delete it using ' @@ -118,7 +118,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if def create_entity_index(connection, index_name, doc_type, logger, **kwargs): - """ + ''' Creates an mapping specific to entity storage in elasticsearch and makes a call to create_index to create the index with the given mapping body Args: @@ -141,32 +141,36 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - """ + ''' mapping_body = { doc_type: { 'properties': { 'language_script': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, }, 'value': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, }, 'variants': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, 'analyzer': 'my_analyzer', - 'norms': { - 'enabled': False # Needed if we want to give longer variants higher scores - }, + 'norms': {'enabled': False}, # Needed if we want to give longer variants higher scores }, # other removed/unused fields, kept only for backward compatibility 'dict_type': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, }, 'entity_data': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, }, 'source_language': { 'type': 'text', + 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}, } } } @@ -176,7 +180,7 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): def create_crf_index(connection, index_name, doc_type, logger, **kwargs): - """ + ''' This method is used to create an index with mapping suited for story training_data Args: connection: Elasticsearch client object @@ -198,7 +202,7 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - """ + ''' mapping_body = { doc_type: { 'properties': { @@ -222,7 +226,7 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): def create_alias(connection, index_list, alias_name, logger, **kwargs): - """ + ''' This method is used to create alias for list of indices Args: connection: @@ -232,7 +236,7 @@ def create_alias(connection, index_list, alias_name, logger, **kwargs): **kwargs: 
https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html - """ + ''' logger.debug('Alias creation %s started %s' % alias_name) connection.indices.put_alias(index=index_list, name=alias_name, **kwargs) logger.debug('Alias %s now points to indices %s' % (alias_name, str(index_list))) From 2abb22c648e9be712e3f59f379b7d1f666674be8 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 19:54:57 +0530 Subject: [PATCH 06/31] Fix broken index deletion code --- config.example | 2 +- datastore/datastore.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/config.example b/config.example index 7fdba8a19..9ebf284c0 100644 --- a/config.example +++ b/config.example @@ -34,8 +34,8 @@ ES_ALIAS=entity_data ES_INDEX_1=entity_data_v1 ES_INDEX_2= ES_DOC_TYPE=data_dictionary -ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary ELASTICSEARCH_CRF_DATA_INDEX_NAME=entity_examples_data +ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary ES_BULK_MSG_SIZE=1000 ES_SEARCH_SIZE=10000 diff --git a/datastore/datastore.py b/datastore/datastore.py index fae8ad90e..2429ce742 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -208,10 +208,11 @@ def delete(self, err_if_does_not_exist=True, **kwargs): self._connect() if self._engine == ELASTICSEARCH: - for index_key in [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]: - if self._connection_settings.get(index_key): + for index_name_key in [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]: + if self._connection_settings.get(index_name_key): + index_name = self._connection_settings.get(index_name_key) elastic_search.create.delete_index(connection=self._client_or_connection, - index_name=self._store_name, + index_name=index_name, logger=ner_logger, err_if_does_not_exist=err_if_does_not_exist, **kwargs) From 19d63f6be4f1556242531675117cdf62bedffd02 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 20:22:41 +0530 Subject: [PATCH 07/31] Update alias creation and deletion code and add type hints to create.py --- datastore/datastore.py | 14 +++++-- datastore/elastic_search/create.py | 59 +++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 2429ce742..2125fa53e 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -151,7 +151,6 @@ def create(self, err_if_exists=True, **kwargs): if self._engine == ELASTICSEARCH: es_url = elastic_search.connect.get_es_url() - es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=None) create_map = [ # TODO: use namedtuples (True, ELASTICSEARCH_INDEX_1, ELASTICSEARCH_DOC_TYPE, self._store_name, self._check_doc_type_for_elasticsearch, elastic_search.create.create_entity_index), @@ -180,8 +179,10 @@ def create(self, err_if_exists=True, **kwargs): **kwargs ) if alias_name: - es_object.point_an_alias_to_index(es_url=es_url, alias_name=self._store_name, - index_name=index_name) + elastic_search.create.create_alias(connection=self._client_or_connection, + index_list=[index_name], + alias_name=alias_name, + logger=ner_logger) def delete(self, err_if_does_not_exist=True, **kwargs): """ @@ -208,7 +209,12 @@ def delete(self, err_if_does_not_exist=True, **kwargs): self._connect() if self._engine == ELASTICSEARCH: - for index_name_key in [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]: + delete_map = [ + (ELASTICSEARCH_INDEX_1, self._store_name), + 
(ELASTICSEARCH_INDEX_2, self._store_name), + (ELASTICSEARCH_CRF_DATA_INDEX_NAME, None), + ] + for (index_name_key, alias_name) in delete_map: if self._connection_settings.get(index_name_key): index_name = self._connection_settings.get(index_name_key) elastic_search.create.delete_index(connection=self._client_or_connection, diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index b261bf088..060519dd3 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -1,10 +1,16 @@ +import logging +from typing import List, Dict, Any + +from elasticsearch import Elasticsearch + from .utils import filter_kwargs log_prefix = 'datastore.elastic_search.create' def exists(connection, index_name): - ''' + # type: (Elasticsearch, str) -> bool + """ Checks if index_name exists Args: @@ -13,18 +19,20 @@ def exists(connection, index_name): Returns: boolean, True if index exists , False otherwise - ''' + """ return connection.indices.exists(index_name) def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **kwargs): - ''' + # type: (Elasticsearch, str, logging.Logger, bool, **Any) -> None + """ Deletes the index named index_name Args: connection: Elasticsearch client object index_name: The name of the index logger: logging object to log at debug and exception level + err_if_does_not_exist: if to raise error if index does not exist already, defaults to True kwargs: body: The configuration for the index (settings and mappings) master_timeout: Specify timeout for connection to master @@ -33,7 +41,7 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k wait_for_active_shards: Set the number of active shards to wait for before the operation returns. Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.delete - ''' + """ if not exists(connection, index_name): if err_if_does_not_exist: raise Exception('Failed to delete index {}. It does not exist!'.format(index_name)) @@ -45,7 +53,8 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if_exists=True, **kwargs): - ''' + # type: (Elasticsearch, str, str, logging.Logger, Dict[str, Any], bool, **Any) -> None + """ Creates an Elasticsearch index needed for similarity based searching Args: connection: Elasticsearch client object @@ -53,6 +62,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if doc_type: The type of the documents that will be indexed logger: logging object to log at debug and exception level mapping_body: dict, mappings to put on the index + err_if_exists: if to raise error if the index already exists, defaults to True kwargs: master_timeout: Specify timeout for connection to master timeout: Explicit operation timeout @@ -67,7 +77,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if (missing or closed) Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - ''' + """ if exists(connection=connection, index_name=index_name): if err_if_exists: raise Exception('Failed to create index {}. it already exists. 
Please check and delete it using ' @@ -118,7 +128,8 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if def create_entity_index(connection, index_name, doc_type, logger, **kwargs): - ''' + # type: (Elasticsearch, str, str, logging.Logger, **Any) -> None + """ Creates an mapping specific to entity storage in elasticsearch and makes a call to create_index to create the index with the given mapping body Args: @@ -141,7 +152,7 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - ''' + """ mapping_body = { doc_type: { 'properties': { @@ -180,7 +191,8 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs): def create_crf_index(connection, index_name, doc_type, logger, **kwargs): - ''' + # type: (Elasticsearch, str, str, logging.Logger, **Any) -> None + """ This method is used to create an index with mapping suited for story training_data Args: connection: Elasticsearch client object @@ -202,7 +214,7 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping - ''' + """ mapping_body = { doc_type: { 'properties': { @@ -226,17 +238,36 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs): def create_alias(connection, index_list, alias_name, logger, **kwargs): - ''' + # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None + """ This method is used to create alias for list of indices Args: - connection: + connection: Elasticsearch client object index_list (list): List of indices the alias has to point to alias_name (str): Name of the alias logger: logging object to log at debug and exception level **kwargs: https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html - ''' - logger.debug('Alias creation %s started %s' % alias_name) + """ + logger.debug('Putting alias %s to indices: %s' % (alias_name, str(index_list))) connection.indices.put_alias(index=index_list, name=alias_name, **kwargs) logger.debug('Alias %s now points to indices %s' % (alias_name, str(index_list))) + + +def delete_alias(connection, index_list, alias_name, logger, **kwargs): + # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None + """ + Delete alias `alias_name` from list of indices in `index_list` + Args: + connection: Elasticsearch client object + index_list (list): List of indices the alias has to point to + alias_name (str): Name of the alias + logger: logging object to log at debug and exception level + + **kwargs: + https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html + """ + logger.debug('Removing alias %s from indices: %s' % (alias_name, str(index_list))) + connection.indices.delete_alias(index=index_list, name=alias_name, **kwargs) + logger.debug('Alias %s removed from indices %s' % (alias_name, str(index_list))) From 976231f3ad8f61175ac95deaf7cac75ad531da80 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 20:26:20 +0530 Subject: [PATCH 08/31] Delete alias before deleting the index itself --- datastore/datastore.py | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 2125fa53e..e789acb37 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -217,12 +217,15 @@ def delete(self, err_if_does_not_exist=True, **kwargs): for (index_name_key, alias_name) in delete_map: if self._connection_settings.get(index_name_key): index_name = self._connection_settings.get(index_name_key) + if alias_name: + elastic_search.create.delete_alias(connection=self._client_or_connection, + index_list=[index_name], + logger=ner_logger) elastic_search.create.delete_index(connection=self._client_or_connection, index_name=index_name, logger=ner_logger, err_if_does_not_exist=err_if_does_not_exist, **kwargs) - # TODO: cleanup aliases ? # === Incompatible or deprecated/duplicate APIs From aa4e91b96379ad5bcd28d77e1df7fa1fde0d69b3 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Tue, 9 Jun 2020 20:28:04 +0530 Subject: [PATCH 09/31] Fix stupid mistake - pass required alias_name argument --- datastore/datastore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datastore/datastore.py b/datastore/datastore.py index e789acb37..ef32dce7f 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -220,6 +220,7 @@ def delete(self, err_if_does_not_exist=True, **kwargs): if alias_name: elastic_search.create.delete_alias(connection=self._client_or_connection, index_list=[index_name], + alias_name=alias_name, logger=ner_logger) elastic_search.create.delete_index(connection=self._client_or_connection, index_name=index_name, From 3e8651123e37abe5e7b42d4ad31a1b679c34b55f Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Wed, 10 Jun 2020 18:17:43 +0530 Subject: [PATCH 10/31] delete all indices of an index only if index exists --- datastore/datastore.py | 14 ++------------ datastore/elastic_search/create.py | 1 + 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index ef32dce7f..4badd0f1b 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -209,25 +209,15 @@ def delete(self, err_if_does_not_exist=True, **kwargs): self._connect() if self._engine == ELASTICSEARCH: - delete_map = [ - (ELASTICSEARCH_INDEX_1, self._store_name), - (ELASTICSEARCH_INDEX_2, self._store_name), - (ELASTICSEARCH_CRF_DATA_INDEX_NAME, None), - ] - for (index_name_key, alias_name) in delete_map: + delete_map = [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME] + for index_name_key in delete_map: if self._connection_settings.get(index_name_key): index_name = self._connection_settings.get(index_name_key) - if alias_name: - elastic_search.create.delete_alias(connection=self._client_or_connection, - index_list=[index_name], - alias_name=alias_name, - logger=ner_logger) elastic_search.create.delete_index(connection=self._client_or_connection, index_name=index_name, logger=ner_logger, err_if_does_not_exist=err_if_does_not_exist, **kwargs) - # === Incompatible or deprecated/duplicate APIs # FIXME: repopulate does not consider language of the variants diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index 060519dd3..5b9314576 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -48,6 +48,7 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k else: return + delete_alias(connection=connection, index_list=[index_name], alias_name='_all', logger=logger) connection.indices.delete(index=index_name, **kwargs) logger.debug('%s: Delete Index 
%s: Operation successfully completed', log_prefix, index_name) From 9a04197cdaa9760de503abcdca7401c2a28b7bd0 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 11 Jun 2020 16:22:25 +0530 Subject: [PATCH 11/31] Handle error in deletion of non existant aliases --- datastore/elastic_search/create.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/datastore/elastic_search/create.py b/datastore/elastic_search/create.py index 5b9314576..07a04e77f 100644 --- a/datastore/elastic_search/create.py +++ b/datastore/elastic_search/create.py @@ -2,6 +2,7 @@ from typing import List, Dict, Any from elasticsearch import Elasticsearch +from elasticsearch.exceptions import NotFoundError from .utils import filter_kwargs @@ -48,7 +49,11 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k else: return - delete_alias(connection=connection, index_list=[index_name], alias_name='_all', logger=logger) + try: + delete_alias(connection=connection, index_list=[index_name], alias_name='_all', logger=logger) + except NotFoundError: + logger.warning('No aliases found on on index %s', index_name) + connection.indices.delete(index=index_name, **kwargs) logger.debug('%s: Delete Index %s: Operation successfully completed', log_prefix, index_name) @@ -251,9 +256,9 @@ def create_alias(connection, index_list, alias_name, logger, **kwargs): **kwargs: https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html """ - logger.debug('Putting alias %s to indices: %s' % (alias_name, str(index_list))) + logger.debug('Putting alias %s to indices: %s', alias_name, str(index_list)) connection.indices.put_alias(index=index_list, name=alias_name, **kwargs) - logger.debug('Alias %s now points to indices %s' % (alias_name, str(index_list))) + logger.debug('Alias %s now points to indices %s', alias_name, str(index_list)) def delete_alias(connection, index_list, alias_name, logger, **kwargs): @@ -269,6 +274,6 @@ def delete_alias(connection, index_list, alias_name, logger, **kwargs): **kwargs: https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html """ - logger.debug('Removing alias %s from indices: %s' % (alias_name, str(index_list))) + logger.debug('Removing alias %s from indices: %s', alias_name, str(index_list)) connection.indices.delete_alias(index=index_list, name=alias_name, **kwargs) - logger.debug('Alias %s removed from indices %s' % (alias_name, str(index_list))) + logger.debug('Alias %s removed from indices %s', alias_name, str(index_list)) From 84d23e5c337f67864eb73e2782c06b44cfa7026a Mon Sep 17 00:00:00 2001 From: hemantgandhi90 Date: Thu, 11 Jun 2020 18:35:02 +0530 Subject: [PATCH 12/31] Date Test Changes --- postman_tests/data/entities/dateV2.json | 278 ++++++++++++------------ 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/postman_tests/data/entities/dateV2.json b/postman_tests/data/entities/dateV2.json index 068dfcd94..4c03823dc 100644 --- a/postman_tests/data/entities/dateV2.json +++ b/postman_tests/data/entities/dateV2.json @@ -2,240 +2,240 @@ { "expected": [ { - "original_text": "3/3/1992", - "end_range": false, - "from": false, - "mm": 3, - "dd": 3, - "yy": 1992, - "to": false, - "start_range": false, + "original_text": "3/3/1992", + "end_range": false, + "from": false, + "mm": 3, + "dd": 3, + "yy": 1992, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "my anniversary was on 3/3/1992", + "message": "my anniversary was on 3/3/1992", "entity_name": 
"date" } - }, + }, { "expected": [ { - "original_text": "3rd aug 20", - "end_range": false, - "from": false, - "mm": 8, - "dd": 3, - "yy": 2020, - "to": false, - "start_range": false, + "original_text": "3rd aug 2027", + "end_range": false, + "from": false, + "mm": 8, + "dd": 3, + "yy": 2027, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "Coronoa Virus will end on 3rd Aug 20", + "message": "Coronoa Virus will end on 3rd Aug 2027", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12.03.2016", - "end_range": false, - "from": false, - "mm": 3, - "dd": 12, - "yy": 2016, - "to": false, - "start_range": false, + "original_text": "12.03.2016", + "end_range": false, + "from": false, + "mm": 3, + "dd": 12, + "yy": 2016, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12.03.2016 is my nephew's birthday", + "message": "12.03.2016 is my nephew's birthday", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12.4.2016", - "end_range": false, - "from": false, - "mm": 4, - "dd": 12, - "yy": 2016, - "to": false, - "start_range": false, + "original_text": "12.4.2016", + "end_range": false, + "from": false, + "mm": 4, + "dd": 12, + "yy": 2016, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12.4.2016 doesnt exist for me", + "message": "12.4.2016 doesnt exist for me", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "3.3.12", - "end_range": false, - "from": false, - "mm": 3, - "dd": 3, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "3.3.12", + "end_range": false, + "from": false, + "mm": 3, + "dd": 3, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "A date i wont forget is 3.3.12", + "message": "A date i wont forget is 3.3.12", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "3/2/17", - "end_range": false, - "from": false, - "mm": 2, - "dd": 3, - "yy": 2017, - "to": false, - "start_range": false, + "original_text": "3/2/17", + "end_range": false, + "from": false, + "mm": 2, + "dd": 3, + "yy": 2017, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "3/2/17 changed my life forever", + "message": "3/2/17 changed my life forever", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12/12/12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 12, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "12/12/12", + "end_range": false, + "from": false, + "mm": 12, + "dd": 12, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12/12/12 is a strange date isnt it ?", + "message": "12/12/12 is a strange date isnt it ?", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12-30-12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 30, - "yy": 2012, - "to": false, - "start_range": false, + "original_text": "12-30-12", + "end_range": false, + "from": false, + "mm": 12, + "dd": 30, + "yy": 2012, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "we got married on 12-30-12", + "message": "we got married on 12-30-12", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "12/12", - "end_range": false, - "from": false, - "mm": 12, - "dd": 12, - "yy": 2020, - "to": false, - "start_range": false, + 
"original_text": "12/12/1943", + "end_range": false, + "from": false, + "mm": 12, + "dd": 12, + "yy": 1943, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "12/12 is a bad day in american history", + "message": "12/12/1943 is a bad day in american history", "entity_name": "date" } - }, + }, { "expected": [ { - "original_text": "october 2nd", - "end_range": false, - "from": false, - "mm": 10, - "dd": 2, - "yy": 2020, - "to": false, - "start_range": false, + "original_text": "october 2nd 1937", + "end_range": false, + "from": false, + "mm": 10, + "dd": 2, + "yy": 1937, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "Gandhi Jayanti is on October 2nd", + "message": "Gandhi Jayanti is on October 2nd 1937", "entity_name": "date" } }, { "expected": [ { - "original_text": "2019 may 21st", - "end_range": false, - "from": false, - "mm": 5, - "dd": 21, - "yy": 2019, - "to": false, - "start_range": false, + "original_text": "2019 may 21st", + "end_range": false, + "from": false, + "mm": 5, + "dd": 21, + "yy": 2019, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "2019 May 21st", + "message": "2019 May 21st", "entity_name": "date" } }, { "expected": [ { - "original_text": "2/3/2020", - "end_range": false, - "from": false, - "mm": 3, - "dd": 2, - "yy": 2020, - "to": false, - "start_range": true, + "original_text": "2/3/2020", + "end_range": false, + "from": false, + "mm": 3, + "dd": 2, + "yy": 2020, + "to": false, + "start_range": true, "type": "date" - }, + }, { - "original_text": "5/6/2024", - "end_range": true, - "from": false, - "mm": 6, - "dd": 5, - "yy": 2024, - "to": false, - "start_range": false, + "original_text": "5/6/2024", + "end_range": true, + "from": false, + "mm": 6, + "dd": 5, + "yy": 2024, + "to": false, + "start_range": false, "type": "date" } - ], + ], "input": { - "message": "My meeting is 2/3/2020 to 5/6/2024", + "message": "My meeting is 2/3/2020 to 5/6/2024", "entity_name": "date" } } -] +] \ No newline at end of file From 812c80ff15ebf551fae3cb2cb2f6f857ca2c1899 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 12 Jun 2020 21:28:13 +0530 Subject: [PATCH 13/31] Fix date v1 and date v2 tests temporarily --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- postman_tests/data/entities/date.json | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 4fc0786fd..b3ff6fa24 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -679,7 +679,7 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?\s?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' + regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2}\s)?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' r'|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' 
r'(?:[\ \,\-]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) diff --git a/postman_tests/data/entities/date.json b/postman_tests/data/entities/date.json index a127f6b0d..9fce5dd18 100644 --- a/postman_tests/data/entities/date.json +++ b/postman_tests/data/entities/date.json @@ -1,7 +1,7 @@ [ { "input": { - "message": "Set me a reminder for 23 December", + "message": "Set me a reminder for 23 December 2030", "entity_name": "date" }, "expected": [ @@ -10,13 +10,13 @@ "type": "date", "dd": 23, "mm": 12, - "yy": 2020 + "yy": 2030 } ] }, { "input": { - "message": "Set me a reminder for 2 May", + "message": "Set me a reminder for 2 May 2030", "entity_name": "date" }, "expected": [ @@ -25,13 +25,13 @@ "type": "date", "dd": 2, "mm": 5, - "yy": 2020 + "yy": 2030 } ] }, { "input": { - "message": "Set me a reminder for 3 June", + "message": "Set me a reminder for 3 June 2030", "entity_name": "date" }, "expected": [ @@ -40,7 +40,7 @@ "type": "date", "dd": 3, "mm": 6, - "yy": 2020 + "yy": 2030 } ] } From a1776cb80a59905e1e4920ea559bc6c959be949c Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 12 Jun 2020 21:47:44 +0530 Subject: [PATCH 14/31] Fix expected original_text --- postman_tests/data/entities/date.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/postman_tests/data/entities/date.json b/postman_tests/data/entities/date.json index 9fce5dd18..0bde4cb45 100644 --- a/postman_tests/data/entities/date.json +++ b/postman_tests/data/entities/date.json @@ -6,7 +6,7 @@ }, "expected": [ { - "original_text": "23 december", + "original_text": "23 december 2030g", "type": "date", "dd": 23, "mm": 12, @@ -21,7 +21,7 @@ }, "expected": [ { - "original_text": "2 may", + "original_text": "2 may 2030", "type": "date", "dd": 2, "mm": 5, @@ -36,7 +36,7 @@ }, "expected": [ { - "original_text": "3 june", + "original_text": "3 june 2030", "type": "date", "dd": 3, "mm": 6, From 85ccc01b4ea44766836025460e29df2fd1bada45 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 12 Jun 2020 21:49:47 +0530 Subject: [PATCH 15/31] Fix typo in tests --- postman_tests/data/entities/date.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postman_tests/data/entities/date.json b/postman_tests/data/entities/date.json index 0bde4cb45..6f504b49e 100644 --- a/postman_tests/data/entities/date.json +++ b/postman_tests/data/entities/date.json @@ -6,7 +6,7 @@ }, "expected": [ { - "original_text": "23 december 2030g", + "original_text": "23 december 2030", "type": "date", "dd": 23, "mm": 12, From 7a8318def57dc7efa8da433ea0ba5c4b39bef598 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Wed, 8 Jul 2020 13:47:06 +0530 Subject: [PATCH 16/31] Create pull_request_template.md --- .github/pull_request_template.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..ddfdf1a89 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,17 @@ +## JIRA Ticket Number + +JIRA TICKET: + +## Description of change +(REMOVE ME) Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. 
+ +## Checklist (OPTIONAL): + +- [ ] My code follows the style guidelines of this project +- [ ] I have performed a self-review of my own code +- [ ] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] I have added tests that prove my fix is effective or that my feature works +- [ ] New and existing unit tests pass locally with my changes +- [ ] Any dependent changes have been merged and published in downstream modules From 49584a11b398a7cf26957f927e4166b3827816fc Mon Sep 17 00:00:00 2001 From: Ankur Date: Wed, 9 Sep 2020 02:58:19 +0530 Subject: [PATCH 17/31] Why: * To add support for elastic search datastore * To add support for multi entity text detection * To add new endpoint for multi_entity_text detection How: * By creating package for es_datastore * By adding function for generating and parsing es query and results * By adding entry in urls and creating a placeholder view --- chatbot_ner/urls.py | 1 + es_datastore/elastic_search.py | 239 ++++++++++++++++++++++ es_datastore/queries.py | 233 +++++++++++++++++++++ es_datastore/tests/test_elastic_search.py | 23 +++ es_datastore/tests/test_queries.py | 0 ner_v2/api.py | 9 + 6 files changed, 505 insertions(+) create mode 100644 es_datastore/elastic_search.py create mode 100644 es_datastore/queries.py create mode 100644 es_datastore/tests/test_elastic_search.py create mode 100644 es_datastore/tests/test_queries.py diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py index fcc96376a..8d6b82010 100755 --- a/chatbot_ner/urls.py +++ b/chatbot_ner/urls.py @@ -34,6 +34,7 @@ url(r'^v2/number/$', api_v2.number), url(r'^v2/phone_number/$', api_v2.phone_number), url(r'^v2/number_range/$', api_v2.number_range), + url(r'^v2/multi_entity_text/$', api_v2.multi_entity_text), # V2 bulk detectors url(r'^v2/date_bulk/$', api_v2.date), diff --git a/es_datastore/elastic_search.py b/es_datastore/elastic_search.py new file mode 100644 index 000000000..62b0a057c --- /dev/null +++ b/es_datastore/elastic_search.py @@ -0,0 +1,239 @@ +from __future__ import absolute_import + +import json +import six + +from itertools import chain +from elasticsearch import Elasticsearch + +from lib.singleton import Singleton +from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE +from datastore import constants +from datastore.exceptions import DataStoreSettingsImproperlyConfiguredException + +from es_datastore.queries import _generate_multi_entity_es_query, \ + _parse_multi_entity_es_results + + +class ElasticSearchDataStore(six.with_metaclass(Singleton, object)): + """ + Class responsible for holding connections and performing search in + ElasticSearch DB. + Used as a singleton in this module. + """ + + def __init__(self): + self._engine_name = constants.ELASTICSEARCH + self._kwargs = {} + self._conns = {} + self._connection_settings = {} + self._connection = None + self._index_name = None + + # configure variables and connection + self._configure_store() + + def _configure_store(self, **kwargs): + """ + Configure self variables and connection. + Also add default connection to registry with alias `default` + """ + self._connection_settings = CHATBOT_NER_DATASTORE. 
\ + get(self._engine_name) + + if self._connection_settings is None: + raise DataStoreSettingsImproperlyConfiguredException() + + self._index_name = self._connection_settings[constants.ELASTICSEARCH_ALIAS] + self._connection = self.connect(**self._connection_settings) + + self._conns['default'] = self._connection + + def add_new_connection(self, alias, conn): + """ + Add new connection object, which can be directly passed through as-is to + the connection registry. + """ + self._conns[alias] = conn + + def get_or_create_new_connection(self, alias="default", **kwargs): + """ + Retrieve a connection with given alias. + Construct it if necessary (only when configuration was passed to us). + + If some non-string alias has been passed through it assume a client instance + and will just return it as-is. + + Raises ``KeyError`` if no client (or its definition) is registered + under the alias. + """ + + if not isinstance(alias, six.string_types): + return alias + + # connection already established + try: + return self._conns[alias] + except KeyError: + pass + + # if not, try to create it a new connection + try: + conn = self.connect(**kwargs) + self._conns[alias] = conn + except KeyError: + # no connection and no kwargs to set one up + raise KeyError("There is no connection with alias %r." % alias) + + # check if this is necessary here + def _check_doc_type_for_elasticsearch(self): + """ + Checks if doc_type is present in connection settings, if not an exception is raised + + Raises: + DataStoreSettingsImproperlyConfiguredException if doc_type was not found in + connection settings + """ + # TODO: This check should be during init or boot + if constants.ELASTICSEARCH_DOC_TYPE not in self._connection_settings: + ner_logger.debug("No doc type is present") + raise DataStoreSettingsImproperlyConfiguredException( + 'Elasticsearch needs doc_type. 
Please configure ES_DOC_TYPE in your environment') + + def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, + **kwargs): + """ + Args: + entities: the list of entities to lookup in the datastore for getting entity values + and their variants + texts(list of strings): the text for which variants need to be find out + fuzziness_threshold: fuzziness allowed for search results on entity value variants + kwargs: + + Returns: + list of collections.OrderedDict: dictionary mapping each entity for each text + with their value variants to entity value + + Example: + db = ElasticSearchDataStore() + entities = ['city', 'restaurant'] + texts = ['I want to go to mumbai and eat at dominoes pizza', + ' I want to go Jabalpur'] + + get_multi_entity_results(entities, texts) + + Output: + [ + { + 'restaurant': OrderedDict([ + ("Domino's Pizza", "Domino's Pizza"), + ('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), + ('Pizza Pizza Pizza', 'Pizza Pizza Pizza'), + ('Pizza', 'U S Pizza')]), + 'city': OrderedDict([ + ('Mumbai', 'Mumbai'), + ('mumbai', 'mumbai')])}, + { + 'city': OrderedDict([ + ('Jabalpur', 'Jabalpur'), + ('Jamalpur', 'Jamalpur'), + ('goa', 'goa')]), + 'restaurant': OrderedDict([ + ('TMOS', 'TMOS'), ('G.', 'G Pulla Reddy Sweets')])} + ] + """ + + self._check_doc_type_for_elasticsearch() + request_timeout = self._connection_settings.get('request_timeout', 20) + index_name = self._index_name + + doc_type = self._connection_settings[ + constants.ELASTICSEARCH_DOC_TYPE] + + index_header = json.dumps({'index': self._index_name, 'type': doc_type}) + + data = list(chain.from_iterable([[index_header, + json.dumps(_generate_multi_entity_es_query( + entities=entities, + text=each, + fuzziness_threshold=fuzziness_threshold))] + for each in texts])) + + # add `\n` for each index_header and text entry + data = '\n'.join(data) + + kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name, + request_timeout=request_timeout) + + results = self._run_es_search(self._connection, **kwargs) + results = _parse_multi_entity_es_results(results.get("responses")) + + return results + + @staticmethod + def connect(connection_url=None, host=None, port=None, user=None, password=None, **kwargs): + """ + Establishes connection to a single Elasticsearch Instance. + if connection_url is not None, then host, port, user, password are not used + Args: + connection_url: Elasticsearch connection url of the format https://user:secret@host:port/abc . + Optional if other parameters are provided. + host: nodes to connect to . e.g. localhost. Optional if connection_url is provided + port: port for elasticsearch connection. Optional if connection_url is provided + user: Optional, username for elasticsearch authentication + password: Optional, password for elasticsearch authentication + kwargs: any additional arguments will be passed on to the Transport class and, subsequently, + to the Connection instances. 
+ Returns: + Elasticsearch client connection object + + """ + connection = None + if user and password: + kwargs = dict(kwargs, http_auth=(user, password)) + + if connection_url: + connection = Elasticsearch(hosts=[connection_url], **kwargs) + elif host and port: + connection = Elasticsearch(hosts=[{'host': host, 'port': int(port)}], **kwargs) + + if connection and not connection.ping(): + connection = None + + return connection + + @staticmethod + def _run_es_search(connection, **kwargs): + """ + Execute the elasticsearch.ElasticSearch.msearch() method and return all results + Args: + connection: Elasticsearch client object + kwargs: + Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + Returns: + dictionary, search results from elasticsearch.ElasticSearch.msearch + """ + + return connection.msearch(**kwargs) + + @staticmethod + def _get_dynamic_fuzziness_threshold(fuzzy_setting): + """ + Approximately emulate AUTO:[low],[high] functionality of elasticsearch 6.2+ on older versions + + Args: + fuzzy_setting (int or str): Can be int or "auto" or "auto:," + + Returns: + int or str: fuzziness as int when ES version < 6.2 + otherwise the input is returned as it is + """ + if isinstance(fuzzy_setting, six.string_types): + if constants.ELASTICSEARCH_VERSION_MAJOR > 6 or \ + (constants.ELASTICSEARCH_VERSION_MAJOR == 6 and + constants.ELASTICSEARCH_VERSION_MINOR >= 2): + return fuzzy_setting + return 'auto' + + return fuzzy_setting diff --git a/es_datastore/queries.py b/es_datastore/queries.py new file mode 100644 index 000000000..738fbf31a --- /dev/null +++ b/es_datastore/queries.py @@ -0,0 +1,233 @@ +from __future__ import absolute_import + +import collections +import json +import re + +from six.moves import zip +from six import string_types + +from datastore import constants +from language_utilities.constant import ENGLISH_LANG +from lib.nlp.const import TOKENIZER + + +def _generate_multi_entity_es_query(entities, text, + fuzziness_threshold=1, + language_script=ENGLISH_LANG, + size=constants.ELASTICSEARCH_SEARCH_SIZE, + as_json=False): + """ + Generates compound elasticsearch boolean filter search query dictionary + for a text for multiple entity_data. + The query generated searches for entity_name in the index and returns search results for the + matched word (of sentence) only if entity_name is found. + + Args: + entities (list/str): list of the entity to perform a 'term' query on. + If str will converted to list internally. + text (str): The text on which we need to identify the entities. + fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter. + Defaults to 1 + language_script (str, optional): language of documents to be searched, + optional, defaults to 'en' + size (int, optional): number of records to return, + defaults to `ELASTICSEARCH_SEARCH_SIZE` + as_json (bool, optional): Return the generated query as json string. + useful for debug purposes. 
Defaults to False + + Returns: + dictionary, the search query for the text + + Examples Query generated: + _generate_multi_entity_es_query(['city', 'restaurant'], "I want to go to + mumbai") + + Outputs: + { + '_source': ['value', 'entity_data'], + 'query': {'bool': {'filter': + [{'terms': + {'entity_data': ['city', 'restaurant']}}, + {'terms': {'language_script': ['en']}}], + 'should': [{'match': + {'variants': {'query': 'I want to go to mumbai', + 'fuzziness': 1, 'prefix_length': 1}}}], + 'minimum_should_match': 1}}, + 'highlight': + {'fields': {'variants': {'type': 'unified'}}, + order': 'score', 'number_of_fragments': 20}, + 'size': 10000 + } + """ + + # if entities instance of string convert to list + if isinstance(entities, string_types): + entities = [entities] + + filter_terms = [] + term_dict_entity_name = { + 'terms': { + 'entity_data': entities + } + } + filter_terms.append(term_dict_entity_name) + + # search on language_script, add english as default search + term_dict_language = { + 'terms': { + 'language_script': [ENGLISH_LANG] + } + } + + if language_script != ENGLISH_LANG: + term_dict_language['terms']['language_script'].append(language_script) + + filter_terms.append(term_dict_language) + + should_terms = [] + query = { + 'match': { + 'variants': { + 'query': text, + 'fuzziness': fuzziness_threshold, + 'prefix_length': 1 + } + } + } + + should_terms.append(query) + + data = { + '_source': ['value', 'entity_data'], + 'query': { + 'bool': { + 'filter': filter_terms, + 'should': should_terms, + 'minimum_should_match': 1 + }, + }, + 'highlight': { + 'fields': { + 'variants': { + 'type': 'unified' + } + }, + 'order': 'score', + 'number_of_fragments': 20 + }, + 'size': size + } + + if as_json: + data = json.dumps(data) + + return data + + +def _parse_multi_entity_es_results(results_list): + """ + This will parse highlighted results returned from elasticsearch query and + generate a variants to values dictionary mapped to each entity for each + search text terms. + + Args: + results_list (list of dict): + search results list of dictionaries from elasticsearch including highlights + and scores + + Returns: + list of dict of collections.OrderedDict: + list containing dicts mapping each entity to matching variants to their entity + values based on the parsed results from highlighted search query results + + Example: + Parameter ngram_results has highlighted search results as follows: + + [ + {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, + u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn', + u'_index': u'doc_type_name', + u'_score': 11.501145, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'goa', + u'variants': [u'', u'goa']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'goa']}}, + {u'_id': u'AVrW02W99WNuMIY9vmcf', + u'_index': u'entity_data', + u'_score': 11.210829, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'Mumbai', + u'variants': [u'', u'Mumbai']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'Mumbai']}}, + ... 
+ u'max_score': 11.501145, + u'total': 17}, + u'timed_out': False, + u'took': 96} + ] + + After parsing highlighted results, this function returns + + [ + { + 'city': OrderedDict([ + ('Mumbai', 'Mumbai'), + ('mumbai', 'mumbai'), + ('goa', 'goa') + ]) + }, + { + 'city': OrderedDict([ + ('Jabalpur', 'Jabalpur'), + ('Jamalpur', 'Jamalpur'), + ('goa', 'goa') + ]) + } + ] + + + + """ + entity_variants_to_values_list = [] + + if results_list: + for results in results_list: + entity_dict = {} + entity_variants_to_values_dict = {} + + if results['hits']['total'] > 0: + for hit in results['hits']['hits']: + if 'highlight' not in hit: + continue + + value = hit['_source']['value'] + entity_name = hit['_source']['entity_data'] + + if entity_name not in entity_dict: + entity_dict[entity_name] = {'value': [], 'variant': []} + + entity_dict[entity_name]['value'].extend( + [value for _ in hit['highlight']['variants']]) + entity_dict[entity_name]['variant'].extend( + [variant for variant in hit['highlight']['variants']]) + + for each_entity in entity_dict.keys(): + entity_values = entity_dict[each_entity]['value'] + entity_variants = entity_dict[each_entity]['variant'] + entity_variants_to_values = collections.OrderedDict() + + for value, variant in zip(entity_values, entity_variants): + variant = re.sub('\s+', ' ', variant.strip()) + variant_no_highlight_tags = variant.replace('', '').replace('', '').strip() + if variant.count('') == len(TOKENIZER.tokenize(variant_no_highlight_tags)): + variant = variant_no_highlight_tags + if variant not in entity_variants_to_values: + entity_variants_to_values[variant] = value + entity_variants_to_values_dict[each_entity] = entity_variants_to_values + entity_variants_to_values_list.append(entity_variants_to_values_dict) + return entity_variants_to_values_list diff --git a/es_datastore/tests/test_elastic_search.py b/es_datastore/tests/test_elastic_search.py new file mode 100644 index 000000000..b096b8a1e --- /dev/null +++ b/es_datastore/tests/test_elastic_search.py @@ -0,0 +1,23 @@ +from __future__ import absolute_import + +from es_datastore.elastic_search import ElasticSearchDataStore + +from elasticsearch import Elasticsearch + + +def test_elasticsearch_connection(): + c = ElasticSearchDataStore() + + connection = c.get_or_create_new_connection('default') + + assert isinstance(connection, Elasticsearch) + + +# :TODO: configure parameters here +def test_elasticsearch_connect(): + + kwargs = dict() + + connection = ElasticSearchDataStore.connect(**kwargs) + + assert isinstance(connection, Elasticsearch) \ No newline at end of file diff --git a/es_datastore/tests/test_queries.py b/es_datastore/tests/test_queries.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/api.py b/ner_v2/api.py index 611662a4c..24d099f6f 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -552,3 +552,12 @@ def phone_number(request): return HttpResponse(status=500) return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') + + +@csrf_exempt +def multi_entity_text(request): + """ + Place holder for detecting multi entity text detection + :TODO: to be implemented + """ + return HttpResponse(status=500) From 5eace9529e6567de543fafdb87941f47012013ca Mon Sep 17 00:00:00 2001 From: Ankur Date: Thu, 10 Sep 2020 08:05:15 +0530 Subject: [PATCH 18/31] Why: * To add test cases for es_datastore module * To add test data for es_datastore module --- ...uery_generate_output_entity_list_data.json | 49 +++++ ...ry_generate_output_entity_string_data.json | 48 +++++ 
.../tests/query_parse_input_data.json | 171 ++++++++++++++++++ .../tests/query_parse_output_data.json | 19 ++ es_datastore/tests/test_elastic_search.py | 61 ++++++- es_datastore/tests/test_queries.py | 56 ++++++ 6 files changed, 394 insertions(+), 10 deletions(-) create mode 100644 es_datastore/tests/query_generate_output_entity_list_data.json create mode 100644 es_datastore/tests/query_generate_output_entity_string_data.json create mode 100644 es_datastore/tests/query_parse_input_data.json create mode 100644 es_datastore/tests/query_parse_output_data.json diff --git a/es_datastore/tests/query_generate_output_entity_list_data.json b/es_datastore/tests/query_generate_output_entity_list_data.json new file mode 100644 index 000000000..36e71657d --- /dev/null +++ b/es_datastore/tests/query_generate_output_entity_list_data.json @@ -0,0 +1,49 @@ +{ + "_source": [ + "value", + "entity_data" + ], + "query": { + "bool": { + "filter": [ + { + "terms": { + "entity_data": [ + "city", + "restaurant" + ] + } + }, + { + "terms": { + "language_script": [ + "en" + ] + } + } + ], + "should": [ + { + "match": { + "variants": { + "query": "I want to go to mumbai", + "fuzziness": 1, + "prefix_length": 1 + } + } + } + ], + "minimum_should_match": 1 + } + }, + "highlight": { + "fields": { + "variants": { + "type": "unified" + } + }, + "order": "score", + "number_of_fragments": 20 + }, + "size": 10000 +} diff --git a/es_datastore/tests/query_generate_output_entity_string_data.json b/es_datastore/tests/query_generate_output_entity_string_data.json new file mode 100644 index 000000000..df5e350e1 --- /dev/null +++ b/es_datastore/tests/query_generate_output_entity_string_data.json @@ -0,0 +1,48 @@ +{ + "_source": [ + "value", + "entity_data" + ], + "query": { + "bool": { + "filter": [ + { + "terms": { + "entity_data": [ + "city" + ] + } + }, + { + "terms": { + "language_script": [ + "en" + ] + } + } + ], + "should": [ + { + "match": { + "variants": { + "query": "I want to go to mumbai", + "fuzziness": 1, + "prefix_length": 1 + } + } + } + ], + "minimum_should_match": 1 + } + }, + "highlight": { + "fields": { + "variants": { + "type": "unified" + } + }, + "order": "score", + "number_of_fragments": 20 + }, + "size": 10000 +} diff --git a/es_datastore/tests/query_parse_input_data.json b/es_datastore/tests/query_parse_input_data.json new file mode 100644 index 000000000..8d30fc176 --- /dev/null +++ b/es_datastore/tests/query_parse_input_data.json @@ -0,0 +1,171 @@ +[ + { + "took": 13, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 5, + "max_score": 6.4403224, + "hits": [ + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpst4hee1DSR1e2_7_", + "_score": 6.4403224, + "_source": { + "entity_data": "city", + "value": "Mumbai" + }, + "highlight": { + "variants": [ + "Mumbai" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpstzYee1DSR1e2_0I", + "_score": 6.3653703, + "_source": { + "entity_data": "city", + "value": "mumbai" + }, + "highlight": { + "variants": [ + "mumbai" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_sI", + "_score": 6.3596706, + "_source": { + "entity_data": "city", + "value": "Wani" + }, + "highlight": { + "variants": [ + "Wani" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_r9", + "_score": 4.420918, + "_source": { + "entity_data": "city", + 
"value": "East" + }, + "highlight": { + "variants": [ + "east" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_wp", + "_score": 3.8191679, + "_source": { + "entity_data": "city", + "value": "goa" + }, + "highlight": { + "variants": [ + "goa" + ] + } + } + ] + }, + "status": 200 + }, + { + "took": 12, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 4, + "max_score": 8.485634, + "hits": [ + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpstzZee1DSR1e2_4u", + "_score": 8.485634, + "_source": { + "entity_data": "city", + "value": "Jabalpur" + }, + "highlight": { + "variants": [ + "Jabalpur" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_yQ", + "_score": 7.418161, + "_source": { + "entity_data": "city", + "value": "Jamalpur" + }, + "highlight": { + "variants": [ + "Jamalpur" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttmee1DSR1e2_sI", + "_score": 6.3596706, + "_source": { + "entity_data": "city", + "value": "Wani" + }, + "highlight": { + "variants": [ + "Wani" + ] + } + }, + { + "_index": "entity_data_v1", + "_type": "data_dictionary", + "_id": "AXRpsttnee1DSR1e2_wp", + "_score": 3.8191679, + "_source": { + "entity_data": "city", + "value": "goa" + }, + "highlight": { + "variants": [ + "goa" + ] + } + } + ] + }, + "status": 200 + } +] diff --git a/es_datastore/tests/query_parse_output_data.json b/es_datastore/tests/query_parse_output_data.json new file mode 100644 index 000000000..435a6bf64 --- /dev/null +++ b/es_datastore/tests/query_parse_output_data.json @@ -0,0 +1,19 @@ +[ + { + "city": { + "Mumbai": "Mumbai", + "mumbai": "mumbai", + "Wani": "Wani", + "east": "East", + "goa": "goa" + } + }, + { + "city": { + "Jabalpur": "Jabalpur", + "Jamalpur": "Jamalpur", + "Wani": "Wani", + "goa": "goa" + } + } +] diff --git a/es_datastore/tests/test_elastic_search.py b/es_datastore/tests/test_elastic_search.py index b096b8a1e..4d2971c34 100644 --- a/es_datastore/tests/test_elastic_search.py +++ b/es_datastore/tests/test_elastic_search.py @@ -1,23 +1,64 @@ from __future__ import absolute_import +from django.test import TestCase +from elasticsearch import Elasticsearch + from es_datastore.elastic_search import ElasticSearchDataStore +from chatbot_ner.config import CHATBOT_NER_DATASTORE -from elasticsearch import Elasticsearch +class TestESDataStore(TestCase): + def test_elasticsearch_connection(self): + c = ElasticSearchDataStore() + + connection = c.get_or_create_new_connection('default') + + self.assertIsInstance(connection, Elasticsearch) + + # :TODO: configure parameters here + def test_elasticsearch_connect(self): + kwargs = CHATBOT_NER_DATASTORE.get('elasticsearch') + + connection = ElasticSearchDataStore.connect(**kwargs) + + self.assertIsInstance(connection, Elasticsearch) + + def test_elasticsearch_get_connection(self): + c = ElasticSearchDataStore() + + conn = c.get_or_create_new_connection() + self.assertIsInstance(conn, Elasticsearch) + + def test_elasticsearch_add_connection(self): + kwargs = CHATBOT_NER_DATASTORE.get('elasticsearch') + c = Elasticsearch(**kwargs) + + es = ElasticSearchDataStore() + es.add_new_connection('new', c) + + conn = es.get_or_create_new_connection() + new_conn = es.get_or_create_new_connection('new') + + self.assertIsInstance(new_conn, Elasticsearch) + self.assertIsInstance(c, Elasticsearch) + 
self.assertIsInstance(conn, Elasticsearch) + + def test_elasticsearch_get_dynamic_fuzziness_threshold(self): + fuzzy = 1 -def test_elasticsearch_connection(): - c = ElasticSearchDataStore() + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) - connection = c.get_or_create_new_connection('default') + self.assertEqual(fuzzy_threshold, fuzzy) - assert isinstance(connection, Elasticsearch) + fuzzy = '1' + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) -# :TODO: configure parameters here -def test_elasticsearch_connect(): + self.assertEqual(fuzzy_threshold, 'auto') - kwargs = dict() + # :TODO: Check if below is expected + fuzzy = 'some_string' - connection = ElasticSearchDataStore.connect(**kwargs) + fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) - assert isinstance(connection, Elasticsearch) \ No newline at end of file + self.assertEqual(fuzzy_threshold, 'auto') diff --git a/es_datastore/tests/test_queries.py b/es_datastore/tests/test_queries.py index e69de29bb..3565e0e30 100644 --- a/es_datastore/tests/test_queries.py +++ b/es_datastore/tests/test_queries.py @@ -0,0 +1,56 @@ +from __future__ import absolute_import + +import json +from django.test import TestCase + +from es_datastore.queries import _parse_multi_entity_es_results, \ + _generate_multi_entity_es_query + + +class TestESDataStoreQueries(TestCase): + + def test_parse_multi_entity_es_results(self): + # get input data from file `query_parse_input_data.json` + with open('query_parse_input_data.json', 'r') as f: + input_data = json.load(f) + + result = _parse_multi_entity_es_results(input_data) + + # get output data from file `query_parse_output_data.json` + with open('query_parse_output_data.json', 'r') as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result, output_data) + + def test_generate_multi_entity_es_query_list(self): + + entity_list = ['city', 'restaurant'] + text = "I want to go to mumbai" + + result = _generate_multi_entity_es_query(entity_list, text) + + with open("query_generate_output_entity_list_data.json", "r") as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result, output_data) + + def test_generate_multi_entity_es_query_string(self): + + entity_string = "city" + text = "I want to go to mumbai" + + result = _generate_multi_entity_es_query(entity_string, text) + + with open("query_generate_output_entity_string_data.json.json", "r") as f: + output_data = json.load(f) + + # set max diff to None + self.maxDiff = None + + self.assertDictEqual(result, output_data) From 41c9198996f5141fba0794b8730ad303ca016359 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Thu, 10 Sep 2020 13:57:04 +0530 Subject: [PATCH 19/31] Why: * To change the location for the es_datastore module * To change URL endpoint name --- chatbot_ner/urls.py | 2 +- ner_v2/api.py | 2 +- ner_v2/detectors/textual/__init__.py | 0 .../detectors/textual}/elastic_search.py | 8 +++---- .../detectors/textual}/queries.py | 2 +- ner_v2/detectors/textual/tests/__init__.py | 0 ...uery_generate_output_entity_list_data.json | 0 ...ry_generate_output_entity_string_data.json | 0 .../tests/query_parse_input_data.json | 0 .../tests/query_parse_output_data.json | 0 .../textual}/tests/test_elastic_search.py | 2 +- .../detectors/textual}/tests/test_queries.py | 24 ++++++++++++++----- 12 files changed, 26 insertions(+), 14 deletions(-) create mode 100644 
ner_v2/detectors/textual/__init__.py rename {es_datastore => ner_v2/detectors/textual}/elastic_search.py (96%) rename {es_datastore => ner_v2/detectors/textual}/queries.py (99%) create mode 100644 ner_v2/detectors/textual/tests/__init__.py rename {es_datastore => ner_v2/detectors/textual}/tests/query_generate_output_entity_list_data.json (100%) rename {es_datastore => ner_v2/detectors/textual}/tests/query_generate_output_entity_string_data.json (100%) rename {es_datastore => ner_v2/detectors/textual}/tests/query_parse_input_data.json (100%) rename {es_datastore => ner_v2/detectors/textual}/tests/query_parse_output_data.json (100%) rename {es_datastore => ner_v2/detectors/textual}/tests/test_elastic_search.py (96%) rename {es_datastore => ner_v2/detectors/textual}/tests/test_queries.py (58%) diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py index 8d6b82010..4c72caa3d 100755 --- a/chatbot_ner/urls.py +++ b/chatbot_ner/urls.py @@ -34,7 +34,7 @@ url(r'^v2/number/$', api_v2.number), url(r'^v2/phone_number/$', api_v2.phone_number), url(r'^v2/number_range/$', api_v2.number_range), - url(r'^v2/multi_entity_text/$', api_v2.multi_entity_text), + url(r'^v2/text/$', api_v2.text), # V2 bulk detectors url(r'^v2/date_bulk/$', api_v2.date), diff --git a/ner_v2/api.py b/ner_v2/api.py index 24d099f6f..a071f4ab5 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -555,7 +555,7 @@ def phone_number(request): @csrf_exempt -def multi_entity_text(request): +def text(request): """ Place holder for detecting multi entity text detection :TODO: to be implemented diff --git a/ner_v2/detectors/textual/__init__.py b/ner_v2/detectors/textual/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/es_datastore/elastic_search.py b/ner_v2/detectors/textual/elastic_search.py similarity index 96% rename from es_datastore/elastic_search.py rename to ner_v2/detectors/textual/elastic_search.py index 62b0a057c..03104d25f 100644 --- a/es_datastore/elastic_search.py +++ b/ner_v2/detectors/textual/elastic_search.py @@ -11,7 +11,7 @@ from datastore import constants from datastore.exceptions import DataStoreSettingsImproperlyConfiguredException -from es_datastore.queries import _generate_multi_entity_es_query, \ +from ner_v2.detectors.textual.queries import _generate_multi_entity_es_query, \ _parse_multi_entity_es_results @@ -230,9 +230,9 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting): otherwise the input is returned as it is """ if isinstance(fuzzy_setting, six.string_types): - if constants.ELASTICSEARCH_VERSION_MAJOR > 6 or \ - (constants.ELASTICSEARCH_VERSION_MAJOR == 6 and - constants.ELASTICSEARCH_VERSION_MINOR >= 2): + if constants.ELASTICSEARCH_VERSION_MAJOR > 6 \ + or (constants.ELASTICSEARCH_VERSION_MAJOR == 6 + and constants.ELASTICSEARCH_VERSION_MINOR >= 2): return fuzzy_setting return 'auto' diff --git a/es_datastore/queries.py b/ner_v2/detectors/textual/queries.py similarity index 99% rename from es_datastore/queries.py rename to ner_v2/detectors/textual/queries.py index 738fbf31a..a6de5a5ab 100644 --- a/es_datastore/queries.py +++ b/ner_v2/detectors/textual/queries.py @@ -222,7 +222,7 @@ def _parse_multi_entity_es_results(results_list): entity_variants_to_values = collections.OrderedDict() for value, variant in zip(entity_values, entity_variants): - variant = re.sub('\s+', ' ', variant.strip()) + variant = re.sub(r'\s+', ' ', variant.strip()) variant_no_highlight_tags = variant.replace('', '').replace('', '').strip() if variant.count('') == len(TOKENIZER.tokenize(variant_no_highlight_tags)): 
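                        # if every token of the variant came back highlighted (i.e. matched) by Elasticsearch, use the tag-stripped string as the matched variant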
variant = variant_no_highlight_tags diff --git a/ner_v2/detectors/textual/tests/__init__.py b/ner_v2/detectors/textual/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/es_datastore/tests/query_generate_output_entity_list_data.json b/ner_v2/detectors/textual/tests/query_generate_output_entity_list_data.json similarity index 100% rename from es_datastore/tests/query_generate_output_entity_list_data.json rename to ner_v2/detectors/textual/tests/query_generate_output_entity_list_data.json diff --git a/es_datastore/tests/query_generate_output_entity_string_data.json b/ner_v2/detectors/textual/tests/query_generate_output_entity_string_data.json similarity index 100% rename from es_datastore/tests/query_generate_output_entity_string_data.json rename to ner_v2/detectors/textual/tests/query_generate_output_entity_string_data.json diff --git a/es_datastore/tests/query_parse_input_data.json b/ner_v2/detectors/textual/tests/query_parse_input_data.json similarity index 100% rename from es_datastore/tests/query_parse_input_data.json rename to ner_v2/detectors/textual/tests/query_parse_input_data.json diff --git a/es_datastore/tests/query_parse_output_data.json b/ner_v2/detectors/textual/tests/query_parse_output_data.json similarity index 100% rename from es_datastore/tests/query_parse_output_data.json rename to ner_v2/detectors/textual/tests/query_parse_output_data.json diff --git a/es_datastore/tests/test_elastic_search.py b/ner_v2/detectors/textual/tests/test_elastic_search.py similarity index 96% rename from es_datastore/tests/test_elastic_search.py rename to ner_v2/detectors/textual/tests/test_elastic_search.py index 4d2971c34..3a88cd30c 100644 --- a/es_datastore/tests/test_elastic_search.py +++ b/ner_v2/detectors/textual/tests/test_elastic_search.py @@ -3,7 +3,7 @@ from django.test import TestCase from elasticsearch import Elasticsearch -from es_datastore.elastic_search import ElasticSearchDataStore +from ner_v2.detectors.textual.elastic_search import ElasticSearchDataStore from chatbot_ner.config import CHATBOT_NER_DATASTORE diff --git a/es_datastore/tests/test_queries.py b/ner_v2/detectors/textual/tests/test_queries.py similarity index 58% rename from es_datastore/tests/test_queries.py rename to ner_v2/detectors/textual/tests/test_queries.py index 3565e0e30..0d93a29d0 100644 --- a/es_datastore/tests/test_queries.py +++ b/ner_v2/detectors/textual/tests/test_queries.py @@ -1,38 +1,48 @@ from __future__ import absolute_import import json +import os from django.test import TestCase -from es_datastore.queries import _parse_multi_entity_es_results, \ +from ner_v2.detectors.textual.queries import _parse_multi_entity_es_results, \ _generate_multi_entity_es_query +es_tests_directory = os.path.dirname(os.path.abspath(__file__)) + + class TestESDataStoreQueries(TestCase): def test_parse_multi_entity_es_results(self): # get input data from file `query_parse_input_data.json` - with open('query_parse_input_data.json', 'r') as f: + + input_test_file = os.path.join(es_tests_directory, 'query_parse_input_data.json') + output_test_file = os.path.join(es_tests_directory, 'query_parse_output_data.json') + + with open(input_test_file, 'r') as f: input_data = json.load(f) result = _parse_multi_entity_es_results(input_data) # get output data from file `query_parse_output_data.json` - with open('query_parse_output_data.json', 'r') as f: + with open(output_test_file, 'r') as f: output_data = json.load(f) # set max diff to None self.maxDiff = None - self.assertDictEqual(result, output_data) + 
self.assertDictEqual(result[1], output_data[1]) def test_generate_multi_entity_es_query_list(self): entity_list = ['city', 'restaurant'] text = "I want to go to mumbai" + output_test_file = os.path.join(es_tests_directory, + 'query_generate_output_entity_list_data.json') result = _generate_multi_entity_es_query(entity_list, text) - with open("query_generate_output_entity_list_data.json", "r") as f: + with open(output_test_file, "r") as f: output_data = json.load(f) # set max diff to None @@ -44,10 +54,12 @@ def test_generate_multi_entity_es_query_string(self): entity_string = "city" text = "I want to go to mumbai" + output_test_file = os.path.join(es_tests_directory, + 'query_generate_output_entity_string_data.json') result = _generate_multi_entity_es_query(entity_string, text) - with open("query_generate_output_entity_string_data.json.json", "r") as f: + with open(output_test_file, "r") as f: output_data = json.load(f) # set max diff to None From ee67dea7a8fc309a3cd1187a2a9290e9ca36a327 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Fri, 18 Sep 2020 10:08:21 +0530 Subject: [PATCH 20/31] Add support for Multi-Entity Text Detetction in V2 Why: * To add new detector module for text detection * To add support for new api endpoint in v2 for text detection * To add support for new api endpoint in v2 for bulk text detection How: * By adding a new module for text_detection in detectors/textual * By adding moudle for utility function to parse and get text deetction * By adding appropriate urls and api view for new api endpoint i.e. v2/text, v2/text_bulk --- ner_v2/api.py | 25 +- ner_v2/detectors/textual/text_detection.py | 458 +++++++++++++++++++++ ner_v2/detectors/textual/utils.py | 159 +++++++ 3 files changed, 637 insertions(+), 5 deletions(-) create mode 100644 ner_v2/detectors/textual/text_detection.py create mode 100644 ner_v2/detectors/textual/utils.py diff --git a/ner_v2/api.py b/ner_v2/api.py index a071f4ab5..dd78e56af 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -11,6 +11,8 @@ from ner_v2.detectors.temporal.time.time_detection import TimeDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector + +from ner_v2.detectors.textual.utils import parse_text_request from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector @@ -556,8 +558,21 @@ def phone_number(request): @csrf_exempt def text(request): - """ - Place holder for detecting multi entity text detection - :TODO: to be implemented - """ - return HttpResponse(status=500) + data = [] + + if request.method == "GET": + response = {"success": False, "error": "Get method is not allowed"} + return HttpResponse(json.dumps(response), status=501) + + elif request.method == "POST": + ner_logger.debug("Fetching result") + data = parse_text_request(request) + ner_logger.debug("Result Is:") + ner_logger.debug(data) + + if data: + response = {"success": True, "error": None, "data": data} + return HttpResponse(json.dumps(response), content_type='application/json', status=200) + else: + response = {"success": False, "error": "Some error while parsing"} + return HttpResponse(json.dumps(response), status=500) diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py new file mode 100644 index 000000000..572058a22 --- /dev/null +++ b/ner_v2/detectors/textual/text_detection.py @@ -0,0 +1,458 @@ +from 
__future__ import absolute_import +import collections +import string + +import six +from six import iteritems + +import language_utilities.constant as lang_constant +from chatbot_ner.config import ner_logger + +from ner_v2.detectors.textual.elastic_search import ElasticSearchDataStore + +from lib.nlp.const import TOKENIZER, whitespace_tokenizer +from lib.nlp.levenshtein_distance import edit_distance +from six.moves import range + +from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, + FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, + DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) +from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED + +from language_utilities.constant import ENGLISH_LANG + +try: + import regex as re + + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + import re + + _re_flags = re.UNICODE + + +class TextDetector(object): + + def __init__(self, entity_dict=None, + source_language_script=lang_constant.ENGLISH_LANG): + + self.processed_text = None + self.__texts = [] + self.__processed_texts = [] + + # defaults for auto mode + self._fuzziness = "auto:4,7" + self._fuzziness_lo, self._fuzziness_hi = 4, 7 + self._min_token_size_for_fuzziness = self._fuzziness_lo + # self.set_fuzziness_threshold(fuzziness=(self._fuzziness_lo, self._fuzziness_hi)) + + # defaults for non-auto mode + self.set_fuzziness_threshold(fuzziness=1) + self._min_token_size_for_fuzziness = 4 + self.esdb = ElasticSearchDataStore() + + self._source_language_script = source_language_script + self._target_language_script = ENGLISH_LANG + + self.entities_dict_list = entity_dict + + def _reset_state(self): + self.tagged_text = None + self.processed_text = None + self.__texts = [] + self.__processed_texts = [] + + def set_fuzziness_threshold(self, fuzziness): + + try: + iter(fuzziness) + if len(fuzziness) == 2: + lo, hi = fuzziness + self._fuzziness_lo, self._fuzziness_hi = int(lo), int(hi) + self._fuzziness = "auto:" + str(self._fuzziness_lo) + "," + str(self._fuzziness_hi) + self._min_token_size_for_fuzziness = lo + else: + self._fuzziness = "auto" + except TypeError: + if type(fuzziness) == int or type(fuzziness) == float: + self._fuzziness = int(fuzziness) # Note that elasticsearch would take min(2, self._fuzziness) + else: + raise TypeError('Fuziness has to be either an iterable of length 2 or an int') + + def _get_fuzziness_threshold_for_token(self, token): + """ + Return dynamic fuzziness threshold for damerau-levenshtein check based on length of token if elasticsearch + fuzziness was set to auto mode + + Args: + token (str or unicode): the string to calculate fuzziness threshold for + + Returns: + int: fuzziness threshold for ngram matching on elastic search results + """ + if type(self._fuzziness) == int: + return self._fuzziness + else: + if len(token) < self._fuzziness_lo: + return 0 # strict match + elif len(token) >= self._fuzziness_hi: + return 2 # Allow upto two inserts/deletes and one substitution + else: + return 1 # lo <= len < hi Allow only insert/delete + + def set_min_token_size_for_levenshtein(self, min_size): + """ + Sets the minimum number of letters a word must have to be considered for calculating edit distance with similar + ngrams from the datastore + + Args: + min_size: integer, maximum allowed Levenshtein's distance from the word/phrase being tested for + entity match + """ + self._min_token_size_for_fuzziness = min_size + + def _process_text(self, texts): + self._reset_state() + for text in 
texts: + text = text.lower() + text = text.decode('utf-8') if isinstance(text, bytes) else text + self.__texts.append(text) + # Note: following rules have been disabled because cause problem with generating original text + # regex_to_process = RegexReplace([(r'[\'\/]', r''), (r'\s+', r' ')]) + # processed_text = self.regx_to_process.text_substitute(processed_text) + self.__processed_texts.append(u' ' + text + u' ') + + @staticmethod + def _get_substring_from_processed_text(text, matched_tokens): + """ + Get part of original text that was detected as some entity value. + + This method was written to tackle cases when original text contains special characters which are dropped + during tokenization + + Args: + matched_tokens (list): list of tokens (usually tokens from fuzzy match results from ES) + to find as a contiguous substring in the processed sentence considering the effects + of tokenizer + text (string or unicode): sentence from self.processed_text from where indices of given token will be + given + + Returns: + str or unicode: part of original text that corresponds to given tokens + + E.g. + self.processed_text = u'i want to order 1 pc hot & crispy' + tokens = [u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'] + indices = [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) + + In: matched_tokens = [u'1', u'pc', u'hot', u'crispy'] + Out: 1 pc hot & crispy + + Notice that & is dropped during tokenization but when finding original text, we recover it from processed text + """ + + def _get_tokens_and_indices(txt): + """ + Args: + txt (str or unicode): text to get tokens from and indicies of those tokens in the given text + + Returns: + tuple: + list: containing tokens, direct results from tokenizer.tokenize + list: containing (int, int) indicating start and end position of ith token (of first list) + in given text + + E.g. 
+ In: text = u'i want to order 1 pc hot & crispy' + Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'], + [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) + + """ + txt = txt.rstrip() + ' __eos__' + processed_text_tokens = TOKENIZER.tokenize(txt) + processed_text_tokens_indices = [] + + offset = 0 + for token in processed_text_tokens: + st = txt.index(token) + en = st + len(token) + + # Small block to handle tricky cases like '(A B) C' + # It extends the previous token's end boundary if there are special characters except whitespace + # towards the end of previous token + prefix = txt[:en] + prefix_tokens = whitespace_tokenizer.tokenize(prefix) + if prefix and len(prefix_tokens) > 1 and prefix_tokens[0]: + if processed_text_tokens_indices: + s, e = processed_text_tokens_indices.pop() + e += len(prefix_tokens[0]) + processed_text_tokens_indices.append((s, e)) + + txt = txt[en:] + processed_text_tokens_indices.append((offset + st, offset + en)) + offset += en + + # remove eos parts + processed_text_tokens.pop() + processed_text_tokens_indices.pop() + + return processed_text_tokens, processed_text_tokens_indices + + try: + n = len(matched_tokens) + tokens, indices = _get_tokens_and_indices(text) + for i in range(len(tokens) - n + 1): + if tokens[i:i + n] == matched_tokens: + start = indices[i][0] + end = indices[i + n - 1][1] + return text[start:end] + except (ValueError, IndexError): + ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, text)) + + return u' '.join(matched_tokens) + + def _get_text_detection_with_variants(self): + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] + + entities_dict_list = self.entities_dict_list + new = self.esdb.get_multi_entity_results(entities=list(entities_dict_list), + texts=texts, + fuzziness_threshold=self._fuzziness + ) + final_list = [] + for index, entity_result in enumerate(new): + result_list = {} + for each_key in entities_dict_list.keys(): + + original_final_list = [] + value_final_list = [] + variants_to_values = collections.OrderedDict() + original_final_list_ = [] + value_final_list_ = [] + + _variants_to_values = entity_result.get(each_key, []) + + if not _variants_to_values: + result_list[each_key] = ([], []) + continue + for variant, value in iteritems(_variants_to_values): + variant = variant.lower() + if isinstance(variant, bytes): + variant = variant.decode('utf-8') + + variants_to_values[variant] = value + variants_list = list(variants_to_values.keys()) + + exact_matches, fuzzy_variants = [], [] + _text = texts + for variant in variants_list: + if u' '.join(TOKENIZER.tokenize(variant)) in _text[index]: + exact_matches.append(variant) + else: + fuzzy_variants.append(variant) + exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + + variants_list = exact_matches + fuzzy_variants + for variant in variants_list: + + original_text = self._get_entity_substring_from_text(self.__processed_texts[index], variant) + if original_text: + value_final_list.append(variants_to_values[variant]) + original_final_list.append(original_text) + + boundary_punct_pattern = re.compile( + r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) + original_text_ = boundary_punct_pattern.sub("", original_text) + + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) + tag = '__' + each_key + '__' + 
self.__processed_texts[index] = _pattern.sub(tag, self.__processed_texts[index]) + value_final_list_.append(value_final_list) + original_final_list_.append(original_final_list) + + result_list[each_key] = (value_final_list_, original_final_list_) + + final_list.append(result_list) + + return final_list + + def _get_entity_substring_from_text(self, text, variant): + variant_tokens = TOKENIZER.tokenize(variant) + text_tokens = TOKENIZER.tokenize(text) + original_text_tokens = [] + variant_token_i = 0 + for text_token in text_tokens: + variant_token = variant_tokens[variant_token_i] + same = variant_token == text_token + ft = self._get_fuzziness_threshold_for_token(text_token) + if same or (len(text_token) > self._min_token_size_for_fuzziness + and edit_distance(string1=variant_token, + string2=text_token, + max_distance=ft + 1) <= ft): + original_text_tokens.append(text_token) + variant_token_i += 1 + if variant_token_i == len(variant_tokens): + return self._get_substring_from_processed_text(text, original_text_tokens) + else: + original_text_tokens = [] + variant_token_i = 0 + return None + + @staticmethod + def _add_verification_source(values, verification_source_dict): + text_entity_verified_values = [] + for text_entity_value in values: + text_entity_dict = {ENTITY_VALUE_DICT_KEY: text_entity_value} + text_entity_dict.update(verification_source_dict) + text_entity_verified_values.append(text_entity_dict) + return text_entity_verified_values + + def combine_results(self, values, original_texts, predetected_values): + unprocessed_crf_original_texts = [] + + combined_values = self._add_verification_source( + values=values, verification_source_dict={DATASTORE_VERIFIED: True, MODEL_VERIFIED: False} + ) + combined_original_texts = original_texts + for i in range(len(predetected_values)): + match = False + for j in range(len(original_texts)): + if predetected_values[i] == original_texts[j]: + combined_values[j][MODEL_VERIFIED] = True + match = True + break + elif re.findall(r'\b%s\b' % re.escape(predetected_values[i]), original_texts[j]): + # If predetected value is a substring of some value detected by datastore, skip it from output + match = True + break + if not match: + unprocessed_crf_original_texts.append(predetected_values[i]) + + unprocessed_crf_original_texts_verified = self._add_verification_source( + values=unprocessed_crf_original_texts, + verification_source_dict={DATASTORE_VERIFIED: False, MODEL_VERIFIED: True} + ) + + combined_values.extend(unprocessed_crf_original_texts_verified) + combined_original_texts.extend(unprocessed_crf_original_texts) + + return combined_values, combined_original_texts + + def detect(self, message=None, structured_value=None, **kwargs): + + text = structured_value if structured_value else message + self._process_text([text]) + res_list = self._get_text_detection_with_variants() + data_list = [] + + for index, res in enumerate(res_list): + entities = {} + for entity, value in res.items(): + entities[entity] = [] + values, texts = [], [] + text_entity_values, original_texts = value + # get predetected value from entity dict + entity_dict = self.entities_dict_list.get(entity, {}) + predetected_values = entity_dict.get('predetected_values') or [] + + # get fallback value from entity dict + fallback_value = entity_dict.get('fallback_value') + + if text_entity_values and original_texts: + self.processed_text = self.__processed_texts[0] + values, texts = text_entity_values[0], original_texts[0] + + entity_list, original_text_list = 
self.combine_results(values=values, original_texts=texts, + predetected_values=predetected_values) + + if structured_value: + if entity_list: + value, method, original_text = entity_list, FROM_STRUCTURE_VALUE_VERIFIED, original_text_list + else: + value, method, original_text = [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED, \ + [structured_value] + elif entity_list: + value, method, original_text = entity_list, FROM_MESSAGE, original_text_list + elif fallback_value: + value, method, original_text = [fallback_value], FROM_FALLBACK_VALUE, [fallback_value] + else: + continue + + out = self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, + detection_method=method, + detection_language=self._target_language_script) + + entities[entity] = out + data_list.append(entities) + + return data_list + + @staticmethod + def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, + detection_method_list=None, detection_language=ENGLISH_LANG): + if detection_method_list is None: + detection_method_list = [] + if entity_value_list is None: + entity_value_list = [] + + entity_list = [] + for i, entity_value in enumerate(entity_value_list): + if type(entity_value) in [str, six.text_type]: + entity_value = { + ENTITY_VALUE_DICT_KEY: entity_value + } + method = detection_method_list[i] if detection_method_list else detection_method + entity_list.append( + { + ENTITY_VALUE: entity_value, + DETECTION_METHOD: method, + ORIGINAL_TEXT: original_text_list[i], + DETECTION_LANGUAGE: detection_language + } + ) + return entity_list + + def detect_bulk(self, messages=None, **kwargs): + + texts = messages + self._process_text(texts) + + res_list = self._get_text_detection_with_variants() + data_list = [] + for index, res in enumerate(res_list): + entities = {} + for entity, value in res.items(): + entities[entity] = [] + values, texts = [], [] + # get predetected value from entity dict + entity_dict = self.entities_dict_list.get(entity, {}) + predetected_values = entity_dict.get('predetected_values') or [] + + # get fallback value from entity dict + fallback_value = entity_dict.get('fallback_value') + + text_entity_values, original_texts = value + if text_entity_values and original_texts: + self.processed_text = self.__processed_texts[0] + values, texts = text_entity_values[0], original_texts[0] + + entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, + predetected_values=predetected_values) + + if entity_list: + value, method, original_text = entity_list, FROM_MESSAGE, original_text_list + elif fallback_value: + value, method, original_text = [fallback_value], FROM_FALLBACK_VALUE, [fallback_value] + else: + continue + + out = self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, + detection_method=method, + detection_language=self._target_language_script) + + entities[entity] = out + data_list.append(entities) + + return data_list diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py new file mode 100644 index 000000000..7881d88e1 --- /dev/null +++ b/ner_v2/detectors/textual/utils.py @@ -0,0 +1,159 @@ +import json +import six + +from chatbot_ner.config import ner_logger +from language_utilities.constant import ENGLISH_LANG + +from ner_constants import FROM_FALLBACK_VALUE +from ner_v2.detectors.textual.text_detection import TextDetector + + +def verify_text_request(request): + request_data = json.loads(request.body) + queries = 
request_data.get("queries") + + if not queries: + raise KeyError("Parameter queries is required") + + +def get_text_detection(message, entity_dict, structured_value, bot_message, + language=ENGLISH_LANG, **kwargs): + """ + Get text detection for given message on given entities dict using + TextDetector module. + Args: + message: message to detect text on + entity_dict: entity details dict + structured_value: structured value + bot_message: bot message + language: langugae for text detection + **kwargs: other kwargs + + Returns: + + detected entity output + + """ + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language) + + if isinstance(message, six.string_types): + entity_output = text_detector.detect(message=message, + structured_value=structured_value, + bot_message=bot_message) + elif isinstance(message, (list, tuple)): + entity_output = text_detector.detect_bulk(messages=message) + else: + raise TypeError('`message` argument must be either of type `str`, `unicode`, `list` or `tuple`.') + + return entity_output + + +def parse_text_request(request): + """ + Parse text request coming from POST call on `/v2/text/` + Args: + request: request object + + Returns: + output data + """ + request_data = json.loads(request.body) + message = request_data.get("message", []) + bot_message = request_data.get("bot_message") + entities = request_data.get("entities", {}) + language_script = request_data.get('language_script', ENGLISH_LANG) + source_language = request_data.get('source_language', ENGLISH_LANG) + + data = [] + + message_len = len(message) + + if message_len == 1: + + # get first message + message_str = message[0] + + structured_value_entities = {} + fallback_value_entities = {} + text_value_entities = {} + + data.append({"entities": {}, "language": source_language}) + + for each_entity, value in entities.items(): + + structured_value = value.get('structured_value') + use_fallback = value.get('use_fallback', False) + + if use_fallback: + fallback_value_entities[each_entity] = value + elif structured_value: + structured_value_entities[each_entity] = value + else: + text_value_entities[each_entity] = value + + # get detection for normal text entities + output = get_text_detection(message=message_str, entity_dict=text_value_entities, + structured_value=None, bot_message=bot_message) + + data[0]["entities"].update(output[0]) + + # get detection for structured value text entities + if structured_value_entities: + for entity, value in structured_value_entities.items(): + entity_dict = {entity: value} + sv = value.get("structured_value") + print(sv) + output = get_text_detection(message=message_str, entity_dict=entity_dict, + structured_value=sv, bot_message=bot_message) + + data[0]["entities"].update(output[0]) + + # get detection for fallback value text entities + if fallback_value_entities: + output = get_output_for_fallback_entities(fallback_value_entities, source_language) + data[0]["entities"].update(output) + + # check if more than one message + elif len(message) > 1: + text_detection_result = get_text_detection(message=message, entity_dict=entities, + structured_value=None, bot_message=bot_message) + + data = [{"entities": x, "language": source_language} for x in text_detection_result] + + else: + ner_logger.debug("No valid message provided") + raise KeyError("Message is required") + + return data + + +def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): + """ + Generate detection output for default fallback entities. 
+ Args: + entities_dict: dict of entities details + language: language to run + + Returns: + TextDetection output for default fallback + """ + output = {} + if not entities_dict: + return output + + for entity, value in entities_dict.items(): + fallback_value = value.get("fallback_value") + output[entity] = [ + { + "entity_value": { + "value": fallback_value, + "datastore_verified": False, + "model_verified": False + }, + "detection": FROM_FALLBACK_VALUE, + "original_text": fallback_value, + "language": language + } + ] + + return output From a43d7dbddb0144fbd7bebc6a29b6e97177acf8d9 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Fri, 18 Sep 2020 18:45:44 +0530 Subject: [PATCH 21/31] Fix check for fallback_if entity in case of `use_fallback` --- ner_v2/detectors/textual/utils.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 7881d88e1..0affe1285 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -143,17 +143,21 @@ def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): for entity, value in entities_dict.items(): fallback_value = value.get("fallback_value") - output[entity] = [ - { - "entity_value": { - "value": fallback_value, - "datastore_verified": False, - "model_verified": False - }, - "detection": FROM_FALLBACK_VALUE, - "original_text": fallback_value, - "language": language - } - ] + + if fallback_value: + output[entity] = [ + { + "entity_value": { + "value": fallback_value, + "datastore_verified": False, + "model_verified": False + }, + "detection": FROM_FALLBACK_VALUE, + "original_text": fallback_value, + "language": language + } + ] + else: + output[entity] = [] return output From 5111fb2eb82dc77b1261748ccedcda362a87f0df Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Sun, 20 Sep 2020 22:13:33 +0530 Subject: [PATCH 22/31] Add test cases for utils module of text detection --- ner_v2/detectors/textual/elastic_search.py | 6 +- ner_v2/detectors/textual/tests/test_utils.py | 225 +++++++++++++++++++ ner_v2/detectors/textual/text_detection.py | 154 +++++++++---- ner_v2/detectors/textual/utils.py | 66 ++++-- 4 files changed, 385 insertions(+), 66 deletions(-) create mode 100644 ner_v2/detectors/textual/tests/test_utils.py diff --git a/ner_v2/detectors/textual/elastic_search.py b/ner_v2/detectors/textual/elastic_search.py index 03104d25f..d8db9ef8d 100644 --- a/ner_v2/detectors/textual/elastic_search.py +++ b/ner_v2/detectors/textual/elastic_search.py @@ -101,13 +101,14 @@ def _check_doc_type_for_elasticsearch(self): 'Elasticsearch needs doc_type. 
Please configure ES_DOC_TYPE in your environment') def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, - **kwargs): + search_language_script=None, **kwargs): """ Args: entities: the list of entities to lookup in the datastore for getting entity values and their variants texts(list of strings): the text for which variants need to be find out fuzziness_threshold: fuzziness allowed for search results on entity value variants + search_language_script: language script for ES search kwargs: Returns: @@ -157,7 +158,8 @@ def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, json.dumps(_generate_multi_entity_es_query( entities=entities, text=each, - fuzziness_threshold=fuzziness_threshold))] + fuzziness_threshold=fuzziness_threshold, + language_script=search_language_script))] for each in texts])) # add `\n` for each index_header and text entry diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py new file mode 100644 index 000000000..f84a5215a --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -0,0 +1,225 @@ +from __future__ import absolute_import + +import json +import os + +from collections import OrderedDict +from mock import patch + +from django.test import TestCase +from django.http import HttpRequest + +from ner_v2.detectors.textual.utils import parse_text_request, verify_text_request, \ + get_output_for_fallback_entities, get_text_detection + +tests_directory = os.path.dirname(os.path.abspath(__file__)) + + +class TestTextualUtils(TestCase): + + def test_get_output_for_fallback_entities(self): + input_data = {'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, + 'restaurant': {'fallback_value': None, 'use_fallback': True}} + + assert_output_data = {'city': [{'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', 'language': 'en'}], + 'restaurant': []} + + result = get_output_for_fallback_entities(input_data) + + self.assertDictEqual(result, assert_output_data) + + def test_verify_text_request_ok(self): + request = HttpRequest() + + # test if everything is ok + request._body = b'{"message":["something"], "entities":{"something":""}}' + verify_text_request(request) + + def test_verify_text_request_exceptions(self): + request = HttpRequest() + + # test if no message + request._body = b'{}' + self.assertRaises(KeyError, verify_text_request, request=request) + + # test if no entities + request._body = b'{"message": "something"}' + self.assertRaises(KeyError, verify_text_request, request=request) + + # test if message not in proper format + request._body = b'{"message":"something", "entities":"something"}' + self.assertRaises(TypeError, verify_text_request, request=request) + + # test if entities not in proper format + request._body = b'{"message":["something"], "entities":"something"}' + self.assertRaises(TypeError, verify_text_request, request=request) + + @patch('ner_v2.detectors.textual.utils.get_text_detection') + def test_parse_text_request(self, mock_get_detection): + input_data = { + "message": ["I want to go to Mumbai"], + "bot_message": None, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": 4, + "min_token_len_fuzziness": 4, + "use_fallback": None + }, + + "restaurant": { + "structured_value": None, + "fallback_value": None, + 
"predetected_values": None, + "fuzziness": None, + "min_token_len_fuzziness": None, + "use_fallback": True + }, + } + + } + + request = HttpRequest() + + request._body = json.dumps(input_data) + + mock_get_detection.return_value = [{'entities': {'city': [ + {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', + 'language': 'en'}], 'restaurant': []}, + 'language': 'en'}] + + output = parse_text_request(request) + + assert_output = [{'entities': + {'entities': {'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', + 'language': 'en'}], 'restaurant': []}, + 'language': 'en', 'restaurant': []}, 'language': 'en'}] + + self.assertListEqual(output, assert_output) + + @patch('ner_v2.detectors.textual.utils.get_text_detection') + def test_parse_text_request_structured(self, mock_get_detection): + input_data = { + "message": ["I want to go to Mumbai"], + "bot_message": None, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": 4, + "min_token_len_fuzziness": 4, + "use_fallback": None + }, + + "restaurant": { + "structured_value": None, + "fallback_value": None, + "predetected_values": None, + "fuzziness": None, + "min_token_len_fuzziness": None, + "use_fallback": True + }, + } + + } + + request = HttpRequest() + + request._body = json.dumps(input_data) + + mock_get_detection.return_value = [{'city': [ + {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'structure_value_verified', 'original_text': 'delhi', 'language': 'en'}]}] + + output = parse_text_request(request) + + assert_output = [{'entities': {'city': [ + {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'structure_value_verified', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}, + 'language': 'en'}] + + self.assertListEqual(output, assert_output) + + @patch('ner_v2.detectors.textual.elastic_search.' + 'ElasticSearchDataStore.get_multi_entity_results') + def test_get_text_detection_string_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + + message = "I want to go to Mumbai" + + mock_es_query.return_value = [ + {'city': OrderedDict([('Wani', 'Wani'), + ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), + ('goa', 'goa')]) + }] + + output = get_text_detection(message, entity_dict) + assert_output = [ + {'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, 'detection': 'message', + 'original_text': 'mumbai', + 'language': 'en'}]}] + + self.assertDictEqual(assert_output[0], output[0]) + + @patch('ner_v2.detectors.textual.elastic_search.' 
+ 'ElasticSearchDataStore.get_multi_entity_results') + def test_get_text_detection_list_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + + message = ["I want to go to Mumbai", "I want to go to Delhi"] + + mock_es_query.return_value = [ + {'city': OrderedDict([('Wani', 'Wani'), + ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ( + 'goa', 'goa')])}, + {'city': OrderedDict([('Delhi', 'New Delhi'), + ('Wani', 'Wani'), + ('goa', 'goa')])}] + + output = get_text_detection(message, entity_dict) + assert_output = [{'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'mumbai', + 'language': 'en'}]}, + {'city': [ + {'entity_value': {'value': 'New Delhi', + 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', + 'original_text': 'delhi', + 'language': 'en'}]}] + + self.assertListEqual(assert_output, output) diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index 572058a22..6f582c4fe 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -1,4 +1,5 @@ from __future__ import absolute_import + import collections import string @@ -35,7 +36,8 @@ class TextDetector(object): def __init__(self, entity_dict=None, - source_language_script=lang_constant.ENGLISH_LANG): + source_language_script=lang_constant.ENGLISH_LANG, + target_language_script=ENGLISH_LANG): self.processed_text = None self.__texts = [] @@ -50,21 +52,40 @@ def __init__(self, entity_dict=None, # defaults for non-auto mode self.set_fuzziness_threshold(fuzziness=1) self._min_token_size_for_fuzziness = 4 - self.esdb = ElasticSearchDataStore() + # define data store and target languages + self.esdb = ElasticSearchDataStore() self._source_language_script = source_language_script - self._target_language_script = ENGLISH_LANG + self._target_language_script = target_language_script + # define entities to detect self.entities_dict_list = entity_dict def _reset_state(self): + """ + Reset all the intermediary states of detection class. + """ self.tagged_text = None self.processed_text = None self.__texts = [] self.__processed_texts = [] def set_fuzziness_threshold(self, fuzziness): + """ + Sets the fuzziness thresholds for similarity searches. The fuzziness threshold corresponds to the + maximum Levenshtein's distance allowed during similarity matching + Args: + + fuzziness (iterable or int): If this parameter is int, elasticsearch's auto is used with + low and high term distances. + + Please make sure the iterable has only two integers like (4, 7). 
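+                For example (an illustrative call, `detector` being a TextDetector instance):
+                detector.set_fuzziness_threshold(fuzziness=(4, 7)).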
+ This will generate "auto:4,7" + + Note that this also sets _min_token_size_for_fuzziness to first value of the iterable + If this argument is int, elasticsearch will set fuzziness as min(2, fuzziness) + """ try: iter(fuzziness) if len(fuzziness) == 2: @@ -80,19 +101,24 @@ def set_fuzziness_threshold(self, fuzziness): else: raise TypeError('Fuziness has to be either an iterable of length 2 or an int') - def _get_fuzziness_threshold_for_token(self, token): + def _get_fuzziness_threshold_for_token(self, token, fuzziness=None): """ Return dynamic fuzziness threshold for damerau-levenshtein check based on length of token if elasticsearch fuzziness was set to auto mode Args: token (str or unicode): the string to calculate fuzziness threshold for + fuzziness (int): fuzziness value provided Returns: int: fuzziness threshold for ngram matching on elastic search results """ - if type(self._fuzziness) == int: - return self._fuzziness + + if not fuzziness: + fuzziness = self._fuzziness + + if type(fuzziness) == int: + return fuzziness else: if len(token) < self._fuzziness_lo: return 0 # strict match @@ -118,9 +144,6 @@ def _process_text(self, texts): text = text.lower() text = text.decode('utf-8') if isinstance(text, bytes) else text self.__texts.append(text) - # Note: following rules have been disabled because cause problem with generating original text - # regex_to_process = RegexReplace([(r'[\'\/]', r''), (r'\s+', r' ')]) - # processed_text = self.regx_to_process.text_substitute(processed_text) self.__processed_texts.append(u' ' + text + u' ') @staticmethod @@ -213,15 +236,18 @@ def _get_tokens_and_indices(txt): return u' '.join(matched_tokens) def _get_text_detection_with_variants(self): - texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] + + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for + processed_text in self.__processed_texts] entities_dict_list = self.entities_dict_list - new = self.esdb.get_multi_entity_results(entities=list(entities_dict_list), - texts=texts, - fuzziness_threshold=self._fuzziness - ) + es_results = self.esdb.get_multi_entity_results(entities=list(entities_dict_list), + texts=texts, + fuzziness_threshold=self._fuzziness, + search_language_script=self._target_language_script + ) final_list = [] - for index, entity_result in enumerate(new): + for index, entity_result in enumerate(es_results): result_list = {} for each_key in entities_dict_list.keys(): @@ -257,11 +283,11 @@ def _get_text_detection_with_variants(self): variants_list = exact_matches + fuzzy_variants for variant in variants_list: - original_text = self._get_entity_substring_from_text(self.__processed_texts[index], variant) + original_text = self._get_entity_substring_from_text(self.__processed_texts[index], + variant, each_key) if original_text: value_final_list.append(variants_to_values[variant]) original_final_list.append(original_text) - boundary_punct_pattern = re.compile( r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) original_text_ = boundary_punct_pattern.sub("", original_text) @@ -278,7 +304,7 @@ def _get_text_detection_with_variants(self): return final_list - def _get_entity_substring_from_text(self, text, variant): + def _get_entity_substring_from_text(self, text, variant, entity_name): variant_tokens = TOKENIZER.tokenize(variant) text_tokens = TOKENIZER.tokenize(text) original_text_tokens = [] @@ -286,8 +312,17 @@ def _get_entity_substring_from_text(self, text, variant): for text_token in text_tokens: variant_token = 
variant_tokens[variant_token_i] same = variant_token == text_token - ft = self._get_fuzziness_threshold_for_token(text_token) - if same or (len(text_token) > self._min_token_size_for_fuzziness + + # get fuzziness and min_token_size_for_fuziness value from entity dict + entity_dict = self.entities_dict_list.get(entity_name, {}) + fuzziness = entity_dict.get('fuzziness') + min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness') + + if not min_token_size_for_fuzziness: + min_token_size_for_fuzziness = self._min_token_size_for_fuzziness + + ft = self._get_fuzziness_threshold_for_token(token=text_token, fuzziness=fuzziness) + if same or (len(text_token) > min_token_size_for_fuzziness and edit_distance(string1=variant_token, string2=text_token, max_distance=ft + 1) <= ft): @@ -341,6 +376,16 @@ def combine_results(self, values, original_texts, predetected_values): return combined_values, combined_original_texts def detect(self, message=None, structured_value=None, **kwargs): + """ + + Args: + message: + structured_value: + **kwargs: + + Returns: + + """ text = structured_value if structured_value else message self._process_text([text]) @@ -389,32 +434,16 @@ def detect(self, message=None, structured_value=None, **kwargs): return data_list - @staticmethod - def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, - detection_method_list=None, detection_language=ENGLISH_LANG): - if detection_method_list is None: - detection_method_list = [] - if entity_value_list is None: - entity_value_list = [] + def detect_bulk(self, messages=None, **kwargs): + """ - entity_list = [] - for i, entity_value in enumerate(entity_value_list): - if type(entity_value) in [str, six.text_type]: - entity_value = { - ENTITY_VALUE_DICT_KEY: entity_value - } - method = detection_method_list[i] if detection_method_list else detection_method - entity_list.append( - { - ENTITY_VALUE: entity_value, - DETECTION_METHOD: method, - ORIGINAL_TEXT: original_text_list[i], - DETECTION_LANGUAGE: detection_language - } - ) - return entity_list + Args: + messages: + **kwargs: - def detect_bulk(self, messages=None, **kwargs): + Returns: + + """ texts = messages self._process_text(texts) @@ -456,3 +485,40 @@ def detect_bulk(self, messages=None, **kwargs): data_list.append(entities) return data_list + + @staticmethod + def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, + detection_method_list=None, detection_language=ENGLISH_LANG): + """ + + Args: + entity_value_list: + original_text_list: + detection_method: + detection_method_list: + detection_language: + + Returns: + + """ + if detection_method_list is None: + detection_method_list = [] + if entity_value_list is None: + entity_value_list = [] + + entity_list = [] + for i, entity_value in enumerate(entity_value_list): + if type(entity_value) in [str, six.text_type]: + entity_value = { + ENTITY_VALUE_DICT_KEY: entity_value + } + method = detection_method_list[i] if detection_method_list else detection_method + entity_list.append( + { + ENTITY_VALUE: entity_value, + DETECTION_METHOD: method, + ORIGINAL_TEXT: original_text_list[i], + DETECTION_LANGUAGE: detection_language + } + ) + return entity_list diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 0affe1285..309add5a4 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import + import json import six @@ -9,15 +11,37 @@ def 
verify_text_request(request): + """ + Check the request object if proper message or entity is present in required + format. If not present raises appropriate error. + Args: + request: API request object + + Returns: + Raises KeyError if message or entities are not present + Raises TypeError if message is not list or entities is not dict type + Else Return none + """ + request_data = json.loads(request.body) - queries = request_data.get("queries") + message = request_data.get("message") + entities = request_data.get("entities") + + if not message: + raise KeyError("Message is required") + + if not entities: + raise KeyError("Entities dict is required") + + if not isinstance(message, list): + raise TypeError("Message should be in format of list of string") - if not queries: - raise KeyError("Parameter queries is required") + if not isinstance(entities, dict): + raise TypeError("Entities should be dict of entity details") -def get_text_detection(message, entity_dict, structured_value, bot_message, - language=ENGLISH_LANG, **kwargs): +def get_text_detection(message, entity_dict, structured_value=None, bot_message=None, + language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, **kwargs): """ Get text detection for given message on given entities dict using TextDetector module. @@ -27,15 +51,15 @@ def get_text_detection(message, entity_dict, structured_value, bot_message, structured_value: structured value bot_message: bot message language: langugae for text detection + target_language_script: target language for detection default ENGLISH **kwargs: other kwargs Returns: detected entity output - """ - text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language) - + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) if isinstance(message, six.string_types): entity_output = text_detector.detect(message=message, structured_value=structured_value, @@ -61,8 +85,8 @@ def parse_text_request(request): message = request_data.get("message", []) bot_message = request_data.get("bot_message") entities = request_data.get("entities", {}) - language_script = request_data.get('language_script', ENGLISH_LANG) - source_language = request_data.get('source_language', ENGLISH_LANG) + target_language_script = request_data.get('language_script') or ENGLISH_LANG + source_language = request_data.get('source_language') or ENGLISH_LANG data = [] @@ -93,8 +117,9 @@ def parse_text_request(request): # get detection for normal text entities output = get_text_detection(message=message_str, entity_dict=text_value_entities, - structured_value=None, bot_message=bot_message) - + structured_value=None, bot_message=bot_message, + language_script=source_language, + target_language_script=target_language_script) data[0]["entities"].update(output[0]) # get detection for structured value text entities @@ -102,9 +127,10 @@ def parse_text_request(request): for entity, value in structured_value_entities.items(): entity_dict = {entity: value} sv = value.get("structured_value") - print(sv) output = get_text_detection(message=message_str, entity_dict=entity_dict, - structured_value=sv, bot_message=bot_message) + structured_value=sv, bot_message=bot_message, + language_script=source_language, + target_language_script=target_language_script) data[0]["entities"].update(output[0]) @@ -129,7 +155,7 @@ def parse_text_request(request): def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): """ - Generate detection 
output for default fallback entities. + Generate default detection output for default fallback entities. Args: entities_dict: dict of entities details language: language to run @@ -143,8 +169,11 @@ def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): for entity, value in entities_dict.items(): fallback_value = value.get("fallback_value") - - if fallback_value: + + if not fallback_value: + output[entity] = [] + + else: output[entity] = [ { "entity_value": { @@ -157,7 +186,4 @@ def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): "language": language } ] - else: - output[entity] = [] - return output From 26a152b305c18c9467703e2c9de07bfe7758da8b Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Sun, 20 Sep 2020 23:22:23 +0530 Subject: [PATCH 23/31] Add test cases for text detection module --- .../textual/tests/test_text_detection.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 ner_v2/detectors/textual/tests/test_text_detection.py diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py new file mode 100644 index 000000000..d89485840 --- /dev/null +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -0,0 +1,139 @@ +from __future__ import absolute_import + +import os + +from collections import OrderedDict +from mock import patch + +from django.test import TestCase + +from ner_v2.detectors.textual.text_detection import TextDetector + +tests_directory = os.path.dirname(os.path.abspath(__file__)) + + +class TestTextualUtils(TestCase): + + def test_text_detector_intialization(self): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + self.assertIsInstance(text_detector, TextDetector) + + self.assertEqual(language, text_detector._source_language_script) + self.assertEqual(target_language_script, text_detector._target_language_script) + + self.assertDictEqual(entity_dict, text_detector.entities_dict_list) + + @patch('ner_v2.detectors.textual.elastic_search.' 
+ 'ElasticSearchDataStore.get_multi_entity_results') + def test_text_detection_detect_single_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + message = "I want to go to Mumbai to order Dominoes" + + mock_es_query.return_value = [{ + 'restaurant': OrderedDict([('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), ('TMOS', 'TMOS'), + ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Wani', 'Wani'), ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ('goa', 'goa')])}] + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + result = text_detector.detect(message=message) + + assert_output = [{'city': [{ + 'entity_value': {'value': 'Mumbai', + 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], + 'restaurant': [ + {'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}] + + self.maxDiff = None + self.assertListEqual(result, assert_output) + + @patch('ner_v2.detectors.textual.elastic_search.' + 'ElasticSearchDataStore.get_multi_entity_results') + def test_text_detection_detect_bulk_message(self, mock_es_query): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None}, + 'restaurant': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': None, + 'fuzziness': 4, + 'min_token_len_fuzziness': 4, + 'use_fallback': None} + } + + language = 'en' + target_language_script = 'en' + + message = ['I want to go to Mumbai to order Dominoes', + 'I want to go to Delhi'] + + mock_es_query.return_value = [{ + 'restaurant': OrderedDict([('Domino', "Domino's Pizza"), + ('Dominos', "Domino's Pizza"), + ('TMOS', 'TMOS'), ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Wani', 'Wani'), ('mumbai', 'mumbai'), + ('Mumbai', 'Mumbai'), ('goa', 'goa')])}, + {'restaurant': OrderedDict([('TMOS', 'TMOS'), + ('Deli', 'Deli'), + ('G.', 'G Pulla Reddy Sweets')]), + 'city': OrderedDict([('Delhi', 'New Delhi'), ('Wani', 'Wani'), + ('goa', 'goa')])}] + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + result = text_detector.detect_bulk(messages=message) + + assert_output = [{'city': [ + {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], 'restaurant': [ + {'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, {'city': [ + {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}] + + self.maxDiff = None + self.assertListEqual(result, assert_output) From 
aed5c6f394491bd2389984f782b991d6a1299732 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Mon, 21 Sep 2020 00:39:39 +0530 Subject: [PATCH 24/31] Add documentation for text detection api endpoints and detetction module --- ner_v2/api.py | 127 +++++++++- .../textual/tests/test_text_detection.py | 16 +- ner_v2/detectors/textual/tests/test_utils.py | 4 +- ner_v2/detectors/textual/text_detection.py | 216 ++++++++++++++++-- ner_v2/detectors/textual/utils.py | 101 +++++++- 5 files changed, 430 insertions(+), 34 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index dd78e56af..d257a39ed 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -12,7 +12,7 @@ from ner_v2.detectors.numeral.number.number_detection import NumberDetector from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector -from ner_v2.detectors.textual.utils import parse_text_request +from ner_v2.detectors.textual.utils import parse_text_request, verify_text_request from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector @@ -558,6 +558,111 @@ def phone_number(request): @csrf_exempt def text(request): + """ + Uses TextDetector to the get the values of multiple text entity detection. This is used + for both single text message or multiple text message detection. + + Currently only POST method is supported. + + Args: + request: request for text detection + + Request parameters + + message (list of str): list of message string for which detection logic needs to be run on. + + source_language (str): language for which the phone numbers have to be detected + + bot_message (str): previous message from a bot/agent. + + entities (dict): dictionary of entties to be detected, each entity dict will contain + following details: + + entity_name (str): name of the entity. Also acts as elastic-search dictionary name + if entity uses elastic-search lookup + structured_value (str): [Optional] Value obtained from any structured elements. + + Note if structured value is detection is run on structured value instead of message + (For example, UI elements like form, payload, etc) + + fallback_value (str): [Optional] If the detection logic fails to detect any value + either from structured_value or message then we return a fallback_value as an output. + + use_fallback (bool): Default as False, if this is present for a single message + fallback value will be used. 
+ + fuzziness (int): [Optional] Fuzziness value for each entity + + min_token_size_for_fuzziness (int): [Optional] minimum size for token match + + Returns: + response (django.http.response.HttpResponse): HttpResponse object + + + Examples: + + 1) For single message: + input request: + { + "message": ["I want to go to Jabalpur"], + "bot_message": null, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": "Delhi", + "fallback_value": null, + "predetected_values": ["Mumbai"], + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + }, + "restaurant": { + "structured_value": null, + "fallback_value": null, + "predetected_values": null, + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + } + } + } + output response: + { + "success": true, + "error": null, + "data": [ + { + "entities": { + "restaurant": [], + "city": [ + { + "entity_value": { + "value": "New Delhi", + "datastore_verified": true, + "model_verified": false + }, + "detection": "structure_value_verified", + "original_text": "delhi", + "language": "en" + }, + { + "entity_value": { + "value": "Mumbai", + "datastore_verified": false, + "model_verified": true + }, + "detection": "structure_value_verified", + "original_text": "Mumbai", + "language": "en" + } + ] + }, + "language": "en" + } + ] + } + """ data = [] if request.method == "GET": @@ -566,13 +671,27 @@ def text(request): elif request.method == "POST": ner_logger.debug("Fetching result") + + try: + verify_text_request(request) + + except KeyError as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) + except TypeError as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) + + # if verify success parse request and get data data = parse_text_request(request) - ner_logger.debug("Result Is:") - ner_logger.debug(data) if data: response = {"success": True, "error": None, "data": data} return HttpResponse(json.dumps(response), content_type='application/json', status=200) else: response = {"success": False, "error": "Some error while parsing"} - return HttpResponse(json.dumps(response), status=500) + return HttpResponse(json.dumps(response), status=400) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py index d89485840..144a7a4a6 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -127,13 +127,15 @@ def test_text_detection_detect_bulk_message(self, mock_es_query): result = text_detector.detect_bulk(messages=message) - assert_output = [{'city': [ - {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 'model_verified': False}, - 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], 'restaurant': [ - {'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, 'model_verified': False}, - 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, {'city': [ - {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, - 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}] + assert_output = [{ + 'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 
'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], + 'restaurant': [{'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, + {'city': [{'entity_value': {'value': 'New Delhi', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}] self.maxDiff = None self.assertListEqual(result, assert_output) diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index f84a5215a..8e55b7718 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -100,8 +100,8 @@ def test_parse_text_request(self, mock_get_detection): output = parse_text_request(request) - assert_output = [{'entities': - {'entities': {'city': [ + assert_output = [{ + 'entities': {'entities': {'city': [ {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 'model_verified': False}, diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index 6f582c4fe..c24926134 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -34,6 +34,32 @@ class TextDetector(object): + """ + TextDetector detects multiple custom entities in text string by performing similarity searches against a list + fetched from elasticsearch datastore. + + TextDetector detects text type custom entities that do not adhere to some strict/weak formats which other entities + like date, time, email, etc do. Examples of such types of entites can be city, food dish name, brand names etc + + Attributes: + entities_dict_list (dict): dict with details of entities to be dected. Each entites will contailn: + `value`: name of the entity + + `_fuzziness` (str or int): If this parameter is str, elasticsearch's + auto is used with low and high term distances. Default low and high + term distances are 3 and 6 for elasticsearch. For this module they are + set to 4 and 7 respectively. + + In auto mode, if length of term is less than low it must match exactly, + if it is between [low, high) one insert/delete/substitution is allowed, + for anything higher than equal to high, two inserts/deletes/substitutions + are allowed + + `_min_token_size_for_fuzziness (int)`: minimum number of letters a word must + have to be considered for calculating edit distance with similar + ngrams from the datastore + processed_text (str): string with detected text entities removed + """ def __init__(self, entity_dict=None, source_language_script=lang_constant.ENGLISH_LANG, @@ -47,7 +73,6 @@ def __init__(self, entity_dict=None, self._fuzziness = "auto:4,7" self._fuzziness_lo, self._fuzziness_hi = 4, 7 self._min_token_size_for_fuzziness = self._fuzziness_lo - # self.set_fuzziness_threshold(fuzziness=(self._fuzziness_lo, self._fuzziness_hi)) # defaults for non-auto mode self.set_fuzziness_threshold(fuzziness=1) @@ -236,6 +261,17 @@ def _get_tokens_and_indices(txt): return u' '.join(matched_tokens) def _get_text_detection_with_variants(self): + """ + This function will normalise the message by breaking it into trigrams, bigrams and unigrams. The generated + ngrams will be used to create query to retrieve search results from datastore. 
These results will contain list + of dictionary where for each item key will be variant and value will be entity value this will be further + processed to get the original text which has been identified and will return the results + + Returns: + tuple: + list of lists: list of dict for each message with key as entity name + containing the detected text entities and original message. + """ texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] @@ -305,6 +341,27 @@ def _get_text_detection_with_variants(self): return final_list def _get_entity_substring_from_text(self, text, variant, entity_name): + """ + Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance + and return the closest substring in the text that matches the variant. + For each entity fuziness and min_token_size_for_fuzziness is used from the entity details. + Args: + variant(str or unicode): string, ngram of variant to fuzzy detect in the text using + Levenshtein distance + text(str or unicode): sentence from self.processed on which detection is being done + entity_name (str): name of the entity to get fuzziness and min_token_lenght value + Returns: + str or unicode or None: part of the given text that was detected as entity given the variant, + None otherwise + Example: + >>> text_detector = TextDetector('city') + >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower() + >>> text_detector.detect_entity(text) + >>> text_detector._get_entity_substring_from_text(variant='chennai') + 'chennai' + >>> text_detector._get_entity_substring_from_text(variant='delhi') + 'delehi' + """ variant_tokens = TOKENIZER.tokenize(variant) text_tokens = TOKENIZER.tokenize(text) original_text_tokens = [] @@ -345,6 +402,19 @@ def _add_verification_source(values, verification_source_dict): return text_entity_verified_values def combine_results(self, values, original_texts, predetected_values): + """ + This method is used to combine the results provided by the datastore search and the + crf_model if trained. + Args: + values (list): List of values detected by datastore + original_texts (list): List of original texts present in the texts for which value shave been + detected + predetected_values (list): Entities detected by the models like crf etc. + Returns: + combined_values (list): List of dicts each dict consisting of the entity value and additionally + the keys for the datastore and crf model detection + combined_original_texts (list): List of original texts detected by the datastore and the crf model. + """ unprocessed_crf_original_texts = [] combined_values = self._add_verification_source( @@ -377,14 +447,47 @@ def combine_results(self, values, original_texts, predetected_values): def detect(self, message=None, structured_value=None, **kwargs): """ + This method will detect all textual entities over the single message. 
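As an aside to the `_get_entity_substring_from_text` docstring above: the token-by-token comparison it describes (exact match, or an edit distance within a per-entity threshold for tokens longer than `min_token_size_for_fuzziness`) can be sketched in isolation. The snippet below is a simplified illustration with hypothetical helper names; it uses a plain dynamic-programming Levenshtein distance and whitespace tokens instead of the module's weighted-levenshtein package and TOKENIZER, so it is not the library's implementation.

```python
# Simplified, standalone sketch of the token-level fuzzy matching described above.

def edit_distance(string1, string2):
    """Plain Levenshtein distance between two strings."""
    if len(string1) < len(string2):
        string1, string2 = string2, string1
    previous = list(range(len(string2) + 1))
    for i, c1 in enumerate(string1):
        current = [i + 1]
        for j, c2 in enumerate(string2):
            current.append(min(previous[j + 1] + 1,        # deletion
                               current[j] + 1,             # insertion
                               previous[j] + (c1 != c2)))  # substitution
        previous = current
    return previous[-1]


def fuzzy_substring(text_tokens, variant_tokens, fuzziness=1, min_token_size=4):
    """Return the span of text_tokens that matches variant_tokens, else None.

    A token pair matches if it is identical, or if the text token is longer
    than min_token_size and within `fuzziness` edits of the variant token.
    """
    window_size = len(variant_tokens)
    for start in range(len(text_tokens) - window_size + 1):
        window = text_tokens[start:start + window_size]
        for text_token, variant_token in zip(window, variant_tokens):
            same = text_token == variant_token
            fuzzy = (len(text_token) > min_token_size and
                     edit_distance(text_token, variant_token) <= fuzziness)
            if not (same or fuzzy):
                break
        else:
            return u' '.join(window)
    return None


if __name__ == '__main__':
    tokens = u'come to chennai tamil nadu i will visit delehi next year'.split()
    print(fuzzy_substring(tokens, [u'chennai']))  # -> 'chennai' (exact match)
    print(fuzzy_substring(tokens, [u'delhi']))    # -> 'delehi' (one edit away)
```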
+ After detection it will combine the result and outputs list of dictionary + for all the entities detected over message - Args: - message: - structured_value: - **kwargs: + Args: + message (str): message on which textual entities needs to be detected + structured_value(str): if this present it will preferred over message + **kwargs: other keyword arguments if required Returns: - + List of dict of all the entities with detected values of textual entites + + Examples: + + + entity_dict = { + 'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, + 'restaurant': {'fallback_value': None, 'use_fallback': True} + } + + text_detection = TextDetector(entity_dict) + text_detection.detect('Buy ticket to Chennai from Mumbai) + + output: + [ { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', + 'language': 'en'}, + {'entity_value': {'value': 'Chennai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Chennai', + 'language': 'en'} + ], + 'restaurant': [] + }] """ text = structured_value if structured_value else message @@ -436,13 +539,60 @@ def detect(self, message=None, structured_value=None, **kwargs): def detect_bulk(self, messages=None, **kwargs): """ + This method will detect all textual entities over the multiple message. + After detection it will combine the result and outputs list of dictionary + for all the entities detected over message - Args: - messages: - **kwargs: + Args: - Returns: + messages (list of str): list of message for which detection needs to be perform + **kwargs: other keyword arguments if required + Returns: + List of dict of all the entities with detected values of textual entites + + + example: + + entity_dict = { + 'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, + 'restaurant': {'fallback_value': None, 'use_fallback': True} + } + + text_detection = TextDetector(entity_dict) + text_detection.detect(['Buy ticket to Chennai from Mumbai', + 'I want to eat at dominoes']) + + output: + [ { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', + 'language': 'en'}, + {'entity_value': {'value': 'Chennai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Chennai', + 'language': 'en'} + ], + 'restaurant': []}, + { + + , + 'city': [], + 'restaurant': [ + {'entity_value': {'value': 'Domminoe's Pizza', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'dominoes', + 'language': 'en'} + ] + ] """ texts = messages @@ -490,16 +640,48 @@ def detect_bulk(self, messages=None, **kwargs): def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, detection_method_list=None, detection_language=ENGLISH_LANG): """ - + Format detected entity values for bulk detection Args: - entity_value_list: - original_text_list: - detection_method: - detection_method_list: - detection_language: + entity_values_list (list of lists): containing list of entity values which are identified from given + detection logic + original_texts_list (list of lists): containing list original values or actual values from + messages which are identified + detection_method (str, optional): how the entity was detected + i.e. 
whether from message, structured_value + or fallback, verified from model or not. + defaults to None + detection_method_list(list, optional): list containing how each entity was detected in the entity_value + list.If provided, this argument will be used over detection method + defaults to None + detection_language(str): ISO 639 code for language in which entity is detected Returns: - + list of lists of dict: list of lists containing dictionaries, each containing entity_value, + original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + Example Output: + [ + [ + { + "entity_value": entity_value_1, + "detection": detection_method, + "original_text": original_text_1 + }, + { + "entity_value": entity_value_2, + "detection": detection_method, + "original_text": original_text_2 + } + + ], + [ + { + "entity_value": entity_value, + "detection": detection_method, + "original_text": original_text + } + ] + ] """ if detection_method_list is None: detection_method_list = [] diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 309add5a4..ff9355019 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -28,15 +28,19 @@ def verify_text_request(request): entities = request_data.get("entities") if not message: + ner_logger.exception("Message param is not passed") raise KeyError("Message is required") if not entities: + ner_logger.exception("Entities param is not passed") raise KeyError("Entities dict is required") if not isinstance(message, list): + ner_logger.exception("Message param is not in correct format") raise TypeError("Message should be in format of list of string") if not isinstance(entities, dict): + ner_logger.exception("Entities param is not in correct format") raise TypeError("Entities should be dict of entity details") @@ -45,6 +49,9 @@ def get_text_detection(message, entity_dict, structured_value=None, bot_message= """ Get text detection for given message on given entities dict using TextDetector module. + + If the message is string type call TextDetector.detect() mwthod, if it is list + call TextDetector.detect_bulk() method. Else, it wol raise an error. Args: message: message to detect text on entity_dict: entity details dict @@ -74,12 +81,73 @@ def get_text_detection(message, entity_dict, structured_value=None, bot_message= def parse_text_request(request): """ - Parse text request coming from POST call on `/v2/text/` + Parse text request coming from POST call on `/v2/text/` and call the + get text detection. + Message to detect text can be: + + 1) Single entry in the list, for this we use `text_detector.detect` method. + Also for this case we check if the structured value or use_fallback is present. + + 2) For mulitple message, underlying code will call `text_detector.detect_bulk` method. + In this case we ignore structured valur or use_fallback for all the entities. 
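For reference, the validation rules documented for `verify_text_request` above can be exercised on a raw JSON body without a Django request object. The sketch below only restates the documented checks (message must be a non-empty list, entities a non-empty dict); the function name is hypothetical and it is not the view code itself.

```python
import json


def validate_text_request_body(raw_body):
    """Apply the documented /v2/text/ request checks to a raw JSON string."""
    request_data = json.loads(raw_body)
    message = request_data.get("message")
    entities = request_data.get("entities")

    if not message:
        raise KeyError("Message is required")
    if not entities:
        raise KeyError("Entities dict is required")
    if not isinstance(message, list):
        raise TypeError("Message should be in format of list of string")
    if not isinstance(entities, dict):
        raise TypeError("Entities should be dict of entity details")
    return request_data


if __name__ == '__main__':
    good = json.dumps({
        "message": ["I want to go to Jabalpur"],
        "entities": {"city": {"fallback_value": None, "use_fallback": False}},
    })
    print(validate_text_request_body(good)["message"])

    try:
        bad = json.dumps({"message": "not a list", "entities": {"city": {}}})
        validate_text_request_body(bad)
    except TypeError as err:
        print(err)  # -> Message should be in format of list of string
```

In the `/v2/text/` view shown earlier in this patch, any such KeyError or TypeError is caught and returned as an HTTP 400 response with `success` set to false.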
+ Args: request: request object - Returns: - output data + output data list for all the message + Examples: + Request Object: + { + "message": ["I want to go to Jabalpur"], + "bot_message": null, + "language_script": "en", + "source_language": "en", + "entities": { + "city": { + "structured_value": "Delhi", + "fallback_value": null, + "predetected_values": ["Mumbai"], + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + }, + "restaurant": { + "structured_value": null, + "fallback_value": null, + "predetected_values": null, + "fuzziness": null, + "min_token_len_fuzziness": null, + "use_fallback": false + } + } + } + output response: + [ + { + "entities": { + "restaurant": [], + "city": [ + { + "entity_value": { + "value": "New Delhi", + "datastore_verified": true, + "model_verified": false + }, + "detection": "structure_value_verified", + "original_text": "delhi", + "language": "en" + }, + { + "entity_value": { + "value": "Mumbai", + "datastore_verified": false, + "model_verified": true + }, + "detection": "structure_value_verified", + "original_text": "Mumbai", + "language": "en" + } + ] """ request_data = json.loads(request.body) message = request_data.get("message", []) @@ -156,12 +224,37 @@ def parse_text_request(request): def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): """ Generate default detection output for default fallback entities. + It will check if fallback_value is present if not it will return + empty list for that entity. + Args: entities_dict: dict of entities details language: language to run Returns: - TextDetection output for default fallback + TextDetection output (list of dict) for default fallback values + + Examples: + Input: + { + 'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, + 'restaurant': {'fallback_value': None, 'use_fallback': True} + } + + Output: + + { + 'city': [ + {'entity_value': {'value': 'Mumbai', + 'datastore_verified': False, + 'model_verified': False}, + 'detection': 'fallback_value', + 'original_text': 'Mumbai', + 'language': 'en'} + ], + 'restaurant': [] + } + """ output = {} if not entities_dict: From fc2d48790b9b7c33d7b2895c7e7c0a3a461d925a Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Mon, 21 Sep 2020 10:51:51 +0530 Subject: [PATCH 25/31] Fix documentation for text_detector module --- ner_v2/api.py | 10 +- .../textual/tests/test_text_detection.py | 2 +- ner_v2/detectors/textual/text_detection.py | 122 +++++++++--------- ner_v2/detectors/textual/utils.py | 1 - 4 files changed, 70 insertions(+), 65 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index d257a39ed..f8b82789d 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -674,6 +674,8 @@ def text(request): try: verify_text_request(request) + # if verify success parse request and get data + data = parse_text_request(request) except KeyError as err: response = {"success": False, "error": str(err)} @@ -685,9 +687,11 @@ def text(request): ner_logger.debug(response) return HttpResponse(json.dumps(response), content_type='application/json', status=400) - - # if verify success parse request and get data - data = parse_text_request(request) + except Exception as err: + response = {"success": False, "error": str(err)} + ner_logger.debug(response) + return HttpResponse(json.dumps(response), content_type='application/json', + status=400) if data: response = {"success": True, "error": None, "data": data} diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py 
index 144a7a4a6..2fc4d20c5 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -40,7 +40,7 @@ def test_text_detector_intialization(self): self.assertEqual(language, text_detector._source_language_script) self.assertEqual(target_language_script, text_detector._target_language_script) - self.assertDictEqual(entity_dict, text_detector.entities_dict_list) + self.assertDictEqual(entity_dict, text_detector.entities_dict) @patch('ner_v2.detectors.textual.elastic_search.' 'ElasticSearchDataStore.get_multi_entity_results') diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index c24926134..dfe6e613f 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -42,22 +42,22 @@ class TextDetector(object): like date, time, email, etc do. Examples of such types of entites can be city, food dish name, brand names etc Attributes: - entities_dict_list (dict): dict with details of entities to be dected. Each entites will contailn: - `value`: name of the entity - - `_fuzziness` (str or int): If this parameter is str, elasticsearch's - auto is used with low and high term distances. Default low and high - term distances are 3 and 6 for elasticsearch. For this module they are - set to 4 and 7 respectively. - - In auto mode, if length of term is less than low it must match exactly, - if it is between [low, high) one insert/delete/substitution is allowed, - for anything higher than equal to high, two inserts/deletes/substitutions - are allowed - - `_min_token_size_for_fuzziness (int)`: minimum number of letters a word must - have to be considered for calculating edit distance with similar - ngrams from the datastore + entities_dict (dict): dict with details of entities to be dected. Each entites will contailn: + `value`: name of the entity + + `_fuzziness` (str or int): If this parameter is str, elasticsearch's + auto is used with low and high term distances. Default low and high + term distances are 3 and 6 for elasticsearch. For this module they are + set to 4 and 7 respectively. + + In auto mode, if length of term is less than low it must match exactly, + if it is between [low, high) one insert/delete/substitution is allowed, + for anything higher than equal to high, two inserts/deletes/substitutions + are allowed + + `_min_token_size_for_fuzziness (int)`: minimum number of letters a word must + have to be considered for calculating edit distance with similar + ngrams from the datastore processed_text (str): string with detected text entities removed """ @@ -65,6 +65,9 @@ def __init__(self, entity_dict=None, source_language_script=lang_constant.ENGLISH_LANG, target_language_script=ENGLISH_LANG): + # define entities to detect + self.entities_dict = entity_dict + self.processed_text = None self.__texts = [] self.__processed_texts = [] @@ -83,9 +86,6 @@ def __init__(self, entity_dict=None, self._source_language_script = source_language_script self._target_language_script = target_language_script - # define entities to detect - self.entities_dict_list = entity_dict - def _reset_state(self): """ Reset all the intermediary states of detection class. 
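The `auto:4,7` fuzziness default set in `__init__` follows the Elasticsearch convention restated in the class docstring: terms shorter than the low cutoff must match exactly, terms between the cutoffs allow one edit, and longer terms allow two. The mapping can be made concrete with a small illustrative function (not part of the codebase):

```python
# Illustrative only: the length-to-edit-distance mapping implied by an
# Elasticsearch-style "auto:low,high" fuzziness, with the module defaults 4 and 7.

def allowed_edits(token, low=4, high=7):
    """Maximum edit distance allowed for `token` under auto:low,high."""
    if len(token) < low:
        return 0  # shorter terms must match exactly
    elif len(token) < high:
        return 1  # one insert/delete/substitution allowed
    return 2      # two inserts/deletes/substitutions allowed


if __name__ == '__main__':
    for word in ('goa', 'mumbai', 'ahmedabad'):
        print(word, '->', allowed_edits(word))  # 0, 1 and 2 respectively
```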
@@ -110,7 +110,7 @@ def set_fuzziness_threshold(self, fuzziness): Note that this also sets _min_token_size_for_fuzziness to first value of the iterable If this argument is int, elasticsearch will set fuzziness as min(2, fuzziness) - """ + """ try: iter(fuzziness) if len(fuzziness) == 2: @@ -154,8 +154,8 @@ def _get_fuzziness_threshold_for_token(self, token, fuzziness=None): def set_min_token_size_for_levenshtein(self, min_size): """ - Sets the minimum number of letters a word must have to be considered for calculating edit distance with similar - ngrams from the datastore + Sets the minimum number of letters a word must have to be considered for calculating edit + distance with similar ngrams from the datastore Args: min_size: integer, maximum allowed Levenshtein's distance from the word/phrase being tested for @@ -197,7 +197,8 @@ def _get_substring_from_processed_text(text, matched_tokens): In: matched_tokens = [u'1', u'pc', u'hot', u'crispy'] Out: 1 pc hot & crispy - Notice that & is dropped during tokenization but when finding original text, we recover it from processed text + Notice that & is dropped during tokenization but when finding original text, + we recover it from processed text """ def _get_tokens_and_indices(txt): @@ -262,22 +263,24 @@ def _get_tokens_and_indices(txt): def _get_text_detection_with_variants(self): """ - This function will normalise the message by breaking it into trigrams, bigrams and unigrams. The generated - ngrams will be used to create query to retrieve search results from datastore. These results will contain list - of dictionary where for each item key will be variant and value will be entity value this will be further - processed to get the original text which has been identified and will return the results - Returns: - tuple: - list of lists: list of dict for each message with key as entity name - containing the detected text entities and original message. + This function will normalise the message by breaking it into trigrams, bigrams and unigrams. + The generated ngrams will be used to create query to retrieve search results from datastore. + These results will contain list of dictionary where for each item key will be variant and + value will be entity value this will be further processed to get the original text which has + been identified and will return the results + + Returns: + tuple: + list of lists: list of dict for each message with key as entity name + containing the detected text entities and original message. 
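The "1 pc hot & crispy" example in the `_get_substring_from_processed_text` docstring above can be reproduced with a rough standalone sketch: find the matched tokens by character offset and slice the original text from the first to the last of them, so characters dropped by tokenization are recovered. This simplified version uses a `\w+` regex in place of the module's TOKENIZER and is not the static method itself.

```python
import re


def recover_original_substring(text, matched_tokens):
    """Slice `text` across the matched tokens so dropped characters (like '&')
    are recovered from the original string."""
    tokens = [(m.group(), m.start(), m.end()) for m in re.finditer(r'\w+', text)]
    words = [tok for tok, _, _ in tokens]
    size = len(matched_tokens)
    for i in range(len(words) - size + 1):
        if words[i:i + size] == matched_tokens:
            start, end = tokens[i][1], tokens[i + size - 1][2]
            return text[start:end]
    return None


if __name__ == '__main__':
    print(recover_original_substring(
        u'i want to order 1 pc hot & crispy from kfc',
        [u'1', u'pc', u'hot', u'crispy']))  # -> '1 pc hot & crispy'
```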
""" texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] - entities_dict_list = self.entities_dict_list - es_results = self.esdb.get_multi_entity_results(entities=list(entities_dict_list), + entities_dict = self.entities_dict + es_results = self.esdb.get_multi_entity_results(entities=list(entities_dict), texts=texts, fuzziness_threshold=self._fuzziness, search_language_script=self._target_language_script @@ -285,7 +288,7 @@ def _get_text_detection_with_variants(self): final_list = [] for index, entity_result in enumerate(es_results): result_list = {} - for each_key in entities_dict_list.keys(): + for each_key in entities_dict.keys(): original_final_list = [] value_final_list = [] @@ -354,9 +357,8 @@ def _get_entity_substring_from_text(self, text, variant, entity_name): str or unicode or None: part of the given text that was detected as entity given the variant, None otherwise Example: - >>> text_detector = TextDetector('city') + >>> text_detector = TextDetector(entity_dict={'city':{}) >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower() - >>> text_detector.detect_entity(text) >>> text_detector._get_entity_substring_from_text(variant='chennai') 'chennai' >>> text_detector._get_entity_substring_from_text(variant='delhi') @@ -371,7 +373,7 @@ def _get_entity_substring_from_text(self, text, variant, entity_name): same = variant_token == text_token # get fuzziness and min_token_size_for_fuziness value from entity dict - entity_dict = self.entities_dict_list.get(entity_name, {}) + entity_dict = self.entities_dict.get(entity_name, {}) fuzziness = entity_dict.get('fuzziness') min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness') @@ -463,8 +465,8 @@ def detect(self, message=None, structured_value=None, **kwargs): entity_dict = { - 'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, - 'restaurant': {'fallback_value': None, 'use_fallback': True} + 'city': {'fallback_value': 'Mumbai', 'use_fallback': False}, + 'restaurant': {'fallback_value': None, 'use_fallback': False} } text_detection = TextDetector(entity_dict) @@ -474,15 +476,15 @@ def detect(self, message=None, structured_value=None, **kwargs): [ { 'city': [ {'entity_value': {'value': 'Mumbai', - 'datastore_verified': False, + 'datastore_verified': True, 'model_verified': False}, - 'detection': 'fallback_value', + 'detection': 'message', 'original_text': 'Mumbai', 'language': 'en'}, {'entity_value': {'value': 'Chennai', - 'datastore_verified': False, + 'datastore_verified': True, 'model_verified': False}, - 'detection': 'fallback_value', + 'detection': 'message', 'original_text': 'Chennai', 'language': 'en'} ], @@ -502,7 +504,7 @@ def detect(self, message=None, structured_value=None, **kwargs): values, texts = [], [] text_entity_values, original_texts = value # get predetected value from entity dict - entity_dict = self.entities_dict_list.get(entity, {}) + entity_dict = self.entities_dict.get(entity, {}) predetected_values = entity_dict.get('predetected_values') or [] # get fallback value from entity dict @@ -555,8 +557,8 @@ def detect_bulk(self, messages=None, **kwargs): example: entity_dict = { - 'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, - 'restaurant': {'fallback_value': None, 'use_fallback': True} + 'city': {'fallback_value': 'Mumbai', 'use_fallback': False}, + 'restaurant': {'fallback_value': None, 'use_fallback': False} } text_detection = TextDetector(entity_dict) @@ -567,28 +569,26 @@ def detect_bulk(self, messages=None, 
**kwargs): [ { 'city': [ {'entity_value': {'value': 'Mumbai', - 'datastore_verified': False, + 'datastore_verified': True, 'model_verified': False}, - 'detection': 'fallback_value', + 'detection': 'message', 'original_text': 'Mumbai', 'language': 'en'}, {'entity_value': {'value': 'Chennai', - 'datastore_verified': False, + 'datastore_verified': True, 'model_verified': False}, - 'detection': 'fallback_value', + 'detection': 'message', 'original_text': 'Chennai', 'language': 'en'} ], 'restaurant': []}, { - - , 'city': [], 'restaurant': [ {'entity_value': {'value': 'Domminoe's Pizza', - 'datastore_verified': False, + 'datastore_verified': True, 'model_verified': False}, - 'detection': 'fallback_value', + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'} ] @@ -597,7 +597,6 @@ def detect_bulk(self, messages=None, **kwargs): texts = messages self._process_text(texts) - res_list = self._get_text_detection_with_variants() data_list = [] for index, res in enumerate(res_list): @@ -606,7 +605,7 @@ def detect_bulk(self, messages=None, **kwargs): entities[entity] = [] values, texts = [], [] # get predetected value from entity dict - entity_dict = self.entities_dict_list.get(entity, {}) + entity_dict = self.entities_dict.get(entity, {}) predetected_values = entity_dict.get('predetected_values') or [] # get fallback value from entity dict @@ -640,8 +639,10 @@ def detect_bulk(self, messages=None, **kwargs): def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, detection_method_list=None, detection_language=ENGLISH_LANG): """ - Format detected entity values for bulk detection - Args: + Format detected entity values for bulk detection + + Args: + entity_values_list (list of lists): containing list of entity values which are identified from given detection logic original_texts_list (list of lists): containing list original values or actual values from @@ -655,11 +656,12 @@ def output_entity_dict_list(entity_value_list, original_text_list, detection_met defaults to None detection_language(str): ISO 639 code for language in which entity is detected - Returns: + Returns: + list of lists of dict: list of lists containing dictionaries, each containing entity_value, original_text and detection; entity_value is in itself a dict with its keys varying from entity to entity - Example Output: + Example Output: [ [ { diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index ff9355019..201ac23ca 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -172,7 +172,6 @@ def parse_text_request(request): data.append({"entities": {}, "language": source_language}) for each_entity, value in entities.items(): - structured_value = value.get('structured_value') use_fallback = value.get('use_fallback', False) From 06770195bc0395acb882bd626a2a046598741535 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Tue, 22 Sep 2020 02:17:20 +0530 Subject: [PATCH 26/31] - Modified ESDatastore and text detection to support for using only single msearch query for structured and normat text entities - Add validation on number of messages and texts for bulk detection --- ner_constants.py | 5 + ner_v2/detectors/textual/elastic_search.py | 58 ++-- .../textual/tests/test_elastic_search.py | 19 ++ ner_v2/detectors/textual/tests/test_utils.py | 16 +- ner_v2/detectors/textual/text_detection.py | 313 +++++++++++++----- ner_v2/detectors/textual/utils.py | 88 ++--- 6 files changed, 332 insertions(+), 167 deletions(-) diff --git 
a/ner_constants.py b/ner_constants.py index f535c14b0..fc279bfa2 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -25,6 +25,11 @@ ENTITY_VALUE_DICT_KEY = 'value' +# datastore_verified a key to verify value from the datastore +DATASTORE_VERIFIED = 'datastore_verified' +# model_verified a key to verify value from the model +MODEL_VERIFIED = 'model_verified' + # ************************ constants tell us what to do with structured_value ************************ # This will execute entity detection on the structured_value. STRUCTURED = 0 diff --git a/ner_v2/detectors/textual/elastic_search.py b/ner_v2/detectors/textual/elastic_search.py index d8db9ef8d..a613c2833 100644 --- a/ner_v2/detectors/textual/elastic_search.py +++ b/ner_v2/detectors/textual/elastic_search.py @@ -10,6 +10,7 @@ from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import constants from datastore.exceptions import DataStoreSettingsImproperlyConfiguredException +from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.textual.queries import _generate_multi_entity_es_query, \ _parse_multi_entity_es_results @@ -30,9 +31,15 @@ def __init__(self): self._connection = None self._index_name = None + self.query_data = [] + # configure variables and connection self._configure_store() + # define doc type + self.doc_type = self._connection_settings[ + constants.ELASTICSEARCH_DOC_TYPE] + def _configure_store(self, **kwargs): """ Configure self variables and connection. @@ -100,17 +107,28 @@ def _check_doc_type_for_elasticsearch(self): raise DataStoreSettingsImproperlyConfiguredException( 'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment') + def generate_query_data(self, entities, texts, fuzziness_threshold=1, + search_language_script=ENGLISH_LANG): + + # check if text is string + if isinstance(texts, str): + texts = [texts] + + index_header = json.dumps({'index': self._index_name, 'type': self.doc_type}) + + data = list(chain.from_iterable([[index_header, + json.dumps(_generate_multi_entity_es_query( + entities=entities, + text=each, + fuzziness_threshold=fuzziness_threshold, + language_script=search_language_script))] + for each in texts])) + + return data + def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, - search_language_script=None, **kwargs): + search_language_script=ENGLISH_LANG, **kwargs): """ - Args: - entities: the list of entities to lookup in the datastore for getting entity values - and their variants - texts(list of strings): the text for which variants need to be find out - fuzziness_threshold: fuzziness allowed for search results on entity value variants - search_language_script: language script for ES search - kwargs: - Returns: list of collections.OrderedDict: dictionary mapping each entity for each text with their value variants to entity value @@ -149,23 +167,15 @@ def get_multi_entity_results(self, entities, texts, fuzziness_threshold=1, request_timeout = self._connection_settings.get('request_timeout', 20) index_name = self._index_name - doc_type = self._connection_settings[ - constants.ELASTICSEARCH_DOC_TYPE] - - index_header = json.dumps({'index': self._index_name, 'type': doc_type}) - - data = list(chain.from_iterable([[index_header, - json.dumps(_generate_multi_entity_es_query( - entities=entities, - text=each, - fuzziness_threshold=fuzziness_threshold, - language_script=search_language_script))] - for each in texts])) + data = [] + for entity_list, text_list in zip(entities, texts): + 
data.extend(self.generate_query_data(entity_list, text_list, fuzziness_threshold, + search_language_script)) - # add `\n` for each index_header and text entry - data = '\n'.join(data) + # add `\n` for each index_header and query data text entry + query_data = '\n'.join(data) - kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name, + kwargs = dict(body=query_data, doc_type=self.doc_type, index=index_name, request_timeout=request_timeout) results = self._run_es_search(self._connection, **kwargs) diff --git a/ner_v2/detectors/textual/tests/test_elastic_search.py b/ner_v2/detectors/textual/tests/test_elastic_search.py index 3a88cd30c..ad083771f 100644 --- a/ner_v2/detectors/textual/tests/test_elastic_search.py +++ b/ner_v2/detectors/textual/tests/test_elastic_search.py @@ -62,3 +62,22 @@ def test_elasticsearch_get_dynamic_fuzziness_threshold(self): fuzzy_threshold = ElasticSearchDataStore._get_dynamic_fuzziness_threshold(fuzzy) self.assertEqual(fuzzy_threshold, 'auto') + + def test_add_query(self): + es = ElasticSearchDataStore() + + entity_list_1 = ['city', 'restaurant'] + text_1 = "I want to go to mumbai" + + query_data = es.generate_query_data(entities=entity_list_1, texts=text_1) + + assert_data = ['{"index": "entity_data", "type": "data_dictionary"}', + '{"_source": ["value", "entity_data"], ' + '"query": {"bool": {"filter": [{"terms": {"entity_data":' + ' ["city", "restaurant"]}}, {"terms": {"language_script": ["en"]}}],' + ' "should": [{"match": {"variants": {"query": "I want to go to mumbai",' + ' "fuzziness": 1, "prefix_length": 1}}}], "minimum_should_match": 1}},' + ' "highlight": {"fields": {"variants": {"type": "unified"}},' + ' "order": "score", "number_of_fragments": 20}, "size": 10000}'] + + self.assertListEqual(query_data, assert_data) diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index 8e55b7718..fa603f654 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -18,8 +18,8 @@ class TestTextualUtils(TestCase): def test_get_output_for_fallback_entities(self): - input_data = {'city': {'fallback_value': 'Mumbai', 'use_fallback': True}, - 'restaurant': {'fallback_value': None, 'use_fallback': True}} + input_data = {'city': {'fallback_value': 'Mumbai', 'ignore_message': True}, + 'restaurant': {'fallback_value': None, 'ignore_message': True}} assert_output_data = {'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': False, @@ -72,7 +72,7 @@ def test_parse_text_request(self, mock_get_detection): "predetected_values": None, "fuzziness": 4, "min_token_len_fuzziness": 4, - "use_fallback": None + "ignore_message": None }, "restaurant": { @@ -81,7 +81,7 @@ def test_parse_text_request(self, mock_get_detection): "predetected_values": None, "fuzziness": None, "min_token_len_fuzziness": None, - "use_fallback": True + "ignore_message": True }, } @@ -125,7 +125,7 @@ def test_parse_text_request_structured(self, mock_get_detection): "predetected_values": None, "fuzziness": 4, "min_token_len_fuzziness": 4, - "use_fallback": None + "ignore_message": None }, "restaurant": { @@ -134,7 +134,7 @@ def test_parse_text_request_structured(self, mock_get_detection): "predetected_values": None, "fuzziness": None, "min_token_len_fuzziness": None, - "use_fallback": True + "ignore_message": True }, } @@ -165,7 +165,7 @@ def test_get_text_detection_string_message(self, mock_es_query): 'predetected_values': None, 'fuzziness': 4, 'min_token_len_fuzziness': 4, - 
'use_fallback': None}} + 'ignore_message': None}} message = "I want to go to Mumbai" @@ -193,7 +193,7 @@ def test_get_text_detection_list_message(self, mock_es_query): 'predetected_values': None, 'fuzziness': 4, 'min_token_len_fuzziness': 4, - 'use_fallback': None}} + 'ignore_message': None}} message = ["I want to go to Mumbai", "I want to go to Delhi"] diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index dfe6e613f..968d4a596 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -15,9 +15,10 @@ from lib.nlp.levenshtein_distance import edit_distance from six.moves import range -from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, FROM_MESSAGE, - FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, - DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) +from ner_constants import (FROM_STRUCTURE_VALUE_VERIFIED, FROM_STRUCTURE_VALUE_NOT_VERIFIED, + FROM_MESSAGE, FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, + DETECTION_METHOD, DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) + from ner_v1.constant import DATASTORE_VERIFIED, MODEL_VERIFIED from language_utilities.constant import ENGLISH_LANG @@ -83,6 +84,7 @@ def __init__(self, entity_dict=None, # define data store and target languages self.esdb = ElasticSearchDataStore() + self._source_language_script = source_language_script self._target_language_script = target_language_script @@ -90,7 +92,6 @@ def _reset_state(self): """ Reset all the intermediary states of detection class. """ - self.tagged_text = None self.processed_text = None self.__texts = [] self.__processed_texts = [] @@ -164,6 +165,11 @@ def set_min_token_size_for_levenshtein(self, min_size): self._min_token_size_for_fuzziness = min_size def _process_text(self, texts): + """ + This will pre-process texts for detection + Args: + texts: list of message strings + """ self._reset_state() for text in texts: text = text.lower() @@ -261,84 +267,193 @@ def _get_tokens_and_indices(txt): return u' '.join(matched_tokens) - def _get_text_detection_with_variants(self): + def _process_es_result(self, entity_result, entity_list, text, + processed_text): """ + Process ElasticSearch results which will contain list of dictionary where for + each item key will be variant and value will be entity value this will be + processed to get the original text which has been identified and will + return the results dictionary for each entity detected + + Args: + entity_result: ES result for entity + entity_list: List of entity for which ES query ran + text: original text message + processed_text: processed text on which detection ran + + Returns: + result_dict: dictionary with detected text and original text for + each entity + + """ + result_dict = {} + + for each_key in entity_list: + original_final_list = [] + value_final_list = [] + variants_to_values = collections.OrderedDict() + original_final_list_ = [] + value_final_list_ = [] + + _variants_to_values = entity_result.get(each_key, []) + + if not _variants_to_values: + result_dict[each_key] = ([], []) + continue + + for variant, value in iteritems(_variants_to_values): + variant = variant.lower() + if isinstance(variant, bytes): + variant = variant.decode('utf-8') + + variants_to_values[variant] = value + variants_list = list(variants_to_values.keys()) + + exact_matches, fuzzy_variants = [], [] + + for variant in variants_list: + if u' '.join(TOKENIZER.tokenize(variant)) in text: + 
exact_matches.append(variant) + else: + fuzzy_variants.append(variant) + + exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + + variants_list = exact_matches + fuzzy_variants + for variant in variants_list: + + original_text = self._get_entity_substring_from_text(processed_text, + variant, each_key) + if original_text: + value_final_list.append(variants_to_values[variant]) + original_final_list.append(original_text) + boundary_punct_pattern = re.compile( + r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) + original_text_ = boundary_punct_pattern.sub("", original_text) + + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) + tag = '__' + each_key + '__' + processed_text = _pattern.sub(tag, processed_text) + + value_final_list_.append(value_final_list) + original_final_list_.append(original_final_list) + + result_dict[each_key] = (value_final_list_, original_final_list_) + + return result_dict + + def _get_single_text_detection_with_variants(self, message): + """ + This function will normalise the message by breaking it into trigrams, + bigrams and unigrams. - This function will normalise the message by breaking it into trigrams, bigrams and unigrams. The generated ngrams will be used to create query to retrieve search results from datastore. + These results will contain list of dictionary where for each item key will be variant and value will be entity value this will be further processed to get the original text which has been identified and will return the results Returns: - tuple: - list of lists: list of dict for each message with key as entity name + list of dict: list of dict for each message with key as entity name containing the detected text entities and original message. 
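The grouping that `_get_single_text_detection_with_variants` goes on to perform in the body below, where each entity carrying a structured_value is searched against that value while the remaining entities share a single search against the message, can be illustrated independently of Elasticsearch. `build_search_groups` is a hypothetical helper name used only for this sketch.

```python
def build_search_groups(message, entities_dict):
    """Split entities into (entity_list, text) pairs for the msearch call:
    one pair per structured_value entity, one shared pair for the rest."""
    entity_lists, texts = [], []
    plain_entities = []

    for name, detail in entities_dict.items():
        structured_value = (detail or {}).get('structured_value')
        if structured_value:
            entity_lists.append([name])   # searched against its structured value
            texts.append(structured_value)
        else:
            plain_entities.append(name)   # searched together against the message

    if plain_entities:
        entity_lists.append(plain_entities)
        texts.append(message)

    return entity_lists, texts


if __name__ == '__main__':
    print(build_search_groups(
        'Buy a ticket to Chennai',
        {'city': {'structured_value': None},
         'brand': {'structured_value': 'Nike'},
         'restaurant': {'structured_value': None}}))
    # -> ([['brand'], ['city', 'restaurant']], ['Nike', 'Buy a ticket to Chennai'])
```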
""" + entities_dict = self.entities_dict + es_entity_list = [] + structured_value_entities_list = [] + text_value_entities_list = [] + texts = [] + + for each_entity, value in entities_dict.items(): + structured_value = value.get('structured_value') + + if structured_value: + # add entity list and text for each structured entity + # for ES query + es_entity_list.append([each_entity]) + structured_value_entities_list.append(each_entity) + texts.append(structured_value) + else: + text_value_entities_list.append(each_entity) + + if text_value_entities_list: + # add entity list and text for all other textual + # entity for ES query + es_entity_list.append(text_value_entities_list) + texts.append(message) + + # pre-process text + self._process_text(texts) texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] - entities_dict = self.entities_dict - es_results = self.esdb.get_multi_entity_results(entities=list(entities_dict), + # fetch ES datastore search result + es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, texts=texts, fuzziness_threshold=self._fuzziness, search_language_script=self._target_language_script ) + final_list = [] + result_dict = {} + for index, entity_result in enumerate(es_results): - result_list = {} - for each_key in entities_dict.keys(): + processed_text = self.__processed_texts[index] + text = texts[index] + entity_list = es_entity_list[index] + result_dict.update(self._process_es_result(entity_result=entity_result, + entity_list=entity_list, + text=text, processed_text=processed_text)) - original_final_list = [] - value_final_list = [] - variants_to_values = collections.OrderedDict() - original_final_list_ = [] - value_final_list_ = [] + final_list.append(result_dict) - _variants_to_values = entity_result.get(each_key, []) + return final_list - if not _variants_to_values: - result_list[each_key] = ([], []) - continue - for variant, value in iteritems(_variants_to_values): - variant = variant.lower() - if isinstance(variant, bytes): - variant = variant.decode('utf-8') - - variants_to_values[variant] = value - variants_list = list(variants_to_values.keys()) - - exact_matches, fuzzy_variants = [], [] - _text = texts - for variant in variants_list: - if u' '.join(TOKENIZER.tokenize(variant)) in _text[index]: - exact_matches.append(variant) - else: - fuzzy_variants.append(variant) - exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) - fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) - - variants_list = exact_matches + fuzzy_variants - for variant in variants_list: - - original_text = self._get_entity_substring_from_text(self.__processed_texts[index], - variant, each_key) - if original_text: - value_final_list.append(variants_to_values[variant]) - original_final_list.append(original_text) - boundary_punct_pattern = re.compile( - r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) - original_text_ = boundary_punct_pattern.sub("", original_text) - - _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) - tag = '__' + each_key + '__' - self.__processed_texts[index] = _pattern.sub(tag, self.__processed_texts[index]) - value_final_list_.append(value_final_list) - original_final_list_.append(original_final_list) - - result_list[each_key] = (value_final_list_, original_final_list_) + def _get_bulk_text_detection_with_variants(self, messages): + """ + This function will normalise the message by breaking it into trigrams, bigrams and 
unigrams. + The generated ngrams will be used to create query to retrieve search results from datastore. + These results will contain list of dictionary where for each item key will be variant and + value will be entity value this will be further processed to get the original text which has + been identified and will return the results + + Args: + messages (list of str): list of message for which detection needs to be perform + + Returns: + tuple: + list of lists: list of dict for each message with key as entity name + containing the detected text entities and original message. + """ + + self._process_text(messages) + + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for + processed_text in self.__processed_texts] + + entity_list = list(self.entities_dict) + + # entity list for ES search should be list of entities + # for all list of texts + es_entity_list = [entity_list] + es_texts = [texts] + + # fetch ES datastore search result + es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, + texts=es_texts, + fuzziness_threshold=self._fuzziness, + search_language_script=self._target_language_script + ) + + final_list = [] + for index, entity_result in enumerate(es_results): + processed_text = self.__processed_texts[index] + text = texts[index] + result_list = self._process_es_result(entity_result=entity_result, + entity_list=entity_list, + text=text, processed_text=processed_text) final_list.append(result_list) return final_list @@ -405,17 +520,17 @@ def _add_verification_source(values, verification_source_dict): def combine_results(self, values, original_texts, predetected_values): """ - This method is used to combine the results provided by the datastore search and the - crf_model if trained. - Args: - values (list): List of values detected by datastore - original_texts (list): List of original texts present in the texts for which value shave been - detected - predetected_values (list): Entities detected by the models like crf etc. - Returns: - combined_values (list): List of dicts each dict consisting of the entity value and additionally - the keys for the datastore and crf model detection - combined_original_texts (list): List of original texts detected by the datastore and the crf model. + This method is used to combine the results provided by the datastore search and the + crf_model if trained. + Args: + values (list): List of values detected by datastore + original_texts (list): List of original texts present in the texts for which value shave been + detected + predetected_values (list): Entities detected by the models like crf etc. + Returns: + combined_values (list): List of dicts each dict consisting of the entity value and additionally + the keys for the datastore and crf model detection + combined_original_texts (list): List of original texts detected by the datastore and the crf model. 
""" unprocessed_crf_original_texts = [] @@ -423,15 +538,17 @@ def combine_results(self, values, original_texts, predetected_values): values=values, verification_source_dict={DATASTORE_VERIFIED: True, MODEL_VERIFIED: False} ) combined_original_texts = original_texts + for i in range(len(predetected_values)): match = False for j in range(len(original_texts)): - if predetected_values[i] == original_texts[j]: + if predetected_values[i].lower() == original_texts[j]: combined_values[j][MODEL_VERIFIED] = True match = True break elif re.findall(r'\b%s\b' % re.escape(predetected_values[i]), original_texts[j]): - # If predetected value is a substring of some value detected by datastore, skip it from output + # If predetected value is a substring of some value detected by datastore, + # skip it from output match = True break if not match: @@ -447,15 +564,18 @@ def combine_results(self, values, original_texts, predetected_values): return combined_values, combined_original_texts - def detect(self, message=None, structured_value=None, **kwargs): + def detect(self, message=None, **kwargs): """ - This method will detect all textual entities over the single message. - After detection it will combine the result and outputs list of dictionary - for all the entities detected over message + This method will detect all textual entities over the single message. - Args: - message (str): message on which textual entities needs to be detected - structured_value(str): if this present it will preferred over message + If structured value is present for any given entity it will be preferred + over message and a new ES query is added with text as structured value. + + After detection it will combine the result and outputs list of dictionary + for all the entities detected over message + + Args: + message (str): message on which textual entities needs to be detected **kwargs: other keyword arguments if required Returns: @@ -463,14 +583,15 @@ def detect(self, message=None, structured_value=None, **kwargs): Examples: - entity_dict = { - 'city': {'fallback_value': 'Mumbai', 'use_fallback': False}, - 'restaurant': {'fallback_value': None, 'use_fallback': False} + 'city': {'structured_value': None, 'fallback_value': None}, + 'restaurant': {'structured_value': None, 'fallback_value': None}, + 'brand' : {'structured_value': 'Nike', 'fallback_value': None}, } text_detection = TextDetector(entity_dict) - text_detection.detect('Buy ticket to Chennai from Mumbai) + + text_detection.detect('Buy ticket to Chennai from Mumbai') output: [ { @@ -481,6 +602,7 @@ def detect(self, message=None, structured_value=None, **kwargs): 'detection': 'message', 'original_text': 'Mumbai', 'language': 'en'}, + {'entity_value': {'value': 'Chennai', 'datastore_verified': True, 'model_verified': False}, @@ -488,30 +610,39 @@ def detect(self, message=None, structured_value=None, **kwargs): 'original_text': 'Chennai', 'language': 'en'} ], - 'restaurant': [] + 'restaurant': [], + 'brand': [ + {'entity_value': {'value': 'Nike', + 'datastore_verified': True, + 'model_verified': False}, + "detection": "structure_value_verified", + 'original_text': 'Nike', + 'language': 'en'}] }] """ - text = structured_value if structured_value else message - self._process_text([text]) - res_list = self._get_text_detection_with_variants() + res_list = self._get_single_text_detection_with_variants(message) data_list = [] for index, res in enumerate(res_list): entities = {} + for entity, value in res.items(): entities[entity] = [] values, texts = [], [] text_entity_values, 
original_texts = value - # get predetected value from entity dict entity_dict = self.entities_dict.get(entity, {}) + + # get structured value from entity dict + structured_value = entity_dict.get('structured_value') + + # get predetected value from entity dict predetected_values = entity_dict.get('predetected_values') or [] # get fallback value from entity dict fallback_value = entity_dict.get('fallback_value') if text_entity_values and original_texts: - self.processed_text = self.__processed_texts[0] values, texts = text_entity_values[0], original_texts[0] entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, @@ -595,10 +726,9 @@ def detect_bulk(self, messages=None, **kwargs): ] """ - texts = messages - self._process_text(texts) - res_list = self._get_text_detection_with_variants() + res_list = self._get_bulk_text_detection_with_variants(messages) data_list = [] + for index, res in enumerate(res_list): entities = {} for entity, value in res.items(): @@ -613,7 +743,6 @@ def detect_bulk(self, messages=None, **kwargs): text_entity_values, original_texts = value if text_entity_values and original_texts: - self.processed_text = self.__processed_texts[0] values, texts = text_entity_values[0], original_texts[0] entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 201ac23ca..3e0ff4529 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -6,14 +6,20 @@ from chatbot_ner.config import ner_logger from language_utilities.constant import ENGLISH_LANG -from ner_constants import FROM_FALLBACK_VALUE +from ner_constants import (DATASTORE_VERIFIED, MODEL_VERIFIED, + FROM_FALLBACK_VALUE, ORIGINAL_TEXT, ENTITY_VALUE, DETECTION_METHOD, + DETECTION_LANGUAGE, ENTITY_VALUE_DICT_KEY) from ner_v2.detectors.textual.text_detection import TextDetector def verify_text_request(request): """ - Check the request object if proper message or entity is present in required - format. If not present raises appropriate error. + Check the request object + 1. If proper message or entity is present in required + format. + + 2. If length of message or entity is in allowed range + Args: request: API request object @@ -43,9 +49,22 @@ def verify_text_request(request): ner_logger.exception("Entities param is not in correct format") raise TypeError("Entities should be dict of entity details") + if len(message) > 100: + ner_logger.exception("Maximum number of message can be 100 for " + "bulk detection") + raise ValueError("Maximum number of message can be 100 for " + "bulk detection") + + if len(list(entities)) > 100: + ner_logger.exception("Maximum number of entities can be 100 for " + " detection") + raise ValueError("Maximum number of entities can be 100 for " + "bulk detection") + -def get_text_detection(message, entity_dict, structured_value=None, bot_message=None, - language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, **kwargs): +def get_text_detection(message, entity_dict, bot_message=None, + language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, + **kwargs): """ Get text detection for given message on given entities dict using TextDetector module. 
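
The per-entity query grouping introduced in `_get_single_text_detection_with_variants` above is easier to follow in isolation. Below is a minimal standalone sketch of the idea, not the library code; `build_es_queries` and the sample entity config are assumptions used purely for illustration: entities carrying a `structured_value` each get their own (entity, text) query, while all remaining entities share one query over the full message.

def build_es_queries(message, entities_dict):
    # group entities into ES query inputs: one query per structured value,
    # plus a single shared query for all free-text entities
    es_entity_list, texts = [], []
    free_text_entities = []
    for name, config in entities_dict.items():
        structured_value = config.get('structured_value')
        if structured_value:
            es_entity_list.append([name])
            texts.append(structured_value)
        else:
            free_text_entities.append(name)
    if free_text_entities:
        es_entity_list.append(free_text_entities)
        texts.append(message)
    return es_entity_list, texts

# hypothetical entity config, mirroring the docstring example above
print(build_es_queries(
    'Buy ticket to Chennai from Mumbai',
    {'city': {'structured_value': None},
     'restaurant': {'structured_value': None},
     'brand': {'structured_value': 'Nike'}}))
# ([['brand'], ['city', 'restaurant']], ['Nike', 'Buy ticket to Chennai from Mumbai'])
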
@@ -69,7 +88,6 @@ def get_text_detection(message, entity_dict, structured_value=None, bot_message= target_language_script=target_language_script) if isinstance(message, six.string_types): entity_output = text_detector.detect(message=message, - structured_value=structured_value, bot_message=bot_message) elif isinstance(message, (list, tuple)): entity_output = text_detector.detect_bulk(messages=message) @@ -86,10 +104,10 @@ def parse_text_request(request): Message to detect text can be: 1) Single entry in the list, for this we use `text_detector.detect` method. - Also for this case we check if the structured value or use_fallback is present. + Also for this case we check `ignore_message` flag is present. 2) For mulitple message, underlying code will call `text_detector.detect_bulk` method. - In this case we ignore structured valur or use_fallback for all the entities. + In this case we ignore flag for ignore_message for all the entities. Args: request: request object @@ -109,7 +127,7 @@ def parse_text_request(request): "predetected_values": ["Mumbai"], "fuzziness": null, "min_token_len_fuzziness": null, - "use_fallback": false + "ignore_message": false }, "restaurant": { "structured_value": null, @@ -117,7 +135,7 @@ def parse_text_request(request): "predetected_values": null, "fuzziness": null, "min_token_len_fuzziness": null, - "use_fallback": false + "ignore_message": false } } } @@ -165,47 +183,31 @@ def parse_text_request(request): # get first message message_str = message[0] - structured_value_entities = {} fallback_value_entities = {} text_value_entities = {} data.append({"entities": {}, "language": source_language}) for each_entity, value in entities.items(): - structured_value = value.get('structured_value') - use_fallback = value.get('use_fallback', False) + ignore_message = value.get('ignore_message', False) - if use_fallback: + if ignore_message: fallback_value_entities[each_entity] = value - elif structured_value: - structured_value_entities[each_entity] = value else: text_value_entities[each_entity] = value - # get detection for normal text entities + # get detection for text entities which has ignore_message flag + if fallback_value_entities: + output = get_output_for_fallback_entities(fallback_value_entities, source_language) + data[0]["entities"].update(output) + + # get detection for text entities output = get_text_detection(message=message_str, entity_dict=text_value_entities, structured_value=None, bot_message=bot_message, language_script=source_language, target_language_script=target_language_script) data[0]["entities"].update(output[0]) - # get detection for structured value text entities - if structured_value_entities: - for entity, value in structured_value_entities.items(): - entity_dict = {entity: value} - sv = value.get("structured_value") - output = get_text_detection(message=message_str, entity_dict=entity_dict, - structured_value=sv, bot_message=bot_message, - language_script=source_language, - target_language_script=target_language_script) - - data[0]["entities"].update(output[0]) - - # get detection for fallback value text entities - if fallback_value_entities: - output = get_output_for_fallback_entities(fallback_value_entities, source_language) - data[0]["entities"].update(output) - # check if more than one message elif len(message) > 1: text_detection_result = get_text_detection(message=message, entity_dict=entities, @@ -236,8 +238,8 @@ def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): Examples: Input: { - 'city': {'fallback_value': 
'Mumbai', 'use_fallback': True}, - 'restaurant': {'fallback_value': None, 'use_fallback': True} + 'city': {'fallback_value': 'Mumbai', 'ignore_message': True}, + 'restaurant': {'fallback_value': None, 'ignore_message': True} } Output: @@ -268,14 +270,14 @@ def get_output_for_fallback_entities(entities_dict, language=ENGLISH_LANG): else: output[entity] = [ { - "entity_value": { - "value": fallback_value, - "datastore_verified": False, - "model_verified": False + ENTITY_VALUE: { + ENTITY_VALUE_DICT_KEY: fallback_value, + DATASTORE_VERIFIED: False, + MODEL_VERIFIED: False }, - "detection": FROM_FALLBACK_VALUE, - "original_text": fallback_value, - "language": language + DETECTION_METHOD: FROM_FALLBACK_VALUE, + ORIGINAL_TEXT: fallback_value, + DETECTION_LANGUAGE: language } ] return output From c501f841397d39ad41815a5fd349e4a1196b2c46 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Tue, 22 Sep 2020 22:49:01 +0530 Subject: [PATCH 27/31] - Change function name `parse_text_request` to `get_text_entity_detection_data` - Change appropriate unit tests --- ner_v2/api.py | 6 +++--- ner_v2/detectors/textual/tests/test_utils.py | 10 +++++----- ner_v2/detectors/textual/text_detection.py | 5 +++-- ner_v2/detectors/textual/utils.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index f8b82789d..2783d2609 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -12,7 +12,7 @@ from ner_v2.detectors.numeral.number.number_detection import NumberDetector from ner_v2.detectors.numeral.number_range.number_range_detection import NumberRangeDetector -from ner_v2.detectors.textual.utils import parse_text_request, verify_text_request +from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector @@ -674,8 +674,8 @@ def text(request): try: verify_text_request(request) - # if verify success parse request and get data - data = parse_text_request(request) + # if verify success get detection data + data = get_text_entity_detection_data(request) except KeyError as err: response = {"success": False, "error": str(err)} diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index fa603f654..3ac32935a 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -9,7 +9,7 @@ from django.test import TestCase from django.http import HttpRequest -from ner_v2.detectors.textual.utils import parse_text_request, verify_text_request, \ +from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request, \ get_output_for_fallback_entities, get_text_detection tests_directory = os.path.dirname(os.path.abspath(__file__)) @@ -59,7 +59,7 @@ def test_verify_text_request_exceptions(self): self.assertRaises(TypeError, verify_text_request, request=request) @patch('ner_v2.detectors.textual.utils.get_text_detection') - def test_parse_text_request(self, mock_get_detection): + def test_get_text_entity_detection_data(self, mock_get_detection): input_data = { "message": ["I want to go to Mumbai"], "bot_message": None, @@ -98,7 +98,7 @@ def test_parse_text_request(self, mock_get_detection): 'language': 'en'}], 'restaurant': []}, 'language': 'en'}] - output = parse_text_request(request) + output = get_text_entity_detection_data(request) assert_output = [{ 'entities': {'entities': {'city': [ @@ -112,7 +112,7 
@@ def test_parse_text_request(self, mock_get_detection): self.assertListEqual(output, assert_output) @patch('ner_v2.detectors.textual.utils.get_text_detection') - def test_parse_text_request_structured(self, mock_get_detection): + def test_get_text_entity_detection_data_structured(self, mock_get_detection): input_data = { "message": ["I want to go to Mumbai"], "bot_message": None, @@ -148,7 +148,7 @@ def test_parse_text_request_structured(self, mock_get_detection): {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, 'detection': 'structure_value_verified', 'original_text': 'delhi', 'language': 'en'}]}] - output = parse_text_request(request) + output = get_text_entity_detection_data(request) assert_output = [{'entities': {'city': [ {'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index 968d4a596..8b27622a7 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -294,6 +294,7 @@ def _process_es_result(self, entity_result, entity_list, text, variants_to_values = collections.OrderedDict() original_final_list_ = [] value_final_list_ = [] + _processed_text = processed_text _variants_to_values = entity_result.get(each_key, []) @@ -323,7 +324,7 @@ def _process_es_result(self, entity_result, entity_list, text, variants_list = exact_matches + fuzzy_variants for variant in variants_list: - original_text = self._get_entity_substring_from_text(processed_text, + original_text = self._get_entity_substring_from_text(_processed_text, variant, each_key) if original_text: value_final_list.append(variants_to_values[variant]) @@ -334,7 +335,7 @@ def _process_es_result(self, entity_result, entity_list, text, _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) tag = '__' + each_key + '__' - processed_text = _pattern.sub(tag, processed_text) + _processed_text = _pattern.sub(tag, _processed_text) value_final_list_.append(value_final_list) original_final_list_.append(original_final_list) diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 3e0ff4529..76d487802 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -97,7 +97,7 @@ def get_text_detection(message, entity_dict, bot_message=None, return entity_output -def parse_text_request(request): +def get_text_entity_detection_data(request): """ Parse text request coming from POST call on `/v2/text/` and call the get text detection. 
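
One practical effect of the `_processed_text = processed_text` change in `_process_es_result` above is that each entity key now works on its own copy of the processed text, so tagging a detected word for one entity no longer hides that word from other entities — the behaviour the next patch's tests exercise, where 'delhi' is detected both as the city 'New Delhi' and the restaurant 'Deli'. The following is a minimal standalone sketch of that idea under assumed names (`tag_matches_per_entity` is not a library function):

import re

def tag_matches_per_entity(text, entity_variants):
    # illustrative only: every entity starts from the original text,
    # mirroring the per-key reset `_processed_text = processed_text`
    results = {}
    for entity, variants in entity_variants.items():
        working_text = text  # fresh working copy per entity
        detected = []
        for variant in variants:
            pattern = re.compile(r'\b%s\b' % re.escape(variant), re.IGNORECASE)
            if pattern.search(working_text):
                detected.append(variant)
                # substitute the tag only in this entity's working copy
                working_text = pattern.sub('__%s__' % entity, working_text)
        results[entity] = detected
    return results

# hypothetical variants; both entities can claim the same word
print(tag_matches_per_entity(
    'i want to order from delhi',
    {'city': ['delhi'], 'restaurant': ['delhi']}))
# {'city': ['delhi'], 'restaurant': ['delhi']}
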
From 95bfa8272fa385110c1f53d4908553082750ce94 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Wed, 23 Sep 2020 10:48:33 +0530 Subject: [PATCH 28/31] Change test cases for multiple entity detection on same word of message --- .../textual/tests/test_text_detection.py | 19 +++++++----- ner_v2/detectors/textual/tests/test_utils.py | 10 +++--- ner_v2/detectors/textual/utils.py | 31 ++++++++++--------- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py index 2fc4d20c5..519d2c5fd 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -127,15 +127,20 @@ def test_text_detection_detect_bulk_message(self, mock_es_query): result = text_detector.detect_bulk(messages=message) - assert_output = [{ - 'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 'model_verified': False}, - 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], - 'restaurant': [{'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, - 'model_verified': False}, - 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, + assert_output = [{'city': [{ + 'entity_value': {'value': 'Mumbai', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'mumbai', 'language': 'en'}], + 'restaurant': [{ + 'entity_value': {'value': "Domino's Pizza", 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'dominoes', 'language': 'en'}]}, {'city': [{'entity_value': {'value': 'New Delhi', 'datastore_verified': True, 'model_verified': False}, - 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], 'restaurant': []}] + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}], + 'restaurant': [{'entity_value': {'value': 'Deli', 'datastore_verified': True, + 'model_verified': False}, + 'detection': 'message', 'original_text': 'delhi', 'language': 'en'}]}] self.maxDiff = None self.assertListEqual(result, assert_output) diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index 3ac32935a..8ce2c4e4e 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -10,7 +10,7 @@ from django.http import HttpRequest from ner_v2.detectors.textual.utils import get_text_entity_detection_data, verify_text_request, \ - get_output_for_fallback_entities, get_text_detection + get_output_for_fallback_entities, get_detection tests_directory = os.path.dirname(os.path.abspath(__file__)) @@ -58,7 +58,7 @@ def test_verify_text_request_exceptions(self): request._body = b'{"message":["something"], "entities":"something"}' self.assertRaises(TypeError, verify_text_request, request=request) - @patch('ner_v2.detectors.textual.utils.get_text_detection') + @patch('ner_v2.detectors.textual.utils.get_detection') def test_get_text_entity_detection_data(self, mock_get_detection): input_data = { "message": ["I want to go to Mumbai"], @@ -111,7 +111,7 @@ def test_get_text_entity_detection_data(self, mock_get_detection): self.assertListEqual(output, assert_output) - @patch('ner_v2.detectors.textual.utils.get_text_detection') + @patch('ner_v2.detectors.textual.utils.get_detection') def test_get_text_entity_detection_data_structured(self, mock_get_detection): input_data = { "message": ["I want to go to Mumbai"], @@ 
-176,7 +176,7 @@ def test_get_text_detection_string_message(self, mock_es_query): ('goa', 'goa')]) }] - output = get_text_detection(message, entity_dict) + output = get_detection(message, entity_dict) assert_output = [ {'city': [{'entity_value': {'value': 'Mumbai', 'datastore_verified': True, 'model_verified': False}, 'detection': 'message', @@ -206,7 +206,7 @@ def test_get_text_detection_list_message(self, mock_es_query): ('Wani', 'Wani'), ('goa', 'goa')])}] - output = get_text_detection(message, entity_dict) + output = get_detection(message, entity_dict) assert_output = [{'city': [ {'entity_value': {'value': 'Mumbai', 'datastore_verified': True, diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 76d487802..8ef0177bf 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -62,9 +62,9 @@ def verify_text_request(request): "bulk detection") -def get_text_detection(message, entity_dict, bot_message=None, - language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, - **kwargs): +def get_detection(message, entity_dict, bot_message=None, + language=ENGLISH_LANG, target_language_script=ENGLISH_LANG, + **kwargs): """ Get text detection for given message on given entities dict using TextDetector module. @@ -99,14 +99,15 @@ def get_text_detection(message, entity_dict, bot_message=None, def get_text_entity_detection_data(request): """ - Parse text request coming from POST call on `/v2/text/` and call the - get text detection. - Message to detect text can be: + Get details of message and entities from request and call get_detection internally + to get the results. - 1) Single entry in the list, for this we use `text_detector.detect` method. - Also for this case we check `ignore_message` flag is present. + Messages to detect text can be of two format: - 2) For mulitple message, underlying code will call `text_detector.detect_bulk` method. + 1) Single entry in the list of message, for this we use `text_detector.detect` method. + Also for this case we check if `ignore_message` flag is present. + + 2) For multiples message, underlying code will call `text_detector.detect_bulk` method. In this case we ignore flag for ignore_message for all the entities. 
Args: @@ -202,16 +203,16 @@ def get_text_entity_detection_data(request): data[0]["entities"].update(output) # get detection for text entities - output = get_text_detection(message=message_str, entity_dict=text_value_entities, - structured_value=None, bot_message=bot_message, - language_script=source_language, - target_language_script=target_language_script) + output = get_detection(message=message_str, entity_dict=text_value_entities, + structured_value=None, bot_message=bot_message, + language_script=source_language, + target_language_script=target_language_script) data[0]["entities"].update(output[0]) # check if more than one message elif len(message) > 1: - text_detection_result = get_text_detection(message=message, entity_dict=entities, - structured_value=None, bot_message=bot_message) + text_detection_result = get_detection(message=message, entity_dict=entities, + structured_value=None, bot_message=bot_message) data = [{"entities": x, "language": source_language} for x in text_detection_result] From 408bece99903a9d51fa8c4f1b81a81f29b6da3a7 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Wed, 23 Sep 2020 16:36:57 +0530 Subject: [PATCH 29/31] - Change format of predetected value for entity from list `[]` to list of `[[]]` - Change parameter name `messsage` to `messages` in request body --- .../textual/tests/test_text_detection.py | 2 +- ner_v2/detectors/textual/tests/test_utils.py | 12 +++--- ner_v2/detectors/textual/text_detection.py | 22 ++++++++--- ner_v2/detectors/textual/utils.py | 39 ++++++++++--------- 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py index 519d2c5fd..d352eaba5 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -17,7 +17,7 @@ class TestTextualUtils(TestCase): def test_text_detector_intialization(self): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, - 'predetected_values': None, + 'predetected_values': [[]], 'fuzziness': 4, 'min_token_len_fuzziness': 4, 'use_fallback': None}, diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index 8ce2c4e4e..89b96c934 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -36,7 +36,7 @@ def test_verify_text_request_ok(self): request = HttpRequest() # test if everything is ok - request._body = b'{"message":["something"], "entities":{"something":""}}' + request._body = b'{"messages":["something"], "entities":{"something":""}}' verify_text_request(request) def test_verify_text_request_exceptions(self): @@ -47,21 +47,21 @@ def test_verify_text_request_exceptions(self): self.assertRaises(KeyError, verify_text_request, request=request) # test if no entities - request._body = b'{"message": "something"}' + request._body = b'{"messages": "something"}' self.assertRaises(KeyError, verify_text_request, request=request) # test if message not in proper format - request._body = b'{"message":"something", "entities":"something"}' + request._body = b'{"messages":"something", "entities":"something"}' self.assertRaises(TypeError, verify_text_request, request=request) # test if entities not in proper format - request._body = b'{"message":["something"], "entities":"something"}' + request._body = b'{"messages":["something"], "entities":"something"}' self.assertRaises(TypeError, verify_text_request, request=request) 
@patch('ner_v2.detectors.textual.utils.get_detection') def test_get_text_entity_detection_data(self, mock_get_detection): input_data = { - "message": ["I want to go to Mumbai"], + "messages": ["I want to go to Mumbai"], "bot_message": None, "language_script": "en", "source_language": "en", @@ -114,7 +114,7 @@ def test_get_text_entity_detection_data(self, mock_get_detection): @patch('ner_v2.detectors.textual.utils.get_detection') def test_get_text_entity_detection_data_structured(self, mock_get_detection): input_data = { - "message": ["I want to go to Mumbai"], + "messages": ["I want to go to Mumbai"], "bot_message": None, "language_script": "en", "source_language": "en", diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index 8b27622a7..d8e8b125c 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -637,8 +637,14 @@ def detect(self, message=None, **kwargs): # get structured value from entity dict structured_value = entity_dict.get('structured_value') - # get predetected value from entity dict - predetected_values = entity_dict.get('predetected_values') or [] + # get predetected value list from entity dict + predetected_values = entity_dict.get('predetected_values') + + # get predetected value for message from index + if predetected_values: + _predetected_value = predetected_values[index] + else: + _predetected_value = [] # get fallback value from entity dict fallback_value = entity_dict.get('fallback_value') @@ -647,7 +653,7 @@ def detect(self, message=None, **kwargs): values, texts = text_entity_values[0], original_texts[0] entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, - predetected_values=predetected_values) + predetected_values=_predetected_value) if structured_value: if entity_list: @@ -737,7 +743,13 @@ def detect_bulk(self, messages=None, **kwargs): values, texts = [], [] # get predetected value from entity dict entity_dict = self.entities_dict.get(entity, {}) - predetected_values = entity_dict.get('predetected_values') or [] + predetected_values = entity_dict.get('predetected_values') + + # get predetected value for message from index + if predetected_values: + _predetected_value = predetected_values[index] + else: + _predetected_value = [] # get fallback value from entity dict fallback_value = entity_dict.get('fallback_value') @@ -747,7 +759,7 @@ def detect_bulk(self, messages=None, **kwargs): values, texts = text_entity_values[0], original_texts[0] entity_list, original_text_list = self.combine_results(values=values, original_texts=texts, - predetected_values=predetected_values) + predetected_values=_predetected_value) if entity_list: value, method, original_text = entity_list, FROM_MESSAGE, original_text_list diff --git a/ner_v2/detectors/textual/utils.py b/ner_v2/detectors/textual/utils.py index 8ef0177bf..be10aae7b 100644 --- a/ner_v2/detectors/textual/utils.py +++ b/ner_v2/detectors/textual/utils.py @@ -30,26 +30,26 @@ def verify_text_request(request): """ request_data = json.loads(request.body) - message = request_data.get("message") + messages = request_data.get("messages") entities = request_data.get("entities") - if not message: - ner_logger.exception("Message param is not passed") - raise KeyError("Message is required") + if not messages: + ner_logger.exception("messages param is not passed") + raise KeyError("key messages is required") if not entities: ner_logger.exception("Entities param is not passed") raise 
KeyError("Entities dict is required") - if not isinstance(message, list): - ner_logger.exception("Message param is not in correct format") - raise TypeError("Message should be in format of list of string") + if not isinstance(messages, list): + ner_logger.exception("messages param is not in correct format") + raise TypeError("messages should be in format of list of string") if not isinstance(entities, dict): ner_logger.exception("Entities param is not in correct format") raise TypeError("Entities should be dict of entity details") - if len(message) > 100: + if len(messages) > 100: ner_logger.exception("Maximum number of message can be 100 for " "bulk detection") raise ValueError("Maximum number of message can be 100 for " @@ -117,7 +117,7 @@ def get_text_entity_detection_data(request): Examples: Request Object: { - "message": ["I want to go to Jabalpur"], + "messages": ["I want to go to Jabalpur"], "bot_message": null, "language_script": "en", "source_language": "en", @@ -169,7 +169,7 @@ def get_text_entity_detection_data(request): ] """ request_data = json.loads(request.body) - message = request_data.get("message", []) + messages = request_data.get("messages", []) bot_message = request_data.get("bot_message") entities = request_data.get("entities", {}) target_language_script = request_data.get('language_script') or ENGLISH_LANG @@ -177,12 +177,12 @@ def get_text_entity_detection_data(request): data = [] - message_len = len(message) + message_len = len(messages) if message_len == 1: # get first message - message_str = message[0] + message_str = messages[0] fallback_value_entities = {} text_value_entities = {} @@ -203,15 +203,16 @@ def get_text_entity_detection_data(request): data[0]["entities"].update(output) # get detection for text entities - output = get_detection(message=message_str, entity_dict=text_value_entities, - structured_value=None, bot_message=bot_message, - language_script=source_language, - target_language_script=target_language_script) - data[0]["entities"].update(output[0]) + if text_value_entities: + output = get_detection(message=message_str, entity_dict=text_value_entities, + structured_value=None, bot_message=bot_message, + language_script=source_language, + target_language_script=target_language_script) + data[0]["entities"].update(output[0]) # check if more than one message - elif len(message) > 1: - text_detection_result = get_detection(message=message, entity_dict=entities, + elif len(messages) > 1: + text_detection_result = get_detection(message=messages, entity_dict=entities, structured_value=None, bot_message=bot_message) data = [{"entities": x, "language": source_language} for x in text_detection_result] From f864a31f0fdcc5cc266db82f9c8e1a0f7d8055a4 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Fri, 25 Sep 2020 14:11:43 +0530 Subject: [PATCH 30/31] - To set default `es_search` fuzziness as `auto` in all the cases - To set default fuzziness value for parsing es result to `auto:4,7` - To change substituion cost to `1` for v2/text --- .../textual/tests/test_text_detection.py | 12 ++-- ner_v2/detectors/textual/tests/test_utils.py | 4 +- ner_v2/detectors/textual/text_detection.py | 64 +++++++++---------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py index d352eaba5..e61c2c50f 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -18,13 +18,13 @@ def 
test_text_detector_intialization(self): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, 'predetected_values': [[]], - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None}, 'restaurant': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None} } @@ -48,13 +48,13 @@ def test_text_detection_detect_single_message(self, mock_es_query): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None}, 'restaurant': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None} } @@ -93,13 +93,13 @@ def test_text_detection_detect_bulk_message(self, mock_es_query): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None}, 'restaurant': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'use_fallback': None} } diff --git a/ner_v2/detectors/textual/tests/test_utils.py b/ner_v2/detectors/textual/tests/test_utils.py index 89b96c934..a192b10cc 100644 --- a/ner_v2/detectors/textual/tests/test_utils.py +++ b/ner_v2/detectors/textual/tests/test_utils.py @@ -163,7 +163,7 @@ def test_get_text_detection_string_message(self, mock_es_query): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'ignore_message': None}} @@ -191,7 +191,7 @@ def test_get_text_detection_list_message(self, mock_es_query): entity_dict = {'city': {'structured_value': None, 'fallback_value': None, 'predetected_values': None, - 'fuzziness': 4, + 'fuzziness': "4,7", 'min_token_len_fuzziness': 4, 'ignore_message': None}} diff --git a/ner_v2/detectors/textual/text_detection.py b/ner_v2/detectors/textual/text_detection.py index d8e8b125c..58d238a34 100644 --- a/ner_v2/detectors/textual/text_detection.py +++ b/ner_v2/detectors/textual/text_detection.py @@ -74,12 +74,12 @@ def __init__(self, entity_dict=None, self.__processed_texts = [] # defaults for auto mode - self._fuzziness = "auto:4,7" + self._fuzziness = "4,7" + self._fuzziness_lo, self._fuzziness_hi = 4, 7 self._min_token_size_for_fuzziness = self._fuzziness_lo # defaults for non-auto mode - self.set_fuzziness_threshold(fuzziness=1) self._min_token_size_for_fuzziness = 4 # define data store and target languages @@ -88,6 +88,9 @@ def __init__(self, entity_dict=None, self._source_language_script = source_language_script self._target_language_script = target_language_script + # set default ES query fuzziness as `auto` + self._es_fuzziness = "auto" + def _reset_state(self): """ Reset all the intermediary states of detection class. @@ -96,11 +99,11 @@ def _reset_state(self): self.__texts = [] self.__processed_texts = [] - def set_fuzziness_threshold(self, fuzziness): + def set_fuzziness_low_high_threshold(self, fuzziness): """ - Sets the fuzziness thresholds for similarity searches. 
The fuzziness threshold corresponds to the - maximum Levenshtein's distance allowed during similarity matching - + Sets the fuzziness thresholds high and low threshold for similarity searches. + The fuzziness threshold corresponds to the maximum Levenshtein's distance + allowed during similarity matching Args: fuzziness (iterable or int): If this parameter is int, elasticsearch's auto is used with @@ -114,20 +117,15 @@ def set_fuzziness_threshold(self, fuzziness): """ try: iter(fuzziness) - if len(fuzziness) == 2: - lo, hi = fuzziness + if len(fuzziness) == 3: + lo, hi = fuzziness.split(",") self._fuzziness_lo, self._fuzziness_hi = int(lo), int(hi) - self._fuzziness = "auto:" + str(self._fuzziness_lo) + "," + str(self._fuzziness_hi) - self._min_token_size_for_fuzziness = lo - else: - self._fuzziness = "auto" + self._min_token_size_for_fuzziness = self._fuzziness_lo except TypeError: - if type(fuzziness) == int or type(fuzziness) == float: - self._fuzziness = int(fuzziness) # Note that elasticsearch would take min(2, self._fuzziness) - else: - raise TypeError('Fuziness has to be either an iterable of length 2 or an int') + ner_logger.exception(f"Fuzziness not in correct format, got {fuzziness}") + raise TypeError('Fuzziness has to be an iterable of length 2 ') - def _get_fuzziness_threshold_for_token(self, token, fuzziness=None): + def _get_fuzziness_threshold_for_token(self, token): """ Return dynamic fuzziness threshold for damerau-levenshtein check based on length of token if elasticsearch fuzziness was set to auto mode @@ -140,18 +138,12 @@ def _get_fuzziness_threshold_for_token(self, token, fuzziness=None): int: fuzziness threshold for ngram matching on elastic search results """ - if not fuzziness: - fuzziness = self._fuzziness - - if type(fuzziness) == int: - return fuzziness + if len(token) < self._fuzziness_lo: + return 0 # strict match + elif len(token) >= self._fuzziness_hi: + return 2 # Allow upto two inserts/deletes and one substitution else: - if len(token) < self._fuzziness_lo: - return 0 # strict match - elif len(token) >= self._fuzziness_hi: - return 2 # Allow upto two inserts/deletes and one substitution - else: - return 1 # lo <= len < hi Allow only insert/delete + return 1 # lo <= len < hi Allow only insert/delete def set_min_token_size_for_levenshtein(self, min_size): """ @@ -392,7 +384,7 @@ def _get_single_text_detection_with_variants(self, message): # fetch ES datastore search result es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, texts=texts, - fuzziness_threshold=self._fuzziness, + fuzziness_threshold=self._es_fuzziness, search_language_script=self._target_language_script ) @@ -443,7 +435,7 @@ def _get_bulk_text_detection_with_variants(self, messages): # fetch ES datastore search result es_results = self.esdb.get_multi_entity_results(entities=es_entity_list, texts=es_texts, - fuzziness_threshold=self._fuzziness, + fuzziness_threshold=self._es_fuzziness, search_language_script=self._target_language_script ) @@ -490,16 +482,24 @@ def _get_entity_substring_from_text(self, text, variant, entity_name): # get fuzziness and min_token_size_for_fuziness value from entity dict entity_dict = self.entities_dict.get(entity_name, {}) - fuzziness = entity_dict.get('fuzziness') + + # get fuzziness from entity if not set default + fuzziness = entity_dict.get('fuzziness') or self._fuzziness + + self.set_fuzziness_low_high_threshold(fuzziness) + min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness') if not min_token_size_for_fuzziness: 
min_token_size_for_fuzziness = self._min_token_size_for_fuzziness - ft = self._get_fuzziness_threshold_for_token(token=text_token, fuzziness=fuzziness) + ft = self._get_fuzziness_threshold_for_token(token=text_token) + + # set substitution cost to one if same or (len(text_token) > min_token_size_for_fuzziness and edit_distance(string1=variant_token, string2=text_token, + substitution_cost=1, max_distance=ft + 1) <= ft): original_text_tokens.append(text_token) variant_token_i += 1 From e91c3f8917ca096cbc510cded651180ec82c4093 Mon Sep 17 00:00:00 2001 From: Ankur Agrawal Date: Sun, 27 Sep 2020 19:31:35 +0530 Subject: [PATCH 31/31] Add unit test cases for TextDetector `_get_entity_substring_from_text` and `set_fuzziness_low_high_threshold` method --- .../textual/tests/test_text_detection.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ner_v2/detectors/textual/tests/test_text_detection.py b/ner_v2/detectors/textual/tests/test_text_detection.py index e61c2c50f..909cd2633 100644 --- a/ner_v2/detectors/textual/tests/test_text_detection.py +++ b/ner_v2/detectors/textual/tests/test_text_detection.py @@ -144,3 +144,45 @@ def test_text_detection_detect_bulk_message(self, mock_es_query): self.maxDiff = None self.assertListEqual(result, assert_output) + + def test_text_detection_set_fuzziness_hi_lo_threshold(self): + + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': [[]], + 'fuzziness': "5,8", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + fuzziness = entity_dict['city']['fuzziness'] + + # assert for default fuzziness hi and low i.e. 4,7 + self.assertEqual(text_detector._fuzziness_lo, 4) + self.assertEqual(text_detector._fuzziness_hi, 7) + + # set new threshold and assert\ + text_detector.set_fuzziness_low_high_threshold(fuzziness) + self.assertEqual(text_detector._fuzziness_lo, 5) + self.assertEqual(text_detector._fuzziness_hi, 8) + + def test_text_detection_get_substring(self): + entity_dict = {'city': {'structured_value': None, + 'fallback_value': None, + 'predetected_values': [[]], + 'fuzziness': "2,4", + 'min_token_len_fuzziness': 4, + 'use_fallback': None}} + language = 'en' + target_language_script = 'en' + + text_detector = TextDetector(entity_dict=entity_dict, source_language_script=language, + target_language_script=target_language_script) + + substring = text_detector._get_entity_substring_from_text('Mmsbai', 'Mumbai', 'city') + + self.assertEqual(substring, 'Mmsbai')
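
For context on the fuzziness changes in PATCH 30 and the substring test above, the token-level threshold can be summarised with a small standalone sketch; `allowed_edits` and `levenshtein` are assumed helper names, not the library API, and only illustrate the documented low/high behaviour.

def allowed_edits(token, lo=4, hi=7):
    # tokens shorter than the low threshold must match exactly,
    # tokens at or above the high threshold may differ by two edits,
    # everything in between may differ by one edit
    if len(token) < lo:
        return 0
    elif len(token) >= hi:
        return 2
    return 1

def levenshtein(a, b):
    # plain dynamic-programming edit distance with substitution cost 1
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

# With the test's fuzziness "2,4", the token 'mmsbai' (length 6 >= 4) is allowed
# 2 edits, and its distance to the variant token 'mumbai' is exactly 2, so the
# substring test above can return 'Mmsbai'.
print(allowed_edits('mmsbai', lo=2, hi=4))   # 2
print(levenshtein('mmsbai', 'mumbai'))       # 2
# Under the default "4,7" thresholds the same token would only be allowed 1 edit.
print(allowed_edits('mmsbai'))               # 1
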