From 07953e34b11c9e588712323cf899ad01abe6ddd8 Mon Sep 17 00:00:00 2001
From: chiragjn
Date: Fri, 26 Apr 2019 15:24:33 +0530
Subject: [PATCH 1/4] Fetch value only in source_ and add deprecation warnings

---
 datastore/constants.py            |  1 +
 datastore/datastore.py            |  8 +++
 datastore/elastic_search/query.py | 95 +++++++++++++++++++------------
 3 files changed, 68 insertions(+), 36 deletions(-)

diff --git a/datastore/constants.py b/datastore/constants.py
index 41e656c36..f7f76da19 100644
--- a/datastore/constants.py
+++ b/datastore/constants.py
@@ -7,6 +7,7 @@
 ELASTICSEARCH = 'elasticsearch'
 ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
 ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
+ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000
 
 # settings dictionary key constants
 ENGINE = 'engine'
diff --git a/datastore/datastore.py b/datastore/datastore.py
index 7799f383b..6f5680400 100644
--- a/datastore/datastore.py
+++ b/datastore/datastore.py
@@ -1,3 +1,5 @@
+import warnings
+
 import elastic_search
 from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
 from lib.singleton import Singleton
@@ -120,6 +122,7 @@ def create(self, **kwargs):
             **kwargs
         )
 
+    # FIXME: repopulate does not consider language of the variants
     def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
@@ -181,6 +184,7 @@ def delete(self, **kwargs):
             ignore=[400, 404],
             **kwargs)
 
+    # FIXME: Deprecated, remove
     def get_entity_dictionary(self, entity_name, **kwargs):
         """
         Args:
@@ -214,6 +218,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
              ...
              u'koramangala': [u'koramangala']}
         """
+        warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
         results_dictionary = {}
@@ -308,6 +313,7 @@ def delete_entity(self, entity_name, **kwargs):
             ignore=[400, 404],
             **kwargs)
 
+    # FIXME: repopulate does not consider language of the variants
     def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
@@ -378,6 +384,7 @@ def exists(self):
 
         return False
 
+    # FIXME: Deprecated, remove
     def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
         """
         This method is used to populate the the entity dictionary
@@ -389,6 +396,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs
             For Elasticsearch:
                 Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
         """
+        warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
 
diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
index c26569e5d..da49d982c 100644
--- a/datastore/elastic_search/query.py
+++ b/datastore/elastic_search/query.py
@@ -1,21 +1,24 @@
 from __future__ import absolute_import
 
+import collections
 # std imports
 import copy
-from six import string_types
+import json
 import re
-import collections
+import warnings
+
+from six import string_types
 
 # Local imports
 from datastore import constants
 from external_api.constants import SENTENCE_LIST, ENTITY_LIST
 from language_utilities.constant import ENGLISH_LANG
 from lib.nlp.const import TOKENIZER
-import json
 
 log_prefix = 'datastore.elastic_search.query'
 
 
+# Deprecated
 def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
     """
     Get all variants data for a entity stored in the index as a dictionary
@@ -32,6 +35,7 @@
         dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing
         synonyms/variants of the key
     """
+    warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning)
     results_dictionary = {}
     data = {
         'query': {
@@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
             "unique_values": {
                 "terms": {
                     "field": "value.keyword",
-                    "size": 300000
+                    "size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
                 }
             }
         },
@@ -283,12 +287,14 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
          u'mumbai': u'mumbai',
          u'pune': u'pune'}
     """
-    index = {'index': index_name, 'type': doc_type}
+    index_header = json.dumps({'index': index_name, 'type': doc_type})
     data = []
-    for sentence_ in sentences:
-        query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
+    for sentence in sentences:
+        query = _generate_es_search_dictionary(entity_name=entity_name,
+                                               text=sentence,
+                                               fuzziness_threshold=fuzziness_threshold,
                                                language_script=search_language_script)
-        data.extend([json.dumps(index), json.dumps(query)])
+        data.extend([index_header, json.dumps(query)])
     data = '\n'.join(data)
 
     kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
@@ -359,17 +365,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
     return fuzzy_setting
 
 
-def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
+def _generate_es_search_dictionary(entity_name, text,
+                                   fuzziness_threshold=1,
+                                   language_script=ENGLISH_LANG,
+                                   size=constants.ELASTICSEARCH_SEARCH_SIZE,
+                                   as_json=False):
     """
     Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
     searches for entity_name in the index and returns search results for the matched word (of sentence)
     only if entity_name is found.
 
     Args:
-        entity_name: name of the entity to perform a 'term' query on
-        text: The text on which we need to identify the enitites.
-        fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
-        language_script: language of documents to be searched, optional, defaults to None
+        entity_name (str): name of the entity to perform a 'term' query on
+        text (str): The text on which we need to identify the enitites.
+        fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter.
+            Defaults to 1
+        language_script (str, optional): language of documents to be searched, optional, defaults to 'en'
+        size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
+        as_json (bool, optional): Return the generated query as json string. useful for debug purposes.
+            Defaults to False
 
     Returns:
         dictionary, the search query for the text
@@ -386,24 +400,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
     must_terms.append(term_dict_entity_name)
 
     # search on language_script, add english as default search
-    if language_script is not None:
-        term_dict_language = {
-            'terms': {
-                'language_script': [language_script, ENGLISH_LANG]
-            }
+    term_dict_language = {
+        'terms': {
+            'language_script': [ENGLISH_LANG]
         }
-        must_terms.append(term_dict_language)
-
-    data = {
-        'query': {
-            'bool': {
-                'must': must_terms,
-                'should': [],
-                'minimum_should_match': 1
-            }
-        }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
     }
-    query_should_data = []
+
+    if language_script != ENGLISH_LANG:
+        term_dict_language['terms']['language_script'].append(language_script)
+
+    must_terms.append(term_dict_language)
+
+    should_terms = []
     query = {
         'match': {
             'variants': {
@@ -413,15 +421,30 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
             }
         }
     }
-    query_should_data.append(query)
-    data['query']['bool']['should'] = query_should_data
-
-    data['highlight'] = {
-        'fields': {
-            'variants': {}
+    should_terms.append(query)
+
+    data = {
+        '_source': ['value'],
+        'query': {
+            'bool': {
+                'must': must_terms,
+                'should': should_terms,
+                'minimum_should_match': 1
+            },
+            'highlight': {
+                'fields': {
+                    'variants': {}
+                },
+                'order': 'score',
+                'number_of_fragments': 20
+            }
         },
-        'order': 'score',
-        'number_of_fragments': 20
+        'size': size
     }
+
+    if as_json:
+        data = json.dumps(data)
+
     return data

From c9592b13b1f17b07efdcbe9a5de31f3b35b98f92 Mon Sep 17 00:00:00 2001
From: chiragjn
Date: Fri, 26 Apr 2019 15:30:18 +0530
Subject: [PATCH 2/4] Fix lint errors

---
 datastore/datastore.py               | 13 ++++++++-----
 datastore/elastic_search/__init__.py |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/datastore/datastore.py b/datastore/datastore.py
index 6f5680400..33e7f59d3 100644
--- a/datastore/datastore.py
+++ b/datastore/datastore.py
@@ -1,12 +1,15 @@
+from __future__ import absolute_import
+
 import warnings
 
-import elastic_search
 from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
+from datastore import elastic_search
+from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
+                                 ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
+                                 ELASTICSEARCH_CRF_DATA_DOC_TYPE)
+from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
+                                  EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 from lib.singleton import Singleton
-from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
-                        ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
-from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
-                         EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 
 
 class DataStore(object):
diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py
index 20b6d27e0..34654dbfb 100644
--- a/datastore/elastic_search/__init__.py
+++ b/datastore/elastic_search/__init__.py
@@ -2,4 +2,4 @@
 import create
 import populate
 import query
-import transfer
\ No newline at end of file
+import transfer

From 243f621e4025bbaf1715bf54a7da22fd7306edde Mon Sep 17 00:00:00 2001
From: chiragjn
Date: Fri, 26 Apr 2019 15:41:27 +0530
Subject: [PATCH 3/4] Add highlight at the correct level in ES query

---
 datastore/elastic_search/query.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
index da49d982c..aaf69bb8d 100644
--- a/datastore/elastic_search/query.py
+++ b/datastore/elastic_search/query.py
@@ -294,7 +294,8 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
                                                text=sentence,
                                                fuzziness_threshold=fuzziness_threshold,
                                                language_script=search_language_script)
-        data.extend([index_header, json.dumps(query)])
+        data.append(index_header)
+        data.append(json.dumps(query))
     data = '\n'.join(data)
 
     kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
@@ -431,13 +432,13 @@ def _generate_es_search_dictionary(entity_name, text,
                 'should': should_terms,
                 'minimum_should_match': 1
             },
-            'highlight': {
-                'fields': {
-                    'variants': {}
-                },
-                'order': 'score',
-                'number_of_fragments': 20
-            }
+        },
+        'highlight': {
+            'fields': {
+                'variants': {}
+            },
+            'order': 'score',
+            'number_of_fragments': 20
         },
         'size': size
     }

From e4a4dae519532d85fe8349d5f08bbbfa5f796603 Mon Sep 17 00:00:00 2001
From: chiragjn
Date: Fri, 26 Apr 2019 16:29:32 +0530
Subject: [PATCH 4/4] Switch to unified highlighter for faster search on larger documents

---
 datastore/elastic_search/query.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
index aaf69bb8d..b6cbbac14 100644
--- a/datastore/elastic_search/query.py
+++ b/datastore/elastic_search/query.py
@@ -435,7 +435,9 @@ def _generate_es_search_dictionary(entity_name, text,
         },
         'highlight': {
             'fields': {
-                'variants': {}
+                'variants': {
+                    'type': 'unified'  # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain'
+                }
             },
             'order': 'score',
             'number_of_fragments': 20
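
Editor's note (not part of the patch series): the snippet below is a minimal, standalone sketch of roughly what the search body produced by _generate_es_search_dictionary() looks like once all four commits are applied. The example entity name and sentence, the field name used in the entity 'term' clause, and the concrete size value are illustrative assumptions; the real values come from full_text_query() and datastore/constants.py.

import json

# Illustrative inputs; full_text_query() builds one such body per sentence and
# joins them with msearch index header lines.
entity_name = 'city'          # assumed example entity
text = 'can i visit mumbai'   # assumed example sentence
fuzziness_threshold = 1
language_script = 'en'
size = 10000                  # placeholder for constants.ELASTICSEARCH_SEARCH_SIZE

# 'term' filter on the entity name; the field name 'entity_data' is an assumption,
# it is defined earlier in _generate_es_search_dictionary and is not part of this diff.
must_terms = [{'term': {'entity_data': entity_name}}]

# Language filter: always search English, append the requested script if it differs (patch 1).
language_scripts = ['en']
if language_script != 'en':
    language_scripts.append(language_script)
must_terms.append({'terms': {'language_script': language_scripts}})

should_terms = [{
    'match': {
        'variants': {
            'query': text,
            'fuzziness': fuzziness_threshold
        }
    }
}]

data = {
    '_source': ['value'],          # patch 1: fetch only the 'value' field
    'query': {
        'bool': {
            'must': must_terms,
            'should': should_terms,
            'minimum_should_match': 1
        }
    },
    'highlight': {                 # patch 3: highlight as a sibling of 'query', not nested inside it
        'fields': {
            'variants': {
                'type': 'unified'  # patch 4: unified highlighter
            }
        },
        'order': 'score',
        'number_of_fragments': 20
    },
    'size': size
}

print(json.dumps(data, indent=2))

Restricting _source to the value field and keeping highlight at the top level keeps the msearch responses small while still returning the highlighted variant fragments that the query-parsing code relies on.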