diff --git a/datastore/constants.py b/datastore/constants.py
index 41e656c36..f7f76da19 100644
--- a/datastore/constants.py
+++ b/datastore/constants.py
@@ -7,6 +7,7 @@ ELASTICSEARCH = 'elasticsearch'
 ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
 ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
+ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000
 
 # settings dictionary key constants
 ENGINE = 'engine'
diff --git a/datastore/datastore.py b/datastore/datastore.py
index 7799f383b..33e7f59d3 100644
--- a/datastore/datastore.py
+++ b/datastore/datastore.py
@@ -1,10 +1,15 @@
-import elastic_search
+from __future__ import absolute_import
+
+import warnings
+
 from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
+from datastore import elastic_search
+from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
+                                 ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
+                                 ELASTICSEARCH_CRF_DATA_DOC_TYPE)
+from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
+                                  EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 from lib.singleton import Singleton
-from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
-                        ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
-from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
-                         EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 
 
 class DataStore(object):
@@ -120,6 +125,7 @@ def create(self, **kwargs):
                 **kwargs
             )
 
+    # FIXME: populate does not consider language of the variants
     def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
@@ -181,6 +187,7 @@ def delete(self, **kwargs):
                                                ignore=[400, 404],
                                                **kwargs)
 
+    # FIXME: Deprecated, remove
     def get_entity_dictionary(self, entity_name, **kwargs):
         """
         Args:
@@ -214,6 +221,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
             ...
             u'koramangala': [u'koramangala']}
         """
+        warnings.warn("get_entity_dictionary() is deprecated; please use get_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
         results_dictionary = {}
@@ -308,6 +316,7 @@ def delete_entity(self, entity_name, **kwargs):
                                                           ignore=[400, 404],
                                                           **kwargs)
 
+    # FIXME: repopulate does not consider language of the variants
     def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
@@ -378,6 +387,7 @@ def exists(self):
 
         return False
 
+    # FIXME: Deprecated, remove
     def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
         """
         This method is used to populate the entity dictionary
@@ -389,6 +399,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
 
         For Elasticsearch:
             Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
         """
+        warnings.warn("update_entity_data() is deprecated; please use add_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py
index 20b6d27e0..34654dbfb 100644
--- a/datastore/elastic_search/__init__.py
+++ b/datastore/elastic_search/__init__.py
@@ -2,4 +2,4 @@
 import create
 import populate
 import query
-import transfer
\ No newline at end of file
+import transfer
diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
index c26569e5d..b6cbbac14 100644
--- a/datastore/elastic_search/query.py
+++ b/datastore/elastic_search/query.py
@@ -1,21 +1,24 @@
 from __future__ import absolute_import
 
+import collections
 # std imports
 import copy
-from six import string_types
+import json
 import re
-import collections
+import warnings
+
+from six import string_types
 
 # Local imports
 from datastore import constants
 from external_api.constants import SENTENCE_LIST, ENTITY_LIST
 from language_utilities.constant import ENGLISH_LANG
 from lib.nlp.const import TOKENIZER
-import json
 
 log_prefix = 'datastore.elastic_search.query'
 
+# Deprecated
 def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
     """
     Get all variants data for an entity stored in the index as a dictionary
@@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
         dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing
         synonyms/variants of the key
     """
+    warnings.warn("dictionary_query() is deprecated; please use get_entity_data()", DeprecationWarning)
     results_dictionary = {}
     data = {
         'query': {
@@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
             "unique_values": {
                 "terms": {
                     "field": "value.keyword",
-                    "size": 300000
+                    "size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
                 }
             }
         },
@@ -283,12 +287,15 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
                      u'mumbai': u'mumbai',
                      u'pune': u'pune'}
     """
-    index = {'index': index_name, 'type': doc_type}
+    index_header = json.dumps({'index': index_name, 'type': doc_type})
     data = []
-    for sentence_ in sentences:
-        query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
+    for sentence in sentences:
+        query = _generate_es_search_dictionary(entity_name=entity_name,
+                                               text=sentence,
+                                               fuzziness_threshold=fuzziness_threshold,
                                                language_script=search_language_script)
-        data.extend([json.dumps(index), json.dumps(query)])
+        data.append(index_header)
+        data.append(json.dumps(query))
     data = '\n'.join(data)
 
     kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
@@ -359,17 +366,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
     return fuzzy_setting
 
 
-def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
+def _generate_es_search_dictionary(entity_name, text,
+                                   fuzziness_threshold=1,
+                                   language_script=ENGLISH_LANG,
+                                   size=constants.ELASTICSEARCH_SEARCH_SIZE,
+                                   as_json=False):
     """
     Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
     searches for entity_name in the index and returns search results for the matched word (of sentence) only if
     entity_name is found.
 
     Args:
-        entity_name: name of the entity to perform a 'term' query on
-        text: The text on which we need to identify the enitites.
-        fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
-        language_script: language of documents to be searched, optional, defaults to None
+        entity_name (str): name of the entity to perform a 'term' query on
+        text (str): the text on which we need to identify the entities
+        fuzziness_threshold (int, optional): fuzziness threshold for elasticsearch match query 'fuzziness' parameter.
+            Defaults to 1
+        language_script (str, optional): language of documents to be searched. Defaults to 'en'
+        size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
+        as_json (bool, optional): return the generated query as a JSON string, useful for debugging.
+            Defaults to False
 
     Returns:
         dictionary, the search query for the text
@@ -386,24 +401,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
     must_terms.append(term_dict_entity_name)
 
     # search on language_script, add english as default search
-    if language_script is not None:
-        term_dict_language = {
-            'terms': {
-                'language_script': [language_script, ENGLISH_LANG]
-            }
+    term_dict_language = {
+        'terms': {
+            'language_script': [ENGLISH_LANG]
         }
-        must_terms.append(term_dict_language)
-
-    data = {
-        'query': {
-            'bool': {
-                'must': must_terms,
-                'should': [],
-                'minimum_should_match': 1
-            }
-        }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
     }
-    query_should_data = []
+
+    if language_script != ENGLISH_LANG:
+        term_dict_language['terms']['language_script'].append(language_script)
+
+    must_terms.append(term_dict_language)
+
+    should_terms = []
     query = {
         'match': {
             'variants': {
@@ -413,15 +422,32 @@
             }
         }
     }
-    query_should_data.append(query)
-    data['query']['bool']['should'] = query_should_data
-    data['highlight'] = {
-        'fields': {
-            'variants': {}
+    should_terms.append(query)
+
+    data = {
+        '_source': ['value'],
+        'query': {
+            'bool': {
+                'must': must_terms,
+                'should': should_terms,
+                'minimum_should_match': 1
+            },
         },
-        'order': 'score',
-        'number_of_fragments': 20
+        'highlight': {
+            'fields': {
+                'variants': {
+                    'type': 'unified'  # experimental in 5.x, default in 6.x and 7.x; faster than 'plain'
+                }
+            },
+            'order': 'score',
+            'number_of_fragments': 20
+        },
+        'size': size
     }
+
+    if as_json:
+        data = json.dumps(data)
+
     return data
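
Note for reviewers: below is a minimal sketch of how the refactored pieces are meant to fit together, i.e. how full_text_query() pairs the pre-serialized msearch header with one query per sentence from _generate_es_search_dictionary(). The connection settings, index name, doc type, entity name and sentences are hypothetical placeholders rather than values from this PR; only the query-builder call and the newline-delimited header/query pairing mirror the diff above.

import json

from elasticsearch import Elasticsearch

from datastore.elastic_search.query import _generate_es_search_dictionary

# Hypothetical connection and names, for illustration only
connection = Elasticsearch(['localhost:9200'])
index_name = 'entity_data'      # assumed index name
doc_type = 'data_dictionary'    # assumed doc type
sentences = ['book a flight to mumbai', 'get me a cab to koramangala']

# msearch consumes newline-delimited (header, query) pairs; the header is
# serialized once and reused for every sentence, as in full_text_query()
index_header = json.dumps({'index': index_name, 'type': doc_type})
data = []
for sentence in sentences:
    query = _generate_es_search_dictionary(entity_name='city',
                                           text=sentence,
                                           fuzziness_threshold=1)
    data.append(index_header)
    data.append(json.dumps(query))

results = connection.msearch(body='\n'.join(data), index=index_name)

# One response per sentence, in request order; '_source' is restricted to
# 'value' by the generated query, and matched variants come back highlighted
for response in results['responses']:
    for hit in response['hits']['hits']:
        print('{0} -> {1}'.format(hit['_source']['value'],
                                  hit.get('highlight', {}).get('variants', [])))

Passing as_json=True would fold the json.dumps(query) call into the builder itself, which is the main use of the new flag beyond debugging.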