
Commit

Merge pull request #247 from hellohaptik/text_detection_limit_source
Change highlighter, fetch value only in source_ and add deprecation warnings
chiragjn authored Apr 26, 2019
2 parents 060d03b + e4a4dae commit 1d05102
Showing 4 changed files with 80 additions and 42 deletions.
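
As a quick orientation before the per-file diffs, here is a minimal sketch (not part of this commit) of how the new deprecation warnings surface to a caller. DataStore, get_entity_dictionary() and get_entity_data() come from this repository; the entity name, the warning filter, and the assumption that the Elasticsearch datastore is already configured are illustrative.

    import warnings

    from datastore.datastore import DataStore

    # DeprecationWarning is hidden by default outside __main__, so opt in explicitly
    warnings.simplefilter('always', DeprecationWarning)

    datastore = DataStore()
    # Emits: DeprecationWarning: get_entity_dictionary() is deprecated; Please use get_entity_data()
    city_variants = datastore.get_entity_dictionary('city')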
1 change: 1 addition & 0 deletions datastore/constants.py
@@ -7,6 +7,7 @@
ELASTICSEARCH = 'elasticsearch'
ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000

# settings dictionary key constants
ENGINE = 'engine'
21 changes: 16 additions & 5 deletions datastore/datastore.py
@@ -1,10 +1,15 @@
import elastic_search
from __future__ import absolute_import

import warnings

from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
from datastore import elastic_search
from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
ELASTICSEARCH_CRF_DATA_DOC_TYPE)
from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
from lib.singleton import Singleton
from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)


class DataStore(object):
@@ -120,6 +125,7 @@ def create(self, **kwargs):
**kwargs
)

# FIXME: repopulate does not consider language of the variants
def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
"""
Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
@@ -181,6 +187,7 @@ def delete(self, **kwargs):
ignore=[400, 404],
**kwargs)

# FIXME: Deprecated, remove
def get_entity_dictionary(self, entity_name, **kwargs):
"""
Args:
@@ -214,6 +221,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
...
u'koramangala': [u'koramangala']}
"""
warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning)
if self._client_or_connection is None:
self._connect()
results_dictionary = {}
@@ -308,6 +316,7 @@ def delete_entity(self, entity_name, **kwargs):
ignore=[400, 404],
**kwargs)

# FIXME: repopulate does not consider language of the variants
def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
"""
Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
@@ -378,6 +387,7 @@ def exists(self):

return False

# FIXME: Deprecated, remove
def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
"""
This method is used to populate the entity dictionary
@@ -389,6 +399,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs
For Elasticsearch:
Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
"""
warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning)
if self._client_or_connection is None:
self._connect()

2 changes: 1 addition & 1 deletion datastore/elastic_search/__init__.py
@@ -2,4 +2,4 @@
import create
import populate
import query
import transfer
import transfer
98 changes: 62 additions & 36 deletions datastore/elastic_search/query.py
@@ -1,21 +1,24 @@
from __future__ import absolute_import

import collections
# std imports
import copy
from six import string_types
import json
import re
import collections
import warnings

from six import string_types

# Local imports
from datastore import constants
from external_api.constants import SENTENCE_LIST, ENTITY_LIST
from language_utilities.constant import ENGLISH_LANG
from lib.nlp.const import TOKENIZER
import json

log_prefix = 'datastore.elastic_search.query'


# Deprecated
def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
"""
Get all variants data for an entity stored in the index as a dictionary
@@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing
synonyms/variants of the key
"""
warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning)
results_dictionary = {}
data = {
'query': {
@@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
"unique_values": {
"terms": {
"field": "value.keyword",
"size": 300000
"size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
}
}
},
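
For context, the fragment above sits inside the aggregation request built by get_entity_unique_values(). Below is a hedged sketch of the full body: only the unique_values terms aggregation and the new ELASTICSEARCH_VALUES_SEARCH_SIZE constant are confirmed by this diff; the surrounding keys are an assumed shape for a typical Elasticsearch terms-aggregation request.

    from datastore import constants

    aggregation_body = {
        'size': 0,  # assumption: only aggregation buckets are needed, not hits
        'aggs': {
            'unique_values': {
                'terms': {
                    'field': 'value.keyword',
                    'size': constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,  # 300000, added in this PR
                }
            }
        },
    }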
@@ -283,12 +287,15 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
u'mumbai': u'mumbai',
u'pune': u'pune'}
"""
    index = {'index': index_name, 'type': doc_type}
    index_header = json.dumps({'index': index_name, 'type': doc_type})
    data = []
    for sentence_ in sentences:
        query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
                                               language_script=search_language_script)
        data.extend([json.dumps(index), json.dumps(query)])
    for sentence in sentences:
        query = _generate_es_search_dictionary(entity_name=entity_name,
                                               text=sentence,
                                               fuzziness_threshold=fuzziness_threshold,
                                               language_script=search_language_script)
        data.append(index_header)
        data.append(json.dumps(query))
data = '\n'.join(data)

kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
@@ -359,17 +366,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
return fuzzy_setting


def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
def _generate_es_search_dictionary(entity_name, text,
fuzziness_threshold=1,
language_script=ENGLISH_LANG,
size=constants.ELASTICSEARCH_SEARCH_SIZE,
as_json=False):
"""
Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
searches for entity_name in the index and returns search results for the matched word (of sentence)
only if entity_name is found.
Args:
entity_name: name of the entity to perform a 'term' query on
text: The text on which we need to identify the entities.
fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
language_script: language of documents to be searched, optional, defaults to None
entity_name (str): name of the entity to perform a 'term' query on
text (str): The text on which we need to identify the entities.
fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter.
Defaults to 1
language_script (str, optional): language of documents to be searched. Defaults to 'en'
size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
as_json (bool, optional): Return the generated query as a JSON string. Useful for debugging.
Defaults to False
Returns:
dictionary, the search query for the text
@@ -386,24 +401,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
must_terms.append(term_dict_entity_name)

# search on language_script, add english as default search
    if language_script is not None:
        term_dict_language = {
            'terms': {
                'language_script': [language_script, ENGLISH_LANG]
            }
        }
        must_terms.append(term_dict_language)

    data = {
        'query': {
            'bool': {
                'must': must_terms,
                'should': [],
                'minimum_should_match': 1
            }
        }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
    }
    query_should_data = []

    term_dict_language = {
        'terms': {
            'language_script': [ENGLISH_LANG]
        }
    }

    if language_script != ENGLISH_LANG:
        term_dict_language['terms']['language_script'].append(language_script)

    must_terms.append(term_dict_language)

    should_terms = []
query = {
'match': {
'variants': {
@@ -413,15 +422,32 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
}
}
}
    query_should_data.append(query)
    data['query']['bool']['should'] = query_should_data
    data['highlight'] = {
        'fields': {
            'variants': {}
        },
        'order': 'score',
        'number_of_fragments': 20
    }

    should_terms.append(query)

    data = {
        '_source': ['value'],
        'query': {
            'bool': {
                'must': must_terms,
                'should': should_terms,
                'minimum_should_match': 1
            },
        },
        'highlight': {
            'fields': {
                'variants': {
                    'type': 'unified'  # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain'
                }
            },
            'order': 'score',
            'number_of_fragments': 20
        },
        'size': size
    }

if as_json:
data = json.dumps(data)

return data
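
To make the new query shape concrete, here is a hedged sketch of the dictionary the rewritten _generate_es_search_dictionary() would return for a call like entity_name='city', text='book a flight to mumbai', fuzziness_threshold=1. The '_source', 'highlight', 'minimum_should_match' and 'size' keys mirror this diff; the entity-name term clause and the inner fields of the 'variants' match clause are collapsed above, so those are assumptions, flagged in the comments.

    from datastore import constants

    example_query_body = {
        '_source': ['value'],
        'query': {
            'bool': {
                'must': [
                    {'term': {'entity_data': 'city'}},          # assumed field name
                    {'terms': {'language_script': ['en']}},
                ],
                'should': [
                    {'match': {'variants': {'query': 'book a flight to mumbai',
                                            'fuzziness': 1}}},  # assumed inner keys
                ],
                'minimum_should_match': 1
            }
        },
        'highlight': {
            'fields': {
                'variants': {'type': 'unified'}
            },
            'order': 'score',
            'number_of_fragments': 20
        },
        'size': constants.ELASTICSEARCH_SEARCH_SIZE
    }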


