Skip to content

Commit

Permalink
Merge pull request #248 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master 2nd May
  • Loading branch information
chiragjn authored May 2, 2019
2 parents c7d173c + 808e85a commit 72a20a4
Show file tree
Hide file tree
Showing 14 changed files with 206 additions and 120 deletions.
2 changes: 2 additions & 0 deletions .github/release-drafter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ categories:
label: packages-updated
- title: 👺 Miscellaneous
label: miscellaneous
exclude-labels:
- miscellaneous
1 change: 1 addition & 0 deletions datastore/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
ELASTICSEARCH = 'elasticsearch'
ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000

# settings dictionary key constants
ENGINE = 'engine'
Expand Down
21 changes: 16 additions & 5 deletions datastore/datastore.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import elastic_search
from __future__ import absolute_import

import warnings

from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
from datastore import elastic_search
from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
ELASTICSEARCH_CRF_DATA_DOC_TYPE)
from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
from lib.singleton import Singleton
from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)


class DataStore(object):
Expand Down Expand Up @@ -120,6 +125,7 @@ def create(self, **kwargs):
**kwargs
)

# FIXME: repopulate does not consider language of the variants
def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
"""
Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
Expand Down Expand Up @@ -181,6 +187,7 @@ def delete(self, **kwargs):
ignore=[400, 404],
**kwargs)

# FIXME: Deprecated, remove
def get_entity_dictionary(self, entity_name, **kwargs):
"""
Args:
Expand Down Expand Up @@ -214,6 +221,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
...
u'koramangala': [u'koramangala']}
"""
warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning)
if self._client_or_connection is None:
self._connect()
results_dictionary = {}
Expand Down Expand Up @@ -308,6 +316,7 @@ def delete_entity(self, entity_name, **kwargs):
ignore=[400, 404],
**kwargs)

# FIXME: repopulate does not consider language of the variants
def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
"""
Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
Expand Down Expand Up @@ -378,6 +387,7 @@ def exists(self):

return False

# FIXME: Deprecated, remove
def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
"""
This method is used to populate the entity dictionary
Expand All @@ -389,6 +399,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs
For Elasticsearch:
Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
"""
warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning)
if self._client_or_connection is None:
self._connect()

Expand Down
2 changes: 1 addition & 1 deletion datastore/elastic_search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
import create
import populate
import query
import transfer
import transfer
98 changes: 62 additions & 36 deletions datastore/elastic_search/query.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
from __future__ import absolute_import

import collections
# std imports
import copy
from six import string_types
import json
import re
import collections
import warnings

from six import string_types

# Local imports
from datastore import constants
from external_api.constants import SENTENCE_LIST, ENTITY_LIST
from language_utilities.constant import ENGLISH_LANG
from lib.nlp.const import TOKENIZER
import json

log_prefix = 'datastore.elastic_search.query'


# Deprecated
def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
"""
Get all variants data for an entity stored in the index as a dictionary
Expand All @@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing
synonyms/variants of the key
"""
warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning)
results_dictionary = {}
data = {
'query': {
Expand Down Expand Up @@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
"unique_values": {
"terms": {
"field": "value.keyword",
"size": 300000
"size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
}
}
},
Expand Down Expand Up @@ -283,12 +287,15 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
u'mumbai': u'mumbai',
u'pune': u'pune'}
"""
index = {'index': index_name, 'type': doc_type}
index_header = json.dumps({'index': index_name, 'type': doc_type})
data = []
for sentence_ in sentences:
query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
for sentence in sentences:
query = _generate_es_search_dictionary(entity_name=entity_name,
text=sentence,
fuzziness_threshold=fuzziness_threshold,
language_script=search_language_script)
data.extend([json.dumps(index), json.dumps(query)])
data.append(index_header)
data.append(json.dumps(query))
data = '\n'.join(data)

kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
Expand Down Expand Up @@ -359,17 +366,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
return fuzzy_setting


def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
def _generate_es_search_dictionary(entity_name, text,
fuzziness_threshold=1,
language_script=ENGLISH_LANG,
size=constants.ELASTICSEARCH_SEARCH_SIZE,
as_json=False):
"""
Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
searches for entity_name in the index and returns search results for the matched word (of sentence)
only if entity_name is found.
Args:
entity_name: name of the entity to perform a 'term' query on
text: The text on which we need to identify the enitites.
fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
language_script: language of documents to be searched, optional, defaults to None
entity_name (str): name of the entity to perform a 'term' query on
text (str): The text on which we need to identify the entities.
fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter.
Defaults to 1
language_script (str, optional): language of documents to be searched, optional, defaults to 'en'
size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
as_json (bool, optional): Return the generated query as a JSON string. Useful for debugging purposes.
Defaults to False
Returns:
dictionary, the search query for the text
Expand All @@ -386,24 +401,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
must_terms.append(term_dict_entity_name)

# search on language_script, add english as default search
if language_script is not None:
term_dict_language = {
'terms': {
'language_script': [language_script, ENGLISH_LANG]
}
term_dict_language = {
'terms': {
'language_script': [ENGLISH_LANG]
}
must_terms.append(term_dict_language)

data = {
'query': {
'bool': {
'must': must_terms,
'should': [],
'minimum_should_match': 1
}
}, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
}
query_should_data = []

if language_script != ENGLISH_LANG:
term_dict_language['terms']['language_script'].append(language_script)

must_terms.append(term_dict_language)

should_terms = []
query = {
'match': {
'variants': {
Expand All @@ -413,15 +422,32 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
}
}
}
query_should_data.append(query)
data['query']['bool']['should'] = query_should_data
data['highlight'] = {
'fields': {
'variants': {}
should_terms.append(query)

data = {
'_source': ['value'],
'query': {
'bool': {
'must': must_terms,
'should': should_terms,
'minimum_should_match': 1
},
},
'order': 'score',
'number_of_fragments': 20
'highlight': {
'fields': {
'variants': {
'type': 'unified' # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain'
}
},
'order': 'score',
'number_of_fragments': 20
},
'size': size
}

if as_json:
data = json.dumps(data)

return data


Expand Down
6 changes: 6 additions & 0 deletions ner_v2/detectors/numeral/number/hi/data/units.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
unit_type,unit_value,unit_variants
currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | रूपीस | रुपया | रूपए| पैसा| पैसे| ₹
currency,dollar,Dollar | usd | डॉलर | $
package_metric_unit,mg,mg | milligram | milligrams | mgs | मिलीग्राम | मिलिग्राम | मिल्लीग्राम | मिलीग्राम्स | मिल्लीग्राम्स | मिलिग्रामस
package_metric_unit,gms,gms | grams | gram | gm | g | ग्राम | ग्राम्स
package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग्राम | किलोग्राम्स | किलो
package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर
package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर
package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा
23 changes: 21 additions & 2 deletions ner_v2/detectors/numeral/number/number_detection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import importlib
import math
import os

from ner_v2.detectors.base_detector import BaseDetector
from language_utilities.constant import ENGLISH_LANG
from ner_v2.detectors.base_detector import BaseDetector
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT
from ner_v2.detectors.utils import get_lang_data_path

Expand Down Expand Up @@ -50,6 +51,7 @@ class NumberDetector(BaseDetector):
max_digit: maximum digit that a number can take
"""

@staticmethod
def get_supported_languages():
"""
Expand Down Expand Up @@ -136,7 +138,7 @@ def detect_entity(self, text, **kwargs):
for number_value_dict, original_text in zip(number_data[0], number_data[1]):
number_value = number_value_dict[NUMBER_DETECTION_RETURN_DICT_VALUE]
number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT]
if self.min_digit <= len(number_value) <= self.max_digit:
if self.min_digit <= self._num_digits(number_value) <= self.max_digit:
if self.unit_type and (number_unit is None or
self.language_number_detector.units_map[number_unit].type != self.unit_type):
continue
Expand Down Expand Up @@ -165,3 +167,20 @@ def set_min_max_digits(self, min_digit, max_digit):
"""
self.min_digit = min_digit
self.max_digit = max_digit

@staticmethod
def _num_digits(value):
    """
    Calculate the number of digits in the integer part of the given number, ignoring its sign

    Args:
        value (str or float or int): a number, or the string representation of a number

    Returns:
        int: number of digits in the integer part of the given number; 1 when the integer part is 0

    Raises:
        ValueError: if the given string cannot be cast to float
    """
    try:
        # Exact for ints and integer strings of any size. Going through float() first would round
        # e.g. 9999999999999999 up to 1e16 and over-count by one digit.
        integer_part = int(value)
    except ValueError:
        # Non-integer strings such as "12.5" or "1e3": parse as float, then truncate towards zero.
        # A string that is not a number at all re-raises ValueError from float() here.
        integer_part = int(float(value))
    # Count digits on the exact integer rather than via log10, which can round up for values
    # sitting just below a power of ten.
    return len(str(abs(integer_part)))
Loading

0 comments on commit 72a20a4

Please sign in to comment.