Merge pull request #248 from hellohaptik/develop

Develop to Master 2nd May
hellohaptik · May 2, 2019 · 72a20a4 · 72a20a4
2 parents c7d173c + 808e85a
commit 72a20a4
Show file tree

Hide file tree

Showing 14 changed files with 206 additions and 120 deletions.
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -19,3 +19,5 @@ categories:
     label: packages-updated
   - title: 👺 Miscellaneous 
     label: miscellaneous
+exclude-labels:
+  - miscellaneous
diff --git a/datastore/constants.py b/datastore/constants.py
@@ -7,6 +7,7 @@
 ELASTICSEARCH = 'elasticsearch'
 ELASTICSEARCH_SEARCH_SIZE = ES_SEARCH_SIZE
 ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE = ES_BULK_MSG_SIZE
+ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000
 
 # settings dictionary key constants
 ENGINE = 'engine'

diff --git a/datastore/datastore.py b/datastore/datastore.py
@@ -1,10 +1,15 @@
-import elastic_search
+from __future__ import absolute_import
+
+import warnings
+
 from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE
+from datastore import elastic_search
+from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
+                                 ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME,
+                                 ELASTICSEARCH_CRF_DATA_DOC_TYPE)
+from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
+                                  EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 from lib.singleton import Singleton
-from .constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY,
-                        ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE)
-from .exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
-                         EngineConnectionException, NonESEngineTransferException, IndexNotFoundException)
 
 
 class DataStore(object):
@@ -120,6 +125,7 @@ def create(self, **kwargs):
                     **kwargs
                 )
 
+    # FIXME: populate does not consider language of the variants
     def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and
@@ -181,6 +187,7 @@ def delete(self, **kwargs):
                                                ignore=[400, 404],
                                                **kwargs)
 
+    # FIXME: Deprecated, remove
     def get_entity_dictionary(self, entity_name, **kwargs):
         """
         Args:
@@ -214,6 +221,7 @@ def get_entity_dictionary(self, entity_name, **kwargs):
                 ...
                 u'koramangala': [u'koramangala']}
         """
+        warnings.warn("get_entity_dictionary() is deprecated; Please use get_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
         results_dictionary = {}
@@ -308,6 +316,7 @@ def delete_entity(self, entity_name, **kwargs):
                                                           ignore=[400, 404],
                                                           **kwargs)
 
+    # FIXME: repopulate does not consider language of the variants
     def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs):
         """
         Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by
@@ -378,6 +387,7 @@ def exists(self):
 
         return False
 
+    # FIXME: Deprecated, remove
     def update_entity_data(self, entity_name, entity_data, language_script, **kwargs):
         """
         This method is used to populate the entity dictionary
@@ -389,6 +399,7 @@ def update_entity_data(self, entity_name, entity_data, language_script, **kwargs
                 For Elasticsearch:
                 Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk
         """
+        warnings.warn("update_entity_data() is deprecated; Please use add_entity_data()", DeprecationWarning)
         if self._client_or_connection is None:
             self._connect()
 

diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py
@@ -2,4 +2,4 @@
 import create
 import populate
 import query
-import transfer
+import transfer
diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py
@@ -1,21 +1,24 @@
 from __future__ import absolute_import
 
+import collections
 # std imports
 import copy
-from six import string_types
+import json
 import re
-import collections
+import warnings
+
+from six import string_types
 
 # Local imports
 from datastore import constants
 from external_api.constants import SENTENCE_LIST, ENTITY_LIST
 from language_utilities.constant import ENGLISH_LANG
 from lib.nlp.const import TOKENIZER
-import json
 
 log_prefix = 'datastore.elastic_search.query'
 
 
+# Deprecated
 def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
     """
     Get all variants data for an entity stored in the index as a dictionary
@@ -32,6 +35,7 @@ def dictionary_query(connection, index_name, doc_type, entity_name, **kwargs):
         dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing
         synonyms/variants of the key
     """
+    warnings.warn("dictionary_query() is deprecated; Please use get_entity_data()", DeprecationWarning)
     results_dictionary = {}
     data = {
         'query': {
@@ -197,7 +201,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu
             "unique_values": {
                 "terms": {
                     "field": "value.keyword",
-                    "size": 300000
+                    "size": constants.ELASTICSEARCH_VALUES_SEARCH_SIZE,
                 }
             }
         },
@@ -283,12 +287,15 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentences, fu
          u'mumbai': u'mumbai',
          u'pune': u'pune'}
     """
-    index = {'index': index_name, 'type': doc_type}
+    index_header = json.dumps({'index': index_name, 'type': doc_type})
     data = []
-    for sentence_ in sentences:
-        query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold,
+    for sentence in sentences:
+        query = _generate_es_search_dictionary(entity_name=entity_name,
+                                               text=sentence,
+                                               fuzziness_threshold=fuzziness_threshold,
                                                language_script=search_language_script)
-        data.extend([json.dumps(index), json.dumps(query)])
+        data.append(index_header)
+        data.append(json.dumps(query))
     data = '\n'.join(data)
 
     kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name)
@@ -359,17 +366,25 @@ def _get_dynamic_fuzziness_threshold(fuzzy_setting):
     return fuzzy_setting
 
 
-def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, language_script=None):
+def _generate_es_search_dictionary(entity_name, text,
+                                   fuzziness_threshold=1,
+                                   language_script=ENGLISH_LANG,
+                                   size=constants.ELASTICSEARCH_SEARCH_SIZE,
+                                   as_json=False):
     """
     Generates compound elasticsearch boolean search query dictionary for the sentence. The query generated
     searches for entity_name in the index and returns search results for the matched word (of sentence)
      only if entity_name is found.
 
     Args:
-        entity_name: name of the entity to perform a 'term' query on
-        text: The text on which we need to identify the enitites.
-        fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter
-        language_script: language of documents to be searched, optional, defaults to None
+        entity_name (str): name of the entity to perform a 'term' query on
+        text (str): The text on which we need to identify the entities.
+        fuzziness_threshold (int, optional): fuzziness_threshold for elasticsearch match query 'fuzziness' parameter.
+            Defaults to 1
+        language_script (str, optional): language of documents to be searched, optional, defaults to 'en'
+        size (int, optional): number of records to return, defaults to `ELASTICSEARCH_SEARCH_SIZE`
+        as_json (bool, optional): Return the generated query as a JSON string instead of a dictionary. Useful for debugging.
+            Defaults to False
 
     Returns:
         dictionary, the search query for the text
@@ -386,24 +401,18 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
     must_terms.append(term_dict_entity_name)
 
     # search on language_script, add english as default search
-    if language_script is not None:
-        term_dict_language = {
-            'terms': {
-                'language_script': [language_script, ENGLISH_LANG]
-            }
+    term_dict_language = {
+        'terms': {
+            'language_script': [ENGLISH_LANG]
         }
-        must_terms.append(term_dict_language)
-
-    data = {
-        'query': {
-            'bool': {
-                'must': must_terms,
-                'should': [],
-                'minimum_should_match': 1
-            }
-        }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE
     }
-    query_should_data = []
+
+    if language_script != ENGLISH_LANG:
+        term_dict_language['terms']['language_script'].append(language_script)
+
+    must_terms.append(term_dict_language)
+
+    should_terms = []
     query = {
         'match': {
             'variants': {
@@ -413,15 +422,32 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu
             }
         }
     }
-    query_should_data.append(query)
-    data['query']['bool']['should'] = query_should_data
-    data['highlight'] = {
-        'fields': {
-            'variants': {}
+    should_terms.append(query)
+
+    data = {
+        '_source': ['value'],
+        'query': {
+            'bool': {
+                'must': must_terms,
+                'should': should_terms,
+                'minimum_should_match': 1
+            },
         },
-        'order': 'score',
-        'number_of_fragments': 20
+        'highlight': {
+            'fields': {
+                'variants': {
+                    'type': 'unified'  # experimental in 5.x, default in 6.x and 7.x. Faster than 'plain'
+                }
+            },
+            'order': 'score',
+            'number_of_fragments': 20
+        },
+        'size': size
     }
+
+    if as_json:
+        data = json.dumps(data)
+
     return data
 
 

diff --git a/ner_v2/detectors/numeral/number/hi/data/units.csv b/ner_v2/detectors/numeral/number/hi/data/units.csv
@@ -1,3 +1,9 @@
 unit_type,unit_value,unit_variants
 currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | रूपीस | रुपया | रूपए| पैसा| पैसे| ₹  
 currency,dollar,Dollar | usd | डॉलर | $
+package_metric_unit,mg,mg | milligram | milligrams | mgs | मिलीग्राम | मिलिग्राम | मिल्लीग्राम | मिलीग्राम्स | मिल्लीग्राम्स | मिलिग्रामस
+package_metric_unit,gms,gms | grams | gram | gm | g | ग्राम | ग्राम्स
+package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग्राम | किलोग्राम्स | किलो
+package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर
+package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर
+package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा
diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py
@@ -1,8 +1,9 @@
 import importlib
+import math
 import os
 
-from ner_v2.detectors.base_detector import BaseDetector
 from language_utilities.constant import ENGLISH_LANG
+from ner_v2.detectors.base_detector import BaseDetector
 from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT
 from ner_v2.detectors.utils import get_lang_data_path
 
@@ -50,6 +51,7 @@ class NumberDetector(BaseDetector):
         max_digit: maximum digit that a number can take
 
     """
+
     @staticmethod
     def get_supported_languages():
         """
@@ -136,7 +138,7 @@ def detect_entity(self, text, **kwargs):
         for number_value_dict, original_text in zip(number_data[0], number_data[1]):
             number_value = number_value_dict[NUMBER_DETECTION_RETURN_DICT_VALUE]
             number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT]
-            if self.min_digit <= len(number_value) <= self.max_digit:
+            if self.min_digit <= self._num_digits(number_value) <= self.max_digit:
                 if self.unit_type and (number_unit is None or
                                        self.language_number_detector.units_map[number_unit].type != self.unit_type):
                     continue
@@ -165,3 +167,20 @@ def set_min_max_digits(self, min_digit, max_digit):
         """
         self.min_digit = min_digit
         self.max_digit = max_digit
+
+    @staticmethod
+    def _num_digits(value):
+        """
+        Calculate the number of digits in the integer part of the given number, ignoring its sign
+
+        The digits are counted on the exact truncated integer rather than via `math.log10` on a float, because
+        float rounding overshoots just below a power of ten (e.g. '9999999999999999' parses to 1e16 and would be
+        reported as 17 digits instead of 16)
+
+        Args:
+            value (str or float or int): number (or its string form) whose digits are to be counted
+
+        Returns:
+            int: number of digits in the integer part of the given number; 1 when the integer part is 0
+
+        Raises:
+            ValueError: if the given string cannot be cast to float
+        """
+        try:
+            # ints and integer-like strings are counted exactly, without a lossy round trip through float
+            whole = abs(int(value))
+        except ValueError:
+            # int() rejects decimal/exponent strings such as '12.5' or '1e3'; truncate those via float instead
+            whole = int(abs(float(value)))
+        return len(str(whole))