Merge pull request #267 from hellohaptik/develop

Develop to Master - June 18, 2019
hellohaptik · Jun 24, 2019 · 07a69f1
2 parents 4574ba3 + 9467465
commit 07a69f1
Show file tree

Hide file tree

Showing 8 changed files with 383 additions and 53 deletions.
diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py
@@ -94,6 +94,24 @@ def __getitem__(self, item):
 
 
 TEST_RUNNER = 'django_nose.NoseTestSuiteRunner'
+NOSE_ARGS = [
+    '--nocapture',
+    '--nologcapture',
+    '--verbosity=3',
+    '--ignore-files=urls.py',
+    '--ignore-files=wsgi.py',
+    '--ignore-files=manage.py',
+    '--ignore-files=initial_setup.py',
+    '--ignore-files=__init__.py',
+    '--ignore-files=const.py',
+    '--ignore-files=constant.py',
+    '--ignore-files=constants.py',
+    '--ignore-files=start_server.sh',
+    '--ignore-files=settings.py',
+    '--exclude-dir=docs/',
+    '--exclude-dir=docker/',
+    '--exclude-dir=data/',
+]
 
 # Internationalization
 # https://docs.djangoproject.com/en/1.11/topics/i18n/

diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py
@@ -35,6 +35,13 @@
     url(r'^v2/phone_number/$', api_v2.phone_number),
     url(r'^v2/number_range/$', api_v2.number_range),
 
+    # V2 bulk detectors
+    url(r'^v2/date_bulk/$', api_v2.date),
+    url(r'^v2/time_bulk/$', api_v2.time),
+    url(r'^v2/number_bulk/$', api_v2.number),
+    url(r'^v2/number_range_bulk/$', api_v2.number_range),
+    url(r'^v2/phone_number_bulk/$', api_v2.phone_number),
+
     # Dictionary Read Write
     url(r'^entities/get_entity_word_variants', external_api.get_entity_word_variants),
     url(r'^entities/update_dictionary', external_api.update_dictionary),

diff --git a/models/crf_v2/README.md b/models/crf_v2/README.md
@@ -53,7 +53,7 @@ file_handler = open('glove_vectors', 'wb')
 pickle.dump(obj=word_vectors.wv.vectors, file=file_handler, protocol=2)
 
 if not os.path.exists('/app/models_crf/'):
-os.makedirs('/app/models_crf/')
+    os.makedirs('/app/models_crf/')
 
 ```
 
@@ -167,7 +167,7 @@ The module is used to take input as the sentence_list and entity_list and conver
     ```python
     from models.crf_v2.crf_preprocess_data import CrfPreprocessData
     docs['word_embeddings'] = 
-    CrfPreprocessData.word_embeddings(processed_pos_tag_data=each,
+    [CrfPreprocessData.word_embeddings(processed_pos_tag_data=each,
     vocab=vocab, word_vectors=word_vectors) 
     for each in docs[SENTENCE_LIST]]
 	```

diff --git a/ner_v1/detectors/numeral/budget/budget_detection.py b/ner_v1/detectors/numeral/budget/budget_detection.py
@@ -77,6 +77,24 @@ class BudgetDetector(BaseDetector):
 
     """
 
+    _scale_patterns = {
+        'k': 1000,
+        'ha?zaa?r': 1000,
+        'ha?ja?ar': 1000,
+        'thousa?nd': 1000,
+        'l': 100000,
+        'lacs?': 100000,
+        'lakh?s?': 100000,
+        'lakhs': 100000,
+        'm': 1000000,
+        'mn': 1000000,
+        'million': 1000000,
+        'mill?': 1000000,
+        'c': 10000000,
+        'cro?': 10000000,
+        'crore?s?': 10000000,
+    }
+
     def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False,
                  use_text_detection=False):
         """Initializes a BudgetDetector object
@@ -101,18 +119,10 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation
         self.tag = '__' + self.entity_name + '__'
         self._use_text_detection = use_text_detection
 
-        self._allowed_units = [
-            (['k', 'ha?zaa?r', 'ha?ja?ar', 'thousa?nd'], 1000),
-            (['l', 'lacs?', 'lakh?s?', 'lakhs'], 100000),
-            (['m', 'mn', 'million', 'mill?'], 1000000),
-            (['c', 'cro?', 'crore?s?'], 10000000),
-        ]
-
-        units = []
-        for _units, scale in self._allowed_units:
-            units.extend(_units)
-        units.sort(key=lambda unit: len(unit), reverse=True)
-
+        units, scales = zip(*sorted(
+            list(BudgetDetector._scale_patterns.items()), key=lambda pattern_scale: len(pattern_scale[0]), reverse=True
+        ))
+        self._scale_compiled_patterns = [(scale, re.compile(unit)) for scale, unit in zip(scales, units)]
         digits_pattern = r'((?:\d+(?:\,\d+)*(?:\.\d+)?)|(?:(?:\d+(?:\,\d+)*)?(?:\.\d+)))'
         units_pattern = r'({})?'.format('|'.join(units))
         self._budget_pattern = r'(?:rs\.|rs|rupees|rupee)?' \
@@ -121,8 +131,8 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation
 
     def get_scale(self, unit):
         if unit:
-            for _units, scale in self._allowed_units:
-                if re.search('|'.join(_units), unit):
+            for scale, pattern in self._scale_compiled_patterns:
+                if pattern.search(unit):
                     return scale
 
         return 1

diff --git a/ner_v1/tests/numeral/budget/test_budget_detection.py b/ner_v1/tests/numeral/budget/test_budget_detection.py
@@ -10,7 +10,8 @@ def setUp(self):
         self.budget_detector = BudgetDetector(entity_name='budget')
         self.budget_detector.set_min_max_digits(min_digit=1, max_digit=15)
 
-    def make_budget_dict(self, min_budget=0, max_budget=0):
+    @staticmethod
+    def make_budget_dict(min_budget=0, max_budget=0):
         return {'min_budget': min_budget, 'max_budget': max_budget, 'type': 'normal_budget'}
 
     def test_min_max_digits_limits(self):
@@ -118,6 +119,9 @@ def test_not_budgets(self):
             self.assertEqual(original_texts, [])
 
     def test_budgets_without_scales(self):
+        """
+        Test budgets without scales
+        """
         tests = [
             ('I want to buy 5 liters of milk', 0, 5, '5'),
             ('the insect is 120 millimeters tall', 0, 120, '120'),
@@ -128,3 +132,39 @@ def test_budgets_without_scales(self):
             budget_dicts, original_texts = self.budget_detector.detect_entity(text=test)
             self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)])
             self.assertEqual(original_texts, [original_text])
+
+    def test_all_budget_scales(self):
+        """
+        Test all supported budget scales
+        """
+        tests = [
+            ('2k', 0, 2000, '2k'),
+            ('2 thousand', 0, 2000, '2 thousand'),
+            ('2 hazar', 0, 2000, '2 hazar'),
+            ('2 hazaar', 0, 2000, '2 hazaar'),
+            ('2 hajar', 0, 2000, '2 hajar'),
+            ('2 hajaar', 0, 2000, '2 hajaar'),
+            ('2l', 0, 200000, '2l'),
+            ('2 lac', 0, 200000, '2 lac'),
+            ('2 lacs', 0, 200000, '2 lacs'),
+            ('2 lak', 0, 200000, '2 lak'),
+            ('2 laks', 0, 200000, '2 laks'),
+            ('2 lakh', 0, 200000, '2 lakh'),
+            ('2 lakhs', 0, 200000, '2 lakhs'),
+            ('2m', 0, 2000000, '2m'),
+            ('2mn', 0, 2000000, '2mn'),
+            ('2 mil', 0, 2000000, '2 mil'),
+            ('2 mill', 0, 2000000, '2 mill'),
+            ('2 million', 0, 2000000, '2 million'),
+            ('2c', 0, 20000000, '2c'),
+            ('2 cr', 0, 20000000, '2 cr'),
+            ('2 cro', 0, 20000000, '2 cro'),
+            ('2 cror', 0, 20000000, '2 cror'),
+            ('2 crore', 0, 20000000, '2 crore'),
+            ('2 crores', 0, 20000000, '2 crores'),
+        ]
+
+        for test, min_budget, max_budget, original_text in tests:
+            budget_dicts, original_texts = self.budget_detector.detect_entity(text=test)
+            self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)])
+            self.assertEqual(original_texts, [original_text])