Skip to content

Commit

Permalink
Merge pull request #267 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master - June 18, 2019
  • Loading branch information
prik2693 authored Jun 24, 2019
2 parents 4574ba3 + 9467465 commit 07a69f1
Show file tree
Hide file tree
Showing 8 changed files with 383 additions and 53 deletions.
18 changes: 18 additions & 0 deletions chatbot_ner/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,24 @@ def __getitem__(self, item):


# Run the test suite through the django-nose runner class named below.
TEST_RUNNER = 'django_nose.NoseTestSuiteRunner'
# Command line style options for the nose based test runner configured above.
NOSE_ARGS = [
# Output options: do not capture stdout or log output, and report verbosely.
'--nocapture',
'--nologcapture',
'--verbosity=3',
# Files that should not be collected as tests (entry points, setup scripts, constants, settings).
# NOTE(review): nose appears to treat --ignore-files values as regular expressions, so the
# unescaped '.' would match any character - confirm this is acceptable.
'--ignore-files=urls.py',
'--ignore-files=wsgi.py',
'--ignore-files=manage.py',
'--ignore-files=initial_setup.py',
'--ignore-files=__init__.py',
'--ignore-files=const.py',
'--ignore-files=constant.py',
'--ignore-files=constants.py',
'--ignore-files=start_server.sh',
'--ignore-files=settings.py',
# Directories left out of test discovery.
# NOTE(review): --exclude-dir looks like the nose-exclude plugin option - confirm the plugin is installed.
'--exclude-dir=docs/',
'--exclude-dir=docker/',
'--exclude-dir=data/',
]

# Internationalization
# https://docs.djangoproject.com/en/1.11/topics/i18n/
Expand Down
7 changes: 7 additions & 0 deletions chatbot_ner/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@
url(r'^v2/phone_number/$', api_v2.phone_number),
url(r'^v2/number_range/$', api_v2.number_range),

# V2 bulk detectors
url(r'^v2/date_bulk/$', api_v2.date),
url(r'^v2/time_bulk/$', api_v2.time),
url(r'^v2/number_bulk/$', api_v2.number),
url(r'^v2/number_range_bulk/$', api_v2.number_range),
url(r'^v2/phone_number_bulk/$', api_v2.phone_number),

# Dictionary Read Write
url(r'^entities/get_entity_word_variants', external_api.get_entity_word_variants),
url(r'^entities/update_dictionary', external_api.update_dictionary),
Expand Down
4 changes: 2 additions & 2 deletions models/crf_v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ file_handler = open('glove_vectors', 'wb')
pickle.dump(obj=word_vectors.wv.vectors, file=file_handler, protocol=2)

if not os.path.exists('/app/models_crf/'):
os.makedirs('/app/models_crf/')
os.makedirs('/app/models_crf/')

```

Expand Down Expand Up @@ -167,7 +167,7 @@ The module is used to take input as the sentence_list and entity_list and conver
```python
from models.crf_v2.crf_preprocess_data import CrfPreprocessData
docs['word_embeddings'] =
CrfPreprocessData.word_embeddings(processed_pos_tag_data=each,
[CrfPreprocessData.word_embeddings(processed_pos_tag_data=each,
vocab=vocab, word_vectors=word_vectors)
for each in docs[SENTENCE_LIST]]
```
Expand Down
38 changes: 24 additions & 14 deletions ner_v1/detectors/numeral/budget/budget_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,24 @@ class BudgetDetector(BaseDetector):
"""

# Maps a regex fragment for a unit word to the multiplier used for that unit.
# __init__ sorts these fragments by length (longest first), compiles them and joins them
# into the units part of the budget pattern.
_scale_patterns = {
# thousand
'k': 1000,
'ha?zaa?r': 1000,
'ha?ja?ar': 1000,
'thousa?nd': 1000,
# lakh
'l': 100000,
'lacs?': 100000,
'lakh?s?': 100000,
# NOTE(review): 'lakhs' looks redundant, 'lakh?s?' above already matches it - confirm before removing
'lakhs': 100000,
# million
'm': 1000000,
'mn': 1000000,
'million': 1000000,
'mill?': 1000000,
# crore
'c': 10000000,
'cro?': 10000000,
'crore?s?': 10000000,
}

def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False,
use_text_detection=False):
"""Initializes a BudgetDetector object
Expand All @@ -101,18 +119,10 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation
self.tag = '__' + self.entity_name + '__'
self._use_text_detection = use_text_detection

self._allowed_units = [
(['k', 'ha?zaa?r', 'ha?ja?ar', 'thousa?nd'], 1000),
(['l', 'lacs?', 'lakh?s?', 'lakhs'], 100000),
(['m', 'mn', 'million', 'mill?'], 1000000),
(['c', 'cro?', 'crore?s?'], 10000000),
]

units = []
for _units, scale in self._allowed_units:
units.extend(_units)
units.sort(key=lambda unit: len(unit), reverse=True)

units, scales = zip(*sorted(
list(BudgetDetector._scale_patterns.items()), key=lambda pattern_scale: len(pattern_scale[0]), reverse=True
))
self._scale_compiled_patterns = [(scale, re.compile(unit)) for scale, unit in zip(scales, units)]
digits_pattern = r'((?:\d+(?:\,\d+)*(?:\.\d+)?)|(?:(?:\d+(?:\,\d+)*)?(?:\.\d+)))'
units_pattern = r'({})?'.format('|'.join(units))
self._budget_pattern = r'(?:rs\.|rs|rupees|rupee)?' \
Expand All @@ -121,8 +131,8 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation

def get_scale(self, unit):
    """
    Get the multiplier for the unit text found next to a number

    Args:
        unit (str or None): unit text captured along with the number, e.g. 'k' or 'lakh'

    Returns:
        int: scale paired with the first pattern in self._scale_compiled_patterns that is
            found in unit, 1 if unit is empty/None or no pattern is found
    """
    if not unit:
        return 1

    # Patterns are tried in the order they are stored; the first one found in unit wins
    for scale, pattern in self._scale_compiled_patterns:
        if pattern.search(unit):
            return scale

    return 1
Expand Down
42 changes: 41 additions & 1 deletion ner_v1/tests/numeral/budget/test_budget_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def setUp(self):
self.budget_detector = BudgetDetector(entity_name='budget')
self.budget_detector.set_min_max_digits(min_digit=1, max_digit=15)

def make_budget_dict(self, min_budget=0, max_budget=0):
@staticmethod
def make_budget_dict(min_budget=0, max_budget=0):
return {'min_budget': min_budget, 'max_budget': max_budget, 'type': 'normal_budget'}

def test_min_max_digits_limits(self):
Expand Down Expand Up @@ -118,6 +119,9 @@ def test_not_budgets(self):
self.assertEqual(original_texts, [])

def test_budgets_without_scales(self):
"""
Test budgets without scales
"""
tests = [
('I want to buy 5 liters of milk', 0, 5, '5'),
('the insect is 120 millimeters tall', 0, 120, '120'),
Expand All @@ -128,3 +132,39 @@ def test_budgets_without_scales(self):
budget_dicts, original_texts = self.budget_detector.detect_entity(text=test)
self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)])
self.assertEqual(original_texts, [original_text])

def test_all_budget_scales(self):
    """
    Test all supported budget scales
    """
    # Each case: (text, expected min_budget, expected max_budget, expected original text)
    tests = [
        ('2k', 0, 2000, '2k'),
        ('2 thousand', 0, 2000, '2 thousand'),
        ('2 hazar', 0, 2000, '2 hazar'),
        ('2 hazaar', 0, 2000, '2 hazaar'),
        ('2 hajar', 0, 2000, '2 hajar'),
        ('2 hajaar', 0, 2000, '2 hajaar'),
        ('2l', 0, 200000, '2l'),
        ('2 lac', 0, 200000, '2 lac'),
        ('2 lacs', 0, 200000, '2 lacs'),
        ('2 lak', 0, 200000, '2 lak'),
        ('2 laks', 0, 200000, '2 laks'),
        ('2 lakh', 0, 200000, '2 lakh'),
        ('2 lakhs', 0, 200000, '2 lakhs'),
        ('2m', 0, 2000000, '2m'),
        ('2mn', 0, 2000000, '2mn'),
        ('2 mil', 0, 2000000, '2 mil'),
        ('2 mill', 0, 2000000, '2 mill'),
        ('2 million', 0, 2000000, '2 million'),
        ('2c', 0, 20000000, '2c'),
        ('2 cr', 0, 20000000, '2 cr'),
        ('2 cro', 0, 20000000, '2 cro'),
        ('2 cror', 0, 20000000, '2 cror'),
        ('2 crore', 0, 20000000, '2 crore'),
        ('2 crores', 0, 20000000, '2 crores'),
    ]

    for test, min_budget, max_budget, original_text in tests:
        budget_dicts, original_texts = self.budget_detector.detect_entity(text=test)
        # Use both expected bounds from the case so the min_budget column is actually checked
        expected_budget = self.make_budget_dict(min_budget=min_budget, max_budget=max_budget)
        self.assertEqual(budget_dicts, [expected_budget])
        self.assertEqual(original_texts, [original_text])
Loading

0 comments on commit 07a69f1

Please sign in to comment.