Skip to content

Commit

Permalink
Merge pull request #427 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master 2021-04-29T19:39:00
  • Loading branch information
chiragjn authored May 5, 2021
2 parents ffe639a + 44c38fa commit f6e72ed
Show file tree
Hide file tree
Showing 10 changed files with 253 additions and 164 deletions.
14 changes: 4 additions & 10 deletions chatbot_ner/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,9 @@
ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE', 'data_dictionary')
ES_AUTH_NAME = os.environ.get('ES_AUTH_NAME')
ES_AUTH_PASSWORD = os.environ.get('ES_AUTH_PASSWORD')
ES_BULK_MSG_SIZE = os.environ.get('ES_BULK_MSG_SIZE', '10000')
ES_SEARCH_SIZE = os.environ.get('ES_SEARCH_SIZE', '10000')

try:
ES_BULK_MSG_SIZE = int(ES_BULK_MSG_SIZE)
ES_SEARCH_SIZE = int(ES_SEARCH_SIZE)
except ValueError:
ES_BULK_MSG_SIZE = 1000
ES_SEARCH_SIZE = 1000
ES_BULK_MSG_SIZE = int((os.environ.get('ES_BULK_MSG_SIZE') or '').strip() or '1000')
ES_SEARCH_SIZE = int((os.environ.get('ES_SEARCH_SIZE') or '').strip() or '1000')
ES_REQUEST_TIMEOUT = int((os.environ.get('ES_REQUEST_TIMEOUT') or '').strip() or '20')

ELASTICSEARCH_CRF_DATA_INDEX_NAME = os.environ.get('ELASTICSEARCH_CRF_DATA_INDEX_NAME')
ELASTICSEARCH_CRF_DATA_DOC_TYPE = os.environ.get('ELASTICSEARCH_CRF_DATA_DOC_TYPE')
Expand Down Expand Up @@ -91,7 +85,7 @@
'retry_on_timeout': False,
'max_retries': 1,
'timeout': 20,
'request_timeout': 20,
'request_timeout': ES_REQUEST_TIMEOUT,

# Transfer Specific constants (ignore if only one elasticsearch is setup)
# For a detailed explanation, see datastore.elastic_search.transfer.py
Expand Down
4 changes: 2 additions & 2 deletions chatbot_ner/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
url(r'^v2/number_range_bulk/$', api_v2.number_range),
url(r'^v2/phone_number_bulk/$', api_v2.phone_number),

# Dictionary Read Write
# Deprecated dictionary read write, use entities/data/v1/*
url(r'^entities/get_entity_word_variants', external_api.get_entity_word_variants),
url(r'^entities/update_dictionary', external_api.update_dictionary),

Expand All @@ -54,7 +54,7 @@
url(r'^entities/get_crf_training_data', external_api.get_crf_training_data),
url(r'^entities/update_crf_training_data', external_api.update_crf_training_data),

# Train Crf Model
# Deprecated train crf model
url(r'^entities/train_crf_model', external_api.train_crf_model),

url(r'^entities/languages/v1/(?P<entity_name>.+)$', external_api.entity_language_view),
Expand Down
Empty file removed datastore_exceptions.py
Empty file.
15 changes: 7 additions & 8 deletions external_api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_entity_word_variants(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand Down Expand Up @@ -92,7 +92,7 @@ def update_dictionary(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand Down Expand Up @@ -125,7 +125,7 @@ def transfer_entities(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand Down Expand Up @@ -166,7 +166,7 @@ def get_crf_training_data(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand All @@ -193,8 +193,7 @@ def update_crf_training_data(request):
external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA))
sentences = external_api_data.get(SENTENCES)
entity_name = external_api_data.get(ENTITY_NAME)
DataStore().update_entity_crf_data(entity_name=entity_name,
sentences=sentences)
DataStore().update_entity_crf_data(entity_name=entity_name, sentences=sentences)
response['success'] = True

except (DataStoreSettingsImproperlyConfiguredException,
Expand All @@ -204,7 +203,7 @@ def update_crf_training_data(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand Down Expand Up @@ -257,7 +256,7 @@ def train_crf_model(request):
ner_logger.exception('Error: %s' % error_message)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)

except BaseException as e:
except Exception as e:
response['error'] = str(e)
ner_logger.exception('Error: %s' % e)
return HttpResponse(json.dumps(response), content_type='application/json', status=500)
Expand Down
22 changes: 17 additions & 5 deletions lib/nlp/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
from __future__ import absolute_import
import re

from chatbot_ner.config import ner_logger

try:
import regex as re

_re_flags = re.UNICODE | re.V1 | re.WORD

except ImportError:
ner_logger.warning('Error importing `regex` lib, falling back to stdlib re')
import re

_re_flags = re.UNICODE

import nltk
import regex
import six

# constants
from lib.singleton import Singleton
import six

NLTK_TOKENIZER = 'WORD_TOKENIZER'
PRELOADED_NLTK_TOKENIZER = 'PRELOADED_NLTK_TOKENIZER'
Expand Down Expand Up @@ -52,7 +63,8 @@ def __lucene_standard_tokenizer(self):
Tokenizer that mimics Elasticsearch/Lucene's standard tokenizer
Uses word boundaries defined in Unicode Annex 29
"""
words_pattern = regex.compile(r'\w(?:\B\S)*', flags=regex.V1 | regex.WORD | regex.UNICODE)

words_pattern = re.compile(r'\w(?:\B\S)*', flags=_re_flags)

def word_tokenize(text):
return words_pattern.findall(text)
Expand Down
16 changes: 14 additions & 2 deletions ner_v1/detectors/pattern/regex/regex_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@
import regex as re

_re_flags = re.UNICODE | re.V1 | re.WORD
_regex_available = True

except ImportError:
ner_logger.warning('Error importing `regex` lib, falling back to stdlib re')
import re

_re_flags = re.UNICODE
_regex_available = False


class RegexDetector(object):
Expand All @@ -45,7 +47,7 @@ def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50)
entity_name (str): an indicator value as tag to replace detected values
pattern (raw str or str or unicode): pattern to be compiled into a re object
re_flags (int): flags to pass to re.compile.
Defaults to regex.V1 | regex.WORD | regex.UNICODE. for regex lib to re.U for stdlib re and
Defaults to `regex.U | regex.V1 | regex.WORD` for `regex` lib and `re.U` for stdlib `re`
max_matches (int): maximum number of matches to consider.
Raises:
Expand All @@ -55,7 +57,17 @@ def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50)
self.text = ''
self.tagged_text = ''
self.processed_text = ''
self.pattern = re.compile(pattern, re_flags)
try:
self.pattern = re.compile(pattern, flags=re_flags)
except re.error:
# In very rare cases it is possible we encounter a pattern that is invalid for V1 engine but works just
# fine on V0 engine/Python's built in re. E.g. nested character sets '[[]]'
if _regex_available and (re_flags & re.V1):
re_flags = (re_flags ^ re.V1) | re.V0
self.pattern = re.compile(pattern, flags=re_flags)
ner_logger.warning(f'Failed to compile `{pattern}` with regex.V1, falling back to regex.V0')
else:
raise
self.max_matches = max_matches
self.tag = '__' + self.entity_name + '__'

Expand Down
45 changes: 44 additions & 1 deletion ner_v1/tests/pattern/regex/test_regex_detection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
from __future__ import absolute_import

import re
import unittest

try:
import regex as re

_regex_available = True

except ImportError:
import re

_regex_available = False

from django.test import TestCase

Expand Down Expand Up @@ -98,3 +108,36 @@ def test_dot_star(self):
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

def test_invalid_pattern_compile(self):
    """A syntactically broken pattern must surface re.error from the detector."""
    broken_pattern = '(invalid!'
    with self.assertRaises(re.error):
        RegexDetector(entity_name='test', pattern=broken_pattern)

@unittest.skipIf(not _regex_available, 'skipping test because `regex` lib is not available')
def test_nested_character_group_compile(self):
    """Test compiling patterns that fail with regex.V1 but work with regex.V0"""
    # Each case: (pattern, input text, expected values == expected original texts).
    # Both patterns are invalid under the regex lib's V1 engine (nested character
    # sets), so the detector is expected to fall back to the V0 engine.
    cases = [
        (
            '[[\\]]',
            'this pattern should extract box brackets [] [][[[ ]]]]]',
            ['[', ']', '[', ']', '[', '[', '[', ']', ']', ']', ']', ']'],
        ),
        (
            '[[]]',
            'this pattern should extract box brackets pairs [] [][[[ ]]]]]',
            ['[]', '[]'],
        ),
    ]
    for pattern, text, expected in cases:
        detector = RegexDetector(entity_name='test',
                                 re_flags=RegexDetector.DEFAULT_FLAGS,
                                 pattern=pattern)
        # The compiled pattern must have V1 cleared and V0 set after the fallback.
        self.assertEqual(detector.pattern.flags & re.V1, 0)
        self.assertNotEqual(detector.pattern.flags & re.V0, 0)
        values, original_texts = detector.detect_entity(text)
        self.assertEqual(values, expected)
        self.assertEqual(original_texts, expected)
14 changes: 11 additions & 3 deletions ner_v2/detectors/pattern/phone_number/phone_number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@

import re

try:
import regex

_regex_available = True
except ImportError:
_regex_available = False

import phonenumbers
import regex
from six.moves import zip

from language_utilities.constant import ENGLISH_LANG
Expand Down Expand Up @@ -35,6 +41,10 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None):
super(PhoneDetector, self).__init__(language, locale)
self.language = language
self.locale = locale or 'en-IN'
if _regex_available:
# This will replace all types of dashes(em or en) by hyphen.
self.locale = regex.sub('\\p{Pd}', '-', self.locale)

self.text = ''
self.phone, self.original_phone_text = [], []
self.country_code = self.get_country_code_from_locale()
Expand All @@ -55,8 +65,6 @@ def get_country_code_from_locale(self):
This method sets self.country_code from given locale
"""
regex_pattern = re.compile('[-_](.*$)', re.U)
self.locale = regex.sub("\\p{Pd}", "-",
self.locale) # This will replace all types of dashes(em or en) by hyphen.
match = regex_pattern.findall(self.locale)
if match:
return match[0].upper()
Expand Down
Loading

0 comments on commit f6e72ed

Please sign in to comment.