Skip to content

Commit

Permalink
Merge pull request #367 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master
Force Merging, newman part in the C/I is still not setup correctly so it reports a failure
  • Loading branch information
chiragjn authored May 29, 2020
2 parents 92823fe + 92412cd commit e7c3146
Show file tree
Hide file tree
Showing 16 changed files with 152 additions and 69 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ celerybeat-schedule

# dotenv
.env
update_env.sh

# virtualenv
venv/
Expand All @@ -94,7 +95,7 @@ ENV/
.idea/
.realsync

/Dockerfile
/Dockerfile*
/post-merge
/entrypoint.sh
/setup.py
Expand Down
4 changes: 1 addition & 3 deletions chatbot_ner/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
SECRET_KEY = os.environ.get('SECRET_KEY')

# SECURITY WARNING: don't run with debug turned on in production!
_dj_debug = os.environ.get('DJANGO_DEBUG', 'false')
DEBUG = (_dj_debug and _dj_debug.lower() == 'true')
DEBUG = False

TEMPLATE_DEBUG = False

Expand Down Expand Up @@ -107,7 +106,6 @@ def __getitem__(self, item):
'--ignore-files=const.py',
'--ignore-files=constant.py',
'--ignore-files=constants.py',
'--ignore-files=start_server.sh',
'--ignore-files=settings.py',
'--ignore-files=run_postman_tests.py',
'--exclude-dir=docs/',
Expand Down
1 change: 0 additions & 1 deletion config.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

NAME=chatbot_ner
DJANGODIR=/app
DJANGO_DEBUG=False
DJANGO_LOG_LEVEL=DEBUG
DJANGO_SETTINGS_MODULE=chatbot_ner.settings
DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py
Expand Down
2 changes: 0 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ COPY requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir -r /app/requirements.txt

# From start_server.sh

ENV NAME="chatbot_ner"
ENV DJANGODIR=/app
ENV NUM_WORKERS=4
Expand Down
2 changes: 0 additions & 2 deletions docker/Dockerfile-python3
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ COPY requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir -r /app/requirements.txt

# From start_server.sh

ENV NAME="chatbot_ner"
ENV DJANGODIR=/app
ENV NUM_WORKERS=4
Expand Down
3 changes: 2 additions & 1 deletion docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ Following are the steps to create the Docker image and run NER with Docker.
2. **Bring up chatbot_ner:**

```shell
git clone https://github.com/hellohaptik/chatbot_ner.git
cd chatbot_ner
cp config.example .env (This will have all the basic environment variables to get started, You can update values accordingly)
cp config.example .env # This will have all the basic environment variables to get started; you can update the values accordingly
cp .env docker/.env
cd docker
docker-compose up --build -d
Expand Down
58 changes: 43 additions & 15 deletions ner_v1/detectors/pattern/regex/regex_detection.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
"""
Important note: bad regexes that cause catastrophic backtracking can hang your Python processes (especially because
Python's re does not release the GIL)! If you are putting this module behind a web server, be wary of ReDoS attacks.
Unfortunately there is no clean way around that, so make sure to set process-killing timeouts like harakiri for
uwsgi.
"""

from __future__ import absolute_import
import re

from typing import List

from chatbot_ner.config import ner_logger

try:
import regex as re

_re_flags = re.UNICODE | re.V1 | re.WORD

except ImportError:
ner_logger.warning('Error importing `regex` lib, falling back to stdlib re')
import re

_re_flags = re.UNICODE


class RegexDetector(object):
MATCH_PLACEHOLDER = '▁▁'
DEFAULT_FLAGS = _re_flags
"""
Detect entity from text using a regular expression pattern
Detect entity from text using a regular expression pattern.
Note: Module will not return any empty or whitespace only matches
Attributes:
entity_name (str) : holds the entity name
Expand All @@ -15,11 +38,15 @@ class RegexDetector(object):
matches (list of _sre.SRE_Match): re.finditer match objects
pattern (raw str or str or unicode): pattern to be compiled into a re object
"""
def __init__(self, entity_name, pattern, re_flags=re.UNICODE):

def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50):
"""
Args:
entity_name (str): an indicator value as tag to replace detected values
pattern (raw str or str or unicode): pattern to be compiled into a re object
re_flags (int): flags to pass to re.compile.
Defaults to regex.V1 | regex.WORD | regex.UNICODE for the regex lib, and to re.U for the stdlib re fallback.
max_matches (int): maximum number of matches to consider.
Raises:
TypeError: if the given pattern fails to compile
Expand All @@ -29,7 +56,7 @@ def __init__(self, entity_name, pattern, re_flags=re.UNICODE):
self.tagged_text = ''
self.processed_text = ''
self.pattern = re.compile(pattern, re_flags)
self.matches = []
self.max_matches = max_matches
self.tag = '__' + self.entity_name + '__'

def detect_entity(self, text):
Expand Down Expand Up @@ -70,27 +97,28 @@ def _detect_regex(self):
tuple containing
list: list containing substrings of text that matched the set pattern
list: list containing corresponding substrings of original text that were identified as entity values
"""
original_list = []
match_list = []
original_list = [] # type: List[str]
match_list = [] # type: List[str]
for match in self.pattern.finditer(self.processed_text):
self.matches.append(match)
match_list.append(match.group(0))
original_list.append(match.group(0))
if match.group(0).strip():
match_text = match.group(0)
match_list.append(match_text)
original_list.append(match_text)
if len(match_list) >= self.max_matches:
break
return match_list, original_list

def _update_processed_text(self, match_list):
# type: (List[str]) -> None
"""
Update processed text by removing already found entity values and update tagged text to replace found
values with the set tag
Args:
match_list: list containing substrings of text that matched the set pattern
"""
for detected_text in match_list:
self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
self.processed_text = self.processed_text.replace(detected_text, '')


self.tagged_text = self.tagged_text.replace(detected_text, RegexDetector.MATCH_PLACEHOLDER, 1)
self.processed_text = self.processed_text.replace(detected_text, '', 1)
self.tagged_text = self.tagged_text.replace(RegexDetector.MATCH_PLACEHOLDER, self.tag)
File renamed without changes.
Empty file.
100 changes: 100 additions & 0 deletions ner_v1/tests/pattern/regex/test_regex_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from __future__ import absolute_import

import re

from django.test import TestCase

from ner_v1.detectors.pattern.regex.regex_detection import RegexDetector


class TestRegexDetector(TestCase):
def test_max_matches(self):
"""Test max_matches argument for RegexDetector"""
entity_name = 'num'
tag = '__{}__'.format(entity_name)
pattern = '\\b(\\d+|)\\b'
text = 'there are some numbers like 345 and 2342, but the pattern is bad too it matches empty string! We ' \
'will now sprinkle this text with numbers 34634653 42342345234 12433345325 to test 17293847 345 2342'

regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, max_matches=3)
expected_values = ['345', '2342', '34634653']
expected_original_texts = ['345', '2342', '34634653']
expected_tagged_text = 'there are some numbers like {t} and {t}, but the pattern is bad too ' \
'it matches empty string! We will now sprinkle this text with' \
' numbers {t} 42342345234 12433345325 to test 17293847 345 2342'.format(t=tag)
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, max_matches=50)
expected_values = ['345', '2342', '34634653', '42342345234', '12433345325', '17293847', '345', '2342']
expected_original_texts = ['345', '2342', '34634653', '42342345234', '12433345325', '17293847', '345', '2342']
expected_tagged_text = 'there are some numbers like {t} and {t}, but the pattern is bad too ' \
'it matches empty string! We will now sprinkle this text with' \
' numbers {t} {t} {t} to test {t} {t} {t}'.format(t=tag)
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

def test_non_empty_matches(self):
"""Test if RegexDetector returns only non empty matches"""
entity_name = 'test'
_ = '__{}__'.format(entity_name)
pattern = '\\b(\\d+|)\\b'
text = 'there are no numbers in this text! but the pattern is bad too, it matches empty string'

regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
expected_values = []
expected_original_texts = []
expected_tagged_text = text
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

def test_recursive_replace(self):
"""Test protection against MemoryError when replacing in RegexDetector"""
multiplier = 30
entity_name = 'abab'
tag = '__{}__'.format(entity_name)
pattern = '\\bab\\b'
text = ' '.join(['ab'] * multiplier)

regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
expected_values = ['ab'] * multiplier
expected_original_texts = ['ab'] * multiplier
expected_tagged_text = ' '.join(['{t}'.format(t=tag)] * multiplier)
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

def test_dot_star(self):
"""Test .* pattern for RegexDetector"""
entity_name = 'test'
tag = '__{}__'.format(entity_name)
pattern = '.*'
text = 'hello world\nlorem ipsum dolor sit amet\ntest with new lines and stuff .^!@"#$%^&*(){}[]:?><\n'

regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
expected_values = ['hello world', 'lorem ipsum dolor sit amet',
'test with new lines and stuff .^!@"#$%^&*(){}[]:?><']
expected_original_texts = ['hello world', 'lorem ipsum dolor sit amet',
'test with new lines and stuff .^!@"#$%^&*(){}[]:?><']
expected_tagged_text = '{t}\n{t}\n{t}\n'.format(t=tag)
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)

regex_detector = RegexDetector(entity_name=entity_name, re_flags=RegexDetector.DEFAULT_FLAGS | re.DOTALL,
pattern=pattern)
expected_values = [text]
expected_original_texts = [text]
expected_tagged_text = '{t}'.format(t=tag)
values, original_texts = regex_detector.detect_entity(text)
self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
self.assertEqual(values, expected_values)
self.assertEqual(original_texts, expected_original_texts)
2 changes: 1 addition & 1 deletion ner_v2/tests/numeral/number/en/test_number_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def __new__(cls, name, bases, attrs):
@classmethod
def yaml_testsuite_generator(cls):
for filepath in cls.yaml_test_files:
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
for language in test_data["tests"]:
for i, testcase in enumerate(test_data["tests"][language]):
yield (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __new__(cls, name, bases, attrs):
@classmethod
def yaml_testsuite_generator(cls):
for filepath in cls.yaml_test_files:
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
for language in test_data["tests"]:
for i, testcase in enumerate(test_data["tests"][language]):
yield (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __new__(cls, name, bases, attrs):
@classmethod
def yaml_testsuite_generator(cls):
for filepath in cls.yaml_test_files:
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
for language in test_data["tests"]:
for i, testcase in enumerate(test_data["tests"][language]):
yield (
Expand Down
2 changes: 1 addition & 1 deletion ner_v2/tests/temporal/time/test_time_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __new__(cls, name, bases, attrs):
@classmethod
def yaml_testsuite_generator(cls):
for filepath in cls.yaml_test_files:
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
timezone = pytz.timezone(test_data["args"].get("timezone", "UTC"))
for language in test_data["tests"]:
for i, testcase in enumerate(test_data["tests"][language]):
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
phonenumberslite==8.10.18
six==1.11.0
gunicorn==19.6.0
pytz==2014.2
nltk==3.4.5
numpy==1.16
Expand Down
39 changes: 0 additions & 39 deletions start_server.sh

This file was deleted.

0 comments on commit e7c3146

Please sign in to comment.