Merge pull request #367 from hellohaptik/develop

Develop to Master force merge; the Newman part of the CI is still not set up correctly, so it reports a failure
hellohaptik · May 29, 2020 · e7c3146 · e7c3146
2 parents 92823fe + 92412cd
commit e7c3146
Show file tree

Hide file tree

Showing 16 changed files with 152 additions and 69 deletions.
diff --git a/.gitignore b/.gitignore
@@ -80,6 +80,7 @@ celerybeat-schedule
 
 # dotenv
 .env
+update_env.sh
 
 # virtualenv
 venv/
@@ -94,7 +95,7 @@ ENV/
 .idea/
 .realsync
 
-/Dockerfile
+/Dockerfile*
 /post-merge
 /entrypoint.sh
 /setup.py

diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py
@@ -22,8 +22,7 @@
 SECRET_KEY = os.environ.get('SECRET_KEY')
 
 # SECURITY WARNING: don't run with debug turned on in production!
-_dj_debug = os.environ.get('DJANGO_DEBUG', 'false')
-DEBUG = (_dj_debug and _dj_debug.lower() == 'true')
+DEBUG = False
 
 TEMPLATE_DEBUG = False
 
@@ -107,7 +106,6 @@ def __getitem__(self, item):
     '--ignore-files=const.py',
     '--ignore-files=constant.py',
     '--ignore-files=constants.py',
-    '--ignore-files=start_server.sh',
     '--ignore-files=settings.py',
     '--ignore-files=run_postman_tests.py',
     '--exclude-dir=docs/',

diff --git a/config.example b/config.example
@@ -5,7 +5,6 @@
 
 NAME=chatbot_ner
 DJANGODIR=/app
-DJANGO_DEBUG=False
 DJANGO_LOG_LEVEL=DEBUG
 DJANGO_SETTINGS_MODULE=chatbot_ner.settings
 DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -20,8 +20,6 @@ COPY requirements.txt /app/requirements.txt
 
 RUN pip install --no-cache-dir -r /app/requirements.txt
 
-# From start_server.sh
-
 ENV NAME="chatbot_ner"
 ENV DJANGODIR=/app
 ENV NUM_WORKERS=4

diff --git a/docker/Dockerfile-python3 b/docker/Dockerfile-python3
@@ -27,8 +27,6 @@ COPY requirements.txt /app/requirements.txt
 
 RUN pip install --no-cache-dir -r /app/requirements.txt
 
-# From start_server.sh
-
 ENV NAME="chatbot_ner"
 ENV DJANGODIR=/app
 ENV NUM_WORKERS=4

diff --git a/docs/install.md b/docs/install.md
@@ -49,8 +49,9 @@ Following are the steps to create the Docker image and run NER with Docker.
 2. **Bring up chatbot_ner:**
 
 ```shell
+git clone https://github.com/hellohaptik/chatbot_ner.git
 cd chatbot_ner 
-cp config.example .env    (This will have all the basic environment variables to get started, You can update values accordingly)
+cp config.example .env    # This has all the basic environment variables to get started; you can update the values accordingly
 cp .env docker/.env
 cd docker
 docker-compose up --build -d

diff --git a/ner_v1/detectors/pattern/regex/regex_detection.py b/ner_v1/detectors/pattern/regex/regex_detection.py
@@ -1,11 +1,34 @@
+"""
+Important note: bad regexes that cause catastrophic backtracking can hang your Python processes (especially because
+Python's re does not release the GIL)! If you are putting this module behind a web server, be wary of ReDoS attacks.
+Unfortunately there is no clean way around that, so make sure to set process-killing timeouts, like harakiri for
+uwsgi.
+"""
+
 from __future__ import absolute_import
-import re
+
+from typing import List
+
 from chatbot_ner.config import ner_logger
 
+try:
+    import regex as re
+
+    _re_flags = re.UNICODE | re.V1 | re.WORD
+
+except ImportError:
+    ner_logger.warning('Error importing `regex` lib, falling back to stdlib re')
+    import re
+
+    _re_flags = re.UNICODE
+
 
 class RegexDetector(object):
+    MATCH_PLACEHOLDER = '▁▁'
+    DEFAULT_FLAGS = _re_flags
     """
-    Detect entity from text using a regular expression pattern
+    Detect entity from text using a regular expression pattern.
+    Note: Module will not return any empty or whitespace-only matches
 
     Attributes:
          entity_name (str) : holds the entity name
@@ -15,11 +38,15 @@ class RegexDetector(object):
          matches (list of _sre.SRE_Match): re.finditer match objects
          pattern (raw str or str or unicode): pattern to be compiled into a re object
     """
-    def __init__(self, entity_name, pattern, re_flags=re.UNICODE):
+
+    def __init__(self, entity_name, pattern, re_flags=DEFAULT_FLAGS, max_matches=50):
         """
         Args:
             entity_name (str): an indicator value as tag to replace detected values
             pattern (raw str or str or unicode): pattern to be compiled into a re object
+            re_flags (int): flags to pass to re.compile.
+                Defaults to regex.V1 | regex.WORD | regex.UNICODE for the regex lib, and to re.UNICODE for stdlib re.
+            max_matches (int): maximum number of matches to consider.
 
         Raises:
             TypeError: if the given pattern fails to compile
@@ -29,7 +56,7 @@ def __init__(self, entity_name, pattern, re_flags=re.UNICODE):
         self.tagged_text = ''
         self.processed_text = ''
         self.pattern = re.compile(pattern, re_flags)
-        self.matches = []
+        self.max_matches = max_matches
         self.tag = '__' + self.entity_name + '__'
 
     def detect_entity(self, text):
@@ -70,27 +97,28 @@ def _detect_regex(self):
             tuple containing
                 list: list containing substrings of text that matched the set pattern
                 list: list containing corresponding substrings of original text that were identified as entity values
-
         """
-        original_list = []
-        match_list = []
+        original_list = []  # type: List[str]
+        match_list = []  # type: List[str]
         for match in self.pattern.finditer(self.processed_text):
-            self.matches.append(match)
-            match_list.append(match.group(0))
-            original_list.append(match.group(0))
+            if match.group(0).strip():
+                match_text = match.group(0)
+                match_list.append(match_text)
+                original_list.append(match_text)
+            if len(match_list) >= self.max_matches:
+                break
         return match_list, original_list
 
     def _update_processed_text(self, match_list):
+        # type: (List[str]) -> None
         """
         Update processed text by removing already found entity values and update tagged text to replace found
         values with the set tag
 
         Args:
             match_list: list containing substrings of text that matched the set pattern
-
         """
         for detected_text in match_list:
-            self.tagged_text = self.tagged_text.replace(detected_text, self.tag)
-            self.processed_text = self.processed_text.replace(detected_text, '')
-
-
+            self.tagged_text = self.tagged_text.replace(detected_text, RegexDetector.MATCH_PLACEHOLDER, 1)
+            self.processed_text = self.processed_text.replace(detected_text, '', 1)
+        self.tagged_text = self.tagged_text.replace(RegexDetector.MATCH_PLACEHOLDER, self.tag)
diff --git a/ner_v1/tests/textual/__init__.py → ner_v1/tests/pattern/__init__.py b/ner_v1/tests/textual/__init__.py → ner_v1/tests/pattern/__init__.py
diff --git a/ner_v1/tests/pattern/regex/__init__.py b/ner_v1/tests/pattern/regex/__init__.py
diff --git a/ner_v1/tests/pattern/regex/test_regex_detection.py b/ner_v1/tests/pattern/regex/test_regex_detection.py
@@ -0,0 +1,100 @@
+from __future__ import absolute_import
+
+import re
+
+from django.test import TestCase
+
+from ner_v1.detectors.pattern.regex.regex_detection import RegexDetector
+
+
+class TestRegexDetector(TestCase):
+    def test_max_matches(self):
+        """Test max_matches argument for RegexDetector"""
+        entity_name = 'num'
+        tag = '__{}__'.format(entity_name)
+        pattern = '\\b(\\d+|)\\b'
+        text = 'there are some numbers like 345 and 2342, but the pattern is bad too it matches empty string! We ' \
+               'will now sprinkle this text with numbers 34634653 42342345234 12433345325 to test 17293847 345 2342'
+
+        regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, max_matches=3)
+        expected_values = ['345', '2342', '34634653']
+        expected_original_texts = ['345', '2342', '34634653']
+        expected_tagged_text = 'there are some numbers like {t} and {t}, but the pattern is bad too ' \
+                               'it matches empty string! We will now sprinkle this text with' \
+                               ' numbers {t} 42342345234 12433345325 to test 17293847 345 2342'.format(t=tag)
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
+
+        regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern, max_matches=50)
+        expected_values = ['345', '2342', '34634653', '42342345234', '12433345325', '17293847', '345', '2342']
+        expected_original_texts = ['345', '2342', '34634653', '42342345234', '12433345325', '17293847', '345', '2342']
+        expected_tagged_text = 'there are some numbers like {t} and {t}, but the pattern is bad too ' \
+                               'it matches empty string! We will now sprinkle this text with' \
+                               ' numbers {t} {t} {t} to test {t} {t} {t}'.format(t=tag)
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
+
+    def test_non_empty_matches(self):
+        """Test if RegexDetector returns only non empty matches"""
+        entity_name = 'test'
+        _ = '__{}__'.format(entity_name)
+        pattern = '\\b(\\d+|)\\b'
+        text = 'there are no numbers in this text! but the pattern is bad too, it matches empty string'
+
+        regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
+        expected_values = []
+        expected_original_texts = []
+        expected_tagged_text = text
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
+
+    def test_recursive_replace(self):
+        """Test protection against MemoryError when replacing in RegexDetector"""
+        multiplier = 30
+        entity_name = 'abab'
+        tag = '__{}__'.format(entity_name)
+        pattern = '\\bab\\b'
+        text = ' '.join(['ab'] * multiplier)
+
+        regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
+        expected_values = ['ab'] * multiplier
+        expected_original_texts = ['ab'] * multiplier
+        expected_tagged_text = ' '.join(['{t}'.format(t=tag)] * multiplier)
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
+
+    def test_dot_star(self):
+        """Test .* pattern for RegexDetector"""
+        entity_name = 'test'
+        tag = '__{}__'.format(entity_name)
+        pattern = '.*'
+        text = 'hello world\nlorem ipsum dolor sit amet\ntest with new lines and stuff .^!@"#$%^&*(){}[]:?><\n'
+
+        regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
+        expected_values = ['hello world', 'lorem ipsum dolor sit amet',
+                           'test with new lines and stuff .^!@"#$%^&*(){}[]:?><']
+        expected_original_texts = ['hello world', 'lorem ipsum dolor sit amet',
+                                   'test with new lines and stuff .^!@"#$%^&*(){}[]:?><']
+        expected_tagged_text = '{t}\n{t}\n{t}\n'.format(t=tag)
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
+
+        regex_detector = RegexDetector(entity_name=entity_name, re_flags=RegexDetector.DEFAULT_FLAGS | re.DOTALL,
+                                       pattern=pattern)
+        expected_values = [text]
+        expected_original_texts = [text]
+        expected_tagged_text = '{t}'.format(t=tag)
+        values, original_texts = regex_detector.detect_entity(text)
+        self.assertEqual(regex_detector.tagged_text, expected_tagged_text)
+        self.assertEqual(values, expected_values)
+        self.assertEqual(original_texts, expected_original_texts)
diff --git a/ner_v2/tests/numeral/number/en/test_number_detection.py b/ner_v2/tests/numeral/number/en/test_number_detection.py
@@ -154,7 +154,7 @@ def __new__(cls, name, bases, attrs):
     @classmethod
     def yaml_testsuite_generator(cls):
         for filepath in cls.yaml_test_files:
-            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
+            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
             for language in test_data["tests"]:
                 for i, testcase in enumerate(test_data["tests"][language]):
                     yield (

diff --git a/ner_v2/tests/numeral/number_range/test_number_range_detection.py b/ner_v2/tests/numeral/number_range/test_number_range_detection.py
@@ -22,7 +22,7 @@ def __new__(cls, name, bases, attrs):
     @classmethod
     def yaml_testsuite_generator(cls):
         for filepath in cls.yaml_test_files:
-            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
+            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
             for language in test_data["tests"]:
                 for i, testcase in enumerate(test_data["tests"][language]):
                     yield (

diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py
@@ -22,7 +22,7 @@ def __new__(cls, name, bases, attrs):
     @classmethod
     def yaml_testsuite_generator(cls):
         for filepath in cls.yaml_test_files:
-            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
+            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
             for language in test_data["tests"]:
                 for i, testcase in enumerate(test_data["tests"][language]):
                     yield (

diff --git a/ner_v2/tests/temporal/time/test_time_detection.py b/ner_v2/tests/temporal/time/test_time_detection.py
@@ -25,7 +25,7 @@ def __new__(cls, name, bases, attrs):
     @classmethod
     def yaml_testsuite_generator(cls):
         for filepath in cls.yaml_test_files:
-            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"))
+            test_data = yaml.load(io.open(filepath, "r", encoding="utf-8"), Loader=yaml.SafeLoader)
             timezone = pytz.timezone(test_data["args"].get("timezone", "UTC"))
             for language in test_data["tests"]:
                 for i, testcase in enumerate(test_data["tests"][language]):

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 phonenumberslite==8.10.18
 six==1.11.0
-gunicorn==19.6.0
 pytz==2014.2
 nltk==3.4.5
 numpy==1.16

diff --git a/start_server.sh b/start_server.sh