Skip to content

Commit

Permalink
Merge pull request #375 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master
  • Loading branch information
ankur09011 authored Sep 29, 2020
2 parents e7c3146 + 8e6ecb3 commit 1df0370
Show file tree
Hide file tree
Showing 25 changed files with 2,870 additions and 170 deletions.
17 changes: 17 additions & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
## JIRA Ticket Number

JIRA TICKET:

## Description of change
(REMOVE ME) Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.

## Checklist (OPTIONAL):

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream modules
1 change: 1 addition & 0 deletions chatbot_ner/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
url(r'^v2/number/$', api_v2.number),
url(r'^v2/phone_number/$', api_v2.phone_number),
url(r'^v2/number_range/$', api_v2.number_range),
url(r'^v2/text/$', api_v2.text),

# V2 bulk detectors
url(r'^v2/date_bulk/$', api_v2.date),
Expand Down
2 changes: 1 addition & 1 deletion config.example
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ ES_ALIAS=entity_data
ES_INDEX_1=entity_data_v1
ES_INDEX_2=
ES_DOC_TYPE=data_dictionary
ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary
ELASTICSEARCH_CRF_DATA_INDEX_NAME=entity_examples_data
ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary

ES_BULK_MSG_SIZE=1000
ES_SEARCH_SIZE=10000
Expand Down
17 changes: 9 additions & 8 deletions datastore/datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ def create(self, err_if_exists=True, **kwargs):

if self._engine == ELASTICSEARCH:
es_url = elastic_search.connect.get_es_url()
es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=None)
create_map = [ # TODO: use namedtuples
(True, ELASTICSEARCH_INDEX_1, ELASTICSEARCH_DOC_TYPE, self._store_name,
self._check_doc_type_for_elasticsearch, elastic_search.create.create_entity_index),
Expand Down Expand Up @@ -180,8 +179,10 @@ def create(self, err_if_exists=True, **kwargs):
**kwargs
)
if alias_name:
es_object.point_an_alias_to_index(es_url=es_url, alias_name=self._store_name,
index_name=index_name)
elastic_search.create.create_alias(connection=self._client_or_connection,
index_list=[index_name],
alias_name=alias_name,
logger=ner_logger)

def delete(self, err_if_does_not_exist=True, **kwargs):
"""
Expand All @@ -208,15 +209,15 @@ def delete(self, err_if_does_not_exist=True, **kwargs):
self._connect()

if self._engine == ELASTICSEARCH:
for index_key in [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]:
if self._connection_settings.get(index_key):
delete_map = [ELASTICSEARCH_INDEX_1, ELASTICSEARCH_INDEX_2, ELASTICSEARCH_CRF_DATA_INDEX_NAME]
for index_name_key in delete_map:
if self._connection_settings.get(index_name_key):
index_name = self._connection_settings.get(index_name_key)
elastic_search.create.delete_index(connection=self._client_or_connection,
index_name=self._store_name,
index_name=index_name,
logger=ner_logger,
err_if_does_not_exist=err_if_does_not_exist,
**kwargs)
# TODO: cleanup aliases ?

# === Incompatible or deprecated/duplicate APIs

# FIXME: repopulate does not consider language of the variants
Expand Down
81 changes: 70 additions & 11 deletions datastore/elastic_search/create.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import logging
from typing import List, Dict, Any

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

from .utils import filter_kwargs

log_prefix = 'datastore.elastic_search.create'


def exists(connection, index_name):
# type: (Elasticsearch, str) -> bool
"""
Checks if index_name exists
Expand All @@ -18,13 +25,15 @@ def exists(connection, index_name):


def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **kwargs):
# type: (Elasticsearch, str, logging.Logger, bool, **Any) -> None
"""
Deletes the index named index_name
Args:
connection: Elasticsearch client object
index_name: The name of the index
logger: logging object to log at debug and exception level
err_if_does_not_exist: if to raise error if index does not exist already, defaults to True
kwargs:
body: The configuration for the index (settings and mappings)
master_timeout: Specify timeout for connection to master
Expand All @@ -40,11 +49,17 @@ def delete_index(connection, index_name, logger, err_if_does_not_exist=True, **k
else:
return

try:
delete_alias(connection=connection, index_list=[index_name], alias_name='_all', logger=logger)
except NotFoundError:
logger.warning('No aliases found on on index %s', index_name)

connection.indices.delete(index=index_name, **kwargs)
logger.debug('%s: Delete Index %s: Operation successfully completed', log_prefix, index_name)


def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if_exists=True, **kwargs):
# type: (Elasticsearch, str, str, logging.Logger, Dict[str, Any], bool, **Any) -> None
"""
Creates an Elasticsearch index needed for similarity based searching
Args:
Expand All @@ -53,6 +68,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if
doc_type: The type of the documents that will be indexed
logger: logging object to log at debug and exception level
mapping_body: dict, mappings to put on the index
err_if_exists: if to raise error if the index already exists, defaults to True
kwargs:
master_timeout: Specify timeout for connection to master
timeout: Explicit operation timeout
Expand Down Expand Up @@ -118,6 +134,7 @@ def _create_index(connection, index_name, doc_type, logger, mapping_body, err_if


def create_entity_index(connection, index_name, doc_type, logger, **kwargs):
# type: (Elasticsearch, str, str, logging.Logger, **Any) -> None
"""
Creates an mapping specific to entity storage in elasticsearch and makes a call to create_index
to create the index with the given mapping body
Expand Down Expand Up @@ -145,10 +162,32 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs):
mapping_body = {
doc_type: {
'properties': {
'language_script': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
},
'value': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
},
'variants': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
'analyzer': 'my_analyzer',
'norms': {'enabled': False}, # Needed if we want to give longer variants higher scores
},
# other removed/unused fields, kept only for backward compatibility
'dict_type': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
},
'entity_data': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
},
'source_language': {
'type': 'text',
'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
}
}
}
Expand All @@ -158,6 +197,7 @@ def create_entity_index(connection, index_name, doc_type, logger, **kwargs):


def create_crf_index(connection, index_name, doc_type, logger, **kwargs):
# type: (Elasticsearch, str, str, logging.Logger, **Any) -> None
"""
This method is used to create an index with mapping suited for story training_data
Args:
Expand All @@ -184,17 +224,17 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs):
mapping_body = {
doc_type: {
'properties': {
"entity_data": {
"type": "text"
'entity_data': {
'type': 'text'
},
"sentence": {
"enabled": "false"
'sentence': {
'enabled': False
},
"entities": {
"enabled": "false"
'entities': {
'enabled': False
},
"language_script": {
"type": "text"
'language_script': {
'type': 'text'
}
}
}
Expand All @@ -204,17 +244,36 @@ def create_crf_index(connection, index_name, doc_type, logger, **kwargs):


def create_alias(connection, index_list, alias_name, logger, **kwargs):
    # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None
    """
    This method is used to create alias for list of indices

    Args:
        connection: Elasticsearch client object
        index_list (list): List of indices the alias has to point to
        alias_name (str): Name of the alias
        logger: logging object to log at debug and exception level
        **kwargs:
            https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html
    """
    # Lazy %-style logger arguments: the message is only formatted when DEBUG
    # is enabled. This also removes the broken `'... %s ... %s' % alias_name`
    # call (two placeholders, one non-tuple argument -> TypeError at runtime).
    logger.debug('Putting alias %s to indices: %s', alias_name, str(index_list))
    connection.indices.put_alias(index=index_list, name=alias_name, **kwargs)
    logger.debug('Alias %s now points to indices %s', alias_name, str(index_list))


def delete_alias(connection, index_list, alias_name, logger, **kwargs):
    # type: (Elasticsearch, List[str], str, logging.Logger, **Any) -> None
    """
    Remove the alias named `alias_name` from each index in `index_list`

    Args:
        connection: Elasticsearch client object
        index_list (list): indices the alias should be removed from
        alias_name (str): name of the alias to delete
        logger: logging object to log at debug and exception level
        **kwargs:
            https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html
    """
    indices_repr = str(index_list)
    logger.debug('Removing alias %s from indices: %s', alias_name, indices_repr)
    connection.indices.delete_alias(index=index_list, name=alias_name, **kwargs)
    logger.debug('Alias %s removed from indices %s', alias_name, indices_repr)
5 changes: 5 additions & 0 deletions ner_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@

ENTITY_VALUE_DICT_KEY = 'value'

# Output-dict key whose value flags that a detected entity value was
# verified against (found in) the datastore
DATASTORE_VERIFIED = 'datastore_verified'
# Output-dict key whose value flags that a detected entity value came from
# the model — presumably the CRF/ML detector; confirm against detector code
MODEL_VERIFIED = 'model_verified'

# ************************ constants tell us what to do with structured_value ************************
# This will execute entity detection on the structured_value.
STRUCTURED = 0
Expand Down
Loading

0 comments on commit 1df0370

Please sign in to comment.