Skip to content

Commit

Permalink
Merge pull request #359 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Develop to Master
  • Loading branch information
chiragjn authored May 6, 2020
2 parents edb46d7 + 79bf165 commit 92823fe
Show file tree
Hide file tree
Showing 54 changed files with 5,013 additions and 392 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@ sftp-config.json
logs/

.vscode
newman_reports/
dev.json
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ disable=invalid-name,
import-error,
too-few-public-methods

enable=
enable=unused-import


[REPORTS]
Expand Down
66 changes: 36 additions & 30 deletions chatbot_ner/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import absolute_import

import logging.handlers
import os

Expand Down Expand Up @@ -51,80 +52,69 @@
nlp_logger.addHandler(handler_stdout)

ENGINE = os.environ.get('ENGINE')
if ENGINE:
ENGINE = ENGINE.lower()
else:
ner_logger.warning("`ENGINE` variable is not set, Text type entities won't work without it")

# ES settings (Mandatory to use Text type entities)
ES_SCHEME = os.environ.get('ES_SCHEME', 'http')
ES_URL = os.environ.get('ES_URL')
ES_HOST = os.environ.get('ES_HOST')
ES_PORT = os.environ.get('ES_PORT')
ES_INDEX_NAME = os.environ.get('ES_INDEX_NAME')
ES_ALIAS = os.environ.get('ES_ALIAS')
ES_INDEX_1 = os.environ.get('ES_INDEX_1')
ES_DOC_TYPE = os.environ.get('ES_DOC_TYPE', 'data_dictionary')
ES_AUTH_NAME = os.environ.get('ES_AUTH_NAME')
ES_AUTH_PASSWORD = os.environ.get('ES_AUTH_PASSWORD')
ES_BULK_MSG_SIZE = os.environ.get('ES_BULK_MSG_SIZE', '10000')
ES_SEARCH_SIZE = os.environ.get('ES_SEARCH_SIZE', '10000')

# Crf Model Specific (Mandatory to use CRF Model)
CRF_MODELS_PATH = os.environ.get('MODELS_PATH')
CRF_EMBEDDINGS_PATH_VOCAB = os.environ.get('EMBEDDINGS_PATH_VOCAB')
CRF_EMBEDDINGS_PATH_VECTORS = os.environ.get('EMBEDDINGS_PATH_VECTORS')

try:
ES_BULK_MSG_SIZE = int(ES_BULK_MSG_SIZE)
ES_SEARCH_SIZE = int(ES_SEARCH_SIZE)
except ValueError:
ES_BULK_MSG_SIZE = 1000
ES_SEARCH_SIZE = 1000

ELASTICSEARCH_CRF_DATA_INDEX_NAME = os.environ.get('ELASTICSEARCH_CRF_DATA_INDEX_NAME')
ELASTICSEARCH_CRF_DATA_DOC_TYPE = os.environ.get('ELASTICSEARCH_CRF_DATA_DOC_TYPE')

# Optional Vars
ES_INDEX_1 = os.environ.get('ES_INDEX_1')
ES_INDEX_2 = os.environ.get('ES_INDEX_2')
DESTINATION_ES_SCHEME = os.environ.get('DESTINATION_ES_SCHEME', 'http')
DESTINATION_HOST = os.environ.get('DESTINATION_HOST')
DESTINATION_PORT = os.environ.get('DESTINATION_PORT')
DESTINATION_URL = '{scheme}://{host}:{port}'.format(**{'scheme': DESTINATION_ES_SCHEME,
'host': DESTINATION_HOST,
'port': DESTINATION_PORT})
ES_ALIAS = os.environ.get('ES_ALIAS')
ES_SCHEME = os.environ.get('ES_SCHEME')
ELASTICSEARCH_CRF_DATA_INDEX_NAME = os.environ.get('ELASTICSEARCH_CRF_DATA_INDEX_NAME')
ELASTICSEARCH_CRF_DATA_DOC_TYPE = os.environ.get('ELASTICSEARCH_CRF_DATA_DOC_TYPE')

# Crf Model Specific with additional AWS storage (optional)
CRF_MODEL_S3_BUCKET_NAME = os.environ.get('CRF_MODEL_S3_BUCKET_NAME')
CRF_MODEL_S3_BUCKET_REGION = os.environ.get('CRF_MODEL_S3_BUCKET_REGION')
WORD_EMBEDDING_REMOTE_URL = os.environ.get('WORD_EMBEDDING_REMOTE_URL')
GOOGLE_TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY')

if not GOOGLE_TRANSLATE_API_KEY:
ner_logger.warning('Google Translate API key is null or not set')
GOOGLE_TRANSLATE_API_KEY = ''
if ENGINE:
ENGINE = ENGINE.lower()
if ENGINE == 'elasticsearch':
if not all(key is not None and key.strip() for key in (ES_ALIAS, ES_INDEX_1, ES_DOC_TYPE)):
raise Exception(
'Invalid configuration for datastore (engine=elasticsearch). One or more of following keys: '
'`ES_ALIAS`, `ES_INDEX_1`, `ES_DOC_TYPE` is null in env')
else:
ner_logger.warning("`ENGINE` variable is not set, Text type entities won't work without it")

CHATBOT_NER_DATASTORE = {
'engine': ENGINE,
'elasticsearch': {
'connection_url': ES_URL, # Elastic Search URL
'name': ES_INDEX_NAME, # Index name used
'doc_type': ES_DOC_TYPE, # Index's doc type
'es_scheme': ES_SCHEME, # The scheme used in ES default value is http://
'host': ES_HOST, # Elastic Search Host
'port': ES_PORT, # Port of elastic search
'user': ES_AUTH_NAME,
'password': ES_AUTH_PASSWORD,
'es_alias': ES_ALIAS, # Elastic search alias used in transfer
'es_index_1': ES_INDEX_1,
'doc_type': ES_DOC_TYPE, # Index's doc type
'retry_on_timeout': False,
'max_retries': 1,
'timeout': 20,
'request_timeout': 20,

# Transfer Specific constants (ignore if only one elasticsearch is setup)
# For a detailed explanation, see datastore.elastic_search.transfer.py
'es_index_1': ES_INDEX_1, # Index 1 used for transfer
'es_index_2': ES_INDEX_2, # Index 2 used for transfer
'destination_url': DESTINATION_URL, # Elastic search destination URL
'es_alias': ES_ALIAS, # Elastic search alias used in transfer
'es_scheme': ES_SCHEME, # The scheme used in ES default value is http://

# Training Data ES constants
'elasticsearch_crf_data_index_name': ELASTICSEARCH_CRF_DATA_INDEX_NAME,
Expand All @@ -150,7 +140,13 @@
ner_logger.warning('`ES_AWS_SERVICE` and `ES_AWS_REGION` are not set. '
'This is not a problem if you are using self hosted ES')

# TODO: Remove non functional crf code and cleanup
# Model Vars
# Crf Model Specific (Mandatory to use CRF Model)
CRF_MODELS_PATH = os.environ.get('MODELS_PATH')
CRF_EMBEDDINGS_PATH_VOCAB = os.environ.get('EMBEDDINGS_PATH_VOCAB')
CRF_EMBEDDINGS_PATH_VECTORS = os.environ.get('EMBEDDINGS_PATH_VECTORS')

if os.path.exists(MODEL_CONFIG_PATH):
dotenv.read_dotenv(MODEL_CONFIG_PATH)
else:
Expand All @@ -165,3 +161,13 @@
CITY_MODEL_PATH = os.path.join(BASE_DIR, 'data', 'models', 'crf', 'city', 'model_13062017.crf')
if not DATE_MODEL_PATH:
DATE_MODEL_PATH = os.path.join(BASE_DIR, 'data', 'models', 'crf', 'date', 'model_date.crf')

# Crf Model Specific with additional AWS storage (optional)
CRF_MODEL_S3_BUCKET_NAME = os.environ.get('CRF_MODEL_S3_BUCKET_NAME')
CRF_MODEL_S3_BUCKET_REGION = os.environ.get('CRF_MODEL_S3_BUCKET_REGION')
WORD_EMBEDDING_REMOTE_URL = os.environ.get('WORD_EMBEDDING_REMOTE_URL')
GOOGLE_TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY')

if not GOOGLE_TRANSLATE_API_KEY:
ner_logger.warning('Google Translate API key is null or not set')
GOOGLE_TRANSLATE_API_KEY = ''
3 changes: 2 additions & 1 deletion chatbot_ner/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,14 @@ def __getitem__(self, item):
'--ignore-files=urls.py',
'--ignore-files=wsgi.py',
'--ignore-files=manage.py',
'--ignore-files=initial_setup.py',
'--ignore-files=nltk_setup.py',
'--ignore-files=__init__.py',
'--ignore-files=const.py',
'--ignore-files=constant.py',
'--ignore-files=constants.py',
'--ignore-files=start_server.sh',
'--ignore-files=settings.py',
'--ignore-files=run_postman_tests.py',
'--exclude-dir=docs/',
'--exclude-dir=docker/',
'--exclude-dir=data/',
Expand Down
68 changes: 44 additions & 24 deletions config.example
Original file line number Diff line number Diff line change
@@ -1,47 +1,67 @@
# This is config.example file for chatbot_ner module similar to .env.example file to hold settings
# Copy it to a file named config and fill in all the values.
# Copy it to docker/.env and fill in all the values.
# Never push your personal keys and passwords to any public repository!
# Please don't add spaces around '='

# This is the primary engine to use. Valid values are one of the following: ['elasticsearch']
NAME=chatbot_ner
DJANGODIR=/app
DJANGO_DEBUG=False
DJANGO_LOG_LEVEL=DEBUG
DJANGO_SETTINGS_MODULE=chatbot_ner.settings
DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py
# Important: Change the value of SECRET_KEY to something else and keep it secret
SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c

NUM_WORKERS=1
MAX_REQUESTS=1000
PORT=8081
TIMEOUT=600

# This is the primary engine to use for datastore. Valid values are one of the following: ['elasticsearch']
ENGINE=elasticsearch

# ES prefixed variables correspond to settings for elasticsearch.
# ES_URL is the complete url with auth name and password required to connect. If provided, this will override ES_HOST,
# ES_PORT, ES_AUTH_NAME, ES_AUTH_PASSWORD
# ES_HOST by default is host for ES that comes up with compose
# By default, ES_HOST and ES_PORT point to the ES instance that comes up with compose

ES_URL=
ES_AUTH_NAME=
ES_AUTH_PASSWORD=
ES_SCHEME=http
ES_HOST=elasticsearch
ES_URL=
ES_PORT=9200
ES_INDEX_NAME=entity_data
ES_ALIAS=entity_data
ES_INDEX_1=entity_data_v1
ES_INDEX_2=
ES_DOC_TYPE=data_dictionary
# ES_BULK_MSG_SIZE is an integer value
ELASTICSEARCH_CRF_DATA_DOC_TYPE=training_dictionary
ELASTICSEARCH_CRF_DATA_INDEX_NAME=entity_examples_data

ES_BULK_MSG_SIZE=1000
# ES_SEARCH_SIZE is an integer value
ES_SEARCH_SIZE=10000
# Provide the following values if you need AWS authentication
ES_AWS_SERVICE=
ES_AWS_REGION=

# Auth variables if ES is hosted on AWS
ES_AWS_ACCESS_KEY_ID=
ES_AWS_REGION=
ES_AWS_SECRET_ACCESS_KEY=
ES_AWS_SERVICE=

DESTINATION_ES_SCHEME=
DESTINATION_HOST=
DESTINATION_PORT=

NAME=chatbot_ner
DJANGODIR=/app
NUM_WORKERS=1
MAX_REQUESTS=1000
DJANGO_SETTINGS_MODULE=chatbot_ner.settings
DJANGO_WSGI_MODULE=chatbot_ner/wsgi.py
DJANGO_LOG_LEVEL=debug
DJANGO_DEBUG=False
# Important: Change the value of SECRET_KEY to something else and keep it secret
SECRET_KEY=!yqqcz-v@(s@kpygpvomcuu3il0q1&qtpz)e_g0ulo-sdv%c0c
PORT=8081
TIMEOUT=600
CITY_MODEL_TYPE=crf
CITY_MODEL_PATH=
# To enable entity detection for multiple languages, we use Google Translate. Please enter the API key (optional).
GOOGLE_TRANSLATE_API_KEY=

# Deprecated CRF models configuration
MODELS_PATH=
WORD_EMBEDDING_REMOTE_URL=
EMBEDDINGS_PATH_VECTORS=
EMBEDDINGS_PATH_VOCAB=
CITY_MODEL_PATH=
CITY_MODEL_TYPE=crf
CRF_MODEL_S3_BUCKET_NAME=
CRF_MODEL_S3_BUCKET_REGION=
DATE_MODEL_PATH=
DATE_MODEL_TYPE=
5 changes: 4 additions & 1 deletion datastore/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
ELASTICSEARCH_VALUES_SEARCH_SIZE = 300000

# settings dictionary key constants
# TODO: these should not be here, two different sources of literals
ENGINE = 'engine'
ELASTICSEARCH_INDEX_NAME = 'name'
ELASTICSEARCH_ALIAS = 'es_alias'
ELASTICSEARCH_INDEX_1 = 'es_index_1'
ELASTICSEARCH_INDEX_2 = 'es_index_2'
ELASTICSEARCH_DOC_TYPE = 'doc_type'
ELASTICSEARCH_VERSION_MAJOR, ELASTICSEARCH_VERSION_MINOR, ELASTICSEARCH_VERSION_OTHER = elasticsearch.VERSION
ELASTICSEARCH_CRF_DATA_INDEX_NAME = 'elasticsearch_crf_data_index_name'
Expand Down
Loading

0 comments on commit 92823fe

Please sign in to comment.