diff --git a/datastore/__init__.py b/datastore/__init__.py index ce4f4b817..49ca09de1 100644 --- a/datastore/__init__.py +++ b/datastore/__init__.py @@ -1 +1 @@ -from datastore import DataStore +from .datastore import DataStore diff --git a/datastore/datastore.py b/datastore/datastore.py index 33e7f59d3..270dfe73f 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -4,7 +4,7 @@ from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import elastic_search -from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY, +from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE) from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException, @@ -126,7 +126,7 @@ def create(self, **kwargs): ) # FIXME: repopulate does not consider language of the variants - def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): + def populate(self, entity_data_directory_path=None, csv_file_paths=None, **kwargs): """ Populates the datastore from csv files stored in directory path indicated by entity_data_directory_path and from csv files at file paths in csv_file_paths list @@ -143,6 +143,11 @@ def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv All other exceptions raised by elasticsearch-py library """ + if not (entity_data_directory_path or csv_file_paths): + raise ValueError('Both `entity_data_directory_path` and `csv_file_paths` arguments cannot be None.' 
+ 'Either provide a path to directory containing csv files using ' + '`entity_data_directory_path` or a list of paths to csv files ' + 'using `csv_file_paths`') if self._client_or_connection is None: self._connect() @@ -317,7 +322,7 @@ def delete_entity(self, entity_name, **kwargs): **kwargs) # FIXME: repopulate does not consider language of the variants - def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): + def repopulate(self, entity_data_directory_path=None, csv_file_paths=None, **kwargs): """ Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by entity_data_directory_path and from csv files at file paths in csv_file_paths list @@ -334,6 +339,12 @@ def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, c DataStoreSettingsImproperlyConfiguredException if connection settings are invalid or missing All other exceptions raised by elasticsearch-py library """ + if not (entity_data_directory_path or csv_file_paths): + raise ValueError('Both `entity_data_directory_path` and `csv_file_paths` arguments cannot be None.' 
+ 'Either provide a path to directory containing csv files using ' + '`entity_data_directory_path` or a list of paths to csv files ' + 'using `csv_file_paths`') + if self._client_or_connection is None: self._connect() @@ -564,37 +575,40 @@ def transfer_entities_elastic_search(self, entity_list): es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=destination) es_object.transfer_specific_entities(list_of_entities=entity_list) - def get_crf_data_for_entity_name(self, entity_name, **kwargs): + def get_crf_data_for_entity_name(self, entity_name, languages, **kwargs): """ This method is used to obtain the sentences and entities from sentences given entity name + Args: entity_name (str): Entity name for which training data needs to be obtained - kwargs: - For Elasticsearch: - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + languages (List[str]): list of language codes for which data is requested + **kwargs: For Elasticsearch: + Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + Returns: results_dictionary(dict): Dictionary consisting of the training data for the the given entity. 
Raises: - IndexNotFoundException if es_training_index was not found in connection settings + IndexNotFoundException: Description + IndexNotFoundException if es_training_index was not found in connection settings Example: db = Datastore() db.get_entity_training_data(entity_name, **kwargs): >> { - 'sentence_list': [ - 'My name is hardik', - 'This is my friend Ajay' + 'sentence_list': [ + 'My name is hardik', + 'This is my friend Ajay' + ], + 'entity_list': [ + [ + 'hardik' ], - 'entity_list': [ - [ - 'hardik' - ], - [ - 'Ajay' - ] + [ + 'Ajay' ] - } + ] + } """ ner_logger.debug('Datastore, get_entity_training_data, entity_name %s' % entity_name) if self._client_or_connection is None: @@ -612,24 +626,24 @@ def get_crf_data_for_entity_name(self, entity_name, **kwargs): index_name=es_training_index, doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE], entity_name=entity_name, + languages=languages, request_timeout=request_timeout, **kwargs) ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name)) return results_dictionary - def update_entity_crf_data(self, entity_name, entity_list, language_script, sentence_list, **kwargs): + def update_entity_crf_data(self, entity_name, sentences, **kwargs): """ This method is used to populate the training data for a given entity + Args: entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - language_script (str): Language code for the language script used. 
- **kwargs: - For Elasticsearch: + sentences (Dict[str, List[Dict[str, str]]]: sentences mapped against their languages + **kwargs: For Elasticsearch: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk Raises: + IndexNotFoundException: Description IndexNotFoundException if es_training_index was not found in connection settings """ if self._client_or_connection is None: @@ -643,13 +657,12 @@ def update_entity_crf_data(self, entity_name, entity_list, language_script, sent raise IndexNotFoundException('Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. ' 'Please configure the same') - elastic_search.populate.update_entity_crf_data_populate(connection=self._client_or_connection, - index_name=es_training_index, - doc_type=self._connection_settings - [ELASTICSEARCH_CRF_DATA_DOC_TYPE], - logger=ner_logger, - entity_list=entity_list, - sentence_list=sentence_list, - entity_name=entity_name, - language_script=language_script, - **kwargs) + elastic_search \ + .populate \ + .update_entity_crf_data_populate(connection=self._client_or_connection, + index_name=es_training_index, + doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE], + logger=ner_logger, + sentences=sentences, + entity_name=entity_name, + **kwargs) diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py index 34654dbfb..5618d0c87 100644 --- a/datastore/elastic_search/__init__.py +++ b/datastore/elastic_search/__init__.py @@ -1,5 +1 @@ -import connect -import create -import populate -import query -import transfer +from . 
import connect, create, populate, query, transfer diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 2431aa823..4af1a722e 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -7,14 +7,16 @@ # 3rd party imports from elasticsearch import helpers -# Local imports from chatbot_ner.config import ner_logger from datastore import constants from datastore.elastic_search.query import get_entity_data from datastore.utils import get_files_from_directory, read_csv, remove_duplicate_data +from external_api.constants import SENTENCE, ENTITIES from language_utilities.constant import ENGLISH_LANG from ner_constants import DICTIONARY_DATA_VARIANTS +# Local imports + log_prefix = 'datastore.elastic_search.populate' @@ -280,83 +282,115 @@ def entity_data_update(connection, index_name, doc_type, entity_data, entity_nam logger.debug('%s: +++ Completed: add_data_elastic_search() +++' % log_prefix) -def update_entity_crf_data_populate( - connection, index_name, doc_type, entity_list, entity_name, sentence_list, language_script, logger, **kwargs -): +def delete_entity_crf_data(connection, index_name, doc_type, entity_name, languages): + """Delete CRF data for the given entity and languages. 
+ + Args: + connection (Elasticsearch): Elasticsearch client object + index_name (str): name of the index + doc_type (str): type of the documents being indexed + entity_name (str): name of the entity for which the training data has to be deleted + languages (List[str]): list of language codes for which data needs to be deleted + + Returns: + response returned by the `delete_by_query` call on the Elasticsearch client + """ + query = { + "query": { + "bool": { + "must": [ + { + "match": { + "entity_data": entity_name + } + } + ], + "filter": { + "terms": { + "language_script": languages + } + } + } + } + } + return connection.delete_by_query(index=index_name, body=query, doc_type=doc_type) + + +def update_entity_crf_data_populate(connection, index_name, doc_type, entity_name, sentences, logger, **kwargs): """ - This method is used to populate the elastic search traininf data. + This method is used to populate the elastic search training data. + Args: - connection: Elasticsearch client object - index_name (str): The name of the index - doc_type (str): The type of the documents being indexed - entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - language_script (str): The code for the language script - logger: logging object to log at debug and exception levellogging object to log at debug and exception level + connection (Elasticsearch): Elasticsearch client object + index_name (str): name of the index + doc_type (str): type of the documents being indexed + entity_name (str): name of the entity for which the training data has to be populated + sentences (Dict[str, List[Dict[str, str]]]): sentences collected per language + logger: logging object **kwargs: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk """ - logger.debug('%s: +++ Started: external_api_training_data_entity_update() +++' % log_prefix) - 
logger.debug('%s: +++ Started: delete_entity_by_name() +++' % log_prefix) - delete_entity_by_name(connection=connection, index_name=index_name, doc_type=doc_type, - entity_name=entity_name, logger=logger, **kwargs) - logger.debug('%s: +++ Completed: delete_entity_by_name() +++' % log_prefix) + logger.debug('[{0}] Started: external_api_training_data_entity_update()'.format(log_prefix)) - logger.debug('%s: +++ Started: add_training_data_elastic_search() +++' % log_prefix) - add_training_data_elastic_search(connection=connection, index_name=index_name, doc_type=doc_type, - entity_name=entity_name, - entity_list=entity_list, - sentence_list=sentence_list, - language_script=language_script, logger=logger, **kwargs) - logger.debug('%s: +++ Completed: add_training_data_elastic_search() +++' % log_prefix) + logger.debug('[{0}] Started: delete_entity_crf_data()'.format(log_prefix)) + languages = list(sentences.keys()) + delete_entity_crf_data(connection=connection, index_name=index_name, doc_type=doc_type, + entity_name=entity_name, languages=languages) + logger.debug('[{0}] Completed: delete_entity_crf_data()'.format(log_prefix)) + logger.debug('[{0}] Started: add_training_data_elastic_search()'.format(log_prefix)) + add_crf_training_data_elastic_search(connection=connection, + index_name=index_name, + doc_type=doc_type, + entity_name=entity_name, + sentences=sentences, + logger=logger, **kwargs) + logger.debug('[{0}] Completed: add_training_data_elastic_search()'.format(log_prefix)) -def add_training_data_elastic_search( - connection, index_name, doc_type, entity_name, entity_list, - sentence_list, language_script, logger, **kwargs -): + logger.debug('[{0}] Completed: external_api_training_data_entity_update()'.format(log_prefix)) + + +def add_crf_training_data_elastic_search(connection, index_name, doc_type, entity_name, sentences, logger, **kwargs): """ Adds all sentences and the corresponding entities to the specified index. 
If the same named entity is found a delete followed by an update is triggered + Args: - connection: Elasticsearch client object - index_name (str): The name of the index - doc_type (str): The type of the documents being indexed - entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - logger: logging object to log at debug and exception level - language_script (str): Language code of the entity script - kwargs: - Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk - Example of underlying index query - {'_index': 'training_index', - 'entity_data': 'name', - 'sentence': ['My name is Ajay and this is my friend Hardik'], - 'entities': ['Ajay', 'Hardik'], - 'language_script': 'en', - '_type': 'training_index', - '_op_type': 'index' - } + connection (Elasticsearch): Description + index_name (str): Description + doc_type (str): Description + entity_name (str): Description + sentences (Dict[str, List[Dict[str, str]]]): Description + logger (TYPE): Description + **kwargs: Description + Example of underlying index query + {'_index': 'training_index', + 'entity_data': 'name', + 'sentence': ['My name is Ajay and this is my friend Hardik'], + 'entities': ['Ajay', 'Hardik'], + 'language_script': 'en', + '_type': 'training_index', + '_op_type': 'index' + } """ - str_query = [] - for sentence, entities in zip(sentence_list, entity_list): - query_dict = {'_index': index_name, - 'entity_data': entity_name, - 'sentence': sentence, - 'entities': entities, - 'language_script': language_script, - '_type': doc_type, - '_op_type': 'index' - } - str_query.append(query_dict) - if len(str_query) > constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE: - result = helpers.bulk(connection, str_query, stats_only=True, **kwargs) - logger.debug('%s: \t++ %s status %s ++' % (log_prefix, 
entity_name, result)) - str_query = [] - if str_query: - result = helpers.bulk(connection, str_query, stats_only=True, **kwargs) - logger.debug('%s: \t++ %s status %s ++' % (log_prefix, entity_name, result)) + queries = [] + for language, sentences in sentences.items(): + for sentence in sentences: + query_dict = {'_index': index_name, + 'entity_data': entity_name, + 'sentence': sentence[SENTENCE], + 'entities': sentence[ENTITIES], + 'language_script': language, + '_type': doc_type, + '_op_type': 'index' + } + queries.append(query_dict) + if len(queries) > constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE: + result = helpers.bulk(connection, queries, stats_only=True, **kwargs) + logger.debug('[{0}] Insert: {1} with status {2}'.format(log_prefix, entity_name, result)) + queries = [] + if queries: + result = helpers.bulk(connection, queries, stats_only=True, **kwargs) + logger.debug('[{0}] Insert: {1} with status {2}'.format(log_prefix, entity_name, result)) def delete_entity_data_by_values(connection, index_name, doc_type, entity_name, values=None, **kwargs): diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index b6cbbac14..bea35d630 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -9,12 +9,13 @@ from six import string_types -# Local imports from datastore import constants -from external_api.constants import SENTENCE_LIST, ENTITY_LIST +from external_api.constants import SENTENCE, ENTITIES from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER +# Local imports + log_prefix = 'datastore.elastic_search.query' @@ -533,17 +534,17 @@ def _parse_es_search_results(results_list): return variants_to_values_list -def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, **kwargs): +def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, languages, **kwargs): """ Get all sentence_list and entity_list for a entity stored in the 
index Args: - connection: Elasticsearch client object - index_name: The name of the index - doc_type: The type of the documents that will be indexed - entity_name: name of the entity to perform a 'term' query on - kwargs: - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + connection (Elasticsearch): Elasticsearch client object + index_name (str): The name of the index + doc_type (str): The type of the documents that will be indexed + entity_name (str): name of the entity to perform a 'term' query on + languages (List[str]): list of languages for which to fetch sentences + **kwargs: optional kwargs for es Returns: dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing @@ -563,29 +564,52 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, [ 'Ajay' ] - ] + ] } - """ - results_dictionary = {SENTENCE_LIST: [], ENTITY_LIST: []} + data = { - 'query': { - 'term': { - 'entity_data': { - 'value': entity_name - } + "query": { + "bool": { + "must": [ + { + "term": { + "entity_data": { + "value": entity_name + } + } + } + ] } } } - kwargs = dict(kwargs, body=data, doc_type=doc_type, size=constants.ELASTICSEARCH_SEARCH_SIZE, index=index_name, + + if languages: + data['query']['bool']['filter'] = { + "terms": { + "language_script": languages + } + } + + kwargs = dict(kwargs, + body=data, + doc_type=doc_type, + size=constants.ELASTICSEARCH_SEARCH_SIZE, + index=index_name, scroll='1m') search_results = _run_es_search(connection, **kwargs) # Parse hits results = search_results['hits']['hits'] + language_mapped_results = collections.defaultdict(list) + for result in results: - results_dictionary[SENTENCE_LIST].append(result['_source']['sentence']) - results_dictionary[ENTITY_LIST].append(result['_source']['entities']) + language_mapped_results[result['_source']['language_script']].append( + { + SENTENCE: result['_source']['sentence'], + ENTITIES: 
result['_source']['entities'] + } + ) - return results_dictionary + return dict(language_mapped_results) diff --git a/docs/adding_entities.md b/docs/adding_entities.md index b5aea3a1c..cb05d7e58 100644 --- a/docs/adding_entities.md +++ b/docs/adding_entities.md @@ -21,7 +21,7 @@ Following csv files are already included in the repository at `data/entity_data/ ----------- -Chatbot ner reads data from these csv files and puts them into the datastore under a entity named after the filename of the csv file. +Chatbot NER reads data from these csv files and puts them into the datastore under a entity named after the filename of the csv file. > *csv filename should contain only lowercase english alphabets and '_' (underscore) symbol* @@ -62,24 +62,37 @@ video,mp4|mkv|mov Now lets add the newly created csv file to the datastore. -- Make sure to start the engine you configured with datastore( eg. elasticsearch) +- Make sure our containers are running ```shell - $ ~/chatbot_ner_elasticsearch/elasticsearch-5.5.0/bin/elasticsearch -d + $ docker-compose ps ``` -- Activate chatbot_ner virtual environment + You should see output like following + + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... Up 9200/tcp, 9300/tcp + ``` + + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` + +- Enter the chatbot-ner container ```shell - $ source /usr/local/bin/virtualenvwrapper.sh - $ workon chatbotnervenv + $ docker exec -it docker_chatbot-ner_1 bash ``` - Start a `manage.py shell` as follows ```bash - $ # change to your repository clone directory - $ cd ~/chatbot_ner/ $ python manage.py shell ``` @@ -87,7 +100,7 @@ Now lets add the newly created csv file to the datastore. 
```python from datastore import DataStore - csv_file = '~/attachment_types.csv' # example file path to the csv file + csv_file = 'data/entity_data/city.csv' # example file path to the csv file db = DataStore() db.populate(csv_file_paths=[csv_file,]) ``` @@ -96,10 +109,14 @@ Now lets add the newly created csv file to the datastore. ```python from datastore import DataStore - csv_directory = '~/my_csv_files/' # example directory path containing csv files + csv_directory = 'data/entity_data/' # example directory path containing csv files db = DataStore() db.populate(entity_data_directory_path=csv_directory) ``` + + > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mounted inside the container so the files will be available both inside and outside the container + +- Once done, you can exit the shell and then exit the container ### Updating the DataStore after editing a csv file @@ -109,25 +126,59 @@ After editing and saving your csv, you will need to update the datastore with ne > **Note:** The filename needs to be same as it was before editing the file. If the new data is saved under a different filename it would be populated as a new entity with the name same as new file name. -> Make sure you are working in chatbotnervenv virtual environment and datastore engine is running. 
See above section +- Make sure our containers are running -On a `manage.py shell` run + ```shell + $ docker-compose ps + ``` -```python -from datastore import DataStore -csv_file = '~/attachment_types.csv' # example file path to the csv file -db = DataStore() -db.repopulate(csv_file_paths=[csv_file,]) -``` + You should see output like following - In case, you want to update multiple csv files at once, you can pass the directory path to `entity_data_directory_path` parameter of `repopulate` method as follows: + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... Up 9200/tcp, 9300/tcp + ``` -```python -from datastore import DataStore -csv_directory = '~/my_csv_files/' # example directory path containing csv files -db = DataStore() -db.repopulate(entity_data_directory_path=csv_directory) -``` + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` + +- Enter the chatbot-ner container + + ```shell + $ docker exec -it docker_chatbot-ner_1 bash + ``` + +- Start a `manage.py shell` as follows + + ```bash + $ python manage.py shell + ``` + +- Now run the following: + + ```python + from datastore import DataStore + csv_file = 'data/entity_data/city.csv' # example file path to the csv file + db = DataStore() + db.repopulate(csv_file_paths=[csv_file,]) + ``` + + In case, you want to update multiple csv files at once, you can pass the directory path to `entity_data_directory_path` parameter of `repopulate` method as follows: + + ```python + from datastore import DataStore + csv_directory = 'data/entity_data/' # example directory path containing csv files + db = DataStore() + db.repopulate(entity_data_directory_path=csv_directory) + ``` + + > Note: It is advised that you put the csv 
files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mounted inside the container so the files will be available both inside and outside the container ### Deleting entity data @@ -135,12 +186,44 @@ db.repopulate(entity_data_directory_path=csv_directory) To delete all data for entity, simply call `delete_entity()` on Datastore. It takes one argument- the name of the entity. This is the same as the name of the csv file used for this entity while populating its data. -> Make sure you are working in chatbotnervenv virtual environment and datastore engine is running. See above section +- Make sure our containers are running -On a `manage.py shell` run + ```shell + $ docker-compose ps + ``` -```python -from datastore import DataStore -db = DataStore() -db.delete_entity(entity_name='attachment_types') -``` + You should see output like following + + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... Up 9200/tcp, 9300/tcp + ``` + + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` + +- Enter the chatbot-ner container + + ```shell + $ docker exec -it docker_chatbot-ner_1 bash + ``` + +- Start a `manage.py shell` as follows + + ```bash + $ python manage.py shell + ``` + +- Now run the following (E.g. 
to delete `city` entity) + + ```python + from datastore import DataStore + db = DataStore() + db.delete_entity(entity_name='city') + ``` \ No newline at end of file diff --git a/docs/api_call.md b/docs/api_call.md index 10ea71641..65d26fe8c 100644 --- a/docs/api_call.md +++ b/docs/api_call.md @@ -177,34 +177,50 @@ Currently time detection support has been provided in different languages - `Eng - *CURL command* ```bash - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/time/?message=John%20arrived%20at%20the%20bus%20stop%20at%2013%3A50%20hrs%2C%20expecting%20the%20bus%20to%20be%20there%20in%2015%20mins.%20But%20the%20bus%20was%20scheduled%20for%2012%3A30%20pm&entity_name=time&structured_value=&fallback_value=&bot_message=&timezone=UTC&source_language=en' + curl -G -i "http://localhost:8081/v2/time/?&entity_name=time&timezone=UTC&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=John arrived at the bus stop at 13:50 hrs, expecting the bus to be there in 15 mins. 
But the bus was scheduled for 12:30 pm" ``` - + > **Output**: - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "12:30 pm", - "entity_value": { "mm": 30, "hh": 12, "nn": "pm"}, - "language": "en" + "detection": "message", + "original_text": "12:30 pm", + "entity_value": { + "mm": 30, + "hh": 12, + "nn": "pm" + }, + "language": "en" }, { - "detection": "message", - "original_text": "in 15 mins", - "entity_value": { "mm": "15", "hh": 0, "nn": "df" }, - "language": "en" + "detection": "message", + "original_text": "in 15 mins", + "entity_value": { + "mm": 15, + "hh": 0, + "nn": "df" + }, + "language": "en" }, { - "detection": "message", - "original_text": "13:50", - "entity_value": {"mm": 50, "hh": 13, "nn": "hrs"}, - "language": "en" - }]} + "detection": "message", + "original_text": "13:50", + "entity_value": { + "mm": 50, + "hh": 13, + "nn": "hrs" + }, + "language": "en" + } + ] + } ``` @@ -234,39 +250,53 @@ Currently time detection support has been provided in different languages - `Eng - *CURL command* ```bash - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/time/?message=राजू%20का%20बस%20१३:५०%20को%20बस%20स्टॉप%20से%20निकला%20और%20१५%20मिनट%20में%20यहाँ%20पहुंच%20जाएगा%20और%20गोवा%20को%20शाम%20में%20बारह%20बजकर%20३०%20मिनट%20पैर%20पहुंचेगा&entity_name=time&structured_value=&fallback_value=&bot_message=&timezone=UTC&source_language=en' - + curl -G -i "http://localhost:8081/v2/time/?&entity_name=time&timezone=UTC&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=राजू का बस १३:५० को बस स्टॉप से निकला और १५ मिनट में यहाँ पहुंच जाएगा और गोवा को शाम में बारह बजकर ३० मिनट पैर पहुंचेगा" ``` - + > **Output**: - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "१३:५०", - "entity_value": { "mm": 1, "hh": 50,"nn": "hr"}, - "language": "hi" + "detection": "message", + 
"original_text": "१३:५०", + "entity_value": { + "mm": 50, + "hh": 13, + "nn": "hrs", + "time_type": null + }, + "language": "hi" }, { - "detection": "message", - "original_text": "१५ मिनट में", - "entity_value": {"mm": "15", "hh": 0, "nn": "df"}, + "detection": "message", + "original_text": "१५ मिनट में", + "entity_value": { + "mm": 15, + "hh": 0, + "nn": "df" + }, "language": "hi" }, - { - "detection": "message", - "original_text": "शाम में बारह बजकर ३० मिनट", - "entity_value": { "mm": 30, "hh": 12, "nn": "pm"}, - "language": "hi" - }] + { + "detection": "message", + "original_text": "बारह बजकर ३० मिनट", + "entity_value": { + "mm": 30, + "hh": 12, + "nn": "hrs" + }, + "language": "hi" + } + ] } - ``` - + ### 2. Date @@ -316,28 +346,38 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/date/?message=set%20me%20reminder%20on%2023rd%20december&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=set me reminder on 23rd december" ``` > **Output:** ```json - {"data": [ + { + "data": [ { - "detection": "message", + "detection": "message", "original_text": "23rd december", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 12, "yy": 2017, "dd": 23, "type": "date"} - }, - "language": "en" - }]} - + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 12, + "yy": 2019, + "dd": 23, + "type": "date" + }, + "to": false, + "start_range": false + } + } + ] + } ``` - ***Example 2: Detecting referenced date [Hindi] from user 
message*** @@ -369,28 +409,40 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/date/?message=मुझे%20कल%20सुबह%20५%20बजे%20उठा%20देना&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&past_date_referenced=false&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मुझे कल सुबह ५ बजे उठा देना" ``` > **Output:** ```json - /* Assuming today's date is 12th feb 2019*/ - {"data": [ + /* Assuming today's date is 27 June 2019*/ + { + "data": [ { - "detection": "message", + "detection": "message", "original_text": "कल", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 02, "yy": 2019, "dd": 13, "type": "date"} - }, - "language": "en" - }]} + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 6, + "yy": 2019, + "dd": 28, + "type": "date" + }, + "to": false, + "start_range": false + }, + "language": "hi" + } + ] + } ``` @@ -423,29 +475,40 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/date/?message=आने%20वाले%20सोमवार%20को%20मेरा%20मैथ्स%20का%20एग्जाम%20है&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&past_date_referenced=false&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode 
"message=आने वाले सोमवार को मेरा मैथ्स का एग्जाम है" ``` > **Output:** ```json - /* Assuming today's date is 12th feb 2019*/ - {"data": [ + /* Assuming today's date is 27 June 2019*/ + { + "data": [ { - "detection": "message", - "original_text": "कल", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 02, "yy": 2019, "dd": 18, "type": "date"} - }, - "language": "en" - }]} - + "detection": "message", + "original_text": "सोमवार", + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 7, + "yy": 2019, + "dd": 1, + "type": "date" + }, + "to": false, + "start_range": false + }, + "language": "hi" + } + ] + } ``` ### 3. Number @@ -468,7 +531,7 @@ Currently number detection support has been provided for 6 different languages - ```python # For a sample query with following parameters - message=u"i want to purchase 30 units of mobile abd 40 units of telivision" + message=u"i want to purchase 30 units of mobile abd 40 units of television" entity_name='number' structured_value=None fallback_value=None @@ -489,29 +552,38 @@ Currently number detection support has been provided for 6 different languages - - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?message=I%20want%20to%20purchase%2030%20units%20of%20mobile%20and%2040%20units%20of%20Television&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want to purchase 30 units of mobile abd 40 units of television" ``` - + > **Output:** - + ```json - {"data": [ + { + 
"data": [ { - "detection": "message", - "original_text": "30", - "entity_value": { "value": "30", "unit": null}, - "language": "en" + "detection": "message", + "original_text": "30", + "entity_value": { + "unit": null, + "value": "30" + }, + "language": "en" }, { "detection": "message", "original_text": "40", - "entity_value": { "value": "40", "unit": null}, + "entity_value": { + "unit": null, + "value": "40" + }, "language": "en" - }] + } + ] } ``` @@ -521,7 +593,7 @@ Currently number detection support has been provided for 6 different languages - ```python # For a sample query with following parameters - message=u"मुझे ३० किलो आटा और दो हजार का चीनी देना " + message=u"मुझे ३० रूपए आटा का और ३ हजार का चीनी देना" entity_name='number' structured_value=None fallback_value=None @@ -538,31 +610,43 @@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message,structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - + - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?मुझे%20३०%20किलो%20आटा%20और%20दो%20हजार%20क%20%20चीनी%20देना &entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' - + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मुझे ३० रूपए आटा का और ३ हजार का चीनी देना" ``` > **Output:** ```json - {"data": [ + { + "data": [ { "detection": "message", - "original_text": "३० किलो", - "entity_value": { "value": "३०", "unit": "kg"}, + "original_text": "३०", + "entity_value": { + "unit": null, + "value": "30" + }, "language": "hi" - }] + }, + { + "detection": 
"message", + "original_text": "३ हजार", + "entity_value": { + "unit": null, + "value": "3000" + }, + "language": "hi" + } + ] } - ``` - ***Example 3: Detecting number[Hindi in latin script] without unit in message*** @@ -590,43 +674,52 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - + - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?mujhe%2030%20kilo%20aata%20aur%202%20hajaar%20ka%20chini%20dena%20aur%20 teen%20sau%20ka%20chawal&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' - + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=mujhe 30 kilo aata aur 2 hajaar ka chini dena aur teen sau ka chawal" ``` > **Output:** ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "30", - "entity_value": { "value": "30", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "30", + "entity_value": { + "unit": null, + "value": "30" + }, + "language": "hi" }, { - "detection": "message", - "original_text": "2 hajaar", - "entity_value": { "value": "2000", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "2 hajaar", + "entity_value": { + "unit": null, + "value": "2000" + }, + "language": "hi" }, { - "detection": "message", - "original_text": "teen sau", - "entity_value": { "value": "300", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "teen sau", + "entity_value": { + "unit": null, + "value": "300" + }, + "language": "hi" } - ]} - + ] + } ``` @@ -654,35 +747,35 
@@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message,structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?message=i%20want%20more%20than%20Rupees%2020k%20and%2010%20pendrive&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=currency' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=currency&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want more than Rupees 20k and 10 pendrive" ``` - + > **Output:** - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "Rupees 20k", - "entity_value": { - "value": "20000", - "unit": "rupees" - }, - "language": "en" - }] + "detection": "message", + "original_text": "rupees 20k", + "entity_value": { + "unit": "rupees", + "value": "20000" + }, + "language": "en" + } + ] } - /* here 40 is not detected as unit_type is specified as currency, Hence it only detect numbers having currencies value in unit */ - + /* here 10 is not detected because the specified unit_type is currency; hence only numbers with a currency mentioned as their unit are detected */ ``` ### 4.
Phone number @@ -712,44 +805,46 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/phone_number/?message=my%20contact%20number%20is%209049961794&entity_name=phone_number&structured_value=&fallback_value=&bot_message=&source_language=en' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=send a message on 91 9820334455" ``` - - > **Output **: - + + > **Output:** + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "9049961794", - "entity_value": { "value": "9049961794"}, - "language": "en" - }] + "detection": "message", + "original_text": "91 9820334455", + "entity_value": { + "value": "919820334455" + }, + "language": "en" + } + ] } - ``` - **Example 2: *Detecting phone number (hindi) from message*** - *Django Shell:* - + ```python message = u'मेरा मोबाइल नंबर है ९८९१९८९८७१' entity_name = 'phone_number' structured_value = None fallback_value = None bot_message = None - source_langauge='hi' # here language will be ISO 639-1 code + source_langauge='hi' # here language will be ISO 639-1 code from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector detector = PhoneDetector(language=source_langauge, entity_name=entity_name) @@ -758,37 +853,39 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 
'http://'$URL':'$PORT'/v2/phone_number/?message=मेरा%20मोबाइल%20नंबर%20है%20९८९१९८९८७१entity_name=phone_number&structured_value=&fallback_value=&bot_message=&source_language=en' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मेरा मोबाइल नंबर है ९८९१९८९८७१" ``` - + > **Output **: - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "९८९१९८९८७१", - "entity_value": { "value": "981117971"}, - "language": "hi" - }] + "detection": "message", + "original_text": "९८९१९८९८७१", + "entity_value": { + "value": "9891989871" + }, + "language": "hi" + } + ] } - ``` - - Example 2: *Detecting phone number from fallback value*** - + - **Example 3: *Detecting phone number from fallback value*** + - *Django Shell:* - + ```python message = u'Please call me' entity_name = 'phone_number' @@ -802,33 +899,35 @@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message, entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message,language=source_language) + bot_message=bot_message,language=source_language) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/phone_number/?message=Please%20call%20me&entity_name=phone_number&structured_value=&fallback_value=9049961794&bot_message=&source_language=en' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=9049961794" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=Please call me" ``` - + > **Output **: - + ```json - {"data": [ + 
{ + "data": [ { - "detection": "fallback_value", - "original_text": "9049961794", - "entity_value": {"value": "9049961794"}, - "language": "en" - }] + "detection": "fallback_value", + "original_text": "9049961794", + "entity_value": { + "value": "9049961794" + }, + "language": "en" + } + ] } - ``` @@ -838,13 +937,13 @@ Currently number detection support has been provided for 6 different languages - The Email Detector has the capability to detect emails within the given text. **API Example:** - + - **Example 1: *Detecting emails from message*** - + - *Django Shell:* - + ```python - message = u'my email id is amans.rlx@gmail.com' + message = u'my email id is hello@haptik.ai' entity_name = 'email' structured_value = None fallback_value = None @@ -855,72 +954,80 @@ Currently number detection support has been provided for 6 different languages - structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/email/?message=my%20email%20id%20is%20amans.rlx%40gmail.com&entity_name=email&structured_value=&fallback_value=&bot_message=' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v1/email/?&entity_name=email&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=my email id is hello@haptik.ai" ``` - + > **Output ** - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "amans.rlx@gmail.com", - "entity_value": {"value": "amans.rlx@gmail.com"} - }] + "detection": "message", + "original_text": "hello@haptik.ai", + "entity_value": { + "value": "hello@haptik.ai" + }, + "language": "en" + } + ] } - ``` - + - ***Example 2: Detecting email from fallback value*** - + - *Django Shell:* - + ```python message = u'send this me to my email' entity_name = 'email' 
structured_value = None - fallback_value = 'amans.rlx@gmail.com' + fallback_value = 'hello@haptik.ai' bot_message = None - + from ner_v1.chatbot.entity_detection import get_email output = get_email(message=message,entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) ``` - + - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/email/?message=send%20me%20to%20my%20email&entity_name=email&structured_value=&fallback_value=amans.rlx@gmail.com&bot_message=' - + + ```bash + curl -G -i "http://localhost:8081/v1/email/?&entity_name=email&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=hello@haptik.ai" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=send this me to my email" ``` - + > **Output ** - + ```json - {"data": [ + { + "data": [ { - "detection": "fallback_value", - "original_text": "abc.123@gmail.com", - "entity_value": {"value": "abc.123@gmail.com"} - }] + "detection": "fallback_value", + "original_text": "hello@haptik.ai", + "entity_value": { + "value": "hello@haptik.ai" + }, + "language": "en" + } + ] } ``` - + ### 6. Text @@ -949,40 +1056,67 @@ The Text Detector has the capability to detect custom text entity within the giv fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` - - The above can also be done from within the Docker container's shell. Setup is in docker.md file. 
- - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/text/?message=i%20want%20to%20order%20chinese%20from%20%20mainland%20china%20and%20pizza%20from%20domminos&entity_name=restaurant&structured_value=&fallback_value=&bot_message=&source_language=en' - + + *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v1/text/?&entity_name=restaurant&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want to order chinese from mainland china and pizza from dominos" ``` - - > **Output **: - + + > **Output **: + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "mainland china", - "entity_value": {"value": "Mainland China"}, - "language": "en" + "detection": "message", + "original_text": "mainland china", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Mainland China" + }, + "language": "en" }, { - "detection": "message", - "original_text": "dominos", - "entity_value": { "value": "Domino's Pizza"}, - "language": "en" - }] + "detection": "message", + "original_text": "dominos", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Domino's Pizza" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "chinese", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Yo! 
Chinese" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "pizza", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "U S Pizza" + }, + "language": "en" + } + ] } - ``` + @@ -1009,26 +1143,29 @@ The Text Detector has the capability to detect custom text entity within the giv - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/text/?message=मेरे लिए कैब बुक कर दीजिये&entity_name=movie&structured_value=मुंबई&fallback_value=&bot_message=&source_language=en' - + ```bash + curl -G -i "http://localhost:8081/v1/text/?&entity_name=movie&source_language=hi" \ + --data-urlencode "structured_value=मुंबई" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मेरे लिए कैब बुक कर दीजिये" ``` > **Output **: ```json - {"data": [ + { + "data": [ { - "detection": "structure_value_verified", - "original_text": "mumbai", - "entity_value": {"value": "Mumbai"}, - "language":"hi" - }] + "detection": "structure_value_not_verified", + "original_text": "मुंबई", + "entity_value": { + "value": "मुंबई" + }, + "language": "hi" + } + ] } - ``` @@ -1061,24 +1198,30 @@ The PNR Detector has the capability to detect Train/ Flight PNR number within th - *CURL command:* ```bash - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/pnr/?message=check%20my%20pnr%20status%20for%202141215305.&entity_name=pnr&structured_value=&fallback_value=&bot_message=' + curl -G -i "http://localhost:8081/v1/pnr/?&entity_name=pnr&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=check my pnr status for 2141215305" ``` - + > **Output**: - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "2141215305", - "entity_value": { "value": "2141215305"} - }] + "detection": "message", + "original_text": 
"2141215305", + "entity_value": { + "value": "2141215305" + }, + "language": "en" + } + ] } ``` - + ### 8. Regex @@ -1117,20 +1260,26 @@ Detect entities that match by the specified pattern. If you are not familiar wit - *CURL command:* ```bash - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/regex/?message=please%20apply%20AMAZON30%20coupon%20code%20to my%20cart&entity_name=regex&structured_value=&fallback_value=&bot_message=enter%20the%otp%20®ex=\d{4,6}' + curl -G -i "http://localhost:8081/v1/regex/?&entity_name=regex_coupon_code&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "regex=[A-Z]+\d{2,6}" \ + --data-urlencode "message=please apply AMAZON30 coupon code to my cart" ``` > **Output:** - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "AMAZON30", - "entity_value": "AMAZON30" - }] + "detection": "message", + "original_text": "AMAZON30", + "entity_value": { + "value": "AMAZON30" + } + } + ] } ``` diff --git a/external_api/api.py b/external_api/api.py index b5dcc7b39..4a96b2e7b 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -11,7 +11,7 @@ from chatbot_ner.config import ner_logger from external_api.constants import ENTITY_DATA, ENTITY_NAME, LANGUAGE_SCRIPT, ENTITY_LIST, \ EXTERNAL_API_DATA, SENTENCE_LIST, READ_MODEL_FROM_S3, ES_CONFIG, READ_EMBEDDINGS_FROM_REMOTE_URL, \ - LIVE_CRF_MODEL_PATH + LIVE_CRF_MODEL_PATH, SENTENCES, LANGUAGES from django.views.decorators.csrf import csrf_exempt from models.crf_v2.crf_train import CrfTrain @@ -136,7 +136,7 @@ def get_crf_training_data(request): """ This function is used obtain the training data given the entity_name. 
Args: - request (HttpResponse): HTTP response from url + request (HttpRequest): HTTP request from url Returns: HttpResponse : With data consisting of a dictionary consisting of sentence_list and entity_list @@ -149,8 +149,12 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) - datastore_obj = DataStore() - result = datastore_obj.get_crf_data_for_entity_name(entity_name=entity_name) + languages = request.GET.get(LANGUAGES, '') + + languages = languages.split(',') if languages else [] + + result = DataStore().get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) + response['result'] = result response['success'] = True @@ -174,7 +178,7 @@ def update_crf_training_data(request): """ This function is used to update the training data Args: - request (HttpResponse): HTTP response from url + request (HttpRequest): HTTP request from url Returns: HttpResponse : HttpResponse with appropriate status and error message. 
Example for data present in @@ -186,15 +190,10 @@ def update_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA)) + sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) - entity_list = external_api_data.get(ENTITY_LIST) - sentence_list = external_api_data.get(SENTENCE_LIST) - language_script = external_api_data.get(LANGUAGE_SCRIPT) - datastore_obj = DataStore() - datastore_obj.update_entity_crf_data(entity_name=entity_name, - entity_list=entity_list, - sentence_list=sentence_list, - language_script=language_script) + DataStore().update_entity_crf_data(entity_name=entity_name, + sentences=sentences) response['success'] = True except (DataStoreSettingsImproperlyConfiguredException, diff --git a/external_api/constants.py b/external_api/constants.py index 6481b360a..ecbd717fa 100644 --- a/external_api/constants.py +++ b/external_api/constants.py @@ -1,9 +1,17 @@ ENTITY_NAME = 'entity_name' EXTERNAL_API_DATA = 'external_api_data' ENTITY_DATA = 'entity_data' + +SENTENCES = 'sentences' +LANGUAGES = 'languages' +ENTITIES = 'entities' +SENTENCE = 'sentence' + LANGUAGE_SCRIPT = 'language_script' ENTITY_LIST = 'entity_list' SENTENCE_LIST = 'sentence_list' + + READ_MODEL_FROM_S3 = 'read_model_from_s3' ES_CONFIG = 'es_config' READ_EMBEDDINGS_FROM_REMOTE_URL = 'read_embeddings_from_remote_url' diff --git a/initial_setup.py b/initial_setup.py index 3b93ae708..6f41fc62f 100755 --- a/initial_setup.py +++ b/initial_setup.py @@ -5,25 +5,25 @@ BASE_DIR = os.path.dirname(__file__) -print "Downloading nltk corpus: punkt ..." +print("Downloading nltk corpus: punkt ...") status = nltk.download('punkt') if not status: - print "punkt Download was unsucessful" + print("punkt Download was unsuccessful") -print "Downloading nltk corpus: wordnet ..." 
+print("Downloading nltk corpus: wordnet ...") status = nltk.download('wordnet') if not status: - print "wordnet Download was unsucessful" + print("wordnet Download was unsuccessful") -print "Downloading nltk corpus: MaxEnt POS ..." +print("Downloading nltk corpus: MaxEnt POS ...") status = nltk.download('maxent_treebank_pos_tagger') if not status: - print "MaxEnt POS Download was unsucessful" + print("MaxEnt POS Download was unsuccessful") -print "Downloading nltk corpus: AP POS Tagger..." +print("Downloading nltk corpus: AP POS Tagger...") status = nltk.download('averaged_perceptron_tagger') if not status: - print "AP POS Tagger Download was unsucessful" + print("AP POS Tagger Download was unsuccessful") # Below needs to be committed if you want to use existing data in the Elasticsearch Setup @@ -34,13 +34,14 @@ # POPULATING DATASTORE # Comment out entire section if you want to reuse existing data from datastore import DataStore +from datastore.constants import DEFAULT_ENTITY_DATA_DIRECTORY db = DataStore() -print "Setting up DataStore for Chatbot NER" -print "Deleting any stale data ..." +print("Setting up DataStore for Chatbot NER") +print("Deleting any stale data ...") db.delete() -print "Creating the structure ..." +print("Creating the structure ...") db.create() -print "Populating data from " + os.path.join(BASE_DIR, 'data', 'entity_data') + " ..." -db.populate() -print "Done!" +print("Populating data from " + os.path.join(BASE_DIR, 'data', 'entity_data') + " ...") +db.populate(entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY) +print("Done!") diff --git a/models/crf_v2/README.md b/models/crf_v2/README.md index 2b08170af..747c43cb2 100644 --- a/models/crf_v2/README.md +++ b/models/crf_v2/README.md @@ -1,14 +1,6 @@ - -TODO -- [ ] Change Crf -> CRF - - - - ## CONDITIONAL RANDOM FIELDS - ### A. 
INTRODUCTION Conditional random fields (CRFs) are a class of statistical modeling method often applied in pattern recognition and machine learning and used for structured prediction. CRFs fall into the sequence modeling family. Whereas a discrete classifier predicts a label for a single sample without considering "neighboring" samples, a CRF can take context into account; e.g., the linear chain CRF (which is popular in natural language processing) predicts sequences of labels for sequences of input samples. @@ -195,11 +187,11 @@ The module is used to take input as the sentence_list and entity_list and conver 2. **isupper** - Flag to check if the first letter of the token is capitalized + Flag to check if the complete token is in upper case 3. **istitle** - Flag to check if the complete token is in upper case + Flag to check if the first letter of the token is capitalized 4. **isdigit** diff --git a/ner_constants.py b/ner_constants.py index aab49c427..dd05aa701 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -1,4 +1,3 @@ - # ************************ constant used for detection_method ************************ # when entity is detected from message @@ -45,6 +44,7 @@ PARAMETER_TIMEZONE = 'timezone' PARAMETER_REGEX = 'regex' PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced' +PARAMETER_RANGE_ENABLED = 'range_enabled' # Language parameters of the query. PARAMETER_LANGUAGE_SCRIPT = 'language_script' # ISO 639 code for language. 
For eg, 'en' for 'Namaste', 'Hello' @@ -56,3 +56,6 @@ PARAMETER_MIN_DIGITS = 'min_number_digits' PARAMETER_MAX_DIGITS = 'max_number_digits' PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' + +# Locale for Date and Phone Number detection +PARAMETER_LOCALE = 'locale' \ No newline at end of file diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 94aad4b06..4a7ff7e31 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -253,7 +253,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message fallback_value=fallback_value, bot_message=bot_message) elif isinstance(message, (list, tuple)): - entity_output = text_model_detector.detect_bulk(messages=message) + entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value) return entity_output @@ -568,7 +568,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = name_detection.detect_entity(text=text, bot_message=bot_message) if not entity_list and fallback_text: - entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split()) + entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split(), fallback_text) detection_method = fallback_method if entity_list and original_text_list: diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 3c28db1ca..c7dc6b4c3 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -104,15 +104,29 @@ def detect_bulk(self, messages=None, **kwargs): messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') texts = messages - entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) + entities_list, original_list = self.detect_entity_bulk(texts=texts) - if entities_list: - values_list, method, original_texts_list = entities_list, FROM_MESSAGE, 
original_texts_list - else: - return None + fallback_values = kwargs.get('fallback_values') + values_list, detection_method_list, original_texts_list = [], [], [] + + for i in range(len(messages)): + if entities_list[i]: + values_list.append(entities_list[i]) + detection_method_list.append(FROM_MESSAGE) + original_texts_list.append(original_list[i]) + + elif fallback_values and fallback_values[i]: + values_list.append([fallback_values[i]]) + detection_method_list.append(FROM_FALLBACK_VALUE) + original_texts_list.append([fallback_values[i]]) + + else: + values_list.append([]) + detection_method_list.append(None) + original_texts_list.append([]) return self.output_entity_bulk(entity_values_list=values_list, original_texts_list=original_texts_list, - detection_method=method, + detection_method_list=detection_method_list, detection_language=self._target_language_script) def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs): @@ -257,7 +271,7 @@ def output_entity_bulk(self, entity_values_list, original_texts_list, detection_ entity_value = { ENTITY_VALUE_DICT_KEY: entity_value } - method = detection_method_list[i] if detection_method_list else detection_method + method = detection_method_list[index] if detection_method_list else detection_method entity_list.append( { ENTITY_VALUE: entity_value, diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 07ee1121e..9cf07704e 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -49,7 +49,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG): self.text_detection_object = TextDetector(entity_name=entity_name) @staticmethod - def get_format_name(name_list): + def get_format_name(name_tokens, text): """ Takes input as name_list which contains the names detected. It separates the first, middle and last names. 
@@ -58,7 +58,7 @@ def get_format_name(name_list): 2.The original text. Args: - name_list (list): List of names detected + name_tokens (list): List of tokens in the name Example: ['yash', 'doshi'] @@ -68,19 +68,23 @@ def get_format_name(name_list): ["yash modi"] ) """ - original_text = " ".join(name_list) + entity_value = [] + original_text = [] - first_name = name_list[0] + name_text = " ".join(name_tokens) + + first_name = name_tokens[0] middle_name = None last_name = None - if len(name_list) > 1: - last_name = name_list[-1] - middle_name = " ".join(name_list[1:-1]) or None - - entity_value = {FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name} + if name_text in text: + if len(name_tokens) > 1: + last_name = name_tokens[-1] + middle_name = " ".join(name_tokens[1:-1]) or None - return [entity_value], [original_text] + entity_value.append({FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name}) + original_text.append(name_text) + return entity_value, original_text def text_detection_name(self, text=None): """ @@ -127,19 +131,19 @@ def get_name_using_pos_tagger(self, text): return entity_value, original_text if pattern1_match: - entity_value, original_text = self.get_format_name(pattern1_match[0][1].split()) + entity_value, original_text = self.get_format_name(pattern1_match[0][1].split(), self.text) elif pattern2_match: - entity_value, original_text = self.get_format_name(pattern2_match[0].split()) + entity_value, original_text = self.get_format_name(pattern2_match[0].split(), self.text) elif pattern3_match: - entity_value, original_text = self.get_format_name(pattern3_match[0].split()) + entity_value, original_text = self.get_format_name(pattern3_match[0].split(), self.text) elif len(name_tokens) < 4: pos_words = [word[0] for word in tagged_names if word[1].startswith('NN') or word[1].startswith('JJ')] if pos_words: - entity_value, original_text = self.get_format_name(pos_words) + entity_value, original_text = 
self.get_format_name(pos_words, self.text) return entity_value, original_text @@ -297,7 +301,7 @@ def detect_person_name_entity(self, replaced_text): name_list.append(name_holder) for name in name_list: - name_entity_value, original_text_value = self.get_format_name(name) + name_entity_value, original_text_value = self.get_format_name(name, self.text) original_text.extend(original_text_value) entity_value.extend(name_entity_value) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 2cd2cf7dc..4ece23730 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,5 +1,5 @@ import collections -import re +import string from six import iteritems @@ -10,6 +10,15 @@ from lib.nlp.levenshtein_distance import edit_distance from ner_v1.detectors.base_detector import BaseDetector +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + + import re + _re_flags = re.UNICODE + class TextDetector(BaseDetector): """ @@ -419,7 +428,11 @@ def _text_detection_with_variants(self): if original_text: value_final_list.append(variants_to_values[variant]) original_final_list.append(original_text) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE) + + boundary_punct_pattern = re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) + original_text_ = boundary_punct_pattern.sub("", original_text) + + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index]) # Instead of dropping completely like in other entities, # we replace with tag to avoid matching non contiguous segments diff --git a/ner_v1/tests/textual/__init__.py b/ner_v1/tests/textual/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/api.py b/ner_v2/api.py index 63657248d..c7d0975c1 100644 --- 
a/ner_v2/api.py +++ b/ner_v2/api.py @@ -3,7 +3,8 @@ from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ - PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE + PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \ + PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector from ner_v2.detectors.temporal.time.time_detection import TimeDetector @@ -40,6 +41,8 @@ def get_parameters_dictionary(request): PARAMETER_MIN_DIGITS: request.GET.get('min_number_digits'), PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request.GET.get('unit_type'), + PARAMETER_LOCALE: request.GET.get('locale'), + PARAMETER_RANGE_ENABLED: request.GET.get('range_enabled') } return parameters_dict @@ -68,7 +71,9 @@ def parse_post_request(request): PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG), PARAMETER_MIN_DIGITS: request_data.get('min_number_digits'), PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), - PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type') + PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type'), + PARAMETER_LOCALE: request_data.get('locale'), + PARAMETER_RANGE_ENABLED: request_data.get('range_enabled') } return parameters_dict @@ -95,6 +100,7 @@ def date(request): timezone (str): timezone of the user source_language (str): source language code (ISO 639-1) language_script (str): language code of script (ISO 639-1) + locale (str): locale of the user(ISO 639-1) Returns: response (django.http.response.HttpResponse): HttpResponse object @@ -109,6 +115,7 @@ def date(request): timezone = 'UTC' source_language = 'hi' language_script = 'en' + locale = 'hi-in' output = 
date(request) print output @@ -130,7 +137,8 @@ def date(request): date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, - past_date_referenced=past_date_referenced) + past_date_referenced=past_date_referenced, + locale=parameters_dict[PARAMETER_LOCALE]) date_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) @@ -203,8 +211,9 @@ def time(request): parameters_dict = get_parameters_dictionary(request) ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) - timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' + timezone = parameters_dict[PARAMETER_TIMEZONE] or None form_check = True if parameters_dict[PARAMETER_STRUCTURED_VALUE] else False + range_enabled = True if parameters_dict[PARAMETER_RANGE_ENABLED] else False time_detection = TimeDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone) @@ -218,7 +227,8 @@ def time(request): entity_output = time_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - form_check=form_check) + form_check=form_check, + range_enabled=range_enabled) elif isinstance(message, (list, tuple)): entity_output = time_detection.detect_bulk(messages=message) @@ -509,6 +519,7 @@ def phone_number(request): ] """ try: + parameters_dict = {} if request.method == "POST": parameters_dict = parse_post_request(request) ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) @@ -521,8 +532,11 @@ def phone_number(request): ner_logger.debug('Entity Name %s' % entity_name) ner_logger.debug('Source Language %s' % language) - phone_number_detection = PhoneDetector(entity_name=entity_name, language=language) + phone_number_detection = PhoneDetector(entity_name=entity_name, language=language, + 
locale=parameters_dict[PARAMETER_LOCALE]) message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None + ner_logger.debug(parameters_dict) if isinstance(message, six.string_types): entity_output = phone_number_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], diff --git a/ner_v2/detectors/numeral/constant.py b/ner_v2/detectors/numeral/constant.py index 1c3b7d3cd..8ae0ccac6 100644 --- a/ner_v2/detectors/numeral/constant.py +++ b/ner_v2/detectors/numeral/constant.py @@ -42,3 +42,4 @@ NUMBER_RANGE_MIN_VALUE = 'min_value' NUMBER_RANGE_MAX_VALUE = 'max_value' NUMBER_RANGE_VALUE_UNIT = 'unit' +NUMBER_RANGE_ABS_VALUE = 'abs_value' diff --git a/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv index 9c9310e69..b292c1842 100644 --- a/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv @@ -32,3 +32,4 @@ number,name_variants,number_value,number_type 100000,lakh|lakhs|lac|lacs|l,100000,scale 100000,million|mil|m,1000000,scale 10000000,crore|crores|c|cr,10000000,scale +100000000,billion|bil|b,1000000000,scale diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index f53b73cf3..ec35b5ab2 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -1,9 +1,16 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ -currency,dollar,Dollar | usd | $ +currency,dollar,Dollar | dollars | usd | $ +currency,euro,Euro | euros | eur | € +currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ +currency,cent,Cents | cent | ¢ +currency,pence,Pence package_metric_unit,mg,mg | milligram | milligrams | mgs package_metric_unit,gms,gms | 
grams | gram | gm | g package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres package_metric_unit,ltr,ltr | litre | liter | litres | liters | l -package_metric_unit,pcs,pcs | pc | pieces | piece \ No newline at end of file +package_metric_unit,pcs,pcs | pc | pieces | piece +channel,channel,channel |channel number | chanel | chanel number | open | go to +episode,episode, episode | episod +season,season, season | seasn diff --git a/ner_v2/detectors/numeral/number/gu/data/units.csv b/ner_v2/detectors/numeral/number/gu/data/units.csv index b71974bc1..0c260a36f 100644 --- a/ner_v2/detectors/numeral/number/gu/data/units.csv +++ b/ner_v2/detectors/numeral/number/gu/data/units.csv @@ -1,3 +1,4 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ |રૂપિયા | ભારતીય રૂપિયા | પૈસા currency,dollar,Dollar | usd | $ | ડોલર +channel,channel, ચનેલ | ચેનલ | ચણેલ | નંબર \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/hi/data/units.csv b/ner_v2/detectors/numeral/number/hi/data/units.csv index 53b9c82ca..17c699daa 100644 --- a/ner_v2/detectors/numeral/number/hi/data/units.csv +++ b/ner_v2/detectors/numeral/number/hi/data/units.csv @@ -7,3 +7,4 @@ package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा +channel,channel,चैनल नंबर | चॅनेल नंबर | चनेल | चैनल | चैनल | चनाल | चेनल नंबर | चनेल नंबर| चैनल नंबर| चैनल नंबर| चनाल नंबर| चेनल नंबर | नंबर \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv 
b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv index 18252e2b8..8826726a9 100644 --- a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv @@ -23,11 +23,11 @@ number,name_variants,number_value,number_type १९,एकोणीस|ekonis|econis,19,unit २०,वीस|Vis,20,unit २१,एकवीस|Ekavis|Ekvis,21,unit -२२,बावीस|Bavis,22,unit +२२,बावीस|Bavis|Baavis,22,unit २३,तेवीस|Tevis,23,unit -२४,चोवीस|chauwis|chauvis,24,unit +२४,चोवीस|chauwis|chauvis|chovis|chowis,24,unit २५,पंचवीस|panchavis|panchvis,25,unit -२६,सव्वीस|Savvis,26,unit +२६,सव्वीस|Savvis|Sauvis,26,unit २७,सत्तावीस|Sattavis,27,unit २८,अठ्ठावीस|Aththavis|Attavis,28,unit २९,एकोणतीस|Ekonatis|Ekontis,29,unit @@ -38,14 +38,14 @@ number,name_variants,number_value,number_type ३४,चौतीस|chautis,34,unit ३५,पस्तीस|pastis,35,unit ३६,छत्तीस|Chattis,36,unit -३७,सदतीस|sadatis,37,unit -३८,अडतीस|adatis,38,unit +३७,सदतीस|sadatis|sadotis,37,unit +३८,अडतीस|adatis|adotis,38,unit ३९,एकोणचाळीस|ekonachalis|ekonchalis|econchalis,39,unit ४०,चाळीस|chalis,40,unit ४१,एक्केचाळीस|ekkechalis|Akkechalis,41,unit ४२,बेचाळीस|bechalis,42,unit ४३,त्रेचाळीस|trechalis,43,unit -४४,चव्वेचाळीस|chavvechalis|chavechalis,44,unit +४४,चव्वेचाळीस|chavvechalis|chavechalis|chavrechalis,44,unit ४५,पंचेचाळीस|pamchechalis|panchechalis,45,unit ४६,सेहेचाळीस|sehechalis|Sechalis,46,unit ४७,सत्तेचाळीस|sattechalis,47,unit @@ -63,33 +63,33 @@ number,name_variants,number_value,number_type ५९,एकोणसाठ|ekonasath|ekonasat|ekonsath,59,unit ६०,साठ|sath,60,unit ६१,एकसष्ठ|ekasashth|ekshasth,61,unit -६२,बासष्ठ|basashth|Basath,62,unit +६२,बासष्ठ|basashth|Basath|Besashth,62,unit ६३,त्रेसष्ठ|tresashth|Tresath,63,unit ६४,चौसष्ठ|chausashth|chausath,64,unit ६५,पासष्ठ|pasashth|Pasath,65,unit ६६,सहासष्ठ|sahasashth|Sahasath,66,unit -६७,सदुसष्ठ|sadusashth|sadusath,67,unit -६८,अडुसष्ठ|adusashth|adusath,68,unit -६९,एकोणसत्तर|ekonsattar,69,unit +६७,सदुसष्ठ|sadusashth|sadusath|sadosashth,67,unit 
+६८,अडुसष्ठ|adusashth|adusath|adosashth,68,unit +६९,एकोणसत्तर|ekonsattar|ekunsattar,69,unit ७०,सत्तर|sattar,70,unit ७१,एक्काहत्तर|ekkahattar|ekattar,71,unit ७२,बाहत्तर|bahattar|Baattar,72,unit -७३,त्र्याहत्तर|tryahattar,73,unit -७४,चौर्‍याहत्तर|chauryahattar,74,unit +७३,त्र्याहत्तर|tryahattar|tryattar,73,unit +७४,चौर्‍याहत्तर|chauryahattar|chauryattar,74,unit ७५,पंच्याहत्तर|pamchyahattar|panchattar,75,unit ७६,शहात्तर|shahattar|shattar,76,unit ७७,सत्याहत्तर|satyahattar|Satyattar,77,unit -७८,अठ्ठ्याहत्तर|aththyahattar,78,unit -७९,एकोण ऐंशी|ekon aimshi|ekon anshi,79,unit +७८,अठ्ठ्याहत्तर|aththyahattar|atthyattar ,78,unit +७९,एकोण ऐंशी|ekon aimshi|ekon anshi|ekonainshi,79,unit ८०,ऐंशी|Aenshi,80,unit ८१,एक्क्याऐंशी|ekkyaaimshi|Ekkyanshi,81,unit ८२,ब्याऐंशी|byaaimshi|byanshi,82,unit ८३,त्र्याऐंशी|Tryaaimshi|Tryaanshi,83,unit ८४,चौऱ्याऐंशी|chauryaaimshi|chauryanshi,84,unit ८५,पंच्याऐंशी|pamchyaaimshi|Panchyanshi,85,unit -८६,शहाऐंशी|shahaaimshi|Shaynshi,86,unit +८६,शहाऐंशी|shahaaimshi|Shaynshi|Shahaainshi,86,unit ८७,सत्त्याऐंशी|sattyaaimshi|satyanshi,87,unit -८८,अठ्ठ्याऐंशी|aththyaaimshi|athyanshi,88,unit +८८,अठ्ठ्याऐंशी|aththyaaimshi|athyanshi|aththyaainshi,88,unit ८९,एकोणनव्वद|ekonanavvad|ekonnavvad,89,unit ९०,नव्वद|navvad|navad,90,unit ९१,एक्क्याण्णव|ekkyannav,91,unit @@ -100,8 +100,8 @@ number,name_variants,number_value,number_type ९६,शहाण्णव|shahannav|shyanav,96,unit ९७,सत्त्याण्णव|sattyannav,97,unit ९८,अठ्ठ्याण्णव|aththyannav|athyanav,98,unit -९९,नव्व्याण्णव|navvyannav|navyannav,99,unit +९९,नव्व्याण्णव|navvyannav|navyannav|navvyanav,99,unit १००,शंभर|shambhar|shambar,100,scale -१०००,हजार|hazar|hajar,1000,scale +१०००,हजार|hazar|hajar|hajaar|hazaar,1000,scale १०००००,लाख|Lakh,100000,scale १०००००००,कोटी|koti,10000000,scale diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py index cbfb0ebb8..56427aeaa 100644 --- a/ner_v2/detectors/numeral/number/number_detection.py +++ 
b/ner_v2/detectors/numeral/number/number_detection.py @@ -67,7 +67,7 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None): + def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_without_unit=False): """Initializes a NumberDetector object Args: @@ -90,6 +90,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None): self.max_digit = 6 self.language = language self.unit_type = unit_type + self.detect_without_unit = detect_without_unit try: number_detector_module = importlib.import_module( 'ner_v2.detectors.numeral.number.{0}.number_detection'.format(self.language)) @@ -140,7 +141,8 @@ def detect_entity(self, text, **kwargs): number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT] if self.min_digit <= self._num_digits(number_value) <= self.max_digit: if self.unit_type and (number_unit is None or - self.language_number_detector.units_map[number_unit].type != self.unit_type): + self.language_number_detector.units_map[number_unit].type != self.unit_type)\ + and not self.detect_without_unit: continue validated_number.append(number_value_dict) validated_number_text.append(original_text) @@ -183,4 +185,4 @@ def _num_digits(value): ValueError: if the given string cannot be cast to float """ v = abs(float(value)) - return 1 if int(v) == 0 else (1 + int(math.log10(v))) + return 1 if int(v) == 0 else (1 + int(math.log10(v))) \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index fc69f4a4a..b4b0ce798 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -2,7 +2,17 @@ import pandas as pd import collections import os -import re + +try: + import regex as re + + _re_flags = re.UNICODE | re.V1 | re.WORD + +except 
ImportError: + + import re + + _re_flags = re.UNICODE from ner_v2.detectors.numeral.constant import NUMBER_NUMERAL_FILE_VARIANTS_COLUMN_NAME, \ NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME, NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME, NUMBER_TYPE_UNIT, \ @@ -151,8 +161,10 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'\W+((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')|(' + - detected_original + r'\s*(' + self.unit_choices + r'))\W+', processed_text, + unit_matches = re.search(r'\W+((' + self.unit_choices + r')[.,\s]*' + detected_original + r')\W+|\W+(' + + detected_original + r'\s*(' + + self.unit_choices + r'))\W+', + processed_text, re.UNICODE) if unit_matches: original_text_prefix, unit_prefix, original_text_suffix, unit_suffix = unit_matches.groups() @@ -209,11 +221,19 @@ def _detect_number_from_words(self, number_list=None, original_list=None): numeral_text_list = re.split(r'[\-\:]', self.processed_text) for numeral_text in numeral_text_list: numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map) - for number, original_text in zip(numbers, original_texts): + full_list = list(zip(numbers, original_texts)) + """ + list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and + our lint checker is matching it for python 3 + """ + sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True) + for number, original_text in sorted_full_list: unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, numeral_text) - numeral_text = numeral_text.replace(original_text, self.tag) + # numeral_text = numeral_text.replace(original_text, self.tag) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) + numeral_text = _pattern.sub(self.tag, numeral_text) 
number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit @@ -286,7 +306,8 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, processed_text) - processed_text = processed_text.replace(original_text, self.tag) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) + processed_text = _pattern.sub(self.tag, processed_text) number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit @@ -307,8 +328,9 @@ def _update_processed_text(self, original_number_list): created from entity_name """ for detected_text in original_number_list: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - self.processed_text = self.processed_text.replace(detected_text, '') + _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) + self.tagged_text = _pattern.sub(self.tag, self.tagged_text) + self.processed_text = _pattern.sub('', self.processed_text) class NumberDetector(BaseNumberDetector): diff --git a/ner_v2/detectors/numeral/number/te/__init__.py b/ner_v2/detectors/numeral/number/te/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv new file mode 100644 index 000000000..3bb362af5 --- /dev/null +++ b/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv @@ -0,0 +1,107 @@ +number,name_variants,number_value,number_type +౦,సున్నా|సున్న|శూన్యం|సూన్యం|జీరో|sunna|shunyam|sunyam|zero,0,unit +౧.౫,ఒకటిన్నర|ఒక్కటి అర|ఒకటి అర|okatinnara|okkati ara|okati ara,1.5,unit +౨.౫,రెండున్నర|రెండు అర|rendunnara|rendu ara,2.5,unit +౧,ఒక్కటి|ఒకటి|మొదటిది|ఒకటవ|ఒకటో|ఒక|okkati|okati|modatidi|okatova|okato|okkato|okkatova|oka,1,unit 
+౨,రెండు|రొండు|రెండొవ|రెండో|రెండొవది|rendu|rondu|rendova|rendo|rendovadi,2,unit +౩,మూడు|మూడోవ|మూడోవ|మూడొవది|మూడో|mudu|muudu|mudova|muudova|mudovadi|mudo|muudo,3,unit +౪,నాలుగు|నాల్గు|నాల్గొవ|నాల్గొవది|నాల్గో|నాలగు|nalugu|naalugu|nalgu|nalgova|nalgovadi|nalgo|nalagu|nalagu|naalagu,4,unit +౫,ఐదు|అయిదు|ఐదొవ|అయిదవది|ఐదో|idu|ayidu|iydu|iydova|ayidovadi|ayido|ido|aidu,5,unit +౬,ఆరు|ఆఱు|ఆరొవ|ఆరొవది|ఆరో|అర డజను|అర డజన్|aru|aaru|aarova|aarovadi|aro|aaro|ara dozen|ara dozenu|ara dajanu|ara dajan,6,unit +౭,ఏడు|ఏడొవ|ఏడొవది|ఏడో|aedu|aedova|aedovadi|aedo|ado|adova|adovadi,7,unit +౮,ఎనిమిది|ఎనిమిదొవ|ఎనిమిదిది|ఎనిమిదో|enimidi|enimdova|enimididi|enimido,8,unit +౯,తొమ్మిది|తొమ్మిదిది|తొమ్మిదో|tommidi|thommidi|thommididi|thommido|tomidi|thomido,9,unit +౧౦,పది|పదొవ|పదొవది|పదో|padi|padhi|padova|padovadi|pado,10,unit +౧౧,పదకొండు|పదకొండొవ|పదకొండొవది|పదకొండో|padakondu|padakondova|padakondovadi|padakondo,11,unit +౧౨,పన్నెండు|పన్నెండవ|పన్నెండవది|పన్నెండో|పన్నెండొవ|పన్నెండొవది|పన్నెండొ|డజన్|డజను|dozenu|dojanu|dojan|dozen|pannendu|pannendova|pannendovadi|pannendo,12,unit +౧౩,పదమూడు|పదమూడొవ|పదముండొవది|పదముండొ|padamudu|padamudova|padamundovadi|padamundo,13,unit +౧౪,పద్నాలుగు|పద్నలుగొవ|పద్నాలుగుది|పద్నాలుగో|padnalugu|padnalugova|padnalugudi|padnalugo|padinalugu|padinalagu,14,unit +౧౫,పదిహేను|పదిహేనోవా|పదిహేనోవాది|పదిహేనో|పదైదు|padihenu|padihenova|padihenovadi|padiheno|padaidu,15,unit +౧౬,పదహారు|పదహారోది|పదహారో|padahaaru|padhaharu|padaharodi|padaharo,16,unit +౧౭,పదిహేడు|పదిహేనొవ|పదిహేనోవాది|పదిహేనో|padihedu|padhihedu|padihenova|padihenovadi|padiheno,17,unit +౧౮,పద్దెనిమిది|పద్దెనిమిదొవ|పద్దెనిమిదిది|పద్దెనిమిదో|పద్దెనిమిదొవ|paddenimidi|padhenimidi|paddenimidova|paddenimididi|padhenimidho|padhenimidova,18,unit +౧౯,పంతొమ్మిది|పందొమ్మిదొవ|పంతొమ్మిదో|పంతొమ్మిదొవ|పందొమ్మిది|pantommidi|pandommidova|panthommido,19,unit +౨౦,ఇరవై|ఇరవై|ఇరవయ్యొవది|ఇరవయ్యో|iravay|iravai|iravayyovadi|iravayyo,20,unit +౨౧,ఇరవయ్యొక్కటి|ఇరవై ఒకటి |ఇరవై ఒక్కటి|ఇరవై ఒకటో|iravayyokkati|iravay okati|eravay okati|iravay okato,21,unit 
+౨౨,ఇరవై రెండు|ఇరవై రెండొవ|ఇరవై రెండొవది|ఇరవై రెండో|iravay rendu|iravay rendova|iravay rendovadi|iravay rendo,22,unit +౨౩,ఇరవై మూడు|ఇరవై మూడవా|ఇరవై మూడొవది|ఇరవై మూడో|iravay mudu|iravay mudova|iravay mudovadi|iravay mudo,23,unit +౨౪,ఇరవై నాలుగు|ఇరవై నాల్గొవ|ఇరవై నాల్గొవది|ఇరవై నాల్గో|ఇరవై నాలగు|iravay naalugu|iravay naalgova|iravay nalgovadi|iravay nalgo|iravay nalagu|iaravay naalagu,24,unit +౨౫,ఇరవై ఐదు|ఇరవై ఐదొవ|ఇరవై ఐదొవది|ఇరవై ఐదో|పాతిక|pathika|patika|paathika|paatika|iravay aidu|iravay aidu|iravay aidova|iravay aidovadi|iravay aido,25,unit +౨౬,ఇరవై ఆఱు|ఇరవై ఆరు|ఇరవై ఆరొవ|ఇరవై ఆరో|ఇరవై ఆరొవది|iravay aaru|iravay aaru|iravay arova|iravay aaro|iravay aarovadi,26,unit +౨౭,ఇరవయ్యేడు|ఇరవయ్యేడొవ|ఇరవయ్యేడొవది|ఇరవయ్యేడో|ఇరవై ఏడు|iravayeedu|iravayeedova|iravayeedovadi|iravayeedo|iravay aedu,27,unit +౨౮,ఇరవై ఎనిమిది|ఇరవై ఎనిమిదొవ| ఇరవై ఎనిమిదిది|ఇరవై ఎనిమిదో|iravay enimidi|iravay enimidova|iraavay enimididi|iravay enimido,28,unit +౨౯,ఇరవై తొమ్మిది|ఇరవై తొమ్మిదొవది|ఇరవై తొమ్మిదో|iravay tommidi|iravay thommidi|iravay thommidovadi|iravay thommido,29,unit +౩౦,ముప్పై|ముప్పైయొవది|ముప్పైయ్యొవది|ముప్పైయొవ|ముప్పైయో|muppai|muppaiovadi|muppaiova|muppaiyo,30,unit +౩౧,ముప్పై ఒక్కటి|ముప్పై ఒకటవ|ముప్పై ఒకటోవది|ముప్పై ఒకటో|muppai okkati|muppai okatova|muppai okatovadi|muppai okato,31,unit +౩౨,ముప్పై రెండు|ముప్పై రెండొవ|ముప్పై రెండో|muppai rendu|muppai rendova|muppai rendo,32,unit +౩౩,ముప్పై మూడు|ముప్పై మూడోవ|ముప్పై మూడోవ|ముప్పై మూడొవది|ముప్పై మూడో|muppai mudu|muppai muudu|muppai mudova|muppai muudova|muppai mudovadi|muppai mudo|muppai muudo,33,unit +౩౪,ముప్పై నాలుగు|ముప్పై నాల్గు|ముప్పై నాల్గొవ|ముప్పై నాల్గొవది|ముప్పై నాల్గో|ముప్పై నాలగు|muppai nalugu|muppai naalugu|muppai nalgu|muppai nalgova|muppai nalgovadi|muppai nalgo|muppai nalagu|muppai naalagu,34,unit +౩౫,ముప్పై ఐదు|ముప్పై అయిదు|ముప్పై ఐదొవ|ముప్పై అయిదవది|ముప్పై ఐదో|muppai idu|muppai ayidu|muppai iydu|muppai iydova|muppai ayidovadi|muppai ayido|muppai ido|muppai aidu,35,unit +౩౬,ముప్పై ఆరు|ముప్పై ఆఱు|ముప్పై ఆరొవ|ముప్పై 
ఆరొవది|ముప్పై ఆరో|muppai aru|muppai aaru|muppai aarova|muppai aarovadi|muppai aro|muppai aaro,36,unit +౩౭,ముప్పై ఏడు|ముప్పై ఏడొవ|ముప్పై ఏడొవది|ముప్పై ఏడో|muppai aedu|muppai aedova|muppai aedovadi|muppai aedo|muppai ado|muppai adova|muppai adovadi,37,unit +౩౮,ముప్పై ఎనిమిది|ముప్పై ఎనిమిదొవ|ముప్పై ఎనిమిదిది|ముప్పై ఎనిమిదో|muppai enimidi|muppai enimdova|muppai enimididi|muppai enimido,38,unit +౩౯,ముప్పై తొమ్మిది|ముప్పై తొమ్మిదిది|ముప్పై తొమ్మిదో|muppai tommidi|muppai thommidi|muppai thommididi|muppai thommido|muppai tomidi|muppai thomido,39,unit +౪౦,నలభై|నలబై|నలభైయొవ|నలభైయొవది|నలభైయ్యొవది|నలభైయో|nalabhai|nalabhay|nalabayova|nalabayovadi|nalabhayo,40,unit +౪౧,నలభై ఒక్కటి|నలభై ఒకటి|నలభై మొదటిది|నలభై ఒకటవ|నలభై ఒకటో|nalabhai okkati|nalabhai okati|nalabhai modatidi|nalabhai okatova|nalabhai okato|nalabhai okkato|nalabhai okkatova,41,unit +౪౨,నలభై రెండు|నలభై రొండు|నలభై రెండొవ|నలభై రెండో|నలభై రెండొవది|nalabhai rendu|nalabhai rondu|nalabhai rendova|nalabhai rendo|nalabhai rendovadi,42,unit +౪౩,నలభై మూడు|నలభై మూడోవ|నలభై మూడోవ|నలభై మూడొవది|నలభై మూడో|nalabhai mudu|nalabhai muudu|nalabhai mudova|nalabhai muudova|nalabhai mudovadi|nalabhai mudo|nalabhai muudo,43,unit +౪౪,నలభై నాలుగు|నలభై నాల్గు|నలభై నాల్గొవ|నలభై నాల్గొవది|నలభై నాల్గో|nalabhai nalugu|nalabhai naalugu|nalabhai nalgu|nalabhai nalgova|nalabhai nalgovadi|nalabhai nalgo|nalabhai naalagu,44,unit +౪౫,నలభై ఐదు|నలభై అయిదు|నలభై ఐదొవ|నలభై అయిదవది|నలభై ఐదో|nalabhai idu|nalabhai ayidu|nalabhai iydu|nalabhai iydova|nalabhai ayidovadi|nalabhai ayido|nalabhai ido|nalabhai aidu,45,unit +౪౬,నలభై ఆరు|నలభై ఆఱు|నలభై ఆరొవ|నలభై ఆరొవది|నలభై ఆరో|nalabhai aru|nalabhai aaru|nalabhai aarova|nalabhai aarovadi|nalabhai aro|nalabhai aaro,46,unit +౪౭,నలభై ఏడు|నలభై ఏడొవ|నలభై ఏడొవది|నలభై ఏడో|nalabhai aedu|nalabhai aedova|nalabhai aedovadi|nalabhai aedo|nalabhai ado|nalabhai adova|nalabhai adovadi,47,unit +౪౮,నలభై ఎనిమిది|నలభై ఎనిమిదొవ|నలభై ఎనిమిదిది|నలభై ఎనిమిదో|nalabhai enimidi|nalabhai enimdova|nalabhai enimididi|nalabhai enimido,48,unit +౪౯,నలభై 
తొమ్మిది|నలభై తొమ్మిదిది|నలభై తొమ్మిదో|nalabhai tommidi|nalabhai thommidi|nalabhai thommididi|nalabhai thommido|nalabhai tomidi|nalabhai thomido,49,unit +౫౦,యాభై|యాభైయొవ|యాభైయొవది|నలభైయ్యొవది|యాభైయ్యో|yabhai|yabhaiyyova|yabhaiyova|yabhaiyyovadi|yabhaiyyo,50,unit +౫౧,యాభై ఒక్కటి|యాభై ఒకటి|యాభై మొదటిది|యాభై ఒకటవ|యాభై ఒకటో|yabhai okkati|yabhai okati|yabhai modatidi|yabhai okatova|yabhai okato|yabhai okkato|yabhai okkatova,51,unit +౫౨,యాభై రెండు|యాభై రొండు|యాభై రెండొవ|యాభై రెండో|యాభై రెండొవది|yabhai rendu|yabhai rondu|yabhai rendova|yabhai rendo|yabhai rendovadi,52,unit +౫౩,యాభై మూడు|యాభై మూడోవ|యాభై మూడోవ|యాభై మూడొవది|యాభై మూడో|yabhai mudu|yabhai muudu|yabhai mudova|yabhai muudova|yabhai mudovadi|yabhai mudo|yabhai muudo,53,unit +౫౪,యాభై నాలుగు|యాభై నాల్గు|యాభై నాల్గొవ|యాభై నాల్గొవది|యాభై నాల్గో|యాభై నాలగు|yabhai nalugu|yabhai naalugu|yabhai nalgu|yabhai nalgova|yabhai nalgovadi|yabhai nalgo|yabhai nalagu|yabhai nalagu|yabhai naalagu,54,unit +౫౫,యాభై ఐదు|యాభై అయిదు|యాభై ఐదొవ|యాభై అయిదవది|యాభై ఐదో|yabhai idu|yabhai ayidu|yabhai iydu|yabhai iydova|yabhai ayidovadi|yabhai ayido|yabhai ido|yabhai aidu,55,unit +౫౬,యాభై ఆరు|యాభై ఆఱు|యాభై ఆరొవ|యాభై ఆరొవది|యాభై ఆరో|yabhai aru|yabhai aaru|yabhai aarova|yabhai aarovadi|yabhai aro|yabhai aaro,56,unit +౫౭,యాభై ఏడు|యాభై ఏడొవ|యాభై ఏడొవది|యాభై ఏడో|yabhai aedu|yabhai aedova|yabhai aedovadi|yabhai aedo|yabhai ado|yabhai adova|yabhai adovadi,57,unit +౫౮,యాభై ఎనిమిది|యాభై ఎనిమిదొవ|యాభై ఎనిమిదిది|యాభై ఎనిమిదో|yabhai enimidi|yabhai enimdova|yabhai enimididi|yabhai enimido,58,unit +౫౯,యాభై తొమ్మిది|యాభై తొమ్మిదిది|యాభై తొమ్మిదో|yabhai tommidi|yabhai thommidi|yabhai thommididi|yabhai thommido|yabhai tomidi|yabhai thomido,59,unit +౬౦,అరవై|అరవైయవ|అరవైయొవ|అరవయొవది|అరవైయ్యొవది|అరవయ్యో|అరవయోవ|అరవయోవ|aravay|aravai|aravaiyova|aravaiyyovadi|aravayyo,60,unit +౬౧,అరవై ఒక్కటి|అరవై ఒకటి|అరవై మొదటిది|అరవై ఒకటవ|అరవై ఒకటో|aravai okkati|aravai okati|aravai modatidi|aravai okatova|aravai okato|aravai okkato|aravai okkatova,61,unit +౬౨,అరవై రెండు|అరవై 
రొండు|అరవై రెండొవ|అరవై రెండో|అరవై రెండొవది|aravai rendu|aravai rondu|aravai rendova|aravai rendo|aravai rendovadi,62,unit +౬౩,అరవై మూడు|అరవై మూడోవ|అరవై మూడోవ|అరవై మూడొవది|అరవై మూడో|aravai mudu|aravai muudu|aravai mudova|aravai muudova|aravai mudovadi|aravai mudo|aravai muudo,63,unit +౬౪,అరవై నాలుగు|అరవై నాల్గు|అరవై నాల్గొవ|అరవై నాల్గొవది|అరవై నాల్గో|అరవై నాలగు|aravai nalugu|aravai naalugu|aravai nalgu|aravai nalgova|aravai nalgovadi|aravai nalgo|aravai nalagu|aravai nalagu|aravai naalagu,64,unit +౬౫,అరవై ఐదు|అరవై అయిదు|అరవై ఐదొవ|అరవై అయిదవది|అరవై ఐదో|aravai idu|aravai ayidu|aravai iydu|aravai iydova|aravai ayidovadi|aravai ayido|aravai ido|aravai aidu,65,unit +౬౬,అరవై ఆరు|అరవై ఆఱు|అరవై ఆరొవ|అరవై ఆరొవది|అరవై ఆరో|aravai aru|aravai aaru|aravai aarova|aravai aarovadi|aravai aro|aravai aaro,66,unit +౬౭,అరవై ఏడు|అరవై ఏడొవ|అరవై ఏడొవది|అరవై ఏడో|aravai aedu|aravai aedova|aravai aedovadi|aravai aedo|aravai ado|aravai adova|aravai adovadi,67,unit +౬౮,అరవై ఎనిమిది|అరవై ఎనిమిదొవ|అరవై ఎనిమిదిది|అరవై ఎనిమిదో|aravai enimidi|aravai enimdova|aravai enimididi|aravai enimido,68,unit +౬౯,అరవై తొమ్మిది|అరవై తొమ్మిదిది|అరవై తొమ్మిదో|aravai tommidi|aravai thommidi|aravai thommididi|aravai thommido|aravai tomidi|aravai thomido,69,unit +౭౦,డెబ్బై|డెబ్బైయొవ|డెబ్బైయ్యొవది|డెబ్బైయ్యో|debhai|debhaiyyova|debhaiyovadi|debhaiyyo,70,unit +౭౧,డెబ్బై ఒక్కటి|డెబ్బై ఒకటి|డెబ్బై మొదటిది|డెబ్బై ఒకటవ|డెబ్బై ఒకటో|debhai okkati|debhai okati|debhai modatidi|debhai okatova|debhai okato|debhai okkato|debhai okkatova,71,unit +౭౨,డెబ్బై రెండు|డెబ్బై రొండు|డెబ్బై రెండొవ|డెబ్బై రెండో|డెబ్బై రెండొవది|debhai rendu|debhai rondu|debhai rendova|debhai rendo|debhai rendovadi,72,unit +౭౩,డెబ్బై మూడు|డెబ్బై మూడోవ|డెబ్బై మూడోవ|డెబ్బై మూడొవది|డెబ్బై మూడో|debhai mudu|debhai muudu|debhai mudova|debhai muudova|debhai mudovadi|debhai mudo|debhai muudo,73,unit +౭౪,డెబ్బై నాలుగు|డెబ్బై నాల్గు|డెబ్బై నాల్గొవ|డెబ్బై నాల్గొవది|డెబ్బై నాల్గో|డెబ్బై నాలగు|debhai nalugu|debhai naalugu|debhai nalgu|debhai nalgova|debhai nalgovadi|debhai 
nalgo|debhai nalagu|debhai nalagu|debhai naalagu,74,unit +౭౫,డెబ్బై ఐదు|డెబ్బై అయిదు|డెబ్బై ఐదొవ|డెబ్బై అయిదవది|డెబ్బై ఐదో|debhai idu|debhai ayidu|debhai iydu|debhai iydova|debhai ayidovadi|debhai ayido|debhai ido|debhai aidu,75,unit +౭౬,డెబ్బై ఆరు|డెబ్బై ఆఱు|డెబ్బై ఆరొవ|డెబ్బై ఆరొవది|డెబ్బై ఆరో|debhai aru|debhai aaru|debhai aarova|debhai aarovadi|debhai aro|debhai aaro,76,unit +౭౭,డెబ్బై ఏడు|డెబ్బై ఏడొవ|డెబ్బై ఏడొవది|డెబ్బై ఏడో|debhai aedu|debhai aedova|debhai aedovadi|debhai aedo|debhai ado|debhai adova|debhai adovadi,77,unit +౭౮,డెబ్బై ఎనిమిది|డెబ్బై ఎనిమిదొవ|డెబ్బై ఎనిమిదిది|డెబ్బై ఎనిమిదో|debhai enimidi|debhai enimdova|debhai enimididi|debhai enimido,78,unit +౭౯,డెబ్బై తొమ్మిది|డెబ్బై తొమ్మిదిది|డెబ్బై తొమ్మిదో|debhai tommidi|debhai thommidi|debhai thommididi|debhai thommido|debhai tomidi|debhai thomido,79,unit +౮౦,ఎనభై|ఎనభైయొవ|ఎనభైయొవది|ఎనభైయ్యొవది|ఎనభైయ్యో|enabhai|enabai|enabhaiyova|enabhaiyovadi|enabhaiyyo,80,unit +౮౧,ఎనభై ఒక్కటి|ఎనభై ఒకటి|ఎనభై మొదటిది|ఎనభై ఒకటవ|ఎనభై ఒకటో|enabhai okkati|enabhai okati|enabhai modatidi|enabhai okatova|enabhai okato|enabhai okkato|enabhai okkatova,81,unit +౮౨,ఎనభై రెండు|ఎనభై రొండు|ఎనభై రెండొవ|ఎనభై రెండో|ఎనభై రెండొవది|enabhai rendu|enabhai rondu|enabhai rendova|enabhai rendo|enabhai rendovadi,82,unit +౮౩,ఎనభై మూడు|ఎనభై మూడోవ|ఎనభై మూడోవ|ఎనభై మూడొవది|ఎనభై మూడో|enabhai mudu|enabhai muudu|enabhai mudova|enabhai muudova|enabhai mudovadi|enabhai mudo|enabhai muudo,83,unit +౮౪,ఎనభై నాలుగు|ఎనభై నాల్గు|ఎనభై నాల్గొవ|ఎనభై నాల్గొవది|ఎనభై నాల్గో|ఎనభై నాలగు|enabhai nalugu|enabhai naalugu|enabhai nalgu|enabhai nalgova|enabhai nalgovadi|enabhai nalgo|enabhai nalagu|enabhai nalagu|enabhai naalagu,84,unit +౮౫,ఎనభై ఐదు|ఎనభై అయిదు|ఎనభై ఐదొవ|ఎనభై అయిదవది|ఎనభై ఐదో|enabhai idu|enabhai ayidu|enabhai iydu|enabhai iydova|enabhai ayidovadi|enabhai ayido|enabhai ido|enabhai aidu,85,unit +౮౬,ఎనభై ఆరు|ఎనభై ఆఱు|ఎనభై ఆరొవ|ఎనభై ఆరొవది|ఎనభై ఆరో|enabhai aru|enabhai aaru|enabhai aarova|enabhai aarovadi|enabhai aro|enabhai aaro,86,unit +౮౭,ఎనభై ఏడు|ఎనభై ఏడొవ|ఎనభై 
ఏడొవది|ఎనభై ఏడో|enabhai aedu|enabhai aedova|enabhai aedovadi|enabhai aedo|enabhai ado|enabhai adova|enabhai adovadi,87,unit +౮౮,ఎనభై ఎనిమిది|ఎనభై ఎనిమిదొవ|ఎనభై ఎనిమిదిది|ఎనభై ఎనిమిదో|enabhai enimidi|enabhai enimdova|enabhai enimididi|enabhai enimido,88,unit +౮౯,ఎనభై తొమ్మిది|ఎనభై తొమ్మిదిది|ఎనభై తొమ్మిదో|enabhai tommidi|enabhai thommidi|enabhai thommididi|enabhai thommido|enabhai tomidi|enabhai thomido,89,unit +౯౦,తొంభై|తొంభైయొవ|తొంభైయొవది|తొంభైయ్యొవది|తొంభైయ్యో|thombhai|tombai|thombai|thombhaiyyova|thombaiyyovadi|thombaiyyo,90,unit +౯౧,తొంభై ఒక్కటి|తొంభై ఒకటి|తొంభై మొదటిది|తొంభై ఒకటవ|తొంభై ఒకటో|thombhai okkati|thombhai okati|thombhai modatidi|thombhai okatova|thombhai okato|thombhai okkato|thombhai okkatova,91,unit +౯౨,తొంభై రెండు|తొంభై రొండు|తొంభై రెండొవ|తొంభై రెండో|తొంభై రెండొవది|thombhai rendu|thombhai rondu|thombhai rendova|thombhai rendo|thombhai rendovadi,92,unit +౯౩,తొంభై మూడు|తొంభై మూడోవ|తొంభై మూడోవ|తొంభై మూడొవది|తొంభై మూడో|thombhai mudu|thombhai muudu|thombhai mudova|thombhai muudova|thombhai mudovadi|thombhai mudo|thombhai muudo,93,unit +౯౪,తొంభై నాలుగు|తొంభై నాల్గు|తొంభై నాల్గొవ|తొంభై నాల్గొవది|తొంభై నాల్గో|తొంభై నాలగు|thombhai nalugu|thombhai naalugu|thombhai nalgu|thombhai nalgova|thombhai nalgovadi|thombhai nalgo|thombhai nalagu|thombhai nalagu|thombhai naalagu,94,unit +౯౫,తొంభై ఐదు|తొంభై అయిదు|తొంభై ఐదొవ|తొంభై అయిదవది|తొంభై ఐదో|thombhai idu|thombhai ayidu|thombhai iydu|thombhai iydova|thombhai ayidovadi|thombhai ayido|thombhai ido|thombhai aidu,95,unit +౯౬,తొంభై ఆరు|తొంభై ఆఱు|తొంభై ఆరొవ|తొంభై ఆరొవది|తొంభై ఆరో|thombhai aru|thombhai aaru|thombhai aarova|thombhai aarovadi|thombhai aro|thombhai aaro,96,unit +౯౭,తొంభై ఏడు|తొంభై ఏడొవ|తొంభై ఏడొవది|తొంభై ఏడో|thombhai aedu|thombhai aedova|thombhai aedovadi|thombhai aedo|thombhai ado|thombhai adova|thombhai adovadi,97,unit +౯౮,తొంభై ఎనిమిది|తొంభై ఎనిమిదొవ|తొంభై ఎనిమిదిది|తొంభై ఎనిమిదో|thombhai enimidi|thombhai enimdova|thombhai enimididi|thombhai enimido,98,unit +౯౯,తొంభై తొమ్మిది|తొంభై తొమ్మిదిది|తొంభై 
తొమ్మిదో|thombhai tommidi|thombhai thommidi|thombhai thommididi|enabhai thommido|enabhai tomidi|enabhai thomido,99,unit +౧౦౦,వంద|నూరు|నూట|క్వింటాల్|క్వింటా|కింటా|వందల|vanda|nuru|nooru|nuta|noota|quintal|quinta|kinta|quintaa|vandala,100,scale +౧౦౦౦,వెయ్యి|టన్|టన్ను|వేయి|వేలు|వేల|వెయ్యిల|వెయ్య|వేయ్యిలా|veyyi|ton|tonne|tonnu|veelu|veela|veyla|vela|veyyila|veyya|veyyila,1000,scale +౧౦౦౦౦౦,లక్ష|లక్షలు|లక్షల|laksha|lacha|lakshala|lakshalu,100000,scale +౧౦౦౦౦౦౦౦,కోటి|కోట్లు|కోట్ల|కరోడ్|koti|kotlu|kotla|korode|corode|carode,10000000,scale \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv index d6fe599ea..924299a1a 100644 --- a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv +++ b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv @@ -1,5 +1,6 @@ range_variants,position,range_type -above | abv | abov | more than | mor than | more den | mor den | greater than | greater ,-1,min +above | abv | abov | more than | mor than | more den | mor den | greater than | greater | over,-1,min +onwards | and above | or above | or more | or great | or abov | or abv,1,min max | upto | up to | around | below | less than | less | less den,-1,max -max,1,max -To | - ,0,min_max +max ,1,max +To | - ,0,min_max \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py index 9c73c1015..c65b8f309 100644 --- a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py +++ b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py @@ -23,7 +23,8 @@ def __init__(self, entity_name, language, unit_type=None): self._detect_min_num_range_with_prefix_variants, self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, - 
self._detect_max_num_range_with_suffix_variants + self._detect_max_num_range_with_suffix_variants, + self._detect_absolute_number ] def _custom_num_range_between_num_and_num(self, number_range_list=None, original_list=None): diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 42f158571..92ee515cf 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -4,11 +4,17 @@ import pandas as pd import collections import os -import re - import ner_v2.detectors.numeral.constant as numeral_constant from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string from ner_v2.detectors.numeral.number.number_detection import NumberDetector +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + + import re + _re_flags = re.UNICODE NumberRangeVariant = collections.namedtuple('NumberRangeVariant', ['position', 'range_type']) ValueTextPair = collections.namedtuple('ValueTextPair', ['entity_value', 'original_text']) @@ -36,7 +42,7 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self.tag = '__' + entity_name + '__' self.range_variants_map = {} self.unit_type = unit_type - + self.language = language self.min_range_prefix_variants = None self.min_range_suffix_variants = None self.max_range_prefix_variants = None @@ -44,7 +50,8 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self.min_max_range_variants = None self.number_detected_map = {} - self.number_detector = NumberDetector(entity_name=entity_name, language=language) + self.number_detector = NumberDetector(entity_name=entity_name, language=language, unit_type=unit_type, + detect_without_unit=True) self.number_detector.set_min_max_digits(1, 100) # Method to initialise regex params @@ -55,7 +62,8 @@ 
def __init__(self, entity_name, language, data_directory_path, unit_type=None): self._detect_min_num_range_with_prefix_variants, self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, - self._detect_max_num_range_with_suffix_variants + self._detect_max_num_range_with_suffix_variants, + self._detect_absolute_number ] def _init_regex_for_range(self, data_directory_path): @@ -133,7 +141,7 @@ def _get_number_tag_dict(self): Examples: >>> text = 'I want 12 dozen banana' >>> self._get_number_tag_dict() - {'__number_1': ({'value': 12, 'unit': None}, '12')} + {'__dnumber_1': ({'value': 12, 'unit': None}, '12')} """ detected_number_dict = {} entity_value_list, original_text_list = self.number_detector.detect_entity(self.processed_text) @@ -181,6 +189,25 @@ def detect_number_range(self, text): self._update_tagged_text(original_list) return number_list, original_list + def _detect_absolute_number(self, number_list, original_list): + number_list = number_list or [] + original_list = original_list or [] + abs_number_pattern = re.compile(ur'({number}\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), + re.UNICODE) + abs_number_matches = abs_number_pattern.findall(self.processed_text) + for match in abs_number_matches: + entity_unit = self.number_detected_map[match].entity_value[ + numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] + if (self.unit_type and entity_unit) or not self.unit_type: + number_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, + numeral_constant.NUMBER_RANGE_MIN_VALUE: None, + numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, + numeral_constant.NUMBER_RANGE_ABS_VALUE: self. + number_detected_map[match]. 
+ entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE]}) + original_list.append(self.number_detected_map[match].original_text) + return number_list, original_list + def _get_number_range(self, min_part_match, max_part_match, full_match): """ Update number_range_list and original_list by finding entity value of number tag and original text from @@ -210,12 +237,19 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): if max_part_match and max_part_match in self.number_detected_map: entity_dict = self.number_detected_map[max_part_match].entity_value entity_value_max = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE] - entity_unit = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] + if not entity_unit: + entity_unit = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] if self.unit_type and ( entity_unit is None or self.number_detector.get_unit_type(entity_unit) != self.unit_type): return number_range, original_text + if min_part_match and max_part_match: + if float(entity_value_min) > float(entity_value_max): + temp = entity_value_max + entity_value_max = entity_value_min + entity_value_min = temp + original_text = self._get_original_text_from_tagged_text(full_match) if (entity_value_min or entity_value_max) and original_text: self.processed_text = self.processed_text.replace(full_match.strip(), '', 1) @@ -223,6 +257,7 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): number_range = { numeral_constant.NUMBER_RANGE_MIN_VALUE: entity_value_min, numeral_constant.NUMBER_RANGE_MAX_VALUE: entity_value_max, + numeral_constant.NUMBER_RANGE_ABS_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit } return number_range, original_text @@ -378,7 +413,6 @@ def _detect_min_max_num_range(self, number_range_list=None, original_list=None): def _update_tagged_text(self, original_number_list): """ Replaces detected date with tag generated from entity_name used to 
initialize the object with - A final string with all dates replaced will be stored in object's tagged_text attribute A string with all dates removed will be stored in object's processed_text attribute @@ -387,7 +421,8 @@ def _update_tagged_text(self, original_number_list): created from entity_name """ for detected_text in original_number_list: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) + _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) + self.tagged_text = _pattern.sub(self.tag, self.tagged_text) class NumberRangeDetector(BaseNumberRangeDetector): diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 4787fe8a5..d248aa419 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -11,8 +11,12 @@ def get_number_from_number_word(text, number_word_dict): detected_number_list (list): list of numeric value detected from text detected_original_text_list (list): list of original text for numeric value detected Examples: - [In] >> number_word_dict = {'one': (1, 1), 'two': (1, 2), 'three': (1, 3), 'thousand': (1000, 0), - 'four': (1, 4), 'hundred': (100, 0) + [In] >> number_word_dict = {'one': NumberVariant(scale=1, increment=1), + 'two': NumberVariant(scale=1, increment=2), + 'three': NumberVariant(scale=1, increment=3), + 'thousand': NumberVariant(scale=1000, increment=0), + 'four': NumberVariant(scale=1, increment=4), + 'hundred': NumberVariant(scale=100, increment=0) } [In] >> _get_number_from_numerals('one thousand two', number_word_dict) [Out] >> (['1002'], ['one thousand two']) @@ -24,6 +28,9 @@ def get_number_from_number_word(text, number_word_dict): detected_number_list = [] detected_original_text_list = [] + # exclude single char scales word from word number map dict + number_word_dict = {word: number_map for word, number_map in number_word_dict.items() + if (len(word) > 1 and number_map.increment == 0) or number_map.scale == 1} text = 
text.strip() if not text: return detected_number_list, detected_original_text_list @@ -73,7 +80,7 @@ def get_number_from_number_word(text, number_word_dict): result_text, current_text = '', '' # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) - current = 1 if (scale > 0 and current == 0 and increment == 0) else current + current = 1 if (scale > 1 and current == 0 and increment == 0) else current current = current * scale + increment current_text += part if scale > 1: diff --git a/ner_v2/detectors/pattern/phone_number/README.md b/ner_v2/detectors/pattern/phone_number/README.md index 0e57c4858..d0e1c0f88 100644 --- a/ner_v2/detectors/pattern/phone_number/README.md +++ b/ner_v2/detectors/pattern/phone_number/README.md @@ -1,6 +1,6 @@ ## Phone Number Detector -The Phone Number Detector has the capability to detect phone numbers from within the given text. The detector has the ability to handle multilanguage text. Additionally, this detector is scaled to handle domestic as well as international phone numbers +The Phone Number Detector has the capability to detect phone numbers from within the given text. The detector has the ability to handle multi language text. 
Additionally, this detector is scaled to handle domestic as well as international phone numbers We are currently providing phone number detection support in 6 languages, which are @@ -8,7 +8,7 @@ The Phone Number Detector has the capability to detect phone numbers from within - Hindi - Marathi - Gujarati -- Telgu +- Telugu - Tamil ### Usage @@ -17,54 +17,58 @@ The Phone Number Detector has the capability to detect phone numbers from within ```python >> from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector - >> detector = PhoneDetector(language='en', entity_name='phone_number') # here language will be ISO 639-1 code + >> detector = PhoneDetector(language='en', entity_name='phone_number', locale='en-IN') + # here language will be ISO 639-1 code and locale can be of the form 'language[-_]country_code' >> detector.detect_entity(text=u'send a message on 91 9820334455') - >> (['919820334455'], [u'91 9820334455']) + >> ([{'country_calling_code': '91', 'phone_number': '9820334455'}],['91 9820334455']) ``` - **Curl Command** ```bash # For a sample query with following parameters - # message="Call 022 26129857 and send 100 rs to +919820334416 and 1(408) 234-619" + # message="Call 022 2612985 and send 100 rs to +919820334416 and 1(408) 234-6192" # entity_name='phone_number' # structured_value=None # fallback_value=None # bot_message=None # source_language='en' + # locale='en-us' $ URL='localhost' $ PORT=8081 - $ curl -i 'http://'$URL':'$PORT'/v2/phone_number?message=Call%20022%2026129857%20and%20send%20100%20rs%20to%20+919820334416%20and%201%28408%29%20234-619&entity_name=phone_number&fallback_value=&bot_message=&structured_value=&source_language=en' - + $ curl -i 'http://'$URL':'$PORT'/v2/phone_number?entity_name=phone_number&message=Call%20022%202612985%20and%20send%20100%20rs%20to%20%2B919820334416%20and%201(408)%20234-6192&source_language=en&locale=en-us&structured_value=&fallback_value=&bot_message=' -H 'cache-control: no-cache' -H 
'postman-token: dad3f116-37f2-2627-b8c6-f89f00f19924' # Curl output $ { - "data": [ - { - "detection": "message", - "original_text": "022 26129857", - "entity_value": { - "value": "02226129857" - }, - "language": "en" - }, - { - "detection": "message", - "original_text": "919820334416", - "entity_value": { - "value": "919820334416" - }, - "language": "en" - }, - { - "detection": "message", - "original_text": "1(408) 234-619", - "entity_value": { - "value": "1408234619" - }, - "language": "en" - } - ] - } + "data": [ + { + "detection": "message", + "original_text": "022 2612985", + "entity_value": { + "phone_number": "222612985", + "country_calling_code": "1" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "+919820334416", + "entity_value": { + "phone_number": "9820334416", + "country_calling_code": "91" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "1(408) 234-6192", + "entity_value": { + "phone_number": "4082346192", + "country_calling_code": "1" + }, + "language": "en" + } + ] +} ``` \ No newline at end of file diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 30ecea4d7..aa50f4079 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -3,6 +3,7 @@ from ner_v2.detectors.numeral.number.number_detection import NumberDetector from language_utilities.constant import ENGLISH_LANG import re +import phonenumbers class PhoneDetector(BaseDetector): @@ -12,28 +13,26 @@ class PhoneDetector(BaseDetector): Attributes: text(str): string provided to extract phone numbers detection - tagged_text (str): string in which the detected phone numbers are replaced by ____ - processed_text (str): string in which the detected phone numbers are removed phone (list): list of detected entity values original_phone_text (list): list to store 
substrings of the text detected as phone numbers - tag (str): entity_name prepended and appended with '__' """ - def __init__(self, entity_name, language=ENGLISH_LANG): + + def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): """ Args: entity_name (str): A string by which the detected numbers would be replaced with on calling detect_entity() language (str, optional): language code of number text, defaults to 'en' + locale(str, optional): locale of the country from which you are dialing. Ex: 'en-IN' """ self._supported_languages = NumberDetector.get_supported_languages() - super(PhoneDetector, self).__init__(language) + super(PhoneDetector, self).__init__(language, locale) self.language = language - self.entity_name = entity_name + self.locale = locale or 'en-IN' self.text = '' - self.tagged_text = '' - self.processed_text = '' - self.phone = [] - self.original_phone_text = [] + self.phone, self.original_phone_text = [], [] + self.country_code = self.get_country_code_from_locale() + self.entity_name = entity_name self.tag = '__' + self.entity_name + '__' @property @@ -45,6 +44,17 @@ def supported_languages(self): """ return self._supported_languages + def get_country_code_from_locale(self): + """ + This method sets self.country_code from given locale + """ + regex_pattern = re.compile('[-_](.*$)', re.U) + match = regex_pattern.findall(self.locale) + if match: + return match[0].upper() + else: + return 'IN' + def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -54,161 +64,75 @@ def detect_entity(self, text, **kwargs): Returns: - self.phone (list): list consisting the detected phone numbers + self.phone (list): list consisting the detected phone numbers and their country calling codes self.original_phone_text (list): list containing their corresponding substrings in the original message. 
Examples: - text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' - - p = PhoneDetector(entity_name='phone_number', language='en') + text = 'call +1 (408) 912-6172' + p = PhoneDetector(entity_name='phone_number', language='en', locale='en-US') p.detect_entity(text=text) - (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) + ([{'country_calling_code':'1', value':'4089126172'} ], + [u'+1 (408) 912-6172']) text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' - p = PhoneDetector(entity_name='phone_number', language='hi') + p = PhoneDetector(entity_name='phone_number', language='hi', locale='en-IN') p.detect_entity(text=text) - (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) + ([{'country_calling_code':'91', value':'9819983132'} + ,{ 'country_calling_code':'91', value:'9820334416'} ], + [u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ - - self.text = text - self.processed_text = self.text - self.tagged_text = self.text - - phone_number_original_list = self.get_number_regex() - - original_phone_texts = [p[0].strip() for p in phone_number_original_list] - original_phone_text = self.check_length(original_phone_texts=original_phone_texts) - clean_phone_list = [self.clean_phone_number(p) for p in original_phone_text] - phone = [self.get_number(phone) for phone in clean_phone_list] - + self.text = " " + text.lower().strip() + " " self.phone, self.original_phone_text = [], [] - - for phone_number, original_phone_number in zip(phone, original_phone_text): - if len(phone_number) >= 10: - self.phone.append(phone_number) - self.original_phone_text.append(original_phone_number) - self.get_tagged_text() - + for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): + if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): + self.phone.append(self.check_for_country_code(str(match.number.national_number))) + 
self.original_phone_text.append(self.text[match.start:match.end]) + else: + # This means our detector has detected some other country code. + self.phone.append({"country_calling_code": str(match.number.country_code), + "value": str(match.number.national_number)}) + self.original_phone_text.append(self.text[match.start:match.end]) + self.phone, self.original_phone_text = self.check_for_alphas() return self.phone, self.original_phone_text - def get_digit_length(self, text): - return len(re.findall(pattern='\d', string=text, flags=re.U)) - - def check_length(self, original_phone_texts): + def check_for_alphas(self): """ - This method is used to handle the corner case where consecutive numbers are present with - space within them. - Args: - original_phone_texts (list): list of text substrings detected by the regex - - Returns: - phone_number_list (list): list of phone numbers splitting based on length - - Examples: - original_phone_texts = ['9820334415 91 9920441388', '9820551388982347'] - check_length(original_phone_texts=original_phone_texts) - >> ['9820334415', '91 9920441388'] + checks if any leading or trailing alphabets in the detected phone numbers and removes those numbers """ - phone_number_list_1, phone_number_list2 = [], [] - - for original_phone_text in original_phone_texts: - - if self.get_digit_length(text=original_phone_text) > 13: - phone_parts = original_phone_text.split() - visited = [0 for i in range(len(phone_parts))] - - for i in range(len(phone_parts)): - temp = '' - appended_parts = [] - - for j in range(i, len(phone_parts)): - if visited[j] == 0: - temp = temp + ' ' + phone_parts[j] - appended_parts.append(j) - - if 13 >= self.get_digit_length(text=temp) > 7: - phone_number_list_1.append(temp.strip()) - for m in appended_parts: - visited[m] = 1 - break - else: - phone_number_list2.append(original_phone_text) - phone_number_list_1.extend(phone_number_list2) - return phone_number_list_1 - - def get_number(self, phone): + validated_phone = [] + 
validated_original_text = [] + for phone, original in zip(self.phone, self.original_phone_text): + if re.search(r'\W' + re.escape(original) + r'\W', self.text, re.UNICODE): + validated_phone.append(phone) + validated_original_text.append(original) + return validated_phone, validated_original_text + + def check_for_country_code(self, phone_num): """ - This method is used to convert phone numbers in language scripts other than English - to the English - Args: - phone (str): The string phone number which is detected and cleaned - - Returns: - phone (str): The string phone number converted to English script - + :param phone_num: the number which is to be checked for country code + :return: dict with country_code if it's in phone_num or phone_number with current country code Examples: - phone = u'९१९८१९९८३१३२' - get_number(phone=phone) - '919819983132' + phone_num = '919123456789' + countryCallingCode = 'IN' + {countryCallingCode:"91",value:"9123456789"} """ - phone_length = len(phone) - phone = str(int(phone)) - - if phone_length != len(phone): - phone = phone.zfill(phone_length) - - return phone - - def clean_phone_number(self, number): - """ - This method is used to clean the detected phone number. 
- Args: - number (str): The original substring which is detected and is required for cleaning - - Returns: - number (str): The number post cleaning - """ - # Remove (), -, whistespace, + - clean_regex = re.compile('([()\-\s\+]+)', re.U) - number = clean_regex.sub(string=number, repl='') - return number - - def get_number_regex(self): - - """ - This method is used to detect the phone number patterns from the provided text - Returns: - phone_number_list (list): list of patterns detected from the regex pattern - - (each pattern: (complete original text, area code, number)) - (we further utitlize only the complete original text) - Example: - p = PhoneDetector(entity_name='phone_number', language='hi') - text = u'Set a reminder on +1 (408) 912-6172' - p.text = text - p.get_number_regex() - - [(u'+1 (408) 912-6172', u'1', u'(408) 912-6172'), - (u'+91 9820334416', u'91', u'9820334416'), - (u'022 26129857', u'022', u'26129857')] - """ - phone_number_regex = re.compile( - r'((?:\(?\+(\d{1,2})\)?[\s\-\.]*)?((?=[\-\d()\s\.]{10,16}(?:[^\d]+|$))' - r'(?:[\d(]{1,20}(?:[\-)\s\.]*\d{1,20}){0,20}){1,20}))', re.U) - - phone_number_list = phone_number_regex.findall(self.text) - return phone_number_list - - def get_tagged_text(self): - """ - Replaces detected phone numbers with tag generated from entity_name used to initialize the object with - - A final string with all phone numbers replaced will be stored in object's tagged_text attribute - A string with all phone numbers removed will be stored in object's processed_text attribute + phone_dict = {} + + if len(phone_num) > 10: + check_country_regex = re.compile(r'^({country_code})\d{length}$'. 
+ format(country_code='911|1|011 91|91', length='{10}'), re.U) + p = check_country_regex.findall(phone_num) + if len(p) == 1: + phone_dict['country_calling_code'] = p[0] + country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) + phone_dict['value'] = country_code_sub_regex.sub(string=phone_num, repl='') + else: + phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) + phone_dict['value'] = phone_num + else: + phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) + phone_dict['value'] = phone_num - """ - for detected_text in self.original_phone_text: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - self.processed_text = self.processed_text.replace(detected_text, '') + return phone_dict diff --git a/ner_v2/detectors/temporal/constant.py b/ner_v2/detectors/temporal/constant.py index eefa012cc..2a78af95b 100644 --- a/ner_v2/detectors/temporal/constant.py +++ b/ner_v2/detectors/temporal/constant.py @@ -3,6 +3,18 @@ DATETIME_CONSTANT_FILE = 'datetime_diff_constant.csv' NUMERALS_CONSTANT_FILE = 'numbers_constant.csv' +# timezones data file and its columns +# name of the data file +TIMEZONES_CONSTANT_FILE = 'timezones.csv' +# index of the csv file(try using the common standard) +TIMEZONES_CODE_COLUMN_NAME = 'code' +# all regions in Olson format pytz +TIMEZONES_ALL_REGIONS_COLUMN_NAME = 'all_regions' +# preferred region in the above all_regions (Olson format pytz) +TIMEZONES_PREFERRED_REGION_COLUMN_NAME = 'preferred' +# Formal usage variants of the index +TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME = 'timezone_variants' + CONSTANT_FILE_KEY = 'key' # date type referring to date in month like "2 tarikh" (reference: hindi) diff --git a/ner_v2/detectors/temporal/date/README.md b/ner_v2/detectors/temporal/date/README.md index 6de390fef..f8da61fc5 100644 --- a/ner_v2/detectors/temporal/date/README.md +++ b/ner_v2/detectors/temporal/date/README.md 
@@ -6,7 +6,7 @@ This is the V2 version of date detector module that will detect date in multiple - Hindi - Marathi - Gujarati -- Telgu +- Telugu - Tamil ### Usage @@ -14,7 +14,7 @@ This is the V2 version of date detector module that will detect date in multiple - **Python Shell** ```python - >> from ner_v2.detector.temporal.date.date_detection import DateDetector + >> from ner_v2.detectors.temporal.date.date_detection import DateDetector >> detector = DateDetector(entity_name='date', language='hi') # here language will be ISO 639-1 code >> detector.detect_entity(text= 'agla mangalvar') >> {'entity_value': [{'dd':12 ,'mm': 10, 'yy': 2018}], 'original_text':['agla mangalvar']} diff --git a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv index 6e6447c59..912201be5 100644 --- a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type শুক্রবার|Shukrobar,4,weekday শনিবার|Shonibar,5,weekday রবিবার|Rabibar|Robibar,6,weekday -Januyari|জানুয়ারী,1,month -Phebruyari|ফেব্রুয়ারি,2,month -March|মার্চ,3,month -Epril|এপ্রিল,4,month -Me|মে,5,month -Jun|জুন,6,month -Juloi|জুলাই,7,month -Agast|অগাস্ট|আগস্ট,8,month -Septembar|সেপ্টেম্বর,9,month -Aktobar|অক্টোবর,10,month -Nabhembar|নভেম্বর,11,month -Disembar|ডিসেম্বর,12,month +জানু.|জানু|Januyari|জানুয়ারী,1,month +ফেব.|ফেব|Phebruyari|ফেব্রুয়ারি,2,month +মার্চ.|March|মার্চ,3,month +এপ্র.|এপ্র|Epril|এপ্রিল,4,month +মে.|Me|মে,5,month +জুন.|Jun|জুন,6,month +জুল.|জুল|Juloi|জুলাই,7,month +আগ.|আগ|Agast|অগাস্ট|আগস্ট,8,month +সেপ্ট.|সেপ্ট|Septembar|সেপ্টেম্বর,9,month +অক্টো.|অক্টো|Aktobar|অক্টোবর,10,month +নভে.|নভে|Nabhembar|নভেম্বর,11,month +ডিসে.|ডিসে|Disembar|ডিসেম্বর,12,month diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 20214e041..d8c92f105 100644 --- 
a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -58,7 +58,8 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timezone='UTC', + past_date_referenced=False): """ Initializes the DateDetector object with given entity_name and pytz timezone object @@ -70,6 +71,7 @@ def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', pa default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' """ + self.locale = locale self._supported_languages = self.get_supported_languages() super(DateAdvancedDetector, self).__init__(language=language) self.text = '' @@ -82,7 +84,8 @@ def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', pa self.date_detector_object = DateDetector(entity_name=entity_name, language=language, timezone=timezone, - past_date_referenced=past_date_referenced) + past_date_referenced=past_date_referenced, + locale=locale) self.bot_message = None @property @@ -756,7 +759,7 @@ class DateDetector(object): language: source language of text """ - def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: @@ -765,6 +768,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date timezone (Optional, str): timezone identifier string that is used to create a pytz timezone object default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' + 
locale(Optional, str): user locale default is None """ self.text = '' self.tagged_text = '' @@ -777,13 +781,15 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None self.language = language + self.locale = locale try: date_detector_module = importlib.import_module( 'ner_v2.detectors.temporal.date.{0}.date_detection'.format(self.language)) self.language_date_detector = date_detector_module.DateDetector(entity_name=self.entity_name, past_date_referenced=past_date_referenced, - timezone=self.timezone) + timezone=self.timezone, + locale=self.locale) except ImportError: standard_date_regex = importlib.import_module( 'ner_v2.detectors.temporal.date.standard_date_regex' @@ -793,7 +799,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date data_directory_path=get_lang_data_path(detector_path=os.path.abspath(__file__), lang_code=self.language), timezone=self.timezone, - past_date_referenced=past_date_referenced + past_date_referenced=past_date_referenced, + locale=self.locale ) def detect_entity(self, text, **kwargs): diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 3ce428f67..5c1423fac 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -28,6 +28,7 @@ class DateDetector(object): original_date_text: list to store substrings of the text detected as date entities tag: entity_name prepended and appended with '__' timezone: Optional, pytz.timezone object used for getting current time, default is pytz.timezone('UTC') + locale: Optional, locale of the user for getting country code now_date: datetime object holding timestamp while DateDetector instantiation month_dictionary: dictonary mapping month indexes to month spellings and fuzzy variants(spell errors, abbreviations) @@ -70,7 +71,7 @@ class 
DateDetector(object): text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: @@ -79,6 +80,7 @@ def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): timezone (Optional, str): timezone identifier string that is used to create a pytz timezone object default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' + locale (Optional, str): user locale for getting the country code. """ self.text = '' @@ -95,6 +97,57 @@ def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): self.month_dictionary = MONTH_DICT self.day_dictionary = DAY_DICT self.bot_message = None + self.locale = locale + self.country_code = None + self.default_detector_preferences = [self._gregorian_day_month_year_format, + self._gregorian_month_day_year_format, + self._gregorian_year_month_day_format, + self._gregorian_advanced_day_month_year_format, + self._day_month_format_for_arrival_departure, + self._date_range_ddth_of_mmm_to_ddth, + self._date_range_ddth_to_ddth_of_next_month, + self._gregorian_day_with_ordinals_month_year_format, + self._gregorian_advanced_year_month_day_format, + self._gregorian_year_day_month_format, + self._gregorian_month_day_with_ordinals_year_format, + self._gregorian_day_month_format, + self._gregorian_month_day_format, + self._day_after_tomorrow, + self._date_days_after, + self._date_days_later, + self._day_before_yesterday, + self._todays_date, + self._tomorrows_date, + self._yesterdays_date, + self._day_in_next_week, + self._day_range_for_nth_week_month + ] + """ + Rules to add new country code preferences: + 1. Create a new key with country code. + 2. 
Add all the methods which should be given higher preference in a list with the + most preferred method first. + 3. Warning: Be careful about the order in which you set your preferences. + For EX: If you set `self._gregorian_day_month_format` at a higher preference than + `self._gregorian_advanced_day_month_year_format`, only `22nd MAR` will be detected in `22nd MAR 2034`. + """ + self.country_date_detector_preferences = { + 'US': [self._gregorian_month_day_year_format], + 'IN': [self._gregorian_day_month_year_format], + } + + def get_country_code_from_locale(self): + """ + Extracts country code from locale. + Ex: locale:'en_us' sets, + self.country_code = 'US' + """ + regex_pattern = re.compile('[-_](.*$)', re.U) + match = regex_pattern.findall(self.locale) + if match: + return match[0].upper() + else: + return None def detect_date(self, text): """ @@ -111,7 +164,8 @@ def detect_date(self, text): self.text = " " + text.strip().lower() + " " self.processed_text = self.text self.tagged_text = self.text - + if self.locale: + self.country_code = self.get_country_code_from_locale() date_list = [] original_list = [] date_list, original_list = self.get_exact_date(date_list, original_list) @@ -147,51 +201,18 @@ def get_exact_date(self, date_list, original_list): corresponding substrings in the given text. 
""" - date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_year_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_advanced_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_month_format_for_arrival_departure(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_range_ddth_of_mmm_to_ddth(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_range_ddth_to_ddth_of_next_month(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_day_with_ordinals_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_advanced_year_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_year_day_month_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_with_ordinals_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_day_month_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - - date_list, original_list = self._day_after_tomorrow(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list 
= self._date_days_after(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_days_later(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_before_yesterday(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._todays_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._tomorrows_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._yesterdays_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_in_next_week(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_range_for_nth_week_month(date_list, original_list) - self._update_processed_text(original_list) + if self.country_code and self.country_code in self.country_date_detector_preferences: + for preferred_detector in self.country_date_detector_preferences[self.country_code]: + date_list, original_list = preferred_detector(date_list, original_list) + self._update_processed_text(original_list) + for detector in self.default_detector_preferences: + if detector not in self.country_date_detector_preferences[self.country_code]: + date_list, original_list = detector(date_list, original_list) + self._update_processed_text(original_list) + else: + for detector in self.default_detector_preferences: + date_list, original_list = detector(date_list, original_list) + self._update_processed_text(original_list) return date_list, original_list @@ -224,16 +245,14 @@ def get_possible_date(self, date_list=None, original_list=None): self._update_processed_text(original_list) date_list, original_list = self._date_identification_given_day(date_list, original_list) self._update_processed_text(original_list) - # FIXME: This call order causes 
everyday to be taken away from "everyday except <>" which means - # FIXME: successive calls for everyday_except_weekends and everyday_except_weekdays return wrong results - date_list, original_list = self._date_identification_everyday(date_list, original_list, n_days=15) - self._update_processed_text(original_list) date_list, original_list = self._date_identification_everyday_except_weekends(date_list, original_list, n_days=15) self._update_processed_text(original_list) date_list, original_list = self._date_identification_everyday_except_weekdays(date_list, original_list, n_days=50) self._update_processed_text(original_list) + date_list, original_list = self._date_identification_everyday(date_list, original_list, n_days=15) + self._update_processed_text(original_list) date_list, original_list = self._day_within_one_week(date_list, original_list) self._update_processed_text(original_list) date_list, original_list = self._weeks_identification(date_list, original_list) @@ -285,16 +304,21 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' - r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)(?:\s|$)') + regex_pattern = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' + r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] dd = int(pattern[1]) mm = int(pattern[2]) yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) < self.now_date: - yy += 1 + try: + # to catch dates which are not possible like "31/11" (November 31st) + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + < self.now_date: + yy += 1 + except: + return 
date_list, original_list date = { 'dd': int(dd), @@ -337,14 +361,21 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None): original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b((1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]' - r'\s?((?:20|19)?[0-9]{2}))(\s|$)') + regex_pattern = re.compile(r'[^/\-\.\w]((1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9])' + r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] dd = pattern[2] mm = pattern[1] - yy = self.normalize_year(pattern[3]) + yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year + try: + # to catch dates which are not possible like "11/31" (November 31st) + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + < self.now_date: + yy += 1 + except: + return date_list, original_list date = { 'dd': int(dd), @@ -388,7 +419,7 @@ def _gregorian_year_month_day_format(self, date_list=None, original_list=None): if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(((?:20|19)[0-9]{2})\s?[/\-\.]\s?' - r'(1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9]))(\s|$)') + r'(1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9]))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -438,7 +469,7 @@ def _gregorian_advanced_day_month_year_format(self, date_list=None, original_lis if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?[\/\ \-\.\,]\s?([A-Za-z]+)\s?[\/\ \-\.\,]\s?' 
- r'((?:20|19)?[0-9]{2}))(\s|$)') + r'((?:20|19)?[0-9]{2}))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -492,7 +523,7 @@ def _gregorian_day_with_ordinals_month_year_format(self, date_list=None, origina if original_list is None: original_list = [] regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?\s?(?:of)?[\s\,\-]\s?' - r'([A-Za-z]+)[\s\,\-]\s?((?:20|19)?[0-9]{2}))(\s|$)') + r'([A-Za-z]+)[\s\,\-]\s?((?:20|19)?[0-9]{2}))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -543,7 +574,7 @@ def _gregorian_advanced_year_month_day_format(self, date_list=None, original_lis if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(((?:20|19)[0-9]{2})\s?[\/\ \,\-]\s?([A-Za-z]+)\s?' - r'[\/\ \,\-]\s?([12][0-9]|3[01]|0?[1-9]))(\s|$)') + r'[\/\ \,\-]\s?([12][0-9]|3[01]|0?[1-9]))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -645,17 +676,27 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(([A-Za-z]+)[\ \,\-]\s?([12][0-9]|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' - r'[\ \,\-]\s?((?:20|19)?[0-9]{2}))(\s|$)') + regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?\s?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' + r'|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' 
+ r'(?:[\ \,\-]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] - dd = pattern[2] - probable_mm = pattern[1] - yy = self.normalize_year(pattern[3]) + yy1 = pattern[1] + yy2 = pattern[4] + dd = pattern[3] + yy = int(self.normalize_year(yy1 or yy2 or str(self.now_date.year))) + probable_mm = pattern[2] mm = self.__get_month_index(probable_mm) if mm: + try: + # to catch dates which are not possible like "31/11" (November 31st) + if not yy1 and not yy2 and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \ + < self.now_date: + yy += 1 + except: + return date_list, original_list date = { 'dd': int(dd), 'mm': int(mm), @@ -1375,13 +1416,22 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original end = now + datetime.timedelta(days=n_days) regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekends?)\b') patterns = regex_pattern.findall(self.processed_text.lower()) - + is_everyday_result = [] if not patterns: weekday_regex_pattern = re.compile(r'\b((week\s?days?|all\sweekdays))\b') patterns = weekday_regex_pattern.findall(self.processed_text.lower()) + every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)\s+' + r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) + is_everyday_result = every_weekday_pattern.findall(self.text) constant_type = WEEKDAYS - if self._is_everyday_present(self.text): + if is_everyday_result: constant_type = REPEAT_WEEKDAYS + patterns = is_everyday_result + # checks if phrase of the form everyday except weekdays is present in the sentence. 
+ regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekdays?)\b') + check_patterns_for_except_weekdays = regex_pattern.findall(self.processed_text.lower()) + if check_patterns_for_except_weekdays: + patterns = [] today = now.weekday() count = 0 weekend = [] @@ -1453,13 +1503,18 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original end = now + datetime.timedelta(days=n_days) regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekdays?)\b') patterns = regex_pattern.findall(self.processed_text.lower()) + is_everyday_result = [] if not patterns: weekend_regex_pattern = re.compile(r'\b((week\s?ends?|all\sweekends))\b') patterns = weekend_regex_pattern.findall(self.processed_text.lower()) + every_weekend_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)' + r'\s+(week\s?ends?|all\sweekends))\b', re.IGNORECASE) + is_everyday_result = every_weekend_pattern.findall(self.text) + constant_type = WEEKENDS - if self._is_everyday_present(self.text): + if is_everyday_result: constant_type = REPEAT_WEEKENDS - + patterns = is_everyday_result today = now.weekday() count = 0 weekend = [] diff --git a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv index e3bd92ac9..cc808eacb 100644 --- a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv @@ -12,15 +12,15 @@ key,numeric_representation,date_type શુક્રવાર|શુક્રવારે|friday|shukravar|shukrawar|sukravar|sukrawar|shukravare|shukraware,4,weekday શનિવાર|શનિવારે|saturday|shanivar|shaniwar|saniwar|sanivar|shanivare|shaniware,5,weekday રવિવાર|રવિવારે|sunday|ravivar|raviwar|ravivare|raviware,6,weekday -જાન્યુઆરી|january|jan,1,month -ફેબ્રુઆરી|february|feb|febuary,2,month -માર્ચ|march|mar,3,month -એપ્રિલ|april|apr,4,month -મે|may,5,month -જૂન|જૂને|june|jun,6,month -જુલાઈ|જુલાઇ|july|jul,7,month 
-ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month -સપ્ટેમ્બર|september|sept|sep,9,month -ઓક્ટોબર|october|oct,10,month -નવેમ્બર|november|nov,11,month -ડિસેમ્બર|december|dec,12,month +જાન્યુ.|જાન્યુ|જાન્યુઆરી|january|jan,1,month +ફેબ્રુ.|ફેબ્રુ|ફેબ્રુઆરી|february|feb|febuary,2,month +એપ્રિ.|એપ્રિ|માર્ચ|march|mar,3,month +એપ્રિલ|એપ્રિલ.|april|apr,4,month +મે|મે.|may,5,month +જૂન|જૂન.|જૂને|june|jun,6,month +જુલા.|જુલા|જુલાઈ|જુલાઇ|july|jul,7,month +ઑગ.|ઑગ|ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month +સપ્ટે.|સપ્ટે|સપ્ટેમ્બર|september|sept|sep,9,month +ઑક્ટ્.|ઑક્ટ્|ઓક્ટોબર|october|oct,10,month +નવે.|નવે|નવેમ્બર|november|nov,11,month +ડિસે.|ડિસે|ડિસેમ્બર|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv index ec54eaf7c..8e30d3d98 100644 --- a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type शुक्रवार|friday|shukravar|shukrawar|sukravar|sukrawar,4,weekday शनिवार|saturday|shanivar|shaniwar|saniwar|sanivar,5,weekday रविवार|sunday|ravivar|raviwar,6,weekday -जनवरी|january|jan,1,month -फेब्रुअरी|फरवरी|february|feb|febuary,2,month -मार्च|march|mar,3,month -अप्रैल|april|apr,4,month -मई|may,5,month -जून|june|jun,6,month -जुलाई|july|jul,7,month -अगस्त|august|aug,8,month -सितम्बर|september|sept|sep,9,month -अक्टूबर|october|oct,10,month -नवंबर|november|nov,11,month -दिसंबर|december|dec,12,month +जन.|जन|जनवरी|january|jan,1,month +फेब्रुअरी|फरवरी|फ़र.|फ़र|फ़रवरी|february|feb|febuary,2,month +मार्च|मार्च.|march|mar,3,month +अप्रैल|अप्रै.|अप्रै|april|apr,4,month +मई|मई.|may,5,month +जून|जून.|june|jun,6,month +जुलाई|जुल.|जुल|july|jul,7,month +अगस्त|अग.|अग|अगस्त|august|aug,8,month +सितम्बर|सित.|सित|september|sept|sep,9,month +अक्टूबर|अक्टू.|अक्टू|october|oct,10,month +नवंबर|नव.|नव|नवम्बर|november|nov,11,month +दिसंबर|दिस.|दिस|दिसम्बर|december|dec,12,month diff --git 
a/ner_v2/detectors/temporal/date/hi/date_detection.py b/ner_v2/detectors/temporal/date/hi/date_detection.py index f4e11d00a..289259a74 100644 --- a/ner_v2/detectors/temporal/date/hi/date_detection.py +++ b/ner_v2/detectors/temporal/date/hi/date_detection.py @@ -10,7 +10,7 @@ class DateDetector(BaseRegexDate): data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), LANGUAGE_DATA_DIRECTORY) - def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, timezone='UTC', past_date_referenced=False): super(DateDetector, self).__init__(entity_name, data_directory_path=DateDetector.data_directory_path, timezone=timezone, diff --git a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv index 776332d0f..7468eb0c8 100644 --- a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv @@ -1,27 +1,27 @@ key,key,date_type आज|Aaj,0,relative_date -उद्या|काल|कल|Udya|udhya|kaal,1,relative_date +उद्या|काल|कल|Udya|udhya|kaal|kal,1,relative_date परवा|पर्वा|Parava|parva,2,relative_date तिसऱ्या देवीशि|Tisrya Div shi,3,relative_date तारिक|तार्कीला|Tarik|Taarik|tarkila,NA,month_date_ref दिवस|दिवसं|Divas|divsan ,NA,date_literal -महिना|महीने|महिन्याचा|months|month|mahina|mahine|mahinyacha|mahinyachi|mahinyacha,NA,month_literal -सोमवार|monday|somvar|somwar,0,weekday -मंगळवार|मंगळवारी|Mangalvari|tuesday|mangalvar|mangalwar,1,weekday -बुधवार|बुधवारी|wednesday|budhvar|budhwar|budhvari|budhwari,2,weekday -गुरुवार|गुरुवारी|thursday|guruvar|guruwar|guroovaar|guroowar|guroovar|guruvari|guruwari|guroovaari|guroowari,3,weekday -शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravari|shukrawari,4,weekday -शनिवार|saturday|shanivar|shaniwar,5,weekday -रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari,6,weekday -जानेवारी|january|jan|Janevari,1,month 
-फेब्रुवारी|Phebruvari|february|feb,2,month -मार्च|march|mar,3,month -एप्रिल|april|apr|Epril,4,month -मे|may|Me,5,month -जून|june|jun,6,month -जुलै|july|jul|Julai,7,month -ऑगस्ट|august|aug|ogast,8,month -सप्टेंबर|september|sept|sep,9,month -ऑक्टोबर|october|oct|oktobar,10,month -नोव्हेंबर|november|nov|nowenber,11,month -डिसेंबर|december|dec,12,month +महिना|महीने|महिन्याचा|months|month|mahina|mahine|mahinyacha|mahinyachi|mahinyacha|mahinyancha,NA,month_literal +सोमवार|सोमवारी|monday|somvar|somwar|somvaar|somvaari|somwari,0,weekday +मंगळवार|मंगळवारी|Mangalvari|tuesday|mangalvar|mangalwar|mangalvaar,1,weekday +बुधवार|बुधवारी|wednesday|budhvar|budhwar|budhvaar|budhvari|budhwari,2,weekday +गुरुवार|गुरुवारी|thursday|guruvar|guruwar|guroovaar|guroowar|guroovar|gurvaar|guruvari|guruwari|guroovaari|guroowari,3,weekday +शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravaar|shukravari|shukrawari,4,weekday +शनिवार|शनिवारी|saturday|shanivar|shaniwar|shanivaar|shanivaari|shaniwari,5,weekday +रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari|ravivaari,6,weekday +जानेवारी|जाने.|जाने|january|jan|Janevari,1,month +फेब्रुवारी|फेब्रु.|फेब्रु|Phebruvari|february|feb,2,month +मार्च|मार्च.|मार्च|march|mar,3,month +एप्रिल|एप्रि.|एप्रि|april|apr|Epril,4,month +मे|मे.|may|Me,5,month +जून.|जून|june|jun,6,month +जुलै.|जुलै|july|jul|Julai,7,month +ऑग.|ऑग|ऑगस्ट|august|aug|ogast,8,month +सप्टें.|सप्टें|सप्टेंबर|september|sept|sep,9,month +ऑक्टो.|ऑक्टो|ऑक्टोबर|october|oct|oktobar,10,month +नोव्हें.|नोव्हें|नोव्हेंबर|november|nov|nowenber,11,month +डिसें.|डिसें|डिसेंबर|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index efcea1aa4..5fb5815eb 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -13,15 +13,16 @@ class BaseRegexDate(object): - def __init__(self, entity_name, data_directory_path, timezone='UTC', 
past_date_referenced=False): + def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC', past_date_referenced=False): """ Base Regex class which will be imported by language date class by giving their data folder path This will create standard regex and their parser to detect date for given language. Args: data_directory_path (str): path of data folder for given language - timezone (str): user timezone default UTC + timezone (Optional, str): user timezone default UTC past_date_referenced (boolean): if the date reference is in past, this is helpful for text like 'kal', 'parso' to know if the reference is past or future. + locale (Optional, str): user locale default None """ self.text = '' self.tagged_text = '' diff --git a/ner_v2/detectors/temporal/date/te/__init__.py b/ner_v2/detectors/temporal/date/te/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/temporal/date/te/data/date_constant.csv b/ner_v2/detectors/temporal/date/te/data/date_constant.csv new file mode 100644 index 000000000..2dd4c1b41 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/date_constant.csv @@ -0,0 +1,27 @@ +key,numeric_representation,date_type +ఇవాళ|నేడు|ఈరోజు|ఈ రోజు|ఈనాడు|ఈ నాడు|ఈదినం|ఈ దినం|ఈదినము|ఈవేళ|ivala|ivaala|nedu|needu|ee roju|e roju|eroju|eeroju|eenadu|enadu|enaadu|ee dinam,0,relative_date +రేపు|మరుసటిరోజు|మరునాడు|ఱేపు|repu|reepu|marusatiroju|marusatirooju|marunadu|marunaadu,1,relative_date +ఎల్లుండి|ellundi,2,relative_date +అవతలి ఎల్లుండి|అవతలేల్లుంది|అవతలిఎల్లుండి|avathali ellundi|avathalellundi,3,relative_date +తారీఖు|రోజు|తారీఖున|దినం|tareeku|tareekhu|tareekh|roju|rooju|tareekhuna|tareekuna|dinam,NA,month_date_ref +రోజు|రోజులు|roju|rojulu,NA,date_literal +నెల|నెలలు|మాసం|మాసాలు|nela|nalalu|masam|maasam|masalu|maasaalu|masaalu,NA,month_literal +సోమవారం|సోమారం|ఇందువాసరము|somavaram|somavaaram|induvasaramu|induvaasaramu|induvasaram|monday|soma|sooma,0,weekday 
+మంగళవారము|అంగారకవారమ|జయవారము|mangalavaram|mangalavaaram|tuesday|mangala,1,weekday +బుధవారము|సౌమ్యవాసరము|బుధవారం|budavaram|buda|budavaaram|sowmyavaram|wednesday,2,weekday +గురువారము|బృహస్పతి వారము|లక్ష్మివారము|గురు|గురువారం|guruvaram|guruvaaram|guru|bruhaspathi varam|thursday|guru varam|guru vaaram|lakshmi varam|lakshmivaram,3,weekday +శుక్రవారము|శుక్రవారం|శుక్ర|shukravaram|shukra varam|shukravaaram|shukra,4,weekday +శనివారము|స్థిరవారము|మందవారము|shanivaram|shanivaaram|mandavaram|mandavaaram,5,weekday +ఆదివారము|భానువారము|రవివారము|అధిత్యవారము|తొలివారము|aadivaram|adivaram|adi varam|adi vaaram|bhanuvaram|bhanu|bhanu vaaram|ravivaram|ravi vaaramu|ravi varam|adityavaram|adithya varam|tholivaram|tholi vaaram|sunday,6,weekday +జన.|జన|మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month +ఫిబ్ర.|ఫిబ్ర|ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month +ఏప్రి.|ఏప్రి|మార్చ్|మార్|march|mar,3,month +ఏప్రిల్|ఏప్రిల్.|april|apr,4,month +మే|మే.|may,5,month +జూన్|జూన్.|jun|june,6,month +జులై|జులై.|jul|july,7,month +ఆగ.|ఆగ|ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month +సెప్.|సెప్|సెప్టెంబర్|september|sep|sept,9,month +అక్టో.|అక్టో|అక్టోబర్|అక్టోబరు|oct|october,10,month +నవ.|నవ|నవంబర్|november|nov,11,month +డిసె.|డిసె|డిసెంబరు|డిసెంబర్|december|dec,12,month \ No newline at end of file diff --git a/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv b/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv new file mode 100644 index 000000000..2cff92322 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv @@ -0,0 +1,10 @@ +key,present_in_start,adding_magnitude,datetime_type +తర్వాత|పిమ్మట|అయినాక|ఐనాకా|అయ్యాక|తరువాత|తరవాత|tarvatha|tarvata|taruvatha|ainaka|ayinaka|ayyaka|pimmata,FALSE,1,add_diff_datetime +ఈ|ee|e,TRUE,0,add_diff_datetime +ముందు|పూర్వం|ముందఱ|మునుపు|పూర్వము|ముందర|mundu|purvam|poorvam|mundara|munupu|purvamu|mundara,FALSE,-1,add_diff_datetime 
+ముందు|పూర్వం|ముందఱ|మునుపు|పూర్వము|ముందర|mundu|purvam|poorvam|mundara|munupu|purvamu|mundara,TRUE,-1,add_diff_datetime +వచ్చే|రాబోయే|vache|raboye|rabooye,TRUE,1,add_diff_datetime +లో|lo,FALSE,1,add_diff_datetime +ఒకటింపావు|పావు|బావు|okatimpavu|pavu|paavu|bavu|baavu,TRUE,0.25,ref_datetime +పావు తక్కువ|pavu takkuva|paavu takkuva,TRUE,-0.25,ref_datetime +అర్థ|అర్ధ|artha|arda|adtha,TRUE,0.5,ref_datetime diff --git a/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv b/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv new file mode 100644 index 000000000..974848519 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv @@ -0,0 +1,34 @@ +key,numeric_representation +౧|ఒక్కటి|ఒకటి|మొదటిది|ఒకటవ|ఒకటో|okkati|okati|modatidi|okatova|okato,1 +౨|రెండు|రొండు|రెండొవ|రెండో|రెండొవది|rendu|rondu|rendova|rendo|rendovadi,2 +౩|మూడు|మూడోవ|మూడోవ|మూడొవది|మూడో|mudu|muudu|mudova|muudova|mudovadi|mudo|muudo,3 +౪|నాలుగు|నాల్గు|నాల్గొవ|నాల్గొవది|నాల్గో|నాలగు|nalugu|nalagu|naalagu|naalugu|nalgu|nalgova|nalgovadi|nalgo,4 +౫|ఐదు|అయిదు|ఐదొవ|అయిదవది|ఐదో|idu|ayidu|iydu|iydova|ayidovadi|ayido|ido|aidu,5 +౬|ఆరు|ఆఱు|ఆరొవ|ఆరొవది|ఆరో|aru|aaru|aarova|aarovadi|aro|aaro,6 +౭|ఏడు|ఏడొవ|ఏడొవది|ఏడో|aedu|aedova|aedovadi|aedo|ado|adova|adovadi,7 +౮|ఎనిమిది|ఎనిమిదొవ|ఎనిమిదిది|ఎనిమిదో|enimidi|enimdova|enimididi|enimido,8 +౯|తొమ్మిది|తొమ్మిదిది|తొమ్మిదో|tommidi|thommidi|thommididi|thommido|tomidi|thomido,9 +౧౦|పది|పదొవ|పదొవది|పదో|padi|padhi|padova|padovadi|pado,10 +౧౧|పదకొండు|పదకొండొవ|పదకొండొవది|పదకొండో|padakondu|padakondova|padakondovadi|padakondo,11 +౧౨|పన్నెండు|పన్నెండవ|పన్నెండవది|పన్నెండో|పన్నెండొవ|పన్నెండొవది|పన్నెండొ|pannendu|pannendova|pannendovadi|pannendo,12 +౧౩|పదమూడు|పదమూడొవ|పదముండొవది|పదముండొ|padamudu|padamudova|padamundovadi|padamundo,13 +౧౪|పద్నాలుగు|పద్నలుగొవ|పద్నాలుగుది|పద్నాలుగో|padnalugu|padnalugova|padnalugudi|padnalugo|padinalugu|padinalagu,14 +౧౫|పదిహేను|పదిహేనోవా|పదిహేనోవాది|పదిహేనో|పదైదు|padihenu|padihenova|padihenovadi|padiheno|padaidu,15 
+౧౬|పదహారు|పదహారోది|పదహారో|padahaaru|padhaharu|padaharodi|padaharo,16 +౧౭|పదిహేడు|పదిహేనొవ|పదిహేనోవాది|పదిహేనో|padihedu|padhihedu|padihenova|padihenovadi|padiheno,17 +౧౮|పద్దెనిమిది|పద్దెనిమిదొవ|పద్దెనిమిదిది|పద్దెనిమిదో|పద్దెనిమిదొవ|paddenimidi|padhenimidi|paddenimidova|paddenimididi|padhenimidho|padhenimidova,18 +౧౯|పంతొమ్మిది|పందొమ్మిదొవ|పంతొమ్మిదో|పంతొమ్మిదొవ|పందొమ్మిది|pantommidi|pandommidova|panthommido,19 +౨౦|ఇరవై|ఇరవై|ఇరవయ్యోవది|ఇరవయ్యో|iravay|iravai|iravayyovadi|iravayyo,20 +౨౧|ఇరవయ్యొక్కటి|ఇరవై ఒకటి |ఇరవై ఒక్కటి|ఇరవై ఒకటో|iravayyokkati|iravay okati|eravay okati|iravay okato,21 +౨౨|ఇరవై రెండు|ఇరవై రెండొవ|ఇరవై రెండొవది|ఇరవై రెండో|iravay rendu|iravay rendova|iravay rendovadi|iravay rendo,22 +౨౩|ఇరవై మూడు|ఇరవై మూడవా|ఇరవై మూడొవది|ఇరవై మూడో|iravay mudu|iravay mudova|iravay mudovadi|iravay mudo,23 +౨౪|ఇరవై నాలుగు|ఇరవై నాల్గొవ|ఇరవై నాల్గొవది|ఇరవై నాల్గో|iravay naalugu|iravay naalgova|iravay nalgovadi|iravay nalgo,24 +౨౫|ఇరవై ఐదు|ఇరవై ఐదొవ|ఇరవై ఐదొవది|ఇరవై ఐదో|iravay aidu|iravay aidu|iravay aidova|iravay aidovadi|iravay aido,25 +౨౬|ఇరవై ఆఱు|ఇరవై ఆరు|ఇరవై ఆరొవ|ఇరవై ఆరో|ఇరవై ఆరొవది|iravay aaru|iravay aaru|iravay arova|iravay aaro|iravay aarovadi,26 +౨౭|ఇరవయ్యేడు|ఇరవయ్యేడొవ|ఇరవయ్యేడొవది|ఇరవయ్యేడో|ఇరవై ఏడు|iravayeedu|iravayeedova|iravayeedovadi|iravayeedo|iravay aedu,27 +౨౮|ఇరవై ఎనిమిది|ఇరవై ఎనిమిదొవ| ఇరవై ఎనిమిదిది|ఇరవై ఎనిమిదో|iravay enimidi|iravay enimidova|iraavay enimididi|iravay enimido,28 +౨౯|ఇరవై తొమ్మిది|ఇరవై తొమ్మిదొవది|ఇరవై తొమ్మిదో|iravay tommidi|iravay thommidi|iravay thommidovadi|iravay thommido,29 +౩౦|ముప్పై|ముప్పైయొవది|ముప్పైయొవ|ముప్పైయో|muppai|muppaiovadi|muppaiova|muppaiyo,30 +౩౧|ముప్పై ఒక్కటి|ముప్పై ఒకటవ|ముప్పై ఒకటోవది|ముప్పై ఒకటో|muppai okkati|muppai okatova|muppai okatovadi|muppai okato,31 +౧.౫|ఒకటిన్నర|ఒక్కటి అర|ఒకటి అర|okatinnara|okkati ara|okati ara,1.5 +౨.౫|రెండున్నర|రెండు అర|rendunnara|rendu ara,2.5 diff --git a/ner_v2/detectors/temporal/time/en/data/timezones.csv b/ner_v2/detectors/temporal/time/en/data/timezones.csv new file mode 100644 
index 000000000..4a7c697f1 --- /dev/null +++ b/ner_v2/detectors/temporal/time/en/data/timezones.csv @@ -0,0 +1,10 @@ +code,timezone_variants,preferred,all_regions +IST,IST|Indian Time|Indian Standard Time,Asia/Kolkata,Asia/Kolkata +EST,EST|Eastern Standard Time|Eastern Time|ET|EDT,America/New_York,America/New_York|America/Detroit|America/Kentucky/Louisville|America/Kentucky/Monticello|America/Indiana/Indianapolis|America/Indiana/Vincennes|America/Indiana/Winamac|America/Indiana/Marengo|America/Indiana/Petersburg|America/Indiana/Vevay +CST,CST|Central Standard Time|Central Time|CT|CDT,America/Chicago,America/Chicago|America/Indiana/Tell_City|America/Indiana/Knox|America/Menominee|America/North_Dakota/Center|America/North_Dakota/New_Salem|America/North_Dakota/Beulah +MST,MST|Mountain Standard Time|Mountain Time|MT|MDT,America/Denver,America/Denver|America/Boise|America/Phoenix +PST,PST|Pacific Standard Time|Pacific Time|PT|PDT,America/Los_Angeles,America/Los_Angeles +AKST,AKST|Alaska Standard Time|Alaska Time|AKDT,America/Anchorage,America/Anchorage|America/Juneau|America/Sitka|America/Yakutat|America/Nome|America/Metlakatla +HST,HST|Hawaii Standard Time|HDT,America/Adak,America/Adak|Pacific/Honolulu +HAST,HAST|Hawaii-Aleutian Standard Time|Hawaii Aleutian Standard Time|Hawaii Time|HADT,Pacific/Honolulu,Pacific/Honolulu +UTC,UTC|GMT|Greenwich Mean Time|Greenwich Time|Coordinated Universal Time,UTC,UTC diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index be8df7d40..3cfb96b55 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1,7 +1,17 @@ import re import datetime -from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE -from ner_v2.detectors.temporal.utils import get_timezone +import collections +import pandas as pd +import os +import pytz +from 
ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE,\ + TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ + TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ + TIMEZONES_PREFERRED_REGION_COLUMN_NAME +from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string +from ner_v2.constant import LANGUAGE_DATA_DIRECTORY + +TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value', 'preferred']) class TimeDetector(object): @@ -57,7 +67,7 @@ class TimeDetector(object): text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, timezone='UTC'): + def __init__(self, entity_name, timezone=None): """Initializes a TimeDetector object with given entity_name and timezone Args: @@ -77,8 +87,16 @@ def __init__(self, entity_name, timezone='UTC'): self.original_time_text = [] self.tag = '__' + entity_name + '__' self.bot_message = None - self.timezone = get_timezone(timezone) - self.now_date = datetime.datetime.now(self.timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None + self.timezones_map = {} + + self.init_regex_and_parser(os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), + LANGUAGE_DATA_DIRECTORY)) + sorted_len_timezone_keys = sorted(self.timezones_map.keys(), key=len, reverse=True) + self.timezone_choices = "|".join([re.escape(x.lower()) for x in sorted_len_timezone_keys]) def set_bot_message(self, bot_message): """ @@ -89,6 +107,40 @@ def set_bot_message(self, bot_message): """ self.bot_message = bot_message + def init_regex_and_parser(self, data_directory_path): + timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) + columns = [TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, + TIMEZONES_PREFERRED_REGION_COLUMN_NAME] + if 
os.path.exists(timezone_variants_data_path): + timezone_variants_df = pd.read_csv(timezone_variants_data_path, usecols=columns, encoding='utf-8') + for index, row in timezone_variants_df.iterrows(): + tz_name_variants = get_list_from_pipe_sep_string(row[TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME]) + value = row[TIMEZONES_CODE_COLUMN_NAME] + preferred = row[TIMEZONES_PREFERRED_REGION_COLUMN_NAME] + for tz_name in tz_name_variants: + self.timezones_map[tz_name] = TimezoneVariants(value=value, preferred=preferred) + + def convert_to_pytz_format(self, timezone_variant): + """ + Converts informal TZ formats like EST, Eastern Time etc to Oslon format(America/New_York) supported by pytz. + :param timezone_variant: (str) Informal TZ variant + :return: Standard Olson format for pytz. + """ + timezone_code = self.timezones_map[timezone_variant].value + data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), + LANGUAGE_DATA_DIRECTORY) + timezone_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) + columns = [TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME] + if os.path.exists(timezone_data_path): + timezones_df = pd.read_csv(timezone_data_path, usecols=columns, index_col=TIMEZONES_CODE_COLUMN_NAME, + encoding='utf-8') + if re.search(self.timezone.zone, timezones_df.loc[timezone_code][TIMEZONES_ALL_REGIONS_COLUMN_NAME]): + return self.timezone.zone + else: + return self.timezones_map[timezone_variant].preferred + + return self.timezone.zone + def _detect_time(self, range_enabled=False, form_check=False): """ Detects all time strings in text and returns list of detected time entities and their corresponding original @@ -115,6 +167,8 @@ def _detect_time(self, range_enabled=False, form_check=False): self._update_processed_text(original_list) time_list, original_list = self._detect_end_range_12_hour_format_without_min(time_list, original_list) self._update_processed_text(original_list) + time_list, original_list 
= self._detect_range_24_hour_format(time_list, original_list) + self._update_processed_text(original_list) time_list, original_list = self._detect_12_hour_format(time_list, original_list) self._update_processed_text(original_list) time_list, original_list = self._detect_12_hour_without_min(time_list, original_list) @@ -222,39 +276,55 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall( - r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)[\s-]*?to[\s-]' - r'*?(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', - self.processed_text.lower()) + regex_patterns = re.compile( + r'\b((?:from)?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' + r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-|till|until|untill|upto|up to)' + r'[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] - original2 = pattern[0] + original1 = pattern[0].strip() + original2 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: time_type = 'return' else: time_type = None - t1 = pattern[1] - t2 = pattern[2] - ap1 = pattern[3] + t1 = pattern[2] + t2 = pattern[3] + ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] - t3 = pattern[4] - t4 = 
pattern[5] - ap2 = pattern[6] + t3 = pattern[7] + t4 = pattern[8] + ap2 = pattern[9] + tz3 = pattern[6] + tz4 = pattern[10] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz3 or tz4) time2 = { 'hh': int(t3), 'mm': int(t4), 'nn': str(ap2).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -268,6 +338,82 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): break return time_list, original_list + def _detect_range_24_hour_format(self, time_list=None, original_list=None): + """ + Finds 24 hour range format time from text + CURRENTLY IT IS LIMITED ONLY TO ONE RANGE PER TEXT + + Args: + time_list (list): Optional, list to store dictionaries of detected time entities + original_list (list): Optional, list to store corresponding substrings of given text which were detected as + time entities + + Returns: + A tuple of two lists with first list containing the detected time entities and second list containing their + corresponding substrings in the given text. 
+ """ + if time_list is None: + time_list = [] + if original_list is None: + original_list = [] + regex_patterns = re.compile( + r'\b((?:from)?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'[\s-]*?({timezone})?\s*(?:to|-|till|until|untill|upto|up to)[\s-]*?({timezone})?\s*' + r'(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'[\s-]*?({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) + for pattern in patterns: + original1 = pattern[0].strip() + original2 = pattern[0].strip() + if self.departure_flag: + time_type = 'departure' + elif self.return_flag: + time_type = 'return' + else: + time_type = None + t1 = pattern[2] + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) + time1 = { + 'hh': int(t1), + 'mm': int(t2), + 'nn': 'hrs', + 'tz': tz or (None if not self.timezone else self.timezone.zone), + 'range': 'start', + 'time_type': time_type + } + time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] + time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] + + t3 = pattern[6] + t4 = pattern[7] + tz3 = pattern[5] + tz4 = pattern[8] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz3 or tz4) + time2 = { + 'hh': int(t3), + 'mm': int(t4), + 'nn': 'hrs', + 'tz': tz or (None if not self.timezone else self.timezone.zone), + 'range': 'end', + 'time_type': time_type + } + + time_list.append(time1) + original_list.append(original1) + time_list.append(time2) + original_list.append(original2) + break + return time_list, original_list + def _detect_range_12_hour_format_without_min(self, time_list=None, original_list=None): """ Finds 12 hour range format time from text without minutes @@ -286,37 +432,52 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list time_list = [] if original_list is None: 
original_list = [] - patterns = re.findall( - r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?to[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(am|pm|a\.m|p\.m))', - self.processed_text.lower()) + regex_patterns = re.compile( + r'\b((?:from)?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*' + r'(?:to|-|till|until|untill|upto|up to)' + r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] - original2 = pattern[0] + original1 = pattern[0].strip() + original2 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: time_type = 'return' else: time_type = None - t1 = pattern[1] - ap1 = pattern[2] + t1 = pattern[2] + ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] - t2 = pattern[3] - ap2 = pattern[4] + t2 = pattern[6] + ap2 = pattern[7] + tz3 = pattern[5] + tz4 = pattern[8] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz1 or tz2) time2 = { 'hh': int(t2), 'mm': 0, 'nn': str(ap2).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -348,11 +509,15 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((after|aftr)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' 
- r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', - self.processed_text.lower()) + + regex_patterns = re.compile( + r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' + r'([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -362,10 +527,16 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) t1 = pattern[2] t2 = pattern[3] ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -395,11 +566,13 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((before|bfre)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', + patterns = re.findall(r'\b((?:before|bfre|till|until|untill|upto|up to)[\s-]*({timezone})?\s*' + r'(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' 
+ r'(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -409,10 +582,16 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): t1 = pattern[2] t2 = pattern[3] ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -441,10 +620,11 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((after|aftr)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', + patterns = re.findall(r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -453,10 +633,16 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina time_type = None t1 = pattern[2] ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -486,10 +672,13 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_list = [] if original_list is None: original_list = [] - patterns = 
re.findall(r'\s((before|bfore)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', + patterns = re.findall(r'\b((?:before|bfore|till|until|untill|upto|up to)' + r'[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' + r'[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -498,10 +687,17 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_type = None t1 = pattern[2] ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) + time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -543,18 +739,25 @@ def _detect_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' 
- r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', + patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])' + r'[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] - t1 = pattern[1] - t2 = pattern[2] - ap = pattern[3] + original = pattern[0].strip() + t1 = pattern[2] + t2 = pattern[3] + ap = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': str(ap).lower().strip('.') + 'nn': str(ap).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] @@ -592,15 +795,22 @@ def _detect_12_hour_without_min(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', self.processed_text.lower()) + patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] - t1 = pattern[1] - ap = pattern[2] + original = pattern[0].strip() + t1 = pattern[2] + ap = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': 0, - 'nn': str(ap).lower().strip('.') + 'nn': str(ap).lower().strip('.'), + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] time['nn'] = 'pm' if 'p' in time['nn'] else time['nn'] @@ -640,7 +850,7 @@ def _detect_time_with_difference(self, time_list=None, original_list=None): r'(min|mins|minutes|hour|hours|hrs|hr))\b', self.processed_text.lower()) for pattern in 
patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[2]) td = pattern[3] hours = ['hour', 'hours', 'hrs', 'hr'] @@ -657,6 +867,7 @@ def _detect_time_with_difference(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -678,14 +889,14 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((\d+)\s?(min|mins|minutes|hour|hours|hrs|hr)\s?(later|ltr|latr|lter)s?)\b', + patterns = re.findall(r'\b((\d+)\s?(min|mins|minutes?|hour|hours|hrs|hr)\s?(later|ltr|latr|lter)s?)\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) td = pattern[2] hours = ['hour', 'hours', 'hrs', 'hr'] - mins = ['min', 'mins', 'minutes'] + mins = ['min', 'mins', 'minutes', 'minute'] setter = "" antisetter = "" if td in hours: @@ -698,6 +909,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -722,7 +934,7 @@ def _detect_time_with_every_x_hour(self, time_list=None, original_list=None): patterns = re.findall(r'\b((every|evry|evy|evri)\s*(\d+)\s*(min|mins|minutes|hour|hours|hrs|hr))\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[2]) td = pattern[3] hours = ['hour', 'hours', 'hrs', 'hr'] @@ -739,6 +951,7 @@ def _detect_time_with_every_x_hour(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE + 
time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -760,17 +973,21 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((once|onc|1se)\s*(in)?\s*(\d+)\s?(day|days))\b', + patterns = re.findall(r'\b((once|onc|1se)\s*(in|every|evry|in every)?\s*(\d+|a)\s?(day|days))\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] - t1 = 24 * int(pattern[3]) + original = pattern[0].strip() + if not pattern[3] or pattern[3] == "a": + t1 = 24 + else: + t1 = 24 * int(pattern[3]) setter = "hh" antisetter = "mm" time = dict() time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -802,20 +1019,28 @@ def _detect_24_hour_optional_minutes_format(self, time_list=None, original_list= time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?)' - r'(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])?\s*' + r'(?:h|hrs|hr)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})' + r'|(?:h|hrs|hr)|\d))\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t2 = 0 - t1 = pattern[1] - if pattern[2]: - t2 = pattern[2] + t1 = pattern[2] + if pattern[3]: + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': 'hrs' + 'nn': 'hrs', + 'tz': tz or (None if not self.timezone else self.timezone.zone), } 
time_list.append(time) original_list.append(original) @@ -844,18 +1069,26 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|1[3-9]?|2[0-3])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|1[3-9]?|2[0-3])[:.\s]([0-5][0-9])' + r'\s*(?:h|hr|hrs)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:h|hrs|hr)|' + r'(?:{timezone})|\d))\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] - t1 = pattern[1] - t2 = pattern[2] - meridiem = self._get_meridiem(int(t1), int(t2)) + original = pattern[0].strip() + t1 = pattern[2] + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) + meridiem = self._get_meridiem(int(t1), int(t2), tz) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -896,18 +1129,25 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((0?[1-9]|1[0-2])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b((0?[1-9]|1[0-2])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m\.?|p\.m\.?|\d))', self.processed_text.lower()) pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)\s', self.processed_text.lower()) - pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)\s', self.processed_text.lower()) + pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham|lunch|dinner)\s', + self.processed_text.lower()) pattern_night = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)\s', self.processed_text.lower()) + pattern_tz = 
re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) t2 = int(pattern[2]) + tz = None + if pattern_tz: + tz = pattern_tz[0] time = { 'hh': t1, 'mm': t2, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } if pattern_am: time['nn'] = 'am' @@ -957,12 +1197,18 @@ def _detect_12_hour_word_format2(self, time_list=None, original_list=None): pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)', self.processed_text.lower()) pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)', self.processed_text.lower()) pattern_night = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)', self.processed_text.lower()) + pattern_tz = re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) + tz = None + if pattern_tz: + tz = pattern_tz[0] time = { 'hh': t1, 'mm': 0, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } if pattern_am: time['nn'] = 'am' @@ -999,21 +1245,30 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])\s*({timezone})?)' + r'(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns: # Optional minutes but compulsory "hour" mention - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?\s+(?:hours?|hrs?)\b)', + patterns = 
re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]?([0-5][0-9]))?\s+' + r'(?:hours?|hrs?)\s*({timezone})?\b)'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] - t1 = int(pattern[1]) - t2 = int(pattern[2]) if pattern[2] else 0 - meridiem = self._get_meridiem(t1, t2) + original = pattern[0].strip() + t1 = int(pattern[2]) + t2 = int(pattern[3]) if pattern[3] else 0 + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) + meridiem = self._get_meridiem(t1, t2, tz) time = { 'hh': t1, 'mm': t2, - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1050,20 +1305,24 @@ def _detect_time_without_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'((?:by|before|after|at|dot|exactly|exact)[\s-]*' - r'((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?))\s', + patterns = re.findall(r'\b((?:by|before|after|at|dot|exactly|exact)[\s-]*((0?[1-9]|1[0-2])[:.\s]*' + r'([0-5][0-9])?)\s*({timezone})?)\s'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = 0 + tz = pattern[4] or None + if tz: + tz = self.convert_to_pytz_format(tz) if pattern[3]: t2 = pattern[3] - meridiem = self._get_meridiem(int(t1), int(t2)) + meridiem = self._get_meridiem(int(t1), int(t2), tz) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1099,30 +1358,39 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N time_list = [] if original_list is None: original_list = [] - patterns = 
re.findall(r'\s(((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?)[\s-]*' - r'(?:o\'clock|o\' clock|clock|oclock|o clock|hours))\s', + patterns = re.findall(r'\b(({timezone})?\s*((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?)[\s-]*' + r'(?:o\'clock|o\' clock|clock|oclock|o clock|hours)\s*' + r'({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns and self.bot_message: if re.findall(r"Time|time", self.bot_message.lower()): - patterns = re.findall(r'\s*((([0-2]?[0-9])()))\s*', self.processed_text.lower()) + patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])' + r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = 0 + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) if pattern[3]: t2 = pattern[3] - meridiem = self._get_meridiem(int(t1), int(t2)) + meridiem = self._get_meridiem(int(t1), int(t2), tz) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) return time_list, original_list - def _get_meridiem(self, hours, mins): + def _get_meridiem(self, hours, mins, timezone): """ Returns the meridiem(am/pm) for which the given hours:mins time is in within 12 hour span from the current timestamp. @@ -1135,11 +1403,19 @@ def _get_meridiem(self, hours, mins): Args: hours (int): hours in integer mins (int): mins in integer + timezone (str): timezone in 'Asia/Kolkata' format. As we want to use the tz mentioned by the user, if any. 
Returns meridiem type (str): returns the meridiem type whether its am and pm """ - current_datetime = self.now_date + + if timezone is not None: + new_timezone = get_timezone(timezone) + else: + # If no TZ(neither from api call not from the user message) is given, use 'UTC' + new_timezone = self.timezone or pytz.timezone('UTC') + + current_datetime = datetime.datetime.now(new_timezone) current_hour = current_datetime.hour current_min = current_datetime.minute if hours == 0 or hours >= TWELVE_HOUR: @@ -1175,10 +1451,14 @@ def _get_morning_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # pattern to detect morning - patterns = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:morning|early|subah|mrng|mrning|savere)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0] + for pattern in patterns: + original1 = pattern[0].strip() + tz = None + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1189,6 +1469,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1197,6 +1478,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 0, 'nn': 'am', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1226,10 +1508,14 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(noon|afternoon)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:noon|afternoon)\s*(?:in|of|at)?\s*({timezone})?)\b' + 
.format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0] + for pattern in patterns: + original1 = pattern[0].strip() + tz = None + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1240,6 +1526,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 0, 'nn': 'am', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1248,6 +1535,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1277,10 +1565,14 @@ def _get_evening_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(evening|evng|evning|sham)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:evening|evng|evning|sham)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0] + for pattern in patterns: + original1 = pattern[0].strip() + tz = None + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1291,6 +1583,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1299,6 +1592,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1328,10 +1622,14 @@ def _get_night_time_range(self, time_list=None, original_list=None): if 
original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:night|nite|tonight|latenight|tonit|nit|rat)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0] + for pattern in patterns: + original1 = pattern[0].strip() + tz = None + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1342,6 +1640,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1350,6 +1649,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1379,12 +1679,17 @@ def _get_default_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - preference = re.compile(r'\s(No particular preference|No preference|No particular time|No time|' - r'anytime|any time|all day|full day|entire day|entireday)') + preference = re.compile(r'\b((?:no particular preference|no preference|no particular time|no time|' + r'anytime|any time|all day|full day|entire day|entireday)' + r'\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices)) patterns = preference.findall(self.processed_text.lower()) - if patterns: - original1 = patterns[0] + for pattern in patterns: + original1 = pattern[0].strip() + tz = None + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1395,6 +1700,7 @@ def _get_default_time_range(self, time_list=None, 
original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1403,6 +1709,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 59, 'nn': 'pm', + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } diff --git a/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv b/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv index a38be3553..6bc803ffd 100644 --- a/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv +++ b/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv @@ -1,10 +1,11 @@ key,present_in_start,adding_magnitude,datetime_type नंतर|Nantar,0,1,add_diff_datetime -हे|He,1,0,add_diff_datetime +हे|He|Hey,1,0,add_diff_datetime पूर्वी|अगोदर|गेल्या|Gelya|Purvi|Porvi|Agodar,0,-1,add_diff_datetime अंतिम|शेवट|शेवटी|Antim|shevat|shewat|shevati|shewati,1,-1,add_diff_datetime पुढील|पुढे|पुढच्या|पुढचा|Pudcha|Pudhil|Pudhe|Pudhchya,1,1,add_diff_datetime +मागील|मागे|मागच्या|मागचा|Maaghil|Maaghe|Maagchya|Maagcha,1,1,add_diff_datetime मी|Mi|me,0,1,add_diff_datetime -सवा|sawa|sava,1,0.25,ref_datetime +सवा|sawa|sava|sauva,1,0.25,ref_datetime पौने|paune,1,-0.25,ref_datetime -साढे|साडे|saade|sade,1,0.5,ref_datetime +साढे|साडे|saade|sade|sadhe,1,0.5,ref_datetime diff --git a/ner_v2/detectors/temporal/time/mr/data/time_constant.csv b/ner_v2/detectors/temporal/time/mr/data/time_constant.csv index 7021452a0..864d5ca57 100644 --- a/ner_v2/detectors/temporal/time/mr/data/time_constant.csv +++ b/ner_v2/detectors/temporal/time/mr/data/time_constant.csv @@ -1,11 +1,13 @@ key,time_type,meridiem आता|Ata|aata,relative_time,NA लगेच|Lagech,relative_time,NA +नंतर|Nantar,relative_time,NA वाजले|वाजता|वाजुन|Vajle|Vajta|Vazta|Vazle|Vajun|Vazun,hour,NA -तास|तासा|तासात|Tas|Taasaa|Tasa|Tasaa|Taasan|Taasat,hour,NA 
+तास|तासा|तासात|Tas|Taas|Taasaa|Tasa|Tasaa|Taasan|Taasat,hour,NA मिनिट|मिनिटे|Minute|Minte,minute,NA सेकंड|seconds|sec|second,second,NA -सकाळ|सकाळी|Sakal|Sakali,daytime_meridiem,am -दुपार|दुपारी|Dupar|Dupari,daytime_meridiem,pm -संध्याकाळी|संध्याकाळ|Sandhyakali|Sandhyakal,daytime_meridiem,pm +सकाळ|सकाळी|Sakal|Sakaal|Sakali|Sakaali,daytime_meridiem,am +दुपार|दुपारी|Dupar|Dupaar|Dupari,daytime_meridiem,pm +संध्याकाळी|संध्याकाळ|सायंकाळी|Sandhyakali|Sandhyakal|Sayankali,daytime_meridiem,pm रात्री|रात्र|Ratri|Ratra,daytime_meridiem,pm +पहाट|पहाटे|Pahaath|Pahaathe|Pahaates,daytime_meridiem,pm \ No newline at end of file diff --git a/ner_v2/detectors/temporal/time/standard_time_regex.py b/ner_v2/detectors/temporal/time/standard_time_regex.py index c9644ba7d..104cd58a4 100644 --- a/ner_v2/detectors/temporal/time/standard_time_regex.py +++ b/ner_v2/detectors/temporal/time/standard_time_regex.py @@ -5,6 +5,8 @@ import os import re +import pytz + from chatbot_ner.config import ner_logger from ner_v2.detectors.temporal.constant import (DATETIME_CONSTANT_FILE, ADD_DIFF_DATETIME_TYPE, NUMERALS_CONSTANT_FILE, TIME_CONSTANT_FILE, REF_DATETIME_TYPE, HOUR_TIME_TYPE, @@ -14,7 +16,7 @@ class BaseRegexTime(object): - def __init__(self, entity_name, data_directory_path, timezone='UTC'): + def __init__(self, entity_name, data_directory_path, timezone=None): """ Base Regex class which will be imported by language date class by giving their data folder path This will create standard regex and their parser to detect date for given language. 
@@ -27,7 +29,10 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'): self.processed_text = '' self.entity_name = entity_name self.tag = '__' + entity_name + '__' - self.timezone = get_timezone(timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None @@ -45,8 +50,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'): # Variable to define default order in which these regex will work self.detector_preferences = [ self._detect_time_with_coln_format, - self._detect_hour_minute - ] + self._detect_hour_minute] def set_bot_message(self, bot_message): """ @@ -190,8 +194,11 @@ def _get_meridiem(self, hours, mins, original_text): Returns str: returns the meridiem type whether its am and pm """ - current_hour = self.now_date.hour - current_min = self.now_date.minute + # If no TZ(neither from api call not from the user message) is given, use 'UTC' + new_timezone = self.timezone or pytz.timezone('UTC') + current_datetime = datetime.datetime.now(new_timezone) + current_hour = current_datetime.hour + current_min = current_datetime.minute if hours == 0 or hours >= TWELVE_HOUR: return 'hrs' @@ -267,7 +274,8 @@ def _detect_hour_minute(self, time_list, original_list): time = { 'hh': int(hh), 'mm': int(mm), - 'nn': nn + 'nn': nn, + 'tz': None if not self.timezone else self.timezone.zone } time_list.append(time) @@ -294,7 +302,7 @@ def _detect_time_with_coln_format(self, time_list, original_list): >>> time_list = [] >>> original_list = [] >>> preprocessed_text = u'आज 05:40 बजे अजना' - >>> _detect_time_with_coln_format(time_list, original_list) + >>> self._detect_time_with_coln_format(time_list, original_list) >>> ([{'hh': 5, 'mm': 40, 'nn': 'pm', 'time_type': None}], ["05:40"]) @@ -316,6 +324,7 @@ def _detect_time_with_coln_format(self, time_list, original_list): time = { 'hh': hh, 'mm': mm, + 'tz': None if not self.timezone else 
self.timezone.zone, 'time_type': None } diff --git a/ner_v2/detectors/temporal/time/time_detection.py b/ner_v2/detectors/temporal/time/time_detection.py index e7df5c386..d85e708ec 100644 --- a/ner_v2/detectors/temporal/time/time_detection.py +++ b/ner_v2/detectors/temporal/time/time_detection.py @@ -41,7 +41,7 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG): + def __init__(self, entity_name='time', timezone=None, language=ENGLISH_LANG): """Initializes a TimeDetector object with given entity_name and timezone Args: @@ -61,7 +61,10 @@ def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG): self.time = [] self.original_time_text = [] self.tag = '__' + entity_name + '__' - self.timezone = get_timezone(timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None self.language = language try: diff --git a/ner_v2/detectors/temporal/utils.py b/ner_v2/detectors/temporal/utils.py index 264d5f8fb..a90c647af 100644 --- a/ner_v2/detectors/temporal/utils.py +++ b/ner_v2/detectors/temporal/utils.py @@ -265,7 +265,7 @@ def get_next_date_with_dd(dd, after_datetime): return None, None, None -def get_timezone(timezone, ignore_errors=True): +def get_timezone(timezone, ignore_errors=False): # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo """ Return a datetime.tzinfo (pytz timezone object). If `timezone` is a str, try constructing a pytz @@ -275,7 +275,7 @@ def get_timezone(timezone, ignore_errors=True): Args: timezone (str or datetime.tzinfo): Either a valid timezone string or datetime.tzinfo object ignore_errors (bool, optional): when set to True, ignore errors and return a pytz.UTC when error occurs. When - set to False, raise exception when invalid timezone is given. Defaults to True. + set to False, raise exception when invalid timezone is given. Defaults to False. 
Returns: datetime.tzinfo: A pytz timezone object @@ -294,5 +294,17 @@ def get_timezone(timezone, ignore_errors=True): timezone = pytz.timezone('UTC') ner_logger.debug('Using "UTC" as default timezone') else: - raise + return None return timezone + + +def get_list_from_pipe_sep_string(text_string): + """ + Split numerals + Args: + text_string (str): text + Returns: + (list) : list containing numeral after split + """ + text_list = text_string.split("|") + return [x.lower().strip() for x in text_list if x.strip()] \ No newline at end of file diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index a1956e41a..5d7058747 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -1,32 +1,35 @@ tests: en: -# - id: en_1 -# message: "I want more than 200 banana" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "more than 200" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_2 -# message: "My monthly salary will be more than 2k per month" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "more than 2k" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_3 -# message: "more than 2.5k people in the stadium" -# outputs: -# - max_value: null -# min_value: 2500 -# original_text: "more than 2.5k" -# output_id: 1 -# unit: null -# unit_type: null + - id: en_1 + message: "I want more than 200 banana" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_2 + message: "My monthly salary will be more than 2k per month" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_3 + message: "more than 2.5k people in the stadium" + outputs: + - max_value: 
null + min_value: '2500' + original_text: "more than 2.5k" + output_id: 1 + unit: null + abs_value: null + unit_type: null - id: en_4 message: "more than 200" outputs: @@ -35,6 +38,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: en_5 message: "more than 2k" @@ -44,6 +48,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: en_6 message: "more than 2.5k" @@ -53,60 +58,67 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency -# - id: en_7 -# message: "more than 200 rupees" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "more than 200 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_8 -# message: "more than 2k rupees" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "more than 2k rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_9 -# message: "more than 2.5k rupees" -# outputs: -# - max_value: null -# min_value: 2500 -# original_text: "more than 2.5k rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_10 -# message: "200 to 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 to 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_11 -# message: "200 – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_12 + - id: en_7 + message: "more than 200 rupees" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_8 + message: "more than 2k rupees" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_9 + message: "more than 2.5k rupees" + outputs: + - max_value: null + 
min_value: '2500' + original_text: "more than 2.5k rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_10 + message: "200 to 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 to 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_11 + message: "200 - 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 - 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_12 message: "200-300" outputs: - - max_value: 300 - min_value: 200 + - max_value: '300' + min_value: '200' original_text: "200-300" output_id: 1 unit: null + abs_value: null unit_type: null - id: en_13 message: "200 to 300" @@ -116,6 +128,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: en_14 message: "200 – 300" @@ -125,6 +138,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: en_15 message: "200-300" @@ -134,6 +148,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency # - id: en_16 # message: "200 to 300 ruppes" @@ -334,6 +349,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_14 message: "200 – 300" @@ -343,6 +359,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_15 message: "200-300" @@ -352,6 +369,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency # - id: hi_16 # message: "200 se 300 rupees" @@ -460,6 +478,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_28 message: "२ हजार से ऊपर" @@ -469,6 +488,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_29 message: "ज्यादा से ज्यादा ५ हजार" @@ -478,6 +498,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null 
unit_type: currency # - id: hi_30 # message: "२०० रूपीस से ज्यादा" @@ -541,6 +562,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_37 message: "२०० – ३००" @@ -550,6 +572,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency - id: hi_38 message: "२००-३००" @@ -559,6 +582,7 @@ tests: original_text: null output_id: 1 unit: null + abs_value: null unit_type: currency # - id: hi_39 # message: "२०० से ३०० रुपया" diff --git a/ner_v2/tests/numeral/number_range/test_number_range_detection.py b/ner_v2/tests/numeral/number_range/test_number_range_detection.py index e9af7f3a5..f2470c5c5 100644 --- a/ner_v2/tests/numeral/number_range/test_number_range_detection.py +++ b/ner_v2/tests/numeral/number_range/test_number_range_detection.py @@ -40,6 +40,7 @@ def parse_expected_outputs(expected_outputs): "min_value": str(expected_output["min_value"]) if expected_output["min_value"] else None, "unit": str(expected_output["unit"]) if expected_output["unit"] else None, "max_value": str(expected_output["max_value"]) if expected_output["max_value"] else None, + "abs_value": str(expected_output["abs_value"]) if expected_output["abs_value"] else None } original_text = \ expected_output["original_text"].lower().strip() if expected_output["original_text"] else None diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index ca3c3cf73..f04ef56af 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -2,107 +2,162 @@ tests: en: - id: en_1 message: "Set a reminder on 02226129854" + locale: "en-in" outputs: - original_text: "02226129854" output_id: 1 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' - id: en_2 message: "Set a reminder on 022 26129854" + locale: "en-in" outputs: - original_text: "022 26129854" 
output_id: 1 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' - id: en_3 message: "Call the number 9820334455" + locale: "en-in" outputs: - original_text: "9820334455" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: en_4 message: "Set a reminder on 919820334455" + locale: "en-in" outputs: - original_text: "919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_5 message: "Set a reminder on 91 9820334455" + locale: "en-in" outputs: - original_text: "91 9820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_6 message: "Set a reminder on +91 9820334455" + locale: "en-in" outputs: - original_text: "+91 9820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_7 message: "Set a reminder on +919820334455" + locale: "en-in" outputs: - original_text: "+919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_8 message: "Set a reminder on +919820334455" + locale: "en-in" outputs: - original_text: "+919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_9 message: "Set a reminder on 91 9820-3344-55" + locale: "en-in" outputs: - original_text: "91 9820-3344-55" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_10 message: "Set a reminder on +1 (408) 912-6172" + locale: "en-in" outputs: - original_text: "+1 (408) 912-6172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_11 message: "Set a reminder on +1 408 9126172" + locale: "en-in" outputs: - original_text: "+1 408 9126172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_12 - message: "Set a reminder on 14089126172" + message: "Set a reminder on +14089126172" + locale: "en-in" 
outputs: - - original_text: "14089126172" + - original_text: "+14089126172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_13 - message: "Send 1000rs to 14089126172 and call 02226129854" + message: "Send 1000rs to +14089126172 and call 02226129854" + locale: "en-in" outputs: - - original_text: "14089126172" + - original_text: "+14089126172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - original_text: "02226129854" output_id: 2 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' + - id: en_14 + message: "Send 1000rs to +14089126172 and call 2226129854" + locale: "en-us" + outputs: + - original_text: "+14089126172" + output_id: 1 + value: "4089126172" + country_calling_code: '1' + - original_text: "2226129854" + output_id: 2 + value: "2226129854" + country_calling_code: '1' + - id: en_15 + message: "Send 1000rs to 2226129854b" + locale: "en-us" + outputs: + - original_text: null + output_id: 1 + value: null + country_calling_code: null hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" + locale: "en-in" outputs: - original_text: "९८२०३३४४५५" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: hi_2 message: "मेरे लिए ५००र्स ९८ २०३३४४५५ पे भेज देना" + locale: "en-in" outputs: - original_text: "९८ २०३३४४५५" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: hi_3 message: "मेरा लैंडलाइन नंबर ०२२२६१२९८५७ है" + locale: "en-in" outputs: - original_text: "०२२२६१२९८५७" output_id: 1 - value: "02226129857" + value: "2226129857" + country_calling_code: '91' - id: hi_4 message: "मेरा लैंडलाइन नंबर ०२२ २६१२९८५७ है" + locale: "en-in" outputs: - original_text: "०२२ २६१२९८५७" output_id: 1 - value: "02226129857" + value: "2226129857" + country_calling_code: '91' \ No newline at end of file diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py 
b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index f522763dc..1450d1a98 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -39,7 +39,11 @@ def parse_expected_outputs(expected_outputs): original_text = \ expected_output["original_text"].lower().strip() if expected_output["original_text"] else None if original_text: - phone_num_list.append(str(expected_output["value"])) + phone_num_dict = { + 'value': str(expected_output["value"]), + 'country_calling_code': str(expected_output["country_calling_code"]) + } + phone_num_list.append(phone_num_dict) original_texts.append(original_text) return phone_num_list, original_texts @@ -47,7 +51,8 @@ def parse_expected_outputs(expected_outputs): def run_test(self): message = testcase["message"] - number_detector_object = PhoneDetector(entity_name='phone_number', language=language) + locale = testcase["locale"] + number_detector_object = PhoneDetector(entity_name='phone_number', language=language, locale=locale) phone_number_list, spans = number_detector_object.detect_entity(message) expected_phone_number_list, expected_spans = parse_expected_outputs(testcase["outputs"]) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index d9740b594..93d95fa1c 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -20,6 +20,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): Date detection for pattern '2nd jan to 5th' """ message = '2nd jan to 5th' + locale = 'en-in' # If we run day1 = 2 day2 = 5 @@ -30,7 +31,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): year1 += 1 year2 += 1 - date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en') + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, 
language='en', locale=locale) date_dicts, original_texts = date_detector_object.detect_entity(message) self.assertIn({ @@ -50,7 +51,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): 'value': {'dd': day2, 'mm': month, 'yy': year2, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 2) + self.assertEqual(original_texts.count(message.lower()), 2) @mock.patch('ner_v2.detectors.temporal.date.en.date_detection.get_weekdays_for_month') def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekdays_for_month): @@ -58,6 +59,7 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday Date detection for pattern 'first week of jan' """ message = 'first week of jan' + locale = 'en-in' day1 = 1 day2 = 7 month = 1 @@ -71,7 +73,7 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday mocked_get_weekdays_for_month.return_value = [day1, day2] - date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en') + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) date_dicts, original_texts = date_detector_object.detect_entity(message) # TODO: functionality is incorrect, start_range should be True in 1st and end_range should be True in second @@ -92,4 +94,154 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday 'to': False, 'value': {'dd': day2, 'mm': month, 'type': 'date', 'yy': year} }, date_dicts) - self.assertEqual(original_texts.count(message), 2) + self.assertEqual(original_texts.count(message.lower()), 2) + + def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): + """ + Date detection for pattern '2/3/19' + """ + message = '2/3/19' + locale = 'en-us' + # If we run + day1 = 3 + month = 2 + year1 = 2019 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = 
date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) + + def test_en_gregorian_day_month_year_format(self): + """ + Date detection for pattern '2/3/17' + """ + message = '2/3/17' + locale = 'en-in' + # If we run + day1 = 2 + month = 3 + year1 = 2017 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) + + def test_en_gregorian_year_month_day_format(self): + """ + Date detection for pattern '2017/12/01' + """ + message = '2017/12/01' + locale = 'en-in' + # If we run + day1 = 1 + month = 12 + year1 = 2017 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) + + def test_en_gregorian_advanced_day_month_year_format(self): + """ + Date detection for pattern '02 january 1972' + """ + message = '02 january 1972' + locale = 'en-in' + # If we run + day1 = 2 + month = 1 + year1 = 1972 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = 
date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) + + def test_en_gregorian_advanced_year_month_day_format(self): + """ + Date detection for pattern '1972 january 2' + """ + message = '1972 january 2' + locale = 'en-in' + # If we run + day1 = 2 + month = 1 + year1 = 1972 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) + + def test_en_gregorian_year_day_month_format(self): + """ + Date detection for pattern '2099 21st Nov' + """ + message = '2099 21st Nov' + locale = 'en-in' + # If we run + day1 = 21 + month = 11 + year1 = 2099 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message.lower()), 1) \ No newline at end of file diff --git a/ner_v2/tests/temporal/time/test_time_detection.py b/ner_v2/tests/temporal/time/test_time_detection.py index 90794df42..b3577772b 100644 --- a/ner_v2/tests/temporal/time/test_time_detection.py +++ b/ner_v2/tests/temporal/time/test_time_detection.py @@ -45,6 +45,7 @@ def parse_expected_outputs(expected_outputs): "hh": 
expected_output["hh"], "mm": expected_output["mm"], "nn": expected_output["nn"], + 'tz': expected_output["tz"], "range": expected_output["range"], "time_type": expected_output["time_type"] } diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index bf4393b66..de067501d 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -3,34 +3,37 @@ args: tests: en: - id: en_1 - message: "the time is 12:35 am" + message: "the time is 12:35 am est" outputs: - hh: 12 mm: 35 nn: "am" - original_text: "12:35 am" + tz: "America/New_York" + original_text: "12:35 am est" output_id: 1 range: null time_type: null range_enabled: false - id: en_2 - message: "meet me at 10:33 pm at the cafe" + message: "meet me at 10:33 pm AKST at the cafe" outputs: - hh: 10 mm: 33 nn: "pm" - original_text: "10:33 pm" + tz: "America/Anchorage" + original_text: "10:33 pm akst" output_id: 1 range: null time_type: null range_enabled: false - id: en_3 - message: "meet me at 02 33 p.m. 
at the cafe" + message: "meet me at 02 33 p.m IST at the cafe" outputs: - hh: 2 mm: 33 nn: "pm" - original_text: "02 33 p.m" + tz: "Asia/Kolkata" + original_text: "02 33 p.m ist" output_id: 1 range: null time_type: null @@ -41,6 +44,7 @@ tests: - hh: 12 mm: 0 nn: "am" + tz: "UTC" original_text: "12 am" output_id: 1 range: null @@ -52,6 +56,7 @@ tests: - hh: 12 mm: 0 nn: "pm" + tz: "UTC" original_text: "12-pm" output_id: 1 range: null @@ -63,6 +68,7 @@ tests: - hh: 1 mm: 0 nn: "am" + tz: "UTC" original_text: "1 am" output_id: 1 range: null @@ -74,6 +80,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -85,6 +92,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -96,6 +104,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -107,6 +116,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -118,6 +128,7 @@ tests: - hh: 0 mm: 15 nn: "df" + tz: "UTC" original_text: "in 15mins" output_id: 1 range: null @@ -129,6 +140,7 @@ tests: - hh: 0 mm: 25 nn: "df" + tz: "UTC" original_text: "about 25 minutes" output_id: 1 range: null @@ -140,6 +152,7 @@ tests: - hh: 5 mm: 0 nn: "df" + tz: "UTC" original_text: "after 5 hrs" output_id: 1 range: null @@ -151,6 +164,7 @@ tests: - hh: 13 mm: 0 nn: "df" + tz: "UTC" original_text: "in around 13 hours" output_id: 1 range: null @@ -162,6 +176,7 @@ tests: - hh: 0 mm: 20 nn: "df" + tz: "UTC" original_text: "20 minutes later" output_id: 1 range: null @@ -173,6 +188,7 @@ tests: - hh: 0 mm: 5 nn: "df" + tz: "UTC" original_text: "5mins latr" output_id: 1 range: null @@ -184,6 +200,7 @@ tests: - hh: 1 mm: 0 nn: "df" + tz: "UTC" original_text: "1 hour ltr" output_id: 1 range: null @@ -195,6 +212,7 @@ tests: - hh: 3 mm: 0 nn: "df" + tz: "UTC" original_text: "3 hrs later" output_id: 1 range: null @@ -206,6 +224,7 @@ tests: - hh: 0 mm: 1440 nn: "ev" + tz: "UTC" 
original_text: "every 1440 minutes" output_id: 1 range: null @@ -217,6 +236,7 @@ tests: - hh: 24 mm: 0 nn: "ev" + tz: "UTC" original_text: "evry 24 hrs" output_id: 1 range: null @@ -228,6 +248,7 @@ tests: - hh: 72 mm: 0 nn: "ev" + tz: "UTC" original_text: "once in 3 days" output_id: 1 range: null @@ -239,6 +260,7 @@ tests: - hh: 24 mm: 0 nn: "ev" + tz: "UTC" original_text: "once in 1 day" output_id: 1 range: null @@ -250,6 +272,7 @@ tests: - hh: 0 mm: 35 nn: "hrs" + tz: "UTC" original_text: "00:35" output_id: 1 range: null @@ -261,6 +284,7 @@ tests: - hh: 22 mm: 33 nn: "hrs" + tz: "UTC" original_text: "22:33" output_id: 1 range: null @@ -272,6 +296,7 @@ tests: - hh: 14 mm: 33 nn: "hrs" + tz: "UTC" original_text: "14 33" output_id: 1 range: null @@ -283,6 +308,7 @@ tests: - hh: 12 mm: 0 nn: "hrs" + tz: "UTC" original_text: "12 hrs" output_id: 1 range: null @@ -294,6 +320,7 @@ tests: - hh: 0 mm: 0 nn: "hrs" + tz: "UTC" original_text: "0 hours" output_id: 1 range: null @@ -305,6 +332,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -316,6 +344,7 @@ tests: - hh: null mm: null nn: null + tz: "UTC" original_text: null output_id: 1 range: null @@ -327,6 +356,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -338,6 +368,7 @@ tests: - hh: 12 mm: 30 nn: "am" + tz: "UTC" original_text: "12:30" output_id: 1 range: null @@ -349,6 +380,7 @@ tests: - hh: 11 mm: 30 nn: "am" + tz: "UTC" original_text: "11:30" output_id: 1 range: null @@ -360,6 +392,7 @@ tests: - hh: 11 mm: 30 nn: "pm" + tz: "UTC" original_text: "11:30" output_id: 1 range: null @@ -371,6 +404,7 @@ tests: - hh: 12 mm: 0 nn: "am" + tz: "UTC" original_text: "12:00" output_id: 1 range: null @@ -382,6 +416,7 @@ tests: - hh: 5 mm: 29 nn: "pm" + tz: "UTC" original_text: "5:29" output_id: 1 range: null @@ -393,6 +428,7 @@ tests: - hh: 3 mm: 11 nn: "pm" + tz: "UTC" original_text: "3:11" output_id: 1 range: null @@ -404,6 
+440,7 @@ tests: - hh: 12 mm: 22 nn: "pm" + tz: "UTC" original_text: "12:22" output_id: 1 range: null @@ -415,6 +452,7 @@ tests: - hh: 3 mm: 33 nn: "am" + tz: "UTC" original_text: "3:33" output_id: 1 range: null @@ -426,6 +464,7 @@ tests: - hh: 4 mm: 44 nn: "am" + tz: "UTC" original_text: "4:44" output_id: 1 range: null @@ -437,6 +476,7 @@ tests: - hh: 5 mm: 55 nn: "pm" + tz: "UTC" original_text: "5:55" output_id: 1 range: null @@ -448,6 +488,7 @@ tests: - hh: 6 mm: 0 nn: "pm" + tz: "UTC" original_text: "6:00" output_id: 1 range: null @@ -459,6 +500,7 @@ tests: - hh: 3 mm: 0 nn: "pm" + tz: "UTC" original_text: "at 3" output_id: 1 range: null @@ -470,6 +512,7 @@ tests: - hh: 12 mm: 0 nn: "pm" + tz: "UTC" original_text: "at 12" output_id: 1 range: null @@ -481,6 +524,7 @@ tests: - hh: 3 mm: 0 nn: "am" + tz: "UTC" original_text: "after 3" output_id: 1 range: null @@ -492,6 +536,7 @@ tests: - hh: 4 mm: 0 nn: "am" + tz: "UTC" original_text: "by 4" output_id: 1 range: null @@ -503,6 +548,7 @@ tests: - hh: 5 mm: 0 nn: "pm" + tz: "UTC" original_text: "before 5" output_id: 1 range: null @@ -514,6 +560,7 @@ tests: - hh: 6 mm: 0 nn: "pm" + tz: "UTC" original_text: "exact 6" output_id: 1 range: null @@ -528,16 +575,144 @@ tests: hh: 12 mm: 30 nn: "am" + tz: "UTC" range: "start" time_type: null - original_text: "12:30 am to 2:30 pm" + original_text: "from 12:30 am to 2:30 pm" - output_id: 2 hh: 2 mm: 30 nn: "pm" + tz: "UTC" range: "end" time_type: null - original_text: "12:30 am to 2:30 pm" + original_text: "from 12:30 am to 2:30 pm" + - id: en_49 + message: "Sessions begin at noon" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 11 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "noon" + - output_id: 2 + hh: 5 + mm: 0 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "noon" + - id: en_50 + message: "Sessions begin at morning" + bot_message: null + range_enabled: true + outputs: + - output_id: 
1 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "morning" + - output_id: 2 + hh: 11 + mm: 0 + nn: "am" + tz: "UTC" + range: "end" + time_type: null + original_text: "morning" + - id: en_51 + message: "Sessions begin at evening" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 5 + mm: 0 + nn: "pm" + tz: "UTC" + range: "start" + time_type: null + original_text: "evening" + - output_id: 2 + hh: 9 + mm: 0 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "evening" + - id: en_52 + message: "Sessions begin at night" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 9 + mm: 0 + nn: "pm" + tz: "UTC" + range: "start" + time_type: null + original_text: "night" + - output_id: 2 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "end" + time_type: null + original_text: "night" + - id: en_53 + message: "Sessions begin at no particular preference" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "no particular preference" + - output_id: 2 + hh: 11 + mm: 59 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "no particular preference" + - id: en_54 + message: "10:00 to 14:00" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 10 + mm: 0 + nn: "hrs" + tz: "UTC" + range: "start" + time_type: null + original_text: "10:00 to 14:00" + - output_id: 2 + hh: 14 + mm: 0 + nn: "hrs" + tz: "UTC" + range: "end" + time_type: null + original_text: "10:00 to 14:00" hi: - id: hi_1 message: "सुबह 10 बजे" @@ -545,6 +720,7 @@ tests: - hh: 10 mm: 0 nn: "am" + tz: "UTC" original_text: "सुबह 10 बजे" output_id: 1 range: null diff --git a/requirements.txt b/requirements.txt index 6b3791001..0e474b44a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +phonenumberslite==8.10.18 six==1.11.0 gunicorn==19.6.0 
pytz==2014.2 @@ -6,7 +7,7 @@ numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -django==1.11.18 +Django==1.11.22 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 @@ -23,4 +24,4 @@ typing==3.6.2 flake8==3.4.1 pyaml==19.4.1 coverage==4.5.3 -nose-exclude==0.5.0 \ No newline at end of file +nose-exclude==0.5.0