From 1d6afb6c3d1414bd7400550d9ae6eb26e1686621 Mon Sep 17 00:00:00 2001 From: ranvijayj Date: Thu, 27 Jun 2019 12:36:41 +0530 Subject: [PATCH 001/237] Update Django version - Security Update Django version - Security Current version 1.11.18 has some vulnerabilities --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6b3791001..22e9a581a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -django==1.11.18 +django==1.11.19 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 @@ -23,4 +23,4 @@ typing==3.6.2 flake8==3.4.1 pyaml==19.4.1 coverage==4.5.3 -nose-exclude==0.5.0 \ No newline at end of file +nose-exclude==0.5.0 From 5a6b543a090e38d29ff075758cbdcb76ff74d9a3 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 15:58:15 +0530 Subject: [PATCH 002/237] Update docs for adding entities when installed with docker --- datastore/__init__.py | 2 +- datastore/datastore.py | 15 ++- datastore/elastic_search/__init__.py | 6 +- docs/adding_entities.md | 143 +++++++++++++++++++++------ 4 files changed, 128 insertions(+), 38 deletions(-) diff --git a/datastore/__init__.py b/datastore/__init__.py index ce4f4b817..49ca09de1 100644 --- a/datastore/__init__.py +++ b/datastore/__init__.py @@ -1 +1 @@ -from datastore import DataStore +from .datastore import DataStore diff --git a/datastore/datastore.py b/datastore/datastore.py index 33e7f59d3..adce577e6 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -126,7 +126,7 @@ def create(self, **kwargs): ) # FIXME: repopulate does not consider language of the variants - def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): + def populate(self, entity_data_directory_path=None, csv_file_paths=None, **kwargs): """ Populates the datastore from csv files stored in directory path 
indicated by entity_data_directory_path and from csv files at file paths in csv_file_paths list @@ -143,6 +143,11 @@ def populate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv All other exceptions raised by elasticsearch-py library """ + if not (entity_data_directory_path or csv_file_paths): + raise ValueError('Both `entity_data_directory_path` and `csv_file_paths` arguments cannot be None.' + 'Either provide a path to directory containing csv files using ' + '`entity_data_directory_path` or a list of paths to csv files ' + 'using `csv_file_paths`') if self._client_or_connection is None: self._connect() @@ -317,7 +322,7 @@ def delete_entity(self, entity_name, **kwargs): **kwargs) # FIXME: repopulate does not consider language of the variants - def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, csv_file_paths=None, **kwargs): + def repopulate(self, entity_data_directory_path=None, csv_file_paths=None, **kwargs): """ Deletes the existing data and repopulates it for entities from csv files stored in directory path indicated by entity_data_directory_path and from csv files at file paths in csv_file_paths list @@ -334,6 +339,12 @@ def repopulate(self, entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY, c DataStoreSettingsImproperlyConfiguredException if connection settings are invalid or missing All other exceptions raised by elasticsearch-py library """ + if not (entity_data_directory_path or csv_file_paths): + raise ValueError('Both `entity_data_directory_path` and `csv_file_paths` arguments cannot be None.' 
+ 'Either provide a path to directory containing csv files using ' + '`entity_data_directory_path` or a list of paths to csv files ' + 'using `csv_file_paths`') + if self._client_or_connection is None: self._connect() diff --git a/datastore/elastic_search/__init__.py b/datastore/elastic_search/__init__.py index 34654dbfb..5618d0c87 100644 --- a/datastore/elastic_search/__init__.py +++ b/datastore/elastic_search/__init__.py @@ -1,5 +1 @@ -import connect -import create -import populate -import query -import transfer +from . import connect, create, populate, query, transfer diff --git a/docs/adding_entities.md b/docs/adding_entities.md index b5aea3a1c..9904922b1 100644 --- a/docs/adding_entities.md +++ b/docs/adding_entities.md @@ -21,7 +21,7 @@ Following csv files are already included in the repository at `data/entity_data/ ----------- -Chatbot ner reads data from these csv files and puts them into the datastore under a entity named after the filename of the csv file. +Chatbot NER reads data from these csv files and puts them into the datastore under a entity named after the filename of the csv file. > *csv filename should contain only lowercase english alphabets and '_' (underscore) symbol* @@ -62,24 +62,37 @@ video,mp4|mkv|mov Now lets add the newly created csv file to the datastore. -- Make sure to start the engine you configured with datastore( eg. elasticsearch) +- Make sure our containers are running ```shell - $ ~/chatbot_ner_elasticsearch/elasticsearch-5.5.0/bin/elasticsearch -d + $ docker-compose ps ``` -- Activate chatbot_ner virtual environment + You should see output like following + + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... 
Up 9200/tcp, 9300/tcp + ``` + + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` + +- Enter the chatbot-ner container ```shell - $ source /usr/local/bin/virtualenvwrapper.sh - $ workon chatbotnervenv + $ docker exec -it docker_chatbot-ner_1 bash ``` - Start a `manage.py shell` as follows ```bash - $ # change to your repository clone directory - $ cd ~/chatbot_ner/ $ python manage.py shell ``` @@ -100,6 +113,10 @@ Now lets add the newly created csv file to the datastore. db = DataStore() db.populate(entity_data_directory_path=csv_directory) ``` + + > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mounted inside the container so the files will be available both inside and outside the container + +- Once done, you can exit the shell and then exit the container ### Updating the DataStore after editing a csv file @@ -109,25 +126,59 @@ After editing and saving your csv, you will need to update the datastore with ne > **Note:** The filename needs to be same as it was before editing the file. If the new data is saved under a different filename it would be populated as a new entity with the name same as new file name. -> Make sure you are working in chatbotnervenv virtual environment and datastore engine is running. See above section +- Make sure our containers are running + + ```shell + $ docker-compose ps + ``` + + You should see output like following -On a `manage.py shell` run + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... 
Up 9200/tcp, 9300/tcp + ``` -```python -from datastore import DataStore -csv_file = '~/attachment_types.csv' # example file path to the csv file -db = DataStore() -db.repopulate(csv_file_paths=[csv_file,]) -``` + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` - In case, you want to update multiple csv files at once, you can pass the directory path to `entity_data_directory_path` parameter of `repopulate` method as follows: +- Enter the chatbot-ner container -```python -from datastore import DataStore -csv_directory = '~/my_csv_files/' # example directory path containing csv files -db = DataStore() -db.repopulate(entity_data_directory_path=csv_directory) -``` + ```shell + $ docker exec -it docker_chatbot-ner_1 bash + ``` + +- Start a `manage.py shell` as follows + + ```bash + $ python manage.py shell + ``` + +- Now run the following: + + ```python + from datastore import DataStore + csv_file = '~/attachment_types.csv' # example file path to the csv file + db = DataStore() + db.repopulate(csv_file_paths=[csv_file,]) + ``` + + In case, you want to update multiple csv files at once, you can pass the directory path to `entity_data_directory_path` parameter of `repopulate` method as follows: + + ```python + from datastore import DataStore + csv_directory = '~/my_csv_files/' # example directory path containing csv files + db = DataStore() + db.repopulate(entity_data_directory_path=csv_directory) + ``` + + > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mounted inside the container so the files will be available both inside and outside the container ### Deleting entity data @@ -135,12 +186,44 @@ db.repopulate(entity_data_directory_path=csv_directory) To delete all data for entity, simply call `delete_entity()` on Datastore. It takes one argument- the name of the entity. 
This is the same as the name of the csv file used for this entity while populating its data. -> Make sure you are working in chatbotnervenv virtual environment and datastore engine is running. See above section +- Make sure our containers are running + + ```shell + $ docker-compose ps + ``` -On a `manage.py shell` run + You should see output like following -```python -from datastore import DataStore -db = DataStore() -db.delete_entity(entity_name='attachment_types') -``` + ``` + Name Command State Ports + ------------------------------------------------------------------------------------------------ + docker_chatbot-ner_1 /bin/sh -c /app/docker/cmd.sh Up 0.0.0.0:8081->80/tcp, 8081/tcp + docker_elasticsearch_1 /docker-entrypoint.sh elas ... Up 9200/tcp, 9300/tcp + ``` + + > If the containers are not running, do the following + > + > ```shell + > $ cd chatbot_ner/docker + > $ docker-compose up -d + > ``` + +- Enter the chatbot-ner container + + ```shell + $ docker exec -it docker_chatbot-ner_1 bash + ``` + +- Start a `manage.py shell` as follows + + ```bash + $ python manage.py shell + ``` + +- Now run the following + + ```python + from datastore import DataStore + db = DataStore() + db.delete_entity(entity_name='attachment_types') + ``` \ No newline at end of file From 07657b6c664753a05bd9f5f7dbc4792d7d6aa654 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 17:17:15 +0530 Subject: [PATCH 003/237] Correct all curl calls in api_call, make sure they work and make them readable --- docs/adding_entities.md | 12 +- docs/api_call.md | 845 +++++++++++++++++++++++----------------- models/crf_v2/README.md | 12 +- 3 files changed, 505 insertions(+), 364 deletions(-) diff --git a/docs/adding_entities.md b/docs/adding_entities.md index 9904922b1..fd750ffac 100644 --- a/docs/adding_entities.md +++ b/docs/adding_entities.md @@ -100,7 +100,7 @@ Now lets add the newly created csv file to the datastore. 
```python from datastore import DataStore - csv_file = '~/attachment_types.csv' # example file path to the csv file + csv_file = 'data/entity_data/city.csv' # example file path to the csv file db = DataStore() db.populate(csv_file_paths=[csv_file,]) ``` @@ -109,7 +109,7 @@ Now lets add the newly created csv file to the datastore. ```python from datastore import DataStore - csv_directory = '~/my_csv_files/' # example directory path containing csv files + csv_directory = 'data/entity_data/' # example directory path containing csv files db = DataStore() db.populate(entity_data_directory_path=csv_directory) ``` @@ -164,7 +164,7 @@ After editing and saving your csv, you will need to update the datastore with ne ```python from datastore import DataStore - csv_file = '~/attachment_types.csv' # example file path to the csv file + csv_file = 'data/entity_data/city.csv' # example file path to the csv file db = DataStore() db.repopulate(csv_file_paths=[csv_file,]) ``` @@ -173,7 +173,7 @@ After editing and saving your csv, you will need to update the datastore with ne ```python from datastore import DataStore - csv_directory = '~/my_csv_files/' # example directory path containing csv files + csv_directory = 'data/entity_data/' # example directory path containing csv files db = DataStore() db.repopulate(entity_data_directory_path=csv_directory) ``` @@ -220,10 +220,10 @@ To delete all data for entity, simply call `delete_entity()` on Datastore. It ta $ python manage.py shell ``` -- Now run the following +- Now run the following (E.g. 
to delete `city` entity) ```python from datastore import DataStore db = DataStore() - db.delete_entity(entity_name='attachment_types') + db.delete_entity(entity_name='city') ``` \ No newline at end of file diff --git a/docs/api_call.md b/docs/api_call.md index 10ea71641..e5b1de385 100644 --- a/docs/api_call.md +++ b/docs/api_call.md @@ -177,34 +177,50 @@ Currently time detection support has been provided in different languages - `Eng - *CURL command* ```bash - URL='localhost' - PORT=8081 + curl -G -i "http://localhost:8081/v2/time/?&entity_name=time&timezone=UTC&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=John arrived at the bus stop at 13:50 hrs, expecting the bus to be there in 15 mins. But the bus was scheduled for 12:30 pm" +``` + +> **Output**: - curl -i 'http://'$URL':'$PORT'/v2/time/?message=John%20arrived%20at%20the%20bus%20stop%20at%2013%3A50%20hrs%2C%20expecting%20the%20bus%20to%20be%20there%20in%2015%20mins.%20But%20the%20bus%20was%20scheduled%20for%2012%3A30%20pm&entity_name=time&structured_value=&fallback_value=&bot_message=&timezone=UTC&source_language=en' - ``` - - > **Output**: - ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "12:30 pm", - "entity_value": { "mm": 30, "hh": 12, "nn": "pm"}, - "language": "en" + "detection": "message", + "original_text": "12:30 pm", + "entity_value": { + "mm": 30, + "hh": 12, + "nn": "pm" + }, + "language": "en" }, { - "detection": "message", - "original_text": "in 15 mins", - "entity_value": { "mm": "15", "hh": 0, "nn": "df" }, - "language": "en" + "detection": "message", + "original_text": "in 15 mins", + "entity_value": { + "mm": 15, + "hh": 0, + "nn": "df" + }, + "language": "en" }, { - "detection": "message", - "original_text": "13:50", - "entity_value": {"mm": 50, "hh": 13, "nn": "hrs"}, - "language": "en" - }]} + "detection": "message", + "original_text": 
"13:50", + "entity_value": { + "mm": 50, + "hh": 13, + "nn": "hrs" + }, + "language": "en" + } + ] + } ``` @@ -234,39 +250,53 @@ Currently time detection support has been provided in different languages - `Eng - *CURL command* ```bash - URL='localhost' - PORT=8081 + curl -G -i "http://localhost:8081/v2/time/?&entity_name=time&timezone=UTC&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=राजू का बस १३:५० को बस स्टॉप से निकला और १५ मिनट में यहाँ पहुंच जाएगा और गोवा को शाम में बारह बजकर ३० मिनट पैर पहुंचेगा" + ``` - curl -i 'http://'$URL':'$PORT'/v2/time/?message=राजू%20का%20बस%20१३:५०%20को%20बस%20स्टॉप%20से%20निकला%20और%20१५%20मिनट%20में%20यहाँ%20पहुंच%20जाएगा%20और%20गोवा%20को%20शाम%20में%20बारह%20बजकर%20३०%20मिनट%20पैर%20पहुंचेगा&entity_name=time&structured_value=&fallback_value=&bot_message=&timezone=UTC&source_language=en' +> **Output**: - ``` - - > **Output**: - - ```json - {"data": [ +```json + { + "data": [ { - "detection": "message", - "original_text": "१३:५०", - "entity_value": { "mm": 1, "hh": 50,"nn": "hr"}, - "language": "hi" + "detection": "message", + "original_text": "१३:५०", + "entity_value": { + "mm": 50, + "hh": 13, + "nn": "hrs", + "time_type": null + }, + "language": "hi" }, { - "detection": "message", - "original_text": "१५ मिनट में", - "entity_value": {"mm": "15", "hh": 0, "nn": "df"}, + "detection": "message", + "original_text": "१५ मिनट में", + "entity_value": { + "mm": 15, + "hh": 0, + "nn": "df" + }, "language": "hi" }, - { - "detection": "message", - "original_text": "शाम में बारह बजकर ३० मिनट", - "entity_value": { "mm": 30, "hh": 12, "nn": "pm"}, - "language": "hi" - }] + { + "detection": "message", + "original_text": "बारह बजकर ३० मिनट", + "entity_value": { + "mm": 30, + "hh": 12, + "nn": "hrs" + }, + "language": "hi" + } + ] } - ``` - + ### 2. 
Date @@ -316,28 +346,38 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/date/?message=set%20me%20reminder%20on%2023rd%20december&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=set me reminder on 23rd december" ``` > **Output:** ```json - {"data": [ + { + "data": [ { - "detection": "message", + "detection": "message", "original_text": "23rd december", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 12, "yy": 2017, "dd": 23, "type": "date"} - }, - "language": "en" - }]} - + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 12, + "yy": 2019, + "dd": 23, + "type": "date" + }, + "to": false, + "start_range": false + } + } + ] + } ``` - ***Example 2: Detecting referenced date [Hindi] from user message*** @@ -369,28 +409,40 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/date/?message=मुझे%20कल%20सुबह%20५%20बजे%20उठा%20देना&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&past_date_referenced=false&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मुझे कल सुबह ५ बजे उठा देना" ``` > **Output:** ```json - /* 
Assuming today's date is 12th feb 2019*/ - {"data": [ + /* Assuming today's date is 27 June 2019*/ + { + "data": [ { - "detection": "message", + "detection": "message", "original_text": "कल", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 02, "yy": 2019, "dd": 13, "type": "date"} - }, - "language": "en" - }]} + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 6, + "yy": 2019, + "dd": 28, + "type": "date" + }, + "to": false, + "start_range": false + }, + "language": "hi" + } + ] + } ``` @@ -423,29 +475,40 @@ The Date detector module has the capability to detect various form of dates from - *CURL:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/date/?message=आने%20वाले%20सोमवार%20को%20मेरा%20मैथ्स%20का%20एग्जाम%20है&entity_name=date&structured_value=&fallback_value=&bot_message=%timezone=UTC&source_language=en&past_date_referenced=false' - + ```bash + curl -G -i "http://localhost:8081/v2/date/?&entity_name=date&timezone=UTC&past_date_referenced=false&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=आने वाले सोमवार को मेरा मैथ्स का एग्जाम है" ``` > **Output:** ```json - /* Assuming today's date is 12th feb 2019*/ - {"data": [ + /* Assuming today's date is 27 June 2019*/ + { + "data": [ { - "detection": "message", - "original_text": "कल", - "entity_value": { "end_range": false, "from": false, "normal": true, "to": - false, "start_range": false, - "value": {"mm": 02, "yy": 2019, "dd": 18, "type": "date"} - }, - "language": "en" - }]} - + "detection": "message", + "original_text": "सोमवार", + "entity_value": { + "end_range": false, + "from": false, + "normal": true, + "value": { + "mm": 7, + "yy": 2019, + "dd": 1, + "type": "date" + }, + "to": false, + "start_range": false + }, + "language": "hi" + } 
+ ] + } ``` ### 3. Number @@ -468,7 +531,7 @@ Currently number detection support has been provided for 6 different languages - ```python # For a sample query with following parameters - message=u"i want to purchase 30 units of mobile abd 40 units of telivision" + message=u"i want to purchase 30 units of mobile abd 40 units of television" entity_name='number' structured_value=None fallback_value=None @@ -489,29 +552,38 @@ Currently number detection support has been provided for 6 different languages - - *CURL command:* - ```shell - URL='localhost' - PORT=8081 + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want to purchase 30 units of mobile abd 40 units of television" + ``` + + > **Output:** - curl -i 'http://'$URL':'$PORT'/v2/number/?message=I%20want%20to%20purchase%2030%20units%20of%20mobile%20and%2040%20units%20of%20Television&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' - ``` - - > **Output:** - ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "30", - "entity_value": { "value": "30", "unit": null}, - "language": "en" + "detection": "message", + "original_text": "30", + "entity_value": { + "unit": null, + "value": "30" + }, + "language": "en" }, { "detection": "message", "original_text": "40", - "entity_value": { "value": "40", "unit": null}, + "entity_value": { + "unit": null, + "value": "40" + }, "language": "en" - }] + } + ] } ``` @@ -521,7 +593,7 @@ Currently number detection support has been provided for 6 different languages - ```python # For a sample query with following parameters - message=u"मुझे ३० किलो आटा और दो हजार का चीनी देना " + message=u"मुझे ३० रूपए आटा का और ३ 
हजार का चीनी देना" entity_name='number' structured_value=None fallback_value=None @@ -538,31 +610,43 @@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message,structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - + - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?मुझे%20३०%20किलो%20आटा%20और%20दो%20हजार%20क%20%20चीनी%20देना &entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' - + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मुझे ३० रूपए आटा का और ३ हजार का चीनी देना" ``` > **Output:** ```json - {"data": [ + { + "data": [ { "detection": "message", - "original_text": "३० किलो", - "entity_value": { "value": "३०", "unit": "kg"}, + "original_text": "३०", + "entity_value": { + "unit": null, + "value": "30" + }, "language": "hi" - }] + }, + { + "detection": "message", + "original_text": "३ हजार", + "entity_value": { + "unit": null, + "value": "3000" + }, + "language": "hi" + } + ] } - ``` - ***Example 3: Detecting number[Hindi in latin script] without unit in message*** @@ -590,43 +674,52 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - + - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?mujhe%2030%20kilo%20aata%20aur%202%20hajaar%20ka%20chini%20dena%20aur%20 
teen%20sau%20ka%20chawal&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=' - + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=&source_language=hi" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=mujhe 30 kilo aata aur 2 hajaar ka chini dena aur teen sau ka chawal" ``` > **Output:** ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "30", - "entity_value": { "value": "30", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "30", + "entity_value": { + "unit": null, + "value": "30" + }, + "language": "hi" }, { - "detection": "message", - "original_text": "2 hajaar", - "entity_value": { "value": "2000", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "2 hajaar", + "entity_value": { + "unit": null, + "value": "2000" + }, + "language": "hi" }, { - "detection": "message", - "original_text": "teen sau", - "entity_value": { "value": "300", "unit": null}, - "language": "hi" + "detection": "message", + "original_text": "teen sau", + "entity_value": { + "unit": null, + "value": "300" + }, + "language": "hi" } - ]} - + ] + } ``` @@ -654,35 +747,35 @@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message,structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/number/?message=i%20want%20more%20than%20Rupees%2020k%20and%2010%20pendrive&entity_name=number_of_unit&structured_value=&fallback_value=&bot_message=&min_number_digits=1&max_number_digits=2&source_language=en&unit_type=currency' +- *CURL 
command:* + + ```bash + curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=currency&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want more than Rupees 20k and 10 pendrive" ``` - - > **Output:** - - ```json - {"data": [ + + > **Output:** + + ```json + { + "data": [ { - "detection": "message", - "original_text": "Rupees 20k", - "entity_value": { - "value": "20000", - "unit": "rupees" - }, - "language": "en" - }] + "detection": "message", + "original_text": "rupees 20k", + "entity_value": { + "unit": "rupees", + "value": "20000" + }, + "language": "en" + } + ] } - /* here 40 is not detected as unit_type is specified as currency, Hence it only detect numbers having currencies value in unit */ - + /* here 10 is not detected because the specified unit_type is currency; hence only numbers with a currency mentioned as their unit are detected */ ``` ### 4. 
Phone number @@ -712,37 +805,39 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/phone_number/?message=my%20contact%20number%20is%209049961794&entity_name=phone_number&structured_value=&fallback_value=&bot_message=&source_language=en' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=send a message on 91 9820334455" ``` - - > **Output **: - - ```json - {"data": [ + + > **Output **: + + ```json + { + "data": [ { - "detection": "message", - "original_text": "9049961794", - "entity_value": { "value": "9049961794"}, - "language": "en" - }] - } - + "detection": "message", + "original_text": "91 9820334455", + "entity_value": { + "value": "919820334455" + }, + "language": "en" + } + ] + } ``` - **Example 2: *Detecting phone number (hindi) from message*** - *Django Shell:* - + ```python message = u'मेरा मोबाइल नंबर है ९८९१९८९८७१' entity_name = 'phone_number' @@ -758,37 +853,39 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) + ``` - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/phone_number/?message=मेरा%20मोबाइल%20नंबर%20है%20९८९१९८९८७१entity_name=phone_number&structured_value=&fallback_value=&bot_message=&source_language=en' - - ``` - - > **Output **: - + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=hi" \ + --data-urlencode "structured_value=" \ + 
--data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मेरा मोबाइल नंबर है ९८९१९८९८७१" + ``` + + > **Output **: + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "९८९१९८९८७१", - "entity_value": { "value": "981117971"}, - "language": "hi" - }] - } - + "detection": "message", + "original_text": "९८९१९८९८७१", + "entity_value": { + "value": "9891989871" + }, + "language": "hi" + } + ] + } ``` - - Example 2: *Detecting phone number from fallback value*** - + - Example 2: *Detecting phone number from **fallback value*** + - *Django Shell:* - + ```python message = u'Please call me' entity_name = 'phone_number' @@ -802,33 +899,35 @@ Currently number detection support has been provided for 6 different languages - output = detector.detect(message=message, entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, - bot_message=bot_message,language=source_language) + bot_message=bot_message,language=source_language) print(output) + ``` - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v2/phone_number/?message=Please%20call%20me&entity_name=phone_number&structured_value=&fallback_value=9049961794&bot_message=&source_language=en' - - ``` - - > **Output **: - - ```json - {"data": [ + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=9049961794" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=Please call me" + ``` + + > **Output **: + + ```json + { + "data": [ { - "detection": "fallback_value", - "original_text": "9049961794", - "entity_value": {"value": "9049961794"}, - "language": "en" - }] - } - + "detection": "fallback_value", + "original_text": "9049961794", + "entity_value": { + "value": "9049961794" + }, + "language": "en" + } + ] + } 
``` @@ -838,89 +937,97 @@ Currently number detection support has been provided for 6 different languages - The Email Detector has the capability to detect emails within the given text. **API Example:** - + - **Example 1: *Detecting emails from message*** - + - *Django Shell:* - + ```python - message = u'my email id is amans.rlx@gmail.com' + message = u'my email id is hello@haptik.ai' entity_name = 'email' structured_value = None fallback_value = None bot_message = None from ner_v1.chatbot.entity_detection import get_email - output = get_email(message=message,entity_name=entity_name, + output = get_email(message=message,entity_name=entity_name, structured_value=structured_value, - fallback_value=fallback_value, bot_message=bot_message) + fallback_value=fallback_value, bot_message=bot_message) print(output) - ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/email/?message=my%20email%20id%20is%20amans.rlx%40gmail.com&entity_name=email&structured_value=&fallback_value=&bot_message=' + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v1/email/?&entity_name=email&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=my email id is hello@haptik.ai" ``` - + > **Output ** - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "amans.rlx@gmail.com", - "entity_value": {"value": "amans.rlx@gmail.com"} - }] + "detection": "message", + "original_text": "hello@haptik.ai", + "entity_value": { + "value": "hello@haptik.ai" + }, + "language": "en" + } + ] } - ``` - + - ***Example 2: Detecting email from fallback value*** - + - *Django Shell:* - + ```python message = u'send this me to my email' entity_name = 'email' structured_value = None - fallback_value = 'amans.rlx@gmail.com' + fallback_value = 'hello@haptik.ai' bot_message = None - + from 
ner_v1.chatbot.entity_detection import get_email output = get_email(message=message,entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print(output) ``` - - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/email/?message=send%20me%20to%20my%20email&entity_name=email&structured_value=&fallback_value=amans.rlx@gmail.com&bot_message=' - + + - *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v1/email/?&entity_name=email&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=hello@haptik.ai" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=send this me to my email" ``` - + > **Output ** - - ```json - {"data": [ + + ```json + { + "data": [ { - "detection": "fallback_value", - "original_text": "abc.123@gmail.com", - "entity_value": {"value": "abc.123@gmail.com"} - }] + "detection": "fallback_value", + "original_text": "hello@haptik.ai", + "entity_value": { + "value": "hello@haptik.ai" + }, + "language": "en" + } + ] } ``` - + ### 6. Text @@ -949,40 +1056,67 @@ The Text Detector has the capability to detect custom text entity within the giv fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` - - The above can also be done from within the Docker container's shell. Setup is in docker.md file. 
- - - *CURL command:* - - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/text/?message=i%20want%20to%20order%20chinese%20from%20%20mainland%20china%20and%20pizza%20from%20domminos&entity_name=restaurant&structured_value=&fallback_value=&bot_message=&source_language=en' - + + *CURL command:* + + ```bash + curl -G -i "http://localhost:8081/v1/text/?&entity_name=restaurant&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=i want to order chinese from mainland china and pizza from dominos" ``` - + > **Output **: - - ```json - {"data": [ + + ```json + { + "data": [ { - "detection": "message", - "original_text": "mainland china", - "entity_value": {"value": "Mainland China"}, - "language": "en" + "detection": "message", + "original_text": "mainland china", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Mainland China" + }, + "language": "en" }, { - "detection": "message", - "original_text": "dominos", - "entity_value": { "value": "Domino's Pizza"}, - "language": "en" - }] + "detection": "message", + "original_text": "dominos", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Domino's Pizza" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "chinese", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "Yo! 
Chinese" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "pizza", + "entity_value": { + "crf_model_verified": false, + "datastore_verified": true, + "value": "U S Pizza" + }, + "language": "en" + } + ] } - ``` + @@ -1009,26 +1143,29 @@ The Text Detector has the capability to detect custom text entity within the giv - *CURL command:* - ```shell - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/text/?message=मेरे लिए कैब बुक कर दीजिये&entity_name=movie&structured_value=मुंबई&fallback_value=&bot_message=&source_language=en' - + ```bash + curl -G -i "http://localhost:8081/v1/text/?&entity_name=movie&source_language=hi" \ + --data-urlencode "structured_value=मुंबई" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=मेरे लिए कैब बुक कर दीजिये" ``` > **Output **: ```json - {"data": [ + { + "data": [ { - "detection": "structure_value_verified", - "original_text": "mumbai", - "entity_value": {"value": "Mumbai"}, - "language":"hi" - }] + "detection": "structure_value_not_verified", + "original_text": "मुंबई", + "entity_value": { + "value": "मुंबई" + }, + "language": "hi" + } + ] } - ``` @@ -1061,24 +1198,30 @@ The PNR Detector has the capability to detect Train/ Flight PNR number within th - *CURL command:* ```bash - URL='localhost' - PORT=8081 + curl -G -i "http://localhost:8081/v1/pnr/?&entity_name=pnr&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "message=check my pnr status for 2141215305" +``` + +> **Output**: - curl -i 'http://'$URL':'$PORT'/v1/pnr/?message=check%20my%20pnr%20status%20for%202141215305.&entity_name=pnr&structured_value=&fallback_value=&bot_message=' - ``` - - > **Output**: - ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "2141215305", - "entity_value": { "value": "2141215305"} - }] + "detection": "message", 
+ "original_text": "2141215305", + "entity_value": { + "value": "2141215305" + }, + "language": "en" + } + ] } ``` - + ### 8. Regex @@ -1117,20 +1260,26 @@ Detect entities that match by the specified pattern. If you are not familiar wit - *CURL command:* ```bash - URL='localhost' - PORT=8081 - - curl -i 'http://'$URL':'$PORT'/v1/regex/?message=please%20apply%20AMAZON30%20coupon%20code%20to my%20cart&entity_name=regex&structured_value=&fallback_value=&bot_message=enter%20the%otp%20®ex=\d{4,6}' + curl -G -i "http://localhost:8081/v1/regex/?&entity_name=regex_coupon_code&source_language=en" \ + --data-urlencode "structured_value=" \ + --data-urlencode "fallback_value=" \ + --data-urlencode "bot_message=" \ + --data-urlencode "regex=[A-Z]+\d{2,6}" \ + --data-urlencode "message=please apply AMAZON30 coupon code to my cart" ``` > **Output:** - + ```json - {"data": [ + { + "data": [ { - "detection": "message", - "original_text": "AMAZON30", - "entity_value": "AMAZON30" - }] + "detection": "message", + "original_text": "AMAZON30", + "entity_value": { + "value": "AMAZON30" + } + } + ] } ``` diff --git a/models/crf_v2/README.md b/models/crf_v2/README.md index 2b08170af..747c43cb2 100644 --- a/models/crf_v2/README.md +++ b/models/crf_v2/README.md @@ -1,14 +1,6 @@ - -TODO -- [ ] Change Crf -> CRF - - - - ## CONDITIONAL RANDOM FIELDS - ### A. INTRODUCTION Conditional random fields (CRFs) are a class of statistical modeling method often applied in pattern recognition and machine learning and used for structured prediction. CRFs fall into the sequence modeling family. Whereas a discrete classifier predicts a label for a single sample without considering "neighboring" samples, a CRF can take context into account; e.g., the linear chain CRF (which is popular in natural language processing) predicts sequences of labels for sequences of input samples. @@ -195,11 +187,11 @@ The module is used to take input as the sentence_list and entity_list and conver 2. 
**isupper** - Flag to check if the first letter of the token is capitalized + Flag to check if the complete token is in upper case 3. **istitle** - Flag to check if the complete token is in upper case + Flag to check if the first letter of the token is capitalized 4. **isdigit** From a04d6e8238580bdb53f85bf5410add409ef7e75f Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 17:32:36 +0530 Subject: [PATCH 004/237] Fix typo --- docs/adding_entities.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/adding_entities.md b/docs/adding_entities.md index fd750ffac..cb05d7e58 100644 --- a/docs/adding_entities.md +++ b/docs/adding_entities.md @@ -114,7 +114,7 @@ Now lets add the newly created csv file to the datastore. db.populate(entity_data_directory_path=csv_directory) ``` - > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mouted inside the container so the files will available bot inside and outside the container + > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mouted inside the container so the files will available both inside and outside the container - Once done, you can exit the shell and then exit the container @@ -178,7 +178,7 @@ After editing and saving your csv, you will need to update the datastore with ne db.repopulate(entity_data_directory_path=csv_directory) ``` - > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. chatbot_ner/data/entity_data/) because the repo is mouted inside the container so the files will available bot inside and outside the container + > Note: It is advised that you put the csv files inside some directory in the repo. (E.g. 
chatbot_ner/data/entity_data/) because the repo is mouted inside the container so the files will available both inside and outside the container ### Deleting entity data From e94ba684a0cdbd615a4dc09af23958bd32052696 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 17:37:09 +0530 Subject: [PATCH 005/237] Update initial_setup script --- datastore/datastore.py | 2 +- initial_setup.py | 29 +++++++++++++++-------------- ner_v1/tests/textual/__init__.py | 0 3 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 ner_v1/tests/textual/__init__.py diff --git a/datastore/datastore.py b/datastore/datastore.py index adce577e6..715a45be8 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -4,7 +4,7 @@ from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import elastic_search -from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, DEFAULT_ENTITY_DATA_DIRECTORY, +from datastore.constants import (ELASTICSEARCH, ENGINE, ELASTICSEARCH_INDEX_NAME, ELASTICSEARCH_DOC_TYPE, ELASTICSEARCH_CRF_DATA_INDEX_NAME, ELASTICSEARCH_CRF_DATA_DOC_TYPE) from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException, diff --git a/initial_setup.py b/initial_setup.py index 3b93ae708..2cd2b6533 100755 --- a/initial_setup.py +++ b/initial_setup.py @@ -5,25 +5,25 @@ BASE_DIR = os.path.dirname(__file__) -print "Downloading nltk corpus: punkt ..." +print("Downloading nltk corpus: punkt ...") status = nltk.download('punkt') if not status: - print "punkt Download was unsucessful" + print("punkt Download was unsucessful") -print "Downloading nltk corpus: wordnet ..." +print("Downloading nltk corpus: wordnet ...") status = nltk.download('wordnet') if not status: - print "wordnet Download was unsucessful" + print("wordnet Download was unsucessful") -print "Downloading nltk corpus: MaxEnt POS ..." 
+print("Downloading nltk corpus: MaxEnt POS ...") status = nltk.download('maxent_treebank_pos_tagger') if not status: - print "MaxEnt POS Download was unsucessful" + print("MaxEnt POS Download was unsucessful") -print "Downloading nltk corpus: AP POS Tagger..." +print("Downloading nltk corpus: AP POS Tagger...") status = nltk.download('averaged_perceptron_tagger') if not status: - print "AP POS Tagger Download was unsucessful" + print("AP POS Tagger Download was unsucessful") # Below needs to be committed if you want to use existing data in the Elasticsearch Setup @@ -34,13 +34,14 @@ # POPULATING DATASTORE # Comment out entire section if you want to reuse existing data from datastore import DataStore +from datastore.constants import DEFAULT_ENTITY_DATA_DIRECTORY db = DataStore() -print "Setting up DataStore for Chatbot NER" -print "Deleting any stale data ..." +print("Setting up DataStore for Chatbot NER") +print("Deleting any stale data ...") db.delete() -print "Creating the structure ..." +print("Creating the structure ...") db.create() -print "Populating data from " + os.path.join(BASE_DIR, 'data', 'entity_data') + " ..." -db.populate() -print "Done!" 
+print("Populating data from " + os.path.join(BASE_DIR, 'data', 'entity_data') + " ...") +db.populate(entity_data_directory_path=DEFAULT_ENTITY_DATA_DIRECTORY) +print("Done!") diff --git a/ner_v1/tests/textual/__init__.py b/ner_v1/tests/textual/__init__.py new file mode 100644 index 000000000..e69de29bb From 65f9154b8294a0ebdd560a74e20b1bdd84a76321 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 17:53:00 +0530 Subject: [PATCH 006/237] Fix formatting for code blocks --- docs/api_call.md | 88 ++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/docs/api_call.md b/docs/api_call.md index e5b1de385..65d26fe8c 100644 --- a/docs/api_call.md +++ b/docs/api_call.md @@ -182,9 +182,9 @@ Currently time detection support has been provided in different languages - `Eng --data-urlencode "fallback_value=" \ --data-urlencode "bot_message=" \ --data-urlencode "message=John arrived at the bus stop at 13:50 hrs, expecting the bus to be there in 15 mins. 
But the bus was scheduled for 12:30 pm" -``` + ``` -> **Output**: + > **Output**: ```json { @@ -257,9 +257,9 @@ Currently time detection support has been provided in different languages - `Eng --data-urlencode "message=राजू का बस १३:५० को बस स्टॉप से निकला और १५ मिनट में यहाँ पहुंच जाएगा और गोवा को शाम में बारह बजकर ३० मिनट पैर पहुंचेगा" ``` -> **Output**: + > **Output**: -```json + ```json { "data": [ { @@ -558,9 +558,9 @@ Currently number detection support has been provided for 6 different languages - --data-urlencode "fallback_value=" \ --data-urlencode "bot_message=" \ --data-urlencode "message=i want to purchase 30 units of mobile abd 40 units of television" - ``` + ``` - > **Output:** + > **Output:** ```json { @@ -749,9 +749,9 @@ Currently number detection support has been provided for 6 different languages - print(output) ``` -- *CURL command:* + - *CURL command:* - ```bash + ```bash curl -G -i "http://localhost:8081/v2/number/?&entity_name=number_of_unit&min_number_digits=1&max_number_digits=6&unit_type=currency&source_language=en" \ --data-urlencode "structured_value=" \ --data-urlencode "fallback_value=" \ @@ -759,9 +759,9 @@ Currently number detection support has been provided for 6 different languages - --data-urlencode "message=i want more than Rupees 20k and 10 pendrive" ``` - > **Output:** + > **Output:** - ```json + ```json { "data": [ { @@ -807,19 +807,19 @@ Currently number detection support has been provided for 6 different languages - print(output) ``` - - *CURL command:* + - *CURL command:* - ```bash - curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ + ```bash + curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ --data-urlencode "structured_value=" \ --data-urlencode "fallback_value=" \ --data-urlencode "bot_message=" \ --data-urlencode "message=send a message on 91 9820334455" ``` - > **Output **: + > **Output: ** - ```json + ```json { "data": [ { 
@@ -831,7 +831,7 @@ Currently number detection support has been provided for 6 different languages - "language": "en" } ] - } + } ``` - **Example 2: *Detecting phone number (hindi) from message*** @@ -844,7 +844,7 @@ Currently number detection support has been provided for 6 different languages - structured_value = None fallback_value = None bot_message = None - source_langauge='hi' # here language will be ISO 639-1 code + source_langauge='hi' # here language will be ISO 639-1 code from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector detector = PhoneDetector(language=source_langauge, entity_name=entity_name) @@ -853,9 +853,9 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` + ``` - - *CURL command:* + - *CURL command:* ```bash curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=hi" \ @@ -863,9 +863,9 @@ Currently number detection support has been provided for 6 different languages - --data-urlencode "fallback_value=" \ --data-urlencode "bot_message=" \ --data-urlencode "message=मेरा मोबाइल नंबर है ९८९१९८९८७१" - ``` + ``` - > **Output **: + > **Output **: ```json { @@ -877,9 +877,9 @@ Currently number detection support has been provided for 6 different languages - "value": "9891989871" }, "language": "hi" - } + } ] - } + } ``` - Example 2: *Detecting phone number from **fallback value*** @@ -901,21 +901,21 @@ Currently number detection support has been provided for 6 different languages - fallback_value=fallback_value, bot_message=bot_message,language=source_language) print(output) - ``` + ``` - - *CURL command:* + - *CURL command:* - ```bash + ```bash curl -G -i "http://localhost:8081/v2/phone_number/?&entity_name=phone_number&source_language=en" \ --data-urlencode "structured_value=" \ --data-urlencode "fallback_value=9049961794" \ - --data-urlencode 
"bot_message=" \ + --data-urlencode "bot_message=" \ --data-urlencode "message=Please call me" - ``` + ``` - > **Output **: + > **Output **: - ```json + ```json { "data": [ { @@ -925,9 +925,9 @@ Currently number detection support has been provided for 6 different languages - "value": "9049961794" }, "language": "en" - } + } ] - } + } ``` @@ -950,9 +950,9 @@ Currently number detection support has been provided for 6 different languages - bot_message = None from ner_v1.chatbot.entity_detection import get_email - output = get_email(message=message,entity_name=entity_name, + output = get_email(message=message,entity_name=entity_name, structured_value=structured_value, - fallback_value=fallback_value, bot_message=bot_message) + fallback_value=fallback_value, bot_message=bot_message) print(output) ``` @@ -991,7 +991,7 @@ Currently number detection support has been provided for 6 different languages - message = u'send this me to my email' entity_name = 'email' structured_value = None - fallback_value = 'hello@haptik.ai' + fallback_value = 'hello@haptik.ai' bot_message = None from ner_v1.chatbot.entity_detection import get_email @@ -1001,9 +1001,9 @@ Currently number detection support has been provided for 6 different languages - print(output) ``` - - *CURL command:* + - *CURL command:* - ```bash + ```bash curl -G -i "http://localhost:8081/v1/email/?&entity_name=email&source_language=en" \ --data-urlencode "structured_value=" \ --data-urlencode "fallback_value=hello@haptik.ai" \ @@ -1013,7 +1013,7 @@ Currently number detection support has been provided for 6 different languages - > **Output ** - ```json + ```json { "data": [ { @@ -1058,9 +1058,9 @@ The Text Detector has the capability to detect custom text entity within the giv print(output) ``` - *CURL command:* + *CURL command:* - ```bash + ```bash curl -G -i "http://localhost:8081/v1/text/?&entity_name=restaurant&source_language=en" \ --data-urlencode "structured_value=" \ --data-urlencode "fallback_value=" \ @@ -1068,9 
+1068,9 @@ The Text Detector has the capability to detect custom text entity within the giv --data-urlencode "message=i want to order chinese from mainland china and pizza from dominos" ``` - > **Output **: + > **Output **: - ```json + ```json { "data": [ { @@ -1203,9 +1203,9 @@ The PNR Detector has the capability to detect Train/ Flight PNR number within th --data-urlencode "fallback_value=" \ --data-urlencode "bot_message=" \ --data-urlencode "message=check my pnr status for 2141215305" -``` + ``` -> **Output**: + > **Output**: ```json { From 4f74aeb039ea2a1467890a19bea3d37419ce460e Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2019 18:10:48 +0530 Subject: [PATCH 007/237] Fix typo --- initial_setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/initial_setup.py b/initial_setup.py index 2cd2b6533..6f41fc62f 100755 --- a/initial_setup.py +++ b/initial_setup.py @@ -8,22 +8,22 @@ print("Downloading nltk corpus: punkt ...") status = nltk.download('punkt') if not status: - print("punkt Download was unsucessful") + print("punkt Download was unsuccessful") print("Downloading nltk corpus: wordnet ...") status = nltk.download('wordnet') if not status: - print("wordnet Download was unsucessful") + print("wordnet Download was unsuccessful") print("Downloading nltk corpus: MaxEnt POS ...") status = nltk.download('maxent_treebank_pos_tagger') if not status: - print("MaxEnt POS Download was unsucessful") + print("MaxEnt POS Download was unsuccessful") print("Downloading nltk corpus: AP POS Tagger...") status = nltk.download('averaged_perceptron_tagger') if not status: - print("AP POS Tagger Download was unsucessful") + print("AP POS Tagger Download was unsuccessful") # Below needs to be committed if you want to use existing data in the Elasticsearch Setup From ee695a4f14aafa1e1e437a1aac65262f87d28794 Mon Sep 17 00:00:00 2001 From: ranvijayj Date: Fri, 5 Jul 2019 18:11:22 +0530 Subject: [PATCH 008/237] Django version upgrade Django 
version upgrade django==1.11.20 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 22e9a581a..fbb8f3f81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -django==1.11.19 +django==1.11.20 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 From a62e751ecf66da049b5bb91612e01933a47608a4 Mon Sep 17 00:00:00 2001 From: ranvijayj Date: Fri, 5 Jul 2019 18:14:23 +0530 Subject: [PATCH 009/237] change version to Django=1.11.22 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fbb8f3f81..c910df5be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy==1.10.4 elasticsearch==5.5.0 requests==2.20.0 requests-aws4auth==0.9 -django==1.11.20 +Django==1.11.22 django-dotenv==1.4.2 weighted-levenshtein==0.1 regex==2018.7.11 From 6c943b0c499b61769242f682d5d588aa4d484dc5 Mon Sep 17 00:00:00 2001 From: Ruthvik-Reddy-Haptik Date: Thu, 11 Jul 2019 18:29:11 +0530 Subject: [PATCH 010/237] added telugu date and number data --- .../detectors/numeral/number/te/__init__.py | 0 .../number/te/data/numerals_constant.csv | 107 ++++++++++++++++++ ner_v2/detectors/temporal/date/README.md | 2 +- ner_v2/detectors/temporal/date/te/__init__.py | 0 .../temporal/date/te/data/date_constant.csv | 27 +++++ .../date/te/data/datetime_diff_constant.csv | 10 ++ .../date/te/data/numbers_constant.csv | 34 ++++++ 7 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 ner_v2/detectors/numeral/number/te/__init__.py create mode 100644 ner_v2/detectors/numeral/number/te/data/numerals_constant.csv create mode 100644 ner_v2/detectors/temporal/date/te/__init__.py create mode 100644 ner_v2/detectors/temporal/date/te/data/date_constant.csv create mode 100644 ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv create 
mode 100644 ner_v2/detectors/temporal/date/te/data/numbers_constant.csv diff --git a/ner_v2/detectors/numeral/number/te/__init__.py b/ner_v2/detectors/numeral/number/te/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv new file mode 100644 index 000000000..3bb362af5 --- /dev/null +++ b/ner_v2/detectors/numeral/number/te/data/numerals_constant.csv @@ -0,0 +1,107 @@ +number,name_variants,number_value,number_type +౦,సున్నా|సున్న|శూన్యం|సూన్యం|జీరో|sunna|shunyam|sunyam|zero,0,unit +౧.౫,ఒకటిన్నర|ఒక్కటి అర|ఒకటి అర|okatinnara|okkati ara|okati ara,1.5,unit +౨.౫,రెండున్నర|రెండు అర|rendunnara|rendu ara,2.5,unit +౧,ఒక్కటి|ఒకటి|మొదటిది|ఒకటవ|ఒకటో|ఒక|okkati|okati|modatidi|okatova|okato|okkato|okkatova|oka,1,unit +౨,రెండు|రొండు|రెండొవ|రెండో|రెండొవది|rendu|rondu|rendova|rendo|rendovadi,2,unit +౩,మూడు|మూడోవ|మూడోవ|మూడొవది|మూడో|mudu|muudu|mudova|muudova|mudovadi|mudo|muudo,3,unit +౪,నాలుగు|నాల్గు|నాల్గొవ|నాల్గొవది|నాల్గో|నాలగు|nalugu|naalugu|nalgu|nalgova|nalgovadi|nalgo|nalagu|nalagu|naalagu,4,unit +౫,ఐదు|అయిదు|ఐదొవ|అయిదవది|ఐదో|idu|ayidu|iydu|iydova|ayidovadi|ayido|ido|aidu,5,unit +౬,ఆరు|ఆఱు|ఆరొవ|ఆరొవది|ఆరో|అర డజను|అర డజన్|aru|aaru|aarova|aarovadi|aro|aaro|ara dozen|ara dozenu|ara dajanu|ara dajan,6,unit +౭,ఏడు|ఏడొవ|ఏడొవది|ఏడో|aedu|aedova|aedovadi|aedo|ado|adova|adovadi,7,unit +౮,ఎనిమిది|ఎనిమిదొవ|ఎనిమిదిది|ఎనిమిదో|enimidi|enimdova|enimididi|enimido,8,unit +౯,తొమ్మిది|తొమ్మిదిది|తొమ్మిదో|tommidi|thommidi|thommididi|thommido|tomidi|thomido,9,unit +౧౦,పది|పదొవ|పదొవది|పదో|padi|padhi|padova|padovadi|pado,10,unit +౧౧,పదకొండు|పదకొండొవ|పదకొండొవది|పదకొండో|padakondu|padakondova|padakondovadi|padakondo,11,unit +౧౨,పన్నెండు|పన్నెండవ|పన్నెండవది|పన్నెండో|పన్నెండొవ|పన్నెండొవది|పన్నెండొ|డజన్|డజను|dozenu|dojanu|dojan|dozen|pannendu|pannendova|pannendovadi|pannendo,12,unit 
+౧౩,పదమూడు|పదమూడొవ|పదముండొవది|పదముండొ|padamudu|padamudova|padamundovadi|padamundo,13,unit +౧౪,పద్నాలుగు|పద్నలుగొవ|పద్నాలుగుది|పద్నాలుగో|padnalugu|padnalugova|padnalugudi|padnalugo|padinalugu|padinalagu,14,unit +౧౫,పదిహేను|పదిహేనోవా|పదిహేనోవాది|పదిహేనో|పదైదు|padihenu|padihenova|padihenovadi|padiheno|padaidu,15,unit +౧౬,పదహారు|పదహారోది|పదహారో|padahaaru|padhaharu|padaharodi|padaharo,16,unit +౧౭,పదిహేడు|పదిహేనొవ|పదిహేనోవాది|పదిహేనో|padihedu|padhihedu|padihenova|padihenovadi|padiheno,17,unit +౧౮,పద్దెనిమిది|పద్దెనిమిదొవ|పద్దెనిమిదిది|పద్దెనిమిదో|పద్దెనిమిదొవ|paddenimidi|padhenimidi|paddenimidova|paddenimididi|padhenimidho|padhenimidova,18,unit +౧౯,పంతొమ్మిది|పందొమ్మిదొవ|పంతొమ్మిదో|పంతొమ్మిదొవ|పందొమ్మిది|pantommidi|pandommidova|panthommido,19,unit +౨౦,ఇరవై|ఇరవై|ఇరవయ్యొవది|ఇరవయ్యో|iravay|iravai|iravayyovadi|iravayyo,20,unit +౨౧,ఇరవయ్యొక్కటి|ఇరవై ఒకటి |ఇరవై ఒక్కటి|ఇరవై ఒకటో|iravayyokkati|iravay okati|eravay okati|iravay okato,21,unit +౨౨,ఇరవై రెండు|ఇరవై రెండొవ|ఇరవై రెండొవది|ఇరవై రెండో|iravay rendu|iravay rendova|iravay rendovadi|iravay rendo,22,unit +౨౩,ఇరవై మూడు|ఇరవై మూడవా|ఇరవై మూడొవది|ఇరవై మూడో|iravay mudu|iravay mudova|iravay mudovadi|iravay mudo,23,unit +౨౪,ఇరవై నాలుగు|ఇరవై నాల్గొవ|ఇరవై నాల్గొవది|ఇరవై నాల్గో|ఇరవై నాలగు|iravay naalugu|iravay naalgova|iravay nalgovadi|iravay nalgo|iravay nalagu|iaravay naalagu,24,unit +౨౫,ఇరవై ఐదు|ఇరవై ఐదొవ|ఇరవై ఐదొవది|ఇరవై ఐదో|పాతిక|pathika|patika|paathika|paatika|iravay aidu|iravay aidu|iravay aidova|iravay aidovadi|iravay aido,25,unit +౨౬,ఇరవై ఆఱు|ఇరవై ఆరు|ఇరవై ఆరొవ|ఇరవై ఆరో|ఇరవై ఆరొవది|iravay aaru|iravay aaru|iravay arova|iravay aaro|iravay aarovadi,26,unit +౨౭,ఇరవయ్యేడు|ఇరవయ్యేడొవ|ఇరవయ్యేడొవది|ఇరవయ్యేడో|ఇరవై ఏడు|iravayeedu|iravayeedova|iravayeedovadi|iravayeedo|iravay aedu,27,unit +౨౮,ఇరవై ఎనిమిది|ఇరవై ఎనిమిదొవ| ఇరవై ఎనిమిదిది|ఇరవై ఎనిమిదో|iravay enimidi|iravay enimidova|iraavay enimididi|iravay enimido,28,unit +౨౯,ఇరవై తొమ్మిది|ఇరవై తొమ్మిదొవది|ఇరవై తొమ్మిదో|iravay tommidi|iravay thommidi|iravay thommidovadi|iravay thommido,29,unit 
+౩౦,ముప్పై|ముప్పైయొవది|ముప్పైయ్యొవది|ముప్పైయొవ|ముప్పైయో|muppai|muppaiovadi|muppaiova|muppaiyo,30,unit +౩౧,ముప్పై ఒక్కటి|ముప్పై ఒకటవ|ముప్పై ఒకటోవది|ముప్పై ఒకటో|muppai okkati|muppai okatova|muppai okatovadi|muppai okato,31,unit +౩౨,ముప్పై రెండు|ముప్పై రెండొవ|ముప్పై రెండో|muppai rendu|muppai rendova|muppai rendo,32,unit +౩౩,ముప్పై మూడు|ముప్పై మూడోవ|ముప్పై మూడోవ|ముప్పై మూడొవది|ముప్పై మూడో|muppai mudu|muppai muudu|muppai mudova|muppai muudova|muppai mudovadi|muppai mudo|muppai muudo,33,unit +౩౪,ముప్పై నాలుగు|ముప్పై నాల్గు|ముప్పై నాల్గొవ|ముప్పై నాల్గొవది|ముప్పై నాల్గో|ముప్పై నాలగు|muppai nalugu|muppai naalugu|muppai nalgu|muppai nalgova|muppai nalgovadi|muppai nalgo|muppai nalagu|muppai naalagu,34,unit +౩౫,ముప్పై ఐదు|ముప్పై అయిదు|ముప్పై ఐదొవ|ముప్పై అయిదవది|ముప్పై ఐదో|muppai idu|muppai ayidu|muppai iydu|muppai iydova|muppai ayidovadi|muppai ayido|muppai ido|muppai aidu,35,unit +౩౬,ముప్పై ఆరు|ముప్పై ఆఱు|ముప్పై ఆరొవ|ముప్పై ఆరొవది|ముప్పై ఆరో|muppai aru|muppai aaru|muppai aarova|muppai aarovadi|muppai aro|muppai aaro,36,unit +౩౭,ముప్పై ఏడు|ముప్పై ఏడొవ|ముప్పై ఏడొవది|ముప్పై ఏడో|muppai aedu|muppai aedova|muppai aedovadi|muppai aedo|muppai ado|muppai adova|muppai adovadi,37,unit +౩౮,ముప్పై ఎనిమిది|ముప్పై ఎనిమిదొవ|ముప్పై ఎనిమిదిది|ముప్పై ఎనిమిదో|muppai enimidi|muppai enimdova|muppai enimididi|muppai enimido,38,unit +౩౯,ముప్పై తొమ్మిది|ముప్పై తొమ్మిదిది|ముప్పై తొమ్మిదో|muppai tommidi|muppai thommidi|muppai thommididi|muppai thommido|muppai tomidi|muppai thomido,39,unit +౪౦,నలభై|నలబై|నలభైయొవ|నలభైయొవది|నలభైయ్యొవది|నలభైయో|nalabhai|nalabhay|nalabayova|nalabayovadi|nalabhayo,40,unit +౪౧,నలభై ఒక్కటి|నలభై ఒకటి|నలభై మొదటిది|నలభై ఒకటవ|నలభై ఒకటో|nalabhai okkati|nalabhai okati|nalabhai modatidi|nalabhai okatova|nalabhai okato|nalabhai okkato|nalabhai okkatova,41,unit +౪౨,నలభై రెండు|నలభై రొండు|నలభై రెండొవ|నలభై రెండో|నలభై రెండొవది|nalabhai rendu|nalabhai rondu|nalabhai rendova|nalabhai rendo|nalabhai rendovadi,42,unit +౪౩,నలభై మూడు|నలభై మూడోవ|నలభై మూడోవ|నలభై మూడొవది|నలభై మూడో|nalabhai 
mudu|nalabhai muudu|nalabhai mudova|nalabhai muudova|nalabhai mudovadi|nalabhai mudo|nalabhai muudo,43,unit +౪౪,నలభై నాలుగు|నలభై నాల్గు|నలభై నాల్గొవ|నలభై నాల్గొవది|నలభై నాల్గో|nalabhai nalugu|nalabhai naalugu|nalabhai nalgu|nalabhai nalgova|nalabhai nalgovadi|nalabhai nalgo|nalabhai naalagu,44,unit +౪౫,నలభై ఐదు|నలభై అయిదు|నలభై ఐదొవ|నలభై అయిదవది|నలభై ఐదో|nalabhai idu|nalabhai ayidu|nalabhai iydu|nalabhai iydova|nalabhai ayidovadi|nalabhai ayido|nalabhai ido|nalabhai aidu,45,unit +౪౬,నలభై ఆరు|నలభై ఆఱు|నలభై ఆరొవ|నలభై ఆరొవది|నలభై ఆరో|nalabhai aru|nalabhai aaru|nalabhai aarova|nalabhai aarovadi|nalabhai aro|nalabhai aaro,46,unit +౪౭,నలభై ఏడు|నలభై ఏడొవ|నలభై ఏడొవది|నలభై ఏడో|nalabhai aedu|nalabhai aedova|nalabhai aedovadi|nalabhai aedo|nalabhai ado|nalabhai adova|nalabhai adovadi,47,unit +౪౮,నలభై ఎనిమిది|నలభై ఎనిమిదొవ|నలభై ఎనిమిదిది|నలభై ఎనిమిదో|nalabhai enimidi|nalabhai enimdova|nalabhai enimididi|nalabhai enimido,48,unit +౪౯,నలభై తొమ్మిది|నలభై తొమ్మిదిది|నలభై తొమ్మిదో|nalabhai tommidi|nalabhai thommidi|nalabhai thommididi|nalabhai thommido|nalabhai tomidi|nalabhai thomido,49,unit +౫౦,యాభై|యాభైయొవ|యాభైయొవది|నలభైయ్యొవది|యాభైయ్యో|yabhai|yabhaiyyova|yabhaiyova|yabhaiyyovadi|yabhaiyyo,50,unit +౫౧,యాభై ఒక్కటి|యాభై ఒకటి|యాభై మొదటిది|యాభై ఒకటవ|యాభై ఒకటో|yabhai okkati|yabhai okati|yabhai modatidi|yabhai okatova|yabhai okato|yabhai okkato|yabhai okkatova,51,unit +౫౨,యాభై రెండు|యాభై రొండు|యాభై రెండొవ|యాభై రెండో|యాభై రెండొవది|yabhai rendu|yabhai rondu|yabhai rendova|yabhai rendo|yabhai rendovadi,52,unit +౫౩,యాభై మూడు|యాభై మూడోవ|యాభై మూడోవ|యాభై మూడొవది|యాభై మూడో|yabhai mudu|yabhai muudu|yabhai mudova|yabhai muudova|yabhai mudovadi|yabhai mudo|yabhai muudo,53,unit +౫౪,యాభై నాలుగు|యాభై నాల్గు|యాభై నాల్గొవ|యాభై నాల్గొవది|యాభై నాల్గో|యాభై నాలగు|yabhai nalugu|yabhai naalugu|yabhai nalgu|yabhai nalgova|yabhai nalgovadi|yabhai nalgo|yabhai nalagu|yabhai nalagu|yabhai naalagu,54,unit +౫౫,యాభై ఐదు|యాభై అయిదు|యాభై ఐదొవ|యాభై అయిదవది|యాభై ఐదో|yabhai idu|yabhai ayidu|yabhai iydu|yabhai 
iydova|yabhai ayidovadi|yabhai ayido|yabhai ido|yabhai aidu,55,unit +౫౬,యాభై ఆరు|యాభై ఆఱు|యాభై ఆరొవ|యాభై ఆరొవది|యాభై ఆరో|yabhai aru|yabhai aaru|yabhai aarova|yabhai aarovadi|yabhai aro|yabhai aaro,56,unit +౫౭,యాభై ఏడు|యాభై ఏడొవ|యాభై ఏడొవది|యాభై ఏడో|yabhai aedu|yabhai aedova|yabhai aedovadi|yabhai aedo|yabhai ado|yabhai adova|yabhai adovadi,57,unit +౫౮,యాభై ఎనిమిది|యాభై ఎనిమిదొవ|యాభై ఎనిమిదిది|యాభై ఎనిమిదో|yabhai enimidi|yabhai enimdova|yabhai enimididi|yabhai enimido,58,unit +౫౯,యాభై తొమ్మిది|యాభై తొమ్మిదిది|యాభై తొమ్మిదో|yabhai tommidi|yabhai thommidi|yabhai thommididi|yabhai thommido|yabhai tomidi|yabhai thomido,59,unit +౬౦,అరవై|అరవైయవ|అరవైయొవ|అరవయొవది|అరవైయ్యొవది|అరవయ్యో|అరవయోవ|అరవయోవ|aravay|aravai|aravaiyova|aravaiyyovadi|aravayyo,60,unit +౬౧,అరవై ఒక్కటి|అరవై ఒకటి|అరవై మొదటిది|అరవై ఒకటవ|అరవై ఒకటో|aravai okkati|aravai okati|aravai modatidi|aravai okatova|aravai okato|aravai okkato|aravai okkatova,61,unit +౬౨,అరవై రెండు|అరవై రొండు|అరవై రెండొవ|అరవై రెండో|అరవై రెండొవది|aravai rendu|aravai rondu|aravai rendova|aravai rendo|aravai rendovadi,62,unit +౬౩,అరవై మూడు|అరవై మూడోవ|అరవై మూడోవ|అరవై మూడొవది|అరవై మూడో|aravai mudu|aravai muudu|aravai mudova|aravai muudova|aravai mudovadi|aravai mudo|aravai muudo,63,unit +౬౪,అరవై నాలుగు|అరవై నాల్గు|అరవై నాల్గొవ|అరవై నాల్గొవది|అరవై నాల్గో|అరవై నాలగు|aravai nalugu|aravai naalugu|aravai nalgu|aravai nalgova|aravai nalgovadi|aravai nalgo|aravai nalagu|aravai nalagu|aravai naalagu,64,unit +౬౫,అరవై ఐదు|అరవై అయిదు|అరవై ఐదొవ|అరవై అయిదవది|అరవై ఐదో|aravai idu|aravai ayidu|aravai iydu|aravai iydova|aravai ayidovadi|aravai ayido|aravai ido|aravai aidu,65,unit +౬౬,అరవై ఆరు|అరవై ఆఱు|అరవై ఆరొవ|అరవై ఆరొవది|అరవై ఆరో|aravai aru|aravai aaru|aravai aarova|aravai aarovadi|aravai aro|aravai aaro,66,unit +౬౭,అరవై ఏడు|అరవై ఏడొవ|అరవై ఏడొవది|అరవై ఏడో|aravai aedu|aravai aedova|aravai aedovadi|aravai aedo|aravai ado|aravai adova|aravai adovadi,67,unit +౬౮,అరవై ఎనిమిది|అరవై ఎనిమిదొవ|అరవై ఎనిమిదిది|అరవై ఎనిమిదో|aravai enimidi|aravai enimdova|aravai 
enimididi|aravai enimido,68,unit +౬౯,అరవై తొమ్మిది|అరవై తొమ్మిదిది|అరవై తొమ్మిదో|aravai tommidi|aravai thommidi|aravai thommididi|aravai thommido|aravai tomidi|aravai thomido,69,unit +౭౦,డెబ్బై|డెబ్బైయొవ|డెబ్బైయ్యొవది|డెబ్బైయ్యో|debhai|debhaiyyova|debhaiyovadi|debhaiyyo,70,unit +౭౧,డెబ్బై ఒక్కటి|డెబ్బై ఒకటి|డెబ్బై మొదటిది|డెబ్బై ఒకటవ|డెబ్బై ఒకటో|debhai okkati|debhai okati|debhai modatidi|debhai okatova|debhai okato|debhai okkato|debhai okkatova,71,unit +౭౨,డెబ్బై రెండు|డెబ్బై రొండు|డెబ్బై రెండొవ|డెబ్బై రెండో|డెబ్బై రెండొవది|debhai rendu|debhai rondu|debhai rendova|debhai rendo|debhai rendovadi,72,unit +౭౩,డెబ్బై మూడు|డెబ్బై మూడోవ|డెబ్బై మూడోవ|డెబ్బై మూడొవది|డెబ్బై మూడో|debhai mudu|debhai muudu|debhai mudova|debhai muudova|debhai mudovadi|debhai mudo|debhai muudo,73,unit +౭౪,డెబ్బై నాలుగు|డెబ్బై నాల్గు|డెబ్బై నాల్గొవ|డెబ్బై నాల్గొవది|డెబ్బై నాల్గో|డెబ్బై నాలగు|debhai nalugu|debhai naalugu|debhai nalgu|debhai nalgova|debhai nalgovadi|debhai nalgo|debhai nalagu|debhai nalagu|debhai naalagu,74,unit +౭౫,డెబ్బై ఐదు|డెబ్బై అయిదు|డెబ్బై ఐదొవ|డెబ్బై అయిదవది|డెబ్బై ఐదో|debhai idu|debhai ayidu|debhai iydu|debhai iydova|debhai ayidovadi|debhai ayido|debhai ido|debhai aidu,75,unit +౭౬,డెబ్బై ఆరు|డెబ్బై ఆఱు|డెబ్బై ఆరొవ|డెబ్బై ఆరొవది|డెబ్బై ఆరో|debhai aru|debhai aaru|debhai aarova|debhai aarovadi|debhai aro|debhai aaro,76,unit +౭౭,డెబ్బై ఏడు|డెబ్బై ఏడొవ|డెబ్బై ఏడొవది|డెబ్బై ఏడో|debhai aedu|debhai aedova|debhai aedovadi|debhai aedo|debhai ado|debhai adova|debhai adovadi,77,unit +౭౮,డెబ్బై ఎనిమిది|డెబ్బై ఎనిమిదొవ|డెబ్బై ఎనిమిదిది|డెబ్బై ఎనిమిదో|debhai enimidi|debhai enimdova|debhai enimididi|debhai enimido,78,unit +౭౯,డెబ్బై తొమ్మిది|డెబ్బై తొమ్మిదిది|డెబ్బై తొమ్మిదో|debhai tommidi|debhai thommidi|debhai thommididi|debhai thommido|debhai tomidi|debhai thomido,79,unit +౮౦,ఎనభై|ఎనభైయొవ|ఎనభైయొవది|ఎనభైయ్యొవది|ఎనభైయ్యో|enabhai|enabai|enabhaiyova|enabhaiyovadi|enabhaiyyo,80,unit +౮౧,ఎనభై ఒక్కటి|ఎనభై ఒకటి|ఎనభై మొదటిది|ఎనభై ఒకటవ|ఎనభై ఒకటో|enabhai okkati|enabhai okati|enabhai 
modatidi|enabhai okatova|enabhai okato|enabhai okkato|enabhai okkatova,81,unit +౮౨,ఎనభై రెండు|ఎనభై రొండు|ఎనభై రెండొవ|ఎనభై రెండో|ఎనభై రెండొవది|enabhai rendu|enabhai rondu|enabhai rendova|enabhai rendo|enabhai rendovadi,82,unit +౮౩,ఎనభై మూడు|ఎనభై మూడోవ|ఎనభై మూడోవ|ఎనభై మూడొవది|ఎనభై మూడో|enabhai mudu|enabhai muudu|enabhai mudova|enabhai muudova|enabhai mudovadi|enabhai mudo|enabhai muudo,83,unit +౮౪,ఎనభై నాలుగు|ఎనభై నాల్గు|ఎనభై నాల్గొవ|ఎనభై నాల్గొవది|ఎనభై నాల్గో|ఎనభై నాలగు|enabhai nalugu|enabhai naalugu|enabhai nalgu|enabhai nalgova|enabhai nalgovadi|enabhai nalgo|enabhai nalagu|enabhai nalagu|enabhai naalagu,84,unit +౮౫,ఎనభై ఐదు|ఎనభై అయిదు|ఎనభై ఐదొవ|ఎనభై అయిదవది|ఎనభై ఐదో|enabhai idu|enabhai ayidu|enabhai iydu|enabhai iydova|enabhai ayidovadi|enabhai ayido|enabhai ido|enabhai aidu,85,unit +౮౬,ఎనభై ఆరు|ఎనభై ఆఱు|ఎనభై ఆరొవ|ఎనభై ఆరొవది|ఎనభై ఆరో|enabhai aru|enabhai aaru|enabhai aarova|enabhai aarovadi|enabhai aro|enabhai aaro,86,unit +౮౭,ఎనభై ఏడు|ఎనభై ఏడొవ|ఎనభై ఏడొవది|ఎనభై ఏడో|enabhai aedu|enabhai aedova|enabhai aedovadi|enabhai aedo|enabhai ado|enabhai adova|enabhai adovadi,87,unit +౮౮,ఎనభై ఎనిమిది|ఎనభై ఎనిమిదొవ|ఎనభై ఎనిమిదిది|ఎనభై ఎనిమిదో|enabhai enimidi|enabhai enimdova|enabhai enimididi|enabhai enimido,88,unit +౮౯,ఎనభై తొమ్మిది|ఎనభై తొమ్మిదిది|ఎనభై తొమ్మిదో|enabhai tommidi|enabhai thommidi|enabhai thommididi|enabhai thommido|enabhai tomidi|enabhai thomido,89,unit +౯౦,తొంభై|తొంభైయొవ|తొంభైయొవది|తొంభైయ్యొవది|తొంభైయ్యో|thombhai|tombai|thombai|thombhaiyyova|thombaiyyovadi|thombaiyyo,90,unit +౯౧,తొంభై ఒక్కటి|తొంభై ఒకటి|తొంభై మొదటిది|తొంభై ఒకటవ|తొంభై ఒకటో|thombhai okkati|thombhai okati|thombhai modatidi|thombhai okatova|thombhai okato|thombhai okkato|thombhai okkatova,91,unit +౯౨,తొంభై రెండు|తొంభై రొండు|తొంభై రెండొవ|తొంభై రెండో|తొంభై రెండొవది|thombhai rendu|thombhai rondu|thombhai rendova|thombhai rendo|thombhai rendovadi,92,unit +౯౩,తొంభై మూడు|తొంభై మూడోవ|తొంభై మూడోవ|తొంభై మూడొవది|తొంభై మూడో|thombhai mudu|thombhai muudu|thombhai mudova|thombhai muudova|thombhai 
mudovadi|thombhai mudo|thombhai muudo,93,unit +౯౪,తొంభై నాలుగు|తొంభై నాల్గు|తొంభై నాల్గొవ|తొంభై నాల్గొవది|తొంభై నాల్గో|తొంభై నాలగు|thombhai nalugu|thombhai naalugu|thombhai nalgu|thombhai nalgova|thombhai nalgovadi|thombhai nalgo|thombhai nalagu|thombhai nalagu|thombhai naalagu,94,unit +౯౫,తొంభై ఐదు|తొంభై అయిదు|తొంభై ఐదొవ|తొంభై అయిదవది|తొంభై ఐదో|thombhai idu|thombhai ayidu|thombhai iydu|thombhai iydova|thombhai ayidovadi|thombhai ayido|thombhai ido|thombhai aidu,95,unit +౯౬,తొంభై ఆరు|తొంభై ఆఱు|తొంభై ఆరొవ|తొంభై ఆరొవది|తొంభై ఆరో|thombhai aru|thombhai aaru|thombhai aarova|thombhai aarovadi|thombhai aro|thombhai aaro,96,unit +౯౭,తొంభై ఏడు|తొంభై ఏడొవ|తొంభై ఏడొవది|తొంభై ఏడో|thombhai aedu|thombhai aedova|thombhai aedovadi|thombhai aedo|thombhai ado|thombhai adova|thombhai adovadi,97,unit +౯౮,తొంభై ఎనిమిది|తొంభై ఎనిమిదొవ|తొంభై ఎనిమిదిది|తొంభై ఎనిమిదో|thombhai enimidi|thombhai enimdova|thombhai enimididi|thombhai enimido,98,unit +౯౯,తొంభై తొమ్మిది|తొంభై తొమ్మిదిది|తొంభై తొమ్మిదో|thombhai tommidi|thombhai thommidi|thombhai thommididi|thombhai thommido|thombhai tomidi|thombhai thomido,99,unit +౧౦౦,వంద|నూరు|నూట|క్వింటాల్|క్వింటా|కింటా|వందల|vanda|nuru|nooru|nuta|noota|quintal|quinta|kinta|quintaa|vandala,100,scale +౧౦౦౦,వెయ్యి|టన్|టన్ను|వేయి|వేలు|వేల|వెయ్యిల|వెయ్య|వేయ్యిలా|veyyi|ton|tonne|tonnu|veelu|veela|veyla|vela|veyyila|veyya|veyyila,1000,scale +౧౦౦౦౦౦,లక్ష|లక్షలు|లక్షల|laksha|lacha|lakshala|lakshalu,100000,scale +౧౦౦౦౦౦౦౦,కోటి|కోట్లు|కోట్ల|కరోడ్|koti|kotlu|kotla|korode|corode|carode,10000000,scale \ No newline at end of file diff --git a/ner_v2/detectors/temporal/date/README.md b/ner_v2/detectors/temporal/date/README.md index 6de390fef..66675b7d9 100644 --- a/ner_v2/detectors/temporal/date/README.md +++ b/ner_v2/detectors/temporal/date/README.md @@ -6,7 +6,7 @@ This is the V2 version of date detector module that will detect date in multiple - Hindi - Marathi - Gujarati -- Telgu +- Telugu - Tamil ### Usage diff --git a/ner_v2/detectors/temporal/date/te/__init__.py
b/ner_v2/detectors/temporal/date/te/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v2/detectors/temporal/date/te/data/date_constant.csv b/ner_v2/detectors/temporal/date/te/data/date_constant.csv new file mode 100644 index 000000000..001aa9289 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/date_constant.csv @@ -0,0 +1,27 @@ +key,numeric_representation,date_type +ఇవాళ|నేడు|ఈరోజు|ఈ రోజు|ఈనాడు|ఈ నాడు|ఈదినం|ఈ దినం|ఈదినము|ఈవేళ|ivala|ivaala|nedu|needu|ee roju|e roju|eroju|eeroju|eenadu|enadu|enaadu|ee dinam,0,relative_date +రేపు|మరుసటిరోజు|మరునాడు|ఱేపు|repu|reepu|marusatiroju|marusatirooju|marunadu|marunaadu,1,relative_date +ఎల్లుండి|ellundi,2,relative_date +అవతలి ఎల్లుండి|అవతలేల్లుంది|అవతలిఎల్లుండి|avathali ellundi|avathalellundi,3,relative_date +తారీఖు|రోజు|తారీఖున|దినం|tareeku|tareekhu|tareekh|roju|rooju|tareekhuna|tareekuna|dinam,NA,month_date_ref +రోజు|రోజులు|roju|rojulu,NA,date_literal +నెల|నెలలు|మాసం|మాసాలు|nela|nalalu|masam|maasam|masalu|maasaalu|masaalu,NA,month_literal +సోమవారం|సోమారం|ఇందువాసరము|somavaram|somavaaram|induvasaramu|induvaasaramu|induvasaram|monday|soma|sooma,0,weekday +మంగళవారము|అంగారకవారమ|జయవారము|mangalavaram|mangalavaaram|tuesday|mangala,1,weekday +బుధవారము|సౌమ్యవాసరము|బుధవారం|budavaram|buda|budavaaram|sowmyavaram|wednesday,2,weekday +గురువారము|బృహస్పతి వారము|లక్ష్మివారము|గురు|గురువారం|guruvaram|guruvaaram|guru|bruhaspathi varam|thursday|guru varam|guru vaaram|lakshmi varam|lakshmivaram,3,weekday +శుక్రవారము|శుక్రవారం|శుక్ర|shukravaram|shukra varam|shukravaaram|shukra,4,weekday +శనివారము|స్థిరవారము|మందవారము|shanivaram|shanivaaram|mandavaram|mandavaaram,5,weekday +ఆదివారము|భానువారము|రవివారము|అధిత్యవారము|తొలివారము|aadivaram|adivaram|adi varam|adi vaaram|bhanuvaram|bhanu|bhanu vaaram|ravivaram|ravi vaaramu|ravi varam|adityavaram|adithya varam|tholivaram|tholi vaaram|sunday,6,weekday +మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month +ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month 
+మార్చ్|మార్|march|mar,3,month +ఏప్రిల్|april|apr,4,month +మే|may,5,month +జూన్|jun|june,6,month +జులై|jul|july,7,month +ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month +సెప్టెంబర్|september|sep|sept,9,month +అక్టోబర్|అక్టోబరు|oct|october,10,month +నవంబర్|november|nov,11,month +డిసెంబరు|డిసెంబర్|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv b/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv new file mode 100644 index 000000000..2cff92322 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/datetime_diff_constant.csv @@ -0,0 +1,10 @@ +key,present_in_start,adding_magnitude,datetime_type +తర్వాత|పిమ్మట|అయినాక|ఐనాకా|అయ్యాక|తరువాత|తరవాత|tarvatha|tarvata|taruvatha|ainaka|ayinaka|ayyaka|pimmata,FALSE,1,add_diff_datetime +ఈ|ee|e,TRUE,0,add_diff_datetime +ముందు|పూర్వం|ముందఱ|మునుపు|పూర్వము|ముందర|mundu|purvam|poorvam|mundara|munupu|purvamu|mundara,FALSE,-1,add_diff_datetime +ముందు|పూర్వం|ముందఱ|మునుపు|పూర్వము|ముందర|mundu|purvam|poorvam|mundara|munupu|purvamu|mundara,TRUE,-1,add_diff_datetime +వచ్చే|రాబోయే|vache|raboye|rabooye,TRUE,1,add_diff_datetime +లో|lo,FALSE,1,add_diff_datetime +ఒకటింపావు|పావు|బావు|okatimpavu|pavu|paavu|bavu|baavu,TRUE,0.25,ref_datetime +పావు తక్కువ|pavu takkuva|paavu takkuva,TRUE,-0.25,ref_datetime +అర్థ|అర్ధ|artha|arda|adtha,TRUE,0.5,ref_datetime diff --git a/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv b/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv new file mode 100644 index 000000000..974848519 --- /dev/null +++ b/ner_v2/detectors/temporal/date/te/data/numbers_constant.csv @@ -0,0 +1,34 @@ +key,numeric_representation +౧|ఒక్కటి|ఒకటి|మొదటిది|ఒకటవ|ఒకటో|okkati|okati|modatidi|okatova|okato,1 +౨|రెండు|రొండు|రెండొవ|రెండో|రెండొవది|rendu|rondu|rendova|rendo|rendovadi,2 +౩|మూడు|మూడోవ|మూడోవ|మూడొవది|మూడో|mudu|muudu|mudova|muudova|mudovadi|mudo|muudo,3 +౪|నాలుగు|నాల్గు|నాల్గొవ|నాల్గొవది|నాల్గో|నాలగు|nalugu|nalagu|naalagu|naalugu|nalgu|nalgova|nalgovadi|nalgo,4 
+౫|ఐదు|అయిదు|ఐదొవ|అయిదవది|ఐదో|idu|ayidu|iydu|iydova|ayidovadi|ayido|ido|aidu,5 +౬|ఆరు|ఆఱు|ఆరొవ|ఆరొవది|ఆరో|aru|aaru|aarova|aarovadi|aro|aaro,6 +౭|ఏడు|ఏడొవ|ఏడొవది|ఏడో|aedu|aedova|aedovadi|aedo|ado|adova|adovadi,7 +౮|ఎనిమిది|ఎనిమిదొవ|ఎనిమిదిది|ఎనిమిదో|enimidi|enimdova|enimididi|enimido,8 +౯|తొమ్మిది|తొమ్మిదిది|తొమ్మిదో|tommidi|thommidi|thommididi|thommido|tomidi|thomido,9 +౧౦|పది|పదొవ|పదొవది|పదో|padi|padhi|padova|padovadi|pado,10 +౧౧|పదకొండు|పదకొండొవ|పదకొండొవది|పదకొండో|padakondu|padakondova|padakondovadi|padakondo,11 +౧౨|పన్నెండు|పన్నెండవ|పన్నెండవది|పన్నెండో|పన్నెండొవ|పన్నెండొవది|పన్నెండొ|pannendu|pannendova|pannendovadi|pannendo,12 +౧౩|పదమూడు|పదమూడొవ|పదముండొవది|పదముండొ|padamudu|padamudova|padamundovadi|padamundo,13 +౧౪|పద్నాలుగు|పద్నలుగొవ|పద్నాలుగుది|పద్నాలుగో|padnalugu|padnalugova|padnalugudi|padnalugo|padinalugu|padinalagu,14 +౧౫|పదిహేను|పదిహేనోవా|పదిహేనోవాది|పదిహేనో|పదైదు|padihenu|padihenova|padihenovadi|padiheno|padaidu,15 +౧౬|పదహారు|పదహారోది|పదహారో|padahaaru|padhaharu|padaharodi|padaharo,16 +౧౭|పదిహేడు|పదిహేనొవ|పదిహేనోవాది|పదిహేనో|padihedu|padhihedu|padihenova|padihenovadi|padiheno,17 +౧౮|పద్దెనిమిది|పద్దెనిమిదొవ|పద్దెనిమిదిది|పద్దెనిమిదో|పద్దెనిమిదొవ|paddenimidi|padhenimidi|paddenimidova|paddenimididi|padhenimidho|padhenimidova,18 +౧౯|పంతొమ్మిది|పందొమ్మిదొవ|పంతొమ్మిదో|పంతొమ్మిదొవ|పందొమ్మిది|pantommidi|pandommidova|panthommido,19 +౨౦|ఇరవై|ఇరవై|ఇరవయ్యోవది|ఇరవయ్యో|iravay|iravai|iravayyovadi|iravayyo,20 +౨౧|ఇరవయ్యొక్కటి|ఇరవై ఒకటి |ఇరవై ఒక్కటి|ఇరవై ఒకటో|iravayyokkati|iravay okati|eravay okati|iravay okato,21 +౨౨|ఇరవై రెండు|ఇరవై రెండొవ|ఇరవై రెండొవది|ఇరవై రెండో|iravay rendu|iravay rendova|iravay rendovadi|iravay rendo,22 +౨౩|ఇరవై మూడు|ఇరవై మూడవా|ఇరవై మూడొవది|ఇరవై మూడో|iravay mudu|iravay mudova|iravay mudovadi|iravay mudo,23 +౨౪|ఇరవై నాలుగు|ఇరవై నాల్గొవ|ఇరవై నాల్గొవది|ఇరవై నాల్గో|iravay naalugu|iravay naalgova|iravay nalgovadi|iravay nalgo,24 +౨౫|ఇరవై ఐదు|ఇరవై ఐదొవ|ఇరవై ఐదొవది|ఇరవై ఐదో|iravay aidu|iravay aidu|iravay aidova|iravay aidovadi|iravay aido,25 +౨౬|ఇరవై 
ఆఱు|ఇరవై ఆరు|ఇరవై ఆరొవ|ఇరవై ఆరో|ఇరవై ఆరొవది|iravay aaru|iravay aaru|iravay arova|iravay aaro|iravay aarovadi,26 +౨౭|ఇరవయ్యేడు|ఇరవయ్యేడొవ|ఇరవయ్యేడొవది|ఇరవయ్యేడో|ఇరవై ఏడు|iravayeedu|iravayeedova|iravayeedovadi|iravayeedo|iravay aedu,27 +౨౮|ఇరవై ఎనిమిది|ఇరవై ఎనిమిదొవ| ఇరవై ఎనిమిదిది|ఇరవై ఎనిమిదో|iravay enimidi|iravay enimidova|iraavay enimididi|iravay enimido,28 +౨౯|ఇరవై తొమ్మిది|ఇరవై తొమ్మిదొవది|ఇరవై తొమ్మిదో|iravay tommidi|iravay thommidi|iravay thommidovadi|iravay thommido,29 +౩౦|ముప్పై|ముప్పైయొవది|ముప్పైయొవ|ముప్పైయో|muppai|muppaiovadi|muppaiova|muppaiyo,30 +౩౧|ముప్పై ఒక్కటి|ముప్పై ఒకటవ|ముప్పై ఒకటోవది|ముప్పై ఒకటో|muppai okkati|muppai okatova|muppai okatovadi|muppai okato,31 +౧.౫|ఒకటిన్నర|ఒక్కటి అర|ఒకటి అర|okatinnara|okkati ara|okati ara,1.5 +౨.౫|రెండున్నర|రెండు అర|rendunnara|rendu ara,2.5 From bd1ee8e1a7d66a5c7de7fed2c0a3d1776e0fc5f9 Mon Sep 17 00:00:00 2001 From: ameya3012 Date: Thu, 11 Jul 2019 18:42:20 +0530 Subject: [PATCH 011/237] Modified marathi date time and numeral --- .../number/mr/data/numerals_constant.csv | 34 +++++++++---------- .../temporal/date/mr/data/date_constant.csv | 18 +++++----- .../time/mr/data/datetime_diff_constant.csv | 7 ++-- .../temporal/time/mr/data/time_constant.csv | 10 +++--- 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv index 18252e2b8..1d693e54b 100644 --- a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv @@ -23,11 +23,11 @@ number,name_variants,number_value,number_type १९,एकोणीस|ekonis|econis,19,unit २०,वीस|Vis,20,unit २१,एकवीस|Ekavis|Ekvis,21,unit -२२,बावीस|Bavis,22,unit +२२,बावीस|Bavis|Baavis,22,unit २३,तेवीस|Tevis,23,unit -२४,चोवीस|chauwis|chauvis,24,unit +२४,चोवीस|chauwis|chauvis|chovis|chowis,24,unit २५,पंचवीस|panchavis|panchvis,25,unit -२६,सव्वीस|Savvis,26,unit +२६,सव्वीस|Savvis|Sauvis,26,unit 
२७,सत्तावीस|Sattavis,27,unit २८,अठ्ठावीस|Aththavis|Attavis,28,unit २९,एकोणतीस|Ekonatis|Ekontis,29,unit @@ -38,14 +38,14 @@ number,name_variants,number_value,number_type ३४,चौतीस|chautis,34,unit ३५,पस्तीस|pastis,35,unit ३६,छत्तीस|Chattis,36,unit -३७,सदतीस|sadatis,37,unit -३८,अडतीस|adatis,38,unit +३७,सदतीस|sadatis|sadotis,37,unit +३८,अडतीस|adatis|adotis,38,unit ३९,एकोणचाळीस|ekonachalis|ekonchalis|econchalis,39,unit ४०,चाळीस|chalis,40,unit ४१,एक्केचाळीस|ekkechalis|Akkechalis,41,unit ४२,बेचाळीस|bechalis,42,unit ४३,त्रेचाळीस|trechalis,43,unit -४४,चव्वेचाळीस|chavvechalis|chavechalis,44,unit +४४,चव्वेचाळीस|chavvechalis|chavechalis|chavrechalis,44,unit ४५,पंचेचाळीस|pamchechalis|panchechalis,45,unit ४६,सेहेचाळीस|sehechalis|Sechalis,46,unit ४७,सत्तेचाळीस|sattechalis,47,unit @@ -63,33 +63,33 @@ number,name_variants,number_value,number_type ५९,एकोणसाठ|ekonasath|ekonasat|ekonsath,59,unit ६०,साठ|sath,60,unit ६१,एकसष्ठ|ekasashth|ekshasth,61,unit -६२,बासष्ठ|basashth|Basath,62,unit +६२,बासष्ठ|basashth|Basath|Besashth,62,unit ६३,त्रेसष्ठ|tresashth|Tresath,63,unit ६४,चौसष्ठ|chausashth|chausath,64,unit ६५,पासष्ठ|pasashth|Pasath,65,unit ६६,सहासष्ठ|sahasashth|Sahasath,66,unit -६७,सदुसष्ठ|sadusashth|sadusath,67,unit -६८,अडुसष्ठ|adusashth|adusath,68,unit -६९,एकोणसत्तर|ekonsattar,69,unit +६७,सदुसष्ठ|sadusashth|sadusath|sadosashth,67,unit +६८,अडुसष्ठ|adusashth|adusath|adosashth,68,unit +६९,एकोणसत्तर|ekonsattar|ekunsattar,69,unit ७०,सत्तर|sattar,70,unit ७१,एक्काहत्तर|ekkahattar|ekattar,71,unit ७२,बाहत्तर|bahattar|Baattar,72,unit -७३,त्र्याहत्तर|tryahattar,73,unit -७४,चौर्‍याहत्तर|chauryahattar,74,unit +७३,त्र्याहत्तर|tryahattar|tryattar,73,unit +७४,चौर्‍याहत्तर|chauryahattar|chauryattar,74,unit ७५,पंच्याहत्तर|pamchyahattar|panchattar,75,unit ७६,शहात्तर|shahattar|shattar,76,unit ७७,सत्याहत्तर|satyahattar|Satyattar,77,unit -७८,अठ्ठ्याहत्तर|aththyahattar,78,unit -७९,एकोण ऐंशी|ekon aimshi|ekon anshi,79,unit +७८,अठ्ठ्याहत्तर|aththyahattar|atthyattar ,78,unit +७९,एकोण ऐंशी|ekon aimshi|ekon 
anshi|ekonainshi,79,unit ८०,ऐंशी|Aenshi,80,unit ८१,एक्क्याऐंशी|ekkyaaimshi|Ekkyanshi,81,unit ८२,ब्याऐंशी|byaaimshi|byanshi,82,unit ८३,त्र्याऐंशी|Tryaaimshi|Tryaanshi,83,unit ८४,चौऱ्याऐंशी|chauryaaimshi|chauryanshi,84,unit ८५,पंच्याऐंशी|pamchyaaimshi|Panchyanshi,85,unit -८६,शहाऐंशी|shahaaimshi|Shaynshi,86,unit +८६,शहाऐंशी|shahaaimshi|Shaynshi|Shahaainshi,86,unit ८७,सत्त्याऐंशी|sattyaaimshi|satyanshi,87,unit -८८,अठ्ठ्याऐंशी|aththyaaimshi|athyanshi,88,unit +८८,अठ्ठ्याऐंशी|aththyaaimshi|athyanshi|aththyaainshi,88,unit ८९,एकोणनव्वद|ekonanavvad|ekonnavvad,89,unit ९०,नव्वद|navvad|navad,90,unit ९१,एक्क्याण्णव|ekkyannav,91,unit @@ -100,7 +100,7 @@ number,name_variants,number_value,number_type ९६,शहाण्णव|shahannav|shyanav,96,unit ९७,सत्त्याण्णव|sattyannav,97,unit ९८,अठ्ठ्याण्णव|aththyannav|athyanav,98,unit -९९,नव्व्याण्णव|navvyannav|navyannav,99,unit +९९,नव्व्याण्णव|navvyannav|navyannav|navvyanav,99,unit १००,शंभर|shambhar|shambar,100,scale १०००,हजार|hazar|hajar,1000,scale १०००००,लाख|Lakh,100000,scale diff --git a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv index 776332d0f..94d89618f 100644 --- a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv @@ -1,18 +1,18 @@ key,key,date_type आज|Aaj,0,relative_date -उद्या|काल|कल|Udya|udhya|kaal,1,relative_date +उद्या|काल|कल|Udya|udhya|kaal|kal,1,relative_date परवा|पर्वा|Parava|parva,2,relative_date तिसऱ्या देवीशि|Tisrya Div shi,3,relative_date तारिक|तार्कीला|Tarik|Taarik|tarkila,NA,month_date_ref दिवस|दिवसं|Divas|divsan ,NA,date_literal -महिना|महीने|महिन्याचा|months|month|mahina|mahine|mahinyacha|mahinyachi|mahinyacha,NA,month_literal -सोमवार|monday|somvar|somwar,0,weekday -मंगळवार|मंगळवारी|Mangalvari|tuesday|mangalvar|mangalwar,1,weekday -बुधवार|बुधवारी|wednesday|budhvar|budhwar|budhvari|budhwari,2,weekday 
-गुरुवार|गुरुवारी|thursday|guruvar|guruwar|guroovaar|guroowar|guroovar|guruvari|guruwari|guroovaari|guroowari,3,weekday -शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravari|shukrawari,4,weekday -शनिवार|saturday|shanivar|shaniwar,5,weekday -रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari,6,weekday +महिना|महीने|महिन्याचा|months|month|mahina|mahine|mahinyacha|mahinyachi|mahinyacha|mahinyancha,NA,month_literal +सोमवार|सोमवारी|monday|somvar|somwar|somvaar|somvaari|somwari,0,weekday +मंगळवार|मंगळवारी|Mangalvari|tuesday|mangalvar|mangalwar|mangalvaar,1,weekday +बुधवार|बुधवारी|wednesday|budhvar|budhwar|budhvaar|budhvari|budhwari,2,weekday +गुरुवार|गुरुवारी|thursday|guruvar|guruwar|guroovaar|guroowar|guroovar|gurvaar|guruvari|guruwari|guroovaari|guroowari,3,weekday +शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravaar|shukravari|shukrawari,4,weekday +शनिवार|शनिवारी|saturday|shanivar|shaniwar|shanivaar|shanivaari|shaniwari,5,weekday +रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari|ravivaari,6,weekday जानेवारी|january|jan|Janevari,1,month फेब्रुवारी|Phebruvari|february|feb,2,month मार्च|march|mar,3,month diff --git a/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv b/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv index a38be3553..6bc803ffd 100644 --- a/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv +++ b/ner_v2/detectors/temporal/time/mr/data/datetime_diff_constant.csv @@ -1,10 +1,11 @@ key,present_in_start,adding_magnitude,datetime_type नंतर|Nantar,0,1,add_diff_datetime -हे|He,1,0,add_diff_datetime +हे|He|Hey,1,0,add_diff_datetime पूर्वी|अगोदर|गेल्या|Gelya|Purvi|Porvi|Agodar,0,-1,add_diff_datetime अंतिम|शेवट|शेवटी|Antim|shevat|shewat|shevati|shewati,1,-1,add_diff_datetime पुढील|पुढे|पुढच्या|पुढचा|Pudcha|Pudhil|Pudhe|Pudhchya,1,1,add_diff_datetime +मागील|मागे|मागच्या|मागचा|Maaghil|Maaghe|Maagchya|Maagcha,1,-1,add_diff_datetime मी|Mi|me,0,1,add_diff_datetime -सवा|sawa|sava,1,0.25,ref_datetime
+सवा|sawa|sava|sauva,1,0.25,ref_datetime पौने|paune,1,-0.25,ref_datetime -साढे|साडे|saade|sade,1,0.5,ref_datetime +साढे|साडे|saade|sade|sadhe,1,0.5,ref_datetime diff --git a/ner_v2/detectors/temporal/time/mr/data/time_constant.csv b/ner_v2/detectors/temporal/time/mr/data/time_constant.csv index 7021452a0..864d5ca57 100644 --- a/ner_v2/detectors/temporal/time/mr/data/time_constant.csv +++ b/ner_v2/detectors/temporal/time/mr/data/time_constant.csv @@ -1,11 +1,13 @@ key,time_type,meridiem आता|Ata|aata,relative_time,NA लगेच|Lagech,relative_time,NA +नंतर|Nantar,relative_time,NA वाजले|वाजता|वाजुन|Vajle|Vajta|Vazta|Vazle|Vajun|Vazun,hour,NA -तास|तासा|तासात|Tas|Taasaa|Tasa|Tasaa|Taasan|Taasat,hour,NA +तास|तासा|तासात|Tas|Taas|Taasaa|Tasa|Tasaa|Taasan|Taasat,hour,NA मिनिट|मिनिटे|Minute|Minte,minute,NA सेकंड|seconds|sec|second,second,NA -सकाळ|सकाळी|Sakal|Sakali,daytime_meridiem,am -दुपार|दुपारी|Dupar|Dupari,daytime_meridiem,pm -संध्याकाळी|संध्याकाळ|Sandhyakali|Sandhyakal,daytime_meridiem,pm +सकाळ|सकाळी|Sakal|Sakaal|Sakali|Sakaali,daytime_meridiem,am +दुपार|दुपारी|Dupar|Dupaar|Dupari,daytime_meridiem,pm +संध्याकाळी|संध्याकाळ|सायंकाळी|Sandhyakali|Sandhyakal|Sayankali,daytime_meridiem,pm रात्री|रात्र|Ratri|Ratra,daytime_meridiem,pm +पहाट|पहाटे|Pahaath|Pahaathe|Pahaates,daytime_meridiem,am \ No newline at end of file From 2ca1aeb3206482923ec905e8d09b0d919c6c333f Mon Sep 17 00:00:00 2001 From: ameya3012 Date: Thu, 11 Jul 2019 18:49:04 +0530 Subject: [PATCH 012/237] added numeral data for thousand variants --- ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv index 1d693e54b..8826726a9 100644 --- a/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/mr/data/numerals_constant.csv @@ -102,6 +102,6 @@
number,name_variants,number_value,number_type ९८,अठ्ठ्याण्णव|aththyannav|athyanav,98,unit ९९,नव्व्याण्णव|navvyannav|navyannav|navvyanav,99,unit १००,शंभर|shambhar|shambar,100,scale -१०००,हजार|hazar|hajar,1000,scale +१०००,हजार|hazar|hajar|hajaar|hazaar,1000,scale १०००००,लाख|Lakh,100000,scale १०००००००,कोटी|koti,10000000,scale From a35e02d95d59b19c1e00fc16ba041fcee801a902 Mon Sep 17 00:00:00 2001 From: psiyan Date: Mon, 15 Jul 2019 14:40:01 +0530 Subject: [PATCH 013/237] Add language support to the CRF API - Change the API spec to account for the same. The new API has the following format: ```python { "language1": [ {"sentence": "string1", "entities": ["tag1", "tag2"]}, {"sentence": "string2", "entities": ["tag3", "tag4"]} ], "language2": [ {"sentence": "string3", "entities": ["tag5", "tag6"]}, {"sentence": "string4", "entities": ["tag7", "tag8"]} ] } ``` - Change the delete query in CRF, it now accounts for languages and does a direct delete query instead of the existing fetch-scroll-bulk delete. - Segregated the above delete method from the existing delete method. - Renamed CRF only methods to specify the term `crf` in their name. - Improved documentation. - Changed old `%s` style string formatting in loggers to new f-strings. - Minor changes in indentation. - Added type hints.
--- datastore/datastore.py | 31 +++-- datastore/elastic_search/populate.py | 179 +++++++++++++++++---------- external_api/api.py | 13 +- external_api/constants.py | 3 + 4 files changed, 134 insertions(+), 92 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 715a45be8..b17277b81 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import warnings +from typing import Dict, List from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import elastic_search @@ -628,19 +629,18 @@ def get_crf_data_for_entity_name(self, entity_name, **kwargs): ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name)) return results_dictionary - def update_entity_crf_data(self, entity_name, entity_list, language_script, sentence_list, **kwargs): + def update_entity_crf_data(self, entity_name, sentences: Dict[str, List], **kwargs): """ This method is used to populate the training data for a given entity + Args: entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - language_script (str): Language code for the language script used. - **kwargs: - For Elasticsearch: + sentences (Dict[str, List]): sentences mapped against their languages + **kwargs: For Elasticsearch: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk Raises: + IndexNotFoundException: Description IndexNotFoundException if es_training_index was not found in connection settings """ if self._client_or_connection is None: @@ -654,13 +654,12 @@ def update_entity_crf_data(self, entity_name, entity_list, language_script, sent raise IndexNotFoundException('Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. 
' 'Please configure the same') - elastic_search.populate.update_entity_crf_data_populate(connection=self._client_or_connection, - index_name=es_training_index, - doc_type=self._connection_settings - [ELASTICSEARCH_CRF_DATA_DOC_TYPE], - logger=ner_logger, - entity_list=entity_list, - sentence_list=sentence_list, - entity_name=entity_name, - language_script=language_script, - **kwargs) + elastic_search \ + .populate \ + .update_entity_crf_data_populate(connection=self._client_or_connection, + index_name=es_training_index, + doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE], + logger=ner_logger, + sentences=sentences, + entity_name=entity_name, + **kwargs) diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 2431aa823..b2a943a8c 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -3,11 +3,12 @@ # std imports import os from collections import defaultdict +# Local imports +from typing import List, Dict # 3rd party imports -from elasticsearch import helpers +from elasticsearch import helpers, Elasticsearch -# Local imports from chatbot_ner.config import ner_logger from datastore import constants from datastore.elastic_search.query import get_entity_data @@ -280,83 +281,127 @@ def entity_data_update(connection, index_name, doc_type, entity_data, entity_nam logger.debug('%s: +++ Completed: add_data_elastic_search() +++' % log_prefix) -def update_entity_crf_data_populate( - connection, index_name, doc_type, entity_list, entity_name, sentence_list, language_script, logger, **kwargs -): +def delete_entity_crf_data(connection: Elasticsearch, index_name: str, + doc_type: str, entity_name: str, + languages: List[str]): + """Delete CRF data for the given entity and languages. 
+ + Args: + connection (Elasticsearch): Elasticsearch client object + index_name (str): name of the index + doc_type (str): type of the documents being indexed + entity_name (str): ame of the entity for which the training data has to be deleted + languages (List[str]): list of language codes for which data needs to be deleted + + Returns: + TYPE: Description + """ + query = { + "query": { + "bool": { + "must": [ + { + "match": { + "entity_data": entity_name + } + } + ], + "filter": { + "terms": { + "language_script": languages + } + } + } + } + } + return connection.delete_by_query(index=index_name, body=query, doc_type=doc_type) + + +def update_entity_crf_data_populate(connection: Elasticsearch, + index_name: str, + doc_type: str, + entity_name: str, + sentences: Dict[str, List], + logger, + **kwargs): """ - This method is used to populate the elastic search traininf data. + This method is used to populate the elastic search training data. + Args: - connection: Elasticsearch client object - index_name (str): The name of the index - doc_type (str): The type of the documents being indexed - entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - language_script (str): The code for the language script - logger: logging object to log at debug and exception levellogging object to log at debug and exception level + connection (Elasticsearch): Elasticsearch client object + index_name (str): name of the index + doc_type (str): type of the documents being indexed + entity_name (str): name of the entity for which the training data has to be populated + sentences (List[Dict[str, List]]): sentences collected per language + logger: logging object **kwargs: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk """ - logger.debug('%s: +++ Started: 
external_api_training_data_entity_update() +++' % log_prefix) - logger.debug('%s: +++ Started: delete_entity_by_name() +++' % log_prefix) - delete_entity_by_name(connection=connection, index_name=index_name, doc_type=doc_type, - entity_name=entity_name, logger=logger, **kwargs) - logger.debug('%s: +++ Completed: delete_entity_by_name() +++' % log_prefix) - - logger.debug('%s: +++ Started: add_training_data_elastic_search() +++' % log_prefix) - add_training_data_elastic_search(connection=connection, index_name=index_name, doc_type=doc_type, + logger.debug(f'[{log_prefix}] Started: external_api_training_data_entity_update()') + + logger.debug(f'[{log_prefix}] Started: delete_entity_crf_data()') + languages = list(sentences.keys()) + delete_entity_crf_data(connection=connection, index_name=index_name, doc_type=doc_type, + entity_name=entity_name, languages=languages) + logger.debug(f'[{log_prefix}] Completed: delete_entity_crf_data()') + + logger.debug(f'[{log_prefix}] Started: add_training_data_elastic_search()') + add_crf_training_data_elastic_search(connection=connection, + index_name=index_name, + doc_type=doc_type, entity_name=entity_name, - entity_list=entity_list, - sentence_list=sentence_list, - language_script=language_script, logger=logger, **kwargs) - logger.debug('%s: +++ Completed: add_training_data_elastic_search() +++' % log_prefix) + sentences=sentences, + logger=logger, **kwargs) + logger.debug(f'[{log_prefix}] Completed: add_training_data_elastic_search()') + logger.debug(f'[{log_prefix}] Completed: external_api_training_data_entity_update()') -def add_training_data_elastic_search( - connection, index_name, doc_type, entity_name, entity_list, - sentence_list, language_script, logger, **kwargs -): + +def add_crf_training_data_elastic_search(connection: Elasticsearch, + index_name: str, + doc_type: str, entity_name: str, + sentences: Dict[str, List[Dict[str, str]]], + logger, **kwargs): """ Adds all sentences and the corresponding entities to the 
specified index. If the same named entity is found a delete followed by an update is triggered + Args: - connection: Elasticsearch client object - index_name (str): The name of the index - doc_type (str): The type of the documents being indexed - entity_name (str): Name of the entity for which the training data has to be populated - entity_list (list): List consisting of the entities corresponding to the sentence_list - sentence_list (list): List of sentences for training - logger: logging object to log at debug and exception level - language_script (str): Language code of the entity script - kwargs: - Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk - Example of underlying index query - {'_index': 'training_index', - 'entity_data': 'name', - 'sentence': ['My name is Ajay and this is my friend Hardik'], - 'entities': ['Ajay', 'Hardik'], - 'language_script': 'en', - '_type': 'training_index', - '_op_type': 'index' - } + connection (Elasticsearch): Description + index_name (str): Description + doc_type (str): Description + entity_name (str): Description + sentences (Dict[str, List[Dict[str, str]]]): Description + logger (TYPE): Description + **kwargs: Description + Example of underlying index query + {'_index': 'training_index', + 'entity_data': 'name', + 'sentence': ['My name is Ajay and this is my friend Hardik'], + 'entities': ['Ajay', 'Hardik'], + 'language_script': 'en', + '_type': 'training_index', + '_op_type': 'index' + } """ - str_query = [] - for sentence, entities in zip(sentence_list, entity_list): - query_dict = {'_index': index_name, - 'entity_data': entity_name, - 'sentence': sentence, - 'entities': entities, - 'language_script': language_script, - '_type': doc_type, - '_op_type': 'index' - } - str_query.append(query_dict) - if len(str_query) > constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE: - result = helpers.bulk(connection, str_query, stats_only=True, **kwargs) - logger.debug('%s: \t++ %s status %s ++' 
% (log_prefix, entity_name, result)) - str_query = [] - if str_query: - result = helpers.bulk(connection, str_query, stats_only=True, **kwargs) - logger.debug('%s: \t++ %s status %s ++' % (log_prefix, entity_name, result)) + queries = [] + for language, sentences in sentences.items(): + for sentence in sentences: + query_dict = {'_index': index_name, + 'entity_data': entity_name, + 'sentence': sentence['sentence'], + 'entities': sentence['entities'], + 'language_script': language, + '_type': doc_type, + '_op_type': 'index' + } + queries.append(query_dict) + if len(queries) > constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE: + result = helpers.bulk(connection, queries, stats_only=True, **kwargs) + logger.debug(f'[{log_prefix}] Insert: {entity_name} with status {result}') + queries = [] + if queries: + result = helpers.bulk(connection, queries, stats_only=True, **kwargs) + logger.debug(f'[{log_prefix}] Insert: {entity_name} with status {result}') def delete_entity_data_by_values(connection, index_name, doc_type, entity_name, values=None, **kwargs): diff --git a/external_api/api.py b/external_api/api.py index b5dcc7b39..6a64feb7f 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -11,7 +11,7 @@ from chatbot_ner.config import ner_logger from external_api.constants import ENTITY_DATA, ENTITY_NAME, LANGUAGE_SCRIPT, ENTITY_LIST, \ EXTERNAL_API_DATA, SENTENCE_LIST, READ_MODEL_FROM_S3, ES_CONFIG, READ_EMBEDDINGS_FROM_REMOTE_URL, \ - LIVE_CRF_MODEL_PATH + LIVE_CRF_MODEL_PATH, SENTENCES from django.views.decorators.csrf import csrf_exempt from models.crf_v2.crf_train import CrfTrain @@ -186,15 +186,10 @@ def update_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA)) + sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) - entity_list = external_api_data.get(ENTITY_LIST) - sentence_list = 
external_api_data.get(SENTENCE_LIST) - language_script = external_api_data.get(LANGUAGE_SCRIPT) - datastore_obj = DataStore() - datastore_obj.update_entity_crf_data(entity_name=entity_name, - entity_list=entity_list, - sentence_list=sentence_list, - language_script=language_script) + DataStore().update_entity_crf_data(entity_name=entity_name, + sentences=sentences) response['success'] = True except (DataStoreSettingsImproperlyConfiguredException, diff --git a/external_api/constants.py b/external_api/constants.py index 6481b360a..cd4d020e6 100644 --- a/external_api/constants.py +++ b/external_api/constants.py @@ -1,9 +1,12 @@ ENTITY_NAME = 'entity_name' EXTERNAL_API_DATA = 'external_api_data' ENTITY_DATA = 'entity_data' +SENTENCES = 'sentences' + LANGUAGE_SCRIPT = 'language_script' ENTITY_LIST = 'entity_list' SENTENCE_LIST = 'sentence_list' + READ_MODEL_FROM_S3 = 'read_model_from_s3' ES_CONFIG = 'es_config' READ_EMBEDDINGS_FROM_REMOTE_URL = 'read_embeddings_from_remote_url' From 1ce6de7a4eee0626269a2a7300a5904070ced08e Mon Sep 17 00:00:00 2001 From: psiyan Date: Mon, 15 Jul 2019 14:43:26 +0530 Subject: [PATCH 014/237] Fix lint errors. --- datastore/elastic_search/populate.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index b2a943a8c..d66162e6a 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -285,14 +285,14 @@ def delete_entity_crf_data(connection: Elasticsearch, index_name: str, doc_type: str, entity_name: str, languages: List[str]): """Delete CRF data for the given entity and languages. 
- + Args: connection (Elasticsearch): Elasticsearch client object index_name (str): name of the index doc_type (str): type of the documents being indexed entity_name (str): ame of the entity for which the training data has to be deleted languages (List[str]): list of language codes for which data needs to be deleted - + Returns: TYPE: Description """ @@ -346,11 +346,11 @@ def update_entity_crf_data_populate(connection: Elasticsearch, logger.debug(f'[{log_prefix}] Started: add_training_data_elastic_search()') add_crf_training_data_elastic_search(connection=connection, - index_name=index_name, - doc_type=doc_type, - entity_name=entity_name, - sentences=sentences, - logger=logger, **kwargs) + index_name=index_name, + doc_type=doc_type, + entity_name=entity_name, + sentences=sentences, + logger=logger, **kwargs) logger.debug(f'[{log_prefix}] Completed: add_training_data_elastic_search()') logger.debug(f'[{log_prefix}] Completed: external_api_training_data_entity_update()') From 02b16e4b264c350ca6c822544eddf435f5335164 Mon Sep 17 00:00:00 2001 From: psiyan Date: Mon, 15 Jul 2019 14:46:11 +0530 Subject: [PATCH 015/237] Improve typing specification for sentences. 
--- datastore/datastore.py | 5 +++-- datastore/elastic_search/populate.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index b17277b81..45582d04c 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import warnings -from typing import Dict, List +from typing import Dict, List, Union from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import elastic_search @@ -629,7 +629,8 @@ def get_crf_data_for_entity_name(self, entity_name, **kwargs): ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name)) return results_dictionary - def update_entity_crf_data(self, entity_name, sentences: Dict[str, List], **kwargs): + def update_entity_crf_data(self, entity_name, + sentences: Dict[str, List[Dict[str, Union[str, List[str]]]]], **kwargs): """ This method is used to populate the training data for a given entity diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index d66162e6a..8c8692da7 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -4,7 +4,7 @@ import os from collections import defaultdict # Local imports -from typing import List, Dict +from typing import List, Dict, Union # 3rd party imports from elasticsearch import helpers, Elasticsearch @@ -359,7 +359,7 @@ def update_entity_crf_data_populate(connection: Elasticsearch, def add_crf_training_data_elastic_search(connection: Elasticsearch, index_name: str, doc_type: str, entity_name: str, - sentences: Dict[str, List[Dict[str, str]]], + sentences: Dict[str, List[Dict[str, Union[str, List[str]]]]], logger, **kwargs): """ Adds all sentences and the corresponding entities to the specified index. 
From 39fa7f3d7bf319f4148abfd5a37d69687c14443a Mon Sep 17 00:00:00 2001 From: psiyan Date: Mon, 15 Jul 2019 16:45:51 +0530 Subject: [PATCH 016/237] Update the CRF GET API to support language codes. --- datastore/datastore.py | 36 +++++++++++++++------------ datastore/elastic_search/query.py | 41 ++++++++++++++++++++----------- external_api/api.py | 5 ++-- external_api/constants.py | 1 + 4 files changed, 51 insertions(+), 32 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 45582d04c..0a09ee21e 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -576,37 +576,40 @@ def transfer_entities_elastic_search(self, entity_list): es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=destination) es_object.transfer_specific_entities(list_of_entities=entity_list) - def get_crf_data_for_entity_name(self, entity_name, **kwargs): + def get_crf_data_for_entity_name(self, entity_name: str, languages: List[str], **kwargs): """ This method is used to obtain the sentences and entities from sentences given entity name + Args: entity_name (str): Entity name for which training data needs to be obtained - kwargs: - For Elasticsearch: - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + languages (List[str]): list of languges codes for which data is requested + **kwargs: For Elasticsearch: + Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + Returns: results_dictionary(dict): Dictionary consisting of the training data for the the given entity. 
Raises: - IndexNotFoundException if es_training_index was not found in connection settings + IndexNotFoundException: Description + IndexNotFoundException if es_training_index was not found in connection settings Example: db = Datastore() db.get_entity_training_data(entity_name, **kwargs): >> { - 'sentence_list': [ - 'My name is hardik', - 'This is my friend Ajay' + 'sentence_list': [ + 'My name is hardik', + 'This is my friend Ajay' + ], + 'entity_list': [ + [ + 'hardik' ], - 'entity_list': [ - [ - 'hardik' - ], - [ - 'Ajay' - ] + [ + 'Ajay' ] - } + ] + } """ ner_logger.debug('Datastore, get_entity_training_data, entity_name %s' % entity_name) if self._client_or_connection is None: @@ -624,6 +627,7 @@ def get_crf_data_for_entity_name(self, entity_name, **kwargs): index_name=es_training_index, doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE], entity_name=entity_name, + languages=languages, request_timeout=request_timeout, **kwargs) ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name)) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index b6cbbac14..ab30e327f 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -6,10 +6,12 @@ import json import re import warnings +# Local imports +from typing import List +from elasticsearch import Elasticsearch from six import string_types -# Local imports from datastore import constants from external_api.constants import SENTENCE_LIST, ENTITY_LIST from language_utilities.constant import ENGLISH_LANG @@ -533,17 +535,18 @@ def _parse_es_search_results(results_list): return variants_to_values_list -def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, **kwargs): +def get_crf_data_for_entity_name(connection: Elasticsearch, index_name: str, doc_type: str, entity_name: str, + languages: List[str], **kwargs): """ Get all sentence_list and entity_list for a entity stored in the 
index Args: - connection: Elasticsearch client object - index_name: The name of the index - doc_type: The type of the documents that will be indexed - entity_name: name of the entity to perform a 'term' query on - kwargs: - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search + connection (Elasticsearch): Elasticsearch client object + index_name (str): The name of the index + doc_type (str): The type of the documents that will be indexed + entity_name (str): name of the entity to perform a 'term' query on + languages (List[str]): list of languages for which to fetch sentences + **kwargs: optional kwargs for es Returns: dictionary, search results of the 'term' query on entity_name, mapping keys to lists containing @@ -564,15 +567,25 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, 'Ajay' ] ] - } - + } """ results_dictionary = {SENTENCE_LIST: [], ENTITY_LIST: []} data = { - 'query': { - 'term': { - 'entity_data': { - 'value': entity_name + "query": { + "bool": { + "must": [ + { + "term": { + "entity_data": { + "value": entity_name + } + } + } + ], + "filter": { + "terms": { + "language_script": languages + } } } } diff --git a/external_api/api.py b/external_api/api.py index 6a64feb7f..78d718024 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -11,7 +11,7 @@ from chatbot_ner.config import ner_logger from external_api.constants import ENTITY_DATA, ENTITY_NAME, LANGUAGE_SCRIPT, ENTITY_LIST, \ EXTERNAL_API_DATA, SENTENCE_LIST, READ_MODEL_FROM_S3, ES_CONFIG, READ_EMBEDDINGS_FROM_REMOTE_URL, \ - LIVE_CRF_MODEL_PATH, SENTENCES + LIVE_CRF_MODEL_PATH, SENTENCES, LANGUAGES from django.views.decorators.csrf import csrf_exempt from models.crf_v2.crf_train import CrfTrain @@ -149,8 +149,9 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) + languages = request.GET.get(LANGUAGES, []) datastore_obj 
= DataStore() - result = datastore_obj.get_crf_data_for_entity_name(entity_name=entity_name) + result = datastore_obj.get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) response['result'] = result response['success'] = True diff --git a/external_api/constants.py b/external_api/constants.py index cd4d020e6..eb254e71a 100644 --- a/external_api/constants.py +++ b/external_api/constants.py @@ -2,6 +2,7 @@ EXTERNAL_API_DATA = 'external_api_data' ENTITY_DATA = 'entity_data' SENTENCES = 'sentences' +LANGUAGES = 'languages' LANGUAGE_SCRIPT = 'language_script' ENTITY_LIST = 'entity_list' From bc45dcf0cc2b7ea7cf9bd92119e90b6d2f830998 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 11:51:59 +0530 Subject: [PATCH 017/237] Remove PY3 specific code. --- datastore/datastore.py | 7 +++--- datastore/elastic_search/populate.py | 36 ++++++++++------------------ datastore/elastic_search/query.py | 9 ++++--- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index 0a09ee21e..c716fe56a 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -576,7 +576,7 @@ def transfer_entities_elastic_search(self, entity_list): es_object = elastic_search.transfer.ESTransfer(source=es_url, destination=destination) es_object.transfer_specific_entities(list_of_entities=entity_list) - def get_crf_data_for_entity_name(self, entity_name: str, languages: List[str], **kwargs): + def get_crf_data_for_entity_name(self, entity_name, languages, **kwargs): """ This method is used to obtain the sentences and entities from sentences given entity name @@ -633,14 +633,13 @@ def get_crf_data_for_entity_name(self, entity_name: str, languages: List[str], * ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name)) return results_dictionary - def update_entity_crf_data(self, entity_name, - sentences: Dict[str, List[Dict[str, Union[str, List[str]]]]], **kwargs): + def 
update_entity_crf_data(self, entity_name, sentences, **kwargs): """ This method is used to populate the training data for a given entity Args: entity_name (str): Name of the entity for which the training data has to be populated - sentences (Dict[str, List]): sentences mapped against their languages + sentences (Dict[str, List[Dict[str, str]]]: sentences mapped against their languages **kwargs: For Elasticsearch: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 8c8692da7..6092f90ff 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -281,9 +281,7 @@ def entity_data_update(connection, index_name, doc_type, entity_data, entity_nam logger.debug('%s: +++ Completed: add_data_elastic_search() +++' % log_prefix) -def delete_entity_crf_data(connection: Elasticsearch, index_name: str, - doc_type: str, entity_name: str, - languages: List[str]): +def delete_entity_crf_data(connection, index_name, doc_type, entity_name, languages): """Delete CRF data for the given entity and languages. Args: @@ -317,13 +315,7 @@ def delete_entity_crf_data(connection: Elasticsearch, index_name: str, return connection.delete_by_query(index=index_name, body=query, doc_type=doc_type) -def update_entity_crf_data_populate(connection: Elasticsearch, - index_name: str, - doc_type: str, - entity_name: str, - sentences: Dict[str, List], - logger, - **kwargs): +def update_entity_crf_data_populate(connection, index_name, doc_type, entity_name, sentences, logger, **kwargs): """ This method is used to populate the elastic search training data. 
@@ -332,35 +324,31 @@ def update_entity_crf_data_populate(connection: Elasticsearch, index_name (str): name of the index doc_type (str): type of the documents being indexed entity_name (str): name of the entity for which the training data has to be populated - sentences (List[Dict[str, List]]): sentences collected per language + sentences (Dict[str, List[Dict[str, str]]]): sentences collected per language logger: logging object **kwargs: Refer http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.bulk """ - logger.debug(f'[{log_prefix}] Started: external_api_training_data_entity_update()') + logger.debug('[{0}] Started: external_api_training_data_entity_update()'.format(log_prefix)) - logger.debug(f'[{log_prefix}] Started: delete_entity_crf_data()') + logger.debug('[{0}] Started: delete_entity_crf_data()'.format(log_prefix)) languages = list(sentences.keys()) delete_entity_crf_data(connection=connection, index_name=index_name, doc_type=doc_type, entity_name=entity_name, languages=languages) - logger.debug(f'[{log_prefix}] Completed: delete_entity_crf_data()') + logger.debug('[{0}] Completed: delete_entity_crf_data()'.format(log_prefix)) - logger.debug(f'[{log_prefix}] Started: add_training_data_elastic_search()') + logger.debug('[{0}] Started: add_training_data_elastic_search()'.format(log_prefix)) add_crf_training_data_elastic_search(connection=connection, index_name=index_name, doc_type=doc_type, entity_name=entity_name, sentences=sentences, logger=logger, **kwargs) - logger.debug(f'[{log_prefix}] Completed: add_training_data_elastic_search()') + logger.debug('[{0}] Completed: add_training_data_elastic_search()'.format(log_prefix)) - logger.debug(f'[{log_prefix}] Completed: external_api_training_data_entity_update()') + logger.debug('[{0}] Completed: external_api_training_data_entity_update()'.format(log_prefix)) -def add_crf_training_data_elastic_search(connection: Elasticsearch, - index_name: str, - doc_type: str, entity_name: str, 
- sentences: Dict[str, List[Dict[str, Union[str, List[str]]]]], - logger, **kwargs): +def add_crf_training_data_elastic_search(connection, index_name, doc_type, entity_name, sentences, logger, **kwargs): """ Adds all sentences and the corresponding entities to the specified index. If the same named entity is found a delete followed by an update is triggered @@ -397,11 +385,11 @@ def add_crf_training_data_elastic_search(connection: Elasticsearch, queries.append(query_dict) if len(queries) > constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE: result = helpers.bulk(connection, queries, stats_only=True, **kwargs) - logger.debug(f'[{log_prefix}] Insert: {entity_name} with status {result}') + logger.debug('[{0}] Insert: {1} with status {2}'.format(log_prefix, entity_name, result)) queries = [] if queries: result = helpers.bulk(connection, queries, stats_only=True, **kwargs) - logger.debug(f'[{log_prefix}] Insert: {entity_name} with status {result}') + logger.debug('[{0}] Insert: {1} with status {2}'.format(log_prefix, entity_name, result)) def delete_entity_data_by_values(connection, index_name, doc_type, entity_name, values=None, **kwargs): diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index ab30e327f..edccc756d 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -535,8 +535,7 @@ def _parse_es_search_results(results_list): return variants_to_values_list -def get_crf_data_for_entity_name(connection: Elasticsearch, index_name: str, doc_type: str, entity_name: str, - languages: List[str], **kwargs): +def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, languages, **kwargs): """ Get all sentence_list and entity_list for a entity stored in the index @@ -590,7 +589,11 @@ def get_crf_data_for_entity_name(connection: Elasticsearch, index_name: str, doc } } } - kwargs = dict(kwargs, body=data, doc_type=doc_type, size=constants.ELASTICSEARCH_SEARCH_SIZE, index=index_name, + kwargs 
= dict(kwargs, + body=data, + doc_type=doc_type, + size=constants.ELASTICSEARCH_SEARCH_SIZE, + index=index_name, scroll='1m') search_results = _run_es_search(connection, **kwargs) From 30cb051214242ed1af914462e83d0ab10e7676d6 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 12:00:34 +0530 Subject: [PATCH 018/237] remove trailing ws --- datastore/elastic_search/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index edccc756d..b0856b71e 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -565,8 +565,8 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, [ 'Ajay' ] - ] - } + ] + } """ results_dictionary = {SENTENCE_LIST: [], ENTITY_LIST: []} data = { From 8c4b8f748dfa651431275df04ee05f921e1b8e8f Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 12:03:22 +0530 Subject: [PATCH 019/237] remove typing --- datastore/datastore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datastore/datastore.py b/datastore/datastore.py index c716fe56a..270dfe73f 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import warnings -from typing import Dict, List, Union from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from datastore import elastic_search From 25b39a835c982efdcd30ddb01a3ae92e4c907639 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 12:05:53 +0530 Subject: [PATCH 020/237] remove typing --- datastore/elastic_search/populate.py | 4 ++-- datastore/elastic_search/query.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 6092f90ff..0192872cd 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -3,8 +3,6 @@ # std imports import os from collections import 
defaultdict -# Local imports -from typing import List, Dict, Union # 3rd party imports from elasticsearch import helpers, Elasticsearch @@ -16,6 +14,8 @@ from language_utilities.constant import ENGLISH_LANG from ner_constants import DICTIONARY_DATA_VARIANTS +# Local imports + log_prefix = 'datastore.elastic_search.populate' diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index b0856b71e..f58d82e86 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -6,8 +6,6 @@ import json import re import warnings -# Local imports -from typing import List from elasticsearch import Elasticsearch from six import string_types @@ -17,6 +15,8 @@ from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER +# Local imports + log_prefix = 'datastore.elastic_search.query' From 7ce0674eef0d175da533843727804154776d7333 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 12:08:50 +0530 Subject: [PATCH 021/237] remove typing --- datastore/elastic_search/populate.py | 2 +- datastore/elastic_search/query.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 0192872cd..910fda233 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -5,7 +5,7 @@ from collections import defaultdict # 3rd party imports -from elasticsearch import helpers, Elasticsearch +from elasticsearch import helpers from chatbot_ner.config import ner_logger from datastore import constants diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index f58d82e86..8666b2e76 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -7,7 +7,6 @@ import re import warnings -from elasticsearch import Elasticsearch from six import string_types from datastore import constants From 4040a9cf708dbcb4d088bb6dfbafcede9352625a Mon Sep 17 
00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 16:02:01 +0530 Subject: [PATCH 022/237] bugfix language codes --- external_api/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external_api/api.py b/external_api/api.py index 78d718024..4ecab19e7 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -149,9 +149,9 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) - languages = request.GET.get(LANGUAGES, []) - datastore_obj = DataStore() - result = datastore_obj.get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) + languages = request.GET.get(LANGUAGES, []).split('') + result = DataStore().get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) + response['result'] = result response['success'] = True From b15bf3613fa848c87511b0af0d2b400832c51c6d Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 16:05:07 +0530 Subject: [PATCH 023/237] bugfix empty separator --- external_api/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/external_api/api.py b/external_api/api.py index 4ecab19e7..ac6286367 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -149,7 +149,8 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) - languages = request.GET.get(LANGUAGES, []).split('') + languages = request.GET.get(LANGUAGES, []).split(',') + result = DataStore().get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) response['result'] = result From 6a1bb43f2bd14871cd24c327049fa00e4eb889e5 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 16:23:31 +0530 Subject: [PATCH 024/237] change POST to body --- external_api/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external_api/api.py b/external_api/api.py index ac6286367..a218a275b 
100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -176,7 +176,7 @@ def update_crf_training_data(request): """ This function is used to update the training data Args: - request (HttpResponse): HTTP response from url + request (HttpRequest): HTTP response from url Returns: HttpResponse : HttpResponse with appropriate status and error message. Example for data present in @@ -187,7 +187,7 @@ def update_crf_training_data(request): """ response = {"success": False, "error": "", "result": []} try: - external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA)) + external_api_data = json.loads(request.body.get(EXTERNAL_API_DATA)) sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) DataStore().update_entity_crf_data(entity_name=entity_name, From dc427d5860868aa266b66283e73a510651ab325b Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 16:24:31 +0530 Subject: [PATCH 025/237] improve body parsing --- external_api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_api/api.py b/external_api/api.py index a218a275b..54a296620 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -187,7 +187,7 @@ def update_crf_training_data(request): """ response = {"success": False, "error": "", "result": []} try: - external_api_data = json.loads(request.body.get(EXTERNAL_API_DATA)) + external_api_data = json.loads(request.body.decode(encoding='UTF-8')) sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) DataStore().update_entity_crf_data(entity_name=entity_name, From 6d748d1ddada8ae5c66d99a233fe8204b63f80ae Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 16:48:54 +0530 Subject: [PATCH 026/237] improve body fetch correct key --- external_api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_api/api.py b/external_api/api.py index 54a296620..43351baf0 100644 --- a/external_api/api.py +++ 
b/external_api/api.py @@ -187,7 +187,7 @@ def update_crf_training_data(request): """ response = {"success": False, "error": "", "result": []} try: - external_api_data = json.loads(request.body.decode(encoding='UTF-8')) + external_api_data = json.loads(request.body.decode(encoding='UTF-8')).get(EXTERNAL_API_DATA) sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) DataStore().update_entity_crf_data(entity_name=entity_name, From b3e5f59dfec4493c0b999b90575c90ba0dd8708d Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:00:20 +0530 Subject: [PATCH 027/237] change response format to account for languages --- datastore/elastic_search/query.py | 16 +++++++++++----- external_api/constants.py | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index 8666b2e76..967e13670 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -10,7 +10,7 @@ from six import string_types from datastore import constants -from external_api.constants import SENTENCE_LIST, ENTITY_LIST +from external_api.constants import SENTENCE_LIST, ENTITY_LIST, LANGUAGE_SCRIPT, SENTENCE from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER @@ -567,7 +567,7 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, ] } """ - results_dictionary = {SENTENCE_LIST: [], ENTITY_LIST: []} + data = { "query": { "bool": { @@ -599,8 +599,14 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, # Parse hits results = search_results['hits']['hits'] + language_mapped_results = collections.defaultdict(list) + for result in results: - results_dictionary[SENTENCE_LIST].append(result['_source']['sentence']) - results_dictionary[ENTITY_LIST].append(result['_source']['entities']) + language_mapped_results[result['_source']['language_script']].append( + { + SENTENCE: 
result['_source']['sentence'], + ENTITY_LIST: result['_source']['entities'] + } + ) - return results_dictionary + return dict(language_mapped_results) diff --git a/external_api/constants.py b/external_api/constants.py index eb254e71a..2a3ca31e2 100644 --- a/external_api/constants.py +++ b/external_api/constants.py @@ -7,6 +7,7 @@ LANGUAGE_SCRIPT = 'language_script' ENTITY_LIST = 'entity_list' SENTENCE_LIST = 'sentence_list' +SENTENCE = 'sentence' READ_MODEL_FROM_S3 = 'read_model_from_s3' ES_CONFIG = 'es_config' From 585f85fc882c0e1dd4722ccba9248b8c8c5251bc Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:00:45 +0530 Subject: [PATCH 028/237] remove unused imports --- datastore/elastic_search/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index 967e13670..9bd8954d2 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -10,7 +10,7 @@ from six import string_types from datastore import constants -from external_api.constants import SENTENCE_LIST, ENTITY_LIST, LANGUAGE_SCRIPT, SENTENCE +from external_api.constants import ENTITY_LIST, SENTENCE from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER From c84dc09f0505b30ed7cc9879658778f5787540b1 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:08:57 +0530 Subject: [PATCH 029/237] change response keys to match with request keys --- datastore/elastic_search/populate.py | 5 +++-- datastore/elastic_search/query.py | 4 ++-- external_api/constants.py | 5 ++++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datastore/elastic_search/populate.py b/datastore/elastic_search/populate.py index 910fda233..4af1a722e 100644 --- a/datastore/elastic_search/populate.py +++ b/datastore/elastic_search/populate.py @@ -11,6 +11,7 @@ from datastore import constants from datastore.elastic_search.query import get_entity_data from 
datastore.utils import get_files_from_directory, read_csv, remove_duplicate_data +from external_api.constants import SENTENCE, ENTITIES from language_utilities.constant import ENGLISH_LANG from ner_constants import DICTIONARY_DATA_VARIANTS @@ -376,8 +377,8 @@ def add_crf_training_data_elastic_search(connection, index_name, doc_type, entit for sentence in sentences: query_dict = {'_index': index_name, 'entity_data': entity_name, - 'sentence': sentence['sentence'], - 'entities': sentence['entities'], + 'sentence': sentence[SENTENCE], + 'entities': sentence[ENTITIES], 'language_script': language, '_type': doc_type, '_op_type': 'index' diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index 9bd8954d2..fd5b50c1f 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -10,7 +10,7 @@ from six import string_types from datastore import constants -from external_api.constants import ENTITY_LIST, SENTENCE +from external_api.constants import SENTENCE, ENTITIES from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER @@ -605,7 +605,7 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, language_mapped_results[result['_source']['language_script']].append( { SENTENCE: result['_source']['sentence'], - ENTITY_LIST: result['_source']['entities'] + ENTITIES: result['_source']['entities'] } ) diff --git a/external_api/constants.py b/external_api/constants.py index 2a3ca31e2..ecbd717fa 100644 --- a/external_api/constants.py +++ b/external_api/constants.py @@ -1,13 +1,16 @@ ENTITY_NAME = 'entity_name' EXTERNAL_API_DATA = 'external_api_data' ENTITY_DATA = 'entity_data' + SENTENCES = 'sentences' LANGUAGES = 'languages' +ENTITIES = 'entities' +SENTENCE = 'sentence' LANGUAGE_SCRIPT = 'language_script' ENTITY_LIST = 'entity_list' SENTENCE_LIST = 'sentence_list' -SENTENCE = 'sentence' + READ_MODEL_FROM_S3 = 'read_model_from_s3' ES_CONFIG = 'es_config' From 
ea8e120656ee48bbd38b499824bf1c4c91c1db99 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:18:57 +0530 Subject: [PATCH 030/237] change default language key --- external_api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_api/api.py b/external_api/api.py index 43351baf0..d44fe50b4 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -149,7 +149,7 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) - languages = request.GET.get(LANGUAGES, []).split(',') + languages = request.GET.get(LANGUAGES, '').split(',') result = DataStore().get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) From bfed8fd6171ea6de8e7dd910c601e2fb22f854ee Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:37:41 +0530 Subject: [PATCH 031/237] add a patch for no languages --- external_api/api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/external_api/api.py b/external_api/api.py index d44fe50b4..addabcc9f 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -136,7 +136,7 @@ def get_crf_training_data(request): """ This function is used obtain the training data given the entity_name. 
Args: - request (HttpResponse): HTTP response from url + request (HttpRequest): HTTP response from url Returns: HttpResponse : With data consisting of a dictionary consisting of sentence_list and entity_list @@ -149,7 +149,9 @@ def get_crf_training_data(request): response = {"success": False, "error": "", "result": []} try: entity_name = request.GET.get(ENTITY_NAME) - languages = request.GET.get(LANGUAGES, '').split(',') + languages = request.GET.get(LANGUAGES, '') + + languages = languages.split(',') if languages else [] result = DataStore().get_crf_data_for_entity_name(entity_name=entity_name, languages=languages) From bc5bd00091fc925f9578298f39bd2d9ff5b453d6 Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 17:44:26 +0530 Subject: [PATCH 032/237] add a patch for no languages --- datastore/elastic_search/query.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index fd5b50c1f..bea35d630 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -579,15 +579,18 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, } } } - ], - "filter": { - "terms": { - "language_script": languages - } - } + ] } } } + + if languages: + data['query']['bool']['filter'] = { + "terms": { + "language_script": languages + } + } + kwargs = dict(kwargs, body=data, doc_type=doc_type, From f5ec05d1ceab281b8dc1aba095be4f5b5138267c Mon Sep 17 00:00:00 2001 From: psiyan Date: Tue, 16 Jul 2019 23:11:41 +0530 Subject: [PATCH 033/237] Revert API body JSON load Revert to old POST encoded data fetch to maintain compatibility with API --- external_api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_api/api.py b/external_api/api.py index addabcc9f..4a96b2e7b 100644 --- a/external_api/api.py +++ b/external_api/api.py @@ -189,7 +189,7 @@ def update_crf_training_data(request): """ response = 
{"success": False, "error": "", "result": []} try: - external_api_data = json.loads(request.body.decode(encoding='UTF-8')).get(EXTERNAL_API_DATA) + external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA)) sentences = external_api_data.get(SENTENCES) entity_name = external_api_data.get(ENTITY_NAME) DataStore().update_entity_crf_data(entity_name=entity_name, From f9157207e8e6c801bc8775fc03437922e0302e0a Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 18 Jul 2019 15:46:25 +0530 Subject: [PATCH 034/237] replace punctuation in original_text while tagging entities --- ner_v1/detectors/textual/text/text_detection.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 2cd2cf7dc..e768be123 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,5 +1,6 @@ import collections import re +import string from six import iteritems @@ -419,7 +420,9 @@ def _text_detection_with_variants(self): if original_text: value_final_list.append(variants_to_values[variant]) original_final_list.append(original_text) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE) + boundary_punct_pattern = re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) + original_text_= boundary_punct_pattern.sub("", original_text) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), re.UNICODE) self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index]) # Instead of dropping completely like in other entities, # we replace with tag to avoid matching non contiguous segments From fb8e88e1b198f84a48dc496dce7de8fc9758fb9d Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Thu, 18 Jul 2019 15:50:23 +0530 Subject: [PATCH 035/237] replace punctuation in original_text while tagging entities --- ner_v1/detectors/textual/text/text_detection.py | 4 
+++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index e768be123..364485cce 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -420,8 +420,10 @@ def _text_detection_with_variants(self): if original_text: value_final_list.append(variants_to_values[variant]) original_final_list.append(original_text) + boundary_punct_pattern = re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) - original_text_= boundary_punct_pattern.sub("", original_text) + original_text_ = boundary_punct_pattern.sub("", original_text) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), re.UNICODE) self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index]) # Instead of dropping completely like in other entities, From 3532e415634f49d3ef57fbceb01e384f71a38921 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Tue, 23 Jul 2019 18:03:49 +0530 Subject: [PATCH 036/237] use regex in place of re module --- ner_v1/detectors/textual/text/text_detection.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 364485cce..3d7de8d79 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,5 +1,5 @@ import collections -import re +import regex as re import string from six import iteritems @@ -11,6 +11,15 @@ from lib.nlp.levenshtein_distance import edit_distance from ner_v1.detectors.base_detector import BaseDetector +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + + import re + _re_flags = re.UNICODE + class TextDetector(BaseDetector): """ @@ -424,7 +433,7 @@ def _text_detection_with_variants(self): boundary_punct_pattern = 
re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation))) original_text_ = boundary_punct_pattern.sub("", original_text) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), re.UNICODE) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags) self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index]) # Instead of dropping completely like in other entities, # we replace with tag to avoid matching non contiguous segments From 4d9bc70f6320f9fff4579a2fbf0e90b3ba892025 Mon Sep 17 00:00:00 2001 From: ashutoshsingh0223 Date: Tue, 23 Jul 2019 18:05:10 +0530 Subject: [PATCH 037/237] use regex in place of re module --- ner_v1/detectors/textual/text/text_detection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 3d7de8d79..4ece23730 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -1,5 +1,4 @@ import collections -import regex as re import string from six import iteritems From a0f340fe467e0215756e5b1e237f0cb216470995 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 12:58:28 +0530 Subject: [PATCH 038/237] added bulk text entity call from fallback values --- ner_v1/chatbot/entity_detection.py | 3 ++- ner_v1/detectors/base_detector.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 94aad4b06..e4f62c94b 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -253,7 +253,8 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message fallback_value=fallback_value, bot_message=bot_message) elif isinstance(message, (list, tuple)): - entity_output = text_model_detector.detect_bulk(messages=message) + entity_output = 
text_model_detector.detect_bulk(messages=message, + fallback_value=fallback_value) return entity_output diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 3c28db1ca..0a6621c55 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -59,6 +59,19 @@ def detect_entity(self, text, **kwargs): """ return [], [] + @abc.abstractmethod + def detect_entity_bulk(self, texts, **kwargs): + """ + This method runs the core entity detection logic defined inside entity detectors + Args: + texts: text snippet from which entities needs to be detected + **kwargs: values specific to different detectors such as 'last bot message', custom configs, etc. + Return: + tuple: Two lists of same length containing detected values and original substring from text which is used + to derive the detected value respectively + """ + return [] * len(texts), [] * len(texts) + def _set_language_processing_script(self): """ This method is used to decide the language in which detector should run it's logic based on @@ -106,8 +119,11 @@ def detect_bulk(self, messages=None, **kwargs): texts = messages entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) + fallback_value_list = kwargs.get('fallback_value') if entities_list: values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list + elif fallback_value_list: + values_list, method, original_texts_list = fallback_value_list, FROM_MESSAGE, original_texts_list else: return None From 5bbb7f7fa0dc611011cabdb0a9a8c2ae3ea1a5fb Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 13:05:08 +0530 Subject: [PATCH 039/237] pass original list same as fallback --- ner_v1/detectors/base_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 0a6621c55..a1d8e7c7e 100644 --- a/ner_v1/detectors/base_detector.py +++ 
b/ner_v1/detectors/base_detector.py @@ -123,7 +123,7 @@ def detect_bulk(self, messages=None, **kwargs): if entities_list: values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list elif fallback_value_list: - values_list, method, original_texts_list = fallback_value_list, FROM_MESSAGE, original_texts_list + values_list, method, original_texts_list = fallback_value_list, FROM_FALLBACK_VALUE, fallback_value_list else: return None From a053476f474ed9c45c51fbf879c3ff8d2f16737a Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 13:09:04 +0530 Subject: [PATCH 040/237] add check to check fallback value list as entity list will never be empty --- ner_v1/detectors/base_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index a1d8e7c7e..be06dafa8 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -120,7 +120,7 @@ def detect_bulk(self, messages=None, **kwargs): entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) fallback_value_list = kwargs.get('fallback_value') - if entities_list: + if entities_list and not fallback_value_list: values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list elif fallback_value_list: values_list, method, original_texts_list = fallback_value_list, FROM_FALLBACK_VALUE, fallback_value_list From 077774dbd1bcc1934c81e30c53f9843ffb2917f1 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 13:16:07 +0530 Subject: [PATCH 041/237] converting fallback value to required format --- ner_v1/detectors/base_detector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index be06dafa8..73b70f5cb 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -123,7 +123,9 @@ def 
detect_bulk(self, messages=None, **kwargs): if entities_list and not fallback_value_list: values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list elif fallback_value_list: - values_list, method, original_texts_list = fallback_value_list, FROM_FALLBACK_VALUE, fallback_value_list + values_list = [[fallback_value] for fallback_value in fallback_value_list] + original_texts_list = values_list + method = FROM_FALLBACK_VALUE else: return None From e51ff7573114cf0a352b91eba5a03058d35d19b6 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 14:08:38 +0530 Subject: [PATCH 042/237] removed abstract method --- ner_v1/detectors/base_detector.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 73b70f5cb..3aeacc580 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -59,19 +59,6 @@ def detect_entity(self, text, **kwargs): """ return [], [] - @abc.abstractmethod - def detect_entity_bulk(self, texts, **kwargs): - """ - This method runs the core entity detection logic defined inside entity detectors - Args: - texts: text snippet from which entities needs to be detected - **kwargs: values specific to different detectors such as 'last bot message', custom configs, etc. 
- Return: - tuple: Two lists of same length containing detected values and original substring from text which is used - to derive the detected value respectively - """ - return [] * len(texts), [] * len(texts) - def _set_language_processing_script(self): """ This method is used to decide the language in which detector should run it's logic based on From cde2a70ee47e9bb74889f756e44e4280026914ad Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 15:54:50 +0530 Subject: [PATCH 043/237] fix cases where detection can be possible through both message and from fallback --- ner_v1/chatbot/entity_detection.py | 3 +-- ner_v1/detectors/base_detector.py | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index e4f62c94b..814cc3a5f 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -253,8 +253,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message fallback_value=fallback_value, bot_message=bot_message) elif isinstance(message, (list, tuple)): - entity_output = text_model_detector.detect_bulk(messages=message, - fallback_value=fallback_value) + entity_output = text_model_detector.detect_bulk(messages=message, fallback_values=fallback_value) return entity_output diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 3aeacc580..298f4e47b 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -106,18 +106,25 @@ def detect_bulk(self, messages=None, **kwargs): texts = messages entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) - fallback_value_list = kwargs.get('fallback_value') - if entities_list and not fallback_value_list: - values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list - elif fallback_value_list: - values_list = [[fallback_value] for 
fallback_value in fallback_value_list] - original_texts_list = values_list - method = FROM_FALLBACK_VALUE - else: - return None + fallback_value_list = kwargs.get('fallback_values') + values_list, detection_method_list, original_list = [], [], [] + + for i in range(len(messages)): + if entities_list[i]: + values_list.append(entities_list[i]) + detection_method_list.append(FROM_MESSAGE) + original_list.append(original_texts_list[i]) + elif fallback_value_list[i]: + values_list.append([fallback_value_list[i]]) + detection_method_list.append(FROM_FALLBACK_VALUE) + original_list.append([fallback_value_list[i]]) + else: + values_list.append([]) + detection_method_list.append([]) + original_list.append([]) return self.output_entity_bulk(entity_values_list=values_list, original_texts_list=original_texts_list, - detection_method=method, + detection_method_list=detection_method_list, detection_language=self._target_language_script) def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs): From 04b76795bf4b3f6bb6f31eb9ce41f7735c6b50a3 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 16:03:47 +0530 Subject: [PATCH 044/237] fix issue with detection method --- ner_v1/detectors/base_detector.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 298f4e47b..f77c469f1 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -104,23 +104,25 @@ def detect_bulk(self, messages=None, **kwargs): messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') texts = messages - entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) + entities_list, original_list = self.detect_entity_bulk(texts=texts) - fallback_value_list = kwargs.get('fallback_values') - values_list, detection_method_list, original_list = [], [], [] + fallback_values = 
kwargs.get('fallback_values') + values_list, detection_method_list, original_texts_list = [], [], [] for i in range(len(messages)): if entities_list[i]: values_list.append(entities_list[i]) detection_method_list.append(FROM_MESSAGE) original_list.append(original_texts_list[i]) - elif fallback_value_list[i]: - values_list.append([fallback_value_list[i]]) + + elif fallback_values and fallback_values[i]: + values_list.append([fallback_values[i]]) detection_method_list.append(FROM_FALLBACK_VALUE) - original_list.append([fallback_value_list[i]]) + original_list.append([fallback_values[i]]) + else: values_list.append([]) - detection_method_list.append([]) + detection_method_list.append(None) original_list.append([]) return self.output_entity_bulk(entity_values_list=values_list, original_texts_list=original_texts_list, From df7d5e1e17238f77255f6cf15a3043f5a792d866 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Thu, 25 Jul 2019 16:33:51 +0530 Subject: [PATCH 045/237] fix issue --- ner_v1/detectors/base_detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index f77c469f1..c7dc6b4c3 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -113,17 +113,17 @@ def detect_bulk(self, messages=None, **kwargs): if entities_list[i]: values_list.append(entities_list[i]) detection_method_list.append(FROM_MESSAGE) - original_list.append(original_texts_list[i]) + original_texts_list.append(original_list[i]) elif fallback_values and fallback_values[i]: values_list.append([fallback_values[i]]) detection_method_list.append(FROM_FALLBACK_VALUE) - original_list.append([fallback_values[i]]) + original_texts_list.append([fallback_values[i]]) else: values_list.append([]) detection_method_list.append(None) - original_list.append([]) + original_texts_list.append([]) return self.output_entity_bulk(entity_values_list=values_list, 
original_texts_list=original_texts_list, detection_method_list=detection_method_list, @@ -271,7 +271,7 @@ def output_entity_bulk(self, entity_values_list, original_texts_list, detection_ entity_value = { ENTITY_VALUE_DICT_KEY: entity_value } - method = detection_method_list[i] if detection_method_list else detection_method + method = detection_method_list[index] if detection_method_list else detection_method entity_list.append( { ENTITY_VALUE: entity_value, From 384d6dc7155370eaf5742029bc5597b8eec95f84 Mon Sep 17 00:00:00 2001 From: krupalmodi18 Date: Sun, 28 Jul 2019 09:25:52 +0530 Subject: [PATCH 046/237] Adding unit channel number --- ner_v2/detectors/numeral/number/en/data/units.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index f53b73cf3..fb73a1295 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -6,4 +6,5 @@ package_metric_unit,gms,gms | grams | gram | gm | g package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres package_metric_unit,ltr,ltr | litre | liter | litres | liters | l -package_metric_unit,pcs,pcs | pc | pieces | piece \ No newline at end of file +package_metric_unit,pcs,pcs | pc | pieces | piece +channel_number,channel,channel |channel number | chanel | chanel number | open | go to \ No newline at end of file From 8583b4d74893935310daff505ba7a38273a2260f Mon Sep 17 00:00:00 2001 From: krupalmodi18 Date: Sun, 28 Jul 2019 09:28:13 +0530 Subject: [PATCH 047/237] Adding season and episode --- ner_v2/detectors/numeral/number/en/data/units.csv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index fb73a1295..48d32bd76 100644 --- 
a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -7,4 +7,6 @@ package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres package_metric_unit,ltr,ltr | litre | liter | litres | liters | l package_metric_unit,pcs,pcs | pc | pieces | piece -channel_number,channel,channel |channel number | chanel | chanel number | open | go to \ No newline at end of file +TV,channel,channel |channel number | chanel | chanel number | open | go to +TV,episode, episode +TV,season, season \ No newline at end of file From 7e3c95478aeef79f65253d544a11a9a4a9cfa9b1 Mon Sep 17 00:00:00 2001 From: krupalmodi18 Date: Sun, 28 Jul 2019 10:23:24 +0530 Subject: [PATCH 048/237] separating channel season episode --- ner_v2/detectors/numeral/number/en/data/units.csv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index 48d32bd76..aeda0096d 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -7,6 +7,6 @@ package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres package_metric_unit,ltr,ltr | litre | liter | litres | liters | l package_metric_unit,pcs,pcs | pc | pieces | piece -TV,channel,channel |channel number | chanel | chanel number | open | go to -TV,episode, episode -TV,season, season \ No newline at end of file +channel,channel,channel |channel number | chanel | chanel number | open | go to +episode,episode, episode | episod +season,season, season | seasn \ No newline at end of file From eb7d6be85eecb2cf946bf180c61026c419773d3b Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Mon, 29 Jul 2019 17:16:41 +0530 Subject: [PATCH 049/237] fix single char scale detected as number --- 
ner_v2/detectors/numeral/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 4787fe8a5..3ac70cfaf 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -24,6 +24,9 @@ def get_number_from_number_word(text, number_word_dict): detected_number_list = [] detected_original_text_list = [] + # exclude single char scales word from word number map dict + number_word_dict = {word: number_map for word, number_map in number_word_dict.items() + if len(word) > 1 and number_map.unit == 0} text = text.strip() if not text: return detected_number_list, detected_original_text_list @@ -72,7 +75,8 @@ def get_number_from_number_word(text, number_word_dict): result = current = 0 result_text, current_text = '', '' - # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) + # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) and + # exclude cases like 'm' (for million) or 'k' (thousand) current = 1 if (scale > 0 and current == 0 and increment == 0) else current current = current * scale + increment current_text += part From 550522a7ed5eacfb2e40bc49ebed23846a160437 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Mon, 29 Jul 2019 17:21:04 +0530 Subject: [PATCH 050/237] updated doc string --- ner_v2/detectors/numeral/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 3ac70cfaf..74142ff6b 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -11,8 +11,12 @@ def get_number_from_number_word(text, number_word_dict): detected_number_list (list): list of numeric value detected from text detected_original_text_list (list): list of original text for numeric value detected Examples: - [In] >> number_word_dict = {'one': (1, 1), 
'two': (1, 2), 'three': (1, 3), 'thousand': (1000, 0), - 'four': (1, 4), 'hundred': (100, 0) + [In] >> number_word_dict = {'one': NumberVariant(scale=1, increment=1), + 'two': NumberVariant(scale=1, increment=2), + 'three': NumberVariant(scale=1, increment=3), + 'thousand': NumberVariant(scale=1000, increment=0), + 'four': NumberVariant(scale=1, increment=4), + 'hundred': NumberVariant(scale=100, increment=0) } [In] >> _get_number_from_numerals('one thousand two', number_word_dict) [Out] >> (['1002'], ['one thousand two']) @@ -26,7 +30,7 @@ def get_number_from_number_word(text, number_word_dict): # exclude single char scales word from word number map dict number_word_dict = {word: number_map for word, number_map in number_word_dict.items() - if len(word) > 1 and number_map.unit == 0} + if len(word) > 1 and number_map.increment == 0} text = text.strip() if not text: return detected_number_list, detected_original_text_list From 68b5313680fb9fab9857628e9ffda8f0d7628854 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Mon, 29 Jul 2019 17:52:31 +0530 Subject: [PATCH 051/237] fix if condition to include all unit words --- ner_v2/detectors/numeral/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 74142ff6b..e25eead95 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -30,7 +30,7 @@ def get_number_from_number_word(text, number_word_dict): # exclude single char scales word from word number map dict number_word_dict = {word: number_map for word, number_map in number_word_dict.items() - if len(word) > 1 and number_map.increment == 0} + if (len(word) > 1 and number_map.scale == 0) or number_map.scale == 1} text = text.strip() if not text: return detected_number_list, detected_original_text_list From e1626483b1c12941c542509eae3005710f450f5e Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Mon, 29 Jul 2019 17:54:47 +0530 Subject: 
[PATCH 052/237] fix issue --- ner_v2/detectors/numeral/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index e25eead95..04c23f2f9 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -30,7 +30,7 @@ def get_number_from_number_word(text, number_word_dict): # exclude single char scales word from word number map dict number_word_dict = {word: number_map for word, number_map in number_word_dict.items() - if (len(word) > 1 and number_map.scale == 0) or number_map.scale == 1} + if (len(word) > 1 and number_map.increment == 0) or number_map.scale == 1} text = text.strip() if not text: return detected_number_list, detected_original_text_list From 9d317729acce1e76af4b5a5b8503af662bd98a4d Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Mon, 29 Jul 2019 17:57:55 +0530 Subject: [PATCH 053/237] fix comments --- ner_v2/detectors/numeral/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index 04c23f2f9..a00a08ab4 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -79,8 +79,7 @@ def get_number_from_number_word(text, number_word_dict): result = current = 0 result_text, current_text = '', '' - # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) and - # exclude cases like 'm' (for million) or 'k' (thousand) + # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) current = 1 if (scale > 0 and current == 0 and increment == 0) else current current = current * scale + increment current_text += part From 8c792c28abb849f4cb8cced9c86ae1d4837c3d70 Mon Sep 17 00:00:00 2001 From: krupalmodi18 Date: Tue, 30 Jul 2019 20:20:41 +0530 Subject: [PATCH 054/237] adding channel number unit for hindi and guajrati --- 
ner_v2/detectors/numeral/number/gu/data/units.csv | 1 + ner_v2/detectors/numeral/number/hi/data/units.csv | 1 + 2 files changed, 2 insertions(+) diff --git a/ner_v2/detectors/numeral/number/gu/data/units.csv b/ner_v2/detectors/numeral/number/gu/data/units.csv index b71974bc1..0c260a36f 100644 --- a/ner_v2/detectors/numeral/number/gu/data/units.csv +++ b/ner_v2/detectors/numeral/number/gu/data/units.csv @@ -1,3 +1,4 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ |રૂપિયા | ભારતીય રૂપિયા | પૈસા currency,dollar,Dollar | usd | $ | ડોલર +channel,channel, ચનેલ | ચેનલ | ચણેલ | નંબર \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number/hi/data/units.csv b/ner_v2/detectors/numeral/number/hi/data/units.csv index 53b9c82ca..17c699daa 100644 --- a/ner_v2/detectors/numeral/number/hi/data/units.csv +++ b/ner_v2/detectors/numeral/number/hi/data/units.csv @@ -7,3 +7,4 @@ package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs | किलोग package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres | मिलीलीटर | मिलिलिटर | मिललिलिटर | मिली लीटर package_metric_unit,ltr,ltr | litre | liter | litres | liters | l | लीटर | लिटर package_metric_unit,pcs,pcs | pc | pieces | piece | पीस | पिस | टुकड़े | टुकड़ा +channel,channel,चैनल नंबर | चॅनेल नंबर | चनेल | चैनल | चैनल | चनाल | चेनल नंबर | चनेल नंबर| चैनल नंबर| चैनल नंबर| चनाल नंबर| चेनल नंबर | नंबर \ No newline at end of file From 9244d0f05d4748a4c85c425febdbb310e44e11ba Mon Sep 17 00:00:00 2001 From: Ruthvik Reddy SL <52532568+ruthviking@users.noreply.github.com> Date: Mon, 26 Aug 2019 12:35:15 +0530 Subject: [PATCH 055/237] Update README.md --- ner_v2/detectors/temporal/date/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/README.md b/ner_v2/detectors/temporal/date/README.md index 66675b7d9..f8da61fc5 100644 --- 
a/ner_v2/detectors/temporal/date/README.md +++ b/ner_v2/detectors/temporal/date/README.md @@ -14,7 +14,7 @@ This is the V2 version of date detector module that will detect date in multiple - **Python Shell** ```python - >> from ner_v2.detector.temporal.date.date_detection import DateDetector + >> from ner_v2.detectors.temporal.date.date_detection import DateDetector >> detector = DateDetector(entity_name='date', language='hi') # here language will be ISO 639-1 code >> detector.detect_entity(text= 'agla mangalvar') >> {'entity_value': [{'dd':12 ,'mm': 10, 'yy': 2018}], 'original_text':['agla mangalvar']} From 7886ade5dbded8beec6c8ee336e8916b6e2355b2 Mon Sep 17 00:00:00 2001 From: ruthviking Date: Fri, 6 Sep 2019 17:54:25 +0530 Subject: [PATCH 056/237] added detect_without_unit for number range patterns having unit type in between min max range and min>max case --- .../numeral/number/number_detection.py | 8 +++++--- .../standard_number_range_detector.py | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ner_v2/detectors/numeral/number/number_detection.py b/ner_v2/detectors/numeral/number/number_detection.py index cbfb0ebb8..56427aeaa 100644 --- a/ner_v2/detectors/numeral/number/number_detection.py +++ b/ner_v2/detectors/numeral/number/number_detection.py @@ -67,7 +67,7 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None): + def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None, detect_without_unit=False): """Initializes a NumberDetector object Args: @@ -90,6 +90,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, unit_type=None): self.max_digit = 6 self.language = language self.unit_type = unit_type + self.detect_without_unit = detect_without_unit try: number_detector_module = importlib.import_module( 'ner_v2.detectors.numeral.number.{0}.number_detection'.format(self.language)) @@ -140,7 +141,8 @@ 
def detect_entity(self, text, **kwargs): number_unit = number_value_dict[NUMBER_DETECTION_RETURN_DICT_UNIT] if self.min_digit <= self._num_digits(number_value) <= self.max_digit: if self.unit_type and (number_unit is None or - self.language_number_detector.units_map[number_unit].type != self.unit_type): + self.language_number_detector.units_map[number_unit].type != self.unit_type)\ + and not self.detect_without_unit: continue validated_number.append(number_value_dict) validated_number_text.append(original_text) @@ -183,4 +185,4 @@ def _num_digits(value): ValueError: if the given string cannot be cast to float """ v = abs(float(value)) - return 1 if int(v) == 0 else (1 + int(math.log10(v))) + return 1 if int(v) == 0 else (1 + int(math.log10(v))) \ No newline at end of file diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 42f158571..875d174ea 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -5,7 +5,6 @@ import collections import os import re - import ner_v2.detectors.numeral.constant as numeral_constant from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string from ner_v2.detectors.numeral.number.number_detection import NumberDetector @@ -36,7 +35,7 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self.tag = '__' + entity_name + '__' self.range_variants_map = {} self.unit_type = unit_type - + self.language=language self.min_range_prefix_variants = None self.min_range_suffix_variants = None self.max_range_prefix_variants = None @@ -44,7 +43,8 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self.min_max_range_variants = None self.number_detected_map = {} - self.number_detector = NumberDetector(entity_name=entity_name, language=language) + 
self.number_detector = NumberDetector(entity_name=entity_name, language=language, unit_type=unit_type, + detect_without_unit=True) self.number_detector.set_min_max_digits(1, 100) # Method to initialise regex params @@ -133,7 +133,7 @@ def _get_number_tag_dict(self): Examples: >>> text = 'I want 12 dozen banana' >>> self._get_number_tag_dict() - {'__number_1': ({'value': 12, 'unit': None}, '12')} + {'__dnumber_1': ({'value': 12, 'unit': None}, '12')} """ detected_number_dict = {} entity_value_list, original_text_list = self.number_detector.detect_entity(self.processed_text) @@ -210,12 +210,19 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): if max_part_match and max_part_match in self.number_detected_map: entity_dict = self.number_detected_map[max_part_match].entity_value entity_value_max = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE] - entity_unit = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] + if not entity_unit: + entity_unit = entity_dict[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] if self.unit_type and ( entity_unit is None or self.number_detector.get_unit_type(entity_unit) != self.unit_type): return number_range, original_text + if min_part_match and max_part_match: + if entity_value_min>entity_value_max: + temp = entity_value_max + entity_value_max = entity_value_min + entity_value_min = temp + original_text = self._get_original_text_from_tagged_text(full_match) if (entity_value_min or entity_value_max) and original_text: self.processed_text = self.processed_text.replace(full_match.strip(), '', 1) From 8db15a0d4e45333eac38a58337568076aee9a90f Mon Sep 17 00:00:00 2001 From: ruthviking Date: Fri, 6 Sep 2019 18:08:05 +0530 Subject: [PATCH 057/237] fix lint errors --- .../numeral/number_range/standard_number_range_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py 
b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 875d174ea..95d96f59a 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -35,7 +35,7 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self.tag = '__' + entity_name + '__' self.range_variants_map = {} self.unit_type = unit_type - self.language=language + self.language = language self.min_range_prefix_variants = None self.min_range_suffix_variants = None self.max_range_prefix_variants = None @@ -218,7 +218,7 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): return number_range, original_text if min_part_match and max_part_match: - if entity_value_min>entity_value_max: + if entity_value_min > entity_value_max: temp = entity_value_max entity_value_max = entity_value_min entity_value_min = temp From bbbe13ef3f3647651c82a57d7e00486df7ae48c5 Mon Sep 17 00:00:00 2001 From: Ruthvik Reddy SL <52532568+ruthvik-17@users.noreply.github.com> Date: Mon, 9 Sep 2019 14:49:29 +0530 Subject: [PATCH 058/237] Update units.csv --- ner_v2/detectors/numeral/number/en/data/units.csv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index aeda0096d..5f44faa37 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -1,6 +1,8 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ -currency,dollar,Dollar | usd | $ +currency,dollar,Dollar | dollars | usd | $ +currency,euro,Euro | euros | eur | € +currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ package_metric_unit,mg,mg | milligram | milligrams | mgs 
package_metric_unit,gms,gms | grams | gram | gm | g package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs @@ -9,4 +11,4 @@ package_metric_unit,ltr,ltr | litre | liter | litres | liters | l package_metric_unit,pcs,pcs | pc | pieces | piece channel,channel,channel |channel number | chanel | chanel number | open | go to episode,episode, episode | episod -season,season, season | seasn \ No newline at end of file +season,season, season | seasn From e568f2137f70c7737ae78b6b203f4a3381f028fe Mon Sep 17 00:00:00 2001 From: Ruthvik Reddy SL <52532568+ruthvik-17@users.noreply.github.com> Date: Mon, 9 Sep 2019 17:50:32 +0530 Subject: [PATCH 059/237] Update units.csv --- ner_v2/detectors/numeral/number/en/data/units.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index 5f44faa37..264e9bf97 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -3,6 +3,8 @@ currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | currency,dollar,Dollar | dollars | usd | $ currency,euro,Euro | euros | eur | € currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ +currency,cent,Cents | cent | c | ¢ +currency,pence,Pence | p package_metric_unit,mg,mg | milligram | milligrams | mgs package_metric_unit,gms,gms | grams | gram | gm | g package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs From bc91238154cf91b4b25fd84d99822405cda7ee4a Mon Sep 17 00:00:00 2001 From: Ruthvik Reddy SL <52532568+ruthvik-17@users.noreply.github.com> Date: Fri, 13 Sep 2019 12:40:04 +0530 Subject: [PATCH 060/237] Update units.csv removed single charecters 'p' and 'c' for pence and cents respectively. 
--- ner_v2/detectors/numeral/number/en/data/units.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index 264e9bf97..ec35b5ab2 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -3,8 +3,8 @@ currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | currency,dollar,Dollar | dollars | usd | $ currency,euro,Euro | euros | eur | € currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ -currency,cent,Cents | cent | c | ¢ -currency,pence,Pence | p +currency,cent,Cents | cent | ¢ +currency,pence,Pence package_metric_unit,mg,mg | milligram | milligrams | mgs package_metric_unit,gms,gms | grams | gram | gm | g package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs From b3a986bf3e2bbc60629debc510ad2e4f6dd25252 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 17 Sep 2019 12:05:06 +0530 Subject: [PATCH 061/237] added new_phone_number_detction.py to check the latency between our current detector and the new one built with phonenumbers library --- .../new_phone_number_detection.py | 81 +++++++++++++++++++ requirements.txt | 1 + 2 files changed, 82 insertions(+) create mode 100644 ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py new file mode 100644 index 000000000..e179a1d88 --- /dev/null +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +from ner_v2.detectors.base_detector import BaseDetector +from ner_v2.detectors.numeral.number.number_detection import NumberDetector +from language_utilities.constant import ENGLISH_LANG +import re +import phonenumbers + + +class 
PhoneDetector(BaseDetector): + """ + This method is used to detect phone numbers present in text. The phone detector takes into + consideration domestic as well as international phone numbers. + + Attributes: + text(str): string provided to extract phone numbers detection + phone (list): list of detected entity values + original_phone_text (list): list to store substrings of the text detected as phone numbers + """ + + def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): + # Todo: Change default from india to get it from the bot. + """ + Args: + entity_name (str): A string by which the detected numbers would be replaced with + on calling detect_entity() + language (str, optional): language code of number text, defaults to 'en' + country_code(str, optional): country code of the country from which you are using + """ + self._supported_languages = NumberDetector.get_supported_languages() + super(PhoneDetector, self).__init__(language, country_code) + self.language = language + self.entity_name = entity_name + self.text = '' + self.phone = [] + self.original_phone_text = [] + self.country_code = country_code + + @property + def supported_languages(self): + """ + This method returns the list of languages supported by entity detectors + Return: + list: List of ISO 639 codes of languages supported by subclass/detector + """ + return self._supported_languages + + def detect_entity(self, text, **kwargs): + """Detects phone numbers in the text string + + Args: + text: string to extract entities from + **kwargs: it can be used to send specific arguments in future. + + Returns: + + self.phone (list): list consisting the detected phone numbers + self.original_phone_text (list): list containing their corresponding substrings in the original message. 
+ + Examples: + + text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' + + p = PhoneDetector(entity_name='phone_number', language='en') + p.detect_entity(text=text) + (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) + + text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' + p = PhoneDetector(entity_name='phone_number', language='hi') + p.detect_entity(text=text) + (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) + + """ + + self.text = text + self.phone, self.original_phone_text = [], [] + for match in phonenumbers.PhoneNumberMatcher(text, self.country_code): + self.phone.append({"country_calling_code": match.number.country_code, + "phone_number": match.number.national_number}) + self.original_phone_text.append(self.text[match.start:match.end]) + + return self.phone, self.original_phone_text diff --git a/requirements.txt b/requirements.txt index c910df5be..b2864fd76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +phonenumbers six==1.11.0 gunicorn==19.6.0 pytz==2014.2 From f7b97617e50c9880127203507ec08314e3d49299 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 17 Sep 2019 12:35:13 +0530 Subject: [PATCH 062/237] removed a bug where letter L is coming along with the national number --- .../pattern/phone_number/new_phone_number_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index e179a1d88..c99c2e07d 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -33,7 +33,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): self.text = '' self.phone = [] self.original_phone_text = [] - self.country_code = country_code + self.country_code = country_code.upper() @property def 
supported_languages(self): @@ -75,7 +75,8 @@ def detect_entity(self, text, **kwargs): self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(text, self.country_code): self.phone.append({"country_calling_code": match.number.country_code, - "phone_number": match.number.national_number}) + "phone_number": match.number.national_number[:-1]}) + # [:-1] is to remove the letter 'L' which is coming along with the number in phonenumberslite library. self.original_phone_text.append(self.text[match.start:match.end]) return self.phone, self.original_phone_text From 2ff7282c999f696dcfb1a4a7e00313783c4c6773 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 17 Sep 2019 12:40:24 +0530 Subject: [PATCH 063/237] added compatability with both phonenumbers and phonenumberslite --- .../pattern/phone_number/new_phone_number_detection.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index c99c2e07d..7003fc449 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -74,9 +74,14 @@ def detect_entity(self, text, **kwargs): self.text = text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(text, self.country_code): + if match.number.national_number[-1:] == 'L': + phone_number = match.number.national_number[:-1] + # [:-1] is to remove the letter 'L' which is coming along with the number in phonenumberslite library. + else: + phone_number = match.number.national_number + # above L bug is not there in normal phonenumbers library. 
self.phone.append({"country_calling_code": match.number.country_code, - "phone_number": match.number.national_number[:-1]}) - # [:-1] is to remove the letter 'L' which is coming along with the number in phonenumberslite library. + "phone_number": phone_number}) self.original_phone_text.append(self.text[match.start:match.end]) return self.phone, self.original_phone_text From fcd5159c62310c08bccc07d42bef664afa87f80b Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 17 Sep 2019 12:59:17 +0530 Subject: [PATCH 064/237] apparently L is for long which is not in python3. removed everything and added --- .../phone_number/new_phone_number_detection.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index 7003fc449..dbba63f89 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -6,7 +6,7 @@ import phonenumbers -class PhoneDetector(BaseDetector): +class NewPhoneDetector(BaseDetector): """ This method is used to detect phone numbers present in text. The phone detector takes into consideration domestic as well as international phone numbers. 
@@ -27,7 +27,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): country_code(str, optional): country code of the country from which you are using """ self._supported_languages = NumberDetector.get_supported_languages() - super(PhoneDetector, self).__init__(language, country_code) + super(NewPhoneDetector, self).__init__(language, country_code) self.language = language self.entity_name = entity_name self.text = '' @@ -74,14 +74,8 @@ def detect_entity(self, text, **kwargs): self.text = text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(text, self.country_code): - if match.number.national_number[-1:] == 'L': - phone_number = match.number.national_number[:-1] - # [:-1] is to remove the letter 'L' which is coming along with the number in phonenumberslite library. - else: - phone_number = match.number.national_number - # above L bug is not there in normal phonenumbers library. - self.phone.append({"country_calling_code": match.number.country_code, - "phone_number": phone_number}) + self.phone.append({"country_calling_code": str(match.number.country_code), + "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) return self.phone, self.original_phone_text From b1192cd82ab531260ca95eec0e0825d87de0a0ab Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 17 Sep 2019 17:48:25 +0530 Subject: [PATCH 065/237] added phonenumberslite version as 8.10.18 in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b2864fd76..0e474b44a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -phonenumbers +phonenumberslite==8.10.18 six==1.11.0 gunicorn==19.6.0 pytz==2014.2 From ec225effb8b80255253648ee1efaa8917cea220f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 18 Sep 2019 16:37:58 +0530 Subject: [PATCH 066/237] added locale to DateDetector 
class --- ner_v2/detectors/temporal/date/en/date_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 3ce428f67..df8ff7e7d 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -70,7 +70,7 @@ class DateDetector(object): text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: @@ -95,6 +95,7 @@ def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): self.month_dictionary = MONTH_DICT self.day_dictionary = DAY_DICT self.bot_message = None + self.locale = locale def detect_date(self, text): """ From 1d7c4edbac799281dee513769fd8595cedddbac6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 18 Sep 2019 17:15:37 +0530 Subject: [PATCH 067/237] added a print to check --- .../numeral/number_range/standard_number_range_detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 95d96f59a..9c8668f63 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -179,6 +179,7 @@ def detect_number_range(self, text): for detector in self.detector_preferences: number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) + print(self.tagged_text) return number_list, original_list def _get_number_range(self, min_part_match, max_part_match, full_match): 
From 1ce178da1b43bcad6d664f35f036c4f939882430 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 18 Sep 2019 12:13:44 +0000 Subject: [PATCH 068/237] added a print --- .../numeral/number_range/standard_number_range_detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 9c8668f63..e4fd3fc77 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -180,6 +180,7 @@ def detect_number_range(self, text): number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) print(self.tagged_text) + print(self.processed_text) return number_list, original_list def _get_number_range(self, min_part_match, max_part_match, full_match): From 9a04cb0535d4a6e52e028635c0f99c869c7c9adf Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 18 Sep 2019 19:08:04 +0530 Subject: [PATCH 069/237] added absolute value ckeck in number range --- ner_v2/detectors/numeral/constant.py | 1 + .../standard_number_range_detector.py | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/constant.py b/ner_v2/detectors/numeral/constant.py index 1c3b7d3cd..8ae0ccac6 100644 --- a/ner_v2/detectors/numeral/constant.py +++ b/ner_v2/detectors/numeral/constant.py @@ -42,3 +42,4 @@ NUMBER_RANGE_MIN_VALUE = 'min_value' NUMBER_RANGE_MAX_VALUE = 'max_value' NUMBER_RANGE_VALUE_UNIT = 'unit' +NUMBER_RANGE_ABS_VALUE = 'abs_value' diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index e4fd3fc77..e9907cc32 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ 
b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -179,10 +179,31 @@ def detect_number_range(self, text): for detector in self.detector_preferences: number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) - print(self.tagged_text) - print(self.processed_text) + number_list, original_list = self._add_absolute_numbers(number_list, original_list) return number_list, original_list + def _add_absolute_numbers(self, number_list, original_list): + number_abs_list = number_list or [] + original_abs_list = original_list or [] + abs_number_pattern = re.compile(ur'({number}\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), + re.UNICODE) + abs_number_matches = abs_number_pattern.findall(self.processed_text) + for match in abs_number_matches : + if self.unit_type: + if self.number_detected_map[match[0]].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]==self.unit_type: + number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, + numeral_constant.NUMBER_RANGE_MIN_VALUE: None, + numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, + numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match[0]].entity_value}) + original_abs_list.append(self.number_detected_map[match[0]].original_text) + else: + number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, + numeral_constant.NUMBER_RANGE_MIN_VALUE: None, + numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, + numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[ + match[0]].entity_value}) + original_abs_list.append(self.number_detected_map[match[0]].original_text) + return number_abs_list, original_abs_list def _get_number_range(self, min_part_match, max_part_match, full_match): """ Update number_range_list and original_list by finding entity value of number tag and original text from @@ -232,6 +253,7 @@ def _get_number_range(self, min_part_match, max_part_match, 
full_match): number_range = { numeral_constant.NUMBER_RANGE_MIN_VALUE: entity_value_min, numeral_constant.NUMBER_RANGE_MAX_VALUE: entity_value_max, + numeral_constant.NUMBER_RANGE_ABS_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit } return number_range, original_text From 5a98ec467c423ca29daa4a69b86203061556d741 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 18 Sep 2019 13:53:00 +0000 Subject: [PATCH 070/237] modified match --- .../number_range/standard_number_range_detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index e9907cc32..da83695ab 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -190,18 +190,18 @@ def _add_absolute_numbers(self, number_list, original_list): abs_number_matches = abs_number_pattern.findall(self.processed_text) for match in abs_number_matches : if self.unit_type: - if self.number_detected_map[match[0]].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]==self.unit_type: + if self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]==self.unit_type: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, - numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match[0]].entity_value}) - original_abs_list.append(self.number_detected_map[match[0]].original_text) + numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match].entity_value}) + original_abs_list.append(self.number_detected_map[match].original_text) else: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, 
numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[ - match[0]].entity_value}) + match].entity_value}) original_abs_list.append(self.number_detected_map[match[0]].original_text) return number_abs_list, original_abs_list def _get_number_range(self, min_part_match, max_part_match, full_match): From e32cbc9c7d33ec74e1a41eaf8ae07dc124e1adbb Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 13:14:25 +0530 Subject: [PATCH 071/237] added prints to debug --- .../numeral/number_range/standard_number_range_detector.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index da83695ab..d36b6300d 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -179,6 +179,8 @@ def detect_number_range(self, text): for detector in self.detector_preferences: number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) + print(self.processed_text) + print(self.number_detected_map) number_list, original_list = self._add_absolute_numbers(number_list, original_list) return number_list, original_list @@ -189,6 +191,8 @@ def _add_absolute_numbers(self, number_list, original_list): re.UNICODE) abs_number_matches = abs_number_pattern.findall(self.processed_text) for match in abs_number_matches : + print(self.unit_type) + print(self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]) if self.unit_type: if self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]==self.unit_type: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, From aca12b207f794b604d330144febe270b09fa11a6 Mon Sep 17 00:00:00 2001 From: 
ruthvik-17 Date: Thu, 19 Sep 2019 13:20:24 +0530 Subject: [PATCH 072/237] debug _add_absolute_numbers --- .../numeral/number_range/standard_number_range_detector.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index d36b6300d..a5cbeb730 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -179,8 +179,6 @@ def detect_number_range(self, text): for detector in self.detector_preferences: number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) - print(self.processed_text) - print(self.number_detected_map) number_list, original_list = self._add_absolute_numbers(number_list, original_list) return number_list, original_list @@ -191,10 +189,9 @@ def _add_absolute_numbers(self, number_list, original_list): re.UNICODE) abs_number_matches = abs_number_pattern.findall(self.processed_text) for match in abs_number_matches : - print(self.unit_type) - print(self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]) if self.unit_type: - if self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]==self.unit_type: + if \ + self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] is not None: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, From c53cf01ab5c5b13f9aa037d33cc0262f67f20de6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 13:23:11 +0530 Subject: [PATCH 073/237] debug _add_absolute_numbers --- .../numeral/number_range/standard_number_range_detector.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index a5cbeb730..ac29ca9e5 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -195,7 +195,7 @@ def _add_absolute_numbers(self, number_list, original_list): number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, - numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match].entity_value}) + numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match].entity_value['value']}) original_abs_list.append(self.number_detected_map[match].original_text) else: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, From a4ce9cf69d83bd2da1ee15e559386c6ff5d2f239 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 13:46:41 +0530 Subject: [PATCH 074/237] fixed lint errors --- .../number_range/standard_number_range_detector.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index ac29ca9e5..9bfa63678 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -188,14 +188,15 @@ def _add_absolute_numbers(self, number_list, original_list): abs_number_pattern = re.compile(ur'({number}\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), re.UNICODE) abs_number_matches = abs_number_pattern.findall(self.processed_text) - for match in abs_number_matches : + for match in abs_number_matches: if self.unit_type: - if \ - 
self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] is not None: - number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE : None, + if self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] \ + is not None: + number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, - numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[match].entity_value['value']}) + numeral_constant.NUMBER_RANGE_ABS_VALUE: + self.number_detected_map[match].entity_value['value']}) original_abs_list.append(self.number_detected_map[match].original_text) else: number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, From fdfecbac632439b5c60fc2b85029ee9cb2364088 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 14:22:11 +0530 Subject: [PATCH 075/237] fix key_error --- .../numeral/number_range/standard_number_range_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 9bfa63678..3edeb158d 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -204,7 +204,7 @@ def _add_absolute_numbers(self, number_list, original_list): numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[ match].entity_value}) - original_abs_list.append(self.number_detected_map[match[0]].original_text) + original_abs_list.append(self.number_detected_map[match].original_text) return number_abs_list, original_abs_list def _get_number_range(self, min_part_match, max_part_match, full_match): """ From 
6c032117c5d6a41b68642bac7529dad0b0696d9d Mon Sep 17 00:00:00 2001 From: Ruthvik Reddy SL <52532568+ruthvik-17@users.noreply.github.com> Date: Thu, 19 Sep 2019 16:16:28 +0530 Subject: [PATCH 076/237] fixed max, min tests and added abs_val --- .../number_range/number_range_ner_tests.yaml | 1051 +++++++++-------- 1 file changed, 561 insertions(+), 490 deletions(-) diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index a1956e41a..9e6d0260a 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -1,32 +1,35 @@ tests: en: -# - id: en_1 -# message: "I want more than 200 banana" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "more than 200" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_2 -# message: "My monthly salary will be more than 2k per month" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "more than 2k" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_3 -# message: "more than 2.5k people in the stadium" -# outputs: -# - max_value: null -# min_value: 2500 -# original_text: "more than 2.5k" -# output_id: 1 -# unit: null -# unit_type: null + - id: en_1 + message: "I want more than 200 banana" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: en_2 + message: "My monthly salary will be more than 2k per month" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: en_3 + message: "more than 2.5k people in the stadium" + outputs: + - max_value: null + min_value: 2500 + original_text: "more than 2.5k" + output_id: 1 + unit: null + abs_val: null + unit_type: null - id: en_4 message: "more than 200" outputs: @@ -35,6 +38,7 @@ 
tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: en_5 message: "more than 2k" @@ -44,6 +48,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: en_6 message: "more than 2.5k" @@ -53,60 +58,67 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency -# - id: en_7 -# message: "more than 200 rupees" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "more than 200 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_8 -# message: "more than 2k rupees" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "more than 2k rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_9 -# message: "more than 2.5k rupees" -# outputs: -# - max_value: null -# min_value: 2500 -# original_text: "more than 2.5k rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_10 -# message: "200 to 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 to 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_11 -# message: "200 – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: en_12 + - id: en_7 + message: "more than 200 rupees" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_8 + message: "more than 2k rupees" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_9 + message: "more than 2.5k rupees" + outputs: + - max_value: null + min_value: '2500' + original_text: "more than 2.5k rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_10 + message: "200 to 300" + outputs: + - max_value: 
'300' + min_value: '200' + original_text: "200 to 300" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: en_11 + message: "200 – 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 – 300" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: en_12 message: "200-300" outputs: - - max_value: 300 - min_value: 200 + - max_value: '300' + min_value: '200' original_text: "200-300" output_id: 1 unit: null + abs_val: null unit_type: null - id: en_13 message: "200 to 300" @@ -116,6 +128,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: en_14 message: "200 – 300" @@ -125,6 +138,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: en_15 message: "200-300" @@ -134,198 +148,221 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency -# - id: en_16 -# message: "200 to 300 ruppes" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 to 300 ruppes" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_17 -# message: "200 – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_18 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_19 -# message: "200 rupees to 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees to 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_20 -# message: "200 rupees to 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees to 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_21 -# message: "200 rupees – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees 
– 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_22 -# message: "200 rupees – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_23 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: en_24 -# message: "200k-300k men and around 400 women" -# outputs: -# - max_value: 300000 -# min_value: 200000 -# original_text: "200k-300k" -# output_id: 1 -# unit: null -# - max_value: 400 -# min_value: null -# original_text: "around 400" -# output_id: 2 -# unit: null -# unit_type: null -# - id: en_25 -# message: "200k-300k men and around 300k women" -# outputs: -# - max_value: 300000 -# min_value: 200000 -# original_text: "200k-300k" -# output_id: 1 -# unit: null -# - max_value: 300000 -# min_value: null -# original_text: "around 300k" -# output_id: 2 -# unit: null -# unit_type: null -# - id: en_26 -# message: "between 2000 and 3000" -# outputs: -# - max_value: 3000 -# min_value: 2000 -# original_text: "between 2000 and 3000" -# output_id: 1 -# unit: null -# unit_type: null + - id: en_16 + message: "200 to 300 ruppes" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 to 300 ruppes" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_17 + message: "200 – 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 – 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_18 + message: "200-300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_19 + message: "200 rupees to 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees to 300" + output_id: 1 + unit: 
rupees + abs_val: null + unit_type: currency + - id: en_20 + message: "200 rupees to 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees to 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_21 + message: "200 rupees – 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees – 300" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_22 + message: "200 rupees – 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees – 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_23 + message: "200-300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: en_24 + message: "200k-300k men and around 400 women" + outputs: + - max_value: '300000' + min_value: '200000' + original_text: "200k-300k" + output_id: 1 + unit: null + abs_val: null + - max_value: 400 + min_value: null + original_text: "around 400" + output_id: 2 + unit: null + abs_val: null + unit_type: null + - id: en_25 + message: "200k-300k men and around 300k women" + outputs: + - max_value: '300000' + min_value: '200000' + original_text: "200k-300k" + output_id: 1 + unit: null + abs_val: null + - max_value: '300000' + min_value: null + original_text: "around 300k" + output_id: 2 + unit: null + abs_val: null + unit_type: null + - id: en_26 + message: "between 2000 and 3000" + outputs: + - max_value: '3000' + min_value: '2000' + original_text: "between 2000 and 3000" + output_id: 1 + unit: null + abs_val: null + unit_type: null hi: -# - id: hi_1 -# message: "200 se jyada" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "200 se jyada" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_2 -# message: "2k se upar" -# outputs: -# - max_value: null -# min_value: 2000 -# 
original_text: "2k se upar" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_3 -# message: "jada se jada 2500" -# outputs: -# - max_value: 2500 -# min_value: null -# original_text: "jada se jada 2500" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_7 -# message: "200 rupees se jyada" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "200 rupees se jyada" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_8 -# message: "Rupees 2000 se upar" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "Rupees 2000 se upar" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_9 -# message: "jada se jada 2500 rupees" -# outputs: -# - max_value: 2500 -# min_value: null -# original_text: "jada se jada 2500 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_10 -# message: "200 se 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 se 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_11 -# message: "200 – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_12 -# message: "200-300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_1 + message: "200 se jyada" + outputs: + - max_value: null + min_value: '200' + original_text: "200 se jyada" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_2 + message: "2k se upar" + outputs: + - max_value: null + min_value: '2000' + original_text: "2k se upar" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_3 + message: "jada se jada 2500" + outputs: + - max_value: '2500' + min_value: null + original_text: "jada se jada 2500" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_7 + message: "200 rupees se jyada" + outputs: + - max_value: null + 
min_value: '200' + original_text: "200 rupees se jyada" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_8 + message: "Rupees 2000 se upar" + outputs: + - max_value: null + min_value: '2000' + original_text: "Rupees 2000 se upar" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_9 + message: "jada se jada 2500 rupees" + outputs: + - max_value: '2500' + min_value: null + original_text: "jada se jada 2500 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_10 + message: "200 se 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 se 300" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_11 + message: "200 – 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 – 300" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_12 + message: "200-300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200-300" + output_id: 1 + unit: null + abs_val: null + unit_type: null - id: hi_13 message: "200 se 300" outputs: @@ -334,6 +371,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_14 message: "200 – 300" @@ -343,6 +381,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_15 message: "200-300" @@ -352,106 +391,118 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency -# - id: hi_16 -# message: "200 se 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 se 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_17 -# message: "200 – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_18 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: 
"200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_19 -# message: "200 rupees se 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees se 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_20 -# message: "200 rupees se 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees se 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_21 -# message: "200 rupees – 300" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_22 -# message: "200 rupees – 300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200 rupees – 300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_23 -# message: "200-300 rupees" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "200-300 rupees" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_24 -# message: "२०० से ज्यादा" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "२०० से ज्यादा" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_25 -# message: "२ हजार से ऊपर" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: "२ हजार से ऊपर" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_26 -# message: "ज्यादा से ज्यादा २ हजार" -# outputs: -# - max_value: 2000 -# min_value: null -# original_text: "ज्यादा से ज्यादा २ हजार" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_16 + message: "200 se 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 se 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_17 + message: "200 – 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 – 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - 
id: hi_18 + message: "200-300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_19 + message: "200 rupees se 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees se 300" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_20 + message: "200 rupees se 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees se 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_21 + message: "200 rupees – 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees – 300" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_22 + message: "200 rupees – 300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 rupees – 300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_23 + message: "200-300 rupees" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200-300 rupees" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_24 + message: "२०० से ज्यादा" + outputs: + - max_value: null + min_value: '200' + original_text: "२०० से ज्यादा" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_25 + message: "२ हजार से ऊपर" + outputs: + - max_value: null + min_value: '2000' + original_text: "२ हजार से ऊपर" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_26 + message: "ज्यादा से ज्यादा २ हजार" + outputs: + - max_value: '2000' + min_value: null + original_text: "ज्यादा से ज्यादा २ हजार" + output_id: 1 + unit: null + abs_val: null + unit_type: null - id: hi_27 message: "२०० से ज्यादा" outputs: @@ -460,6 +511,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_28 message: "२ हजार से ऊपर" @@ -469,6 
+521,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_29 message: "ज्यादा से ज्यादा ५ हजार" @@ -478,61 +531,68 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency -# - id: hi_30 -# message: "२०० रूपीस से ज्यादा" -# outputs: -# - max_value: null -# min_value: 200 -# original_text: "२०० रूपीस से ज्यादा" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_31 -# message: "रूपीस २ हजार से ऊपर" -# outputs: -# - max_value: null -# min_value: 2000 -# original_text: रूपीस २ हजार से ऊपर -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_32 -# message: "ज्यादा से ज्यादा ५ हजार रुपया" -# outputs: -# - max_value: 5000 -# min_value: null -# original_text: ज्यादा से ज्यादा ५ हजार रुपया -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_33 -# message: "२०० से ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: २०० से ३०० -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_34 -# message: "२०० – ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० – ३००" -# output_id: 1 -# unit: null -# unit_type: null -# - id: hi_35 -# message: "२००-३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३००" -# output_id: 1 -# unit: null -# unit_type: null + - id: hi_30 + message: "२०० रूपीस से ज्यादा" + outputs: + - max_value: null + min_value: '200' + original_text: "२०० रूपीस से ज्यादा" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_31 + message: "रूपीस २ हजार से ऊपर" + outputs: + - max_value: null + min_value: '2000' + original_text: रूपीस २ हजार से ऊपर + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_32 + message: "ज्यादा से ज्यादा ५ हजार रुपया" + outputs: + - max_value: '5000' + min_value: null + original_text: ज्यादा से ज्यादा ५ हजार रुपया + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_33 
+ message: "२०० से ३००" + outputs: + - max_value: '300' + min_value: '200' + original_text: २०० से ३०० + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_34 + message: "२०० – ३००" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० – ३००" + output_id: 1 + unit: null + abs_val: null + unit_type: null + - id: hi_35 + message: "२००-३००" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२००-३००" + output_id: 1 + unit: null + abs_val: null + unit_type: null - id: hi_36 message: "२०० से ३००" outputs: @@ -541,6 +601,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_37 message: "२०० – ३००" @@ -550,6 +611,7 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency - id: hi_38 message: "२००-३००" @@ -559,76 +621,85 @@ tests: original_text: null output_id: 1 unit: null + abs_val: null unit_type: currency -# - id: hi_39 -# message: "२०० से ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० से ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_40 -# message: "२००-३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_41 -# message: "२०० रुपया से ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया से ३००" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_42 -# message: "२०० रुपया से ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया से ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_43 -# message: "२०० - ३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० - ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_44 -# message: "२०० रुपया - ३००" -# outputs: -# - max_value: 300 -# min_value: 200 -# 
original_text: "२०० रुपया - ३००" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_45 -# message: "२०० रुपया - ३०० रुपया " -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२०० रुपया - ३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency -# - id: hi_46 -# message: "२००-३०० रुपया" -# outputs: -# - max_value: 300 -# min_value: 200 -# original_text: "२००-३०० रुपया" -# output_id: 1 -# unit: rupees -# unit_type: currency + - id: hi_39 + message: "२०० से ३०० रुपया" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० से ३०० रुपया" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_40 + message: "२००-३०० रुपया" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२००-३०० रुपया" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_41 + message: "२०० रुपया से ३००" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० रुपया से ३००" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_42 + message: "२०० रुपया से ३०० रुपया" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० रुपया से ३०० रुपया" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_43 + message: "२०० - ३०० रुपया" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० - ३०० रुपया" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_44 + message: "२०० रुपया - ३००" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० रुपया - ३००" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_45 + message: "२०० रुपया - ३०० रुपया " + outputs: + - max_value: '300' + min_value: '200' + original_text: "२०० रुपया - ३०० रुपया" + output_id: 1 + unit: rupees + abs_val: null + unit_type: currency + - id: hi_46 + message: "२००-३०० रुपया" + outputs: + - max_value: '300' + min_value: '200' + original_text: "२००-३०० रुपया" 
+ output_id: 1 + unit: rupees + abs_val: null + unit_type: currency From 29e8437690b46dafcec1e50e52efc20bbe008457 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 21:49:24 +0530 Subject: [PATCH 077/237] fix number_range YAML file --- .../number_range/number_range_ner_tests.yaml | 1071 ++++++++--------- 1 file changed, 512 insertions(+), 559 deletions(-) diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index 9e6d0260a..72b1c3841 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -1,35 +1,35 @@ tests: en: - - id: en_1 - message: "I want more than 200 banana" - outputs: - - max_value: null - min_value: '200' - original_text: "more than 200" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: en_2 - message: "My monthly salary will be more than 2k per month" - outputs: - - max_value: null - min_value: '2000' - original_text: "more than 2k" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: en_3 - message: "more than 2.5k people in the stadium" - outputs: - - max_value: null - min_value: 2500 - original_text: "more than 2.5k" - output_id: 1 - unit: null - abs_val: null - unit_type: null + - id: en_1 + message: "I want more than 200 banana" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_2 + message: "My monthly salary will be more than 2k per month" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_3 + message: "more than 2.5k people in the stadium" + outputs: + - max_value: null + min_value: '2500' + original_text: "more than 2.5k" + output_id: 1 + unit: null + abs_value: null + unit_type: null - id: en_4 message: "more 
than 200" outputs: @@ -38,7 +38,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: en_5 message: "more than 2k" @@ -48,7 +48,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: en_6 message: "more than 2.5k" @@ -58,59 +58,59 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - - id: en_7 - message: "more than 200 rupees" - outputs: - - max_value: null - min_value: '200' - original_text: "more than 200 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_8 - message: "more than 2k rupees" - outputs: - - max_value: null - min_value: '2000' - original_text: "more than 2k rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_9 - message: "more than 2.5k rupees" - outputs: - - max_value: null - min_value: '2500' - original_text: "more than 2.5k rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_10 - message: "200 to 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 to 300" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: en_11 - message: "200 – 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 – 300" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: en_12 + - id: en_7 + message: "more than 200 rupees" + outputs: + - max_value: null + min_value: '200' + original_text: "more than 200 rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_8 + message: "more than 2k rupees" + outputs: + - max_value: null + min_value: '2000' + original_text: "more than 2k rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_9 + message: "more than 2.5k rupees" + outputs: + - max_value: null + min_value: '2500' + original_text: "more than 2.5k 
rupees" + output_id: 1 + unit: rupees + abs_value: null + unit_type: currency + - id: en_10 + message: "200 to 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 to 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_11 + message: "200 – 300" + outputs: + - max_value: '300' + min_value: '200' + original_text: "200 – 300" + output_id: 1 + unit: null + abs_value: null + unit_type: null + - id: en_12 message: "200-300" outputs: - max_value: '300' @@ -118,7 +118,7 @@ tests: original_text: "200-300" output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: null - id: en_13 message: "200 to 300" @@ -128,7 +128,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: en_14 message: "200 – 300" @@ -138,7 +138,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: en_15 message: "200-300" @@ -148,221 +148,199 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - - id: en_16 - message: "200 to 300 ruppes" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 to 300 ruppes" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_17 - message: "200 – 300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 – 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_18 - message: "200-300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200-300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_19 - message: "200 rupees to 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees to 300" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_20 - message: "200 rupees to 300 rupees" - outputs: - - max_value: '300' - min_value: 
'200' - original_text: "200 rupees to 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_21 - message: "200 rupees – 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees – 300" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_22 - message: "200 rupees – 300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees – 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_23 - message: "200-300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200-300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: en_24 - message: "200k-300k men and around 400 women" - outputs: - - max_value: '300000' - min_value: '200000' - original_text: "200k-300k" - output_id: 1 - unit: null - abs_val: null - - max_value: 400 - min_value: null - original_text: "around 400" - output_id: 2 - unit: null - abs_val: null - unit_type: null - - id: en_25 - message: "200k-300k men and around 300k women" - outputs: - - max_value: '300000' - min_value: '200000' - original_text: "200k-300k" - output_id: 1 - unit: null - abs_val: null - - max_value: '300000' - min_value: null - original_text: "around 300k" - output_id: 2 - unit: null - abs_val: null - unit_type: null - - id: en_26 - message: "between 2000 and 3000" - outputs: - - max_value: '3000' - min_value: '2000' - original_text: "between 2000 and 3000" - output_id: 1 - unit: null - abs_val: null - unit_type: null +# - id: en_16 +# message: "200 to 300 ruppes" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 to 300 ruppes" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_17 +# message: "200 – 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 – 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_18 +# message: "200-300 
rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200-300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_19 +# message: "200 rupees to 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees to 300" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_20 +# message: "200 rupees to 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees to 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_21 +# message: "200 rupees – 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees – 300" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_22 +# message: "200 rupees – 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees – 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_23 +# message: "200-300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200-300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: en_24 +# message: "200k-300k men and around 400 women" +# outputs: +# - max_value: 300000 +# min_value: 200000 +# original_text: "200k-300k" +# output_id: 1 +# unit: null +# - max_value: 400 +# min_value: null +# original_text: "around 400" +# output_id: 2 +# unit: null +# unit_type: null +# - id: en_25 +# message: "200k-300k men and around 300k women" +# outputs: +# - max_value: 300000 +# min_value: 200000 +# original_text: "200k-300k" +# output_id: 1 +# unit: null +# - max_value: 300000 +# min_value: null +# original_text: "around 300k" +# output_id: 2 +# unit: null +# unit_type: null +# - id: en_26 +# message: "between 2000 and 3000" +# outputs: +# - max_value: 3000 +# min_value: 2000 +# original_text: "between 2000 and 3000" +# output_id: 1 +# unit: null +# unit_type: null hi: - - id: hi_1 - message: "200 se jyada" - outputs: - - max_value: null - min_value: 
'200' - original_text: "200 se jyada" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_2 - message: "2k se upar" - outputs: - - max_value: null - min_value: '2000' - original_text: "2k se upar" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_3 - message: "jada se jada 2500" - outputs: - - max_value: '2500' - min_value: null - original_text: "jada se jada 2500" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_7 - message: "200 rupees se jyada" - outputs: - - max_value: null - min_value: '200' - original_text: "200 rupees se jyada" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_8 - message: "Rupees 2000 se upar" - outputs: - - max_value: null - min_value: '2000' - original_text: "Rupees 2000 se upar" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_9 - message: "jada se jada 2500 rupees" - outputs: - - max_value: '2500' - min_value: null - original_text: "jada se jada 2500 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_10 - message: "200 se 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 se 300" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_11 - message: "200 – 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 – 300" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_12 - message: "200-300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200-300" - output_id: 1 - unit: null - abs_val: null - unit_type: null +# - id: hi_1 +# message: "200 se jyada" +# outputs: +# - max_value: null +# min_value: 200 +# original_text: "200 se jyada" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_2 +# message: "2k se upar" +# outputs: +# - max_value: null +# min_value: 2000 +# original_text: "2k se upar" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_3 +# 
message: "jada se jada 2500" +# outputs: +# - max_value: 2500 +# min_value: null +# original_text: "jada se jada 2500" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_7 +# message: "200 rupees se jyada" +# outputs: +# - max_value: null +# min_value: 200 +# original_text: "200 rupees se jyada" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_8 +# message: "Rupees 2000 se upar" +# outputs: +# - max_value: null +# min_value: 2000 +# original_text: "Rupees 2000 se upar" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_9 +# message: "jada se jada 2500 rupees" +# outputs: +# - max_value: 2500 +# min_value: null +# original_text: "jada se jada 2500 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_10 +# message: "200 se 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 se 300" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_11 +# message: "200 – 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 – 300" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_12 +# message: "200-300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200-300" +# output_id: 1 +# unit: null +# unit_type: null - id: hi_13 message: "200 se 300" outputs: @@ -371,7 +349,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_14 message: "200 – 300" @@ -381,7 +359,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_15 message: "200-300" @@ -391,118 +369,107 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - - id: hi_16 - message: "200 se 300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 se 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_17 - message: "200 – 300 rupees" - 
outputs: - - max_value: '300' - min_value: '200' - original_text: "200 – 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_18 - message: "200-300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200-300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_19 - message: "200 rupees se 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees se 300" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_20 - message: "200 rupees se 300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees se 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_21 - message: "200 rupees – 300" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees – 300" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_22 - message: "200 rupees – 300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200 rupees – 300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_23 - message: "200-300 rupees" - outputs: - - max_value: '300' - min_value: '200' - original_text: "200-300 rupees" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_24 - message: "२०० से ज्यादा" - outputs: - - max_value: null - min_value: '200' - original_text: "२०० से ज्यादा" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_25 - message: "२ हजार से ऊपर" - outputs: - - max_value: null - min_value: '2000' - original_text: "२ हजार से ऊपर" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_26 - message: "ज्यादा से ज्यादा २ हजार" - outputs: - - max_value: '2000' - min_value: null - original_text: "ज्यादा से ज्यादा २ हजार" - output_id: 1 - unit: null - abs_val: null - unit_type: null +# - id: hi_16 +# message: "200 se 300 rupees" 
+# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 se 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_17 +# message: "200 – 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 – 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_18 +# message: "200-300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200-300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_19 +# message: "200 rupees se 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees se 300" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_20 +# message: "200 rupees se 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees se 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_21 +# message: "200 rupees – 300" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees – 300" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_22 +# message: "200 rupees – 300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200 rupees – 300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_23 +# message: "200-300 rupees" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "200-300 rupees" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_24 +# message: "२०० से ज्यादा" +# outputs: +# - max_value: null +# min_value: 200 +# original_text: "२०० से ज्यादा" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_25 +# message: "२ हजार से ऊपर" +# outputs: +# - max_value: null +# min_value: 2000 +# original_text: "२ हजार से ऊपर" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_26 +# message: "ज्यादा से ज्यादा २ हजार" +# outputs: +# - max_value: 2000 +# min_value: null +# original_text: "ज्यादा से ज्यादा २ हजार" +# 
output_id: 1 +# unit: null +# unit_type: null - id: hi_27 message: "२०० से ज्यादा" outputs: @@ -511,7 +478,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_28 message: "२ हजार से ऊपर" @@ -521,7 +488,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_29 message: "ज्यादा से ज्यादा ५ हजार" @@ -531,68 +498,62 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - - id: hi_30 - message: "२०० रूपीस से ज्यादा" - outputs: - - max_value: null - min_value: '200' - original_text: "२०० रूपीस से ज्यादा" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_31 - message: "रूपीस २ हजार से ऊपर" - outputs: - - max_value: null - min_value: '2000' - original_text: रूपीस २ हजार से ऊपर - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_32 - message: "ज्यादा से ज्यादा ५ हजार रुपया" - outputs: - - max_value: '5000' - min_value: null - original_text: ज्यादा से ज्यादा ५ हजार रुपया - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_33 - message: "२०० से ३००" - outputs: - - max_value: '300' - min_value: '200' - original_text: २०० से ३०० - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_34 - message: "२०० – ३००" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० – ३००" - output_id: 1 - unit: null - abs_val: null - unit_type: null - - id: hi_35 - message: "२००-३००" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२००-३००" - output_id: 1 - unit: null - abs_val: null - unit_type: null +# - id: hi_30 +# message: "२०० रूपीस से ज्यादा" +# outputs: +# - max_value: null +# min_value: 200 +# original_text: "२०० रूपीस से ज्यादा" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_31 +# message: "रूपीस २ हजार से ऊपर" +# outputs: +# - max_value: null +# 
min_value: 2000 +# original_text: रूपीस २ हजार से ऊपर +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_32 +# message: "ज्यादा से ज्यादा ५ हजार रुपया" +# outputs: +# - max_value: 5000 +# min_value: null +# original_text: ज्यादा से ज्यादा ५ हजार रुपया +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_33 +# message: "२०० से ३००" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: २०० से ३०० +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_34 +# message: "२०० – ३००" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० – ३००" +# output_id: 1 +# unit: null +# unit_type: null +# - id: hi_35 +# message: "२००-३००" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२००-३००" +# output_id: 1 +# unit: null +# unit_type: null - id: hi_36 message: "२०० से ३००" outputs: @@ -601,7 +562,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_37 message: "२०० – ३००" @@ -611,7 +572,7 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - id: hi_38 message: "२००-३००" @@ -621,85 +582,77 @@ tests: original_text: null output_id: 1 unit: null - abs_val: null + abs_value: null unit_type: currency - - id: hi_39 - message: "२०० से ३०० रुपया" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० से ३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_40 - message: "२००-३०० रुपया" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२००-३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_41 - message: "२०० रुपया से ३००" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० रुपया से ३००" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_42 - message: "२०० रुपया से ३०० रुपया" - outputs: - - max_value: '300' - min_value: '200' 
- original_text: "२०० रुपया से ३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_43 - message: "२०० - ३०० रुपया" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० - ३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_44 - message: "२०० रुपया - ३००" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० रुपया - ३००" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_45 - message: "२०० रुपया - ३०० रुपया " - outputs: - - max_value: '300' - min_value: '200' - original_text: "२०० रुपया - ३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency - - id: hi_46 - message: "२००-३०० रुपया" - outputs: - - max_value: '300' - min_value: '200' - original_text: "२००-३०० रुपया" - output_id: 1 - unit: rupees - abs_val: null - unit_type: currency +# - id: hi_39 +# message: "२०० से ३०० रुपया" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० से ३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_40 +# message: "२००-३०० रुपया" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२००-३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_41 +# message: "२०० रुपया से ३००" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० रुपया से ३००" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_42 +# message: "२०० रुपया से ३०० रुपया" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० रुपया से ३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_43 +# message: "२०० - ३०० रुपया" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० - ३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_44 +# message: "२०० रुपया - ३००" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० रुपया - ३००" +# output_id: 1 
+# unit: rupees +# unit_type: currency +# - id: hi_45 +# message: "२०० रुपया - ३०० रुपया " +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२०० रुपया - ३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency +# - id: hi_46 +# message: "२००-३०० रुपया" +# outputs: +# - max_value: 300 +# min_value: 200 +# original_text: "२००-३०० रुपया" +# output_id: 1 +# unit: rupees +# unit_type: currency From 6b793636f95e460c3ba3fafaa2579ec801190184 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 21:58:22 +0530 Subject: [PATCH 078/237] added abs_value in test_number_range_detection.py --- ner_v2/tests/numeral/number_range/test_number_range_detection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/tests/numeral/number_range/test_number_range_detection.py b/ner_v2/tests/numeral/number_range/test_number_range_detection.py index e9af7f3a5..f2470c5c5 100644 --- a/ner_v2/tests/numeral/number_range/test_number_range_detection.py +++ b/ner_v2/tests/numeral/number_range/test_number_range_detection.py @@ -40,6 +40,7 @@ def parse_expected_outputs(expected_outputs): "min_value": str(expected_output["min_value"]) if expected_output["min_value"] else None, "unit": str(expected_output["unit"]) if expected_output["unit"] else None, "max_value": str(expected_output["max_value"]) if expected_output["max_value"] else None, + "abs_value": str(expected_output["abs_value"]) if expected_output["abs_value"] else None } original_text = \ expected_output["original_text"].lower().strip() if expected_output["original_text"] else None From 3c9889b9eb6eb18e6dbbb4ca12d592955997f796 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 22:04:48 +0530 Subject: [PATCH 079/237] fix tests in number_range --- ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml 
b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index 72b1c3841..0d4d0b077 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -101,7 +101,7 @@ tests: abs_value: null unit_type: null - id: en_11 - message: "200 – 300" + message: "200 - 300" outputs: - max_value: '300' min_value: '200' From 98ac7469ba07a8ed5dc9dedb4f92df333c045177 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 19 Sep 2019 22:11:11 +0530 Subject: [PATCH 080/237] fix tests in number_range --- ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml index 0d4d0b077..5d7058747 100644 --- a/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml +++ b/ner_v2/tests/numeral/number_range/number_range_ner_tests.yaml @@ -105,7 +105,7 @@ tests: outputs: - max_value: '300' min_value: '200' - original_text: "200 – 300" + original_text: "200 - 300" output_id: 1 unit: null abs_value: null From 42bc4b54ad7fe54ab967848578cb91cf978fe406 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Fri, 20 Sep 2019 11:01:05 +0530 Subject: [PATCH 081/237] fix lint --- .../numeral/number_range/standard_number_range_detector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 3edeb158d..0e5aaf0a4 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -411,7 +411,6 @@ def _detect_min_max_num_range(self, number_range_list=None, original_list=None): def _update_tagged_text(self, original_number_list): """ Replaces detected date with tag generated from 
entity_name used to initialize the object with - A final string with all dates replaced will be stored in object's tagged_text attribute A string with all dates removed will be stored in object's processed_text attribute From d25c5f786a581a167ce6eb7a6452e208af703408 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 20 Sep 2019 19:11:23 +0530 Subject: [PATCH 082/237] print to test --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index be8df7d40..cae506e69 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -202,6 +202,8 @@ def detect_time(self, text, range_enabled=False, form_check=False, **kwargs): time_data = self._detect_time(range_enabled=range_enabled, form_check=form_check) self.time = time_data[0] self.original_time_text = time_data[1] + print('processed_text', self.processed_text) + print('tagged_text', self.tagged_text) return time_data def _detect_range_12_hour_format(self, time_list=None, original_list=None): From 9f837fe61be846942fe4322377ff911993b3f03c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 23 Sep 2019 16:24:56 +0530 Subject: [PATCH 083/237] add-word-to-number-in-phone-number --- .../new_phone_number_detection.py | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index dbba63f89..01c69844d 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -1,10 +1,14 @@ # -*- coding: utf-8 -*- from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector +from 
ner_v2.detectors.numeral.utils import get_number_from_number_word from language_utilities.constant import ENGLISH_LANG +import collections import re import phonenumbers +NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) + class NewPhoneDetector(BaseDetector): """ @@ -34,6 +38,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): self.phone = [] self.original_phone_text = [] self.country_code = country_code.upper() + self.number_word_dict = self.create_number_word_dict() @property def supported_languages(self): @@ -44,6 +49,45 @@ def supported_languages(self): """ return self._supported_languages + def create_number_word_dict(self): + number_word_dictionary = {'one': NumberVariant(scale=1, increment=1), + 'two': NumberVariant(scale=1, increment=2), + 'three': NumberVariant(scale=1, increment=3), + 'four': NumberVariant(scale=1, increment=4), + 'five': NumberVariant(scale=1, increment=5), + 'six': NumberVariant(scale=1, increment=6), + 'seven': NumberVariant(scale=1, increment=7), + 'eight': NumberVariant(scale=1, increment=8), + 'nine': NumberVariant(scale=1, increment=9), + 'zero': NumberVariant(scale=1, increment=0)} + return number_word_dictionary + + def convert_words_to_numbers(self, text): + """ + :param text: user message + :return: converted user message with words replaced with numbers + """ + numbers_dict = get_number_from_number_word(text, self.number_word_dict) + val = numbers_dict[0] + word = numbers_dict[1] + converted_sentence = text + x = zip(val, word) + unique_x = [] + for i in x: + if i not in unique_x: + unique_x.append(i) + + for j in unique_x: + pattern = re.compile(j[1], re.U) + converted_sentence = pattern.sub(string=converted_sentence, repl=str(j[0])) + + while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): + converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) + + converted_sentence = re.sub(r'(double)(\s+)(\d)', r'\3\3', converted_sentence) + 
converted_sentence = re.sub(r'(triple)(\s+)(\d)', r'\3\3\3', converted_sentence) + return converted_sentence + def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -71,9 +115,9 @@ def detect_entity(self, text, **kwargs): """ - self.text = text + self.text = self.convert_words_to_numbers(text) self.phone, self.original_phone_text = [], [] - for match in phonenumbers.PhoneNumberMatcher(text, self.country_code): + for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) From 5f470f0bccc5279a1470748d6309a3c115bce1e7 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 23 Sep 2019 17:01:53 +0530 Subject: [PATCH 084/237] add-word-to-number-in-phone-number --- .../pattern/phone_number/new_phone_number_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index 01c69844d..ef50f7bd5 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -81,11 +81,11 @@ def convert_words_to_numbers(self, text): pattern = re.compile(j[1], re.U) converted_sentence = pattern.sub(string=converted_sentence, repl=str(j[0])) - while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): - converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) - converted_sentence = re.sub(r'(double)(\s+)(\d)', r'\3\3', converted_sentence) converted_sentence = re.sub(r'(triple)(\s+)(\d)', r'\3\3\3', converted_sentence) + + while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): + converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) return converted_sentence def 
detect_entity(self, text, **kwargs): From e64f01ade7d86638e4e1ac14509b9685f1fe9d0f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 23 Sep 2019 17:12:42 +0530 Subject: [PATCH 085/237] add-word-to-number-in-phone-number --- .../new_phone_number_detection.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py index ef50f7bd5..88b576afa 100644 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py @@ -10,6 +10,20 @@ NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) +def create_number_word_dict(): + number_word_dictionary = {'one': NumberVariant(scale=1, increment=1), + 'two': NumberVariant(scale=1, increment=2), + 'three': NumberVariant(scale=1, increment=3), + 'four': NumberVariant(scale=1, increment=4), + 'five': NumberVariant(scale=1, increment=5), + 'six': NumberVariant(scale=1, increment=6), + 'seven': NumberVariant(scale=1, increment=7), + 'eight': NumberVariant(scale=1, increment=8), + 'nine': NumberVariant(scale=1, increment=9), + 'zero': NumberVariant(scale=1, increment=0)} + return number_word_dictionary + + class NewPhoneDetector(BaseDetector): """ This method is used to detect phone numbers present in text. 
The phone detector takes into @@ -38,7 +52,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): self.phone = [] self.original_phone_text = [] self.country_code = country_code.upper() - self.number_word_dict = self.create_number_word_dict() + self.number_word_dict = create_number_word_dict() @property def supported_languages(self): @@ -49,19 +63,6 @@ def supported_languages(self): """ return self._supported_languages - def create_number_word_dict(self): - number_word_dictionary = {'one': NumberVariant(scale=1, increment=1), - 'two': NumberVariant(scale=1, increment=2), - 'three': NumberVariant(scale=1, increment=3), - 'four': NumberVariant(scale=1, increment=4), - 'five': NumberVariant(scale=1, increment=5), - 'six': NumberVariant(scale=1, increment=6), - 'seven': NumberVariant(scale=1, increment=7), - 'eight': NumberVariant(scale=1, increment=8), - 'nine': NumberVariant(scale=1, increment=9), - 'zero': NumberVariant(scale=1, increment=0)} - return number_word_dictionary - def convert_words_to_numbers(self, text): """ :param text: user message @@ -115,7 +116,10 @@ def detect_entity(self, text, **kwargs): """ - self.text = self.convert_words_to_numbers(text) + if self.language == 'en': + self.text = self.convert_words_to_numbers(text) + else: + self.text = text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): self.phone.append({"country_calling_code": str(match.number.country_code), From c830729df0ab72549f63c305d2fe7956310271a3 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 23 Sep 2019 17:22:07 +0530 Subject: [PATCH 086/237] convert new_phone_number_detection to phone_number_detection --- .../new_phone_number_detection.py | 129 ----------- .../phone_number/phone_number_detection.py | 205 +++++------------- 2 files changed, 60 insertions(+), 274 deletions(-) delete mode 100644 ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py diff --git 
a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py deleted file mode 100644 index 88b576afa..000000000 --- a/ner_v2/detectors/pattern/phone_number/new_phone_number_detection.py +++ /dev/null @@ -1,129 +0,0 @@ -# -*- coding: utf-8 -*- -from ner_v2.detectors.base_detector import BaseDetector -from ner_v2.detectors.numeral.number.number_detection import NumberDetector -from ner_v2.detectors.numeral.utils import get_number_from_number_word -from language_utilities.constant import ENGLISH_LANG -import collections -import re -import phonenumbers - -NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) - - -def create_number_word_dict(): - number_word_dictionary = {'one': NumberVariant(scale=1, increment=1), - 'two': NumberVariant(scale=1, increment=2), - 'three': NumberVariant(scale=1, increment=3), - 'four': NumberVariant(scale=1, increment=4), - 'five': NumberVariant(scale=1, increment=5), - 'six': NumberVariant(scale=1, increment=6), - 'seven': NumberVariant(scale=1, increment=7), - 'eight': NumberVariant(scale=1, increment=8), - 'nine': NumberVariant(scale=1, increment=9), - 'zero': NumberVariant(scale=1, increment=0)} - return number_word_dictionary - - -class NewPhoneDetector(BaseDetector): - """ - This method is used to detect phone numbers present in text. The phone detector takes into - consideration domestic as well as international phone numbers. - - Attributes: - text(str): string provided to extract phone numbers detection - phone (list): list of detected entity values - original_phone_text (list): list to store substrings of the text detected as phone numbers - """ - - def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): - # Todo: Change default from india to get it from the bot. 
- """ - Args: - entity_name (str): A string by which the detected numbers would be replaced with - on calling detect_entity() - language (str, optional): language code of number text, defaults to 'en' - country_code(str, optional): country code of the country from which you are using - """ - self._supported_languages = NumberDetector.get_supported_languages() - super(NewPhoneDetector, self).__init__(language, country_code) - self.language = language - self.entity_name = entity_name - self.text = '' - self.phone = [] - self.original_phone_text = [] - self.country_code = country_code.upper() - self.number_word_dict = create_number_word_dict() - - @property - def supported_languages(self): - """ - This method returns the list of languages supported by entity detectors - Return: - list: List of ISO 639 codes of languages supported by subclass/detector - """ - return self._supported_languages - - def convert_words_to_numbers(self, text): - """ - :param text: user message - :return: converted user message with words replaced with numbers - """ - numbers_dict = get_number_from_number_word(text, self.number_word_dict) - val = numbers_dict[0] - word = numbers_dict[1] - converted_sentence = text - x = zip(val, word) - unique_x = [] - for i in x: - if i not in unique_x: - unique_x.append(i) - - for j in unique_x: - pattern = re.compile(j[1], re.U) - converted_sentence = pattern.sub(string=converted_sentence, repl=str(j[0])) - - converted_sentence = re.sub(r'(double)(\s+)(\d)', r'\3\3', converted_sentence) - converted_sentence = re.sub(r'(triple)(\s+)(\d)', r'\3\3\3', converted_sentence) - - while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): - converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) - return converted_sentence - - def detect_entity(self, text, **kwargs): - """Detects phone numbers in the text string - - Args: - text: string to extract entities from - **kwargs: it can be used to send specific arguments in future. 
- - Returns: - - self.phone (list): list consisting the detected phone numbers - self.original_phone_text (list): list containing their corresponding substrings in the original message. - - Examples: - - text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' - - p = PhoneDetector(entity_name='phone_number', language='en') - p.detect_entity(text=text) - (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) - - text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' - p = PhoneDetector(entity_name='phone_number', language='hi') - p.detect_entity(text=text) - (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) - - """ - - if self.language == 'en': - self.text = self.convert_words_to_numbers(text) - else: - self.text = text - self.phone, self.original_phone_text = [], [] - for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): - self.phone.append({"country_calling_code": str(match.number.country_code), - "phone_number": str(match.number.national_number)}) - self.original_phone_text.append(self.text[match.start:match.end]) - - return self.phone, self.original_phone_text diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 30ecea4d7..fc1490782 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -1,8 +1,27 @@ # -*- coding: utf-8 -*- from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector +from ner_v2.detectors.numeral.utils import get_number_from_number_word from language_utilities.constant import ENGLISH_LANG +import collections import re +import phonenumbers + +NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) + + +def create_number_word_dict(): + number_word_dictionary = {'one': 
NumberVariant(scale=1, increment=1), + 'two': NumberVariant(scale=1, increment=2), + 'three': NumberVariant(scale=1, increment=3), + 'four': NumberVariant(scale=1, increment=4), + 'five': NumberVariant(scale=1, increment=5), + 'six': NumberVariant(scale=1, increment=6), + 'seven': NumberVariant(scale=1, increment=7), + 'eight': NumberVariant(scale=1, increment=8), + 'nine': NumberVariant(scale=1, increment=9), + 'zero': NumberVariant(scale=1, increment=0)} + return number_word_dictionary class PhoneDetector(BaseDetector): @@ -12,29 +31,28 @@ class PhoneDetector(BaseDetector): Attributes: text(str): string provided to extract phone numbers detection - tagged_text (str): string in which the detected phone numbers are replaced by ____ - processed_text (str): string in which the detected phone numbers are removed phone (list): list of detected entity values original_phone_text (list): list to store substrings of the text detected as phone numbers - tag (str): entity_name prepended and appended with '__' """ - def __init__(self, entity_name, language=ENGLISH_LANG): + + def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): + # Todo: Change default from india to get it from the bot. 
""" Args: entity_name (str): A string by which the detected numbers would be replaced with on calling detect_entity() language (str, optional): language code of number text, defaults to 'en' + country_code(str, optional): country code of the country from which you are using """ self._supported_languages = NumberDetector.get_supported_languages() - super(PhoneDetector, self).__init__(language) + super(PhoneDetector, self).__init__(language, country_code) self.language = language self.entity_name = entity_name self.text = '' - self.tagged_text = '' - self.processed_text = '' self.phone = [] self.original_phone_text = [] - self.tag = '__' + self.entity_name + '__' + self.country_code = country_code.upper() + self.number_word_dict = create_number_word_dict() @property def supported_languages(self): @@ -45,6 +63,32 @@ def supported_languages(self): """ return self._supported_languages + def convert_words_to_numbers(self, text): + """ + :param text: user message + :return: converted user message with words replaced with numbers + """ + numbers_dict = get_number_from_number_word(text, self.number_word_dict) + val = numbers_dict[0] + word = numbers_dict[1] + converted_sentence = text + x = zip(val, word) + unique_x = [] + for i in x: + if i not in unique_x: + unique_x.append(i) + + for j in unique_x: + pattern = re.compile(j[1], re.U) + converted_sentence = pattern.sub(string=converted_sentence, repl=str(j[0])) + + converted_sentence = re.sub(r'(double)(\s+)(\d)', r'\3\3', converted_sentence) + converted_sentence = re.sub(r'(triple)(\s+)(\d)', r'\3\3\3', converted_sentence) + + while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): + converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) + return converted_sentence + def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -72,143 +116,14 @@ def detect_entity(self, text, **kwargs): """ - self.text = text - self.processed_text = self.text - self.tagged_text = self.text 
- - phone_number_original_list = self.get_number_regex() - - original_phone_texts = [p[0].strip() for p in phone_number_original_list] - original_phone_text = self.check_length(original_phone_texts=original_phone_texts) - clean_phone_list = [self.clean_phone_number(p) for p in original_phone_text] - phone = [self.get_number(phone) for phone in clean_phone_list] - + if self.language == 'en': + self.text = self.convert_words_to_numbers(text) + else: + self.text = text self.phone, self.original_phone_text = [], [] - - for phone_number, original_phone_number in zip(phone, original_phone_text): - if len(phone_number) >= 10: - self.phone.append(phone_number) - self.original_phone_text.append(original_phone_number) - self.get_tagged_text() + for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): + self.phone.append({"country_calling_code": str(match.number.country_code), + "phone_number": str(match.number.national_number)}) + self.original_phone_text.append(self.text[match.start:match.end]) return self.phone, self.original_phone_text - - def get_digit_length(self, text): - return len(re.findall(pattern='\d', string=text, flags=re.U)) - - def check_length(self, original_phone_texts): - """ - This method is used to handle the corner case where consecutive numbers are present with - space within them. 
- Args: - original_phone_texts (list): list of text substrings detected by the regex - - Returns: - phone_number_list (list): list of phone numbers splitting based on length - - Examples: - original_phone_texts = ['9820334415 91 9920441388', '9820551388982347'] - check_length(original_phone_texts=original_phone_texts) - >> ['9820334415', '91 9920441388'] - """ - phone_number_list_1, phone_number_list2 = [], [] - - for original_phone_text in original_phone_texts: - - if self.get_digit_length(text=original_phone_text) > 13: - phone_parts = original_phone_text.split() - visited = [0 for i in range(len(phone_parts))] - - for i in range(len(phone_parts)): - temp = '' - appended_parts = [] - - for j in range(i, len(phone_parts)): - if visited[j] == 0: - temp = temp + ' ' + phone_parts[j] - appended_parts.append(j) - - if 13 >= self.get_digit_length(text=temp) > 7: - phone_number_list_1.append(temp.strip()) - for m in appended_parts: - visited[m] = 1 - break - else: - phone_number_list2.append(original_phone_text) - phone_number_list_1.extend(phone_number_list2) - return phone_number_list_1 - - def get_number(self, phone): - """ - This method is used to convert phone numbers in language scripts other than English - to the English - Args: - phone (str): The string phone number which is detected and cleaned - - Returns: - phone (str): The string phone number converted to English script - - Examples: - phone = u'९१९८१९९८३१३२' - get_number(phone=phone) - '919819983132' - """ - phone_length = len(phone) - phone = str(int(phone)) - - if phone_length != len(phone): - phone = phone.zfill(phone_length) - - return phone - - def clean_phone_number(self, number): - """ - This method is used to clean the detected phone number. 
- Args: - number (str): The original substring which is detected and is required for cleaning - - Returns: - number (str): The number post cleaning - """ - # Remove (), -, whistespace, + - clean_regex = re.compile('([()\-\s\+]+)', re.U) - number = clean_regex.sub(string=number, repl='') - return number - - def get_number_regex(self): - - """ - This method is used to detect the phone number patterns from the provided text - Returns: - phone_number_list (list): list of patterns detected from the regex pattern - - (each pattern: (complete original text, area code, number)) - (we further utitlize only the complete original text) - Example: - p = PhoneDetector(entity_name='phone_number', language='hi') - text = u'Set a reminder on +1 (408) 912-6172' - p.text = text - p.get_number_regex() - - [(u'+1 (408) 912-6172', u'1', u'(408) 912-6172'), - (u'+91 9820334416', u'91', u'9820334416'), - (u'022 26129857', u'022', u'26129857')] - """ - phone_number_regex = re.compile( - r'((?:\(?\+(\d{1,2})\)?[\s\-\.]*)?((?=[\-\d()\s\.]{10,16}(?:[^\d]+|$))' - r'(?:[\d(]{1,20}(?:[\-)\s\.]*\d{1,20}){0,20}){1,20}))', re.U) - - phone_number_list = phone_number_regex.findall(self.text) - return phone_number_list - - def get_tagged_text(self): - """ - Replaces detected phone numbers with tag generated from entity_name used to initialize the object with - - A final string with all phone numbers replaced will be stored in object's tagged_text attribute - A string with all phone numbers removed will be stored in object's processed_text attribute - - """ - for detected_text in self.original_phone_text: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - self.processed_text = self.processed_text.replace(detected_text, '') From 770c06572dbae3d0851b513bbf801f74a07a0132 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 24 Sep 2019 17:27:37 +0530 Subject: [PATCH 087/237] added method to get country code from locale --- .../phone_number/phone_number_detection.py | 47 
++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index fc1490782..091fd8790 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -35,23 +35,24 @@ class PhoneDetector(BaseDetector): original_phone_text (list): list to store substrings of the text detected as phone numbers """ - def __init__(self, entity_name, language=ENGLISH_LANG, country_code="IN"): + def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): # Todo: Change default from india to get it from the bot. """ Args: entity_name (str): A string by which the detected numbers would be replaced with on calling detect_entity() language (str, optional): language code of number text, defaults to 'en' - country_code(str, optional): country code of the country from which you are using + locale(str, optional): locale of the country from which you are dialing. 
Ex: 'en-IN' """ self._supported_languages = NumberDetector.get_supported_languages() - super(PhoneDetector, self).__init__(language, country_code) + super(PhoneDetector, self).__init__(language, locale) self.language = language self.entity_name = entity_name + self.locale = locale self.text = '' self.phone = [] self.original_phone_text = [] - self.country_code = country_code.upper() + self.country_code = '' self.number_word_dict = create_number_word_dict() @property @@ -63,6 +64,14 @@ def supported_languages(self): """ return self._supported_languages + def get_country_code_from_locale(self): + """ + This method sets self.country_code from given locale + """ + regex_pattern = re.compile('[-_](.*$)', re.U) + match = regex_pattern.findall(self.locale) + self.country_code = match[0].upper() + def convert_words_to_numbers(self, text): """ :param text: user message @@ -89,6 +98,14 @@ def convert_words_to_numbers(self, text): converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) return converted_sentence + def get_correct_original_text(self, original_text): + pattern = '' + for text_match in original_text: + pattern += ('(^.*)' + '(?:' + text_match + ')') + pattern += '(.*$)' + first_pattern = re.compile(pattern, re.U) + first_matches = first_pattern.findall(self.text) + def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -98,24 +115,27 @@ def detect_entity(self, text, **kwargs): Returns: - self.phone (list): list consisting the detected phone numbers + self.phone (list): list consisting the detected phone numbers and their country calling codes self.original_phone_text (list): list containing their corresponding substrings in the original message. 
Examples: - text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' - - p = PhoneDetector(entity_name='phone_number', language='en') + text = 'call +1 (408) 912-6172' + p = PhoneDetector(entity_name='phone_number', language='en', locale='en-US') p.detect_entity(text=text) - (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) + ([{'country_calling_code':'1', phone_number':'4089126172'} ], + [u'+1 (408) 912-6172']) text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' - p = PhoneDetector(entity_name='phone_number', language='hi') + p = PhoneDetector(entity_name='phone_number', language='hi', locale='en-IN') p.detect_entity(text=text) - (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) + ([{'country_calling_code':'91', phone_number':'9819983132'} + ,{ 'country_calling_code':'91', phone_number:'9820334416'} ], + [u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ - + self.get_country_code_from_locale() + original_text = text if self.language == 'en': self.text = self.convert_words_to_numbers(text) else: @@ -125,5 +145,6 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - + # if original_text != self.text: + # self.get_correct_original_text(original_text) return self.phone, self.original_phone_text From a623aace6d16a4aedc9cf5be7b24a9931da1b526 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 24 Sep 2019 17:59:02 +0530 Subject: [PATCH 088/237] shifted _detect_absolute_number to detector preferences --- .../standard_number_range_detector.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 3edeb158d..196791b86 100644 --- 
a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -55,7 +55,8 @@ def __init__(self, entity_name, language, data_directory_path, unit_type=None): self._detect_min_num_range_with_prefix_variants, self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, - self._detect_max_num_range_with_suffix_variants + self._detect_max_num_range_with_suffix_variants, + self._detect_absolute_number ] def _init_regex_for_range(self, data_directory_path): @@ -179,33 +180,26 @@ def detect_number_range(self, text): for detector in self.detector_preferences: number_list, original_list = detector(number_list, original_list) self._update_tagged_text(original_list) - number_list, original_list = self._add_absolute_numbers(number_list, original_list) return number_list, original_list - def _add_absolute_numbers(self, number_list, original_list): - number_abs_list = number_list or [] - original_abs_list = original_list or [] + def _detect_absolute_number(self, number_list, original_list): + number_list = number_list or [] + original_list = original_list or [] abs_number_pattern = re.compile(ur'({number}\d+)'.format(number=numeral_constant.NUMBER_REPLACE_TEXT), re.UNICODE) abs_number_matches = abs_number_pattern.findall(self.processed_text) for match in abs_number_matches: - if self.unit_type: - if self.number_detected_map[match].entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] \ - is not None: - number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, - numeral_constant.NUMBER_RANGE_MIN_VALUE: None, - numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, - numeral_constant.NUMBER_RANGE_ABS_VALUE: - self.number_detected_map[match].entity_value['value']}) - original_abs_list.append(self.number_detected_map[match].original_text) - else: - number_abs_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, + 
entity_unit = self.number_detected_map[match].entity_value[ + numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] + if (self.unit_type and entity_unit == self.unit_type) or not self.unit_type: + number_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, numeral_constant.NUMBER_RANGE_MIN_VALUE: None, - numeral_constant.NUMBER_RANGE_VALUE_UNIT: self.unit_type, + numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[ match].entity_value}) - original_abs_list.append(self.number_detected_map[match].original_text) - return number_abs_list, original_abs_list + original_list.append(self.number_detected_map[match].original_text) + return number_list, original_list + def _get_number_range(self, min_part_match, max_part_match, full_match): """ Update number_range_list and original_list by finding entity value of number tag and original text from From e2c2b2b8bcb01e291e1400201f9a3f7584493ecf Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 24 Sep 2019 18:23:18 +0530 Subject: [PATCH 089/237] added _detect_absolute_number in ennglish --- .../numeral/number_range/en/number_range_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py index 9c73c1015..c65b8f309 100644 --- a/ner_v2/detectors/numeral/number_range/en/number_range_detection.py +++ b/ner_v2/detectors/numeral/number_range/en/number_range_detection.py @@ -23,7 +23,8 @@ def __init__(self, entity_name, language, unit_type=None): self._detect_min_num_range_with_prefix_variants, self._detect_min_num_range_with_suffix_variants, self._detect_max_num_range_with_prefix_variants, - self._detect_max_num_range_with_suffix_variants + self._detect_max_num_range_with_suffix_variants, + self._detect_absolute_number ] def _custom_num_range_between_num_and_num(self, number_range_list=None, 
original_list=None): From df720d54e5794e491bdcb5a76dcf4f4c6f4ec0fa Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 24 Sep 2019 18:31:31 +0530 Subject: [PATCH 090/237] resolved lint errors and _detect_absolute_number --- .../number_range/standard_number_range_detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 63aaad9d7..dff9b6132 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -193,10 +193,10 @@ def _detect_absolute_number(self, number_list, original_list): numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] if (self.unit_type and entity_unit == self.unit_type) or not self.unit_type: number_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, - numeral_constant.NUMBER_RANGE_MIN_VALUE: None, - numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, - numeral_constant.NUMBER_RANGE_ABS_VALUE: self.number_detected_map[ - match].entity_value}) + numeral_constant.NUMBER_RANGE_MIN_VALUE: None, + numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, + numeral_constant.NUMBER_RANGE_ABS_VALUE: self. 
+ number_detected_map[match].entity_value.NUMBER_DETECTION_RETURN_DICT_VALUE}) original_list.append(self.number_detected_map[match].original_text) return number_list, original_list From c5e9ee245b55827057244c9975603474918be43a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 24 Sep 2019 18:34:14 +0530 Subject: [PATCH 091/237] resolve output error --- .../numeral/number_range/standard_number_range_detector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index dff9b6132..8c25e6722 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -196,7 +196,8 @@ def _detect_absolute_number(self, number_list, original_list): numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, numeral_constant.NUMBER_RANGE_ABS_VALUE: self. - number_detected_map[match].entity_value.NUMBER_DETECTION_RETURN_DICT_VALUE}) + number_detected_map[match]. 
+ entity_value[numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE]}) original_list.append(self.number_detected_map[match].original_text) return number_list, original_list From 1424989f36233b6a60d53cba8ceecb239dbfba05 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 01:19:57 +0530 Subject: [PATCH 092/237] added locale to date detector --- .../detectors/temporal/date/date_detection.py | 9 +++++--- .../temporal/date/en/date_detection.py | 22 ++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 20214e041..e21838341 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -756,7 +756,7 @@ class DateDetector(object): language: source language of text """ - def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: @@ -777,13 +777,15 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None self.language = language + self.locale = locale try: date_detector_module = importlib.import_module( 'ner_v2.detectors.temporal.date.{0}.date_detection'.format(self.language)) self.language_date_detector = date_detector_module.DateDetector(entity_name=self.entity_name, past_date_referenced=past_date_referenced, - timezone=self.timezone) + timezone=self.timezone, + locale=self.locale) except ImportError: standard_date_regex = importlib.import_module( 'ner_v2.detectors.temporal.date.standard_date_regex' @@ -793,7 +795,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date 
data_directory_path=get_lang_data_path(detector_path=os.path.abspath(__file__), lang_code=self.language), timezone=self.timezone, - past_date_referenced=past_date_referenced + past_date_referenced=past_date_referenced, + locale=self.locale ) def detect_entity(self, text, **kwargs): diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index df8ff7e7d..8a8dcd421 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -96,6 +96,12 @@ def __init__(self, entity_name, locale, timezone='UTC', past_date_referenced=Fal self.day_dictionary = DAY_DICT self.bot_message = None self.locale = locale + self.country_code = '' + + def get_country_code_from_locale(self): + regex_pattern = re.compile('[-_](.*$)', re.U) + match = regex_pattern.findall(self.locale) + self.country_code = match[0].upper() def detect_date(self, text): """ @@ -112,7 +118,7 @@ def detect_date(self, text): self.text = " " + text.strip().lower() + " " self.processed_text = self.text self.tagged_text = self.text - + self.get_country_code_from_locale() date_list = [] original_list = [] date_list, original_list = self.get_exact_date(date_list, original_list) @@ -148,10 +154,16 @@ def get_exact_date(self, date_list, original_list): corresponding substrings in the given text. 
""" - date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) - self._update_processed_text(original_list) + if self.country_code in ['US']: + date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) + self._update_processed_text(original_list) + date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) + self._update_processed_text(original_list) + else: + date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) + self._update_processed_text(original_list) + date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) + self._update_processed_text(original_list) date_list, original_list = self._gregorian_year_month_day_format(date_list, original_list) self._update_processed_text(original_list) date_list, original_list = self._gregorian_advanced_day_month_year_format(date_list, original_list) From 60cb6f8298355bffc287dc856084dc0d9829ed8a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 12:11:56 +0530 Subject: [PATCH 093/237] added locale in detector api, set locale=None in date_detection --- ner_constants.py | 4 ++++ ner_v2/api.py | 9 +++++++-- ner_v2/detectors/temporal/date/date_detection.py | 2 +- ner_v2/detectors/temporal/date/en/date_detection.py | 7 +++++-- ner_v2/detectors/temporal/date/hi/date_detection.py | 2 +- ner_v2/detectors/temporal/date/standard_date_regex.py | 2 +- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/ner_constants.py b/ner_constants.py index aab49c427..bc59a4413 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -56,3 +56,7 @@ PARAMETER_MIN_DIGITS = 'min_number_digits' PARAMETER_MAX_DIGITS = 'max_number_digits' PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' + +# Locale for Date and Phone Number detection 
+PARAMETER_LOCALE = 'locale' + diff --git a/ner_v2/api.py b/ner_v2/api.py index 63657248d..07510edb5 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -3,7 +3,8 @@ from ner_constants import PARAMETER_MESSAGE, PARAMETER_ENTITY_NAME, PARAMETER_STRUCTURED_VALUE, \ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ - PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE + PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \ + PARAMETER_LOCALE from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector from ner_v2.detectors.temporal.time.time_detection import TimeDetector @@ -40,6 +41,7 @@ def get_parameters_dictionary(request): PARAMETER_MIN_DIGITS: request.GET.get('min_number_digits'), PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request.GET.get('unit_type'), + PARAMETER_LOCALE: request.GET.get('locale'), } return parameters_dict @@ -68,7 +70,8 @@ def parse_post_request(request): PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG), PARAMETER_MIN_DIGITS: request_data.get('min_number_digits'), PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), - PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type') + PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type'), + PARAMETER_LOCALE: request_data.get('locale'), } return parameters_dict @@ -95,6 +98,7 @@ def date(request): timezone (str): timezone of the user source_language (str): source language code (ISO 639-1) language_script (str): language code of script (ISO 639-1) + locale (str): locale of the user(ISO 639-1) Returns: response (django.http.response.HttpResponse): HttpResponse object @@ -109,6 +113,7 @@ def date(request): timezone = 'UTC' source_language = 'hi' language_script = 'en' + locale = 'hi-in' output = date(request) print output diff 
--git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index e21838341..67b0b8a09 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -756,7 +756,7 @@ class DateDetector(object): language: source language of text """ - def __init__(self, entity_name, locale, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 8a8dcd421..47901d917 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -70,7 +70,7 @@ class DateDetector(object): text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, locale, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, timezone='UTC', past_date_referenced=False): """Initializes a DateDetector object with given entity_name and pytz timezone object Args: @@ -101,7 +101,10 @@ def __init__(self, entity_name, locale, timezone='UTC', past_date_referenced=Fal def get_country_code_from_locale(self): regex_pattern = re.compile('[-_](.*$)', re.U) match = regex_pattern.findall(self.locale) - self.country_code = match[0].upper() + if match is not None: + self.country_code = match[0].upper() + else: + self.country_code = None def detect_date(self, text): """ diff --git a/ner_v2/detectors/temporal/date/hi/date_detection.py b/ner_v2/detectors/temporal/date/hi/date_detection.py index f4e11d00a..289259a74 100644 --- a/ner_v2/detectors/temporal/date/hi/date_detection.py +++ 
b/ner_v2/detectors/temporal/date/hi/date_detection.py @@ -10,7 +10,7 @@ class DateDetector(BaseRegexDate): data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), LANGUAGE_DATA_DIRECTORY) - def __init__(self, entity_name, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, locale=None, timezone='UTC', past_date_referenced=False): super(DateDetector, self).__init__(entity_name, data_directory_path=DateDetector.data_directory_path, timezone=timezone, diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index efcea1aa4..ec9716943 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -13,7 +13,7 @@ class BaseRegexDate(object): - def __init__(self, entity_name, data_directory_path, timezone='UTC', past_date_referenced=False): + def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC', past_date_referenced=False): """ Base Regex class which will be imported by language date class by giving their data folder path This will create standard regex and their parser to detect date for given language. 
From 6ac0e22f8db411836234bb90e7ec71c5c0694902 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 12:22:32 +0530 Subject: [PATCH 094/237] add if not None for get_country_code_from_locale() --- ner_v2/detectors/temporal/date/en/date_detection.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 47901d917..c24d59a8d 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -96,12 +96,12 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self.day_dictionary = DAY_DICT self.bot_message = None self.locale = locale - self.country_code = '' + self.country_code = None def get_country_code_from_locale(self): regex_pattern = re.compile('[-_](.*$)', re.U) match = regex_pattern.findall(self.locale) - if match is not None: + if match: self.country_code = match[0].upper() else: self.country_code = None @@ -121,7 +121,8 @@ def detect_date(self, text): self.text = " " + text.strip().lower() + " " self.processed_text = self.text self.tagged_text = self.text - self.get_country_code_from_locale() + if self.locale: + self.get_country_code_from_locale() date_list = [] original_list = [] date_list, original_list = self.get_exact_date(date_list, original_list) From 6ac6fe5bbfcbfe97bc6f127ed777778b84babd11 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 12:48:15 +0530 Subject: [PATCH 095/237] added locale in comments --- ner_v2/detectors/temporal/date/date_detection.py | 1 + ner_v2/detectors/temporal/date/en/date_detection.py | 2 ++ ner_v2/detectors/temporal/date/standard_date_regex.py | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 67b0b8a09..6fc7e42d1 100644 --- 
a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -765,6 +765,7 @@ def __init__(self, entity_name, locale=None, language=ENGLISH_LANG, timezone='UT timezone (Optional, str): timezone identifier string that is used to create a pytz timezone object default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' + locale(Optional, str): user locale default is None """ self.text = '' self.tagged_text = '' diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index c24d59a8d..79a7d6bae 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -28,6 +28,7 @@ class DateDetector(object): original_date_text: list to store substrings of the text detected as date entities tag: entity_name prepended and appended with '__' timezone: Optional, pytz.timezone object used for getting current time, default is pytz.timezone('UTC') + locale: Optional, locale of the user for getting country code now_date: datetime object holding timestamp while DateDetector instantiation month_dictionary: dictonary mapping month indexes to month spellings and fuzzy variants(spell errors, abbreviations) @@ -79,6 +80,7 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference timezone (Optional, str): timezone identifier string that is used to create a pytz timezone object default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' + locale (Optional, str): user locale for getting the country code. 
""" self.text = '' diff --git a/ner_v2/detectors/temporal/date/standard_date_regex.py b/ner_v2/detectors/temporal/date/standard_date_regex.py index ec9716943..5fb5815eb 100644 --- a/ner_v2/detectors/temporal/date/standard_date_regex.py +++ b/ner_v2/detectors/temporal/date/standard_date_regex.py @@ -19,9 +19,10 @@ def __init__(self, entity_name, data_directory_path, locale=None, timezone='UTC' This will create standard regex and their parser to detect date for given language. Args: data_directory_path (str): path of data folder for given language - timezone (str): user timezone default UTC + timezone (Optional, str): user timezone default UTC past_date_referenced (boolean): if the date reference is in past, this is helpful for text like 'kal', 'parso' to know if the reference is past or future. + locale (Optional, str): user locale default None """ self.text = '' self.tagged_text = '' From fa037358ece101a00c953869dd16b55ed2b4b644 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 15:07:39 +0530 Subject: [PATCH 096/237] bug-fix in _detect_absolute_number --- .../numeral/number_range/standard_number_range_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 8c25e6722..220a86164 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -191,7 +191,7 @@ def _detect_absolute_number(self, number_list, original_list): for match in abs_number_matches: entity_unit = self.number_detected_map[match].entity_value[ numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT] - if (self.unit_type and entity_unit == self.unit_type) or not self.unit_type: + if (self.unit_type and entity_unit) or not self.unit_type: number_list.append({numeral_constant.NUMBER_RANGE_MAX_VALUE: None, 
numeral_constant.NUMBER_RANGE_MIN_VALUE: None, numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit, From bb624b0a308ee1273e521796b6be8ed6d918e990 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 16:48:47 +0530 Subject: [PATCH 097/237] bug-fix in min>max check --- .../numeral/number_range/standard_number_range_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 220a86164..bd439da5d 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -238,7 +238,7 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): return number_range, original_text if min_part_match and max_part_match: - if entity_value_min > entity_value_max: + if int(entity_value_min) > int(entity_value_max): temp = entity_value_max entity_value_max = entity_value_min entity_value_min = temp From a7571601d9ccfbdf1158d4bf670eb32af8bcefdc Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 19:35:55 +0530 Subject: [PATCH 098/237] bug-fix in _update_tagged_text of number and number range detectors --- .../numeral/number/standard_number_detector.py | 18 +++++++++++++++--- .../standard_number_range_detector.py | 12 ++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index fc69f4a4a..d2c18e167 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -2,7 +2,15 @@ import pandas as pd import collections import os -import re +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + + import re + _re_flags = re.UNICODE + 
from ner_v2.detectors.numeral.constant import NUMBER_NUMERAL_FILE_VARIANTS_COLUMN_NAME, \ NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME, NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME, NUMBER_TYPE_UNIT, \ @@ -306,9 +314,13 @@ def _update_processed_text(self, original_number_list): original_number_list (list): list of substrings of original text to be replaced with tag created from entity_name """ + # for detected_text in original_number_list: + # self.tagged_text = self.tagged_text.replace(detected_text, self.tag) + # self.processed_text = self.processed_text.replace(detected_text, '') + for detected_text in original_number_list: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - self.processed_text = self.processed_text.replace(detected_text, '') + _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) + self.tagged_text = _pattern.sub(self.tag, self.tagged_text) class NumberDetector(BaseNumberDetector): diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index bd439da5d..a24d7d72c 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -4,10 +4,17 @@ import pandas as pd import collections import os -import re import ner_v2.detectors.numeral.constant as numeral_constant from ner_v2.detectors.numeral.utils import get_list_from_pipe_sep_string from ner_v2.detectors.numeral.number.number_detection import NumberDetector +try: + import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD + +except ImportError: + + import re + _re_flags = re.UNICODE NumberRangeVariant = collections.namedtuple('NumberRangeVariant', ['position', 'range_type']) ValueTextPair = collections.namedtuple('ValueTextPair', ['entity_value', 'original_text']) @@ -414,7 +421,8 @@ def _update_tagged_text(self, original_number_list): created from entity_name 
""" for detected_text in original_number_list: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) + _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) + self.tagged_text = _pattern.sub(self.tag, self.tagged_text) class NumberRangeDetector(BaseNumberRangeDetector): From 4b6464cdf25fa1bb14ac71b75743e9fac84315f7 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 22:28:48 +0530 Subject: [PATCH 099/237] add processed text --- ner_v2/detectors/numeral/number/standard_number_detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index d2c18e167..4e7c1b193 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -321,6 +321,7 @@ def _update_processed_text(self, original_number_list): for detected_text in original_number_list: _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) self.tagged_text = _pattern.sub(self.tag, self.tagged_text) + self.processed_text = _pattern.sub('', self.processed_text) class NumberDetector(BaseNumberDetector): From b2aaedadda65a184129d99a4b75d18a84b7ab2a4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 22:48:02 +0530 Subject: [PATCH 100/237] add processed text --- ner_v2/detectors/numeral/number/standard_number_detector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 4e7c1b193..6cd6c0f0d 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -294,7 +294,8 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): unit = None if self.unit_type: unit, original_text = 
self._get_unit_from_text(original_text, processed_text) - processed_text = processed_text.replace(original_text, self.tag) + # processed_text = processed_text.replace(original_text, self.tag) + processed_text = processed_text.replace(original_text, '') number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit From dd2651c039406468b29d0e181429c713c1d7263f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 22:52:15 +0530 Subject: [PATCH 101/237] add processed text --- ner_v2/detectors/numeral/number/standard_number_detector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 6cd6c0f0d..4e7c1b193 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -294,8 +294,7 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, processed_text) - # processed_text = processed_text.replace(original_text, self.tag) - processed_text = processed_text.replace(original_text, '') + processed_text = processed_text.replace(original_text, self.tag) number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit From 5c46672e2a6b3ad8997d65f885131978d649ec83 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 23:01:40 +0530 Subject: [PATCH 102/237] fix regex in _get_unit_from_text of number detector --- ner_v2/detectors/numeral/number/standard_number_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 4e7c1b193..4e91df581 100644 --- 
a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -159,7 +159,7 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'\W+((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')|(' + + unit_matches = re.search(r'\W+((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')\W+|(' + detected_original + r'\s*(' + self.unit_choices + r'))\W+', processed_text, re.UNICODE) if unit_matches: From 97b146cc905fef944b351516559159b7a89a6396 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 23:25:40 +0530 Subject: [PATCH 103/237] fix regex in _get_unit_from_text of number detector --- .../numeral/number/standard_number_detector.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 4e91df581..66d9d7c3f 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -2,15 +2,17 @@ import pandas as pd import collections import os + try: import regex as re + _re_flags = re.UNICODE | re.V1 | re.WORD except ImportError: import re - _re_flags = re.UNICODE + _re_flags = re.UNICODE from ner_v2.detectors.numeral.constant import NUMBER_NUMERAL_FILE_VARIANTS_COLUMN_NAME, \ NUMBER_NUMERAL_FILE_VALUE_COLUMN_NAME, NUMBER_NUMERAL_FILE_TYPE_COLUMN_NAME, NUMBER_TYPE_UNIT, \ @@ -159,8 +161,10 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'\W+((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')\W+|(' + - detected_original + r'\s*(' + 
self.unit_choices + r'))\W+', processed_text, + unit_matches = re.search(r'(\W+|^)((' + self.unit_choices + r')[\.\,\s]*' + detected_original + + r')(?:\W+|$)|(\W+|^)(' + detected_original + r'\s*(' + + self.unit_choices + r'))(?:\W+|$)', + processed_text, re.UNICODE) if unit_matches: original_text_prefix, unit_prefix, original_text_suffix, unit_suffix = unit_matches.groups() From bc1b26d51c4f5d41099b3f5c9bd1101c22d9b18d Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 23:33:11 +0530 Subject: [PATCH 104/237] fix regex in _get_unit_from_text of number detector --- ner_v2/detectors/numeral/number/standard_number_detector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 66d9d7c3f..050a8a3d5 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -161,8 +161,7 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'(\W+|^)((' + self.unit_choices + r')[\.\,\s]*' + detected_original + - r')(?:\W+|$)|(\W+|^)(' + detected_original + r'\s*(' + unit_matches = re.search(r'(\W+|^)((' + self.unit_choices + r')[\.\,\s]*' + detected_original +r')(?:\W+|$)|(\W+|^)(' + detected_original + r'\s*(' + self.unit_choices + r'))(?:\W+|$)', processed_text, re.UNICODE) From 81cd9d0889a4fce199131f10d0e84b66503573f5 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 25 Sep 2019 23:36:06 +0530 Subject: [PATCH 105/237] fix regex in _get_unit_from_text of number detector --- ner_v2/detectors/numeral/number/standard_number_detector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py 
b/ner_v2/detectors/numeral/number/standard_number_detector.py index 050a8a3d5..4a83e87c4 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -161,7 +161,8 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'(\W+|^)((' + self.unit_choices + r')[\.\,\s]*' + detected_original +r')(?:\W+|$)|(\W+|^)(' + detected_original + r'\s*(' + unit_matches = re.search(r'(?:\W+|^)((' + self.unit_choices + r')[.,\s]*' + detected_original + + r')(?:\W+|$)|(?:\W+|^)(' + detected_original + r'\s*(' + self.unit_choices + r'))(?:\W+|$)', processed_text, re.UNICODE) From 712e506311169051c0e95614f927ddd5eb71ff41 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 00:03:26 +0530 Subject: [PATCH 106/237] added prints to debug --- .../numeral/number/standard_number_detector.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 4a83e87c4..aa7591770 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -161,11 +161,12 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'(?:\W+|^)((' + self.unit_choices + r')[.,\s]*' + detected_original + - r')(?:\W+|$)|(?:\W+|^)(' + detected_original + r'\s*(' - + self.unit_choices + r'))(?:\W+|$)', + unit_matches = re.search(r'\W+((' + self.unit_choices + r')[.,\s]*' + detected_original + + r')\W+|\W+(' + detected_original + r'\s*(' + + self.unit_choices + r'))\W+', processed_text, re.UNICODE) + print('169 unit 
matches',unit_matches) if unit_matches: original_text_prefix, unit_prefix, original_text_suffix, unit_suffix = unit_matches.groups() if unit_suffix: @@ -297,6 +298,8 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): number = int(number) if number.is_integer() else number unit = None if self.unit_type: + print(original_text, 'before get unit', processed_text) + print('the pattern which is', pattern) unit, original_text = self._get_unit_from_text(original_text, processed_text) processed_text = processed_text.replace(original_text, self.tag) number_list.append({ @@ -323,6 +326,7 @@ def _update_processed_text(self, original_number_list): # self.processed_text = self.processed_text.replace(detected_text, '') for detected_text in original_number_list: + print('in update processed 328 original num list:', original_number_list) _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) self.tagged_text = _pattern.sub(self.tag, self.tagged_text) self.processed_text = _pattern.sub('', self.processed_text) From cd901c458118ffd5d3e93c502f5145741dbd119a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 00:10:06 +0530 Subject: [PATCH 107/237] added prints to debug --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index aa7591770..683b8c791 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -301,7 +301,9 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): print(original_text, 'before get unit', processed_text) print('the pattern which is', pattern) unit, original_text = self._get_unit_from_text(original_text, processed_text) - processed_text = processed_text.replace(original_text, self.tag) + _pattern = 
re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) + processed_text = _pattern.sub(self.tag, processed_text) + # processed_text = processed_text.replace(original_text, self.tag) number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit From 6c8cf4c598d7ca92ff03a5a9a212ca4060d9b5a0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 00:16:45 +0530 Subject: [PATCH 108/237] remove prints --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 683b8c791..62f8685d9 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -166,7 +166,6 @@ def _get_unit_from_text(self, detected_original, processed_text): + self.unit_choices + r'))\W+', processed_text, re.UNICODE) - print('169 unit matches',unit_matches) if unit_matches: original_text_prefix, unit_prefix, original_text_suffix, unit_suffix = unit_matches.groups() if unit_suffix: @@ -298,8 +297,6 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): number = int(number) if number.is_integer() else number unit = None if self.unit_type: - print(original_text, 'before get unit', processed_text) - print('the pattern which is', pattern) unit, original_text = self._get_unit_from_text(original_text, processed_text) _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) processed_text = _pattern.sub(self.tag, processed_text) @@ -328,7 +325,6 @@ def _update_processed_text(self, original_number_list): # self.processed_text = self.processed_text.replace(detected_text, '') for detected_text in original_number_list: - print('in update processed 328 original num list:', original_number_list) _pattern = re.compile(r'\b%s\b' % 
re.escape(detected_text), flags=_re_flags) self.tagged_text = _pattern.sub(self.tag, self.tagged_text) self.processed_text = _pattern.sub('', self.processed_text) From 0243808a234eaa186db984030f768fc4418816f2 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 00:27:14 +0530 Subject: [PATCH 109/237] remove commented code lines --- ner_v2/detectors/numeral/number/standard_number_detector.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 62f8685d9..642f9486e 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -300,7 +300,6 @@ def _detect_number_from_digit(self, number_list=None, original_list=None): unit, original_text = self._get_unit_from_text(original_text, processed_text) _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) processed_text = _pattern.sub(self.tag, processed_text) - # processed_text = processed_text.replace(original_text, self.tag) number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit @@ -320,10 +319,6 @@ def _update_processed_text(self, original_number_list): original_number_list (list): list of substrings of original text to be replaced with tag created from entity_name """ - # for detected_text in original_number_list: - # self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - # self.processed_text = self.processed_text.replace(detected_text, '') - for detected_text in original_number_list: _pattern = re.compile(r'\b%s\b' % re.escape(detected_text), flags=_re_flags) self.tagged_text = _pattern.sub(self.tag, self.tagged_text) From 57b49ebf08d9f0dafcf7f79840d7b44c7147a32f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 11:02:37 +0530 Subject: [PATCH 110/237] fix lint errors --- 
ner_v2/detectors/numeral/number/standard_number_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 642f9486e..7d83c4a0b 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -161,8 +161,8 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) - unit_matches = re.search(r'\W+((' + self.unit_choices + r')[.,\s]*' + detected_original + - r')\W+|\W+(' + detected_original + r'\s*(' + unit_matches = re.search(r'\W+((' + self.unit_choices + r')[.,\s]*' + detected_original + r')\W+|\W+(' + + detected_original + r'\s*(' + self.unit_choices + r'))\W+', processed_text, re.UNICODE) From 9279048a2466a8ba1d55ff8de79c96d7b2fda322 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 11:11:35 +0530 Subject: [PATCH 111/237] fix lint errors --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 7d83c4a0b..374456924 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -162,8 +162,8 @@ def _get_unit_from_text(self, detected_original, processed_text): # add re.escape to handle decimal cases in detected original detected_original = re.escape(detected_original) unit_matches = re.search(r'\W+((' + self.unit_choices + r')[.,\s]*' + detected_original + r')\W+|\W+(' + - detected_original + r'\s*(' - + self.unit_choices + r'))\W+', + detected_original + r'\s*(' + + self.unit_choices + r'))\W+', processed_text, re.UNICODE) if 
unit_matches: From e72937ddcb3d4ebdacec876f82a5fadd1fa97945 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 11:21:26 +0530 Subject: [PATCH 112/237] bug fix in number from word --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 374456924..03a3d024c 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -225,7 +225,9 @@ def _detect_number_from_words(self, number_list=None, original_list=None): unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, numeral_text) - numeral_text = numeral_text.replace(original_text, self.tag) + # numeral_text = numeral_text.replace(original_text, self.tag) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text), flags=_re_flags) + numeral_text = _pattern.sub(self.tag, numeral_text) number_list.append({ NUMBER_DETECTION_RETURN_DICT_VALUE: str(number), NUMBER_DETECTION_RETURN_DICT_UNIT: unit From f9cbc27ba473b7fb3869f2e69cf7928d05b41eb4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 12:08:22 +0530 Subject: [PATCH 113/237] added sorted function in _detect_number_from_words --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 03a3d024c..a5d7b2850 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -221,7 +221,9 @@ def _detect_number_from_words(self, number_list=None, original_list=None): numeral_text_list = re.split(r'[\-\:]', self.processed_text) for numeral_text in 
numeral_text_list: numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map) - for number, original_text in zip(numbers, original_texts): + full_list = zip(numbers, original_texts) + sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True) + for number, original_text in sorted_full_list: unit = None if self.unit_type: unit, original_text = self._get_unit_from_text(original_text, numeral_text) From 36a4e643eeb0f513fd4c410fa410516b017ac561 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 12:37:12 +0530 Subject: [PATCH 114/237] fixed lint errors for zip --- ner_v2/detectors/numeral/number/standard_number_detector.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index a5d7b2850..1d7ffea37 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -221,7 +221,11 @@ def _detect_number_from_words(self, number_list=None, original_list=None): numeral_text_list = re.split(r'[\-\:]', self.processed_text) for numeral_text in numeral_text_list: numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map) - full_list = zip(numbers, original_texts) + full_list = list(zip(numbers, original_texts)) + """ + list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and + our lint checker is matching it for python 3 + """ sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True) for number, original_text in sorted_full_list: unit = None From 197d522e4641c0c4b752c9d3958d611f07c38bd2 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 12:40:27 +0530 Subject: [PATCH 115/237] fixed lint error --- ner_v2/detectors/numeral/number/standard_number_detector.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py index 1d7ffea37..b4b0ce798 100644 --- a/ner_v2/detectors/numeral/number/standard_number_detector.py +++ b/ner_v2/detectors/numeral/number/standard_number_detector.py @@ -222,8 +222,8 @@ def _detect_number_from_words(self, number_list=None, original_list=None): for numeral_text in numeral_text_list: numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map) full_list = list(zip(numbers, original_texts)) - """ - list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and + """ + list() is added to above zip as in python 3, zip() returns a zip object instead of zip function and our lint checker is matching it for python 3 """ sorted_full_list = sorted(full_list, key=lambda kv: len(kv[1]), reverse=True) From aec2e2cc2e1da5faffe2422aa2cfef6b223a5372 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 14:48:46 +0530 Subject: [PATCH 116/237] support decimal types in min>max check --- .../number_range/standard_number_range_detector.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index a24d7d72c..bf9e1c4d8 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -245,10 +245,16 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): return number_range, original_text if min_part_match and max_part_match: - if int(entity_value_min) > int(entity_value_max): - temp = entity_value_max - entity_value_max = entity_value_min - entity_value_min = temp + try: + if int(entity_value_min) > int(entity_value_max): + temp = 
entity_value_max + entity_value_max = entity_value_min + entity_value_min = temp + except : + if float(entity_value_min) > float(entity_value_max): + temp = entity_value_max + entity_value_max = entity_value_min + entity_value_min = temp original_text = self._get_original_text_from_tagged_text(full_match) if (entity_value_min or entity_value_max) and original_text: From 90001871b1c20d161928c3d27dab25ea4c98175b Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 16:35:44 +0530 Subject: [PATCH 117/237] added locale in api.py date parameters --- ner_v2/api.py | 3 ++- .../temporal/date/en/date_detection.py | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index 07510edb5..90c458489 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -135,7 +135,8 @@ def date(request): date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, - past_date_referenced=past_date_referenced) + past_date_referenced=past_date_referenced, + locale=parameters_dict[PARAMETER_LOCALE]) date_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 79a7d6bae..301e46619 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -99,6 +99,29 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self.bot_message = None self.locale = locale self.country_code = None + self.detector_preferences = [self._gregorian_day_month_year_format, + self._gregorian_month_day_year_format, + self._gregorian_year_month_day_format, + self._gregorian_advanced_day_month_year_format, + self._day_month_format_for_arrival_departure, + self._date_range_ddth_of_mmm_to_ddth, + 
self._date_range_ddth_to_ddth_of_next_month, + self._gregorian_day_with_ordinals_month_year_format, + self._gregorian_advanced_year_month_day_format, + self._gregorian_year_day_month_format, + self._gregorian_month_day_with_ordinals_year_format, + self._gregorian_day_month_format, + self._gregorian_month_day_format, + self._day_after_tomorrow, + self._date_days_after, + self._date_days_later, + self._day_before_yesterday, + self._todays_date, + self._tomorrows_date, + self._yesterdays_date, + self._day_in_next_week, + self._day_range_for_nth_week_month + ] def get_country_code_from_locale(self): regex_pattern = re.compile('[-_](.*$)', re.U) From 1819cac2f8eb6d69b6e610ddcf63f77b58d7b44e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 16:48:41 +0530 Subject: [PATCH 118/237] add locale in DateAdvancedDetector --- ner_v2/api.py | 1 + ner_v2/detectors/temporal/date/date_detection.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index 90c458489..a21ffb6f5 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -132,6 +132,7 @@ def date(request): timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' date_past_reference = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, "false") past_date_referenced = date_past_reference == 'true' or date_past_reference == 'True' + locale = date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 6fc7e42d1..599744f8d 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -58,7 +58,8 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): + 
def __init__(self, entity_name='date',locale=None, language=ENGLISH_LANG, timezone='UTC', + past_date_referenced=False): """ Initializes the DateDetector object with given entity_name and pytz timezone object @@ -70,6 +71,7 @@ def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', pa default is UTC past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso' """ + self.locale = locale self._supported_languages = self.get_supported_languages() super(DateAdvancedDetector, self).__init__(language=language) self.text = '' @@ -82,7 +84,8 @@ def __init__(self, entity_name='date', language=ENGLISH_LANG, timezone='UTC', pa self.date_detector_object = DateDetector(entity_name=entity_name, language=language, timezone=timezone, - past_date_referenced=past_date_referenced) + past_date_referenced=past_date_referenced, + locale=locale) self.bot_message = None @property From 29cf05c4b9cf488bdfaf686fe243165695a75773 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 16:57:45 +0530 Subject: [PATCH 119/237] bug fix in api.py --- ner_v2/api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index a21ffb6f5..90c458489 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -132,7 +132,6 @@ def date(request): timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' date_past_reference = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, "false") past_date_referenced = date_past_reference == 'true' or date_past_reference == 'True' - locale = date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone, From 8b02c10733b3ac15a18014e98fdbd6b6e553b0a1 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 18:22:21 +0530 Subject: [PATCH 120/237] added self.country_date_detector_preferences --- .../temporal/date/en/date_detection.py | 122 +++++++++++------- 1 file changed, 72 
insertions(+), 50 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 301e46619..a3630cc74 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -122,8 +122,17 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self._day_in_next_week, self._day_range_for_nth_week_month ] + self.country_date_detector_preferences = { + 'US': [self._gregorian_month_day_year_format], + 'IN': [self._gregorian_day_month_year_format], + } def get_country_code_from_locale(self): + """ + Extracts locale from country code. + Ex: locale:'en_us' sets, + self.country_code = 'US' + """ regex_pattern = re.compile('[-_](.*$)', re.U) match = regex_pattern.findall(self.locale) if match: @@ -183,57 +192,70 @@ def get_exact_date(self, date_list, original_list): corresponding substrings in the given text. """ - if self.country_code in ['US']: - date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) + if self.country_code in self.country_date_detector_preferences: + for preferred_detector in self.country_date_detector_preferences[self.country_code]: + date_list, original_list = preferred_detector(date_list, original_list) + self._update_processed_text(original_list) + for detector in self.detector_preferences: + if detector not in self.country_date_detector_preferences[self.country_code]: + date_list, original_list = detector(date_list, original_list) + self._update_processed_text(original_list) else: - date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = 
self._gregorian_month_day_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_year_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_advanced_day_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_month_format_for_arrival_departure(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_range_ddth_of_mmm_to_ddth(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_range_ddth_to_ddth_of_next_month(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_day_with_ordinals_month_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_advanced_year_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_year_day_month_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_with_ordinals_year_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_day_month_format(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._gregorian_month_day_format(date_list, original_list) - self._update_processed_text(original_list) - - date_list, original_list = self._day_after_tomorrow(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_days_after(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._date_days_later(date_list, 
original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_before_yesterday(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._todays_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._tomorrows_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._yesterdays_date(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_in_next_week(date_list, original_list) - self._update_processed_text(original_list) - date_list, original_list = self._day_range_for_nth_week_month(date_list, original_list) - self._update_processed_text(original_list) + for detector in self.detector_preferences: + date_list, original_list = detector(date_list, original_list) + self._update_processed_text(original_list) + + # if self.country_code in ['US']: + # date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # else: + # date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_year_month_day_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_advanced_day_month_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._day_month_format_for_arrival_departure(date_list, original_list) + # 
self._update_processed_text(original_list) + # date_list, original_list = self._date_range_ddth_of_mmm_to_ddth(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._date_range_ddth_to_ddth_of_next_month(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_day_with_ordinals_month_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_advanced_year_month_day_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_year_day_month_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_month_day_with_ordinals_year_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_day_month_format(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._gregorian_month_day_format(date_list, original_list) + # self._update_processed_text(original_list) + # + # date_list, original_list = self._day_after_tomorrow(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._date_days_after(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._date_days_later(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._day_before_yesterday(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._todays_date(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._tomorrows_date(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, 
original_list = self._yesterdays_date(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._day_in_next_week(date_list, original_list) + # self._update_processed_text(original_list) + # date_list, original_list = self._day_range_for_nth_week_month(date_list, original_list) + # self._update_processed_text(original_list) return date_list, original_list From 4c916ae13bec432a5a2d50d014919e2c2ece5f82 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 18:27:58 +0530 Subject: [PATCH 121/237] removed commented code --- .../temporal/date/en/date_detection.py | 52 ------------------- 1 file changed, 52 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index a3630cc74..1269bcddc 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -205,58 +205,6 @@ def get_exact_date(self, date_list, original_list): date_list, original_list = detector(date_list, original_list) self._update_processed_text(original_list) - # if self.country_code in ['US']: - # date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # else: - # date_list, original_list = self._gregorian_day_month_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_month_day_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_year_month_day_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_advanced_day_month_year_format(date_list, original_list) - # 
self._update_processed_text(original_list) - # date_list, original_list = self._day_month_format_for_arrival_departure(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._date_range_ddth_of_mmm_to_ddth(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._date_range_ddth_to_ddth_of_next_month(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_day_with_ordinals_month_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_advanced_year_month_day_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_year_day_month_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_month_day_with_ordinals_year_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_day_month_format(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._gregorian_month_day_format(date_list, original_list) - # self._update_processed_text(original_list) - # - # date_list, original_list = self._day_after_tomorrow(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._date_days_after(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._date_days_later(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._day_before_yesterday(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._todays_date(date_list, original_list) - # self._update_processed_text(original_list) - 
# date_list, original_list = self._tomorrows_date(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._yesterdays_date(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._day_in_next_week(date_list, original_list) - # self._update_processed_text(original_list) - # date_list, original_list = self._day_range_for_nth_week_month(date_list, original_list) - # self._update_processed_text(original_list) - return date_list, original_list def get_possible_date(self, date_list=None, original_list=None): From 27685893fb95c7bdee4471cb3470c643fb1931b9 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 18:34:49 +0530 Subject: [PATCH 122/237] fix lint errors --- ner_constants.py | 2 -- ner_v2/detectors/temporal/date/date_detection.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ner_constants.py b/ner_constants.py index bc59a4413..94263b302 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -1,4 +1,3 @@ - # ************************ constant used for detection_method ************************ # when entity is detected from message @@ -59,4 +58,3 @@ # Locale for Date and Phone Number detection PARAMETER_LOCALE = 'locale' - diff --git a/ner_v2/detectors/temporal/date/date_detection.py b/ner_v2/detectors/temporal/date/date_detection.py index 599744f8d..d8c92f105 100644 --- a/ner_v2/detectors/temporal/date/date_detection.py +++ b/ner_v2/detectors/temporal/date/date_detection.py @@ -58,7 +58,7 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name='date',locale=None, language=ENGLISH_LANG, timezone='UTC', + def __init__(self, entity_name='date', locale=None, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False): """ Initializes the DateDetector object with given entity_name and pytz timezone object From bcede03410d9614f55b00a637bdeafb8e24cd139 Mon 
Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 26 Sep 2019 18:36:32 +0530 Subject: [PATCH 123/237] fix lint errors --- ner_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_constants.py b/ner_constants.py index 94263b302..7df14f23d 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -57,4 +57,4 @@ PARAMETER_NUMBER_UNIT_TYPE = 'unit_type' # Locale for Date and Phone Number detection -PARAMETER_LOCALE = 'locale' +PARAMETER_LOCALE = 'locale' \ No newline at end of file From 7ef3ef68db8be2827f304a12485b6c987a6d3a6d Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 27 Sep 2019 11:42:18 +0530 Subject: [PATCH 124/237] added instructions to add new country preferences --- ner_v2/detectors/temporal/date/en/date_detection.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 1269bcddc..a1a0889c7 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -122,6 +122,15 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self._day_in_next_week, self._day_range_for_nth_week_month ] + """ + Rules to add new country code preferences: + 1. Create a new key with country code. + 2. Add all the methods which should be given higher preference in a list with the + most preferred method first. + 3. Warning: Be careful about the order in which you set your preferences. + For EX: If you set `self._gregorian_day_month_format` at a higher preference than + `self._gregorian_advanced_day_month_year_format`, only `22nd MAR` will be detected in `22nd MAR 2034`. 
+ """ self.country_date_detector_preferences = { 'US': [self._gregorian_month_day_year_format], 'IN': [self._gregorian_day_month_year_format], From 90a1a23a13985a3134507e5223ca677989b8a7a2 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 27 Sep 2019 17:25:08 +0530 Subject: [PATCH 125/237] added tz in output --- .../temporal/time/en/time_detection.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index cae506e69..429d4587d 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -79,6 +79,7 @@ def __init__(self, entity_name, timezone='UTC'): self.bot_message = None self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(self.timezone) + self.timezone_choices = 'ist|utc|akst|akdt|pst|pdt|cst|est|hst|mst|mdt|cdt|edt' def set_bot_message(self, bot_message): """ @@ -224,10 +225,13 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall( - r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)[\s-]*?to[\s-]' - r'*?(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', - self.processed_text.lower()) + regex_patterns = re.compile( + r'(({timezone})?\s(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' + r'[\s-]*?({timezone})?\s*to[\s-]*?(ist|utc)?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] original2 = pattern[0] @@ -237,26 +241,32 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): time_type = 
'return' else: time_type = None - t1 = pattern[1] - t2 = pattern[2] - ap1 = pattern[3] + t1 = pattern[2] + t2 = pattern[3] + ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or self.timezone).upper(), 'range': 'start', 'time_type': time_type } time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] - t3 = pattern[4] - t4 = pattern[5] - ap2 = pattern[6] + t3 = pattern[7] + t4 = pattern[8] + ap2 = pattern[9] + tz3 = pattern[6] + tz4 = pattern[10] time2 = { 'hh': int(t3), 'mm': int(t4), 'nn': str(ap2).lower().strip('.'), + 'tz': (tz3 or tz4 or self.timezone).upper(), 'range': 'end', 'time_type': time_type } From 081130c21d7289e58d2f2bbe07c7c3abf6dbbee3 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 29 Sep 2019 23:39:58 +0530 Subject: [PATCH 126/237] test --- ner_v2/detectors/temporal/time/en/time_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 429d4587d..186d69fb0 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -250,7 +250,8 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or self.timezone).upper(), + # 'tz': (tz1 or tz2 or self.timezone).upper(), + 'tz': (tz1 or tz2).upper(), 'range': 'start', 'time_type': time_type } From 6b0daed432655b30d418df1403e392f597a4868a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 29 Sep 2019 23:43:11 +0530 Subject: [PATCH 127/237] test --- ner_v2/detectors/temporal/time/en/time_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py 
b/ner_v2/detectors/temporal/time/en/time_detection.py index 186d69fb0..eac52f03f 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -251,7 +251,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), # 'tz': (tz1 or tz2 or self.timezone).upper(), - 'tz': (tz1 or tz2).upper(), + 'tz': (tz1 or tz2 or 'nope').upper(), 'range': 'start', 'time_type': time_type } @@ -267,7 +267,8 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t3), 'mm': int(t4), 'nn': str(ap2).lower().strip('.'), - 'tz': (tz3 or tz4 or self.timezone).upper(), + # 'tz': (tz3 or tz4 or self.timezone).upper(), + 'tz': (tz3 or tz4 or 'nope').upper(), 'range': 'end', 'time_type': time_type } From fa4d3fafc1ee95321e03df5470c1d4e5b369fa4c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 12:12:14 +0530 Subject: [PATCH 128/237] add tests --- .../temporal/date/en/test_date_detection.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index d9740b594..7819ba279 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -20,6 +20,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): Date detection for pattern '2nd jan to 5th' """ message = '2nd jan to 5th' + locale = 'en-in' # If we run day1 = 2 day2 = 5 @@ -30,7 +31,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): year1 += 1 year2 += 1 - date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en') + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) date_dicts, original_texts = date_detector_object.detect_entity(message) 
self.assertIn({ @@ -58,6 +59,7 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday Date detection for pattern 'first week of jan' """ message = 'first week of jan' + locale = 'en-in' day1 = 1 day2 = 7 month = 1 @@ -71,7 +73,7 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday mocked_get_weekdays_for_month.return_value = [day1, day2] - date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en') + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) date_dicts, original_texts = date_detector_object.detect_entity(message) # TODO: functionality is incorrect, start_range should be True in 1st and end_range should be True in second @@ -93,3 +95,27 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday 'value': {'dd': day2, 'mm': month, 'type': 'date', 'yy': year} }, date_dicts) self.assertEqual(original_texts.count(message), 2) + + def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): + """ + Date detection for pattern '2nd jan to 5th' + """ + message = 'The date is 2/3/19' + locale = 'en-us' + # If we run + day1 = 3 + month = 2 + year1 = 2019 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message), 1) From 6e2c032728776d3b5b8a5309d430514180927be5 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 13:14:04 +0530 Subject: [PATCH 129/237] bug fix test case --- ner_v2/tests/temporal/date/en/test_date_detection.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 7819ba279..2e861fd7a 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -98,9 +98,9 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): """ - Date detection for pattern '2nd jan to 5th' + Date detection for pattern '2/3/19' """ - message = 'The date is 2/3/19' + message = '2/3/19' locale = 'en-us' # If we run day1 = 3 @@ -115,7 +115,8 @@ def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): 'start_range': False, 'end_range': False, 'from': False, - 'to': False, 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message), 1) \ No newline at end of file From 8d6b7b8761db615206388723408d69e92ed13218 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 16:01:14 +0530 Subject: [PATCH 130/237] some regex changes to support Time zone --- .../temporal/time/en/time_detection.py | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index eac52f03f..feab1758d 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -226,10 +226,10 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'(({timezone})?\s(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' - r'[\s-]*?({timezone})?\s*to[\s-]*?(ist|utc)?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - 
r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)' - .format(timezone=self.timezone_choices) + r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' + r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -300,10 +300,12 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list time_list = [] if original_list is None: original_list = [] - patterns = re.findall( - r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?to[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(am|pm|a\.m|p\.m))', - self.processed_text.lower()) + regex_patterns = re.compile( + r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' + r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] original2 = pattern[0] @@ -313,24 +315,30 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list time_type = 'return' else: time_type = None - t1 = pattern[1] - ap1 = pattern[2] + t1 = pattern[2] + ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or 'none').upper(), 'range': 'start', 'time_type': time_type } time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] - t2 = pattern[3] - ap2 = pattern[4] + t2 = pattern[6] + ap2 = pattern[7] + tz3 = pattern[5] + tz4 = pattern[8] time2 = { 'hh': int(t2), 'mm': 0, 'nn': str(ap2).lower().strip('.'), + 'tz': (tz3 or tz4 or 'none').upper(), 
'range': 'end', 'time_type': time_type } @@ -362,9 +370,13 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((after|aftr)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' - r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', - self.processed_text.lower()) + + regex_patterns = re.compile( + r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' + r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] if self.departure_flag: @@ -376,10 +388,13 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) t1 = pattern[2] t2 = pattern[3] ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or 'none').upper(), 'range': 'start', 'time_type': time_type } @@ -409,8 +424,9 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((before|bfre)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', + patterns = re.findall(r'\W((?:before|bfre)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] @@ -423,10 +439,13 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): t1 = pattern[2] t2 = pattern[3] ap1 = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or 'none').upper(), 'range': 'end', 
'time_type': time_type } From a57bee34821df59fab4751f0110508fe8c395f2c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 16:37:50 +0530 Subject: [PATCH 131/237] bug fix detectors order --- ner_v2/detectors/temporal/date/en/date_detection.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index a1a0889c7..4e3e8162c 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -245,16 +245,14 @@ def get_possible_date(self, date_list=None, original_list=None): self._update_processed_text(original_list) date_list, original_list = self._date_identification_given_day(date_list, original_list) self._update_processed_text(original_list) - # FIXME: This call order causes everyday to be taken away from "everyday except <>" which means - # FIXME: successive calls for everyday_except_weekends and everyday_except_weekdays return wrong results - date_list, original_list = self._date_identification_everyday(date_list, original_list, n_days=15) - self._update_processed_text(original_list) date_list, original_list = self._date_identification_everyday_except_weekends(date_list, original_list, n_days=15) self._update_processed_text(original_list) date_list, original_list = self._date_identification_everyday_except_weekdays(date_list, original_list, n_days=50) self._update_processed_text(original_list) + date_list, original_list = self._date_identification_everyday(date_list, original_list, n_days=15) + self._update_processed_text(original_list) date_list, original_list = self._day_within_one_week(date_list, original_list) self._update_processed_text(original_list) date_list, original_list = self._weeks_identification(date_list, original_list) From 050f1919cd9b105cc6fb7f4269d3e158d8b2ca74 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 18:15:47 +0530 Subject: 
[PATCH 132/237] fix _date_identification_everyday_except_weekdays --- .../detectors/temporal/date/en/date_detection.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 4e3e8162c..c5bffee49 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1396,10 +1396,13 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original patterns = regex_pattern.findall(self.processed_text.lower()) if not patterns: - weekday_regex_pattern = re.compile(r'\b((week\s?days?|all\sweekdays))\b') + weekday_regex_pattern = re.compile(r'\b(week\s?days?|all\sweekdays)\b') patterns = weekday_regex_pattern.findall(self.processed_text.lower()) constant_type = WEEKDAYS - if self._is_everyday_present(self.text): + every_weekday_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)/s+' + r'(week\s?days?|all\sweekdays)\b', re.IGNORECASE) + is_everyday_result = every_weekday_pattern.findall(self.text) + if is_everyday_result: constant_type = REPEAT_WEEKDAYS today = now.weekday() count = 0 @@ -1473,12 +1476,14 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekdays?)\b') patterns = regex_pattern.findall(self.processed_text.lower()) if not patterns: - weekend_regex_pattern = re.compile(r'\b((week\s?ends?|all\sweekends))\b') + weekend_regex_pattern = re.compile(r'\b(week\s?ends?|all\sweekends)\b') patterns = weekend_regex_pattern.findall(self.processed_text.lower()) constant_type = WEEKENDS - if self._is_everyday_present(self.text): + every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' + r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) + is_everyday_result = 
every_weekend_pattern.findall(self.text) + if is_everyday_result: constant_type = REPEAT_WEEKENDS - today = now.weekday() count = 0 weekend = [] From 45d1f7c53c338430952d82441f1272373598e1d4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 18:52:51 +0530 Subject: [PATCH 133/237] fix everyday bug --- .../temporal/date/en/date_detection.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index c5bffee49..bb5eaeeae 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1394,16 +1394,17 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original end = now + datetime.timedelta(days=n_days) regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekends?)\b') patterns = regex_pattern.findall(self.processed_text.lower()) - + is_everyday_result = [] if not patterns: - weekday_regex_pattern = re.compile(r'\b(week\s?days?|all\sweekdays)\b') + weekday_regex_pattern = re.compile(r'\b((week\s?days?|all\sweekdays))\b') patterns = weekday_regex_pattern.findall(self.processed_text.lower()) + every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)/s+' + r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) + is_everyday_result = every_weekday_pattern.findall(self.processed_text.lower()) constant_type = WEEKDAYS - every_weekday_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)/s+' - r'(week\s?days?|all\sweekdays)\b', re.IGNORECASE) - is_everyday_result = every_weekday_pattern.findall(self.text) if is_everyday_result: constant_type = REPEAT_WEEKDAYS + patterns = is_everyday_result today = now.weekday() count = 0 weekend = [] @@ -1475,15 +1476,18 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original end = now + 
datetime.timedelta(days=n_days) regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekdays?)\b') patterns = regex_pattern.findall(self.processed_text.lower()) + is_everyday_result = [] if not patterns: - weekend_regex_pattern = re.compile(r'\b(week\s?ends?|all\sweekends)\b') + weekend_regex_pattern = re.compile(r'\b((week\s?ends?|all\sweekends))\b') patterns = weekend_regex_pattern.findall(self.processed_text.lower()) + every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' + r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) + is_everyday_result = every_weekend_pattern.findall(self.processed_text.lower()) + constant_type = WEEKENDS - every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' - r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) - is_everyday_result = every_weekend_pattern.findall(self.text) if is_everyday_result: constant_type = REPEAT_WEEKENDS + patterns = is_everyday_result today = now.weekday() count = 0 weekend = [] From 7b378d4f80ceb29ef481b8f6d2b03f936e6bcae0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 19:08:55 +0530 Subject: [PATCH 134/237] prints for check --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index bb5eaeeae..aaa6fa4dc 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1401,6 +1401,7 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)/s+' r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) is_everyday_result = every_weekday_pattern.findall(self.processed_text.lower()) + print('printing is_everyday_result in except weekends', is_everyday_result) 
constant_type = WEEKDAYS if is_everyday_result: constant_type = REPEAT_WEEKDAYS @@ -1483,6 +1484,7 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) is_everyday_result = every_weekend_pattern.findall(self.processed_text.lower()) + print('printing is_everyday_result in except weekends', is_everyday_result) constant_type = WEEKENDS if is_everyday_result: From 0db88d5f8b86c4aed9e986e2c201bb1ef6f1c479 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 19:14:31 +0530 Subject: [PATCH 135/237] prints for check --- ner_v2/detectors/temporal/date/en/date_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index aaa6fa4dc..697f7df8c 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1400,7 +1400,7 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original patterns = weekday_regex_pattern.findall(self.processed_text.lower()) every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)/s+' r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) - is_everyday_result = every_weekday_pattern.findall(self.processed_text.lower()) + is_everyday_result = every_weekday_pattern.findall(self.text) print('printing is_everyday_result in except weekends', is_everyday_result) constant_type = WEEKDAYS if is_everyday_result: @@ -1483,8 +1483,8 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original patterns = weekend_regex_pattern.findall(self.processed_text.lower()) every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) - 
is_everyday_result = every_weekend_pattern.findall(self.processed_text.lower()) - print('printing is_everyday_result in except weekends', is_everyday_result) + is_everyday_result = every_weekend_pattern.findall(self.text) + print('printing is_everyday_result in except weekdays', is_everyday_result) constant_type = WEEKENDS if is_everyday_result: From 984aea313807c8d3ef2555169bb5e6e14eefb468 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 19:19:56 +0530 Subject: [PATCH 136/237] prints for check --- ner_v2/detectors/temporal/date/en/date_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 697f7df8c..99f2217a9 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1481,8 +1481,8 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original if not patterns: weekend_regex_pattern = re.compile(r'\b((week\s?ends?|all\sweekends))\b') patterns = weekend_regex_pattern.findall(self.processed_text.lower()) - every_weekend_pattern = re.compile(r'\b(every|daily|recur|always|continue|every\s*day|all)' - r'\s+((week\s?ends?|all\sweekends))\b', re.IGNORECASE) + every_weekend_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)' + r'\s+(week\s?ends?|all\sweekends))\b', re.IGNORECASE) is_everyday_result = every_weekend_pattern.findall(self.text) print('printing is_everyday_result in except weekdays', is_everyday_result) From 6912cbae363e23176a506cb0b50ce6c5f918683c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 19:23:53 +0530 Subject: [PATCH 137/237] prints for check --- ner_v2/detectors/temporal/date/en/date_detection.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py 
index 99f2217a9..7898c6ffa 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1398,10 +1398,9 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original if not patterns: weekday_regex_pattern = re.compile(r'\b((week\s?days?|all\sweekdays))\b') patterns = weekday_regex_pattern.findall(self.processed_text.lower()) - every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)/s+' + every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)\s+' r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) is_everyday_result = every_weekday_pattern.findall(self.text) - print('printing is_everyday_result in except weekends', is_everyday_result) constant_type = WEEKDAYS if is_everyday_result: constant_type = REPEAT_WEEKDAYS From 34e4fc8500bea789831fc00f8582fb016d07aad6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 19:34:04 +0530 Subject: [PATCH 138/237] prints for check --- ner_v2/detectors/temporal/date/en/date_detection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 7898c6ffa..02fee6599 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1401,6 +1401,7 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)\s+' r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) is_everyday_result = every_weekday_pattern.findall(self.text) + print('printing is_everyday_result in except weekends', is_everyday_result) constant_type = WEEKDAYS if is_everyday_result: constant_type = REPEAT_WEEKDAYS From f271ea86878e4b01d3630bd296bc863928c0ca52 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 30 Sep 2019 
23:57:24 +0530 Subject: [PATCH 139/237] bug fix in _date_identification_everyday_except_weekends --- ner_v2/detectors/temporal/date/en/date_detection.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 02fee6599..bfa499268 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -1401,11 +1401,15 @@ def _date_identification_everyday_except_weekends(self, date_list=None, original every_weekday_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)\s+' r'(week\s?days?|all\sweekdays))\b', re.IGNORECASE) is_everyday_result = every_weekday_pattern.findall(self.text) - print('printing is_everyday_result in except weekends', is_everyday_result) constant_type = WEEKDAYS if is_everyday_result: constant_type = REPEAT_WEEKDAYS patterns = is_everyday_result + # checks if phrase of the form everyday except weekdays is present in the sentence. 
+ regex_pattern = re.compile(r'\b((every\s?day|daily|all\s?days)\s+except\s+weekdays?)\b') + check_patterns_for_except_weekdays = regex_pattern.findall(self.processed_text.lower()) + if check_patterns_for_except_weekdays: + patterns = [] today = now.weekday() count = 0 weekend = [] @@ -1484,7 +1488,6 @@ def _date_identification_everyday_except_weekdays(self, date_list=None, original every_weekend_pattern = re.compile(r'\b((every|daily|recur|always|continue|every\s*day|all)' r'\s+(week\s?ends?|all\sweekends))\b', re.IGNORECASE) is_everyday_result = every_weekend_pattern.findall(self.text) - print('printing is_everyday_result in except weekdays', is_everyday_result) constant_type = WEEKENDS if is_everyday_result: From bf27803ba46f6b5ff50f0bfc499cd79fa3ca0abf Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 13:42:24 +0530 Subject: [PATCH 140/237] regex changes --- ner_v2/detectors/temporal/time/en/time_detection.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index feab1758d..61752a1d4 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -474,7 +474,8 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((after|aftr)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', + patterns = re.findall(r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(am|pm|a\.m|p\.m)\s*({timezone})?)\W'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] @@ -486,10 +487,13 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina time_type = None t1 = pattern[2] ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] time1 = { 'hh': int(t1), 
'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or 'none').upper(), 'range': 'start', 'time_type': time_type } @@ -519,7 +523,8 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((before|bfore)[\s-]*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', + patterns = re.findall(r'\W((?:before|bfore)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' + r'[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] @@ -531,10 +536,14 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_type = None t1 = pattern[2] ap1 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), + 'tz': (tz1 or tz2 or 'none').upper(), 'range': 'end', 'time_type': time_type } From 955b051f428b0b81ba74dfaa1341ce5509c45878 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 13:59:53 +0530 Subject: [PATCH 141/237] detector name changes --- .../temporal/date/en/date_detection.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index bfa499268..9f1d70741 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -99,29 +99,29 @@ def __init__(self, entity_name, locale=None, timezone='UTC', past_date_reference self.bot_message = None self.locale = locale self.country_code = None - self.detector_preferences = [self._gregorian_day_month_year_format, - self._gregorian_month_day_year_format, - self._gregorian_year_month_day_format, - self._gregorian_advanced_day_month_year_format, - self._day_month_format_for_arrival_departure, - self._date_range_ddth_of_mmm_to_ddth, 
- self._date_range_ddth_to_ddth_of_next_month, - self._gregorian_day_with_ordinals_month_year_format, - self._gregorian_advanced_year_month_day_format, - self._gregorian_year_day_month_format, - self._gregorian_month_day_with_ordinals_year_format, - self._gregorian_day_month_format, - self._gregorian_month_day_format, - self._day_after_tomorrow, - self._date_days_after, - self._date_days_later, - self._day_before_yesterday, - self._todays_date, - self._tomorrows_date, - self._yesterdays_date, - self._day_in_next_week, - self._day_range_for_nth_week_month - ] + self.default_detector_preferences = [self._gregorian_day_month_year_format, + self._gregorian_month_day_year_format, + self._gregorian_year_month_day_format, + self._gregorian_advanced_day_month_year_format, + self._day_month_format_for_arrival_departure, + self._date_range_ddth_of_mmm_to_ddth, + self._date_range_ddth_to_ddth_of_next_month, + self._gregorian_day_with_ordinals_month_year_format, + self._gregorian_advanced_year_month_day_format, + self._gregorian_year_day_month_format, + self._gregorian_month_day_with_ordinals_year_format, + self._gregorian_day_month_format, + self._gregorian_month_day_format, + self._day_after_tomorrow, + self._date_days_after, + self._date_days_later, + self._day_before_yesterday, + self._todays_date, + self._tomorrows_date, + self._yesterdays_date, + self._day_in_next_week, + self._day_range_for_nth_week_month + ] """ Rules to add new country code preferences: 1. Create a new key with country code. 
@@ -145,9 +145,9 @@ def get_country_code_from_locale(self): regex_pattern = re.compile('[-_](.*$)', re.U) match = regex_pattern.findall(self.locale) if match: - self.country_code = match[0].upper() + return match[0].upper() else: - self.country_code = None + return None def detect_date(self, text): """ @@ -165,7 +165,7 @@ def detect_date(self, text): self.processed_text = self.text self.tagged_text = self.text if self.locale: - self.get_country_code_from_locale() + self.country_code = self.get_country_code_from_locale() date_list = [] original_list = [] date_list, original_list = self.get_exact_date(date_list, original_list) @@ -205,12 +205,12 @@ def get_exact_date(self, date_list, original_list): for preferred_detector in self.country_date_detector_preferences[self.country_code]: date_list, original_list = preferred_detector(date_list, original_list) self._update_processed_text(original_list) - for detector in self.detector_preferences: + for detector in self.default_detector_preferences: if detector not in self.country_date_detector_preferences[self.country_code]: date_list, original_list = detector(date_list, original_list) self._update_processed_text(original_list) else: - for detector in self.detector_preferences: + for detector in self.default_detector_preferences: date_list, original_list = detector(date_list, original_list) self._update_processed_text(original_list) From 0c12f059a43a3448f2d238a07a4a1a5989a91353 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 15:17:39 +0530 Subject: [PATCH 142/237] bug fix in _gregorian_day_month_year_format --- ner_v2/detectors/temporal/date/en/date_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 9f1d70741..2d1cd682a 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -304,7 +304,7 @@ def 
_gregorian_day_month_year_format(self, date_list=None, original_list=None): original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' + regex_pattern = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)(?:\s|$)') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: @@ -356,8 +356,8 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None): original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b((1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]' - r'\s?((?:20|19)?[0-9]{2}))(\s|$)') + regex_pattern = re.compile(r'[^/\-\.\w]((1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9])' + r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)(?:\s|$)') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] From eb01ad22a517f24a0136eed59a12472ddadd4485 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 15:34:16 +0530 Subject: [PATCH 143/237] handled cases like dates not possible like 31st oct --- .../temporal/date/en/date_detection.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 2d1cd682a..b59b63417 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -312,8 +312,12 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): dd = int(pattern[1]) mm = int(pattern[2]) yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) < self.now_date: - yy += 1 + try: + # to catch dates which are not possible like 
"31/11" (october 31st) + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) < self.now_date: + yy += 1 + except: + return date_list, original_list date = { 'dd': int(dd), @@ -363,7 +367,14 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None): original = pattern[0] dd = pattern[2] mm = pattern[1] - yy = self.normalize_year(pattern[3]) + yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year + try: + # to catch dates which are not possible like "11/31" (october 31st) + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + < self.now_date: + yy += 1 + except: + return date_list, original_list date = { 'dd': int(dd), From 10a65a4ba852ed295818efd8bbc30788af5bef4a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 15:36:39 +0530 Subject: [PATCH 144/237] fix lint --- ner_v2/detectors/temporal/date/en/date_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index b59b63417..d019b377f 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -314,7 +314,8 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): yy = int(self.normalize_year(pattern[3])) if pattern[3] else self.now_date.year try: # to catch dates which are not possible like "31/11" (october 31st) - if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) < self.now_date: + if not pattern[3] and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd))\ + < self.now_date: yy += 1 except: return date_list, original_list From d1edac9a21a85c10f591b5a02b5d7e594543cdd0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:14:56 +0530 Subject: [PATCH 145/237] bug fixes in regex --- 
.../temporal/date/en/date_detection.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index d019b377f..1e6c30dbb 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -305,7 +305,7 @@ def _gregorian_day_month_year_format(self, date_list=None, original_list=None): if date_list is None: date_list = [] regex_pattern = re.compile(r'[^/\-\.\w](([12][0-9]|3[01]|0?[1-9])\s?[/\-\.]\s?(1[0-2]|0?[1-9])' - r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)(?:\s|$)') + r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -362,7 +362,7 @@ def _gregorian_month_day_year_format(self, date_list=None, original_list=None): if date_list is None: date_list = [] regex_pattern = re.compile(r'[^/\-\.\w]((1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9])' - r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)(?:\s|$)') + r'(?:\s?[/\-\.]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -419,7 +419,7 @@ def _gregorian_year_month_day_format(self, date_list=None, original_list=None): if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(((?:20|19)[0-9]{2})\s?[/\-\.]\s?' - r'(1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9]))(\s|$)') + r'(1[0-2]|0?[1-9])\s?[/\-\.]\s?([12][0-9]|3[01]|0?[1-9]))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -469,7 +469,7 @@ def _gregorian_advanced_day_month_year_format(self, date_list=None, original_lis if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?[\/\ \-\.\,]\s?([A-Za-z]+)\s?[\/\ \-\.\,]\s?' 
- r'((?:20|19)?[0-9]{2}))(\s|$)') + r'((?:20|19)?[0-9]{2}))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -523,7 +523,7 @@ def _gregorian_day_with_ordinals_month_year_format(self, date_list=None, origina if original_list is None: original_list = [] regex_pattern = re.compile(r'\b(([12][0-9]|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?\s?(?:of)?[\s\,\-]\s?' - r'([A-Za-z]+)[\s\,\-]\s?((?:20|19)?[0-9]{2}))(\s|$)') + r'([A-Za-z]+)[\s\,\-]\s?((?:20|19)?[0-9]{2}))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -574,7 +574,7 @@ def _gregorian_advanced_year_month_day_format(self, date_list=None, original_lis if date_list is None: date_list = [] regex_pattern = re.compile(r'\b(((?:20|19)[0-9]{2})\s?[\/\ \,\-]\s?([A-Za-z]+)\s?' - r'[\/\ \,\-]\s?([12][0-9]|3[01]|0?[1-9]))(\s|$)') + r'[\/\ \,\-]\s?([12][0-9]|3[01]|0?[1-9]))\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -676,14 +676,18 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(([A-Za-z]+)[\ \,\-]\s?([12][0-9]|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' - r'[\ \,\-]\s?((?:20|19)?[0-9]{2}))(\s|$)') + regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' + r'|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' 
+ r'(?:[\ \,\-]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) for pattern in patterns: original = pattern[0] - dd = pattern[2] - probable_mm = pattern[1] - yy = self.normalize_year(pattern[3]) + yy1 = pattern[1] + yy2 = pattern[4] + dd = pattern[3] + yy = yy1 or yy2 or '' + probable_mm = pattern[2] + yy = self.normalize_year(yy) mm = self.__get_month_index(probable_mm) if mm: From 108dcf20f7e888a40f509f0a364f2fd4a9f08d06 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:20:21 +0530 Subject: [PATCH 146/237] fix regex --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 1e6c30dbb..58db7969c 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -676,7 +676,7 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina original_list = [] if date_list is None: date_list = [] - regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' + regex_pattern = re.compile(r'\b(((?:20|19)?[0-9]{2})?\s?([A-Za-z]+)[\ \,\-]\s?([12][0-9]' r'|3[01]|0?[1-9])\s?(?:nd|st|rd|th)?' 
r'(?:[\ \,\-]\s?((?:20|19)?[0-9]{2}))?)\W') patterns = regex_pattern.findall(self.processed_text.lower()) From a5b2bf123dc007ae64d853ceb39c6a05fb279a2e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:32:29 +0530 Subject: [PATCH 147/237] bug fix in _gregorian_month_day_with_ordinals_year_format --- ner_v2/detectors/temporal/date/en/date_detection.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 58db7969c..f4d563376 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -685,12 +685,18 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina yy1 = pattern[1] yy2 = pattern[4] dd = pattern[3] - yy = yy1 or yy2 or '' + yy = int(self.normalize_year(yy1 or yy2)) or self.now_date.year probable_mm = pattern[2] - yy = self.normalize_year(yy) mm = self.__get_month_index(probable_mm) if mm: + try: + # to catch dates which are not possible like "31/11" (october 31st) + if not yy1 and not yy2 and self.timezone.localize(datetime.datetime(year=yy, month=mm, day=dd)) \ + < self.now_date: + yy += 1 + except: + return date_list, original_list date = { 'dd': int(dd), 'mm': int(mm), From 40d42fac8af14dab7b622ca5754cf1d510b2a33f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:36:14 +0530 Subject: [PATCH 148/237] bug fix in _gregorian_month_day_with_ordinals_year_format --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index f4d563376..e58c13614 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -685,7 +685,7 @@ def 
_gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina yy1 = pattern[1] yy2 = pattern[4] dd = pattern[3] - yy = int(self.normalize_year(yy1 or yy2)) or self.now_date.year + yy = int(self.normalize_year(yy1 or yy2 or self.now_date.year)) probable_mm = pattern[2] mm = self.__get_month_index(probable_mm) From 804a23225b18bdab6440337560680c79019c6b16 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:40:17 +0530 Subject: [PATCH 149/237] bug fix in _gregorian_month_day_with_ordinals_year_format --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index e58c13614..d06b1a57e 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -685,7 +685,7 @@ def _gregorian_month_day_with_ordinals_year_format(self, date_list=None, origina yy1 = pattern[1] yy2 = pattern[4] dd = pattern[3] - yy = int(self.normalize_year(yy1 or yy2 or self.now_date.year)) + yy = int(self.normalize_year(yy1 or yy2 or str(self.now_date.year))) probable_mm = pattern[2] mm = self.__get_month_index(probable_mm) From edadc221c704f8934d4b8643621059ad72de65ea Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 16:56:44 +0530 Subject: [PATCH 150/237] add self.country_code None check --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index d06b1a57e..48ec231aa 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -201,7 +201,7 @@ def get_exact_date(self, date_list, original_list): corresponding substrings in the given text. 
""" - if self.country_code in self.country_date_detector_preferences: + if not self.country_code and self.country_code in self.country_date_detector_preferences: for preferred_detector in self.country_date_detector_preferences[self.country_code]: date_list, original_list = preferred_detector(date_list, original_list) self._update_processed_text(original_list) From bcc04e44c5e330e313a341d3d166cd51d52c11df Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 1 Oct 2019 17:17:27 +0530 Subject: [PATCH 151/237] add self.country_code None check --- ner_v2/detectors/temporal/date/en/date_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/date/en/date_detection.py b/ner_v2/detectors/temporal/date/en/date_detection.py index 48ec231aa..5c1423fac 100644 --- a/ner_v2/detectors/temporal/date/en/date_detection.py +++ b/ner_v2/detectors/temporal/date/en/date_detection.py @@ -201,7 +201,7 @@ def get_exact_date(self, date_list, original_list): corresponding substrings in the given text. 
""" - if not self.country_code and self.country_code in self.country_date_detector_preferences: + if self.country_code and self.country_code in self.country_date_detector_preferences: for preferred_detector in self.country_date_detector_preferences[self.country_code]: date_list, original_list = preferred_detector(date_list, original_list) self._update_processed_text(original_list) From 740ef1e3c7c3d9d9f90d636137179e87ea143e1c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 3 Oct 2019 13:45:01 +0530 Subject: [PATCH 152/237] format regex --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 61752a1d4..66de3fe68 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -229,7 +229,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -303,7 +303,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: From a2829e236d3880d198c450721b8842cccab1c17d Mon Sep 17 
00:00:00 2001 From: ruthvik-17 Date: Thu, 3 Oct 2019 16:38:59 +0530 Subject: [PATCH 153/237] added timezone data csv files in en --- .../temporal/time/en/data/timezone_variations.csv | 10 ++++++++++ ner_v2/detectors/temporal/time/en/data/timezones.csv | 10 ++++++++++ 2 files changed, 20 insertions(+) create mode 100644 ner_v2/detectors/temporal/time/en/data/timezone_variations.csv create mode 100644 ner_v2/detectors/temporal/time/en/data/timezones.csv diff --git a/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv b/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv new file mode 100644 index 000000000..97400365e --- /dev/null +++ b/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv @@ -0,0 +1,10 @@ +timezone_name,timezone_variants +IST,IST|Indian Time|Indian Standard Time +EST,EST|Eastern Standard Time|Eastern Time|ET|EDT +CST,CST|Central Standard Time|Central Time|CT|CDT +MST,MST|Mountain Standard Time|Mountain Time|MT|MDT +PST,PST|Pacific Standard Time|Pacific Time|PT|PDT +AKST,AKST|Alaska Standard Time|Alaska Time|AKDT +HST,HST|Hawaii Standard Time|HDT +HAST,HAST|Hawaii-Aleutian Standard Time|Hawaii Aleutian Standard Time|Hawaii Time|HADT +UTC,UTC|GMT|Greenwich Mean Time|Greenwich Time|Coordinated Universal Time \ No newline at end of file diff --git a/ner_v2/detectors/temporal/time/en/data/timezones.csv b/ner_v2/detectors/temporal/time/en/data/timezones.csv new file mode 100644 index 000000000..c70ae1a72 --- /dev/null +++ b/ner_v2/detectors/temporal/time/en/data/timezones.csv @@ -0,0 +1,10 @@ +code,preferred,dict +IST,Asia/Kolkata,Asia/Kolkata +EST,America/New_York,America/New_York|America/Detroit|America/Kentucky/Louisville|America/Kentucky/Monticello|America/Indiana/Indianapolis|America/Indiana/Vincennes|America/Indiana/Winamac|America/Indiana/Marengo|America/Indiana/Petersburg|America/Indiana/Vevay 
+CST,America/Chicago,America/Chicago|America/Indiana/Tell_City|America/Indiana/Knox|America/Menominee|America/North_Dakota/Center|America/North_Dakota/New_Salem|America/North_Dakota/Beulah +MST,America/Denver,America/Denver|America/Boise|America/Phoenix +PST,America/Los_Angeles,America/Los_Angeles +AKST,America/Anchorage,America/Anchorage|America/Juneau|America/Sitka|America/Yakutat|America/Nome|America/Metlakatla +HST,America/Adak,America/Adak|Pacific/Honolulu +HAST,Pacific/Honolulu,Pacific/Honolulu +UTC,UTC,UTC \ No newline at end of file From 21b9cc8987cf3957914a0ef879e755ccddffc24e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 00:43:09 +0530 Subject: [PATCH 154/237] added timezone data csv files in en --- ner_v2/detectors/temporal/constant.py | 11 +++ .../time/en/data/timezone_variations.csv | 2 +- .../temporal/time/en/data/timezones.csv | 2 +- .../temporal/time/en/time_detection.py | 87 ++++++++++++++++--- ner_v2/detectors/temporal/utils.py | 12 +++ 5 files changed, 99 insertions(+), 15 deletions(-) diff --git a/ner_v2/detectors/temporal/constant.py b/ner_v2/detectors/temporal/constant.py index eefa012cc..ad6843043 100644 --- a/ner_v2/detectors/temporal/constant.py +++ b/ner_v2/detectors/temporal/constant.py @@ -3,6 +3,17 @@ DATETIME_CONSTANT_FILE = 'datetime_diff_constant.csv' NUMERALS_CONSTANT_FILE = 'numbers_constant.csv' +# timezone variants data file and its columns +TIMEZONE_VARIANTS_CONSTANT_FILE = 'timezone_variations.csv' +TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME = 'timezone_variants' +TIMEZONE_VARIANTS_VALUE_COLUMN_NAME = 'timezone_value' + +# timezones data file and its columns +TIMEZONES_CONSTANT_FILE = 'timezones.csv' +TIMEZONES_CODE_COLUMN_NAME = 'code' +TIMEZONES_ALL_REGIONS_COLUMN_NAME = 'all_regions' +TIMEZONES_PREFERRED_REGION_COLUMN_NAME = 'preferred' + CONSTANT_FILE_KEY = 'key' # date type referring to date in month like "2 tarikh" (reference: hindi) diff --git a/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv 
b/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv index 97400365e..4ed2a5e63 100644 --- a/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv +++ b/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv @@ -1,4 +1,4 @@ -timezone_name,timezone_variants +timezone_value,timezone_variants IST,IST|Indian Time|Indian Standard Time EST,EST|Eastern Standard Time|Eastern Time|ET|EDT CST,CST|Central Standard Time|Central Time|CT|CDT diff --git a/ner_v2/detectors/temporal/time/en/data/timezones.csv b/ner_v2/detectors/temporal/time/en/data/timezones.csv index c70ae1a72..962e2210b 100644 --- a/ner_v2/detectors/temporal/time/en/data/timezones.csv +++ b/ner_v2/detectors/temporal/time/en/data/timezones.csv @@ -1,4 +1,4 @@ -code,preferred,dict +code,preferred,all_regions IST,Asia/Kolkata,Asia/Kolkata EST,America/New_York,America/New_York|America/Detroit|America/Kentucky/Louisville|America/Kentucky/Monticello|America/Indiana/Indianapolis|America/Indiana/Vincennes|America/Indiana/Winamac|America/Indiana/Marengo|America/Indiana/Petersburg|America/Indiana/Vevay CST,America/Chicago,America/Chicago|America/Indiana/Tell_City|America/Indiana/Knox|America/Menominee|America/North_Dakota/Center|America/North_Dakota/New_Salem|America/North_Dakota/Beulah diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 66de3fe68..f923c69db 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1,7 +1,14 @@ import re import datetime -from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE -from ner_v2.detectors.temporal.utils import get_timezone +import collections +import pandas as pd +import os +from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE, \ + TIMEZONE_VARIANTS_CONSTANT_FILE, TIMEZONES_CONSTANT_FILE, 
TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ + TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME +from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string + +TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value']) class TimeDetector(object): @@ -79,7 +86,11 @@ def __init__(self, entity_name, timezone='UTC'): self.bot_message = None self.timezone = get_timezone(timezone) self.now_date = datetime.datetime.now(self.timezone) - self.timezone_choices = 'ist|utc|akst|akdt|pst|pdt|cst|est|hst|mst|mdt|cdt|edt' + self.timezones_map = {} + + self.init_regex_and_parser('./data/') + sorted_len_timezone_keys = sorted(self.timezones_map.keys(), key=len, reverse=True) + self.timezone_choices = "|".join([re.escape(x.lower()) for x in sorted_len_timezone_keys]) def set_bot_message(self, bot_message): """ @@ -90,6 +101,34 @@ def set_bot_message(self, bot_message): """ self.bot_message = bot_message + def init_regex_and_parser(self, data_directory_path): + timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONE_VARIANTS_CONSTANT_FILE) + if os.path.exists(timezone_variants_data_path): + timezone_variants_df = pd.read_csv(timezone_variants_data_path, encoding='utf-8') + for index, row in timezone_variants_df.iterrows(): + tz_name_variants = get_list_from_pipe_sep_string(row[TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME]) + value = row[TIMEZONE_VARIANTS_VALUE_COLUMN_NAME] + for tz_name in tz_name_variants: + self.timezones_map[tz_name] = TimezoneVariants(value=value) + + def convert_to_pytz_format(self, timezone_variant): + """ + Converts informal TZ formats like EST, Eastern Time etc to Oslon format(America/New_York) supported by pytz. + :param timezone_variant: (str) Informal TZ variant + :return: Standard Oslon format for pytz. 
+ """ + timezone_code = self.timezones_map[timezone_variant].value + timezone_data_path = os.path.join('./data/', TIMEZONES_CONSTANT_FILE) + if os.path.exists(timezone_data_path): + timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') + timezones_df.set_index(TIMEZONES_CODE_COLUMN_NAME, inplace=True) + if re.search(self.timezone.zone, timezones_df.loc[timezone_code].TIMEZONES_ALL_REGIONS_COLUMN_NAME): + return self.timezone.zone + else: + return timezones_df.loc[timezone_code].TIMEZONES_PREFERRED_REGION_COLUMN_NAME + + return self.timezone.zone + def _detect_time(self, range_enabled=False, form_check=False): """ Detects all time strings in text and returns list of detected time entities and their corresponding original @@ -246,12 +285,14 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): ap1 = pattern[4] tz1 = pattern[1] tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - # 'tz': (tz1 or tz2 or self.timezone).upper(), - 'tz': (tz1 or tz2 or 'nope').upper(), + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -263,12 +304,14 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): ap2 = pattern[9] tz3 = pattern[6] tz4 = pattern[10] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz3 or tz4) time2 = { 'hh': int(t3), 'mm': int(t4), 'nn': str(ap2).lower().strip('.'), - # 'tz': (tz3 or tz4 or self.timezone).upper(), - 'tz': (tz3 or tz4 or 'nope').upper(), + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -319,11 +362,14 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list ap1 = pattern[3] tz1 = pattern[1] tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or 
'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -334,11 +380,14 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list ap2 = pattern[7] tz3 = pattern[5] tz4 = pattern[8] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz1 or tz2) time2 = { 'hh': int(t2), 'mm': 0, 'nn': str(ap2).lower().strip('.'), - 'tz': (tz3 or tz4 or 'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -390,11 +439,14 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) ap1 = pattern[4] tz1 = pattern[1] tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or 'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -441,11 +493,14 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): ap1 = pattern[4] tz1 = pattern[1] tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or 'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -489,11 +544,14 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina ap1 = pattern[3] tz1 = pattern[1] tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or 'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -538,12 +596,15 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ ap1 = pattern[3] tz1 = pattern[1] tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) 
time1 = { 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': (tz1 or tz2 or 'none').upper(), + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } diff --git a/ner_v2/detectors/temporal/utils.py b/ner_v2/detectors/temporal/utils.py index 264d5f8fb..2a82a8d0b 100644 --- a/ner_v2/detectors/temporal/utils.py +++ b/ner_v2/detectors/temporal/utils.py @@ -296,3 +296,15 @@ def get_timezone(timezone, ignore_errors=True): else: raise return timezone + + +def get_list_from_pipe_sep_string(text_string): + """ + Split numerals + Args: + text_string (str): text + Returns: + (list) : list containing numeral after split + """ + text_list = text_string.split("|") + return [x.lower().strip() for x in text_list if x.strip()] \ No newline at end of file From a5b82231a33ebcb4dc1ec2faa3f3af0474923835 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 01:23:57 +0530 Subject: [PATCH 155/237] fix regex --- ner_v2/detectors/temporal/time/en/time_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index f923c69db..eb22e9265 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -268,7 +268,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -346,7 +346,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( 
r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -423,7 +423,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: From ea941c4af822c00eb1fb46b7f724e930009e50ce Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 02:13:32 +0530 Subject: [PATCH 156/237] fix regex --- ner_v2/detectors/temporal/time/en/time_detection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index eb22e9265..04bd7764e 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -104,6 +104,7 @@ def set_bot_message(self, bot_message): def init_regex_and_parser(self, data_directory_path): timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONE_VARIANTS_CONSTANT_FILE) if os.path.exists(timezone_variants_data_path): + print('the path exists') timezone_variants_df = pd.read_csv(timezone_variants_data_path, encoding='utf-8') for index, row in timezone_variants_df.iterrows(): tz_name_variants = get_list_from_pipe_sep_string(row[TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME]) From 0dafbe26fd83ce1e77ec6ac076f92a5fc81ed764 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 02:26:05 +0530 Subject: [PATCH 157/237] fix os path 
--- ner_v2/detectors/temporal/time/en/time_detection.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 04bd7764e..fc07d85f7 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -88,7 +88,8 @@ def __init__(self, entity_name, timezone='UTC'): self.now_date = datetime.datetime.now(self.timezone) self.timezones_map = {} - self.init_regex_and_parser('./data/') + self.init_regex_and_parser(os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), + 'data')) sorted_len_timezone_keys = sorted(self.timezones_map.keys(), key=len, reverse=True) self.timezone_choices = "|".join([re.escape(x.lower()) for x in sorted_len_timezone_keys]) @@ -119,7 +120,8 @@ def convert_to_pytz_format(self, timezone_variant): :return: Standard Oslon format for pytz. """ timezone_code = self.timezones_map[timezone_variant].value - timezone_data_path = os.path.join('./data/', TIMEZONES_CONSTANT_FILE) + data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), 'data') + timezone_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) if os.path.exists(timezone_data_path): timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') timezones_df.set_index(TIMEZONES_CODE_COLUMN_NAME, inplace=True) From 7242fdfbbf1b5d283801e1fb3839951d774510a0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 07:17:00 +0530 Subject: [PATCH 158/237] add tz --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index fc07d85f7..f6475dc33 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ 
b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -5,7 +5,7 @@ import os from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE, \ TIMEZONE_VARIANTS_CONSTANT_FILE, TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ - TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME + TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value']) @@ -125,7 +125,7 @@ def convert_to_pytz_format(self, timezone_variant): if os.path.exists(timezone_data_path): timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') timezones_df.set_index(TIMEZONES_CODE_COLUMN_NAME, inplace=True) - if re.search(self.timezone.zone, timezones_df.loc[timezone_code].TIMEZONES_ALL_REGIONS_COLUMN_NAME): + if re.search(self.timezone.zone, timezones_df.loc[timezone_code].all_regions): return self.timezone.zone else: return timezones_df.loc[timezone_code].TIMEZONES_PREFERRED_REGION_COLUMN_NAME From 2c92e676fe5dfe3c18f1f8960ec6cb097da426fa Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 07:19:55 +0530 Subject: [PATCH 159/237] add tz --- ner_v2/detectors/temporal/time/en/time_detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index f6475dc33..699db7526 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -5,7 +5,8 @@ import os from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE, \ TIMEZONE_VARIANTS_CONSTANT_FILE, TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ - TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, 
TIMEZONES_ALL_REGIONS_COLUMN_NAME + TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ + TIMEZONES_PREFERRED_REGION_COLUMN_NAME from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value']) @@ -128,7 +129,7 @@ def convert_to_pytz_format(self, timezone_variant): if re.search(self.timezone.zone, timezones_df.loc[timezone_code].all_regions): return self.timezone.zone else: - return timezones_df.loc[timezone_code].TIMEZONES_PREFERRED_REGION_COLUMN_NAME + return timezones_df.loc[timezone_code].preferred return self.timezone.zone From 7e7ce1c80f4b40adf1bf80345651f7bc1edc71ff Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 09:11:57 +0530 Subject: [PATCH 160/237] add test cases for date detection --- .../temporal/date/en/test_date_detection.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 2e861fd7a..8ba1a10b4 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -119,4 +119,129 @@ def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) + self.assertEqual(original_texts.count(message), 1) + + def test_en_gregorian_day_month_year_format(self): + """ + Date detection for pattern '2/3/17' + """ + message = '2/3/17' + locale = 'en-in' + # If we run + day1 = 2 + month = 3 + year1 = 2017 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 
'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message), 1) + + def test_en_gregorian_year_month_day_format(self): + """ + Date detection for pattern '2017/12/01' + """ + message = '2017/12/01' + locale = 'en-in' + # If we run + day1 = 1 + month = 12 + year1 = 2017 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message), 1) + + def test_en_gregorian_advanced_day_month_year_format(self): + """ + Date detection for pattern '02 january 1972' + """ + message = '02 january 1972' + locale = 'en-in' + # If we run + day1 = 2 + month = 1 + year1 = 1972 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + + self.assertEqual(original_texts.count(message), 1) + + def test_en_gregorian_advanced_year_month_day_format(self): + """ + Date detection for pattern '1972 january 2' + """ + message = '1972 january 2' + locale = 'en-in' + # If we run + day1 = 2 + month = 1 + year1 = 1972 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, 
date_dicts) + + self.assertEqual(original_texts.count(message), 1) + + def test_en_gregorian_year_day_month_format(self): + """ + Date detection for pattern '2099 21st Nov' + """ + message = '2099 21st Nov' + locale = 'en-in' + # If we run + day1 = 21 + month = 11 + year1 = 2099 + + date_detector_object = DateAdvancedDetector(entity_name=self.entity_name, language='en', locale=locale) + date_dicts, original_texts = date_detector_object.detect_entity(message) + + self.assertIn({ + 'normal': True, + 'start_range': False, + 'end_range': False, + 'from': False, + 'to': False, + 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} + }, date_dicts) + self.assertEqual(original_texts.count(message), 1) \ No newline at end of file From 48810e2223cf9de595fb39c765a163b7ede090a0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 10:51:07 +0530 Subject: [PATCH 161/237] add .lower() in test cases --- .../temporal/date/en/test_date_detection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ner_v2/tests/temporal/date/en/test_date_detection.py b/ner_v2/tests/temporal/date/en/test_date_detection.py index 8ba1a10b4..93d95fa1c 100644 --- a/ner_v2/tests/temporal/date/en/test_date_detection.py +++ b/ner_v2/tests/temporal/date/en/test_date_detection.py @@ -51,7 +51,7 @@ def test_en_date_detection_date_range_ddth_of_mmm_to_ddth(self): 'value': {'dd': day2, 'mm': month, 'yy': year2, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 2) + self.assertEqual(original_texts.count(message.lower()), 2) @mock.patch('ner_v2.detectors.temporal.date.en.date_detection.get_weekdays_for_month') def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekdays_for_month): @@ -94,7 +94,7 @@ def test_en_date_detection_day_range_for_nth_week_month(self, mocked_get_weekday 'to': False, 'value': {'dd': day2, 'mm': month, 'type': 'date', 'yy': year} }, date_dicts) - 
self.assertEqual(original_texts.count(message), 2) + self.assertEqual(original_texts.count(message.lower()), 2) def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): """ @@ -119,7 +119,7 @@ def test_en_date_detection_date_ddth_of_mm_of_yy_with_locale(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message.lower()), 1) def test_en_gregorian_day_month_year_format(self): """ @@ -144,7 +144,7 @@ def test_en_gregorian_day_month_year_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message.lower()), 1) def test_en_gregorian_year_month_day_format(self): """ @@ -169,7 +169,7 @@ def test_en_gregorian_year_month_day_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message.lower()), 1) def test_en_gregorian_advanced_day_month_year_format(self): """ @@ -194,7 +194,7 @@ def test_en_gregorian_advanced_day_month_year_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message.lower()), 1) def test_en_gregorian_advanced_year_month_day_format(self): """ @@ -219,7 +219,7 @@ def test_en_gregorian_advanced_year_month_day_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - self.assertEqual(original_texts.count(message), 1) + self.assertEqual(original_texts.count(message.lower()), 1) def test_en_gregorian_year_day_month_format(self): """ @@ -244,4 +244,4 @@ def test_en_gregorian_year_day_month_format(self): 'value': {'dd': day1, 'mm': month, 'yy': year1, 'type': 'date'} }, date_dicts) - 
self.assertEqual(original_texts.count(message), 1) \ No newline at end of file + self.assertEqual(original_texts.count(message.lower()), 1) \ No newline at end of file From 5370cec52672c6e59f809b9e2ecd1fca3c53fd11 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 11:06:18 +0530 Subject: [PATCH 162/237] use constants --- ner_v2/detectors/temporal/time/en/time_detection.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 699db7526..040a3168a 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -106,7 +106,6 @@ def set_bot_message(self, bot_message): def init_regex_and_parser(self, data_directory_path): timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONE_VARIANTS_CONSTANT_FILE) if os.path.exists(timezone_variants_data_path): - print('the path exists') timezone_variants_df = pd.read_csv(timezone_variants_data_path, encoding='utf-8') for index, row in timezone_variants_df.iterrows(): tz_name_variants = get_list_from_pipe_sep_string(row[TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME]) @@ -126,10 +125,10 @@ def convert_to_pytz_format(self, timezone_variant): if os.path.exists(timezone_data_path): timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') timezones_df.set_index(TIMEZONES_CODE_COLUMN_NAME, inplace=True) - if re.search(self.timezone.zone, timezones_df.loc[timezone_code].all_regions): + if re.search(self.timezone.zone, timezones_df.loc[timezone_code][TIMEZONES_ALL_REGIONS_COLUMN_NAME]): return self.timezone.zone else: - return timezones_df.loc[timezone_code].preferred + return timezones_df.loc[timezone_code][TIMEZONES_PREFERRED_REGION_COLUMN_NAME] return self.timezone.zone @@ -246,8 +245,6 @@ def detect_time(self, text, range_enabled=False, form_check=False, **kwargs): time_data = 
self._detect_time(range_enabled=range_enabled, form_check=form_check) self.time = time_data[0] self.original_time_text = time_data[1] - print('processed_text', self.processed_text) - print('tagged_text', self.tagged_text) return time_data def _detect_range_12_hour_format(self, time_list=None, original_list=None): From dbdf74ffbeed556bd62c00dfbe2f84e896174f22 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 4 Oct 2019 18:55:06 +0530 Subject: [PATCH 163/237] add tz to detector methods --- .../temporal/time/en/time_detection.py | 102 +++++++++++++----- 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 040a3168a..510fda159 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -8,6 +8,7 @@ TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ TIMEZONES_PREFERRED_REGION_COLUMN_NAME from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string +from ner_v2.constant import LANGUAGE_DATA_DIRECTORY TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value']) @@ -90,7 +91,7 @@ def __init__(self, entity_name, timezone='UTC'): self.timezones_map = {} self.init_regex_and_parser(os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), - 'data')) + LANGUAGE_DATA_DIRECTORY)) sorted_len_timezone_keys = sorted(self.timezones_map.keys(), key=len, reverse=True) self.timezone_choices = "|".join([re.escape(x.lower()) for x in sorted_len_timezone_keys]) @@ -120,7 +121,8 @@ def convert_to_pytz_format(self, timezone_variant): :return: Standard Oslon format for pytz. 
""" timezone_code = self.timezones_map[timezone_variant].value - data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), 'data') + data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), + LANGUAGE_DATA_DIRECTORY) timezone_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) if os.path.exists(timezone_data_path): timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') @@ -647,18 +649,24 @@ def _detect_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' - r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m))', + patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])' + r'[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] - t1 = pattern[1] - t2 = pattern[2] - ap = pattern[3] + t1 = pattern[2] + t2 = pattern[3] + ap = pattern[4] + tz1 = pattern[1] + tz2 = pattern[5] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': str(ap).lower().strip('.') + 'nn': str(ap).lower().strip('.'), + 'tz': tz or self.timezone.zone } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] @@ -696,15 +704,22 @@ def _detect_12_hour_without_min(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s((0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m))', self.processed_text.lower()) + patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] - t1 = pattern[1] - ap = pattern[2] + t1 = pattern[2] + ap = pattern[3] 
+ tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': 0, - 'nn': str(ap).lower().strip('.') + 'nn': str(ap).lower().strip('.'), + 'tz': tz or self.timezone.zone } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] time['nn'] = 'pm' if 'p' in time['nn'] else time['nn'] @@ -761,6 +776,7 @@ def _detect_time_with_difference(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' + time['tz'] = self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -802,6 +818,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' + time['tz'] = self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -843,6 +860,7 @@ def _detect_time_with_every_x_hour(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE + time['tz'] = self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -875,6 +893,7 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE + time['tz'] = self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -906,20 +925,28 @@ def _detect_24_hour_optional_minutes_format(self, time_list=None, original_list= time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?)' - r'(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])?\s*' + r'(?:h|hrs|hr)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})' + r'|(?:h|hrs|hr)|\d))\b' + 
.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] t2 = 0 - t1 = pattern[1] - if pattern[2]: - t2 = pattern[2] + t1 = pattern[2] + if pattern[3]: + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': 'hrs' + 'nn': 'hrs', + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) @@ -948,18 +975,26 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|1[3-9]?|2[0-3])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|1[3-9]?|2[0-3])[:.\s]([0-5][0-9])' + r'\s*(?:h|hr|hrs)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m|p\.m|(?:h|hrs|hr)|' + r'(?:{timezone})|\d))\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] - t1 = pattern[1] - t2 = pattern[2] + t1 = pattern[2] + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) meridiem = self._get_meridiem(int(t1), int(t2)) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) @@ -1005,13 +1040,17 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)\s', self.processed_text.lower()) pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)\s', self.processed_text.lower()) pattern_night = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)\s', self.processed_text.lower()) + pattern_tz = re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), + 
self.processed_text.lower()) for pattern in patterns: original = pattern[0] t1 = int(pattern[1]) t2 = int(pattern[2]) + tz = pattern_tz[0] time = { 'hh': t1, 'mm': t2, + 'tz': tz or self.timezone.zone } if pattern_am: time['nn'] = 'am' @@ -1061,12 +1100,16 @@ def _detect_12_hour_word_format2(self, time_list=None, original_list=None): pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)', self.processed_text.lower()) pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)', self.processed_text.lower()) pattern_night = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)', self.processed_text.lower()) + pattern_tz = re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: original = pattern[0] t1 = int(pattern[1]) + tz = pattern_tz[0] time = { 'hh': t1, 'mm': 0, + 'tz': tz or self.timezone.zone } if pattern_am: time['nn'] = 'am' @@ -1103,7 +1146,8 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])\s*({timezone})?)' + r'(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})|\d))'.format(timezone = self.timezone_choices), self.processed_text.lower()) if not patterns: # Optional minutes but compulsory "hour" mention @@ -1111,13 +1155,19 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): self.processed_text.lower()) for pattern in patterns: original = pattern[0] - t1 = int(pattern[1]) - t2 = int(pattern[2]) if pattern[2] else 0 + t1 = int(pattern[2]) + t2 = int(pattern[3]) if pattern[3] else 0 meridiem = self._get_meridiem(t1, t2) + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) time = { 'hh': t1, 
'mm': t2, - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) From d1d9f9dc058ef1724acce93a5d94830390d134f9 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 16:38:09 +0530 Subject: [PATCH 164/237] added tz in output --- .../temporal/time/en/time_detection.py | 91 +++++++++++++++---- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 510fda159..685ddc935 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -988,7 +988,7 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): tz = None if tz1 or tz2: tz = self.convert_to_pytz_format(tz1 or tz2) - meridiem = self._get_meridiem(int(t1), int(t2)) + meridiem = self._get_meridiem(int(t1), int(t2), tz) time = { 'hh': int(t1), @@ -1147,11 +1147,12 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])\s*({timezone})?)' - r'(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})|\d))'.format(timezone = self.timezone_choices), + r'(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})|\d))'.format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns: # Optional minutes but compulsory "hour" mention - patterns = re.findall(r'\b((00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?\s+(?:hours?|hrs?)\b)', + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?\s+' + r'(?:hours?|hrs?)\s*({timezone})?\b)'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] @@ -1204,20 +1205,24 @@ def _detect_time_without_format(self, time_list=None, original_list=None): time_list = [] if 
original_list is None: original_list = [] - patterns = re.findall(r'((?:by|before|after|at|dot|exactly|exact)[\s-]*' - r'((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?))\s', + patterns = re.findall(r'\b((?:by|before|after|at|dot|exactly|exact)[\s-]*((0?[1-9]|1[0-2])[:.\s]*' + r'([0-5][0-9])?)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] t1 = pattern[2] t2 = 0 + tz = pattern[4] or None + if tz: + tz = self.convert_to_pytz_format(tz) if pattern[3]: t2 = pattern[3] - meridiem = self._get_meridiem(int(t1), int(t2)) + meridiem = self._get_meridiem(int(t1), int(t2), tz) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) @@ -1253,30 +1258,39 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\s(((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?)[\s-]*' - r'(?:o\'clock|o\' clock|clock|oclock|o clock|hours))\s', + patterns = re.findall(r'\b(({timezone})?\s*((0?[1-9]|1[0-2])[:.\s]*([0-5][0-9])?)[\s-]*' + r'(?:o\'clock|o\' clock|clock|oclock|o clock|hours)\s*' + r'({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns and self.bot_message: if re.findall(r"Time|time", self.bot_message.lower()): - patterns = re.findall(r'\s*((([0-2]?[0-9])()))\s*', self.processed_text.lower()) + patterns = re.findall(r'\b(({timezone})?\s*([0-2]?[0-9])' + r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + self.processed_text.lower()) for pattern in patterns: original = pattern[0] t1 = pattern[2] t2 = 0 + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) if pattern[3]: t2 = pattern[3] - meridiem = self._get_meridiem(int(t1), int(t2)) + meridiem = self._get_meridiem(int(t1), 
int(t2), tz) time = { 'hh': int(t1), 'mm': int(t2), - 'nn': meridiem + 'nn': meridiem, + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) return time_list, original_list - def _get_meridiem(self, hours, mins): + def _get_meridiem(self, hours, mins, timezone): """ Returns the meridiem(am/pm) for which the given hours:mins time is in within 12 hour span from the current timestamp. @@ -1289,11 +1303,17 @@ def _get_meridiem(self, hours, mins): Args: hours (int): hours in integer mins (int): mins in integer + timezone (str): timezone in 'Asia/Kolkata' format. As we want to use the tz mentioned by the user, if any. Returns meridiem type (str): returns the meridiem type whether its am and pm """ - current_datetime = self.now_date + + if timezone is not None: + new_timezone = get_timezone(timezone) + else: + new_timezone = self.timezone + current_datetime = datetime.datetime.now(new_timezone) current_hour = current_datetime.hour current_min = current_datetime.minute if hours == 0 or hours >= TWELVE_HOUR: @@ -1329,10 +1349,14 @@ def _get_morning_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # pattern to detect morning - patterns = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:morning|early|subah|mrng|mrning|savere)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: original1 = patterns[0] + tz = None + if patterns[1]: + tz = self.convert_to_pytz_format(patterns[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1343,6 +1367,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -1351,6 +1376,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 0, 
'nn': 'am', + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -1380,10 +1406,14 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(noon|afternoon)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:noon|afternoon)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: original1 = patterns[0] + tz = None + if patterns[1]: + tz = self.convert_to_pytz_format(patterns[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1394,6 +1424,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 0, 'nn': 'am', + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -1402,6 +1433,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -1431,10 +1463,14 @@ def _get_evening_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(evening|evng|evning|sham)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:evening|evng|evning|sham)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: original1 = patterns[0] + tz = None + if patterns[1]: + tz = self.convert_to_pytz_format(patterns[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1445,6 +1481,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -1453,6 +1490,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', + 'tz': tz or 
self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -1482,10 +1520,14 @@ def _get_night_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - patterns = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)', self.processed_text.lower()) + patterns = re.findall(r'\b((?:night|nite|tonight|latenight|tonit|nit|rat)\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: original1 = patterns[0] + tz = None + if patterns[1]: + tz = self.convert_to_pytz_format(patterns[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1496,6 +1538,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -1504,6 +1547,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } @@ -1533,12 +1577,17 @@ def _get_default_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - preference = re.compile(r'\s(No particular preference|No preference|No particular time|No time|' - r'anytime|any time|all day|full day|entire day|entireday)') + preference = re.compile(r'\b((?:No particular preference|No preference|No particular time|No time|' + r'anytime|any time|all day|full day|entire day|entireday)' + r'\s*(?:in|of|at)?\s*({timezone})?)\b' + .format(timezone=self.timezone_choices)) patterns = preference.findall(self.processed_text.lower()) if patterns: original1 = patterns[0] + tz = None + if patterns[1]: + tz = self.convert_to_pytz_format(patterns[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1549,6 +1598,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 
'nn': 'am', + 'tz': tz or self.timezone.zone, 'range': 'start', 'time_type': time_type } @@ -1557,6 +1607,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 59, 'nn': 'pm', + 'tz': tz or self.timezone.zone, 'range': 'end', 'time_type': time_type } From e47c8cd8d7083699588d27ea5583689d081b86d0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 16:41:37 +0530 Subject: [PATCH 165/237] added tz in output --- ner_v2/detectors/temporal/time/en/time_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 685ddc935..0d5439c30 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -271,7 +271,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -349,7 +349,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -426,7 +426,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( 
r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -1158,12 +1158,12 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): original = pattern[0] t1 = int(pattern[2]) t2 = int(pattern[3]) if pattern[3] else 0 - meridiem = self._get_meridiem(t1, t2) tz1 = pattern[1] tz2 = pattern[4] tz = None if tz1 or tz2: tz = self.convert_to_pytz_format(tz1 or tz2) + meridiem = self._get_meridiem(t1, t2, tz) time = { 'hh': t1, 'mm': t2, From 2ff264c6852bfb9c71c32430d455ae638583e526 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 20:23:26 +0530 Subject: [PATCH 166/237] add tz in tests --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 +++- ner_v2/tests/temporal/time/time_ner_tests.yaml | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 0d5439c30..561729aea 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1105,7 +1105,9 @@ def _detect_12_hour_word_format2(self, time_list=None, original_list=None): for pattern in patterns: original = pattern[0] t1 = int(pattern[1]) - tz = pattern_tz[0] + tz = None + if pattern_tz: + tz = pattern_tz[0] time = { 'hh': t1, 'mm': 0, diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index bf4393b66..b30bd8e21 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -8,6 +8,7 @@ tests: - hh: 12 mm: 35 nn: "am" + tz: "UTC" original_text: "12:35 am" output_id: 1 range: null @@ -19,6 +20,7 @@ tests: - hh: 10 mm: 33 nn: "pm" 
+ tz: "UTC" original_text: "10:33 pm" output_id: 1 range: null @@ -30,6 +32,7 @@ tests: - hh: 2 mm: 33 nn: "pm" + tz: "UTC" original_text: "02 33 p.m" output_id: 1 range: null @@ -41,6 +44,7 @@ tests: - hh: 12 mm: 0 nn: "am" + tz: "UTC" original_text: "12 am" output_id: 1 range: null @@ -52,6 +56,7 @@ tests: - hh: 12 mm: 0 nn: "pm" + tz: "UTC" original_text: "12-pm" output_id: 1 range: null @@ -63,6 +68,7 @@ tests: - hh: 1 mm: 0 nn: "am" + tz: "UTC" original_text: "1 am" output_id: 1 range: null @@ -74,6 +80,7 @@ tests: - hh: null mm: null nn: null + tz: "UTC" original_text: null output_id: 1 range: null @@ -85,6 +92,7 @@ tests: - hh: null mm: null nn: null + tz: "UTC" original_text: null output_id: 1 range: null From 8dd8db2227fe346e07636ea801c1403e899514fc Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 20:26:35 +0530 Subject: [PATCH 167/237] add tz in tests --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 561729aea..100b51b2a 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1046,7 +1046,9 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): original = pattern[0] t1 = int(pattern[1]) t2 = int(pattern[2]) - tz = pattern_tz[0] + tz = None + if pattern_tz: + tz = pattern_tz[0] time = { 'hh': t1, 'mm': t2, From 6855bfbab44a5e0b96ac83c9ccdf72084039d949 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 20:44:51 +0530 Subject: [PATCH 168/237] added tz in time tests --- .../temporal/time/test_time_detection.py | 1 + .../tests/temporal/time/time_ner_tests.yaml | 46 ++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/ner_v2/tests/temporal/time/test_time_detection.py b/ner_v2/tests/temporal/time/test_time_detection.py index 
90794df42..b3577772b 100644 --- a/ner_v2/tests/temporal/time/test_time_detection.py +++ b/ner_v2/tests/temporal/time/test_time_detection.py @@ -45,6 +45,7 @@ def parse_expected_outputs(expected_outputs): "hh": expected_output["hh"], "mm": expected_output["mm"], "nn": expected_output["nn"], + 'tz': expected_output["tz"], "range": expected_output["range"], "time_type": expected_output["time_type"] } diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index b30bd8e21..bb52f0a83 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -80,7 +80,7 @@ tests: - hh: null mm: null nn: null - tz: "UTC" + tz: null original_text: null output_id: 1 range: null @@ -92,7 +92,7 @@ tests: - hh: null mm: null nn: null - tz: "UTC" + tz: null original_text: null output_id: 1 range: null @@ -104,6 +104,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -115,6 +116,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -126,6 +128,7 @@ tests: - hh: 0 mm: 15 nn: "df" + tz: "UTC" original_text: "in 15mins" output_id: 1 range: null @@ -137,6 +140,7 @@ tests: - hh: 0 mm: 25 nn: "df" + tz: "UTC" original_text: "about 25 minutes" output_id: 1 range: null @@ -148,6 +152,7 @@ tests: - hh: 5 mm: 0 nn: "df" + tz: "UTC" original_text: "after 5 hrs" output_id: 1 range: null @@ -159,6 +164,7 @@ tests: - hh: 13 mm: 0 nn: "df" + tz: "UTC" original_text: "in around 13 hours" output_id: 1 range: null @@ -170,6 +176,7 @@ tests: - hh: 0 mm: 20 nn: "df" + tz: "UTC" original_text: "20 minutes later" output_id: 1 range: null @@ -181,6 +188,7 @@ tests: - hh: 0 mm: 5 nn: "df" + tz: "UTC" original_text: "5mins latr" output_id: 1 range: null @@ -192,6 +200,7 @@ tests: - hh: 1 mm: 0 nn: "df" + tz: "UTC" original_text: "1 hour ltr" output_id: 1 range: null @@ -203,6 +212,7 @@ tests: - hh: 3 mm: 0 nn: "df" + 
tz: "UTC" original_text: "3 hrs later" output_id: 1 range: null @@ -214,6 +224,7 @@ tests: - hh: 0 mm: 1440 nn: "ev" + tz: "UTC" original_text: "every 1440 minutes" output_id: 1 range: null @@ -225,6 +236,7 @@ tests: - hh: 24 mm: 0 nn: "ev" + tz: "UTC" original_text: "evry 24 hrs" output_id: 1 range: null @@ -236,6 +248,7 @@ tests: - hh: 72 mm: 0 nn: "ev" + tz: "UTC" original_text: "once in 3 days" output_id: 1 range: null @@ -247,6 +260,7 @@ tests: - hh: 24 mm: 0 nn: "ev" + tz: "UTC" original_text: "once in 1 day" output_id: 1 range: null @@ -258,6 +272,7 @@ tests: - hh: 0 mm: 35 nn: "hrs" + tz: "UTC" original_text: "00:35" output_id: 1 range: null @@ -269,6 +284,7 @@ tests: - hh: 22 mm: 33 nn: "hrs" + tz: "UTC" original_text: "22:33" output_id: 1 range: null @@ -280,6 +296,7 @@ tests: - hh: 14 mm: 33 nn: "hrs" + tz: "UTC" original_text: "14 33" output_id: 1 range: null @@ -291,6 +308,7 @@ tests: - hh: 12 mm: 0 nn: "hrs" + tz: "UTC" original_text: "12 hrs" output_id: 1 range: null @@ -302,6 +320,7 @@ tests: - hh: 0 mm: 0 nn: "hrs" + tz: "UTC" original_text: "0 hours" output_id: 1 range: null @@ -313,6 +332,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -324,6 +344,7 @@ tests: - hh: null mm: null nn: null + tz: "UTC" original_text: null output_id: 1 range: null @@ -335,6 +356,7 @@ tests: - hh: null mm: null nn: null + tz: null original_text: null output_id: 1 range: null @@ -346,6 +368,7 @@ tests: - hh: 12 mm: 30 nn: "am" + tz: "UTC" original_text: "12:30" output_id: 1 range: null @@ -357,6 +380,7 @@ tests: - hh: 11 mm: 30 nn: "am" + tz: "UTC" original_text: "11:30" output_id: 1 range: null @@ -368,6 +392,7 @@ tests: - hh: 11 mm: 30 nn: "pm" + tz: "UTC" original_text: "11:30" output_id: 1 range: null @@ -379,6 +404,7 @@ tests: - hh: 12 mm: 0 nn: "am" + tz: "UTC" original_text: "12:00" output_id: 1 range: null @@ -390,6 +416,7 @@ tests: - hh: 5 mm: 29 nn: "pm" + tz: "UTC" original_text: "5:29" output_id: 1 range: 
null @@ -401,6 +428,7 @@ tests: - hh: 3 mm: 11 nn: "pm" + tz: "UTC" original_text: "3:11" output_id: 1 range: null @@ -412,6 +440,7 @@ tests: - hh: 12 mm: 22 nn: "pm" + tz: "UTC" original_text: "12:22" output_id: 1 range: null @@ -423,6 +452,7 @@ tests: - hh: 3 mm: 33 nn: "am" + tz: "UTC" original_text: "3:33" output_id: 1 range: null @@ -434,6 +464,7 @@ tests: - hh: 4 mm: 44 nn: "am" + tz: "UTC" original_text: "4:44" output_id: 1 range: null @@ -445,6 +476,7 @@ tests: - hh: 5 mm: 55 nn: "pm" + tz: "UTC" original_text: "5:55" output_id: 1 range: null @@ -456,6 +488,7 @@ tests: - hh: 6 mm: 0 nn: "pm" + tz: "UTC" original_text: "6:00" output_id: 1 range: null @@ -467,6 +500,7 @@ tests: - hh: 3 mm: 0 nn: "pm" + tz: "UTC" original_text: "at 3" output_id: 1 range: null @@ -478,6 +512,7 @@ tests: - hh: 12 mm: 0 nn: "pm" + tz: "UTC" original_text: "at 12" output_id: 1 range: null @@ -489,6 +524,7 @@ tests: - hh: 3 mm: 0 nn: "am" + tz: "UTC" original_text: "after 3" output_id: 1 range: null @@ -500,6 +536,7 @@ tests: - hh: 4 mm: 0 nn: "am" + tz: "UTC" original_text: "by 4" output_id: 1 range: null @@ -511,6 +548,7 @@ tests: - hh: 5 mm: 0 nn: "pm" + tz: "UTC" original_text: "before 5" output_id: 1 range: null @@ -522,6 +560,7 @@ tests: - hh: 6 mm: 0 nn: "pm" + tz: "UTC" original_text: "exact 6" output_id: 1 range: null @@ -536,6 +575,7 @@ tests: hh: 12 mm: 30 nn: "am" + tz: "UTC" range: "start" time_type: null original_text: "12:30 am to 2:30 pm" @@ -543,6 +583,7 @@ tests: hh: 2 mm: 30 nn: "pm" + tz: "UTC" range: "end" time_type: null original_text: "12:30 am to 2:30 pm" @@ -553,6 +594,7 @@ tests: - hh: 10 mm: 0 nn: "am" + tz: "UTC" original_text: "सुबह 10 बजे" output_id: 1 range: null From ba03e30536a963022bba0c7304e9a0bc1e32082f Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 21:42:52 +0530 Subject: [PATCH 169/237] bug fix in _detect_time_without_format --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 +- 
ner_v2/detectors/temporal/time/standard_time_regex.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 100b51b2a..332f06b89 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1210,7 +1210,7 @@ def _detect_time_without_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b((?:by|before|after|at|dot|exactly|exact)[\s-]*((0?[1-9]|1[0-2])[:.\s]*' - r'([0-5][0-9])?)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + r'([0-5][0-9])?)\s*({timezone})?)\s'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0] diff --git a/ner_v2/detectors/temporal/time/standard_time_regex.py b/ner_v2/detectors/temporal/time/standard_time_regex.py index c9644ba7d..91bc1a217 100644 --- a/ner_v2/detectors/temporal/time/standard_time_regex.py +++ b/ner_v2/detectors/temporal/time/standard_time_regex.py @@ -267,7 +267,8 @@ def _detect_hour_minute(self, time_list, original_list): time = { 'hh': int(hh), 'mm': int(mm), - 'nn': nn + 'nn': nn, + 'tz': self.timezone.zone } time_list.append(time) @@ -316,6 +317,7 @@ def _detect_time_with_coln_format(self, time_list, original_list): time = { 'hh': hh, 'mm': mm, + 'tz': self.timezone.zone, 'time_type': None } From b74621905e2949763b674deaeae6f1d9dd33ce5a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 21:55:41 +0530 Subject: [PATCH 170/237] bug fixes --- .../temporal/time/en/time_detection.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 332f06b89..eae997e16 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ 
b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -268,9 +268,9 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' + r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -347,8 +347,8 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list if original_list is None: original_list = [] regex_patterns = re.compile( - r'\W(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' - r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W' + r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' + r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -424,8 +424,8 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) original_list = [] regex_patterns = re.compile( - r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' - r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' 
+ r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -479,8 +479,8 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\W((?:before|bfre)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\W' + patterns = re.findall(r'\b((?:before|bfre)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: @@ -532,8 +532,8 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\W((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(am|pm|a\.m|p\.m)\s*({timezone})?)\W'.format(timezone=self.timezone_choices), + patterns = re.findall(r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] @@ -584,8 +584,8 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\W((?:before|bfore)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' - r'[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\W'.format(timezone=self.timezone_choices), + patterns = re.findall(r'\b((?:before|bfore)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' + r'[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0] From 141e3d76af8106f42ae05241ef0d0ddf8451bb23 Mon Sep 17 
00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 22:45:36 +0530 Subject: [PATCH 171/237] added .strip() to original texts --- .../temporal/time/en/time_detection.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index eae997e16..9aba3295a 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -275,8 +275,8 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] - original2 = pattern[0] + original1 = pattern[0].strip() + original2 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -353,8 +353,8 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] - original2 = pattern[0] + original1 = pattern[0].strip() + original2 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -430,7 +430,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -484,7 +484,7 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -536,7 +536,7 @@ def _detect_start_range_12_hour_format_without_min(self, 
time_list=None, origina r'(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -588,7 +588,7 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ r'[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original1 = pattern[0] + original1 = pattern[0].strip() if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -653,7 +653,7 @@ def _detect_12_hour_format(self, time_list=None, original_list=None): r'[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = pattern[3] ap = pattern[4] @@ -707,7 +707,7 @@ def _detect_12_hour_without_min(self, time_list=None, original_list=None): patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] ap = pattern[3] tz1 = pattern[1] @@ -759,7 +759,7 @@ def _detect_time_with_difference(self, time_list=None, original_list=None): r'(min|mins|minutes|hour|hours|hrs|hr))\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[2]) td = pattern[3] hours = ['hour', 'hours', 'hrs', 'hr'] @@ -801,7 +801,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) patterns = re.findall(r'\b((\d+)\s?(min|mins|minutes|hour|hours|hrs|hr)\s?(later|ltr|latr|lter)s?)\b', self.processed_text.lower()) for pattern in patterns: - original = 
pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) td = pattern[2] hours = ['hour', 'hours', 'hrs', 'hr'] @@ -843,7 +843,7 @@ def _detect_time_with_every_x_hour(self, time_list=None, original_list=None): patterns = re.findall(r'\b((every|evry|evy|evri)\s*(\d+)\s*(min|mins|minutes|hour|hours|hrs|hr))\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[2]) td = pattern[3] hours = ['hour', 'hours', 'hrs', 'hr'] @@ -885,7 +885,7 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): patterns = re.findall(r'\b((once|onc|1se)\s*(in)?\s*(\d+)\s?(day|days))\b', self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = 24 * int(pattern[3]) setter = "hh" antisetter = "mm" @@ -931,7 +931,7 @@ def _detect_24_hour_optional_minutes_format(self, time_list=None, original_list= .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t2 = 0 t1 = pattern[2] if pattern[3]: @@ -980,7 +980,7 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): r'(?:{timezone})|\d))\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = pattern[3] tz1 = pattern[1] @@ -1043,7 +1043,7 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): pattern_tz = re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) t2 = int(pattern[2]) tz = None @@ -1105,7 +1105,7 @@ def _detect_12_hour_word_format2(self, time_list=None, original_list=None): pattern_tz = 
re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[1]) tz = None if pattern_tz: @@ -1159,7 +1159,7 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): r'(?:hours?|hrs?)\s*({timezone})?\b)'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = int(pattern[2]) t2 = int(pattern[3]) if pattern[3] else 0 tz1 = pattern[1] @@ -1213,7 +1213,7 @@ def _detect_time_without_format(self, time_list=None, original_list=None): r'([0-5][0-9])?)\s*({timezone})?)\s'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = 0 tz = pattern[4] or None @@ -1273,7 +1273,7 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N r'()\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: - original = pattern[0] + original = pattern[0].strip() t1 = pattern[2] t2 = 0 tz1 = pattern[1] @@ -1357,7 +1357,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: - original1 = patterns[0] + original1 = patterns[0].strip() tz = None if patterns[1]: tz = self.convert_to_pytz_format(patterns[1]) @@ -1414,7 +1414,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: - original1 = patterns[0] + original1 = patterns[0].strip() tz = None if patterns[1]: tz = self.convert_to_pytz_format(patterns[1]) @@ -1471,7 +1471,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 
.format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: - original1 = patterns[0] + original1 = patterns[0].strip() tz = None if patterns[1]: tz = self.convert_to_pytz_format(patterns[1]) @@ -1528,7 +1528,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): .format(timezone=self.timezone_choices), self.processed_text.lower()) if patterns: - original1 = patterns[0] + original1 = patterns[0].strip() tz = None if patterns[1]: tz = self.convert_to_pytz_format(patterns[1]) @@ -1588,7 +1588,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): patterns = preference.findall(self.processed_text.lower()) if patterns: - original1 = patterns[0] + original1 = patterns[0].strip() tz = None if patterns[1]: tz = self.convert_to_pytz_format(patterns[1]) From 3c9a0764b76d8ac24ab244362c9162d17b66f673 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Sun, 6 Oct 2019 22:51:28 +0530 Subject: [PATCH 172/237] fix lint --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 9aba3295a..5ca3a28dd 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1226,7 +1226,7 @@ def _detect_time_without_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': meridiem, - 'tz': tz or self.timezone.zone + 'tz': tz or self.timezone.zone } time_list.append(time) original_list.append(original) From ef2bdfd897ac1d8b1b4f3252ac1cacb2f0452166 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 7 Oct 2019 10:52:39 +0530 Subject: [PATCH 173/237] added tz case in tests --- ner_v2/tests/temporal/time/time_ner_tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml 
b/ner_v2/tests/temporal/time/time_ner_tests.yaml index bb52f0a83..48f2162a8 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -3,13 +3,13 @@ args: tests: en: - id: en_1 - message: "the time is 12:35 am" + message: "the time is 12:35 am est" outputs: - hh: 12 mm: 35 nn: "am" - tz: "UTC" - original_text: "12:35 am" + tz: "America/New_York" + original_text: "12:35 am"w output_id: 1 range: null time_type: null From 68aa5fd135f406c695de8329cd088d8f5bd39c92 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 7 Oct 2019 10:55:36 +0530 Subject: [PATCH 174/237] added tz case in tests --- ner_v2/tests/temporal/time/time_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 48f2162a8..bcae0de59 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -9,7 +9,7 @@ tests: mm: 35 nn: "am" tz: "America/New_York" - original_text: "12:35 am"w + original_text: "12:35 am" output_id: 1 range: null time_type: null From 8ba9c255c89a07722a6bbf905dcbb9840748efea Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 7 Oct 2019 10:57:50 +0530 Subject: [PATCH 175/237] added tz case in tests --- ner_v2/tests/temporal/time/time_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index bcae0de59..28bcbe1d1 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -9,7 +9,7 @@ tests: mm: 35 nn: "am" tz: "America/New_York" - original_text: "12:35 am" + original_text: "12:35 am est" output_id: 1 range: null time_type: null From eb54d5e26961cae8bdffb1d9ace63560d0e08a33 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 7 Oct 2019 15:23:37 +0530 Subject: [PATCH 
176/237] add tz in tests --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 +- ner_v2/tests/temporal/time/time_ner_tests.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 5ca3a28dd..0b41b193d 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -118,7 +118,7 @@ def convert_to_pytz_format(self, timezone_variant): """ Converts informal TZ formats like EST, Eastern Time etc to Oslon format(America/New_York) supported by pytz. :param timezone_variant: (str) Informal TZ variant - :return: Standard Oslon format for pytz. + :return: Standard Olson format for pytz. """ timezone_code = self.timezones_map[timezone_variant].value data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 28bcbe1d1..88e3141b7 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -15,25 +15,25 @@ tests: time_type: null range_enabled: false - id: en_2 - message: "meet me at 10:33 pm at the cafe" + message: "meet me at 10:33 pm AKST at the cafe" outputs: - hh: 10 mm: 33 nn: "pm" - tz: "UTC" - original_text: "10:33 pm" + tz: "America/Anchorage" + original_text: "10:33 pm akst" output_id: 1 range: null time_type: null range_enabled: false - id: en_3 - message: "meet me at 02 33 p.m. 
at the cafe" + message: "meet me at 02 33 p.m IST at the cafe" outputs: - hh: 2 mm: 33 nn: "pm" - tz: "UTC" - original_text: "02 33 p.m" + tz: "Asia/Kolkata" + original_text: "02 33 p.m ist" output_id: 1 range: null time_type: null From 74d17f2dbdc61854b6dd0197f2d382801ead02d4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 7 Oct 2019 17:08:10 +0530 Subject: [PATCH 177/237] add tz in tests --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 0b41b193d..c45693574 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1150,12 +1150,12 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])\s*({timezone})?)' + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])\s*({timezone})?)' r'(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})|\d))'.format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns: # Optional minutes but compulsory "hour" mention - patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]([0-5][0-9]))?\s+' + patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])(?:[:.\s]?([0-5][0-9]))?\s+' r'(?:hours?|hrs?)\s*({timezone})?\b)'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: From af1bf905d4050902117631383cf4c0d68f851303 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 14:16:29 +0530 Subject: [PATCH 178/237] added PARAMETER_RANGE_ENABLED in api.py --- ner_constants.py | 1 + ner_v2/api.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git 
a/ner_constants.py b/ner_constants.py index 7df14f23d..dd05aa701 100644 --- a/ner_constants.py +++ b/ner_constants.py @@ -44,6 +44,7 @@ PARAMETER_TIMEZONE = 'timezone' PARAMETER_REGEX = 'regex' PARAMETER_PAST_DATE_REFERENCED = 'past_date_referenced' +PARAMETER_RANGE_ENABLED = 'range_enabled' # Language parameters of the query. PARAMETER_LANGUAGE_SCRIPT = 'language_script' # ISO 639 code for language. For eg, 'en' for 'Namaste', 'Hello' diff --git a/ner_v2/api.py b/ner_v2/api.py index 90c458489..184cb1eba 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -4,7 +4,7 @@ PARAMETER_FALLBACK_VALUE, \ PARAMETER_BOT_MESSAGE, PARAMETER_TIMEZONE, PARAMETER_LANGUAGE_SCRIPT, PARAMETER_SOURCE_LANGUAGE, \ PARAMETER_PAST_DATE_REFERENCED, PARAMETER_MIN_DIGITS, PARAMETER_MAX_DIGITS, PARAMETER_NUMBER_UNIT_TYPE, \ - PARAMETER_LOCALE + PARAMETER_LOCALE, PARAMETER_RANGE_ENABLED from ner_v2.detectors.temporal.date.date_detection import DateAdvancedDetector from ner_v2.detectors.temporal.time.time_detection import TimeDetector @@ -42,6 +42,7 @@ def get_parameters_dictionary(request): PARAMETER_MAX_DIGITS: request.GET.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request.GET.get('unit_type'), PARAMETER_LOCALE: request.GET.get('locale'), + PARAMETER_RANGE_ENABLED: request.GET.get('range_enabled') } return parameters_dict @@ -72,6 +73,7 @@ def parse_post_request(request): PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type'), PARAMETER_LOCALE: request_data.get('locale'), + PARAMETER_RANGE_ENABLED: request_data.get('range_enabled') } return parameters_dict @@ -211,6 +213,7 @@ def time(request): timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' form_check = True if parameters_dict[PARAMETER_STRUCTURED_VALUE] else False + range_enabled = True if parameters_dict[PARAMETER_RANGE_ENABLED] else False time_detection = TimeDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], 
language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone) @@ -224,7 +227,8 @@ def time(request): entity_output = time_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - form_check=form_check) + form_check=form_check, + range_enabled=range_enabled) elif isinstance(message, (list, tuple)): entity_output = time_detection.detect_bulk(messages=message) From 89e4df2bd794ff874c1dee4b0b5d5ac8772ce312 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 21:04:21 +0530 Subject: [PATCH 179/237] fix AttributeError bug in _get_afternoon_time_range --- .../temporal/time/en/time_detection.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index c45693574..3757b4b92 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1356,11 +1356,11 @@ def _get_morning_time_range(self, time_list=None, original_list=None): patterns = re.findall(r'\b((?:morning|early|subah|mrng|mrning|savere)\s*(?:in|of|at)?\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0].strip() + for pattern in patterns: + original1 = pattern[0].strip() tz = None - if patterns[1]: - tz = self.convert_to_pytz_format(patterns[1]) + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1413,11 +1413,11 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): patterns = re.findall(r'\b((?:noon|afternoon)\s*(?:in|of|at)?\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0].strip() + for pattern in patterns: + original1 = 
pattern[0].strip() tz = None - if patterns[1]: - tz = self.convert_to_pytz_format(patterns[1]) + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1470,11 +1470,11 @@ def _get_evening_time_range(self, time_list=None, original_list=None): patterns = re.findall(r'\b((?:evening|evng|evning|sham)\s*(?:in|of|at)?\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0].strip() + for pattern in patterns: + original1 = pattern[0].strip() tz = None - if patterns[1]: - tz = self.convert_to_pytz_format(patterns[1]) + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1527,11 +1527,11 @@ def _get_night_time_range(self, time_list=None, original_list=None): patterns = re.findall(r'\b((?:night|nite|tonight|latenight|tonit|nit|rat)\s*(?:in|of|at)?\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) - if patterns: - original1 = patterns[0].strip() + for pattern in patterns: + original1 = pattern[0].strip() tz = None - if patterns[1]: - tz = self.convert_to_pytz_format(patterns[1]) + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: @@ -1587,11 +1587,11 @@ def _get_default_time_range(self, time_list=None, original_list=None): .format(timezone=self.timezone_choices)) patterns = preference.findall(self.processed_text.lower()) - if patterns: - original1 = patterns[0].strip() + for pattern in patterns: + original1 = pattern[0].strip() tz = None - if patterns[1]: - tz = self.convert_to_pytz_format(patterns[1]) + if pattern[1]: + tz = self.convert_to_pytz_format(pattern[1]) if self.departure_flag: time_type = 'departure' elif self.return_flag: From f0c4b8469a7e9f88e29f13eee918aa111949f530 Mon Sep 17 00:00:00 2001 From: 
ruthvik-17 Date: Tue, 8 Oct 2019 21:56:58 +0530 Subject: [PATCH 180/237] fix minute-minutes bug --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 3757b4b92..59dfacad7 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -798,7 +798,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((\d+)\s?(min|mins|minutes|hour|hours|hrs|hr)\s?(later|ltr|latr|lter)s?)\b', + patterns = re.findall(r'\b((\d+)\s?(min|mins|minutes?|hour|hours|hrs|hr)\s?(later|ltr|latr|lter)s?)\b', self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() From c3c17976afd6b753653b61826cc214618cef57b3 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 22:21:33 +0530 Subject: [PATCH 181/237] added test cases for noon --- .../temporal/time/en/time_detection.py | 2 +- .../tests/temporal/time/time_ner_tests.yaml | 105 ++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 59dfacad7..794c32090 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -1581,7 +1581,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): if original_list is None: original_list = [] # patterns - preference = re.compile(r'\b((?:No particular preference|No preference|No particular time|No time|' + preference = re.compile(r'\b((?:no particular preference|no preference|no particular time|no time|' r'anytime|any time|all day|full day|entire day|entireday)' r'\s*(?:in|of|at)?\s*({timezone})?)\b' 
.format(timezone=self.timezone_choices)) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 88e3141b7..96e1b394d 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -587,6 +587,111 @@ tests: range: "end" time_type: null original_text: "12:30 am to 2:30 pm" + - id: en_49 + message: "Sessions begin at noon" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 11 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "noon" + - output_id: 2 + hh: 5 + mm: 0 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "noon" + - id: en_50 + message: "Sessions begin at morning" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "morning" + - output_id: 2 + hh: 11 + mm: 0 + nn: "am" + tz: "UTC" + range: "end" + time_type: null + original_text: "morning" + - id: en_51 + message: "Sessions begin at evening" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 5 + mm: 0 + nn: "pm" + tz: "UTC" + range: "start" + time_type: null + original_text: "evening" + - output_id: 2 + hh: 9 + mm: 0 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "evening" + - id: en_52 + message: "Sessions begin at night" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 9 + mm: 0 + nn: "pm" + tz: "UTC" + range: "start" + time_type: null + original_text: "night" + - output_id: 2 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "end" + time_type: null + original_text: "night" + - id: en_53 + message: "Sessions begin at no particular preference" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 12 + mm: 0 + nn: "am" + tz: "UTC" + range: "start" + time_type: null + original_text: "no particular preference" + - 
output_id: 2 + hh: 11 + mm: 59 + nn: "pm" + tz: "UTC" + range: "end" + time_type: null + original_text: "no particular preference" hi: - id: hi_1 message: "सुबह 10 बजे" From 7d8a6b5864926d3edef3a7e3db0638372049587e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 22:28:45 +0530 Subject: [PATCH 182/237] fix _detect_time_with_difference_later --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 794c32090..96e95d251 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -805,7 +805,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) t1 = int(pattern[1]) td = pattern[2] hours = ['hour', 'hours', 'hrs', 'hr'] - mins = ['min', 'mins', 'minutes'] + mins = ['min', 'mins', 'minutes', 'minute'] setter = "" antisetter = "" if td in hours: From 1edc0d0aaa77cc2a0a4d9985610687940b97062c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 22:54:42 +0530 Subject: [PATCH 183/237] fix lint --- ner_v2/detectors/temporal/time/en/time_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 96e95d251..a5abafba8 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -271,7 +271,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + 
.format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -349,7 +349,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -426,7 +426,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -1413,7 +1413,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): patterns = re.findall(r'\b((?:noon|afternoon)\s*(?:in|of|at)?\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) - for pattern in patterns: + for pattern in patterns: original1 = pattern[0].strip() tz = None if pattern[1]: From 09df7705bf2551b4c76410f93f40c584c92c8964 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 8 Oct 2019 22:57:32 +0530 Subject: [PATCH 184/237] fix lint --- ner_v2/detectors/temporal/time/en/time_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index a5abafba8..ed0d59404 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -271,7 +271,7 @@ def 
_detect_range_12_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -349,7 +349,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -426,7 +426,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' 
r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: From 973b536cd6d503c18405af80c3c8344b161dbde6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 12:01:46 +0530 Subject: [PATCH 185/237] added _detect_range_24_hour_format in time detector --- .../temporal/time/en/time_detection.py | 108 +++++++++++++++--- 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index ed0d59404..e1229ba16 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -268,9 +268,9 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)' - r'[\s-]*?({timezone})?\s*to[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' + r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)' + r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -328,6 +328,81 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): break return time_list, original_list + def _detect_range_24_hour_format(self, time_list=None, original_list=None): + """ + Finds 24 hour range format time from text + CURRENTLY IT IS LIMITED ONLY TO ONE RANGE PER TEXT + + Args: + time_list (list): Optional, 
list to store dictionaries of detected time entities + original_list (list): Optional, list to store corresponding substrings of given text which were detected as + time entities + + Returns: + A tuple of two lists with first list containing the detected time entities and second list containing their + corresponding substrings in the given text. + """ + if time_list is None: + time_list = [] + if original_list is None: + original_list = [] + regex_patterns = re.compile( + r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'[\s-]*?({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' + .format(timezone=self.timezone_choices) + ) + patterns = regex_patterns.findall(self.processed_text.lower()) + for pattern in patterns: + original1 = pattern[0].strip() + original2 = pattern[0].strip() + if self.departure_flag: + time_type = 'departure' + elif self.return_flag: + time_type = 'return' + else: + time_type = None + t1 = pattern[2] + t2 = pattern[3] + tz1 = pattern[1] + tz2 = pattern[4] + tz = None + if tz1 or tz2: + tz = self.convert_to_pytz_format(tz1 or tz2) + time1 = { + 'hh': int(t1), + 'mm': int(t2), + 'nn': 'hrs', + 'tz': tz or self.timezone.zone, + 'range': 'start', + 'time_type': time_type + } + time1['nn'] = 'am' if 'a' in time1['nn'] else time1['nn'] + time1['nn'] = 'pm' if 'p' in time1['nn'] else time1['nn'] + + t3 = pattern[6] + t4 = pattern[7] + tz3 = pattern[5] + tz4 = pattern[8] + tz = None + if tz3 or tz4: + tz = self.convert_to_pytz_format(tz3 or tz4) + time2 = { + 'hh': int(t3), + 'mm': int(t4), + 'nn': 'hrs', + 'tz': tz or self.timezone.zone, + 'range': 'end', + 'time_type': time_type + } + + time_list.append(time1) + original_list.append(original1) + time_list.append(time2) + original_list.append(original2) + break + return time_list, original_list + def _detect_range_12_hour_format_without_min(self, 
time_list=None, original_list=None): """ Finds 12 hour range format time from text without minutes @@ -347,8 +422,8 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list if original_list is None: original_list = [] regex_patterns = re.compile( - r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)[\s-]*?({timezone})?\s*to' - r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' + r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)' + r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -425,7 +500,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' - r'([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' + r'([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) @@ -480,7 +555,7 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b((?:before|bfre)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b' + r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: @@ -533,7 +608,7 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina if original_list is None: original_list = [] patterns = re.findall(r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - 
r'(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + r'(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0].strip() @@ -585,7 +660,8 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ if original_list is None: original_list = [] patterns = re.findall(r'\b((?:before|bfore)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' - r'[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + r'[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original1 = pattern[0].strip() @@ -650,7 +726,8 @@ def _detect_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])' - r'[\s-]*?(pm|am|a\.m|p\.m)\s*({timezone})?)\b'.format(timezone=self.timezone_choices), + r'[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -704,7 +781,7 @@ def _detect_12_hour_without_min(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m|p\.m)\s*({timezone})?)\b' + patterns = re.findall(r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() @@ -926,7 +1003,7 @@ def _detect_24_hour_optional_minutes_format(self, time_list=None, original_list= if original_list is None: original_list = [] patterns = 
re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]([0-5][0-9])?\s*' - r'(?:h|hrs|hr)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})' + r'(?:h|hrs|hr)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})' r'|(?:h|hrs|hr)|\d))\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) @@ -976,7 +1053,7 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b(({timezone})?\s*(00?|1[3-9]?|2[0-3])[:.\s]([0-5][0-9])' - r'\s*(?:h|hr|hrs)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m|p\.m|(?:h|hrs|hr)|' + r'\s*(?:h|hr|hrs)?\s*({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:h|hrs|hr)|' r'(?:{timezone})|\d))\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: @@ -1035,7 +1112,7 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((0?[1-9]|1[0-2])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m|p\.m|\d))', + patterns = re.findall(r'\b((0?[1-9]|1[0-2])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m\.?|p\.m\.?|\d))', self.processed_text.lower()) pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)\s', self.processed_text.lower()) pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)\s', self.processed_text.lower()) @@ -1151,7 +1228,8 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] patterns = re.findall(r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])\s*({timezone})?)' - r'(?!\s*(?:am|pm|a\.m|p\.m|(?:{timezone})|\d))'.format(timezone=self.timezone_choices), + r'(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' + .format(timezone=self.timezone_choices), self.processed_text.lower()) if not patterns: # Optional minutes but compulsory "hour" mention From 
77d153f5cfe1e2bc8d5fb769852079f347126b01 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 12:05:34 +0530 Subject: [PATCH 186/237] fix lint --- ner_v2/detectors/temporal/time/en/time_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index e1229ba16..2c3db8dce 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -268,8 +268,8 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)' - r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' + r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) From ca689ead963d5b071f78566d786270c29719e8c4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 12:10:29 +0530 Subject: [PATCH 187/237] add time_list, original_list = self._detect_range_12_hour_format(time_list, original_list) --- ner_v2/detectors/temporal/time/en/time_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 2c3db8dce..da619fc35 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -160,6 +160,8 @@ def _detect_time(self, range_enabled=False, form_check=False): self._update_processed_text(original_list) time_list, original_list = 
self._detect_end_range_12_hour_format_without_min(time_list, original_list) self._update_processed_text(original_list) + time_list, original_list = self._detect_range_24_hour_format(time_list, original_list) + self._update_processed_text(original_list) time_list, original_list = self._detect_12_hour_format(time_list, original_list) self._update_processed_text(original_list) time_list, original_list = self._detect_12_hour_without_min(time_list, original_list) From e127fa5a6107c424d4c1558ca3f9ef83c660bb3a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 14:59:58 +0530 Subject: [PATCH 188/237] merge time/en csv files --- .../temporal/time/en/data/timezones.csv | 20 +++++------ .../temporal/time/en/time_detection.py | 34 +++++++++++-------- .../tests/temporal/time/time_ner_tests.yaml | 21 ++++++++++++ 3 files changed, 50 insertions(+), 25 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/data/timezones.csv b/ner_v2/detectors/temporal/time/en/data/timezones.csv index 962e2210b..4a7c697f1 100644 --- a/ner_v2/detectors/temporal/time/en/data/timezones.csv +++ b/ner_v2/detectors/temporal/time/en/data/timezones.csv @@ -1,10 +1,10 @@ -code,preferred,all_regions -IST,Asia/Kolkata,Asia/Kolkata -EST,America/New_York,America/New_York|America/Detroit|America/Kentucky/Louisville|America/Kentucky/Monticello|America/Indiana/Indianapolis|America/Indiana/Vincennes|America/Indiana/Winamac|America/Indiana/Marengo|America/Indiana/Petersburg|America/Indiana/Vevay -CST,America/Chicago,America/Chicago|America/Indiana/Tell_City|America/Indiana/Knox|America/Menominee|America/North_Dakota/Center|America/North_Dakota/New_Salem|America/North_Dakota/Beulah -MST,America/Denver,America/Denver|America/Boise|America/Phoenix -PST,America/Los_Angeles,America/Los_Angeles -AKST,America/Anchorage,America/Anchorage|America/Juneau|America/Sitka|America/Yakutat|America/Nome|America/Metlakatla -HST,America/Adak,America/Adak|Pacific/Honolulu -HAST,Pacific/Honolulu,Pacific/Honolulu 
-UTC,UTC,UTC \ No newline at end of file +code,timezone_variants,preferred,all_regions +IST,IST|Indian Time|Indian Standard Time,Asia/Kolkata,Asia/Kolkata +EST,EST|Eastern Standard Time|Eastern Time|ET|EDT,America/New_York,America/New_York|America/Detroit|America/Kentucky/Louisville|America/Kentucky/Monticello|America/Indiana/Indianapolis|America/Indiana/Vincennes|America/Indiana/Winamac|America/Indiana/Marengo|America/Indiana/Petersburg|America/Indiana/Vevay +CST,CST|Central Standard Time|Central Time|CT|CDT,America/Chicago,America/Chicago|America/Indiana/Tell_City|America/Indiana/Knox|America/Menominee|America/North_Dakota/Center|America/North_Dakota/New_Salem|America/North_Dakota/Beulah +MST,MST|Mountain Standard Time|Mountain Time|MT|MDT,America/Denver,America/Denver|America/Boise|America/Phoenix +PST,PST|Pacific Standard Time|Pacific Time|PT|PDT,America/Los_Angeles,America/Los_Angeles +AKST,AKST|Alaska Standard Time|Alaska Time|AKDT,America/Anchorage,America/Anchorage|America/Juneau|America/Sitka|America/Yakutat|America/Nome|America/Metlakatla +HST,HST|Hawaii Standard Time|HDT,America/Adak,America/Adak|Pacific/Honolulu +HAST,HAST|Hawaii-Aleutian Standard Time|Hawaii Aleutian Standard Time|Hawaii Time|HADT,Pacific/Honolulu,Pacific/Honolulu +UTC,UTC|GMT|Greenwich Mean Time|Greenwich Time|Coordinated Universal Time,UTC,UTC diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index da619fc35..096b3eb1b 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -3,14 +3,14 @@ import collections import pandas as pd import os -from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE, \ - TIMEZONE_VARIANTS_CONSTANT_FILE, TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ - TIMEZONE_VARIANTS_VALUE_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ 
+from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE,\ + TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ + TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ TIMEZONES_PREFERRED_REGION_COLUMN_NAME from ner_v2.detectors.temporal.utils import get_timezone, get_list_from_pipe_sep_string from ner_v2.constant import LANGUAGE_DATA_DIRECTORY -TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value']) +TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value', 'preferred']) class TimeDetector(object): @@ -105,14 +105,17 @@ def set_bot_message(self, bot_message): self.bot_message = bot_message def init_regex_and_parser(self, data_directory_path): - timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONE_VARIANTS_CONSTANT_FILE) + timezone_variants_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) + columns = [TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, TIMEZONES_CODE_COLUMN_NAME, + TIMEZONES_PREFERRED_REGION_COLUMN_NAME] if os.path.exists(timezone_variants_data_path): - timezone_variants_df = pd.read_csv(timezone_variants_data_path, encoding='utf-8') + timezone_variants_df = pd.read_csv(timezone_variants_data_path, usecols=columns, encoding='utf-8') for index, row in timezone_variants_df.iterrows(): tz_name_variants = get_list_from_pipe_sep_string(row[TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME]) - value = row[TIMEZONE_VARIANTS_VALUE_COLUMN_NAME] + value = row[TIMEZONES_CODE_COLUMN_NAME] + preferred = row[TIMEZONES_PREFERRED_REGION_COLUMN_NAME] for tz_name in tz_name_variants: - self.timezones_map[tz_name] = TimezoneVariants(value=value) + self.timezones_map[tz_name] = TimezoneVariants(value=value, preferred=preferred) def convert_to_pytz_format(self, timezone_variant): """ @@ -124,13 +127,14 @@ def convert_to_pytz_format(self, timezone_variant): data_directory_path = os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), 
LANGUAGE_DATA_DIRECTORY) timezone_data_path = os.path.join(data_directory_path, TIMEZONES_CONSTANT_FILE) + columns = [TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME] if os.path.exists(timezone_data_path): - timezones_df = pd.read_csv(timezone_data_path, encoding='utf-8') - timezones_df.set_index(TIMEZONES_CODE_COLUMN_NAME, inplace=True) + timezones_df = pd.read_csv(timezone_data_path, usecols=columns, index_col=TIMEZONES_CODE_COLUMN_NAME, + encoding='utf-8') if re.search(self.timezone.zone, timezones_df.loc[timezone_code][TIMEZONES_ALL_REGIONS_COLUMN_NAME]): return self.timezone.zone else: - return timezones_df.loc[timezone_code][TIMEZONES_PREFERRED_REGION_COLUMN_NAME] + return self.timezones_map[timezone_variant].preferred return self.timezone.zone @@ -273,7 +277,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -352,7 +356,7 @@ def _detect_range_24_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' r'[\s-]*?({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -426,7 +430,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( 
r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -503,7 +507,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' r'([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 96e1b394d..8070a2aee 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -692,6 +692,27 @@ tests: range: "end" time_type: null original_text: "no particular preference" + - id: en_54 + message: "10:00 to 14:00" + bot_message: null + range_enabled: true + outputs: + - output_id: 1 + hh: 10 + mm: 0 + nn: "hrs" + tz: "UTC" + range: "start" + time_type: null + original_text: "10:00 to 14:00" + - output_id: 2 + hh: 14 + mm: 0 + nn: "hrs" + tz: "UTC" + range: "end" + time_type: null + original_text: "10:00 to 14:00" hi: - id: hi_1 message: "सुबह 10 बजे" From f7054ec98e38401054ee19778b047675b6766fa7 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 15:06:15 +0530 Subject: [PATCH 189/237] fix lint --- ner_v2/detectors/temporal/time/en/time_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 096b3eb1b..63b06c63c 100644 --- 
a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -277,7 +277,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -356,7 +356,7 @@ def _detect_range_24_hour_format(self, time_list=None, original_list=None): r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' r'[\s-]*?({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -430,7 +430,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list regex_patterns = re.compile( r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: @@ -507,7 +507,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) regex_patterns = re.compile( r'\b((?:after|aftr)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?' 
r'([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' - .format(timezone=self.timezone_choices) + .format(timezone=self.timezone_choices) ) patterns = regex_patterns.findall(self.processed_text.lower()) for pattern in patterns: From c4ecd2a161dd97f0cf3f5a3ebcce2b0e69348249 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 15:07:18 +0530 Subject: [PATCH 190/237] deleted csv file --- .../temporal/time/en/data/timezone_variations.csv | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 ner_v2/detectors/temporal/time/en/data/timezone_variations.csv diff --git a/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv b/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv deleted file mode 100644 index 4ed2a5e63..000000000 --- a/ner_v2/detectors/temporal/time/en/data/timezone_variations.csv +++ /dev/null @@ -1,10 +0,0 @@ -timezone_value,timezone_variants -IST,IST|Indian Time|Indian Standard Time -EST,EST|Eastern Standard Time|Eastern Time|ET|EDT -CST,CST|Central Standard Time|Central Time|CT|CDT -MST,MST|Mountain Standard Time|Mountain Time|MT|MDT -PST,PST|Pacific Standard Time|Pacific Time|PT|PDT -AKST,AKST|Alaska Standard Time|Alaska Time|AKDT -HST,HST|Hawaii Standard Time|HDT -HAST,HAST|Hawaii-Aleutian Standard Time|Hawaii Aleutian Standard Time|Hawaii Time|HADT -UTC,UTC|GMT|Greenwich Mean Time|Greenwich Time|Coordinated Universal Time \ No newline at end of file From b8a76b81f55d4bb5216d2b9c630cb5a89a57ac80 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 17:08:52 +0530 Subject: [PATCH 191/237] fix PR reviews --- ner_v2/detectors/temporal/constant.py | 11 ++++++----- ner_v2/detectors/temporal/time/en/time_detection.py | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/temporal/constant.py b/ner_v2/detectors/temporal/constant.py index ad6843043..2a78af95b 100644 --- a/ner_v2/detectors/temporal/constant.py +++ b/ner_v2/detectors/temporal/constant.py @@ -3,16 +3,17 
@@ DATETIME_CONSTANT_FILE = 'datetime_diff_constant.csv' NUMERALS_CONSTANT_FILE = 'numbers_constant.csv' -# timezone variants data file and its columns -TIMEZONE_VARIANTS_CONSTANT_FILE = 'timezone_variations.csv' -TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME = 'timezone_variants' -TIMEZONE_VARIANTS_VALUE_COLUMN_NAME = 'timezone_value' - # timezones data file and its columns +# name of the data file TIMEZONES_CONSTANT_FILE = 'timezones.csv' +# index of the csv file(try using the common standard) TIMEZONES_CODE_COLUMN_NAME = 'code' +# all regions in Olson format pytz TIMEZONES_ALL_REGIONS_COLUMN_NAME = 'all_regions' +# preferred region in the above all_regions (Olson format pytz) TIMEZONES_PREFERRED_REGION_COLUMN_NAME = 'preferred' +# Formal usage variants of the index +TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME = 'timezone_variants' CONSTANT_FILE_KEY = 'key' diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 63b06c63c..e14efee9a 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -11,6 +11,7 @@ from ner_v2.constant import LANGUAGE_DATA_DIRECTORY TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value', 'preferred']) +# Timezone_for_none = collections.namedtuple('TZ_None_case', ['zone']) class TimeDetector(object): From 9d21c1fec41cf632c7520d00ddfcf239564fe0ba Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 17:23:20 +0530 Subject: [PATCH 192/237] add more edge cases support --- .../detectors/numeral/number/en/data/numerals_constant.csv | 1 + ner_v2/detectors/numeral/number/en/data/units.csv | 2 +- .../numeral/number_range/en/data/number_range_keywords.csv | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv b/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv index 9c9310e69..b292c1842 100644 --- 
a/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv +++ b/ner_v2/detectors/numeral/number/en/data/numerals_constant.csv @@ -32,3 +32,4 @@ number,name_variants,number_value,number_type 100000,lakh|lakhs|lac|lacs|l,100000,scale 100000,million|mil|m,1000000,scale 10000000,crore|crores|c|cr,10000000,scale +100000000,billion|bil|b,1000000000,scale diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index ec35b5ab2..f79c862ce 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -1,5 +1,5 @@ unit_type,unit_value,unit_variants -currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ +currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | re | ₹ currency,dollar,Dollar | dollars | usd | $ currency,euro,Euro | euros | eur | € currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ diff --git a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv index d6fe599ea..bf96aa062 100644 --- a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv +++ b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv @@ -1,5 +1,5 @@ range_variants,position,range_type -above | abv | abov | more than | mor than | more den | mor den | greater than | greater ,-1,min +above | abv | abov | more than | mor than | more den | mor den | greater than | greater | over,-1,min max | upto | up to | around | below | less than | less | less den,-1,max -max,1,max -To | - ,0,min_max +max | onwards | and above | or above ,1,max +To | - ,0,min_max \ No newline at end of file From 530e7e287baab48d3f2cb59bb205c64278128873 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 17:26:13 +0530 Subject: 
[PATCH 193/237] lint fix --- .../numeral/number_range/standard_number_range_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index bf9e1c4d8..6d0f6d371 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -250,7 +250,7 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): temp = entity_value_max entity_value_max = entity_value_min entity_value_min = temp - except : + except: if float(entity_value_min) > float(entity_value_max): temp = entity_value_max entity_value_max = entity_value_min From 89766d6b7cf2f642c8b91b01f9802f5401916dd2 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 17:35:05 +0530 Subject: [PATCH 194/237] fix PR reviews --- ner_v2/detectors/numeral/number/en/data/units.csv | 2 +- .../number_range/standard_number_range_detector.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index f79c862ce..ec35b5ab2 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -1,5 +1,5 @@ unit_type,unit_value,unit_variants -currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | re | ₹ +currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ currency,dollar,Dollar | dollars | usd | $ currency,euro,Euro | euros | eur | € currency,pound sterling,Pound sterling | pound sterlings | quid | pounds | sterling | pound | gbp | £ diff --git a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py 
b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py index 6d0f6d371..92ee515cf 100644 --- a/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py +++ b/ner_v2/detectors/numeral/number_range/standard_number_range_detector.py @@ -245,16 +245,10 @@ def _get_number_range(self, min_part_match, max_part_match, full_match): return number_range, original_text if min_part_match and max_part_match: - try: - if int(entity_value_min) > int(entity_value_max): - temp = entity_value_max - entity_value_max = entity_value_min - entity_value_min = temp - except: - if float(entity_value_min) > float(entity_value_max): - temp = entity_value_max - entity_value_max = entity_value_min - entity_value_min = temp + if float(entity_value_min) > float(entity_value_max): + temp = entity_value_max + entity_value_max = entity_value_min + entity_value_min = temp original_text = self._get_original_text_from_tagged_text(full_match) if (entity_value_min or entity_value_max) and original_text: From fdd40cb59a13403bac0b27986b104985894a88a0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 17:50:20 +0530 Subject: [PATCH 195/237] make default tz None --- .../temporal/time/en/time_detection.py | 73 ++++++++++--------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index e14efee9a..db0f9f3df 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -67,7 +67,7 @@ class TimeDetector(object): text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, timezone='UTC'): + def __init__(self, entity_name, timezone=None): """Initializes a TimeDetector object with given entity_name and timezone Args: @@ -87,7 +87,10 @@ def __init__(self, entity_name, timezone='UTC'): self.original_time_text = [] 
self.tag = '__' + entity_name + '__' self.bot_message = None - self.timezone = get_timezone(timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None self.now_date = datetime.datetime.now(self.timezone) self.timezones_map = {} @@ -302,7 +305,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -321,7 +324,7 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t3), 'mm': int(t4), 'nn': str(ap2).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -380,7 +383,7 @@ def _detect_range_24_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': 'hrs', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -398,7 +401,7 @@ def _detect_range_24_hour_format(self, time_list=None, original_list=None): 'hh': int(t3), 'mm': int(t4), 'nn': 'hrs', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -454,7 +457,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -472,7 +475,7 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list 'hh': int(t2), 'mm': 0, 'nn': str(ap2).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else 
self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -531,7 +534,7 @@ def _detect_start_range_12_hour_format(self, time_list=None, original_list=None) 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -585,7 +588,7 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -636,7 +639,7 @@ def _detect_start_range_12_hour_format_without_min(self, time_list=None, origina 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -690,7 +693,7 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ 'hh': int(t1), 'mm': 0, 'nn': str(ap1).lower().strip('.'), - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -750,7 +753,7 @@ def _detect_12_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': str(ap).lower().strip('.'), - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] @@ -803,7 +806,7 @@ def _detect_12_hour_without_min(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': 0, 'nn': str(ap).lower().strip('.'), - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time['nn'] = 'am' if 'a' in time['nn'] else time['nn'] time['nn'] = 'pm' if 'p' in time['nn'] else time['nn'] @@ -860,7 +863,7 @@ def 
_detect_time_with_difference(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' - time['tz'] = self.timezone.zone + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -902,7 +905,7 @@ def _detect_time_with_difference_later(self, time_list=None, original_list=None) time[setter] = t1 time[antisetter] = 0 time['nn'] = 'df' - time['tz'] = self.timezone.zone + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -944,7 +947,7 @@ def _detect_time_with_every_x_hour(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE - time['tz'] = self.timezone.zone + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -977,7 +980,7 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): time[setter] = t1 time[antisetter] = 0 time['nn'] = EVERY_TIME_TYPE - time['tz'] = self.timezone.zone + time['tz'] = None if not self.timezone else self.timezone.zone time_list.append(time) original_list.append(original) return time_list, original_list @@ -1030,7 +1033,7 @@ def _detect_24_hour_optional_minutes_format(self, time_list=None, original_list= 'hh': int(t1), 'mm': int(t2), 'nn': 'hrs', - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1078,7 +1081,7 @@ def _detect_restricted_24_hour_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': meridiem, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1136,7 +1139,7 @@ def 
_detect_12_hour_word_format(self, time_list=None, original_list=None): time = { 'hh': t1, 'mm': t2, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } if pattern_am: time['nn'] = 'am' @@ -1197,7 +1200,7 @@ def _detect_12_hour_word_format2(self, time_list=None, original_list=None): time = { 'hh': t1, 'mm': 0, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } if pattern_am: time['nn'] = 'am' @@ -1257,7 +1260,7 @@ def _detect_24_hour_format(self, time_list=None, original_list=None): 'hh': t1, 'mm': t2, 'nn': meridiem, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1311,7 +1314,7 @@ def _detect_time_without_format(self, time_list=None, original_list=None): 'hh': int(t1), 'mm': int(t2), 'nn': meridiem, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1373,7 +1376,7 @@ def _detect_time_without_format_preceeding(self, time_list=None, original_list=N 'hh': int(t1), 'mm': int(t2), 'nn': meridiem, - 'tz': tz or self.timezone.zone + 'tz': tz or (None if not self.timezone else self.timezone.zone), } time_list.append(time) original_list.append(original) @@ -1456,7 +1459,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1465,7 +1468,7 @@ def _get_morning_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 0, 'nn': 'am', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1513,7 +1516,7 @@ def _get_afternoon_time_range(self, time_list=None, 
original_list=None): 'hh': 11, 'mm': 0, 'nn': 'am', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1522,7 +1525,7 @@ def _get_afternoon_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1570,7 +1573,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 5, 'mm': 0, 'nn': 'pm', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1579,7 +1582,7 @@ def _get_evening_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1627,7 +1630,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 9, 'mm': 0, 'nn': 'pm', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1636,7 +1639,7 @@ def _get_night_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'end', 'time_type': time_type } @@ -1687,7 +1690,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): 'hh': 12, 'mm': 0, 'nn': 'am', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else self.timezone.zone), 'range': 'start', 'time_type': time_type } @@ -1696,7 +1699,7 @@ def _get_default_time_range(self, time_list=None, original_list=None): 'hh': 11, 'mm': 59, 'nn': 'pm', - 'tz': tz or self.timezone.zone, + 'tz': tz or (None if not self.timezone else 
self.timezone.zone), 'range': 'end', 'time_type': time_type } From 104aaa4feb2aecdd75794f684982d8e9ff8bc50c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 9 Oct 2019 22:31:48 +0530 Subject: [PATCH 196/237] add none tz case --- ner_v2/api.py | 2 +- .../temporal/time/en/time_detection.py | 7 +++--- .../temporal/time/standard_time_regex.py | 25 ++++++++++++------- .../detectors/temporal/time/time_detection.py | 7 ++++-- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index 184cb1eba..222aa15b8 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -211,7 +211,7 @@ def time(request): parameters_dict = get_parameters_dictionary(request) ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) - timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' + timezone = parameters_dict[PARAMETER_TIMEZONE] or None form_check = True if parameters_dict[PARAMETER_STRUCTURED_VALUE] else False range_enabled = True if parameters_dict[PARAMETER_RANGE_ENABLED] else False time_detection = TimeDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index db0f9f3df..b1d773436 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -3,6 +3,7 @@ import collections import pandas as pd import os +import pytz from ner_v2.detectors.temporal.constant import AM_MERIDIEM, PM_MERIDIEM, TWELVE_HOUR, EVERY_TIME_TYPE,\ TIMEZONES_CONSTANT_FILE, TIMEZONE_VARIANTS_VARIANTS_COLUMN_NAME, \ TIMEZONES_CODE_COLUMN_NAME, TIMEZONES_ALL_REGIONS_COLUMN_NAME, \ @@ -11,7 +12,6 @@ from ner_v2.constant import LANGUAGE_DATA_DIRECTORY TimezoneVariants = collections.namedtuple('TimezoneVariant', ['value', 'preferred']) -# Timezone_for_none = collections.namedtuple('TZ_None_case', ['zone']) class TimeDetector(object): @@ -91,7 +91,6 @@ def __init__(self, entity_name, 
timezone=None): self.timezone = get_timezone(timezone) else: self.timezone = None - self.now_date = datetime.datetime.now(self.timezone) self.timezones_map = {} self.init_regex_and_parser(os.path.join((os.path.dirname(os.path.abspath(__file__)).rstrip(os.sep)), @@ -1404,7 +1403,9 @@ def _get_meridiem(self, hours, mins, timezone): if timezone is not None: new_timezone = get_timezone(timezone) else: - new_timezone = self.timezone + # If no TZ(neither from api call not from the user message) is given, use 'UTC' + new_timezone = self.timezone or pytz.timezone('UTC') + current_datetime = datetime.datetime.now(new_timezone) current_hour = current_datetime.hour current_min = current_datetime.minute diff --git a/ner_v2/detectors/temporal/time/standard_time_regex.py b/ner_v2/detectors/temporal/time/standard_time_regex.py index 91bc1a217..104cd58a4 100644 --- a/ner_v2/detectors/temporal/time/standard_time_regex.py +++ b/ner_v2/detectors/temporal/time/standard_time_regex.py @@ -5,6 +5,8 @@ import os import re +import pytz + from chatbot_ner.config import ner_logger from ner_v2.detectors.temporal.constant import (DATETIME_CONSTANT_FILE, ADD_DIFF_DATETIME_TYPE, NUMERALS_CONSTANT_FILE, TIME_CONSTANT_FILE, REF_DATETIME_TYPE, HOUR_TIME_TYPE, @@ -14,7 +16,7 @@ class BaseRegexTime(object): - def __init__(self, entity_name, data_directory_path, timezone='UTC'): + def __init__(self, entity_name, data_directory_path, timezone=None): """ Base Regex class which will be imported by language date class by giving their data folder path This will create standard regex and their parser to detect date for given language. 
@@ -27,7 +29,10 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'): self.processed_text = '' self.entity_name = entity_name self.tag = '__' + entity_name + '__' - self.timezone = get_timezone(timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None self.now_date = datetime.datetime.now(tz=self.timezone) self.bot_message = None @@ -45,8 +50,7 @@ def __init__(self, entity_name, data_directory_path, timezone='UTC'): # Variable to define default order in which these regex will work self.detector_preferences = [ self._detect_time_with_coln_format, - self._detect_hour_minute - ] + self._detect_hour_minute] def set_bot_message(self, bot_message): """ @@ -190,8 +194,11 @@ def _get_meridiem(self, hours, mins, original_text): Returns str: returns the meridiem type whether its am and pm """ - current_hour = self.now_date.hour - current_min = self.now_date.minute + # If no TZ(neither from api call not from the user message) is given, use 'UTC' + new_timezone = self.timezone or pytz.timezone('UTC') + current_datetime = datetime.datetime.now(new_timezone) + current_hour = current_datetime.hour + current_min = current_datetime.minute if hours == 0 or hours >= TWELVE_HOUR: return 'hrs' @@ -268,7 +275,7 @@ def _detect_hour_minute(self, time_list, original_list): 'hh': int(hh), 'mm': int(mm), 'nn': nn, - 'tz': self.timezone.zone + 'tz': None if not self.timezone else self.timezone.zone } time_list.append(time) @@ -295,7 +302,7 @@ def _detect_time_with_coln_format(self, time_list, original_list): >>> time_list = [] >>> original_list = [] >>> preprocessed_text = u'आज 05:40 बजे अजना' - >>> _detect_time_with_coln_format(time_list, original_list) + >>> self._detect_time_with_coln_format(time_list, original_list) >>> ([{'hh': 5, 'mm': 40, 'nn': 'pm', 'time_type': None}], ["05:40"]) @@ -317,7 +324,7 @@ def _detect_time_with_coln_format(self, time_list, original_list): time = { 'hh': hh, 'mm': mm, - 'tz': self.timezone.zone, + 
'tz': None if not self.timezone else self.timezone.zone, 'time_type': None } diff --git a/ner_v2/detectors/temporal/time/time_detection.py b/ner_v2/detectors/temporal/time/time_detection.py index e7df5c386..d85e708ec 100644 --- a/ner_v2/detectors/temporal/time/time_detection.py +++ b/ner_v2/detectors/temporal/time/time_detection.py @@ -41,7 +41,7 @@ def get_supported_languages(): supported_languages.append(_dir) return supported_languages - def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG): + def __init__(self, entity_name='time', timezone=None, language=ENGLISH_LANG): """Initializes a TimeDetector object with given entity_name and timezone Args: @@ -61,7 +61,10 @@ def __init__(self, entity_name='time', timezone='UTC', language=ENGLISH_LANG): self.time = [] self.original_time_text = [] self.tag = '__' + entity_name + '__' - self.timezone = get_timezone(timezone) + if timezone: + self.timezone = get_timezone(timezone) + else: + self.timezone = None self.language = language try: From 8ff3fa06d4321cc4f50486c548ea331b8ec71cbb Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 10:42:04 +0530 Subject: [PATCH 197/237] return none tz if invalid tz is given --- ner_v2/detectors/temporal/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/utils.py b/ner_v2/detectors/temporal/utils.py index 2a82a8d0b..a90c647af 100644 --- a/ner_v2/detectors/temporal/utils.py +++ b/ner_v2/detectors/temporal/utils.py @@ -265,7 +265,7 @@ def get_next_date_with_dd(dd, after_datetime): return None, None, None -def get_timezone(timezone, ignore_errors=True): +def get_timezone(timezone, ignore_errors=False): # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo """ Return a datetime.tzinfo (pytz timezone object). 
If `timezone` is a str, try constructing a pytz @@ -275,7 +275,7 @@ def get_timezone(timezone, ignore_errors=True): Args: timezone (str or datetime.tzinfo): Either a valid timezone string or datetime.tzinfo object ignore_errors (bool, optional): when set to True, ignore errors and return a pytz.UTC when error occurs. When - set to False, raise exception when invalid timezone is given. Defaults to True. + set to False, raise exception when invalid timezone is given. Defaults to False. Returns: datetime.tzinfo: A pytz timezone object @@ -294,7 +294,7 @@ def get_timezone(timezone, ignore_errors=True): timezone = pytz.timezone('UTC') ner_logger.debug('Using "UTC" as default timezone') else: - raise + return None return timezone From 47815918dcaafcbf1ae10a1487452194841459aa Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 14:53:58 +0530 Subject: [PATCH 198/237] fix data csv --- .../numeral/number_range/en/data/number_range_keywords.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv index bf96aa062..924299a1a 100644 --- a/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv +++ b/ner_v2/detectors/numeral/number_range/en/data/number_range_keywords.csv @@ -1,5 +1,6 @@ range_variants,position,range_type above | abv | abov | more than | mor than | more den | mor den | greater than | greater | over,-1,min +onwards | and above | or above | or more | or great | or abov | or abv,1,min max | upto | up to | around | below | less than | less | less den,-1,max -max | onwards | and above | or above ,1,max +max ,1,max To | - ,0,min_max \ No newline at end of file From 426854d07cc85c8db8f22af2e1e2f6171b0902c8 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 15:52:36 +0530 Subject: [PATCH 199/237] added till variants in range --- .../temporal/time/en/time_detection.py | 31 
++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index b1d773436..59a6309ea 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -277,8 +277,9 @@ def _detect_range_12_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' - r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' + r'\b((?:from)?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' + r'(pm|am|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-|till|until|untill|upto|up to)' + r'[\s-]*?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) @@ -356,8 +357,9 @@ def _detect_range_24_hour_format(self, time_list=None, original_list=None): if original_list is None: original_list = [] regex_patterns = re.compile( - r'\b(({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' - r'[\s-]*?({timezone})?\s*(?:to|-)[\s-]*?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'\b((?:from)?({timezone})?\s*(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' + r'[\s-]*?({timezone})?\s*(?:to|-|till|until|untill|upto|up to)[\s-]*?({timezone})?\s*' + r'(00?|0?[2-9]|0?1[0-9]?|2[0-3])[:.\s]?([0-5][0-9])' r'[\s-]*?({timezone})?)(?!\s*(?:am|pm|a\.m\.?|p\.m\.?|(?:{timezone})|\d))' .format(timezone=self.timezone_choices) ) @@ -431,7 +433,8 @@ def _detect_range_12_hour_format_without_min(self, time_list=None, original_list if original_list is None: original_list = [] regex_patterns = re.compile( - 
r'\b(({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*(?:to|-)' + r'\b((?:from)?({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)[\s-]*?({timezone})?\s*' + r'(?:to|-|till|until|untill|upto|up to)' r'\s*({timezone})?[\s-]*?(0?[2-9]|0?1[0-2]?)[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices) ) @@ -563,8 +566,9 @@ def _detect_end_range_12_hour_format(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((?:before|bfre)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)[\s-]*' - r'(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' + patterns = re.findall(r'\b((?:before|bfre|till|until|untill|upto|up to)[\s-]*({timezone})?\s*' + r'(0?[2-9]|0?1[0-2]?)[\s-]*(?::|\.|\s)?[\s-]*?([0-5][0-9])[\s-]*?' + r'(pm|am|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) for pattern in patterns: @@ -668,7 +672,8 @@ def _detect_end_range_12_hour_format_without_min(self, time_list=None, original_ time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((?:before|bfore)[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' + patterns = re.findall(r'\b((?:before|bfore|till|until|untill|upto|up to)' + r'[\s-]*({timezone})?\s*(0?[2-9]|0?1[0-2]?)' r'[\s-]*(am|pm|a\.m\.?|p\.m\.?)\s*({timezone})?)\b' .format(timezone=self.timezone_choices), self.processed_text.lower()) @@ -968,11 +973,14 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((once|onc|1se)\s*(in)?\s*(\d+)\s?(day|days))\b', + patterns = re.findall(r'\b((once|onc|1se)\s*(in|every|evry)?\s*(\d+|a)\s?(day|days))\b', self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() - t1 = 24 * int(pattern[3]) + if type(pattern[3]) is int: 
+ t1 = 24 * int(pattern[3]) + else: + t1 = 24 setter = "hh" antisetter = "mm" time = dict() @@ -1124,7 +1132,8 @@ def _detect_12_hour_word_format(self, time_list=None, original_list=None): patterns = re.findall(r'\b((0?[1-9]|1[0-2])[:.\s]([0-5][0-9]))(?!\s?(?:am|pm|a\.m\.?|p\.m\.?|\d))', self.processed_text.lower()) pattern_am = re.findall(r'\s(morning|early|subah|mrng|mrning|savere)\s', self.processed_text.lower()) - pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham)\s', self.processed_text.lower()) + pattern_pm = re.findall(r'\s(noon|afternoon|evening|evng|evning|sham|lunch|dinner)\s', + self.processed_text.lower()) pattern_night = re.findall(r'\s(night|nite|tonight|latenight|tonit|nit|rat)\s', self.processed_text.lower()) pattern_tz = re.findall(r'(?:\b|[^a-zA-Z])({timezone})\b'.format(timezone=self.timezone_choices), self.processed_text.lower()) From 13abf95a7e445fa7aecd3379a97b921f0cf294ee Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 16:07:26 +0530 Subject: [PATCH 200/237] fix once in a day case --- ner_v2/detectors/temporal/time/en/time_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index 59a6309ea..a1ff1dad7 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -973,13 +973,13 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): time_list = [] if original_list is None: original_list = [] - patterns = re.findall(r'\b((once|onc|1se)\s*(in|every|evry)?\s*(\d+|a)\s?(day|days))\b', + patterns = re.findall(r'\b((once|onc|1se)\s*(in|every|evry|in every)?\s*(\d+|a)\s?(day|days))\b', self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() - if type(pattern[3]) is int: + try: t1 = 24 * int(pattern[3]) - else: + except: t1 = 24 setter = "hh" antisetter = "mm" 
From f6b95e97ef0f1db130aeb7518ea8c2d68591fc93 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 16:14:11 +0530 Subject: [PATCH 201/237] fix test cases --- ner_v2/tests/temporal/time/time_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index 8070a2aee..f8ef49df4 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -578,7 +578,7 @@ tests: tz: "UTC" range: "start" time_type: null - original_text: "12:30 am to 2:30 pm" + original_text: "from 12:30 am to 2:30 pm" - output_id: 2 hh: 2 mm: 30 From abf1e8f800281a9c6a7e4a8363995ebf3a2e5e2b Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 16:16:09 +0530 Subject: [PATCH 202/237] fix test cases --- ner_v2/tests/temporal/time/time_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/temporal/time/time_ner_tests.yaml b/ner_v2/tests/temporal/time/time_ner_tests.yaml index f8ef49df4..de067501d 100644 --- a/ner_v2/tests/temporal/time/time_ner_tests.yaml +++ b/ner_v2/tests/temporal/time/time_ner_tests.yaml @@ -586,7 +586,7 @@ tests: tz: "UTC" range: "end" time_type: null - original_text: "12:30 am to 2:30 pm" + original_text: "from 12:30 am to 2:30 pm" - id: en_49 message: "Sessions begin at noon" bot_message: null From 90407630a350ba91337d0daa8b898ce8a08f5b51 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 10 Oct 2019 18:31:44 +0530 Subject: [PATCH 203/237] remove try catch in _detect_time_with_once_in_x_day --- ner_v2/detectors/temporal/time/en/time_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/temporal/time/en/time_detection.py b/ner_v2/detectors/temporal/time/en/time_detection.py index a1ff1dad7..3cfb96b55 100644 --- a/ner_v2/detectors/temporal/time/en/time_detection.py +++ 
b/ner_v2/detectors/temporal/time/en/time_detection.py @@ -977,10 +977,10 @@ def _detect_time_with_once_in_x_day(self, time_list=None, original_list=None): self.processed_text.lower()) for pattern in patterns: original = pattern[0].strip() - try: - t1 = 24 * int(pattern[3]) - except: + if not pattern[3] or pattern[3] == "a": t1 = 24 + else: + t1 = 24 * int(pattern[3]) setter = "hh" antisetter = "mm" time = dict() From 9b4551522902731d089b39f7276b6d157c0449fc Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 14 Oct 2019 16:10:32 +0530 Subject: [PATCH 204/237] modify get_number_from_number_word --- ner_v2/detectors/numeral/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index a00a08ab4..bb24dceba 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -80,7 +80,7 @@ def get_number_from_number_word(text, number_word_dict): result_text, current_text = '', '' # handle where only scale is mentioned without unit, for ex - thousand(for 1000), hundred(for 100) - current = 1 if (scale > 0 and current == 0 and increment == 0) else current + current = 1 if (scale > 0 and current == 0 and increment == 0 and word != 'zero') else current current = current * scale + increment current_text += part if scale > 1: From 82160c1446766d15b735a184ee11b940754ef98c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 14 Oct 2019 17:00:17 +0530 Subject: [PATCH 205/237] modify get_number_from_number_word --- ner_v2/detectors/numeral/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/numeral/utils.py b/ner_v2/detectors/numeral/utils.py index bb24dceba..d248aa419 100644 --- a/ner_v2/detectors/numeral/utils.py +++ b/ner_v2/detectors/numeral/utils.py @@ -80,7 +80,7 @@ def get_number_from_number_word(text, number_word_dict): result_text, current_text = '', '' # handle where only scale is mentioned without unit, for ex - 
thousand(for 1000), hundred(for 100) - current = 1 if (scale > 0 and current == 0 and increment == 0 and word != 'zero') else current + current = 1 if (scale > 1 and current == 0 and increment == 0) else current current = current * scale + increment current_text += part if scale > 1: From 4b4ab0c9c925251e34b2fbc40b1369cc59662d28 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 12:17:49 +0530 Subject: [PATCH 206/237] remove word-number for now --- .../phone_number/phone_number_detection.py | 43 +------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 091fd8790..9eec9225e 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector -from ner_v2.detectors.numeral.utils import get_number_from_number_word from language_utilities.constant import ENGLISH_LANG import collections import re @@ -72,40 +71,6 @@ def get_country_code_from_locale(self): match = regex_pattern.findall(self.locale) self.country_code = match[0].upper() - def convert_words_to_numbers(self, text): - """ - :param text: user message - :return: converted user message with words replaced with numbers - """ - numbers_dict = get_number_from_number_word(text, self.number_word_dict) - val = numbers_dict[0] - word = numbers_dict[1] - converted_sentence = text - x = zip(val, word) - unique_x = [] - for i in x: - if i not in unique_x: - unique_x.append(i) - - for j in unique_x: - pattern = re.compile(j[1], re.U) - converted_sentence = pattern.sub(string=converted_sentence, repl=str(j[0])) - - converted_sentence = re.sub(r'(double)(\s+)(\d)', r'\3\3', converted_sentence) - 
converted_sentence = re.sub(r'(triple)(\s+)(\d)', r'\3\3\3', converted_sentence) - - while re.search(r'(\d+)(\s+)(\d+)', converted_sentence): - converted_sentence = re.sub(r'(\d+)(\s+)(\d+)', r'\1\3', converted_sentence) - return converted_sentence - - def get_correct_original_text(self, original_text): - pattern = '' - for text_match in original_text: - pattern += ('(^.*)' + '(?:' + text_match + ')') - pattern += '(.*$)' - first_pattern = re.compile(pattern, re.U) - first_matches = first_pattern.findall(self.text) - def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -135,16 +100,10 @@ def detect_entity(self, text, **kwargs): """ self.get_country_code_from_locale() - original_text = text - if self.language == 'en': - self.text = self.convert_words_to_numbers(text) - else: - self.text = text + self.text = text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - # if original_text != self.text: - # self.get_correct_original_text(original_text) return self.phone, self.original_phone_text From 3e7da63ff136679498c60f844ec49c03bc020332 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 14:06:20 +0530 Subject: [PATCH 207/237] add locale=parameters_dict[PARAMETER_LOCALE] in api.py for phonenumber --- ner_v2/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index 222aa15b8..e8cb61e33 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -537,7 +537,8 @@ def phone_number(request): entity_output = phone_number_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + 
bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], + locale=parameters_dict[PARAMETER_LOCALE]) elif isinstance(message, (list, tuple)): entity_output = phone_number_detection.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) From 08ef9ce5f51e81837d342fd0dd3e131085e4a4ce Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 14:15:46 +0530 Subject: [PATCH 208/237] add locale=parameters_dict[PARAMETER_LOCALE] in api.py for phonenumber --- ner_v2/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index e8cb61e33..e6d7d230f 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -531,14 +531,14 @@ def phone_number(request): ner_logger.debug('Entity Name %s' % entity_name) ner_logger.debug('Source Language %s' % language) - phone_number_detection = PhoneDetector(entity_name=entity_name, language=language) + phone_number_detection = PhoneDetector(entity_name=entity_name, language=language, + locale=parameters_dict[PARAMETER_LOCALE]) message = parameters_dict[PARAMETER_MESSAGE] if isinstance(message, six.string_types): entity_output = phone_number_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - bot_message=parameters_dict[PARAMETER_BOT_MESSAGE], - locale=parameters_dict[PARAMETER_LOCALE]) + bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) elif isinstance(message, (list, tuple)): entity_output = phone_number_detection.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) From cc941f99727add00bb7f0330dc0f344b5b8f0929 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 15:11:23 +0530 Subject: [PATCH 209/237] add locale in ph tests --- .../phone_number/phone_number_ner_tests.yaml | 69 ++++++++++++++----- .../test_phone_number_detection.py | 3 +- 2 
files changed, 54 insertions(+), 18 deletions(-) diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index ca3c3cf73..155c039e9 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -2,107 +2,142 @@ tests: en: - id: en_1 message: "Set a reminder on 02226129854" + locale: "en-in" outputs: - original_text: "02226129854" output_id: 1 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' - id: en_2 message: "Set a reminder on 022 26129854" + locale: "en-in" outputs: - original_text: "022 26129854" output_id: 1 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' - id: en_3 message: "Call the number 9820334455" + locale: "en-in" outputs: - original_text: "9820334455" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: en_4 message: "Set a reminder on 919820334455" + locale: "en-in" outputs: - original_text: "919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_5 message: "Set a reminder on 91 9820334455" + locale: "en-in" outputs: - original_text: "91 9820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_6 message: "Set a reminder on +91 9820334455" + locale: "en-in" outputs: - original_text: "+91 9820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_7 message: "Set a reminder on +919820334455" + locale: "en-in" outputs: - original_text: "+919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_8 message: "Set a reminder on +919820334455" + locale: "en-in" outputs: - original_text: "+919820334455" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_9 message: "Set a reminder 
on 91 9820-3344-55" + locale: "en-in" outputs: - original_text: "91 9820-3344-55" output_id: 1 - value: "919820334455" + value: "9820334455" + country_calling_code: '91' - id: en_10 message: "Set a reminder on +1 (408) 912-6172" + locale: "en-in" outputs: - original_text: "+1 (408) 912-6172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_11 message: "Set a reminder on +1 408 9126172" + locale: "en-in" outputs: - original_text: "+1 408 9126172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_12 - message: "Set a reminder on 14089126172" + message: "Set a reminder on +14089126172" + locale: "en-in" outputs: - - original_text: "14089126172" + - original_text: "+14089126172" output_id: 1 - value: "14089126172" + value: "4089126172" + country_calling_code: '1' - id: en_13 - message: "Send 1000rs to 14089126172 and call 02226129854" + message: "Send 1000rs to +14089126172 and call 02226129854" + locale: "en-in" outputs: - original_text: "14089126172" output_id: 1 value: "14089126172" + country_calling_code: '1' - original_text: "02226129854" output_id: 2 - value: "02226129854" + value: "2226129854" + country_calling_code: '91' hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" + locale: "en-in" outputs: - original_text: "९८२०३३४४५५" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: hi_2 message: "मेरे लिए ५००र्स ९८ २०३३४४५५ पे भेज देना" + locale: "en-in" outputs: - original_text: "९८ २०३३४४५५" output_id: 1 value: "9820334455" + country_calling_code: '91' - id: hi_3 message: "मेरा लैंडलाइन नंबर ०२२२६१२९८५७ है" + locale: "en-in" outputs: - original_text: "०२२२६१२९८५७" output_id: 1 - value: "02226129857" + value: "2226129857" + country_calling_code: '91' - id: hi_4 message: "मेरा लैंडलाइन नंबर ०२२ २६१२९८५७ है" + locale: "en-in" outputs: - original_text: "०२२ २६१२९८५७" output_id: 1 - value: "02226129857" + value: "2226129857" + country_calling_code: 
'91' \ No newline at end of file diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index f522763dc..12035e9fd 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -47,7 +47,8 @@ def parse_expected_outputs(expected_outputs): def run_test(self): message = testcase["message"] - number_detector_object = PhoneDetector(entity_name='phone_number', language=language) + locale = testcase["locale"] + number_detector_object = PhoneDetector(entity_name='phone_number', language=language, locale=locale) phone_number_list, spans = number_detector_object.detect_entity(message) expected_phone_number_list, expected_spans = parse_expected_outputs(testcase["outputs"]) From 5e5ac402ac4ce5d153883a48bf4968f79ae4f309 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 16:31:34 +0530 Subject: [PATCH 210/237] add phone_num_dict in ph test --- .../phone_number/phone_number_detection.py | 15 --------------- .../phone_number/test_phone_number_detection.py | 9 ++++++--- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 9eec9225e..821031234 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -9,20 +9,6 @@ NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) -def create_number_word_dict(): - number_word_dictionary = {'one': NumberVariant(scale=1, increment=1), - 'two': NumberVariant(scale=1, increment=2), - 'three': NumberVariant(scale=1, increment=3), - 'four': NumberVariant(scale=1, increment=4), - 'five': NumberVariant(scale=1, increment=5), - 'six': NumberVariant(scale=1, increment=6), - 'seven': NumberVariant(scale=1, 
increment=7), - 'eight': NumberVariant(scale=1, increment=8), - 'nine': NumberVariant(scale=1, increment=9), - 'zero': NumberVariant(scale=1, increment=0)} - return number_word_dictionary - - class PhoneDetector(BaseDetector): """ This method is used to detect phone numbers present in text. The phone detector takes into @@ -52,7 +38,6 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): self.phone = [] self.original_phone_text = [] self.country_code = '' - self.number_word_dict = create_number_word_dict() @property def supported_languages(self): diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index 12035e9fd..e051e3114 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -34,14 +34,17 @@ def yaml_testsuite_generator(cls): @classmethod def get_yaml_test(cls, testcase, language, **kwargs): def parse_expected_outputs(expected_outputs): - phone_num_list, original_texts = [], [] + phone_num_dict, original_texts = {}, [] for expected_output in expected_outputs: original_text = \ expected_output["original_text"].lower().strip() if expected_output["original_text"] else None if original_text: - phone_num_list.append(str(expected_output["value"])) + phone_num_dict = { + 'phone_number': str(expected_output["value"]), + 'country_calling_code': str(expected_output["country_calling_code"]) + } original_texts.append(original_text) - return phone_num_list, original_texts + return phone_num_dict, original_texts failure_string_prefix = u"Test failed for\nText = {message}\nLanguage = {language}\n" From 7952b64dbce3a517f3ddd30e71f5d5e258661016 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 18:14:25 +0530 Subject: [PATCH 211/237] fix test file in ph --- .../tests/pattern/phone_number/test_phone_number_detection.py | 3 ++- 1 file changed, 2 insertions(+), 
1 deletion(-) diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index e051e3114..121cf1552 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -34,7 +34,7 @@ def yaml_testsuite_generator(cls): @classmethod def get_yaml_test(cls, testcase, language, **kwargs): def parse_expected_outputs(expected_outputs): - phone_num_dict, original_texts = {}, [] + phone_num_list, original_texts = [], [] for expected_output in expected_outputs: original_text = \ expected_output["original_text"].lower().strip() if expected_output["original_text"] else None @@ -43,6 +43,7 @@ def parse_expected_outputs(expected_outputs): 'phone_number': str(expected_output["value"]), 'country_calling_code': str(expected_output["country_calling_code"]) } + phone_num_list.append(phone_num_dict) original_texts.append(original_text) return phone_num_dict, original_texts From 8dd0e16fde9040f08e9682616b1462b02cbad3b1 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 22:09:38 +0530 Subject: [PATCH 212/237] fix ph test --- .../tests/pattern/phone_number/test_phone_number_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index 121cf1552..464c8269c 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -45,7 +45,7 @@ def parse_expected_outputs(expected_outputs): } phone_num_list.append(phone_num_dict) original_texts.append(original_text) - return phone_num_dict, original_texts + return phone_num_list, original_texts failure_string_prefix = u"Test failed for\nText = {message}\nLanguage = {language}\n" From 04ee0e54d30b641d25c7e7320ff52cb3843b416a Mon Sep 
17 00:00:00 2001 From: ruthvik-17 Date: Tue, 15 Oct 2019 22:11:34 +0530 Subject: [PATCH 213/237] fix ph test --- ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index 155c039e9..d65bc9df7 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -102,7 +102,7 @@ tests: outputs: - original_text: "14089126172" output_id: 1 - value: "14089126172" + value: "4089126172" country_calling_code: '1' - original_text: "02226129854" output_id: 2 From 56dab503c066c4d9453dae7c5ec1712eb0d931d6 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 16 Oct 2019 11:30:39 +0530 Subject: [PATCH 214/237] add-previous regex along with the libphone number --- .../phone_number/phone_number_detection.py | 187 +++++++++++++++++- .../phone_number/phone_number_ner_tests.yaml | 2 +- 2 files changed, 185 insertions(+), 4 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 821031234..7fe521027 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -37,7 +37,11 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): self.text = '' self.phone = [] self.original_phone_text = [] - self.country_code = '' + self.country_code = self.get_country_code_from_locale() + self.tagged_text = '' + self.processed_text = '' + self.country_code_dict = {'IN': 91, 'US': 1, 'GB': 44} + self.tag = '__' + self.entity_name + '__' @property def supported_languages(self): @@ -54,7 +58,7 @@ def get_country_code_from_locale(self): """ regex_pattern = re.compile('[-_](.*$)', re.U) match = 
regex_pattern.findall(self.locale) - self.country_code = match[0].upper() + return match[0].upper() def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -84,11 +88,188 @@ def detect_entity(self, text, **kwargs): [u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ - self.get_country_code_from_locale() self.text = text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) + if self.original_phone_text is None and self.country_code in self.country_code_dict: + self.phone, self.original_phone_text = self.detect_entity_from_regex(text) return self.phone, self.original_phone_text + + def detect_entity_from_regex(self, text, **kwargs): + """Detects phone numbers in the text string + Args: + text: string to extract entities from + **kwargs: it can be used to send specific arguments in future. + Returns: + self.phone (list): list consisting the detected phone numbers + self.original_phone_text (list): list containing their corresponding substrings in the original message. 
+ Examples: + text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' + p = PhoneDetector(entity_name='phone_number', language='en') + p.detect_entity(text=text) + (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) + text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' + p = PhoneDetector(entity_name='phone_number', language='hi') + p.detect_entity(text=text) + (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) + """ + + self.text = text + self.processed_text = self.text + self.tagged_text = self.text + + phone_number_original_list = self.get_number_regex() + + original_phone_texts = [p[0].strip() for p in phone_number_original_list] + original_phone_text = self.check_length(original_phone_texts=original_phone_texts) + clean_phone_list = [self.clean_phone_number(p) for p in original_phone_text] + phone = [self.get_number(phone) for phone in clean_phone_list] + + self.phone, self.original_phone_text = [], [] + for phone_number, original_phone_number in zip(phone, original_phone_text): + if len(phone_number) >= 12: + self.phone.append(self.check_for_country_code(phone_number, self.country_code)) + self.original_phone_text.append(original_phone_number) + elif len(phone_number) >= 10: + self.phone.append({'country_calling_code': None, 'phone_number': phone_number}) + self.original_phone_text.append(original_phone_number) + self.get_tagged_text() + + return self.phone, self.original_phone_text + + def get_digit_length(self, text): + return len(re.findall(pattern='\d', string=text, flags=re.U)) + + def check_length(self, original_phone_texts): + """ + This method is used to handle the corner case where consecutive numbers are present with + space within them. 
+ Args: + original_phone_texts (list): list of text substrings detected by the regex + Returns: + phone_number_list (list): list of phone numbers splitting based on length + Examples: + original_phone_texts = ['9820334415 91 9920441388', '9820551388982347'] + check_length(original_phone_texts=original_phone_texts) + >> ['9820334415', '91 9920441388'] + """ + phone_number_list_1, phone_number_list2 = [], [] + + for original_phone_text in original_phone_texts: + + if self.get_digit_length(text=original_phone_text) > 13: + phone_parts = original_phone_text.split() + visited = [0 for i in range(len(phone_parts))] + + for i in range(len(phone_parts)): + temp = '' + appended_parts = [] + + for j in range(i, len(phone_parts)): + if visited[j] == 0: + temp = temp + ' ' + phone_parts[j] + appended_parts.append(j) + + if 13 >= self.get_digit_length(text=temp) > 7: + phone_number_list_1.append(temp.strip()) + for m in appended_parts: + visited[m] = 1 + break + else: + phone_number_list2.append(original_phone_text) + phone_number_list_1.extend(phone_number_list2) + return phone_number_list_1 + + def check_for_country_code(self, phone_num, country_code='IN'): + """ + :param country_code: country code. 
default('in') + :param phone_num: the number which is to be checked for country code + :return: dict with country_code if it's in phone_num and phone_number without country code + Examples: + phone_num = '919123456789' + countryCallingCode = 'IN' + {countryCallingCode:"91",phone_number:"9123456789"} + """ + phone_dict = {} + check_country_regex = re.compile( + '^({country_code})'.format(country_code=self.country_code_dict[country_code]), re.U) + p = check_country_regex.findall(phone_num) + if len(p) == 1: + phone_dict['countryCallingCode'] = p[0] + phone_dict['phone'] = check_country_regex.sub(string=phone_num, repl='') + else: + phone_dict['countryCallingCode'] = None + phone_dict['phone'] = phone_num + + return phone_dict + + def get_number(self, phone): + """ + This method is used to convert phone numbers in language scripts other than English + to the English + Args: + phone (str): The string phone number which is detected and cleaned + Returns: + phone (str): The string phone number converted to English script + Examples: + phone = u'९१९८१९९८३१३२' + get_number(phone=phone) + '919819983132' + """ + phone_length = len(phone) + phone = str(int(phone)) + + if phone_length != len(phone): + phone = phone.zfill(phone_length) + + return phone + + def clean_phone_number(self, number): + """ + This method is used to clean the detected phone number. 
+ Args: + number (str): The original substring which is detected and is required for cleaning + Returns: + number (str): The number post cleaning + """ + # Remove (), -, whistespace, + + clean_regex = re.compile('([()\-\s\+]+)', re.U) + number = clean_regex.sub(string=number, repl='') + return number + + def get_number_regex(self): + + """ + This method is used to detect the phone number patterns from the provided text + Returns: + phone_number_list (list): list of patterns detected from the regex pattern + (each pattern: (complete original text, area code, number)) + (we further utitlize only the complete original text) + Example: + p = PhoneDetector(entity_name='phone_number', language='hi') + text = u'Set a reminder on +1 (408) 912-6172' + p.text = text + p.get_number_regex() + [(u'+1 (408) 912-6172', u'1', u'(408) 912-6172'), + (u'+91 9820334416', u'91', u'9820334416'), + (u'022 26129857', u'022', u'26129857')] + """ + phone_number_regex = re.compile( + r'((?:\(?\+(\d{1,2})\)?[\s\-\.]*)?((?=[\-\d()\s\.]{10,16}(?:[^\d]+|$))' + r'(?:[\d(]{1,20}(?:[\-)\s\.]*\d{1,20}){0,20}){1,20}))', re.U) + + phone_number_list = phone_number_regex.findall(self.text) + return phone_number_list + + def get_tagged_text(self): + """ + Replaces detected phone numbers with tag generated from entity_name used to initialize the object with + A final string with all phone numbers replaced will be stored in object's tagged_text attribute + A string with all phone numbers removed will be stored in object's processed_text attribute + """ + for detected_text in self.original_phone_text: + self.tagged_text = self.tagged_text.replace(detected_text, self.tag) + self.processed_text = self.processed_text.replace(detected_text, '') diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index d65bc9df7..fb751b14f 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ 
b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -100,7 +100,7 @@ tests: message: "Send 1000rs to +14089126172 and call 02226129854" locale: "en-in" outputs: - - original_text: "14089126172" + - original_text: "+14089126172" output_id: 1 value: "4089126172" country_calling_code: '1' From 0c569425c3c7206e5ec7c136cbed14d7d3cb4114 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 16 Oct 2019 14:55:52 +0530 Subject: [PATCH 215/237] add-previous regex along with the libphone number --- ner_v2/api.py | 2 ++ .../phone_number/phone_number_detection.py | 20 ++++++++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ner_v2/api.py b/ner_v2/api.py index e6d7d230f..b895c25d4 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -519,6 +519,7 @@ def phone_number(request): ] """ try: + parameters_dict = {} if request.method == "POST": parameters_dict = parse_post_request(request) ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) @@ -534,6 +535,7 @@ def phone_number(request): phone_number_detection = PhoneDetector(entity_name=entity_name, language=language, locale=parameters_dict[PARAMETER_LOCALE]) message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None if isinstance(message, six.string_types): entity_output = phone_number_detection.detect(message=message, structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 7fe521027..a4d0c91c5 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -21,7 +21,6 @@ class PhoneDetector(BaseDetector): """ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): - # Todo: Change default from india to get it from the bot. 
""" Args: entity_name (str): A string by which the detected numbers would be replaced with @@ -33,7 +32,10 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): super(PhoneDetector, self).__init__(language, locale) self.language = language self.entity_name = entity_name - self.locale = locale + if locale is None: + self.locale = 'en-IN' + else: + self.locale = locale self.text = '' self.phone = [] self.original_phone_text = [] @@ -94,7 +96,7 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - if self.original_phone_text is None and self.country_code in self.country_code_dict: + if self.original_phone_text == [] and self.country_code in self.country_code_dict: self.phone, self.original_phone_text = self.detect_entity_from_regex(text) return self.phone, self.original_phone_text @@ -183,9 +185,9 @@ def check_length(self, original_phone_texts): phone_number_list_1.extend(phone_number_list2) return phone_number_list_1 - def check_for_country_code(self, phone_num, country_code='IN'): + def check_for_country_code(self, phone_num, country_code): """ - :param country_code: country code. 
default('in') + :param country_code: country code :param phone_num: the number which is to be checked for country code :return: dict with country_code if it's in phone_num and phone_number without country code Examples: @@ -198,11 +200,11 @@ def check_for_country_code(self, phone_num, country_code='IN'): '^({country_code})'.format(country_code=self.country_code_dict[country_code]), re.U) p = check_country_regex.findall(phone_num) if len(p) == 1: - phone_dict['countryCallingCode'] = p[0] - phone_dict['phone'] = check_country_regex.sub(string=phone_num, repl='') + phone_dict['country_calling_code'] = p[0] + phone_dict['phone_number'] = check_country_regex.sub(string=phone_num, repl='') else: - phone_dict['countryCallingCode'] = None - phone_dict['phone'] = phone_num + phone_dict['country_calling_code'] = None + phone_dict['phone_number'] = phone_num return phone_dict From a765771f2eabefff9834e346fc55a5b7013bbc79 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 16 Oct 2019 15:11:44 +0530 Subject: [PATCH 216/237] check libphone number api --- .../pattern/phone_number/phone_number_detection.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index a4d0c91c5..45a7c3815 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -42,7 +42,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): self.country_code = self.get_country_code_from_locale() self.tagged_text = '' self.processed_text = '' - self.country_code_dict = {'IN': 91, 'US': 1, 'GB': 44} + self.country_code_dict = {'IN': '91', 'US': '1', 'GB': '44'} self.tag = '__' + self.entity_name + '__' @property @@ -96,8 +96,8 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), 
"phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - if self.original_phone_text == [] and self.country_code in self.country_code_dict: - self.phone, self.original_phone_text = self.detect_entity_from_regex(text) + # if self.original_phone_text == [] and self.country_code in self.country_code_dict: + # self.phone, self.original_phone_text = self.detect_entity_from_regex(text) return self.phone, self.original_phone_text def detect_entity_from_regex(self, text, **kwargs): @@ -136,7 +136,8 @@ def detect_entity_from_regex(self, text, **kwargs): self.phone.append(self.check_for_country_code(phone_number, self.country_code)) self.original_phone_text.append(original_phone_number) elif len(phone_number) >= 10: - self.phone.append({'country_calling_code': None, 'phone_number': phone_number}) + self.phone.append({'country_calling_code': self.country_code_dict[self.country_code], + 'phone_number': phone_number}) self.original_phone_text.append(original_phone_number) self.get_tagged_text() @@ -203,7 +204,7 @@ def check_for_country_code(self, phone_num, country_code): phone_dict['country_calling_code'] = p[0] phone_dict['phone_number'] = check_country_regex.sub(string=phone_num, repl='') else: - phone_dict['country_calling_code'] = None + phone_dict['country_calling_code'] = self.country_code_dict[self.country_code] phone_dict['phone_number'] = phone_num return phone_dict From 7666d05fff1f263cccea728546712c798db5e898 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 16 Oct 2019 17:45:10 +0530 Subject: [PATCH 217/237] mathch = none case --- .../phone_number/phone_number_detection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 45a7c3815..782dbc57e 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ 
b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -20,7 +20,7 @@ class PhoneDetector(BaseDetector): original_phone_text (list): list to store substrings of the text detected as phone numbers """ - def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): + def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): """ Args: entity_name (str): A string by which the detected numbers would be replaced with @@ -32,10 +32,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale='en-IN'): super(PhoneDetector, self).__init__(language, locale) self.language = language self.entity_name = entity_name - if locale is None: - self.locale = 'en-IN' - else: - self.locale = locale + self.locale = locale or 'en-IN' self.text = '' self.phone = [] self.original_phone_text = [] @@ -60,7 +57,10 @@ def get_country_code_from_locale(self): """ regex_pattern = re.compile('[-_](.*$)', re.U) match = regex_pattern.findall(self.locale) - return match[0].upper() + if match: + return match[0].upper() + else: + return 'IN' def detect_entity(self, text, **kwargs): """Detects phone numbers in the text string @@ -96,8 +96,8 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - # if self.original_phone_text == [] and self.country_code in self.country_code_dict: - # self.phone, self.original_phone_text = self.detect_entity_from_regex(text) + if self.original_phone_text == [] and self.country_code in self.country_code_dict: + self.phone, self.original_phone_text = self.detect_entity_from_regex(text) return self.phone, self.original_phone_text def detect_entity_from_regex(self, text, **kwargs): From 3e7761da75df4e82e14e2c75a66522ef4d581971 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Wed, 16 Oct 2019 18:35:44 +0530 Subject: [PATCH 218/237] added month short form data 
in various languages --- .../temporal/date/bn/data/date_constant.csv | 20 ++++++++-------- .../temporal/date/gu/data/date_constant.csv | 18 +++++++------- .../temporal/date/hi/data/date_constant.csv | 18 +++++++------- .../temporal/date/mr/data/date_constant.csv | 24 +++++++++---------- .../temporal/date/te/data/date_constant.csv | 16 ++++++------- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv index 6e6447c59..17c821920 100644 --- a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type শুক্রবার|Shukrobar,4,weekday শনিবার|Shonibar,5,weekday রবিবার|Rabibar|Robibar,6,weekday -Januyari|জানুয়ারী,1,month -Phebruyari|ফেব্রুয়ারি,2,month -March|মার্চ,3,month -Epril|এপ্রিল,4,month +জানু|Januyari|জানুয়ারী,1,month +ফেব|Phebruyari|ফেব্রুয়ারি,2,month +মার্চ|March|মার্চ,3,month +এপ্র|Epril|এপ্রিল,4,month Me|মে,5,month Jun|জুন,6,month -Juloi|জুলাই,7,month -Agast|অগাস্ট|আগস্ট,8,month -Septembar|সেপ্টেম্বর,9,month -Aktobar|অক্টোবর,10,month -Nabhembar|নভেম্বর,11,month -Disembar|ডিসেম্বর,12,month +জুল|Juloi|জুলাই,7,month +আগ|Agast|অগাস্ট|আগস্ট,8,month +সেপ্ট|Septembar|সেপ্টেম্বর,9,month +অক্টো|Aktobar|অক্টোবর,10,month +নভে|Nabhembar|নভেম্বর,11,month +ডিসে|Disembar|ডিসেম্বর,12,month diff --git a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv index e3bd92ac9..3f619a401 100644 --- a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv @@ -12,15 +12,15 @@ key,numeric_representation,date_type શુક્રવાર|શુક્રવારે|friday|shukravar|shukrawar|sukravar|sukrawar|shukravare|shukraware,4,weekday શનિવાર|શનિવારે|saturday|shanivar|shaniwar|saniwar|sanivar|shanivare|shaniware,5,weekday 
રવિવાર|રવિવારે|sunday|ravivar|raviwar|ravivare|raviware,6,weekday -જાન્યુઆરી|january|jan,1,month -ફેબ્રુઆરી|february|feb|febuary,2,month -માર્ચ|march|mar,3,month +જાન્યુ.|જાન્યુઆરી|january|jan,1,month +ફેબ્રુ.|ફેબ્રુઆરી|february|feb|febuary,2,month +એપ્રિ.|માર્ચ|march|mar,3,month એપ્રિલ|april|apr,4,month મે|may,5,month જૂન|જૂને|june|jun,6,month -જુલાઈ|જુલાઇ|july|jul,7,month -ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month -સપ્ટેમ્બર|september|sept|sep,9,month -ઓક્ટોબર|october|oct,10,month -નવેમ્બર|november|nov,11,month -ડિસેમ્બર|december|dec,12,month +જુલા.|જુલાઈ|જુલાઇ|july|jul,7,month +ઑગ.|ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month +સપ્ટે.|સપ્ટેમ્બર|september|sept|sep,9,month +ઑક્ટ્.|ઓક્ટોબર|october|oct,10,month +નવે.|નવેમ્બર|november|nov,11,month +ડિસે.|ડિસેમ્બર|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv index ec54eaf7c..235faf5a2 100644 --- a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type शुक्रवार|friday|shukravar|shukrawar|sukravar|sukrawar,4,weekday शनिवार|saturday|shanivar|shaniwar|saniwar|sanivar,5,weekday रविवार|sunday|ravivar|raviwar,6,weekday -जनवरी|january|jan,1,month -फेब्रुअरी|फरवरी|february|feb|febuary,2,month +जन.|जनवरी|january|jan,1,month +फेब्रुअरी|फरवरी|फ़र.|फ़रवरी|february|feb|febuary,2,month मार्च|march|mar,3,month -अप्रैल|april|apr,4,month +अप्रैल|अप्रै.|april|apr,4,month मई|may,5,month जून|june|jun,6,month -जुलाई|july|jul,7,month -अगस्त|august|aug,8,month -सितम्बर|september|sept|sep,9,month -अक्टूबर|october|oct,10,month -नवंबर|november|nov,11,month -दिसंबर|december|dec,12,month +जुलाई|जुल.|july|jul,7,month +अगस्त|अग.|अगस्त|august|aug,8,month +सितम्बर|सित.|september|sept|sep,9,month +अक्टूबर|अक्टू.|october|oct,10,month +नवंबर|नव.|नवम्बर|november|nov,11,month +दिसंबर|दिस.|दिसम्बर|december|dec,12,month diff --git 
a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv index 94d89618f..a3f6b8e60 100644 --- a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv @@ -13,15 +13,15 @@ key,key,date_type शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravaar|shukravari|shukrawari,4,weekday शनिवार|शनिवारी|saturday|shanivar|shaniwar|shanivaar|shanivaari|shaniwari,5,weekday रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari|ravivaari,6,weekday -जानेवारी|january|jan|Janevari,1,month -फेब्रुवारी|Phebruvari|february|feb,2,month -मार्च|march|mar,3,month -एप्रिल|april|apr|Epril,4,month -मे|may|Me,5,month -जून|june|jun,6,month -जुलै|july|jul|Julai,7,month -ऑगस्ट|august|aug|ogast,8,month -सप्टेंबर|september|sept|sep,9,month -ऑक्टोबर|october|oct|oktobar,10,month -नोव्हेंबर|november|nov|nowenber,11,month -डिसेंबर|december|dec,12,month +जानेवारी|जाने.|january|jan|Janevari,1,month +फेब्रुवारी|फेब्रु.|Phebruvari|february|feb,2,month +मार्च|मार्च.|march|mar,3,month +एप्रिल|एप्रि.|april|apr|Epril,4,month +मे|मे.|may|Me,5,month +जून.|जून|june|jun,6,month +जुलै.|जुलै|july|jul|Julai,7,month +ऑग.|ऑगस्ट|august|aug|ogast,8,month +सप्टें.|सप्टेंबर|september|sept|sep,9,month +ऑक्टो.|ऑक्टोबर|october|oct|oktobar,10,month +नोव्हें.|नोव्हेंबर|november|nov|nowenber,11,month +डिसें.|डिसेंबर|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/te/data/date_constant.csv b/ner_v2/detectors/temporal/date/te/data/date_constant.csv index 001aa9289..ad59abc66 100644 --- a/ner_v2/detectors/temporal/date/te/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/te/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type శుక్రవారము|శుక్రవారం|శుక్ర|shukravaram|shukra varam|shukravaaram|shukra,4,weekday శనివారము|స్థిరవారము|మందవారము|shanivaram|shanivaaram|mandavaram|mandavaaram,5,weekday 
ఆదివారము|భానువారము|రవివారము|అధిత్యవారము|తొలివారము|aadivaram|adivaram|adi varam|adi vaaram|bhanuvaram|bhanu|bhanu vaaram|ravivaram|ravi vaaramu|ravi varam|adityavaram|adithya varam|tholivaram|tholi vaaram|sunday,6,weekday -మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month -ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month -మార్చ్|మార్|march|mar,3,month +జన.|మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month +ఫిబ్ర.|ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month +ఏప్రి.|మార్చ్|మార్|march|mar,3,month ఏప్రిల్|april|apr,4,month మే|may,5,month జూన్|jun|june,6,month జులై|jul|july,7,month -ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month -సెప్టెంబర్|september|sep|sept,9,month -అక్టోబర్|అక్టోబరు|oct|october,10,month -నవంబర్|november|nov,11,month -డిసెంబరు|డిసెంబర్|december|dec,12,month +ఆగ.|ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month +సెప్.|సెప్టెంబర్|september|sep|sept,9,month +అక్టో.|అక్టోబర్|అక్టోబరు|oct|october,10,month +నవ.|నవంబర్|november|nov,11,month +డిసె.|డిసెంబరు|డిసెంబర్|december|dec,12,month \ No newline at end of file From 6f8b684fa9669aac740ccd97008bfcdb12971879 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 17 Oct 2019 10:24:30 +0530 Subject: [PATCH 219/237] add possible intl codes --- .../phone_number/phone_number_detection.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 782dbc57e..17e18eb44 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -2,12 +2,9 @@ from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.number.number_detection import NumberDetector from language_utilities.constant import ENGLISH_LANG -import collections import re import phonenumbers -NumberVariant = collections.namedtuple('NumberVariant', ['scale', 'increment']) - class 
PhoneDetector(BaseDetector): """ @@ -40,6 +37,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): self.tagged_text = '' self.processed_text = '' self.country_code_dict = {'IN': '91', 'US': '1', 'GB': '44'} + self.possible_country_code_number_list = ['91', '1', '011 91', '44'] self.tag = '__' + self.entity_name + '__' @property @@ -91,19 +89,22 @@ def detect_entity(self, text, **kwargs): """ self.text = text + self.tagged_text = self.text + self.processed_text = self.text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - if self.original_phone_text == [] and self.country_code in self.country_code_dict: - self.phone, self.original_phone_text = self.detect_entity_from_regex(text) + self.get_tagged_text() + if self.country_code in self.country_code_dict: + self.phone, self.original_phone_text = self.detect_entity_with_regex(self.tagged_text) return self.phone, self.original_phone_text - def detect_entity_from_regex(self, text, **kwargs): + def detect_entity_with_regex(self, tagged_text, **kwargs): """Detects phone numbers in the text string Args: - text: string to extract entities from + tagged_text: string to extract entities from **kwargs: it can be used to send specific arguments in future. 
Returns: self.phone (list): list consisting the detected phone numbers @@ -119,7 +120,7 @@ def detect_entity_from_regex(self, text, **kwargs): (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ - self.text = text + self.text = tagged_text self.processed_text = self.text self.tagged_text = self.text @@ -130,12 +131,12 @@ def detect_entity_from_regex(self, text, **kwargs): clean_phone_list = [self.clean_phone_number(p) for p in original_phone_text] phone = [self.get_number(phone) for phone in clean_phone_list] - self.phone, self.original_phone_text = [], [] + # self.phone, self.original_phone_text = [], [] for phone_number, original_phone_number in zip(phone, original_phone_text): - if len(phone_number) >= 12: + if len(phone_number) >= 10: self.phone.append(self.check_for_country_code(phone_number, self.country_code)) self.original_phone_text.append(original_phone_number) - elif len(phone_number) >= 10: + else: self.phone.append({'country_calling_code': self.country_code_dict[self.country_code], 'phone_number': phone_number}) self.original_phone_text.append(original_phone_number) @@ -198,7 +199,7 @@ def check_for_country_code(self, phone_num, country_code): """ phone_dict = {} check_country_regex = re.compile( - '^({country_code})'.format(country_code=self.country_code_dict[country_code]), re.U) + r'^({country_code})\d{10}'.format(country_code='91|1|011 91'), re.U) p = check_country_regex.findall(phone_num) if len(p) == 1: phone_dict['country_calling_code'] = p[0] From f9bdfae4520d1361da35205bd8af98978297d38c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 17 Oct 2019 11:08:37 +0530 Subject: [PATCH 220/237] fix regex --- .../pattern/phone_number/phone_number_detection.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 17e18eb44..6d53da95d 100644 --- 
a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -134,7 +134,7 @@ def detect_entity_with_regex(self, tagged_text, **kwargs): # self.phone, self.original_phone_text = [], [] for phone_number, original_phone_number in zip(phone, original_phone_text): if len(phone_number) >= 10: - self.phone.append(self.check_for_country_code(phone_number, self.country_code)) + self.phone.append(self.check_for_country_code(phone_number)) self.original_phone_text.append(original_phone_number) else: self.phone.append({'country_calling_code': self.country_code_dict[self.country_code], @@ -187,9 +187,8 @@ def check_length(self, original_phone_texts): phone_number_list_1.extend(phone_number_list2) return phone_number_list_1 - def check_for_country_code(self, phone_num, country_code): + def check_for_country_code(self, phone_num): """ - :param country_code: country code :param phone_num: the number which is to be checked for country code :return: dict with country_code if it's in phone_num and phone_number without country code Examples: @@ -198,8 +197,8 @@ def check_for_country_code(self, phone_num, country_code): {countryCallingCode:"91",phone_number:"9123456789"} """ phone_dict = {} - check_country_regex = re.compile( - r'^({country_code})\d{10}'.format(country_code='91|1|011 91'), re.U) + check_country_regex = re.compile(r'^({country_code})\d{length}$'. 
+ format(country_code='911|1|011 91|91', length='{10}'), re.U) p = check_country_regex.findall(phone_num) if len(p) == 1: phone_dict['country_calling_code'] = p[0] From 5f2f4166d9b59c73246a0215418b5c4939f165c0 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 17 Oct 2019 11:16:55 +0530 Subject: [PATCH 221/237] fix regex --- .../detectors/pattern/phone_number/phone_number_detection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 6d53da95d..e30b84a11 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -202,7 +202,8 @@ def check_for_country_code(self, phone_num): p = check_country_regex.findall(phone_num) if len(p) == 1: phone_dict['country_calling_code'] = p[0] - phone_dict['phone_number'] = check_country_regex.sub(string=phone_num, repl='') + country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) + phone_dict['phone_number'] = country_code_sub_regex.sub(string=phone_num, repl='') else: phone_dict['country_calling_code'] = self.country_code_dict[self.country_code] phone_dict['phone_number'] = phone_num From 9b7d4a5c9385b7b64fc5d2ac5a329c8e90200163 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Thu, 17 Oct 2019 12:19:13 +0530 Subject: [PATCH 222/237] added data for indic months --- .../temporal/date/bn/data/date_constant.csv | 24 +++++++++---------- .../temporal/date/gu/data/date_constant.csv | 24 +++++++++---------- .../temporal/date/hi/data/date_constant.csv | 24 +++++++++---------- .../temporal/date/mr/data/date_constant.csv | 18 +++++++------- .../temporal/date/te/data/date_constant.csv | 24 +++++++++---------- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv 
b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv index 17c821920..912201be5 100644 --- a/ner_v2/detectors/temporal/date/bn/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/bn/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type শুক্রবার|Shukrobar,4,weekday শনিবার|Shonibar,5,weekday রবিবার|Rabibar|Robibar,6,weekday -জানু|Januyari|জানুয়ারী,1,month -ফেব|Phebruyari|ফেব্রুয়ারি,2,month -মার্চ|March|মার্চ,3,month -এপ্র|Epril|এপ্রিল,4,month -Me|মে,5,month -Jun|জুন,6,month -জুল|Juloi|জুলাই,7,month -আগ|Agast|অগাস্ট|আগস্ট,8,month -সেপ্ট|Septembar|সেপ্টেম্বর,9,month -অক্টো|Aktobar|অক্টোবর,10,month -নভে|Nabhembar|নভেম্বর,11,month -ডিসে|Disembar|ডিসেম্বর,12,month +জানু.|জানু|Januyari|জানুয়ারী,1,month +ফেব.|ফেব|Phebruyari|ফেব্রুয়ারি,2,month +মার্চ.|March|মার্চ,3,month +এপ্র.|এপ্র|Epril|এপ্রিল,4,month +মে.|Me|মে,5,month +জুন.|Jun|জুন,6,month +জুল.|জুল|Juloi|জুলাই,7,month +আগ.|আগ|Agast|অগাস্ট|আগস্ট,8,month +সেপ্ট.|সেপ্ট|Septembar|সেপ্টেম্বর,9,month +অক্টো.|অক্টো|Aktobar|অক্টোবর,10,month +নভে.|নভে|Nabhembar|নভেম্বর,11,month +ডিসে.|ডিসে|Disembar|ডিসেম্বর,12,month diff --git a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv index 3f619a401..cc808eacb 100644 --- a/ner_v2/detectors/temporal/date/gu/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/gu/data/date_constant.csv @@ -12,15 +12,15 @@ key,numeric_representation,date_type શુક્રવાર|શુક્રવારે|friday|shukravar|shukrawar|sukravar|sukrawar|shukravare|shukraware,4,weekday શનિવાર|શનિવારે|saturday|shanivar|shaniwar|saniwar|sanivar|shanivare|shaniware,5,weekday રવિવાર|રવિવારે|sunday|ravivar|raviwar|ravivare|raviware,6,weekday -જાન્યુ.|જાન્યુઆરી|january|jan,1,month -ફેબ્રુ.|ફેબ્રુઆરી|february|feb|febuary,2,month -એપ્રિ.|માર્ચ|march|mar,3,month -એપ્રિલ|april|apr,4,month -મે|may,5,month -જૂન|જૂને|june|jun,6,month -જુલા.|જુલાઈ|જુલાઇ|july|jul,7,month -ઑગ.|ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month 
-સપ્ટે.|સપ્ટેમ્બર|september|sept|sep,9,month -ઑક્ટ્.|ઓક્ટોબર|october|oct,10,month -નવે.|નવેમ્બર|november|nov,11,month -ડિસે.|ડિસેમ્બર|december|dec,12,month +જાન્યુ.|જાન્યુ|જાન્યુઆરી|january|jan,1,month +ફેબ્રુ.|ફેબ્રુ|ફેબ્રુઆરી|february|feb|febuary,2,month +એપ્રિ.|એપ્રિ|માર્ચ|march|mar,3,month +એપ્રિલ|એપ્રિલ.|april|apr,4,month +મે|મે.|may,5,month +જૂન|જૂન.|જૂને|june|jun,6,month +જુલા.|જુલા|જુલાઈ|જુલાઇ|july|jul,7,month +ઑગ.|ઑગ|ઓગસ્ટ|ઑગષ્ટ|august|aug,8,month +સપ્ટે.|સપ્ટે|સપ્ટેમ્બર|september|sept|sep,9,month +ઑક્ટ્.|ઑક્ટ્|ઓક્ટોબર|october|oct,10,month +નવે.|નવે|નવેમ્બર|november|nov,11,month +ડિસે.|ડિસે|ડિસેમ્બર|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv index 235faf5a2..8e30d3d98 100644 --- a/ner_v2/detectors/temporal/date/hi/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/hi/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type शुक्रवार|friday|shukravar|shukrawar|sukravar|sukrawar,4,weekday शनिवार|saturday|shanivar|shaniwar|saniwar|sanivar,5,weekday रविवार|sunday|ravivar|raviwar,6,weekday -जन.|जनवरी|january|jan,1,month -फेब्रुअरी|फरवरी|फ़र.|फ़रवरी|february|feb|febuary,2,month -मार्च|march|mar,3,month -अप्रैल|अप्रै.|april|apr,4,month -मई|may,5,month -जून|june|jun,6,month -जुलाई|जुल.|july|jul,7,month -अगस्त|अग.|अगस्त|august|aug,8,month -सितम्बर|सित.|september|sept|sep,9,month -अक्टूबर|अक्टू.|october|oct,10,month -नवंबर|नव.|नवम्बर|november|nov,11,month -दिसंबर|दिस.|दिसम्बर|december|dec,12,month +जन.|जन|जनवरी|january|jan,1,month +फेब्रुअरी|फरवरी|फ़र.|फ़र|फ़रवरी|february|feb|febuary,2,month +मार्च|मार्च.|march|mar,3,month +अप्रैल|अप्रै.|अप्रै|april|apr,4,month +मई|मई.|may,5,month +जून|जून.|june|jun,6,month +जुलाई|जुल.|जुल|july|jul,7,month +अगस्त|अग.|अग|अगस्त|august|aug,8,month +सितम्बर|सित.|सित|september|sept|sep,9,month +अक्टूबर|अक्टू.|अक्टू|october|oct,10,month +नवंबर|नव.|नव|नवम्बर|november|nov,11,month 
+दिसंबर|दिस.|दिस|दिसम्बर|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv index a3f6b8e60..7468eb0c8 100644 --- a/ner_v2/detectors/temporal/date/mr/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/mr/data/date_constant.csv @@ -13,15 +13,15 @@ key,key,date_type शुक्रवार|शुक्रवारी|friday|shukravar|shukrawar|shukravaar|shukravari|shukrawari,4,weekday शनिवार|शनिवारी|saturday|shanivar|shaniwar|shanivaar|shanivaari|shaniwari,5,weekday रविवार|रविवारी|sunday|ravivar|raviwar|ravivari|raviwari|ravivaari,6,weekday -जानेवारी|जाने.|january|jan|Janevari,1,month -फेब्रुवारी|फेब्रु.|Phebruvari|february|feb,2,month -मार्च|मार्च.|march|mar,3,month -एप्रिल|एप्रि.|april|apr|Epril,4,month +जानेवारी|जाने.|जाने|january|jan|Janevari,1,month +फेब्रुवारी|फेब्रु.|फेब्रु|Phebruvari|february|feb,2,month +मार्च|मार्च.|मार्च|march|mar,3,month +एप्रिल|एप्रि.|एप्रि|april|apr|Epril,4,month मे|मे.|may|Me,5,month जून.|जून|june|jun,6,month जुलै.|जुलै|july|jul|Julai,7,month -ऑग.|ऑगस्ट|august|aug|ogast,8,month -सप्टें.|सप्टेंबर|september|sept|sep,9,month -ऑक्टो.|ऑक्टोबर|october|oct|oktobar,10,month -नोव्हें.|नोव्हेंबर|november|nov|nowenber,11,month -डिसें.|डिसेंबर|december|dec,12,month +ऑग.|ऑग|ऑगस्ट|august|aug|ogast,8,month +सप्टें.|सप्टें|सप्टेंबर|september|sept|sep,9,month +ऑक्टो.|ऑक्टो|ऑक्टोबर|october|oct|oktobar,10,month +नोव्हें.|नोव्हें|नोव्हेंबर|november|nov|nowenber,11,month +डिसें.|डिसें|डिसेंबर|december|dec,12,month diff --git a/ner_v2/detectors/temporal/date/te/data/date_constant.csv b/ner_v2/detectors/temporal/date/te/data/date_constant.csv index ad59abc66..2dd4c1b41 100644 --- a/ner_v2/detectors/temporal/date/te/data/date_constant.csv +++ b/ner_v2/detectors/temporal/date/te/data/date_constant.csv @@ -13,15 +13,15 @@ key,numeric_representation,date_type శుక్రవారము|శుక్రవారం|శుక్ర|shukravaram|shukra varam|shukravaaram|shukra,4,weekday 
శనివారము|స్థిరవారము|మందవారము|shanivaram|shanivaaram|mandavaram|mandavaaram,5,weekday ఆదివారము|భానువారము|రవివారము|అధిత్యవారము|తొలివారము|aadivaram|adivaram|adi varam|adi vaaram|bhanuvaram|bhanu|bhanu vaaram|ravivaram|ravi vaaramu|ravi varam|adityavaram|adithya varam|tholivaram|tholi vaaram|sunday,6,weekday -జన.|మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month -ఫిబ్ర.|ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month -ఏప్రి.|మార్చ్|మార్|march|mar,3,month -ఏప్రిల్|april|apr,4,month -మే|may,5,month -జూన్|jun|june,6,month -జులై|jul|july,7,month -ఆగ.|ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month -సెప్.|సెప్టెంబర్|september|sep|sept,9,month -అక్టో.|అక్టోబర్|అక్టోబరు|oct|october,10,month -నవ.|నవంబర్|november|nov,11,month -డిసె.|డిసెంబరు|డిసెంబర్|december|dec,12,month \ No newline at end of file +జన.|జన|మొదటి నెల|జనుఅరీ|జనవరి|జనవరి|january|jan|janavary,1,month +ఫిబ్ర.|ఫిబ్ర|ఫిబ్రవరి|ఫెబ్|february|feb|febravary,2,month +ఏప్రి.|ఏప్రి|మార్చ్|మార్|march|mar,3,month +ఏప్రిల్|ఏప్రిల్.|april|apr,4,month +మే|మే.|may,5,month +జూన్|జూన్.|jun|june,6,month +జులై|జులై.|jul|july,7,month +ఆగ.|ఆగ|ఆగష్టు|అగస్ట్|ఆగస్ట్|august|aug,8,month +సెప్.|సెప్|సెప్టెంబర్|september|sep|sept,9,month +అక్టో.|అక్టో|అక్టోబర్|అక్టోబరు|oct|october,10,month +నవ.|నవ|నవంబర్|november|nov,11,month +డిసె.|డిసె|డిసెంబరు|డిసెంబర్|december|dec,12,month \ No newline at end of file From 76c72a1cbd30795f799157f35cf2a9d1c48d5443 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 17 Oct 2019 07:40:29 +0000 Subject: [PATCH 223/237] add logger in api.py --- ner_v2/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ner_v2/api.py b/ner_v2/api.py index b895c25d4..c7d0975c1 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -536,6 +536,7 @@ def phone_number(request): locale=parameters_dict[PARAMETER_LOCALE]) message = parameters_dict[PARAMETER_MESSAGE] entity_output = None + ner_logger.debug(parameters_dict) if isinstance(message, six.string_types): entity_output = phone_number_detection.detect(message=message, 
structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], From bcbc5da71ebddf40a771970f6eb48942ec63d232 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 10:52:18 +0530 Subject: [PATCH 224/237] set leniency to zero in libphonenumber --- .../phone_number/phone_number_detection.py | 26 +++++++++++++------ .../phone_number/phone_number_ner_tests.yaml | 12 +++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index e30b84a11..a84f918ff 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -31,8 +31,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): self.entity_name = entity_name self.locale = locale or 'en-IN' self.text = '' - self.phone = [] - self.original_phone_text = [] + self.phone, self.original_phone_text = [], [] + self.final_phone, self.final_original_text = [], [] self.country_code = self.get_country_code_from_locale() self.tagged_text = '' self.processed_text = '' @@ -92,14 +92,24 @@ def detect_entity(self, text, **kwargs): self.tagged_text = self.text self.processed_text = self.text self.phone, self.original_phone_text = [], [] - for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code): + self.final_phone, self.final_original_text = [], [] + for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - self.get_tagged_text() - if self.country_code in self.country_code_dict: - self.phone, self.original_phone_text = self.detect_entity_with_regex(self.tagged_text) - return self.phone, self.original_phone_text + # self.get_tagged_text() + for 
phone_number, original_phone_number in zip(self.phone, self.original_phone_text): + if len(phone_number) > 10: + self.final_phone.append(self.check_for_country_code(phone_number)) + self.final_original_text.append(original_phone_number) + else: + self.final_phone.append({'country_calling_code': self.country_code_dict[self.country_code], + 'phone_number': phone_number}) + self.final_original_text.append(original_phone_number) + + # if self.country_code in self.country_code_dict: + # self.phone, self.original_phone_text = self.detect_entity_with_regex(self.tagged_text) + return self.final_phone, self.final_original_text def detect_entity_with_regex(self, tagged_text, **kwargs): """Detects phone numbers in the text string @@ -133,7 +143,7 @@ def detect_entity_with_regex(self, tagged_text, **kwargs): # self.phone, self.original_phone_text = [], [] for phone_number, original_phone_number in zip(phone, original_phone_text): - if len(phone_number) >= 10: + if len(phone_number) > 10: self.phone.append(self.check_for_country_code(phone_number)) self.original_phone_text.append(original_phone_number) else: diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index fb751b14f..d346f9c2b 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -108,6 +108,18 @@ tests: output_id: 2 value: "2226129854" country_calling_code: '91' + - id: en_14 + message: "Send 1000rs to +14089126172 and call 02226129854" + locale: "en-us" + outputs: + - original_text: "+14089126172" + output_id: 1 + value: "4089126172" + country_calling_code: '1' + - original_text: "02226129854" + output_id: 2 + value: "2226129854" + country_calling_code: '1' hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" From fba44ae20e5e964ddd186ba40aba2f0bc46e1b88 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 10:55:14 
+0530 Subject: [PATCH 225/237] fix YAML file --- .../phone_number/phone_number_ner_tests.yaml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index d346f9c2b..a121d3060 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -109,17 +109,17 @@ tests: value: "2226129854" country_calling_code: '91' - id: en_14 - message: "Send 1000rs to +14089126172 and call 02226129854" - locale: "en-us" - outputs: - - original_text: "+14089126172" - output_id: 1 - value: "4089126172" - country_calling_code: '1' - - original_text: "02226129854" - output_id: 2 - value: "2226129854" - country_calling_code: '1' + message: "Send 1000rs to +14089126172 and call 02226129854" + locale: "en-us" + outputs: + - original_text: "+14089126172" + output_id: 1 + value: "4089126172" + country_calling_code: '1' + - original_text: "02226129854" + output_id: 2 + value: "2226129854" + country_calling_code: '1' hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" From b71af48cb390b3f7d9245d5c58d0b5338d742968 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 12:08:07 +0530 Subject: [PATCH 226/237] set leniency to zero in libphonenumber --- .../phone_number/phone_number_detection.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index a84f918ff..68d3cc1bf 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -32,12 +32,9 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): self.locale = locale or 'en-IN' self.text = '' self.phone, 
self.original_phone_text = [], [] - self.final_phone, self.final_original_text = [], [] self.country_code = self.get_country_code_from_locale() self.tagged_text = '' self.processed_text = '' - self.country_code_dict = {'IN': '91', 'US': '1', 'GB': '44'} - self.possible_country_code_number_list = ['91', '1', '011 91', '44'] self.tag = '__' + self.entity_name + '__' @property @@ -92,24 +89,30 @@ def detect_entity(self, text, **kwargs): self.tagged_text = self.text self.processed_text = self.text self.phone, self.original_phone_text = [], [] - self.final_phone, self.final_original_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): - self.phone.append({"country_calling_code": str(match.number.country_code), - "phone_number": str(match.number.national_number)}) - self.original_phone_text.append(self.text[match.start:match.end]) - # self.get_tagged_text() - for phone_number, original_phone_number in zip(self.phone, self.original_phone_text): - if len(phone_number) > 10: - self.final_phone.append(self.check_for_country_code(phone_number)) - self.final_original_text.append(original_phone_number) + if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): + self.phone.append(self.check_for_country_code(str(match.number.national_number))) + self.original_phone_text.append(self.text[match.start:match.end]) else: - self.final_phone.append({'country_calling_code': self.country_code_dict[self.country_code], - 'phone_number': phone_number}) - self.final_original_text.append(original_phone_number) + # This means our detector has detected some other country code. 
+ self.phone.append({"country_calling_code": str(match.number.country_code), + "phone_number": str(match.number.national_number)}) + self.original_phone_text.append(self.text[match.start:match.end]) + + # self.get_tagged_text() + # for phone_number, original_phone_number in zip(self.phone, self.original_phone_text): + # if len(phone_number['phone_number']) > 10: + # self.final_phone.append(self.check_for_country_code(phone_number)) + # self.final_original_text.append(original_phone_number) + # else: + # self.final_phone.append({'country_calling_code': phonenumbers. + # country_code_for_region(self.country_code), + # 'phone_number': phone_number}) + # self.final_original_text.append(original_phone_number) # if self.country_code in self.country_code_dict: # self.phone, self.original_phone_text = self.detect_entity_with_regex(self.tagged_text) - return self.final_phone, self.final_original_text + return self.phone, self.original_phone_text def detect_entity_with_regex(self, tagged_text, **kwargs): """Detects phone numbers in the text string @@ -147,7 +150,7 @@ def detect_entity_with_regex(self, tagged_text, **kwargs): self.phone.append(self.check_for_country_code(phone_number)) self.original_phone_text.append(original_phone_number) else: - self.phone.append({'country_calling_code': self.country_code_dict[self.country_code], + self.phone.append({'country_calling_code': phonenumbers.country_code_for_region(self.country_code), 'phone_number': phone_number}) self.original_phone_text.append(original_phone_number) self.get_tagged_text() @@ -207,15 +210,20 @@ def check_for_country_code(self, phone_num): {countryCallingCode:"91",phone_number:"9123456789"} """ phone_dict = {} - check_country_regex = re.compile(r'^({country_code})\d{length}$'. 
- format(country_code='911|1|011 91|91', length='{10}'), re.U) - p = check_country_regex.findall(phone_num) - if len(p) == 1: - phone_dict['country_calling_code'] = p[0] - country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) - phone_dict['phone_number'] = country_code_sub_regex.sub(string=phone_num, repl='') + + if len(phone_num) > 10: + check_country_regex = re.compile(r'^({country_code})\d{length}$'. + format(country_code='911|1|011 91|91', length='{10}'), re.U) + p = check_country_regex.findall(phone_num) + if len(p) == 1: + phone_dict['country_calling_code'] = p[0] + country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) + phone_dict['phone_number'] = country_code_sub_regex.sub(string=phone_num, repl='') + else: + phone_dict['country_calling_code'] = phonenumbers.country_code_for_region(self.country_code) + phone_dict['phone_number'] = phone_num else: - phone_dict['country_calling_code'] = self.country_code_dict[self.country_code] + phone_dict['country_calling_code'] = phonenumbers.country_code_for_region(self.country_code) phone_dict['phone_number'] = phone_num return phone_dict From 01e38ccaefd5b5d0ed868129ca43f9138f3de1d4 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 12:18:18 +0530 Subject: [PATCH 227/237] set leniency to zero in libphonenumber --- .../pattern/phone_number/phone_number_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 68d3cc1bf..5c13650b4 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -150,7 +150,7 @@ def detect_entity_with_regex(self, tagged_text, **kwargs): self.phone.append(self.check_for_country_code(phone_number)) self.original_phone_text.append(original_phone_number) else: - 
self.phone.append({'country_calling_code': phonenumbers.country_code_for_region(self.country_code), + self.phone.append({'country_calling_code': str(phonenumbers.country_code_for_region(self.country_code)), 'phone_number': phone_number}) self.original_phone_text.append(original_phone_number) self.get_tagged_text() @@ -220,10 +220,10 @@ def check_for_country_code(self, phone_num): country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) phone_dict['phone_number'] = country_code_sub_regex.sub(string=phone_num, repl='') else: - phone_dict['country_calling_code'] = phonenumbers.country_code_for_region(self.country_code) + phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) phone_dict['phone_number'] = phone_num else: - phone_dict['country_calling_code'] = phonenumbers.country_code_for_region(self.country_code) + phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) phone_dict['phone_number'] = phone_num return phone_dict From 03dd0ecf08f6aa072e30af2598c471d604bc3430 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 12:40:51 +0530 Subject: [PATCH 228/237] fix YAML tests --- .../phone_number/phone_number_detection.py | 175 +----------------- .../phone_number/phone_number_ner_tests.yaml | 16 +- 2 files changed, 9 insertions(+), 182 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 5c13650b4..d56326e32 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -28,14 +28,10 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): self._supported_languages = NumberDetector.get_supported_languages() super(PhoneDetector, self).__init__(language, locale) self.language = language - self.entity_name = entity_name self.locale = locale 
or 'en-IN' self.text = '' self.phone, self.original_phone_text = [], [] self.country_code = self.get_country_code_from_locale() - self.tagged_text = '' - self.processed_text = '' - self.tag = '__' + self.entity_name + '__' @property def supported_languages(self): @@ -86,8 +82,6 @@ def detect_entity(self, text, **kwargs): """ self.text = text - self.tagged_text = self.text - self.processed_text = self.text self.phone, self.original_phone_text = [], [] for match in phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): @@ -99,111 +93,12 @@ def detect_entity(self, text, **kwargs): "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - # self.get_tagged_text() - # for phone_number, original_phone_number in zip(self.phone, self.original_phone_text): - # if len(phone_number['phone_number']) > 10: - # self.final_phone.append(self.check_for_country_code(phone_number)) - # self.final_original_text.append(original_phone_number) - # else: - # self.final_phone.append({'country_calling_code': phonenumbers. - # country_code_for_region(self.country_code), - # 'phone_number': phone_number}) - # self.final_original_text.append(original_phone_number) - - # if self.country_code in self.country_code_dict: - # self.phone, self.original_phone_text = self.detect_entity_with_regex(self.tagged_text) return self.phone, self.original_phone_text - def detect_entity_with_regex(self, tagged_text, **kwargs): - """Detects phone numbers in the text string - Args: - tagged_text: string to extract entities from - **kwargs: it can be used to send specific arguments in future. - Returns: - self.phone (list): list consisting the detected phone numbers - self.original_phone_text (list): list containing their corresponding substrings in the original message. 
- Examples: - text = 'call +1 (408) 912-6172 and send 100rs to 9920441344' - p = PhoneDetector(entity_name='phone_number', language='en') - p.detect_entity(text=text) - (['14089126172', '9920441344'], [u'+1 (408) 912-6172', u'9920441344']) - text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' - p = PhoneDetector(entity_name='phone_number', language='hi') - p.detect_entity(text=text) - (['919819983132', '9820334416'],[u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) - """ - - self.text = tagged_text - self.processed_text = self.text - self.tagged_text = self.text - - phone_number_original_list = self.get_number_regex() - - original_phone_texts = [p[0].strip() for p in phone_number_original_list] - original_phone_text = self.check_length(original_phone_texts=original_phone_texts) - clean_phone_list = [self.clean_phone_number(p) for p in original_phone_text] - phone = [self.get_number(phone) for phone in clean_phone_list] - - # self.phone, self.original_phone_text = [], [] - for phone_number, original_phone_number in zip(phone, original_phone_text): - if len(phone_number) > 10: - self.phone.append(self.check_for_country_code(phone_number)) - self.original_phone_text.append(original_phone_number) - else: - self.phone.append({'country_calling_code': str(phonenumbers.country_code_for_region(self.country_code)), - 'phone_number': phone_number}) - self.original_phone_text.append(original_phone_number) - self.get_tagged_text() - - return self.phone, self.original_phone_text - - def get_digit_length(self, text): - return len(re.findall(pattern='\d', string=text, flags=re.U)) - - def check_length(self, original_phone_texts): - """ - This method is used to handle the corner case where consecutive numbers are present with - space within them. 
- Args: - original_phone_texts (list): list of text substrings detected by the regex - Returns: - phone_number_list (list): list of phone numbers splitting based on length - Examples: - original_phone_texts = ['9820334415 91 9920441388', '9820551388982347'] - check_length(original_phone_texts=original_phone_texts) - >> ['9820334415', '91 9920441388'] - """ - phone_number_list_1, phone_number_list2 = [], [] - - for original_phone_text in original_phone_texts: - - if self.get_digit_length(text=original_phone_text) > 13: - phone_parts = original_phone_text.split() - visited = [0 for i in range(len(phone_parts))] - - for i in range(len(phone_parts)): - temp = '' - appended_parts = [] - - for j in range(i, len(phone_parts)): - if visited[j] == 0: - temp = temp + ' ' + phone_parts[j] - appended_parts.append(j) - - if 13 >= self.get_digit_length(text=temp) > 7: - phone_number_list_1.append(temp.strip()) - for m in appended_parts: - visited[m] = 1 - break - else: - phone_number_list2.append(original_phone_text) - phone_number_list_1.extend(phone_number_list2) - return phone_number_list_1 - def check_for_country_code(self, phone_num): """ :param phone_num: the number which is to be checked for country code - :return: dict with country_code if it's in phone_num and phone_number without country code + :return: dict with country_code if it's in phone_num or phone_number with current country code Examples: phone_num = '919123456789' countryCallingCode = 'IN' @@ -227,71 +122,3 @@ def check_for_country_code(self, phone_num): phone_dict['phone_number'] = phone_num return phone_dict - - def get_number(self, phone): - """ - This method is used to convert phone numbers in language scripts other than English - to the English - Args: - phone (str): The string phone number which is detected and cleaned - Returns: - phone (str): The string phone number converted to English script - Examples: - phone = u'९१९८१९९८३१३२' - get_number(phone=phone) - '919819983132' - """ - phone_length = 
len(phone) - phone = str(int(phone)) - - if phone_length != len(phone): - phone = phone.zfill(phone_length) - - return phone - - def clean_phone_number(self, number): - """ - This method is used to clean the detected phone number. - Args: - number (str): The original substring which is detected and is required for cleaning - Returns: - number (str): The number post cleaning - """ - # Remove (), -, whistespace, + - clean_regex = re.compile('([()\-\s\+]+)', re.U) - number = clean_regex.sub(string=number, repl='') - return number - - def get_number_regex(self): - - """ - This method is used to detect the phone number patterns from the provided text - Returns: - phone_number_list (list): list of patterns detected from the regex pattern - (each pattern: (complete original text, area code, number)) - (we further utitlize only the complete original text) - Example: - p = PhoneDetector(entity_name='phone_number', language='hi') - text = u'Set a reminder on +1 (408) 912-6172' - p.text = text - p.get_number_regex() - [(u'+1 (408) 912-6172', u'1', u'(408) 912-6172'), - (u'+91 9820334416', u'91', u'9820334416'), - (u'022 26129857', u'022', u'26129857')] - """ - phone_number_regex = re.compile( - r'((?:\(?\+(\d{1,2})\)?[\s\-\.]*)?((?=[\-\d()\s\.]{10,16}(?:[^\d]+|$))' - r'(?:[\d(]{1,20}(?:[\-)\s\.]*\d{1,20}){0,20}){1,20}))', re.U) - - phone_number_list = phone_number_regex.findall(self.text) - return phone_number_list - - def get_tagged_text(self): - """ - Replaces detected phone numbers with tag generated from entity_name used to initialize the object with - A final string with all phone numbers replaced will be stored in object's tagged_text attribute - A string with all phone numbers removed will be stored in object's processed_text attribute - """ - for detected_text in self.original_phone_text: - self.tagged_text = self.tagged_text.replace(detected_text, self.tag) - self.processed_text = self.processed_text.replace(detected_text, '') diff --git 
a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index a121d3060..a629d96aa 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -112,14 +112,14 @@ tests: message: "Send 1000rs to +14089126172 and call 02226129854" locale: "en-us" outputs: - - original_text: "+14089126172" - output_id: 1 - value: "4089126172" - country_calling_code: '1' - - original_text: "02226129854" - output_id: 2 - value: "2226129854" - country_calling_code: '1' + - original_text: "+14089126172" + output_id: 1 + value: "4089126172" + country_calling_code: '1' + - original_text: "02226129854" + output_id: 2 + value: "2226129854" + country_calling_code: '1' hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" From 8e4e401971f308df812b4f3da32446319024567d Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 16:15:09 +0530 Subject: [PATCH 229/237] add check_for_alphas --- .../detectors/pattern/phone_number/README.md | 72 ++++++++++--------- .../phone_number/phone_number_detection.py | 11 +++ .../phone_number/phone_number_ner_tests.yaml | 12 +++- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/README.md b/ner_v2/detectors/pattern/phone_number/README.md index 0e57c4858..d0e1c0f88 100644 --- a/ner_v2/detectors/pattern/phone_number/README.md +++ b/ner_v2/detectors/pattern/phone_number/README.md @@ -1,6 +1,6 @@ ## Phone Number Detector -The Phone Number Detector has the capability to detect phone numbers from within the given text. The detector has the ability to handle multilanguage text. Additionally, this detector is scaled to handle domestic as well as international phone numbers +The Phone Number Detector has the capability to detect phone numbers from within the given text. The detector has the ability to handle multi language text. 
Additionally, this detector is scaled to handle domestic as well as international phone numbers We are currently providing phone number detection support in 6 languages, which are @@ -8,7 +8,7 @@ The Phone Number Detector has the capability to detect phone numbers from within - Hindi - Marathi - Gujarati -- Telgu +- Telugu - Tamil ### Usage @@ -17,54 +17,58 @@ The Phone Number Detector has the capability to detect phone numbers from within ```python >> from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector - >> detector = PhoneDetector(language='en', entity_name='phone_number') # here language will be ISO 639-1 code + >> detector = PhoneDetector(language='en', entity_name='phone_number', locale='en-IN') + # here language will be ISO 639-1 code and locale can be of the form 'language[-_]country_code' >> detector.detect_entity(text=u'send a message on 91 9820334455') - >> (['919820334455'], [u'91 9820334455']) + >> ([{'country_calling_code': '91', 'phone_number': '9820334455'}],['91 9820334455']) ``` - **Curl Command** ```bash # For a sample query with following parameters - # message="Call 022 26129857 and send 100 rs to +919820334416 and 1(408) 234-619" + # message="Call 022 2612985 and send 100 rs to +919820334416 and 1(408) 234-6192" # entity_name='phone_number' # structured_value=None # fallback_value=None # bot_message=None # source_language='en' + # locale='en-us' $ URL='localhost' $ PORT=8081 - $ curl -i 'http://'$URL':'$PORT'/v2/phone_number?message=Call%20022%2026129857%20and%20send%20100%20rs%20to%20+919820334416%20and%201%28408%29%20234-619&entity_name=phone_number&fallback_value=&bot_message=&structured_value=&source_language=en' - + $ curl -i 'http://'$URL':'$PORT'v2/phone_number?entity_name=phone_number&message=Call%20022%202612985%20and%20send%20100%20rs%20to%20%2B919820334416%20and%201(408)%20234-6192&source_language=en&locale=en-us&structured_value=&fallback_value=&bot_message=' -H 'cache-control: no-cache' -H 
'postman-token: dad3f116-37f2-2627-b8c6-f89f00f19924' # Curl output $ { - "data": [ - { - "detection": "message", - "original_text": "022 26129857", - "entity_value": { - "value": "02226129857" - }, - "language": "en" - }, - { - "detection": "message", - "original_text": "919820334416", - "entity_value": { - "value": "919820334416" - }, - "language": "en" - }, - { - "detection": "message", - "original_text": "1(408) 234-619", - "entity_value": { - "value": "1408234619" - }, - "language": "en" - } - ] - } + "data": [ + { + "detection": "message", + "original_text": "022 2612985", + "entity_value": { + "phone_number": "222612985", + "country_calling_code": "1" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "+919820334416", + "entity_value": { + "phone_number": "9820334416", + "country_calling_code": "91" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "1(408) 234-6192", + "entity_value": { + "phone_number": "4082346192", + "country_calling_code": "1" + }, + "language": "en" + } + ] +} ``` \ No newline at end of file diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index d56326e32..e8a473ec0 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -93,8 +93,19 @@ def detect_entity(self, text, **kwargs): "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) + self.check_for_alphas() return self.phone, self.original_phone_text + def check_for_alphas(self): + """ + checks if any leading or trailing alphabets in the detected phone numbers and removes those numbers + """ + + for phone, original in zip(self.phone, self.original_phone_text): + if re.search(r'(\w{original}|{original}[])'.format(original=k[1])): + self.phone.remove(phone) + 
self.original_phone_text.remove(original) + def check_for_country_code(self, phone_num): """ :param phone_num: the number which is to be checked for country code diff --git a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml index a629d96aa..f04ef56af 100644 --- a/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml +++ b/ner_v2/tests/pattern/phone_number/phone_number_ner_tests.yaml @@ -109,17 +109,25 @@ tests: value: "2226129854" country_calling_code: '91' - id: en_14 - message: "Send 1000rs to +14089126172 and call 02226129854" + message: "Send 1000rs to +14089126172 and call 2226129854" locale: "en-us" outputs: - original_text: "+14089126172" output_id: 1 value: "4089126172" country_calling_code: '1' - - original_text: "02226129854" + - original_text: "2226129854" output_id: 2 value: "2226129854" country_calling_code: '1' + - id: en_15 + message: "Send 1000rs to 2226129854b" + locale: "en-us" + outputs: + - original_text: null + output_id: 1 + value: null + country_calling_code: null hi: - id: hi_1 message: "मेरे लिए ५००र्स ९८२०३३४४५५ पे भेज देना" From ecd8d67b56bb60a29dc343285dde5f732c3dfa92 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 16:16:48 +0530 Subject: [PATCH 230/237] add check_for_alphas --- ner_v2/detectors/pattern/phone_number/phone_number_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index e8a473ec0..c98bb9f51 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -102,7 +102,7 @@ def check_for_alphas(self): """ for phone, original in zip(self.phone, self.original_phone_text): - if re.search(r'(\w{original}|{original}[])'.format(original=k[1])): + if 
re.search(r'([a-zA-Z0-9]{original}|{original}[a-zA-Z0-9])'.format(original=original)): self.phone.remove(phone) self.original_phone_text.remove(original) From a8367feecec24fb82150b5365cf3dd8d9dfce26c Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 16:18:34 +0530 Subject: [PATCH 231/237] add check_for_alphas --- ner_v2/detectors/pattern/phone_number/phone_number_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index c98bb9f51..699196308 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -102,7 +102,7 @@ def check_for_alphas(self): """ for phone, original in zip(self.phone, self.original_phone_text): - if re.search(r'([a-zA-Z0-9]{original}|{original}[a-zA-Z0-9])'.format(original=original)): + if re.search(r'([a-zA-Z0-9]{original}|{original}[a-zA-Z0-9])'.format(original=original), self.text): self.phone.remove(phone) self.original_phone_text.remove(original) From d2f4d9db2b6262089cee78fcc95d91c13e8d6ff8 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 17:05:13 +0530 Subject: [PATCH 232/237] add check_for_alphas --- .../phone_number/phone_number_detection.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 699196308..b1e06906d 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -81,7 +81,7 @@ def detect_entity(self, text, **kwargs): [u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ - self.text = text + self.text = " " + text.lower().strip() + " " self.phone, self.original_phone_text = [], [] for match in 
phonenumbers.PhoneNumberMatcher(self.text, self.country_code, leniency=0): if match.number.country_code == phonenumbers.country_code_for_region(self.country_code): @@ -92,19 +92,20 @@ def detect_entity(self, text, **kwargs): self.phone.append({"country_calling_code": str(match.number.country_code), "phone_number": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) - - self.check_for_alphas() + self.phone, self.original_phone_text = self.check_for_alphas() return self.phone, self.original_phone_text def check_for_alphas(self): """ checks if any leading or trailing alphabets in the detected phone numbers and removes those numbers """ - + validated_phone = [] + validated_original_text = [] for phone, original in zip(self.phone, self.original_phone_text): - if re.search(r'([a-zA-Z0-9]{original}|{original}[a-zA-Z0-9])'.format(original=original), self.text): - self.phone.remove(phone) - self.original_phone_text.remove(original) + if re.match(r'\W' + re.escape(original) + r'\W', self.text, re.UNICODE): + validated_phone.append(phone) + validated_original_text.append(original) + return validated_phone, validated_original_text def check_for_country_code(self, phone_num): """ From 1924d46899d0757b9503094644bef8b9a102731d Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Fri, 18 Oct 2019 17:32:36 +0530 Subject: [PATCH 233/237] add check_for_alphas --- ner_v2/detectors/pattern/phone_number/phone_number_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index b1e06906d..8e7111ec3 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -102,7 +102,7 @@ def check_for_alphas(self): validated_phone = [] validated_original_text = [] for phone, original in zip(self.phone, 
self.original_phone_text): - if re.match(r'\W' + re.escape(original) + r'\W', self.text, re.UNICODE): + if re.search(r'\W' + re.escape(original) + r'\W', self.text, re.UNICODE): validated_phone.append(phone) validated_original_text.append(original) return validated_phone, validated_original_text From 359be9fb924d42009198890ec40122f36c46c691 Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Mon, 21 Oct 2019 16:49:43 +0530 Subject: [PATCH 234/237] add entity_name and tag back --- ner_v2/detectors/pattern/phone_number/phone_number_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index 8e7111ec3..f21754bfa 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -32,6 +32,8 @@ def __init__(self, entity_name, language=ENGLISH_LANG, locale=None): self.text = '' self.phone, self.original_phone_text = [], [] self.country_code = self.get_country_code_from_locale() + self.entity_name = entity_name + self.tag = '__' + self.entity_name + '__' @property def supported_languages(self): From 79ed768f706fac153ca8eb62068f05a3a0878016 Mon Sep 17 00:00:00 2001 From: amansrivastava17 Date: Tue, 22 Oct 2019 15:34:40 +0530 Subject: [PATCH 235/237] fix issue in name detection through pos tags as it is not checking if name detected exist in user messahe or not --- ner_v1/chatbot/entity_detection.py | 2 +- .../detectors/textual/name/name_detection.py | 34 +++++++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 814cc3a5f..4a7ff7e31 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -568,7 +568,7 @@ def get_person_name(message, entity_name, structured_value, fallback_value, bot_ entity_list, original_text_list = 
name_detection.detect_entity(text=text, bot_message=bot_message) if not entity_list and fallback_text: - entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split()) + entity_list, original_text_list = NameDetector.get_format_name(fallback_text.split(), fallback_text) detection_method = fallback_method if entity_list and original_text_list: diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 07ee1121e..9cf07704e 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -49,7 +49,7 @@ def __init__(self, entity_name, language=ENGLISH_LANG): self.text_detection_object = TextDetector(entity_name=entity_name) @staticmethod - def get_format_name(name_list): + def get_format_name(name_tokens, text): """ Takes input as name_list which contains the names detected. It separates the first, middle and last names. @@ -58,7 +58,7 @@ def get_format_name(name_list): 2.The original text. 
Args: - name_list (list): List of names detected + name_tokens (list): List of tokens in the name Example: ['yash', 'doshi'] @@ -68,19 +68,23 @@ def get_format_name(name_list): ["yash modi"] ) """ - original_text = " ".join(name_list) + entity_value = [] + original_text = [] - first_name = name_list[0] + name_text = " ".join(name_tokens) + + first_name = name_tokens[0] middle_name = None last_name = None - if len(name_list) > 1: - last_name = name_list[-1] - middle_name = " ".join(name_list[1:-1]) or None - - entity_value = {FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name} + if name_text in text: + if len(name_tokens) > 1: + last_name = name_tokens[-1] + middle_name = " ".join(name_tokens[1:-1]) or None - return [entity_value], [original_text] + entity_value.append({FIRST_NAME: first_name, MIDDLE_NAME: middle_name, LAST_NAME: last_name}) + original_text.append(name_text) + return entity_value, original_text def text_detection_name(self, text=None): """ @@ -127,19 +131,19 @@ def get_name_using_pos_tagger(self, text): return entity_value, original_text if pattern1_match: - entity_value, original_text = self.get_format_name(pattern1_match[0][1].split()) + entity_value, original_text = self.get_format_name(pattern1_match[0][1].split(), self.text) elif pattern2_match: - entity_value, original_text = self.get_format_name(pattern2_match[0].split()) + entity_value, original_text = self.get_format_name(pattern2_match[0].split(), self.text) elif pattern3_match: - entity_value, original_text = self.get_format_name(pattern3_match[0].split()) + entity_value, original_text = self.get_format_name(pattern3_match[0].split(), self.text) elif len(name_tokens) < 4: pos_words = [word[0] for word in tagged_names if word[1].startswith('NN') or word[1].startswith('JJ')] if pos_words: - entity_value, original_text = self.get_format_name(pos_words) + entity_value, original_text = self.get_format_name(pos_words, self.text) return entity_value, original_text @@ -297,7 
+301,7 @@ def detect_person_name_entity(self, replaced_text): name_list.append(name_holder) for name in name_list: - name_entity_value, original_text_value = self.get_format_name(name) + name_entity_value, original_text_value = self.get_format_name(name, self.text) original_text.extend(original_text_value) entity_value.extend(name_entity_value) From 3c085022b21d1faa357f8dcdbe456eee8cfcdd8e Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 22 Oct 2019 18:11:40 +0530 Subject: [PATCH 236/237] return value instead of phone_number --- .../phone_number/phone_number_detection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py index f21754bfa..aa50f4079 100644 --- a/ner_v2/detectors/pattern/phone_number/phone_number_detection.py +++ b/ner_v2/detectors/pattern/phone_number/phone_number_detection.py @@ -72,14 +72,14 @@ def detect_entity(self, text, **kwargs): text = 'call +1 (408) 912-6172' p = PhoneDetector(entity_name='phone_number', language='en', locale='en-US') p.detect_entity(text=text) - ([{'country_calling_code':'1', phone_number':'4089126172'} ], + ([{'country_calling_code':'1', value':'4089126172'} ], [u'+1 (408) 912-6172']) text = '+९१ ९८१९९८३१३२ पर कॉल करें और संदेश ९८२०३३४४१६ पर कॉल करें' p = PhoneDetector(entity_name='phone_number', language='hi', locale='en-IN') p.detect_entity(text=text) - ([{'country_calling_code':'91', phone_number':'9819983132'} - ,{ 'country_calling_code':'91', phone_number:'9820334416'} ], + ([{'country_calling_code':'91', value':'9819983132'} + ,{ 'country_calling_code':'91', value:'9820334416'} ], [u'+९१ ९८१९९८३१३२', u'+९१ ९८१९९८३१३२']) """ @@ -92,7 +92,7 @@ def detect_entity(self, text, **kwargs): else: # This means our detector has detected some other country code. 
self.phone.append({"country_calling_code": str(match.number.country_code), - "phone_number": str(match.number.national_number)}) + "value": str(match.number.national_number)}) self.original_phone_text.append(self.text[match.start:match.end]) self.phone, self.original_phone_text = self.check_for_alphas() return self.phone, self.original_phone_text @@ -116,7 +116,7 @@ def check_for_country_code(self, phone_num): Examples: phone_num = '919123456789' countryCallingCode = 'IN' - {countryCallingCode:"91",phone_number:"9123456789"} + {countryCallingCode:"91",value:"9123456789"} """ phone_dict = {} @@ -127,12 +127,12 @@ def check_for_country_code(self, phone_num): if len(p) == 1: phone_dict['country_calling_code'] = p[0] country_code_sub_regex = re.compile(r'^{detected_code}'.format(detected_code=p[0])) - phone_dict['phone_number'] = country_code_sub_regex.sub(string=phone_num, repl='') + phone_dict['value'] = country_code_sub_regex.sub(string=phone_num, repl='') else: phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) - phone_dict['phone_number'] = phone_num + phone_dict['value'] = phone_num else: phone_dict['country_calling_code'] = str(phonenumbers.country_code_for_region(self.country_code)) - phone_dict['phone_number'] = phone_num + phone_dict['value'] = phone_num return phone_dict From afecd68e10bb2086ed6f485f3dd46ea32776475a Mon Sep 17 00:00:00 2001 From: ruthvik-17 Date: Tue, 22 Oct 2019 18:14:32 +0530 Subject: [PATCH 237/237] change phone_number to value --- .../tests/pattern/phone_number/test_phone_number_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py index 464c8269c..1450d1a98 100644 --- a/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py +++ b/ner_v2/tests/pattern/phone_number/test_phone_number_detection.py @@ -40,7 +40,7 @@ def 
parse_expected_outputs(expected_outputs): expected_output["original_text"].lower().strip() if expected_output["original_text"] else None if original_text: phone_num_dict = { - 'phone_number': str(expected_output["value"]), + 'value': str(expected_output["value"]), 'country_calling_code': str(expected_output["country_calling_code"]) } phone_num_list.append(phone_num_dict)