diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py index 6acd460b0..a9550a7f2 100755 --- a/chatbot_ner/urls.py +++ b/chatbot_ner/urls.py @@ -9,6 +9,7 @@ urlpatterns = [ + url(r'^v1/text_bulk/$', api_v1.text), url(r'^v1/text/$', api_v1.text), url(r'^v1/location/$', api_v1.location), url(r'^v1/phone_number/$', api_v1.phone_number), diff --git a/datastore/datastore.py b/datastore/datastore.py index 586014fff..7799f383b 100644 --- a/datastore/datastore.py +++ b/datastore/datastore.py @@ -1,5 +1,3 @@ -import collections - import elastic_search from chatbot_ner.config import ner_logger, CHATBOT_NER_DATASTORE from lib.singleton import Singleton @@ -90,8 +88,9 @@ def create(self, **kwargs): ignore_unavailable: Whether specified concrete indices should be ignored when unavailable (missing or closed) - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create - Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping + Refer-- + https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create + https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.put_mapping Raises: DataStoreSettingsImproperlyConfiguredException if connection settings are invalid or missing @@ -231,12 +230,12 @@ def get_entity_dictionary(self, entity_name, **kwargs): return results_dictionary - def get_similar_dictionary(self, entity_name, text, fuzziness_threshold="auto:4,7", + def get_similar_dictionary(self, entity_name, texts, fuzziness_threshold="auto:4,7", search_language_script=None, **kwargs): """ Args: entity_name: the name of the entity to lookup in the datastore for getting entity values and their variants - text: the text for which variants need to be find out + texts(list of strings): the text for which variants need to be find out fuzziness_threshold: fuzziness allowed for search results on entity value variants search_language_script: language of elasticsearch documents which are eligible for match kwargs: @@ -244,7 +243,7 @@ def get_similar_dictionary(self, entity_name, text, fuzziness_threshold="auto:4, Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search Returns: - collections.OrderedDict: dictionary mapping entity value variants to their entity value + list of collections.OrderedDict: dictionary mapping entity value variants to their entity value Example: db = DataStore() @@ -252,34 +251,36 @@ def get_similar_dictionary(self, entity_name, text, fuzziness_threshold="auto:4, db.get_similar_ngrams_dictionary(entity_name='city', ngrams_list=ngrams_list, fuzziness_threshold=2) Output: - {u'Bangalore': u'Bangalore', - u'Mulbagal': u'Mulbagal', - u'Multai': u'Multai', - u'Mumbai': u'Mumbai', - u'Pune': u'Pune', - u'Puri': u'Puri', - u'bangalore': u'bengaluru', - u'goa': u'goa', - u'mumbai': u'mumbai', - u'pune': u'pune'} - """ - results_dictionary = collections.OrderedDict() + [ + {u'Bangalore': u'Bangalore', + u'Mulbagal': u'Mulbagal', + u'Multai': u'Multai', + u'Mumbai': u'Mumbai', + u'Pune': u'Pune', + u'Puri': u'Puri', + u'bangalore': u'bengaluru', + u'goa': u'goa', + u'mumbai': u'mumbai', + u'pune': u'pune'} + ] + """ + results_list = [] if self._client_or_connection is None: self._connect() if self._engine == ELASTICSEARCH: self._check_doc_type_for_elasticsearch() request_timeout = self._connection_settings.get('request_timeout', 20) - results_dictionary = 
elastic_search.query.full_text_query(connection=self._client_or_connection, - index_name=self._store_name, - doc_type=self._connection_settings[ - ELASTICSEARCH_DOC_TYPE], - entity_name=entity_name, - sentence=text, - fuzziness_threshold=fuzziness_threshold, - search_language_script=search_language_script, - request_timeout=request_timeout, - **kwargs) - return results_dictionary + results_list = elastic_search.query.full_text_query(connection=self._client_or_connection, + index_name=self._store_name, + doc_type=self._connection_settings[ + ELASTICSEARCH_DOC_TYPE], + entity_name=entity_name, + sentences=texts, + fuzziness_threshold=fuzziness_threshold, + search_language_script=search_language_script, + request_timeout=request_timeout, + **kwargs) + return results_list def delete_entity(self, entity_name, **kwargs): """ diff --git a/datastore/elastic_search/query.py b/datastore/elastic_search/query.py index 0e16de1d8..c26569e5d 100644 --- a/datastore/elastic_search/query.py +++ b/datastore/elastic_search/query.py @@ -11,6 +11,7 @@ from external_api.constants import SENTENCE_LIST, ENTITY_LIST from language_utilities.constant import ENGLISH_LANG from lib.nlp.const import TOKENIZER +import json log_prefix = 'datastore.elastic_search.query' @@ -238,7 +239,7 @@ def get_entity_unique_values(connection, index_name, doc_type, entity_name, valu return values -def full_text_query(connection, index_name, doc_type, entity_name, sentence, fuzziness_threshold, +def full_text_query(connection, index_name, doc_type, entity_name, sentences, fuzziness_threshold, search_language_script=None, **kwargs): """ Performs compound elasticsearch boolean search query with highlights for the given sentence . The query @@ -249,16 +250,16 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentence, fuz index_name: The name of the index doc_type: The type of the documents that will be indexed entity_name: name of the entity to perform a 'term' query on - sentence: sentence in which entity has to be searched + sentences(list of strings): sentences in which entity has to be searched fuzziness_threshold: fuzziness_threshold for elasticsearch match query 'fuzziness' parameter search_language_script: language of elasticsearch documents which are eligible for match kwargs: Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search Returns: - collections.OrderedDict: dictionary of the parsed results from highlighted search query results - on the sentence, mapping highlighted fuzzy entity variant to entity value ordered - by relevance order returned by elasticsearch + list of collections.OrderedDict: list of dictionaries of the parsed results from highlighted search query + results on the sentence, mapping highlighted fuzzy entity variant to entity value ordered + by relevance order returned by elasticsearch Example: # The following example is just for demonstration purpose. 
Normally we should call @@ -282,17 +283,23 @@ def full_text_query(connection, index_name, doc_type, entity_name, sentence, fuz u'mumbai': u'mumbai', u'pune': u'pune'} """ - data = _generate_es_search_dictionary(entity_name, sentence, fuzziness_threshold, - language_script=search_language_script) - kwargs = dict(kwargs, body=data, doc_type=doc_type, size=constants.ELASTICSEARCH_SEARCH_SIZE, index=index_name) - results = _run_es_search(connection, **kwargs) - results = _parse_es_search_results(results) + index = {'index': index_name, 'type': doc_type} + data = [] + for sentence_ in sentences: + query = _generate_es_search_dictionary(entity_name, sentence_, fuzziness_threshold, + language_script=search_language_script) + data.extend([json.dumps(index), json.dumps(query)]) + data = '\n'.join(data) + + kwargs = dict(kwargs, body=data, doc_type=doc_type, index=index_name) + results = _run_es_search(connection, msearch=True, **kwargs) + results = _parse_es_search_results(results.get("responses")) return results -def _run_es_search(connection, **kwargs): +def _run_es_search(connection, msearch=False, **kwargs): """ - Execute the elasticsearch.ElasticSearch.search() method and return all results using + Execute the elasticsearch.ElasticSearch.msearch() method and return all results using elasticsearch.ElasticSearch.scroll() method if and only if scroll is passed in kwargs. Note that this is not recommended for large queries and can severly impact performance. @@ -301,11 +308,17 @@ def _run_es_search(connection, **kwargs): kwargs: Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search Returns: - dictionary, search results from elasticsearch.ElasticSearch.search + dictionary, search results from elasticsearch.ElasticSearch.msearch """ scroll = kwargs.pop('scroll', False) if not scroll: - return connection.search(**kwargs) + if msearch: + return connection.msearch(**kwargs) + else: + return connection.search(**kwargs) + + if scroll and msearch: + raise ValueError('Scrolling is not supported in msearch mode') result = connection.search(scroll=scroll, **kwargs) scroll_id = result['_scroll_id'] @@ -388,7 +401,7 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu 'should': [], 'minimum_should_match': 1 } - } + }, 'size': constants.ELASTICSEARCH_SEARCH_SIZE } query_should_data = [] query = { @@ -412,77 +425,86 @@ def _generate_es_search_dictionary(entity_name, text, fuzziness_threshold, langu return data -def _parse_es_search_results(results): +def _parse_es_search_results(results_list): """ Parse highlighted results returned from elasticsearch query and generate a variants to values dictionary Args: - results (dict): search results dictionary from elasticsearch including highlights and scores + results_list (list of dict): search results list of dictionaries from elasticsearch including highlights + and scores Returns: - collections.OrderedDict: dict mapping matching variants to their entity values based on the - parsed results from highlighted search query results + list of collections.OrderedDict: list containing dicts mapping matching variants to their entity values based + on the parsed results from highlighted search query results Example: Parameter ngram_results has highlighted search results as follows: - {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, - u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn', - u'_index': u'doc_type_name', - u'_score': 11.501145, - u'_source': {u'dict_type': u'variants', - 
u'entity_data': u'city', - u'value': u'goa', - u'variants': [u'', u'goa']}, - u'_type': u'data_dictionary', - u'highlight': {u'variants': [u'goa']}}, - {u'_id': u'AVrW02W99WNuMIY9vmcf', - u'_index': u'entity_data', - u'_score': 11.210829, - u'_source': {u'dict_type': u'variants', - u'entity_data': u'city', - u'value': u'Mumbai', - u'variants': [u'', u'Mumbai']}, - u'_type': u'data_dictionary', - u'highlight': {u'variants': [u'Mumbai']}}, - ... - u'max_score': 11.501145, - u'total': 17}, - u'timed_out': False, - u'took': 96} + [ + {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, + u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn', + u'_index': u'doc_type_name', + u'_score': 11.501145, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'goa', + u'variants': [u'', u'goa']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'goa']}}, + {u'_id': u'AVrW02W99WNuMIY9vmcf', + u'_index': u'entity_data', + u'_score': 11.210829, + u'_source': {u'dict_type': u'variants', + u'entity_data': u'city', + u'value': u'Mumbai', + u'variants': [u'', u'Mumbai']}, + u'_type': u'data_dictionary', + u'highlight': {u'variants': [u'Mumbai']}}, + ... + u'max_score': 11.501145, + u'total': 17}, + u'timed_out': False, + u'took': 96} + ] After parsing highlighted results, this function returns - {... - u'Mumbai': u'Mumbai', - ... - u'goa': u'goa', - u'mumbai': u'mumbai', - ... - } + [ + {... + u'Mumbai': u'Mumbai', + ... + u'goa': u'goa', + u'mumbai': u'mumbai', + ... + } + ] """ - entity_values, entity_variants = [], [] - variants_to_values = collections.OrderedDict() - if results and results['hits']['total'] > 0: - for hit in results['hits']['hits']: - if 'highlight' not in hit: - continue - - value = hit['_source']['value'] - for variant in hit['highlight']['variants']: - entity_values.append(value) - entity_variants.append(variant) - - for value, variant in zip(entity_values, entity_variants): - variant = re.sub('\s+', ' ', variant.strip()) - variant_no_highlight_tags = variant.replace('', '').replace('', '').strip() - if variant.count('') == len(TOKENIZER.tokenize(variant_no_highlight_tags)): - variant = variant_no_highlight_tags - if variant not in variants_to_values: - variants_to_values[variant] = value - - return variants_to_values + variants_to_values_list = [] + if results_list: + for results in results_list: + entity_values, entity_variants = [], [] + variants_to_values = collections.OrderedDict() + if results and results['hits']['total'] > 0: + for hit in results['hits']['hits']: + if 'highlight' not in hit: + continue + + value = hit['_source']['value'] + for variant in hit['highlight']['variants']: + entity_values.append(value) + entity_variants.append(variant) + + for value, variant in zip(entity_values, entity_variants): + variant = re.sub('\s+', ' ', variant.strip()) + variant_no_highlight_tags = variant.replace('', '').replace('', '').strip() + if variant.count('') == len(TOKENIZER.tokenize(variant_no_highlight_tags)): + variant = variant_no_highlight_tags + if variant not in variants_to_values: + variants_to_values[variant] = value + variants_to_values_list.append(variants_to_values) + + return variants_to_values_list def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name, **kwargs): diff --git a/lib/nlp/regexreplace.py b/lib/nlp/regexreplace.py index 889569b3b..3bd03783b 100644 --- a/lib/nlp/regexreplace.py +++ b/lib/nlp/regexreplace.py @@ -52,16 +52,3 @@ def text_substitute(self, text): for i, compiled_pattern in 
enumerate(self.pattern_compile): processed_text = compiled_pattern.sub(self.pattern_list[i][1], processed_text) return processed_text - - def unit_substitute(self, text): - processed_text = text - count = 0 - while count < len(self.pattern_list): - d = self.pattern_compile[count].findall(processed_text) - if d: - try: - processed_text = str(int(float(d[0]) * self.pattern_list[count][1])) - except (IndexError, ValueError): - pass - count += 1 - return processed_text diff --git a/ner_v1/api.py b/ner_v1/api.py index 151b5212a..316df8b81 100644 --- a/ner_v1/api.py +++ b/ner_v1/api.py @@ -22,6 +22,7 @@ PARAMETER_MAX_DIGITS, PARAMETER_READ_MODEL_FROM_S3, PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL, PARAMETER_LIVE_CRF_MODEL_PATH) +from django.views.decorators.csrf import csrf_exempt def to_bool(value): @@ -74,19 +75,166 @@ def get_parameters_dictionary(request): return parameters_dict +def parse_post_request(request): + # type: (django.http.HttpRequest) -> Dict[str, Any] + """ + Extract POST request body from HTTP request + + Args: + request (django.http.HttpRequest): HTTP response from url + + Returns: + dict: parameters from the request + """ + request_data = json.loads(request.body) + parameters_dict = { + PARAMETER_MESSAGE: request_data.get('message'), + PARAMETER_ENTITY_NAME: request_data.get('entity_name'), + PARAMETER_STRUCTURED_VALUE: request_data.get('structured_value'), + PARAMETER_FALLBACK_VALUE: request_data.get('fallback_value'), + PARAMETER_BOT_MESSAGE: request_data.get('bot_message'), + PARAMETER_TIMEZONE: request_data.get('timezone'), + PARAMETER_REGEX: request_data.get('regex'), + PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG), + PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG), + PARAMETER_FUZZINESS: request_data.get('fuzziness'), + PARAMETER_MIN_TOKEN_LEN_FUZZINESS: request_data.get('min_token_len_fuzziness'), + PARAMETER_MIN_DIGITS: request_data.get('min_number_digits'), + PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), + PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(request_data.get('read_embeddings_from_remote_url')), + PARAMETER_READ_MODEL_FROM_S3: to_bool(request_data.get('read_model_from_s3')), + PARAMETER_LIVE_CRF_MODEL_PATH: request_data.get('live_crf_model_path') + } + + return parameters_dict + + +@csrf_exempt def text(request): """ - Run text detector with crf model on the 'message' passed in the request + Run text detector with crf model on the 'message or list of messages' passed in the request Args: request (django.http.HttpRequest): HTTP response from url Returns: - dict: GET parameters from the request + response (django.http.HttpResponse): HttpResponse object containing "entity_output" + + where "entity_output" is : + list of dict: containing dict of detected entities with their original texts for a message + OR + list of lists: containing dict of detected entities with their original texts for each message in the list + + EXAMPLES: + --- Single message + >>> message = u'i want to order chinese from mainland china and pizza from domminos' + >>> entity_name = 'restaurant' + >>> structured_value = None + >>> fallback_value = None + >>> bot_message = None + >>> entity_output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(entity_output) + + [ + { + 'detection': 'message', + 'original_text': 'mainland china', + 'entity_value': {'value': u'Mainland China'} + }, + 
{ + 'detection': 'message', + 'original_text': 'domminos', + 'entity_value': {'value': u"Domino's Pizza"} + } + ] + + + + >>> message = u'i wanted to watch movie' + >>> entity_name = 'movie' + >>> structured_value = u'inferno' + >>> fallback_value = None + >>> bot_message = None + >>> entity_output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(entity_output) + + [ + { + 'detection': 'structure_value_verified', + 'original_text': 'inferno', + 'entity_value': {'value': u'Inferno'} + } + ] + + >>> message = u'i wanted to watch inferno' + >>> entity_name = 'movie' + >>> structured_value = u'delhi' + >>> fallback_value = None + >>> bot_message = None + >>> entity_output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(entity_output) + + [ + { + 'detection': 'message', + 'original_text': 'inferno', + 'entity_value': {'value': u'Inferno'} + } + ] + + --- Bulk detection + >>> message = [u'book a flight to mumbai', + u'i want to go to delhi from mumbai'] + >>> entity_name = u'city' + >>> entity_output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(entity_output) + + [ + [ + { + 'detection': 'message', + 'entity_value': {'value': u'mumbai'}, + 'original_text': u'mumbai' + } + ], + [ + { + 'detection': 'message', + 'entity_value': {'value': u'New Delhi'}, + 'original_text': u'delhi' + }, + { + 'detection': 'message', + 'entity_value': {'value': u'mumbai'}, + 'original_text': u'mumbai' + } + ] + ] """ try: - parameters_dict = get_parameters_dictionary(request) - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + parameters_dict = {} + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) entity_output = get_text( message=parameters_dict[PARAMETER_MESSAGE], entity_name=parameters_dict[PARAMETER_ENTITY_NAME], diff --git a/ner_v1/chatbot/entity_detection.py b/ner_v1/chatbot/entity_detection.py index 311066e5d..94aad4b06 100644 --- a/ner_v1/chatbot/entity_detection.py +++ b/ner_v1/chatbot/entity_detection.py @@ -17,6 +17,7 @@ from ner_v1.detectors.textual.name.name_detection import NameDetector from ner_v1.detectors.textual.text.text_detection import TextDetector from ner_v1.detectors.textual.text.text_detection_model import TextModelDetector +import six """ This file contains functionality that performs entity detection over a chatbot. @@ -94,7 +95,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message """Use TextDetector (datastore/elasticsearch) to detect textual entities Args: - message (str or unicode or None): natural language text on which detection logic is to be run. + message (str or unicode or None or list(bulk)): natural language text(s) on which detection logic is to be run. Note if structured value is passed detection is run on structured value instead of message entity_name (str): name of the entity. 
Also acts as elastic-search dictionary name @@ -109,7 +110,7 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message bot_message (str or unicode or None): previous message from a bot/agent. language (str): ISO 639-1 code of language of message **kwargs: extra configuration arguments for TextDetector - fuzziness (str or int or None): fuziness to apply while detecting text entities + fuzziness (str or int or None): fuzziness to apply while detecting text entities min_token_len_fuzziness (str or int or None): minimum length of the token to be eligible for fuzziness live_crf_model_path (str) : path to the CRF model to use to detect entites. Defaults to None read_model_from_s3 (bool): If True read CRF model from S3. Defaults to False @@ -124,73 +125,106 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message entity_value is in itself a dict with its keys varying from entity to entity Example: - - >>> message = u'i want to order chinese from mainland china and pizza from domminos' - >>> entity_name = 'restaurant' - >>> structured_value = None - >>> fallback_value = None - >>> bot_message = None - >>> output = get_text(message=message, - >>> entity_name=entity_name, - >>> structured_value=structured_value, - >>> fallback_value=fallback_value, - >>> bot_message=bot_message) - >>> print(output) - - [ - { - 'detection': 'message', - 'original_text': 'mainland china', - 'entity_value': {'value': u'Mainland China'} - }, - { - 'detection': 'message', - 'original_text': 'domminos', - 'entity_value': {'value': u"Domino's Pizza"} - } - ] - - - - >>> message = u'i wanted to watch movie' - >>> entity_name = 'movie' - >>> structured_value = u'inferno' - >>> fallback_value = None - >>> bot_message = None - >>> output = get_text(message=message, - >>> entity_name=entity_name, - >>> structured_value=structured_value, - >>> fallback_value=fallback_value, - >>> bot_message=bot_message) - >>> print(output) - - [ - { - 'detection': 'structure_value_verified', - 'original_text': 'inferno', - 'entity_value': {'value': u'Inferno'} - } - ] - - >>> message = u'i wanted to watch inferno' - >>> entity_name = 'movie' - >>> structured_value = u'delhi' - >>> fallback_value = None - >>> bot_message = None - >>> output = get_text(message=message, - >>> entity_name=entity_name, - >>> structured_value=structured_value, - >>> fallback_value=fallback_value, - >>> bot_message=bot_message) - >>> print(output) - - [ - { - 'detection': 'message', - 'original_text': 'inferno', - 'entity_value': {'value': u'Inferno'} - } - ] + --- Single message + >>> message = u'i want to order chinese from mainland china and pizza from domminos' + >>> entity_name = 'restaurant' + >>> structured_value = None + >>> fallback_value = None + >>> bot_message = None + >>> output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(output) + + [ + { + 'detection': 'message', + 'original_text': 'mainland china', + 'entity_value': {'value': u'Mainland China'} + }, + { + 'detection': 'message', + 'original_text': 'domminos', + 'entity_value': {'value': u"Domino's Pizza"} + } + ] + + + + >>> message = u'i wanted to watch movie' + >>> entity_name = 'movie' + >>> structured_value = u'inferno' + >>> fallback_value = None + >>> bot_message = None + >>> output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> 
fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(output) + + [ + { + 'detection': 'structure_value_verified', + 'original_text': 'inferno', + 'entity_value': {'value': u'Inferno'} + } + ] + + >>> message = u'i wanted to watch inferno' + >>> entity_name = 'movie' + >>> structured_value = u'delhi' + >>> fallback_value = None + >>> bot_message = None + >>> output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(output) + + [ + { + 'detection': 'message', + 'original_text': 'inferno', + 'entity_value': {'value': u'Inferno'} + } + ] + + --- Bulk detection + >>> message = [u'book a flight to mumbai', + u'i want to go to delhi from mumbai'] + >>> entity_name = 'city' + >>> output = get_text(message=message, + >>> entity_name=entity_name, + >>> structured_value=structured_value, + >>> fallback_value=fallback_value, + >>> bot_message=bot_message) + >>> print(output) + + [ + [ + { + 'detection': 'message', + 'entity_value': {'value': u'mumbai'}, + 'original_text': u'mumbai' + } + ], + [ + { + 'detection': 'message', + 'entity_value': {'value': u'New Delhi'}, + 'original_text': u'delhi' + }, + { + 'detection': 'message', + 'entity_value': {'value': u'mumbai'}, + 'original_text': u'mumbai' + } + ] + ] """ fuzziness = kwargs.get('fuzziness', None) @@ -213,10 +247,13 @@ def get_text(message, entity_name, structured_value, fallback_value, bot_message min_token_len_fuzziness = int(min_token_len_fuzziness) text_model_detector.set_min_token_size_for_levenshtein(min_size=min_token_len_fuzziness) - entity_output = text_model_detector.detect(message=message, - structured_value=structured_value, - fallback_value=fallback_value, - bot_message=bot_message) + if isinstance(message, six.string_types): + entity_output = text_model_detector.detect(message=message, + structured_value=structured_value, + fallback_value=fallback_value, + bot_message=bot_message) + elif isinstance(message, (list, tuple)): + entity_output = text_model_detector.detect_bulk(messages=message) return entity_output @@ -988,7 +1025,7 @@ def get_budget(message, entity_name, structured_value, fallback_value, bot_messa """ - budget_detection = BudgetDetector(entity_name=entity_name) + budget_detection = BudgetDetector(entity_name=entity_name, use_text_detection=True) if min_digit and max_digit: min_digit = int(min_digit) max_digit = int(max_digit) diff --git a/ner_v1/detectors/base_detector.py b/ner_v1/detectors/base_detector.py index 5abbd4f12..3c28db1ca 100644 --- a/ner_v1/detectors/base_detector.py +++ b/ner_v1/detectors/base_detector.py @@ -72,10 +72,52 @@ def _set_language_processing_script(self): raise NotImplementedError('Please enable translation or extend language support' 'for %s' % self._source_language_script) - def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs): + def detect_bulk(self, messages=None, **kwargs): """ Use detector to detect entities from text. It also translates query to language compatible to detector + Args: + messages (list of strings): list of natural text(s) on which detection logic is to be run. 
+ Returns: + dict or None: dictionary containing entity_value, original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + + Example: + 1) Consider an example of restaurant detection from a message + + messages = ['i want to order chinese from mainland china and pizza from domminos'] + output = detect(message=message) + print output + >> [[{'detection': 'message', 'original_text': 'mainland china', 'entity_value': + {'value': u'Mainland China'}}, {'detection': 'message', 'original_text': 'domminos', + 'entity_value': {'value': u"Domino's Pizza"}}]] + """ + if messages is None: + messages = [] + if self._source_language_script != self._target_language_script and self._translation_enabled: + translation_output_list = [ + translate_text(message_, self._source_language_script, self._target_language_script) + for message_ in messages] + + messages = [] + for translation_output in translation_output_list: + messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') + + texts = messages + entities_list, original_texts_list = self.detect_entity_bulk(texts=texts) + + if entities_list: + values_list, method, original_texts_list = entities_list, FROM_MESSAGE, original_texts_list + else: + return None + + return self.output_entity_bulk(entity_values_list=values_list, original_texts_list=original_texts_list, + detection_method=method, + detection_language=self._target_language_script) + + def detect(self, message=None, structured_value=None, fallback_value=None, **kwargs): + """ + Use detector to detect entities from text. It also translates query to language compatible to detector Args: message (str): natural text on which detection logic is to be run. Note if structured value is detection is run on structured value instead of message @@ -85,14 +127,11 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa fallback_value (str): If the detection logic fails to detect any value either from structured_value or message then we return a fallback_value as an output. bot_message (str): previous message from a bot/agent. 
- Returns: dict or None: dictionary containing entity_value, original_text and detection; entity_value is in itself a dict with its keys varying from entity to entity - Example: 1) Consider an example of restaurant detection from a message - message = 'i want to order chinese from mainland china and pizza from domminos' structured_value = None fallback_value = None @@ -100,13 +139,12 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa output = detect(message=message, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print output - + >> [{'detection': 'message', 'original_text': 'mainland china', 'entity_value': {'value': u'Mainland China'}}, {'detection': 'message', 'original_text': 'domminos', 'entity_value': {'value': u"Domino's Pizza"}}] - 2) Consider an example of movie name detection from a structured value - + message = 'i wanted to watch movie' entity_name = 'movie' structured_value = 'inferno' @@ -115,10 +153,9 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa output = get_text(message=message, entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print output - + >> [{'detection': 'structure_value_verified', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] - 3) Consider an example of movie name detection from a message message = 'i wanted to watch inferno' entity_name = 'movie' @@ -128,9 +165,9 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa output = get_text(message=message, entity_name=entity_name, structured_value=structured_value, fallback_value=fallback_value, bot_message=bot_message) print output - + >> [{'detection': 'message', 'original_text': 'inferno', 'entity_value': {'value': u'Inferno'}}] - + """ if self._source_language_script != self._target_language_script and self._translation_enabled: if structured_value: @@ -161,6 +198,77 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa return self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, detection_method=method, detection_language=self._target_language_script) + def output_entity_bulk(self, entity_values_list, original_texts_list, detection_method=None, + detection_method_list=None, detection_language=ENGLISH_LANG): + """ + Format detected entity values for bulk detection + Args: + entity_values_list (list of lists): containing list of entity values which are identified from given + detection logic + original_texts_list (list of lists): containing list original values or actual values from + messages which are identified + detection_method (str, optional): how the entity was detected + i.e. whether from message, structured_value + or fallback, verified from model or not. 
+ defaults to None + detection_method_list(list, optional): list containing how each entity was detected in the entity_value + list.If provided, this argument will be used over detection method + defaults to None + detection_language(str): ISO 639 code for language in which entity is detected + + Returns: + list of lists of dict: list of lists containing dictionaries, each containing entity_value, + original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + Example Output: + [ + [ + { + "entity_value": entity_value_1, + "detection": detection_method, + "original_text": original_text_1 + }, + { + "entity_value": entity_value_2, + "detection": detection_method, + "original_text": original_text_2 + } + + ], + [ + { + "entity_value": entity_value, + "detection": detection_method, + "original_text": original_text + } + ] + ] + """ + if detection_method_list is None: + detection_method_list = [] + if entity_values_list is None: + entity_values_list = [] + + bulk_detection_entity_list = [] + for index, entity_values in enumerate(entity_values_list): + entity_list = [] + for i, entity_value in enumerate(entity_values): + if type(entity_value) in [str, six.text_type]: + entity_value = { + ENTITY_VALUE_DICT_KEY: entity_value + } + method = detection_method_list[i] if detection_method_list else detection_method + entity_list.append( + { + ENTITY_VALUE: entity_value, + DETECTION_METHOD: method, + ORIGINAL_TEXT: original_texts_list[index][i], + DETECTION_LANGUAGE: detection_language + } + ) + bulk_detection_entity_list.append(entity_list) + return bulk_detection_entity_list + @staticmethod def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, detection_method_list=None, detection_language=ENGLISH_LANG): @@ -174,7 +282,7 @@ def output_entity_dict_list(entity_value_list, original_text_list, detection_met i.e. whether from message, structured_value or fallback, verified from model or not. defaults to None - detection_method_list(list, optional): list containing how each entity was detected in the entity_value list. + detection_method_list(list, optional): list containing detection method of entity the entity_value list. if provided, this argument will be used over detection method defaults to None detection_language(str): ISO 639 code for language in which entity is detected diff --git a/ner_v1/detectors/numeral/budget/budget_detection.py b/ner_v1/detectors/numeral/budget/budget_detection.py index 827468e63..ed9883f4b 100644 --- a/ner_v1/detectors/numeral/budget/budget_detection.py +++ b/ner_v1/detectors/numeral/budget/budget_detection.py @@ -1,10 +1,9 @@ import re +from language_utilities.constant import ENGLISH_LANG from ner_v1.constant import BUDGET_TYPE_NORMAL, BUDGET_TYPE_TEXT -from lib.nlp.regexreplace import RegexReplace -from ner_v1.detectors.textual.text.text_detection import TextDetector from ner_v1.detectors.base_detector import BaseDetector -from language_utilities.constant import ENGLISH_LANG +from ner_v1.detectors.textual.text.text_detection import TextDetector class BudgetDetector(BaseDetector): @@ -72,16 +71,14 @@ class BudgetDetector(BaseDetector): budget: list of budgets detected original_budget_text: list to store substrings of the text detected as budget tag: entity_name prepended and appended with '__' - regex_object: regex object that is used to substitute k with 000 i.e. 
if text contains 2k then - it will be substituted as 2000 - text_detection_object: text detection object to detect text in Textual format Note: text and tagged_text will have a extra space prepended and appended after calling detect_entity(text) """ - def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False): + def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False, + use_text_detection=False): """Initializes a BudgetDetector object Args: @@ -101,26 +98,36 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation self.processed_text = '' self.budget = [] self.original_budget_text = [] - self.unit_present_list = ['k', 'l', 'm', 'c', 'h', 'th'] - regx_for_units = [(r'([\d,.]+)\s*k', 1000), - (r'([\d,.]+)\s*h', 1000), - (r'([\d,.]+)\s*th', 1000), - (r'([\d,.]+)\s*l', 100000), - (r'([\d,.]+)\s*lacs?', 100000), - (r'([\d,.]+)\s*lakh?', 100000), - (r'([\d,.]+)\s*lakhs?', 100000), - (r'([\d,.]+)\s*m', 1000000), - (r'([\d,.]+)\s*million', 1000000), - (r'([\d,.]+)\s*mill?', 1000000), - (r'([\d,.]+)\s*c', 10000000), - (r'([\d,.]+)\s*cro?', 10000000), - (r'([\d,.]+)\s*crore?', 10000000), - (r'([\d,.]+)\s*crores?', 10000000)] - self.regex_object = RegexReplace(regx_for_units) self.tag = '__' + self.entity_name + '__' - self.text_detection_object = TextDetector(entity_name=entity_name) - - def detect_entity(self, text): + self._use_text_detection = use_text_detection + + self._allowed_units = [ + (['k', 'ha?zaa?r', 'ha?ja?ar', 'thousa?nd'], 1000), + (['l', 'lacs?', 'lakh?s?', 'lakhs'], 100000), + (['m', 'mn', 'million', 'mill?'], 1000000), + (['c', 'cro?', 'crore?s?'], 10000000), + ] + + units = [] + for _units, scale in self._allowed_units: + units.extend(_units) + units.sort(key=lambda unit: len(unit), reverse=True) + + digits_pattern = r'((?:\d+(?:\,\d+)*(?:\.\d+)?)|(?:(?:\d+(?:\,\d+)*)?(?:\.\d+)))' + units_pattern = r'({})?'.format('|'.join(units)) + self._budget_pattern = r'(?:rs\.|rs|rupees|rupee)?' \ + r'\s*{}\s*{}\s*' \ + r'(?:rs\.|rs|rupees|rupee)?'.format(digits_pattern, units_pattern) + + def get_scale(self, unit): + if unit: + for _units, scale in self._allowed_units: + if re.search('|'.join(_units), unit): + return scale + + return 1 + + def detect_entity(self, text, **kwargs): """Detects budget in the text string Args: @@ -141,10 +148,8 @@ def detect_entity(self, text): self.text = ' ' + text + ' ' self.processed_text = self.text.lower() self.tagged_text = self.text - budget_data = self._detect_budget() - self.budget = budget_data[0] - self.original_budget_text = budget_data[1] - return budget_data + self.budget, self.original_budget_text = self._detect_budget() + return self.budget, self.original_budget_text @property def supported_languages(self): @@ -173,7 +178,7 @@ def _detect_budget(self): self._update_processed_text(original_list) budget_list, original_list = self._detect_any_budget(budget_list, original_list) self._update_processed_text(original_list) - if not budget_list: + if not budget_list and self._use_text_detection: budget_list, original_list = self._detect_text_budget(budget_list, original_list) self._update_processed_text(original_list) @@ -197,29 +202,30 @@ def _detect_min_budget(self, budget_list=None, original_list=None): budget_list = [] if original_list is None: original_list = [] - patterns = re.findall( - r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? 
den|\>\s*\=?)\s+' - r'(rs.|rs|rupees|rupee)*\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*' - r'(rs.|rs|rupees|rupee|\.)?\s)', self.processed_text.lower()) - for pattern in patterns: - original = pattern[0].strip() + pattern = re.compile(r'\s(' + r'(?:above|more? than|more?|at ?least|greater than|greater|abv|abov|more? den|\>\s*\=?)' + r'\s+' + + self._budget_pattern + + r')(?:\b|\.|\s)', flags=re.UNICODE | re.IGNORECASE) + + for match in pattern.finditer(self.processed_text): + original, amount, unit = match.groups() budget = { 'min_budget': 0, 'max_budget': 0, 'type': BUDGET_TYPE_NORMAL } - if any([unit in pattern[3] for unit in self.unit_present_list]): - replace_comma = re.sub(',', '', pattern[3]) - amount = int(self.regex_object.unit_substitute(replace_comma)) - else: - replace_comma = re.sub(',', '', pattern[3]) - amount = int(replace_comma) + scale = self.get_scale(unit) - if self.min_digit <= len(str(amount)) <= self.max_digit: - budget['min_budget'] = amount - budget_list.append(budget) - original_list.append(original) + if amount.replace(',', '').replace('.', '').isdigit(): + amount = float(amount.replace(',', '')) * scale + + amount = int(amount) # casting to int for backward compatibility + if self.min_digit <= len(str(amount)) <= self.max_digit: + budget['min_budget'] = amount + budget_list.append(budget) + original_list.append(original.strip()) return budget_list, original_list @@ -242,12 +248,14 @@ def _detect_max_budget(self, budget_list=None, original_list=None): if original_list is None: original_list = [] - patterns = re.findall( - r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)' - r'?\s*([\d.,]+\s*[klmct]?[a-z]*|[\d.,]+\s*[klmct]?[a-z]*)\s*(rs.|rs|rupees|rupee|\.)?\s)', - self.processed_text.lower()) - for pattern in patterns: - original = pattern[0].strip() + pattern = re.compile(r'\s(' + r'(?:max|upto|o?nly|around|below|at ?most|less than|less|less den|\<\s*\=?)' + r'\s+' + + self._budget_pattern + + r')(?:\b|\.|\s)', flags=re.UNICODE | re.IGNORECASE) + + for match in pattern.finditer(self.processed_text): + original, amount, unit = match.groups() budget = { 'min_budget': 0, @@ -255,17 +263,16 @@ def _detect_max_budget(self, budget_list=None, original_list=None): 'type': BUDGET_TYPE_NORMAL } - if any([unit in pattern[3] for unit in self.unit_present_list]): - comma_removed_unit_text = pattern[3].replace(',', '') - amount = int(self.regex_object.unit_substitute(comma_removed_unit_text)) - else: - comma_removed_number = pattern[3].replace(',', '') - amount = int(comma_removed_number) + scale = self.get_scale(unit) - if self.min_digit <= len(str(amount)) <= self.max_digit: - budget['max_budget'] = amount - budget_list.append(budget) - original_list.append(original) + if amount.replace(',', '').replace('.', '').isdigit(): + amount = float(amount.replace(',', '')) * scale + + amount = int(amount) # casting to int for backward compatibility + if self.min_digit <= len(str(amount)) <= self.max_digit: + budget['max_budget'] = amount + budget_list.append(budget) + original_list.append(original.strip()) return budget_list, original_list @@ -287,58 +294,46 @@ def _detect_min_max_budget(self, budget_list=None, original_list=None): if original_list is None: original_list = [] - patterns = re.findall(r'(\s(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\s*(\-|to|and)\s*' - r'(([\d,.]+\s*[klmct]?[a-z]*)|([\d,.]+\s*[klmct]?[a-z]*))\.?\s)', - self.processed_text.lower()) - for pattern in patterns: - original = None - pattern 
= list(pattern) + pattern = re.compile(r'\s(' + + self._budget_pattern + + r'\s*(?:\-|to|and|till)\s*' + + self._budget_pattern + + r')(?:\b|\.|\s)', flags=re.UNICODE | re.IGNORECASE) + + for match in pattern.finditer(self.processed_text): + original, min_budget, min_unit, max_budget, max_unit = match.groups() + budget = { 'min_budget': 0, 'max_budget': 0, 'type': BUDGET_TYPE_NORMAL } - flag_contains_k = False - max_budget = 0 - min_budget = 0 - _min_budget = 0 - if pattern[6]: - if any([unit in pattern[6] for unit in self.unit_present_list]): - flag_contains_k = True - else: - flag_contains_k = False - comma_removed_unit_text = pattern[6].replace(',', '') - max_budget = int(self.regex_object.unit_substitute(comma_removed_unit_text)) - elif pattern[7]: - comma_removed_number = pattern[7].replace(',', '') - max_budget = int(comma_removed_number) + min_budget_scale = self.get_scale(min_unit) + max_budget_scale = self.get_scale(max_unit) + + if min_budget.replace(',', '').replace('.', '').isdigit(): + min_budget = float(min_budget.replace(',', '')) * min_budget_scale + else: min_budget = 0 - if pattern[2]: - _comma_removed_unit_text = pattern[2].replace(',', '') - _min_budget = int(self.regex_object.unit_substitute(_comma_removed_unit_text)) - if flag_contains_k: - for u in self.unit_present_list: - if u in pattern[6]: - pattern[2] = str(pattern[2]).strip() + u - break - comma_removed_unit_text = pattern[2].replace(',', '') - min_budget = int(self.regex_object.unit_substitute(comma_removed_unit_text)) - elif pattern[3]: - comma_removed_number = pattern[3].replace(',', '') - min_budget = int(comma_removed_number) - if min_budget > max_budget: - min_budget = _min_budget + if max_budget.replace(',', '').replace('.', '').isdigit(): + max_budget = float(max_budget.replace(',', '')) * max_budget_scale + else: + max_budget = 0 + + min_budget = int(min_budget) + max_budget = int(max_budget) + min_budget = min_budget if self.min_digit <= len(str(min_budget)) <= self.max_digit else 0 max_budget = max_budget if self.min_digit <= len(str(max_budget)) <= self.max_digit else 0 + if min_budget != 0 and max_budget != 0 and min_budget <= max_budget: - original = pattern[0].strip() budget['min_budget'] = min_budget budget['max_budget'] = max_budget - budget_list.append(budget) - original_list.append(original) + original_list.append(original.strip()) + return budget_list, original_list def _detect_any_budget(self, budget_list=None, original_list=None): @@ -360,40 +355,29 @@ def _detect_any_budget(self, budget_list=None, original_list=None): if original_list is None: original_list = [] - text = self.processed_text.lower().strip() - - units_patterns = [r'k|hazaa?r|haja?ar|thousand', r'l|lacs?|lakh?|lakhs?', - r'm|million|mill?', r'cro?|cror?|crore?|crores?'] - units_order = [1e3, 1e5, 1e6, 1e7] - full = re.compile(r'((rs.|rs|rupees|rupee)?\s*((\d+((\,|\.)\d+)+)|(0|[1-9]\d*)?(\.\d+)?(?<=\d))' - r'\s*(' + r'|'.join(units_patterns) + r')?\s*(rs.|rs|rupees|rupee)?)\b') - units_patterns = map(lambda s: '^' + s, units_patterns) - units_patterns = map(re.compile, units_patterns) - matches = full.findall(text) - for match in matches: - original = match[0].strip() + pattern = re.compile(r'\s(' + + self._budget_pattern + + r')(?:\b|\.|\s)', flags=re.UNICODE | re.IGNORECASE) + + for match in pattern.finditer(self.processed_text): + original, amount, unit = match.groups() budget = { 'min_budget': 0, 'max_budget': 0, 'type': BUDGET_TYPE_NORMAL } - amount, unit = match[2], match[-2] - if not amount: - continue - amount = 
amount.replace(',', '') - _amount = amount.split('.') - if len(_amount) > 1: - amount = ''.join(_amount[:-1]) + '.' + _amount[-1] - amount = float(amount) - for i, pattern in enumerate(units_patterns): - if pattern.findall(unit): - amount = int(amount * units_order[i]) - break - amount = int(amount) - if self.min_digit <= len(str(amount)) <= self.max_digit: - budget['max_budget'] = amount - budget_list.append(budget) - original_list.append(original) + + scale = self.get_scale(unit) + + if amount.replace(',', '').replace('.', '').isdigit(): + amount = float(amount.replace(',', '')) * scale + + amount = int(amount) # casting to int for backward compatibility + + if self.min_digit <= len(str(amount)) <= self.max_digit: + budget['max_budget'] = amount + budget_list.append(budget) + original_list.append(original.strip()) return budget_list, original_list @@ -411,11 +395,13 @@ def _detect_text_budget(self, budget_list=None, original_list=None): if original_list is None: original_list = [] - budget_text_list, original_text_list = self.text_detection_object.detect_entity(self.text) - self.tagged_text = self.text_detection_object.tagged_text - self.processed_text = self.text_detection_object.processed_text - count = 0 - while count < len(original_text_list): + text_detection_object = TextDetector(entity_name=self.entity_name) + + budget_text_list, original_text_list = text_detection_object.detect_entity(self.text) + # FIXME: Broken/Ineffective code. + self.tagged_text = text_detection_object.tagged_text + self.processed_text = text_detection_object.processed_text + for _, original_text in zip(budget_text_list, original_text_list): budget = { 'min_budget': 0, 'max_budget': 0, @@ -423,7 +409,7 @@ def _detect_text_budget(self, budget_list=None, original_list=None): } budget_list.append(budget) - count += 1 + original_list.append(original_text) return budget_list, original_list diff --git a/ner_v1/detectors/textual/name/name_detection.py b/ner_v1/detectors/textual/name/name_detection.py index 1d9317c05..5e65dff0d 100644 --- a/ner_v1/detectors/textual/name/name_detection.py +++ b/ner_v1/detectors/textual/name/name_detection.py @@ -1,5 +1,7 @@ # coding=utf-8 import re +import string + from language_utilities.constant import ENGLISH_LANG, HINDI_LANG from lib.nlp.const import nltk_tokenizer from lib.nlp.pos import POS @@ -8,7 +10,7 @@ HINDI_STOPWORDS, NAME_VARIATIONS, COMMON_HINDI_WORDS_OCCURING_WITH_NAME) from ner_v1.detectors.textual.text.text_detection import TextDetector -import string + # TODO: Refactor this module for readability and useability. 
Remove any hacks # TODO: Make this module python 3 compatible @@ -112,7 +114,8 @@ def get_name_using_pos_tagger(self, text): pattern1 = re.compile(r"name\s*(is|)\s*([\w\s]+)") pattern2 = re.compile(r"myself\s+([\w\s]+)") pattern3 = re.compile(r"call\s+me\s+([\w\s]+)") - name_tokens = text.split(' ') + name_tokens = text.split() + # Passing empty tokens to tag will cause IndexError tagged_names = pos_tagger_object.tag(name_tokens) pattern1_match = pattern1.findall(text) pattern2_match = pattern2.findall(text) diff --git a/ner_v1/detectors/textual/text/text_detection.py b/ner_v1/detectors/textual/text/text_detection.py index 765a620ef..3dee92a96 100644 --- a/ner_v1/detectors/textual/text/text_detection.py +++ b/ner_v1/detectors/textual/text/text_detection.py @@ -21,10 +21,8 @@ class TextDetector(BaseDetector): Attributes: - text (str): string to extract entities from entity_name (str): string by which the detected time entities would be replaced with on calling detect_entity() - text_dict (dict): dictionary to store lemmas, stems, ngrams used during detection process _fuzziness (str or int): If this parameter is str, elasticsearch's auto is used with low and high term distances. Default low and high term distances are 3 and 6 for elasticsearch. For this module they are set to 4 and 7 respectively. @@ -34,8 +32,6 @@ class TextDetector(BaseDetector): _min_token_size_for_fuzziness (int): minimum number of letters a word must have to be considered for calculating edit distance with similar ngrams from the datastore tagged_text (str): string with time entities replaced with tag defined by entity_name - text_entity_values (list): list to store detected entities from the text - original_texts (list): list of substrings of the text detected as entities processed_text (str): string with detected text entities removed tag (str): entity_name prepended and appended with '__' """ @@ -58,13 +54,12 @@ def __init__(self, entity_name=None, source_language_script=lang_constant.ENGLIS lang_constant.GUJARATI_LANG, # Added temporarily till text detection is ported to v2 api ] super(TextDetector, self).__init__(source_language_script, translation_enabled) - - self.text = None - self.text_dict = {} self.tagged_text = None - self.text_entity_values = [] - self.original_texts = [] self.processed_text = None + self.__texts = [] + self.__tagged_texts = [] + self.__processed_texts = [] + self.entity_name = entity_name self.tag = '__' + self.entity_name + '__' @@ -154,20 +149,24 @@ def set_min_token_size_for_levenshtein(self, min_size): """ self._min_token_size_for_fuzziness = min_size - def _process_text(self, text): - self.text = text.lower() - if isinstance(self.text, bytes): - self.text = self.text.decode('utf-8') + def _process_text(self, texts): + text_lowercase = [text.lower() for text in texts] + + for text in text_lowercase: + if isinstance(text, bytes): + self.__texts.append(text.decode('utf-8')) + else: + self.__texts.append(text) - self.processed_text = self.text + self.__processed_texts = self.__texts # Note: following rules have been disabled because cause problem with generating original text - # regx_to_process = RegexReplace([(r'[\'\/]', r''), (r'\s+', r' ')]) + # regex_to_process = RegexReplace([(r'[\'\/]', r''), (r'\s+', r' ')]) # self.processed_text = self.regx_to_process.text_substitute(self.processed_text) - self.processed_text = u' ' + self.processed_text + u' ' - self.tagged_text = self.processed_text + self.__processed_texts = [u' ' + processed_text + u' ' for processed_text in 
self.__processed_texts] + self.__tagged_texts = self.__processed_texts - def _get_substring_from_processed_text(self, matched_tokens): + def _get_substring_from_processed_text(self, text, matched_tokens): """ Get part of original text that was detected as some entity value. @@ -176,8 +175,10 @@ def _get_substring_from_processed_text(self, matched_tokens): Args: matched_tokens (list): list of tokens (usually tokens from fuzzy match results from ES) - to find as a contiguous substring in the processed text considering the effects + to find as a contiguous substring in the processed sentence considering the effects of tokenizer + text (string or unicode): sentence from self.processed_text from where indices of given token will be + given Returns: str or unicode: part of original text that corresponds to given tokens @@ -193,10 +194,10 @@ def _get_substring_from_processed_text(self, matched_tokens): Notice that & is dropped during tokenization but when finding original text, we recover it from processed text """ - def _get_tokens_and_indices(text): + def _get_tokens_and_indices(txt): """ Args: - text (str or unicode): text to get tokens from and indicies of those tokens in the given text + txt (str or unicode): text to get tokens from and indicies of those tokens in the given text Returns: tuple: @@ -210,7 +211,7 @@ def _get_tokens_and_indices(text): [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) """ - txt = text.rstrip() + ' __eos__' + txt = txt.rstrip() + ' __eos__' processed_text_tokens = TOKENIZER.tokenize(txt) processed_text_tokens_indices = [] @@ -242,17 +243,73 @@ def _get_tokens_and_indices(text): try: n = len(matched_tokens) - tokens, indices = _get_tokens_and_indices(self.processed_text) + tokens, indices = _get_tokens_and_indices(text) for i in range(len(tokens) - n + 1): if tokens[i:i + n] == matched_tokens: start = indices[i][0] end = indices[i + n - 1][1] - return self.processed_text[start:end] + return text[start:end] except (ValueError, IndexError): - ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, self.processed_text)) + ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, text)) return u' '.join(matched_tokens) + def detect_entity_bulk(self, texts, **kwargs): + """ + Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and + returns two lists of list of detected text entities and their corresponding original substrings + for each sentence in text respectively. + Note that datastore stores number of values under a entity_name and each entity_value has its own list of + variants, whenever a variant is matched sucessfully, the entity_value whose list the variant belongs to, + is returned. For more information on how data is stored, see Datastore docs. + + Args: + texts (list): list of strings(bulk detect) to extract textual entities from + **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. + Returns: + tuple: + list or list of lists(bulk detect): containing entity value as defined into datastore + list or list of lists(bulk detect): containing corresponding original substrings in text + Example: + DataStore().get_entity_dictionary('city') + + Output: + { + u'Agartala': [u'', u'Agartala'], + u'Barnala': [u'', u'Barnala'], + ... 
+ u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'], + u'hyderabad': [u'hyderabad'], + u'koramangala': [u'koramangala'] + } + text_detection = TextDetector('city') + list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year', + 'I live in Delhi] + + text_detection.detect_entity(list_of_sentences) + Output: + ( [ + [u'Chennai', u'New Delhi', u'chennai'], + [u'New Delhi'] + ], + [ + ['chennai', 'delhi', 'tamilnadu'], + [delhi] + ] + ) + + text_detection.tagged_text + Output: + [ + ' come to __city__, __city__, i will visit __city__ next year ', + ' i live in __city__ ' + ] + + """ + self._process_text(texts) + text_entity_values_list, original_texts_list = self._text_detection_with_variants() + return text_entity_values_list, original_texts_list + def detect_entity(self, text, **kwargs): """ Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and @@ -260,7 +317,6 @@ def detect_entity(self, text, **kwargs): Note that datastore stores number of values under a entity_name and each entity_value has its own list of variants, whenever a variant is matched sucessfully, the entity_value whose list the variant belongs to, is returned. For more information on how data is stored, see Datastore docs. - Args: text (unicode): string to extract textual entities from **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context. @@ -268,10 +324,8 @@ def detect_entity(self, text, **kwargs): tuple: list: containing entity value as defined into datastore list: containing corresponding original substrings in text - Example: DataStore().get_entity_dictionary('city') - Output: { u'Agartala': [u'', u'Agartala'], @@ -281,28 +335,23 @@ def detect_entity(self, text, **kwargs): u'hyderabad': [u'hyderabad'], u'koramangala': [u'koramangala'] } - text_detection = TextDetector('city') text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year') - Output: ([u'Chennai', u'New Delhi', u'chennai'], ['chennai', 'delhi', 'tamilnadu']) - text_detection.tagged_text - Output: ' come to __city__, __city__, i will visit __city__ next year ' - - Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes - respectively. """ - self._process_text(text) + texts = [text] + self._process_text(texts) + text_entity_values, original_texts = self._text_detection_with_variants() - values, original_texts = self._text_detection_with_variants() - - self.text_entity_values, self.original_texts = values, original_texts - - return self.text_entity_values, self.original_texts + if len(text_entity_values) > 0 and len(original_texts) > 0: + self.tagged_text = self.__tagged_texts[0] + self.processed_text = self.__processed_texts[0] + return text_entity_values[0], original_texts[0] + return [], [] def _text_detection_with_variants(self): """ @@ -313,61 +362,68 @@ def _text_detection_with_variants(self): Returns: tuple: - list: containing the detected text entities - list: containing their corresponding substrings in the original message. + list of lists: list of lists containing the detected text entities + list of lists: list of lists containing their corresponding substrings in the original message. 
""" - original_final_list = [] - value_final_list = [] - variants_to_values = collections.OrderedDict() - - text = u' '.join(TOKENIZER.tokenize(self.processed_text)) - _variants_to_values = self.db.get_similar_dictionary(entity_name=self.entity_name, - text=text, - fuzziness_threshold=self._fuzziness, - search_language_script=self._target_language_script) - for variant, value in iteritems(_variants_to_values): - variant = variant.lower() - if isinstance(variant, bytes): - variant = variant.decode('utf-8') - - variants_to_values[variant] = value - - variants_list = variants_to_values.keys() - - # Length based ordering, this reorders the results from datastore - # that are already sorted by some relevance scoring - - exact_matches, fuzzy_variants = [], [] - _text = u' '.join(TOKENIZER.tokenize(self.processed_text)) - for variant in variants_list: - if u' '.join(TOKENIZER.tokenize(variant)) in _text: - exact_matches.append(variant) - else: - fuzzy_variants.append(variant) - - exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) - fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) - variants_list = exact_matches + fuzzy_variants - - for variant in variants_list: - original_text = self._get_entity_substring_from_text(variant) - if original_text: - value_final_list.append(variants_to_values[variant]) - original_final_list.append(original_text) - _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE) - self.tagged_text = _pattern.sub(self.tag, self.tagged_text) - # Instead of dropping completely like in other entities, - # we replace with tag to avoid matching non contiguous segments - self.processed_text = _pattern.sub(self.tag, self.processed_text) - return value_final_list, original_final_list - - def _get_entity_substring_from_text(self, variant): + + original_final_list_ = [] + value_final_list_ = [] + texts = [u' '.join(TOKENIZER.tokenize(processed_text)) for processed_text in self.__processed_texts] + + _variants_to_values_list = self.db.get_similar_dictionary(entity_name=self.entity_name, + texts=texts, + fuzziness_threshold=self._fuzziness, + search_language_script=self._target_language_script) + for index, _variants_to_values in enumerate(_variants_to_values_list): + original_final_list = [] + value_final_list = [] + variants_to_values = collections.OrderedDict() + for variant, value in iteritems(_variants_to_values): + variant = variant.lower() + if isinstance(variant, bytes): + variant = variant.decode('utf-8') + + variants_to_values[variant] = value + variants_list = variants_to_values.keys() + + # Length based ordering, this reorders the results from datastore + # that are already sorted by some relevance scoring + + exact_matches, fuzzy_variants = [], [] + _text = texts + for variant in variants_list: + if u' '.join(TOKENIZER.tokenize(variant)) in _text[index]: + exact_matches.append(variant) + else: + fuzzy_variants.append(variant) + exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True) + variants_list = exact_matches + fuzzy_variants + + for variant in variants_list: + + original_text = self._get_entity_substring_from_text(self.__processed_texts[index], variant) + if original_text: + value_final_list.append(variants_to_values[variant]) + original_final_list.append(original_text) + _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE) + self.__tagged_texts[index] = _pattern.sub(self.tag, 
+                    # Instead of dropping completely like in other entities,
+                    # we replace with tag to avoid matching non contiguous segments
+                    self.__processed_texts[index] = _pattern.sub(self.tag, self.__processed_texts[index])
+            value_final_list_.append(value_final_list)
+            original_final_list_.append(original_final_list)
+
+        return value_final_list_, original_final_list_
+
+    def _get_entity_substring_from_text(self, text, variant):
         """
         Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
         and return the closest substring in the text that matches the variant
 
         Args:
             variant(str or unicode): string, ngram of variant to fuzzy detect in the text using Levenshtein distance
+            text(str or unicode): sentence (from self.__processed_texts) on which detection is being done
 
         Returns:
             str or unicode or None: part of the given text that was detected as entity given the variant,
@@ -383,14 +439,12 @@
             'delehi'
         """
-        text = self.processed_text
         variant_tokens = TOKENIZER.tokenize(variant)
         text_tokens = TOKENIZER.tokenize(text)
         original_text_tokens = []
         variant_token_i = 0
         for text_token in text_tokens:
             variant_token = variant_tokens[variant_token_i]
-
             same = variant_token == text_token
             ft = self._get_fuzziness_threshold_for_token(text_token)
             if same or (len(text_token) > self._min_token_size_for_fuzziness
@@ -400,7 +454,7 @@ def _get_entity_substring_from_text(self, variant):
                 original_text_tokens.append(text_token)
                 variant_token_i += 1
                 if variant_token_i == len(variant_tokens):
-                    return self._get_substring_from_processed_text(original_text_tokens)
+                    return self._get_substring_from_processed_text(text, original_text_tokens)
                 else:
                     original_text_tokens = []
                     variant_token_i = 0
diff --git a/ner_v1/detectors/textual/text/text_detection_model.py b/ner_v1/detectors/textual/text/text_detection_model.py
index d3e73c1b0..0356c4295 100644
--- a/ner_v1/detectors/textual/text/text_detection_model.py
+++ b/ner_v1/detectors/textual/text/text_detection_model.py
@@ -5,6 +5,7 @@
 from ner_constants import ENTITY_VALUE_DICT_KEY
 from ner_v1.constant import DATASTORE_VERIFIED, CRF_MODEL_VERIFIED
 from ner_v1.detectors.textual.text.text_detection import TextDetector
+import six
 
 
 class TextModelDetector(TextDetector):
@@ -47,20 +48,16 @@ def detect_entity(self, text, **kwargs):
         variants, whenever a variant is matched successfully, the entity_value whose list the variant belongs to,
         is returned. For more information on how data is stored, see Datastore docs.
         In addition to this method also runs the CRF MODEL if trained and provides the results for the given entity.
-
         Args:
             text (str or unicode): string to extract textual entities from
             **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context.
-
         Returns:
             tuple:
                 list: containing list of dicts with the source of detection
                 for the entity value and entity value as defined into datastore
                 list: containing corresponding original substrings in text
-
         Example:
             DataStore().get_entity_dictionary('city')
-
             Output:
                 {
                     u'Agartala': [u'', u'Agartala'],
@@ -70,21 +67,16 @@ def detect_entity(self, text, **kwargs):
                     u'hyderabad': [u'hyderabad'],
                     u'koramangala': [u'koramangala']
                 }
-
             text_detection = TextModelDetector('city')
             text_detection.detect_entity('Come to Chennai, TamilNadu, I will visit Delhi next year')
-
             Output:
                 ([{'datastore_verified': True,'crf_model_verified': True, 'value': u'Chennai'},
                   {'datastore_verified': True,'crf_model_verified': False, 'value': u'New Delhi'},
                   {'datastore_verified': False,'crf_model_verified': True, 'value': u'chennai'}]
                   , ['chennai', 'delhi', 'tamilnadu'])
-
             text_detection.tagged_text
-
             Output:
                 ' come to __city__, __city__, i will visit __city__ next year '
-
         Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes
         respectively.
         """
@@ -106,6 +98,79 @@ def detect_entity(self, text, **kwargs):
 
         return self.text_entity_values, self.original_texts
 
+    def detect_entity_bulk(self, texts, **kwargs):
+        """
+        Detects all textual entities in each of the given texts that are similar to variants of 'entity_name'
+        stored in the datastore and returns two lists of lists: the detected entities and their corresponding
+        original substrings, one inner list per sentence in texts.
+        The first list is a list of lists of dicts with the verification source and the values.
+        Note that the datastore stores a number of values under an entity_name and each entity_value has its own
+        list of variants; whenever a variant is matched successfully, the entity_value whose list the variant
+        belongs to is returned. For more information on how data is stored, see Datastore docs.
+        In addition, this method also runs the CRF MODEL if trained and provides the results for the given entity.
+
+        Args:
+            texts (list of strings): natural language sentence(s) to extract entities from
+            **kwargs: it can be used to send specific arguments in future. for example, fuzziness, previous context.
+
+        Returns:
+            tuple:
+                list of lists (bulk detect): containing list of dicts with the source of detection
+                for the entity value and entity value as defined into datastore
+
+                list of lists (bulk detect): containing corresponding original substrings in text
+
+        Example:
+            DataStore().get_entity_dictionary('city')
+
+            Output:
+                {
+                    u'Agartala': [u'', u'Agartala'],
+                    u'Barnala': [u'', u'Barnala'],
+                    ...
+                    u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'],
+                    u'hyderabad': [u'hyderabad'],
+                    u'koramangala': [u'koramangala']
+                }
+            text_detection = TextModelDetector('city')
+            list_of_sentences = ['Come to Chennai, TamilNadu, I will visit Delhi next year',
+                                 'I live in Delhi']
+
+            text_detection.detect_entity_bulk(list_of_sentences)
+            Output:
+                (
+                    [
+                        [u'Chennai', u'New Delhi', u'chennai'],
+                        [u'New Delhi']
+                    ],
+                    [
+                        ['chennai', 'delhi', 'tamilnadu'],
+                        ['delhi']
+                    ]
+                )
+
+            text_detection.tagged_text
+            Output:
+                [
+                    ' come to __city__, __city__, i will visit __city__ next year ',
+                    ' i live in __city__ '
+                ]
+
+        Additionally this function assigns these lists to self.text_entity_values and self.original_texts attributes
+        respectively.
+ """ + + crf_original_texts = [] + + values_list, original_texts_list = super(TextModelDetector, self).detect_entity_bulk(texts, **kwargs) + text_entity_values_list, original_texts_detected_list = [], [] + for inner_values, inner_original_texts in six.moves.zip(values_list, original_texts_list): + text_entity_verified_values, original_texts = \ + self.combine_results(values=inner_values, original_texts=inner_original_texts, + crf_original_texts=crf_original_texts) + text_entity_values_list.append(text_entity_verified_values) + original_texts_detected_list.append(original_texts) + return text_entity_values_list, original_texts_detected_list + def _add_verification_source(self, values, verification_source_dict): """ Add the verification source for the detected entities diff --git a/ner_v1/tests/__init__.py b/ner_v1/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v1/tests/numeral/__init__.py b/ner_v1/tests/numeral/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v1/tests/numeral/budget/__init__.py b/ner_v1/tests/numeral/budget/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ner_v1/tests/numeral/budget/test_budget_detection.py b/ner_v1/tests/numeral/budget/test_budget_detection.py new file mode 100644 index 000000000..c4a384896 --- /dev/null +++ b/ner_v1/tests/numeral/budget/test_budget_detection.py @@ -0,0 +1,130 @@ +from __future__ import absolute_import + +from django.test import TestCase + +from ner_v1.detectors.numeral.budget.budget_detection import BudgetDetector + + +class TestBudgetDetector(TestCase): + def setUp(self): + self.budget_detector = BudgetDetector(entity_name='budget') + self.budget_detector.set_min_max_digits(min_digit=1, max_digit=15) + + def make_budget_dict(self, min_budget=0, max_budget=0): + return {'min_budget': min_budget, 'max_budget': max_budget, 'type': 'normal_budget'} + + def test_min_max_digits_limits(self): + """ + Test min max digits limit + """ + self.budget_detector.set_min_max_digits(min_digit=2, max_digit=5) + + positive_tests = [ + 'Show products in 10,000 - 20,000 range', + 'This costs about 10 rs', + ] + + negative_tests = [ + 'my budget is .5cr', + 'Annual operating budget is 1.2cr', + 'Show me cars that cost less than 2.99mil', + 'Rs. 1 is the minimum denomination' + ] + + for test in positive_tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertTrue(original_texts) + + for test in negative_tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertFalse(original_texts) + + def test_max_budget(self): + """ + Test v1 max budget + """ + tests = [ + ('Show me cars that cost below rs. 5000', 0, 5000, 'below rs. 5000'), + ('Show me cars that cost less than 6k', 0, 6000, 'less than 6k'), + ('at most 30 rs.', 0, 30, 'at most 30 rs.'), + ('costs upto Rs.100', 0, 100, 'upto rs.100') + ] + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)]) + self.assertEqual(original_texts, [original_text]) + + def test_min_budget(self): + """ + Test v1 min budget + """ + tests = [ + ('Show me cars that cost above rs. 5000', 5000, 0, 'above rs. 
5000'), + ('Show me cars that cost more than 6k', 6000, 0, 'more than 6k'), + ('at least 30 rs.', 30, 0, 'at least 30 rs.'), + ('costs greater than Rs.100', 100, 0, 'greater than rs.100'), + ] + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(min_budget=min_budget)]) + self.assertEqual(original_texts, [original_text]) + + def test_range(self): + """ + Test v1 budget range + """ + tests = [ + ('Show products in 10,000 - 20,000 range', 10000, 20000, '10,000 - 20,000'), + ('Show products in 10,000-20,000 range', 10000, 20000, '10,000-20,000'), + ('Show products in 10,000 till Rs. 20k range', 10000, 20000, '10,000 till rs. 20k'), + ('Show products from rs. 5,5,00 to 6,0,0,0 rupees', 5500, 6000, 'rs. 5,5,00 to 6,0,0,0 rupees'), + ] + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(min_budget=min_budget, max_budget=max_budget)]) + self.assertEqual(original_texts, [original_text]) + + def test_any_budget(self): + """ + Test v1 budget any + """ + tests = [ + ('.5cr', 0, 5000000, '.5cr'), + ('1.2cr', 0, 12000000, '1.2cr'), + ('1.5 thousand', 0, 1500, '1.5 thousand'), + ('5 hazar', 0, 5000, '5 hazar'), + ('10 rs', 0, 10, '10 rs'), + ] + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)]) + self.assertEqual(original_texts, [original_text]) + + def test_not_budgets(self): + """ + Test sentences that do not have any budget + """ + tests = [ + 'I want to buy 5liters of milk', + 'Your flight number is 9w998', + 'hello, your coupon code is Amazon50', + 'hello, your coupon code is 50Amazon', + 'the insect is 120millimeters tall' + ] + + for test in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, []) + self.assertEqual(original_texts, []) + + def test_budgets_without_scales(self): + tests = [ + ('I want to buy 5 liters of milk', 0, 5, '5'), + ('the insect is 120 millimeters tall', 0, 120, '120'), + ('hello, your coupon code is 50 Amazon', 0, 50, '50'), + ('Your flight number is 9w 998', 0, 998, '998'), + ] + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)]) + self.assertEqual(original_texts, [original_text]) diff --git a/ner_v2/detectors/numeral/number/en/data/units.csv b/ner_v2/detectors/numeral/number/en/data/units.csv index 5d9fde06f..f53b73cf3 100644 --- a/ner_v2/detectors/numeral/number/en/data/units.csv +++ b/ner_v2/detectors/numeral/number/en/data/units.csv @@ -1,9 +1,9 @@ unit_type,unit_value,unit_variants currency,rupees,rupees | rupee | rs | rupya | rupaya | rupaye | rupye | rupay | paisa | paise | inr | ₹ currency,dollar,Dollar | usd | $ -package_metric_unit,mg,mg | milligram | milligrams -package_metric_unit,gms,gms | grams | gram -package_metric_unit,kg,kilogram | kilograms | kg -package_metric_unit,ml,ml | milliliter -package_metric_unit,ltr,ltr | litre -package_metric_unit,pcs,pcs \ No newline at end of file +package_metric_unit,mg,mg | milligram | milligrams | mgs +package_metric_unit,gms,gms | grams | gram | 
+package_metric_unit,kg,kilogram | kilograms | kg | kilo | kgs
+package_metric_unit,ml,ml | milliliter | millilitre | milliliters | millilitres
+package_metric_unit,ltr,ltr | litre | liter | litres | liters | l
+package_metric_unit,pcs,pcs | pc | pieces | piece
\ No newline at end of file
diff --git a/ner_v2/detectors/numeral/number/standard_number_detector.py b/ner_v2/detectors/numeral/number/standard_number_detector.py
index befedea22..fc69f4a4a 100644
--- a/ner_v2/detectors/numeral/number/standard_number_detector.py
+++ b/ner_v2/detectors/numeral/number/standard_number_detector.py
@@ -147,10 +147,13 @@ def _get_unit_from_text(self, detected_original, processed_text):
         if not self.units_map:
             return unit, original_text
 
+        processed_text = " " + processed_text.strip() + " "
+
         # add re.escape to handle decimal cases in detected original
         detected_original = re.escape(detected_original)
-        unit_matches = re.search(r'((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')|(' +
-                                 detected_original + r'\s*(' + self.unit_choices + r'))', processed_text, re.UNICODE)
+        unit_matches = re.search(r'\W+((' + self.unit_choices + r')[\.\,\s]*' + detected_original + r')|(' +
+                                 detected_original + r'\s*(' + self.unit_choices + r'))\W+', processed_text,
+                                 re.UNICODE)
         if unit_matches:
             original_text_prefix, unit_prefix, original_text_suffix, unit_suffix = unit_matches.groups()
             if unit_suffix:
@@ -267,12 +270,12 @@ def _detect_number_from_digit(self, number_list=None, original_list=None):
             patterns = regex_numeric_patterns.findall(processed_text)
             for pattern in patterns:
                 number, scale, original_text = None, None, None
-                if pattern[1].replace(',', ''):
+                if pattern[1] and pattern[1].replace(',', '').replace('.', '').isdigit():
                     number = pattern[1].replace(',', '')
                     original_text = pattern[0].strip()
                     scale = self.scale_map[pattern[2].strip()]
 
-                elif pattern[3].replace(',', ''):
+                elif pattern[3] and pattern[3].replace(',', '').replace('.', '').isdigit():
                     number = pattern[3].replace(',', '')
                     original_text = pattern[3].strip()
                     scale = 1
diff --git a/ner_v2/detectors/temporal/date/mr/data/datetime_diff_constant.csv b/ner_v2/detectors/temporal/date/mr/data/datetime_diff_constant.csv
index a38be3553..1cc326299 100644
--- a/ner_v2/detectors/temporal/date/mr/data/datetime_diff_constant.csv
+++ b/ner_v2/detectors/temporal/date/mr/data/datetime_diff_constant.csv
@@ -3,7 +3,7 @@ key,present_in_start,adding_magnitude,datetime_type
 हे|He,1,0,add_diff_datetime
 पूर्वी|अगोदर|गेल्या|Gelya|Purvi|Porvi|Agodar,0,-1,add_diff_datetime
 अंतिम|शेवट|शेवटी|Antim|shevat|shewat|shevati|shewati,1,-1,add_diff_datetime
-पुढील|पुढे|पुढच्या|पुढचा|Pudcha|Pudhil|Pudhe|Pudhchya,1,1,add_diff_datetime
+पुढील|पुढे|पुढच्या|पुढचा|पुढल्या|Pudcha|Pudhil|Pudhe|Pudhchya|pudhalya,1,1,add_diff_datetime
 मी|Mi|me,0,1,add_diff_datetime
 सवा|sawa|sava,1,0.25,ref_datetime
 पौने|paune,1,-0.25,ref_datetime
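
Usage sketch for the bulk detection flow added above. This is an illustrative driver script, not part of the patch: it assumes a 'city' entity has already been populated in the datastore, and the values it would print simply mirror the docstring examples.

    # Illustrative only -- actual output depends on the entity data loaded into the datastore.
    from ner_v1.detectors.textual.text.text_detection import TextDetector

    detector = TextDetector('city')
    sentences = [
        'Come to Chennai, TamilNadu, I will visit Delhi next year',
        'I live in Delhi',
    ]

    # One inner list per input sentence: detected entity values and the
    # original substrings they were matched from.
    values_per_sentence, originals_per_sentence = detector.detect_entity_bulk(sentences)

    for sentence, values, originals in zip(sentences, values_per_sentence, originals_per_sentence):
        print(sentence, values, originals)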
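
A standalone illustration of the stricter numeric guard used in _detect_number_from_digit above: a captured group is only treated as a number when, after stripping commas and a decimal point, only digits remain. The helper name below is hypothetical and not part of the detector.

    # Hypothetical helper mirroring the guard `pattern and pattern.replace(',', '').replace('.', '').isdigit()`
    def looks_like_number(token):
        return bool(token) and token.replace(',', '').replace('.', '').isdigit()

    print(looks_like_number('10,000'))  # True  -> '10000'.isdigit()
    print(looks_like_number('1.2'))     # True  -> '12'.isdigit()
    print(looks_like_number('9w998'))   # False -> letters remain after stripping
    print(looks_like_number(''))        # False -> empty capture group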