diff --git a/chatbot_ner/settings.py b/chatbot_ner/settings.py index 86808de55..1f565f11f 100755 --- a/chatbot_ner/settings.py +++ b/chatbot_ner/settings.py @@ -94,6 +94,24 @@ def __getitem__(self, item): TEST_RUNNER = 'django_nose.NoseTestSuiteRunner' +NOSE_ARGS = [ + '--nocapture', + '--nologcapture', + '--verbosity=3', + '--ignore-files=urls.py', + '--ignore-files=wsgi.py', + '--ignore-files=manage.py', + '--ignore-files=initial_setup.py', + '--ignore-files=__init__.py', + '--ignore-files=const.py', + '--ignore-files=constant.py', + '--ignore-files=constants.py', + '--ignore-files=start_server.sh', + '--ignore-files=settings.py', + '--exclude-dir=docs/', + '--exclude-dir=docker/', + '--exclude-dir=data/', +] # Internationalization # https://docs.djangoproject.com/en/1.11/topics/i18n/ diff --git a/chatbot_ner/urls.py b/chatbot_ner/urls.py index a9550a7f2..fcc96376a 100755 --- a/chatbot_ner/urls.py +++ b/chatbot_ner/urls.py @@ -35,6 +35,13 @@ url(r'^v2/phone_number/$', api_v2.phone_number), url(r'^v2/number_range/$', api_v2.number_range), + # V2 bulk detectors + url(r'^v2/date_bulk/$', api_v2.date), + url(r'^v2/time_bulk/$', api_v2.time), + url(r'^v2/number_bulk/$', api_v2.number), + url(r'^v2/number_range_bulk/$', api_v2.number_range), + url(r'^v2/phone_number_bulk/$', api_v2.phone_number), + # Dictionary Read Write url(r'^entities/get_entity_word_variants', external_api.get_entity_word_variants), url(r'^entities/update_dictionary', external_api.update_dictionary), diff --git a/models/crf_v2/README.md b/models/crf_v2/README.md index 9ba642014..2b08170af 100644 --- a/models/crf_v2/README.md +++ b/models/crf_v2/README.md @@ -53,7 +53,7 @@ file_handler = open('glove_vectors', 'wb') pickle.dump(obj=word_vectors.wv.vectors, file=file_handler, protocol=2) if not os.path.exists('/app/models_crf/'): -os.makedirs('/app/models_crf/') + os.makedirs('/app/models_crf/') ``` @@ -167,7 +167,7 @@ The module is used to take input as the sentence_list and entity_list and conver ```python from models.crf_v2.crf_preprocess_data import CrfPreprocessData docs['word_embeddings'] = - CrfPreprocessData.word_embeddings(processed_pos_tag_data=each, + [CrfPreprocessData.word_embeddings(processed_pos_tag_data=each, vocab=vocab, word_vectors=word_vectors) for each in docs[SENTENCE_LIST]] ``` diff --git a/ner_v1/detectors/numeral/budget/budget_detection.py b/ner_v1/detectors/numeral/budget/budget_detection.py index ed9883f4b..e0d97f04c 100644 --- a/ner_v1/detectors/numeral/budget/budget_detection.py +++ b/ner_v1/detectors/numeral/budget/budget_detection.py @@ -77,6 +77,24 @@ class BudgetDetector(BaseDetector): """ + _scale_patterns = { + 'k': 1000, + 'ha?zaa?r': 1000, + 'ha?ja?ar': 1000, + 'thousa?nd': 1000, + 'l': 100000, + 'lacs?': 100000, + 'lakh?s?': 100000, + 'lakhs': 100000, + 'm': 1000000, + 'mn': 1000000, + 'million': 1000000, + 'mill?': 1000000, + 'c': 10000000, + 'cro?': 10000000, + 'crore?s?': 10000000, + } + def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation_enabled=False, use_text_detection=False): """Initializes a BudgetDetector object @@ -101,18 +119,10 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation self.tag = '__' + self.entity_name + '__' self._use_text_detection = use_text_detection - self._allowed_units = [ - (['k', 'ha?zaa?r', 'ha?ja?ar', 'thousa?nd'], 1000), - (['l', 'lacs?', 'lakh?s?', 'lakhs'], 100000), - (['m', 'mn', 'million', 'mill?'], 1000000), - (['c', 'cro?', 'crore?s?'], 10000000), - ] - - units = [] - for _units, scale in self._allowed_units: - units.extend(_units) - units.sort(key=lambda unit: len(unit), reverse=True) - + units, scales = zip(*sorted( + list(BudgetDetector._scale_patterns.items()), key=lambda pattern_scale: len(pattern_scale[0]), reverse=True + )) + self._scale_compiled_patterns = [(scale, re.compile(unit)) for scale, unit in zip(scales, units)] digits_pattern = r'((?:\d+(?:\,\d+)*(?:\.\d+)?)|(?:(?:\d+(?:\,\d+)*)?(?:\.\d+)))' units_pattern = r'({})?'.format('|'.join(units)) self._budget_pattern = r'(?:rs\.|rs|rupees|rupee)?' \ @@ -121,8 +131,8 @@ def __init__(self, entity_name, source_language_script=ENGLISH_LANG, translation def get_scale(self, unit): if unit: - for _units, scale in self._allowed_units: - if re.search('|'.join(_units), unit): + for scale, pattern in self._scale_compiled_patterns: + if pattern.search(unit): return scale return 1 diff --git a/ner_v1/tests/numeral/budget/test_budget_detection.py b/ner_v1/tests/numeral/budget/test_budget_detection.py index c4a384896..7e47a334c 100644 --- a/ner_v1/tests/numeral/budget/test_budget_detection.py +++ b/ner_v1/tests/numeral/budget/test_budget_detection.py @@ -10,7 +10,8 @@ def setUp(self): self.budget_detector = BudgetDetector(entity_name='budget') self.budget_detector.set_min_max_digits(min_digit=1, max_digit=15) - def make_budget_dict(self, min_budget=0, max_budget=0): + @staticmethod + def make_budget_dict(min_budget=0, max_budget=0): return {'min_budget': min_budget, 'max_budget': max_budget, 'type': 'normal_budget'} def test_min_max_digits_limits(self): @@ -118,6 +119,9 @@ def test_not_budgets(self): self.assertEqual(original_texts, []) def test_budgets_without_scales(self): + """ + Test budgets without scales + """ tests = [ ('I want to buy 5 liters of milk', 0, 5, '5'), ('the insect is 120 millimeters tall', 0, 120, '120'), @@ -128,3 +132,39 @@ def test_budgets_without_scales(self): budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)]) self.assertEqual(original_texts, [original_text]) + + def test_all_budget_scales(self): + """ + Test all supported budget scales + """ + tests = [ + ('2k', 0, 2000, '2k'), + ('2 thousand', 0, 2000, '2 thousand'), + ('2 hazar', 0, 2000, '2 hazar'), + ('2 hazaar', 0, 2000, '2 hazaar'), + ('2 hajar', 0, 2000, '2 hajar'), + ('2 hajaar', 0, 2000, '2 hajaar'), + ('2l', 0, 200000, '2l'), + ('2 lac', 0, 200000, '2 lac'), + ('2 lacs', 0, 200000, '2 lacs'), + ('2 lak', 0, 200000, '2 lak'), + ('2 laks', 0, 200000, '2 laks'), + ('2 lakh', 0, 200000, '2 lakh'), + ('2 lakhs', 0, 200000, '2 lakhs'), + ('2m', 0, 2000000, '2m'), + ('2mn', 0, 2000000, '2mn'), + ('2 mil', 0, 2000000, '2 mil'), + ('2 mill', 0, 2000000, '2 mill'), + ('2 million', 0, 2000000, '2 million'), + ('2c', 0, 20000000, '2c'), + ('2 cr', 0, 20000000, '2 cr'), + ('2 cro', 0, 20000000, '2 cro'), + ('2 cror', 0, 20000000, '2 cror'), + ('2 crore', 0, 20000000, '2 crore'), + ('2 crores', 0, 20000000, '2 crores'), + ] + + for test, min_budget, max_budget, original_text in tests: + budget_dicts, original_texts = self.budget_detector.detect_entity(text=test) + self.assertEqual(budget_dicts, [self.make_budget_dict(max_budget=max_budget)]) + self.assertEqual(original_texts, [original_text]) diff --git a/ner_v2/api.py b/ner_v2/api.py index 81182a4d6..63657248d 100644 --- a/ner_v2/api.py +++ b/ner_v2/api.py @@ -12,9 +12,10 @@ from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.pattern.phone_number.phone_number_detection import PhoneDetector - +from django.views.decorators.csrf import csrf_exempt from django.http import HttpResponse import json +import six def get_parameters_dictionary(request): @@ -44,6 +45,36 @@ def get_parameters_dictionary(request): return parameters_dict +def parse_post_request(request): + # type: (django.http.HttpRequest) -> Dict[str, Any] + """ + Extract POST request body from HTTP request + + Args: + request (django.http.HttpRequest): HTTP response from url + + Returns: + dict: parameters from the request + """ + request_data = json.loads(request.body) + parameters_dict = { + PARAMETER_MESSAGE: request_data.get('message'), + PARAMETER_ENTITY_NAME: request_data.get('entity_name'), + PARAMETER_STRUCTURED_VALUE: request_data.get('structured_value'), + PARAMETER_FALLBACK_VALUE: request_data.get('fallback_value'), + PARAMETER_BOT_MESSAGE: request_data.get('bot_message'), + PARAMETER_TIMEZONE: request_data.get('timezone'), + PARAMETER_LANGUAGE_SCRIPT: request_data.get('language_script', ENGLISH_LANG), + PARAMETER_SOURCE_LANGUAGE: request_data.get('source_language', ENGLISH_LANG), + PARAMETER_MIN_DIGITS: request_data.get('min_number_digits'), + PARAMETER_MAX_DIGITS: request_data.get('max_number_digits'), + PARAMETER_NUMBER_UNIT_TYPE: request_data.get('unit_type') + } + + return parameters_dict + + +@csrf_exempt def date(request): """This functionality use DateAdvanceDetector to detect date. It is called through api call @@ -85,9 +116,15 @@ def date(request): 'entity_value': {'value': {'mm': 12, 'yy': 2018, 'dd': 5, 'type': 'date'}}}] """ try: - parameters_dict = get_parameters_dictionary(request) + parameters_dict = {} + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) date_past_reference = parameters_dict.get(PARAMETER_PAST_DATE_REFERENCED, "false") past_date_referenced = date_past_reference == 'true' or date_past_reference == 'True' date_detection = DateAdvancedDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], @@ -97,9 +134,16 @@ def date(request): date_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) - entity_output = date_detection.detect(message=parameters_dict[PARAMETER_MESSAGE], - structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], - fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE]) + message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None + + if isinstance(message, six.string_types): + entity_output = date_detection.detect(message=message, + structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], + fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE] + ) + elif isinstance(message, (list, tuple)): + entity_output = date_detection.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -109,6 +153,7 @@ def date(request): return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') +@csrf_exempt def time(request): """This functionality use TimeDetector to detect time. It is called through api call @@ -150,19 +195,32 @@ def time(request): 'entity_value': {'mm': 30, 'hh': 12, 'nn': 'pm'}}] """ try: - parameters_dict = get_parameters_dictionary(request) + parameters_dict = {} + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + timezone = parameters_dict[PARAMETER_TIMEZONE] or 'UTC' form_check = True if parameters_dict[PARAMETER_STRUCTURED_VALUE] else False - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) time_detection = TimeDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], timezone=timezone) time_detection.set_bot_message(bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) - entity_output = time_detection.detect(message=parameters_dict[PARAMETER_MESSAGE], - structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], - fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - form_check=form_check) + + message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None + + if isinstance(message, six.string_types): + entity_output = time_detection.detect(message=message, + structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], + fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], + form_check=form_check) + elif isinstance(message, (list, tuple)): + entity_output = time_detection.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: @@ -172,6 +230,7 @@ def time(request): return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') +@csrf_exempt def number(request): """Use NumberDetector to detect numerals @@ -231,8 +290,13 @@ def number(request): """ try: - parameters_dict = get_parameters_dictionary(request) - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + parameters_dict = {} + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) number_detection = NumberDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], @@ -243,12 +307,18 @@ def number(request): max_digit = int(parameters_dict[PARAMETER_MAX_DIGITS]) number_detection.set_min_max_digits(min_digit=min_digit, max_digit=max_digit) - entity_output = number_detection.detect(message=parameters_dict[PARAMETER_MESSAGE], - structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], - fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) - ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) + message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None + + if isinstance(message, six.string_types): + entity_output = number_detection.detect(message=message, + structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], + fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], + bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + elif isinstance(message, (list, tuple)): + entity_output = number_detection.detect_bulk(messages=message) + ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for numeric: %s ' % e) return HttpResponse(status=500) @@ -256,6 +326,7 @@ def number(request): return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') +@csrf_exempt def number_range(request): """Use NumberDetector to detect numerals @@ -294,17 +365,28 @@ def number_range(request): 'max_value': '300', 'unit': None}}] """ try: - parameters_dict = get_parameters_dictionary(request) - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + parameters_dict = {} + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) number_range_detector = NumberRangeDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME], language=parameters_dict[PARAMETER_SOURCE_LANGUAGE], unit_type=parameters_dict[PARAMETER_NUMBER_UNIT_TYPE]) - entity_output = number_range_detector.detect(message=parameters_dict[PARAMETER_MESSAGE], - structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], - fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + message = parameters_dict[PARAMETER_MESSAGE] + entity_output = None + + if isinstance(message, six.string_types): + entity_output = number_range_detector.detect(message=message, + structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], + fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], + bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + elif isinstance(message, (list, tuple)): + entity_output = number_range_detector.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) @@ -315,12 +397,13 @@ def number_range(request): return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json') +@csrf_exempt def phone_number(request): """Uses PhoneDetector to detect phone numbers request params: - message (str): natural text on which detection logic is to be run. Note if structured value is - detection is run on structured value instead of message + message (list or str): string for get request and list of text for bulk call through + post request on which detection logic is to be run entity_name (str): name of the entity. Also acts as elastic-search dictionary name if entity uses elastic-search lookup structured_value (str): Value obtained from any structured elements. Note if structured value is @@ -377,12 +460,61 @@ def phone_number(request): }, "language": "en" } - ] + ] + message = ["Call 02226129857' , 'message +1(408) 92-124' ,'send 100rs to 91 9820334416 9920441344'] + entity_name = 'phone_number' + source_language = 'en' + entity_output: + [ + [{ + "detection": "message", + "original_text": "02226129857", + "entity_value": { + "value": "02226129857" + }, + "language": "en" + } + + ], + [ + { + "detection": "message", + "original_text": "+1(408) 92-124", + "entity_value": { + "value": "140892124" + }, + "language": "en" + } + ], + [ + { + "detection": "message", + "original_text": "91 9820334416", + "entity_value": { + "value": "919820334416" + }, + "language": "en" + }, + { + "detection": "message", + "original_text": "9920441344", + "entity_value": { + "value": "9920441344" + }, + "language": "en" + } + + ] + ] """ try: - parameters_dict = get_parameters_dictionary(request) - ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + if request.method == "POST": + parameters_dict = parse_post_request(request) + ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) + elif request.method == "GET": + parameters_dict = get_parameters_dictionary(request) + ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME]) entity_name = parameters_dict[PARAMETER_ENTITY_NAME] language = parameters_dict[PARAMETER_SOURCE_LANGUAGE] @@ -390,11 +522,14 @@ def phone_number(request): ner_logger.debug('Source Language %s' % language) phone_number_detection = PhoneDetector(entity_name=entity_name, language=language) - - entity_output = phone_number_detection.detect(message=parameters_dict[PARAMETER_MESSAGE], - structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], - fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], - bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + message = parameters_dict[PARAMETER_MESSAGE] + if isinstance(message, six.string_types): + entity_output = phone_number_detection.detect(message=message, + structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE], + fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE], + bot_message=parameters_dict[PARAMETER_BOT_MESSAGE]) + elif isinstance(message, (list, tuple)): + entity_output = phone_number_detection.detect_bulk(messages=message) ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output)) except TypeError as e: ner_logger.exception('Exception for phone_number: %s ' % e) diff --git a/ner_v2/detectors/base_detector.py b/ner_v2/detectors/base_detector.py index bed153312..d9d0127c1 100644 --- a/ner_v2/detectors/base_detector.py +++ b/ner_v2/detectors/base_detector.py @@ -160,6 +160,52 @@ def detect(self, message=None, structured_value=None, fallback_value=None, **kwa return self.output_entity_dict_list(entity_value_list=value, original_text_list=original_text, detection_method=method, detection_language=self._processing_language) + def detect_bulk(self, messages=None, **kwargs): + """ + Use detector to detect entities from text. It also translates query to language compatible to detector + + Args: + messages (list of strings): list of natural text(s) on which detection logic is to be run. + Returns: + dict or None: dictionary containing entity_value, original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + + Example: + 1) Consider an example of restaurant detection from a message + + messages = ['i want to order chinese from mainland china and pizza from domminos'] + output = detect(message=message) + print output + >> [[{'detection': 'message', 'original_text': 'mainland china', 'entity_value': + {'value': u'Mainland China'}}, {'detection': 'message', 'original_text': 'domminos', + 'entity_value': {'value': u"Domino's Pizza"}}]] + """ + if messages is None: + messages = [] + if self._language != self._processing_language and self._translation_enabled: + translation_output_list = [ + translate_text(message_, self._language, self._processing_language) + for message_ in messages] + + messages = [] + for translation_output in translation_output_list: + messages.append(translation_output[TRANSLATED_TEXT] if translation_output['status'] else '') + + texts = messages + bulk_entities_list, bulk_original_texts_list = [], [] + + for text in texts: + entities_list, original_texts_list = self.detect_entity(text=text, **kwargs) + bulk_entities_list.append(entities_list) + bulk_original_texts_list.append(original_texts_list) + + values_list, method, original_texts_list = bulk_entities_list, FROM_MESSAGE, bulk_original_texts_list + + return self.output_entity_bulk(entity_values_list=values_list, original_texts_list=original_texts_list, + detection_method=method, + detection_language=self._processing_language) + + @staticmethod def output_entity_dict_list(entity_value_list, original_text_list, detection_method=None, detection_method_list=None, detection_language=ENGLISH_LANG): @@ -212,3 +258,75 @@ def output_entity_dict_list(entity_value_list, original_text_list, detection_met } ) return entity_list + + @staticmethod + def output_entity_bulk(entity_values_list, original_texts_list, detection_method=None, + detection_method_list=None, detection_language=ENGLISH_LANG): + """ + Format detected entity values for bulk detection + Args: + entity_values_list (list of lists): containing list of entity values which are identified from given + detection logic + original_texts_list (list of lists): containing list original values or actual values from + messages which are identified + detection_method (str, optional): how the entity was detected + i.e. whether from message, structured_value + or fallback, verified from model or not. + defaults to None + detection_method_list(list, optional): list containing how each entity was detected in the entity_value + list.If provided, this argument will be used over detection method + defaults to None + detection_language(str): ISO 639 code for language in which entity is detected + + Returns: + list of lists of dict: list of lists containing dictionaries, each containing entity_value, + original_text and detection; + entity_value is in itself a dict with its keys varying from entity to entity + Example Output: + [ + [ + { + "entity_value": entity_value_1, + "detection": detection_method, + "original_text": original_text_1 + }, + { + "entity_value": entity_value_2, + "detection": detection_method, + "original_text": original_text_2 + } + + ], + [ + { + "entity_value": entity_value, + "detection": detection_method, + "original_text": original_text + } + ] + ] + """ + if detection_method_list is None: + detection_method_list = [] + if entity_values_list is None: + entity_values_list = [] + + bulk_detection_entity_list = [] + for index, entity_values in enumerate(entity_values_list): + entity_list = [] + for i, entity_value in enumerate(entity_values): + if type(entity_value) in [str, six.text_type]: + entity_value = { + ENTITY_VALUE_DICT_KEY: entity_value + } + method = detection_method_list[i] if detection_method_list else detection_method + entity_list.append( + { + ENTITY_VALUE: entity_value, + DETECTION_METHOD: method, + ORIGINAL_TEXT: original_texts_list[index][i], + DETECTION_LANGUAGE: detection_language + } + ) + bulk_detection_entity_list.append(entity_list) + return bulk_detection_entity_list diff --git a/requirements.txt b/requirements.txt index 8e2a16dd9..6b3791001 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,6 @@ mock==2.0.0 django-nose==1.4.5 typing==3.6.2 flake8==3.4.1 -pyaml==19.4.1 \ No newline at end of file +pyaml==19.4.1 +coverage==4.5.3 +nose-exclude==0.5.0 \ No newline at end of file