Skip to content

Commit

Permalink
Merge pull request #473 from hellohaptik/develop
Browse files Browse the repository at this point in the history
Build support for Person Name for Newer Languages
  • Loading branch information
adisri2694 authored Mar 24, 2022
2 parents 5845f01 + 1635402 commit 82aecba
Show file tree
Hide file tree
Showing 6 changed files with 829 additions and 43 deletions.
8 changes: 6 additions & 2 deletions datastore/elastic_search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,9 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name,
]
}
"""

# TODO: Enable sorting on ES side after verifying mappings are correctly setup.
# Currently in datastore.elastic_search.create.create_crf_index we don't add keyword fields to
# `language_script` and `sentence`. We need to add integration tests for these too
data = {
"query": {
"bool": {
Expand Down Expand Up @@ -615,8 +617,10 @@ def get_crf_data_for_entity_name(connection, index_name, doc_type, entity_name,
# Parse hits
results = search_results['hits']['hits']

language_mapped_results = collections.defaultdict(list)
# TODO: Remove and switch to sorting on ES side once mappings are set correctly
results.sort(key=lambda _doc: (_doc['_source']['language_script'], _doc['_source']['sentence']))

language_mapped_results = collections.defaultdict(list)
for result in results:
language_mapped_results[result['_source']['language_script']].append(
{
Expand Down
32 changes: 22 additions & 10 deletions external_api/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import absolute_import
import json
import random

from django.http import HttpResponse
from datastore.datastore import DataStore
from datastore.exceptions import (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
Expand Down Expand Up @@ -238,10 +240,15 @@ def entity_data_view(request, entity_name):
"""
if request.method == 'GET':
params = request.GET.dict()
# Fetch Languages supported by the entity

shuffle = (params.get('shuffle', 'false') or '').lower() == 'true'
try:
seed = int(params.get('seed', random.randint(0, 1000000000)))
except ValueError:
raise APIHandlerException('seed should be sent as a number')

try:
pagination_size = int(params.get('size', 10))
size = int(params.get('size', 10))
except ValueError:
raise APIHandlerException('size should be sent as a number')

Expand All @@ -250,14 +257,19 @@ def entity_data_view(request, entity_name):
except ValueError:
raise APIHandlerException('from should be sent as a number')

return dictionary_utils.search_entity_values(
entity_name=entity_name,
value_search_term=params.get('value_search_term', None),
variant_search_term=params.get('variant_search_term', None),
empty_variants_only=params.get('empty_variants_only', False),
pagination_size=pagination_size,
pagination_from=pagination_from
)
try:
return dictionary_utils.search_entity_values(
entity_name=entity_name,
value_search_term=params.get('value_search_term', None),
variant_search_term=params.get('variant_search_term', None),
empty_variants_only=params.get('empty_variants_only', False),
shuffle=shuffle,
offset=pagination_from,
size=size,
seed=seed,
)
except ValueError as e:
raise APIHandlerException(str(e)) from e

elif request.method == 'POST':
# Update language support in the specified entity
Expand Down
69 changes: 50 additions & 19 deletions external_api/lib/dictionary_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import absolute_import

import random

"""
Note:
'word' and 'value' mean the same in this context and haven't been cleaned up because of dependency
Expand Down Expand Up @@ -51,6 +53,8 @@ def entity_update_languages(entity_name, new_language_list):
raise APIHandlerException('No new languages provided. Nothing changed.')

# fetch all words
# TODO: If possible add records in single ES query instead of
# two (get_entity_unique_values + db.add_entity_data)
values = get_entity_unique_values(entity_name=entity_name)
if not values:
raise APIHandlerException('This entity does not have any records. Please verify the entity name')
Expand Down Expand Up @@ -146,13 +150,23 @@ def delete_records_by_values(entity_name, values):
)


def _shuffle_sample_values(values, shuffle, seed, size, offset=0):
if shuffle:
random.Random(seed).shuffle(values)
if size > 0 and offset >= 0:
values = values[offset:offset + size]
return values


def search_entity_values(
entity_name,
value_search_term=None,
variant_search_term=None,
empty_variants_only=False,
pagination_size=None,
pagination_from=None
shuffle=False,
size=None,
offset=0,
seed=None,
):
"""
Searches for values within the specific entity. If pagination details not specified, all
Expand All @@ -166,35 +180,50 @@ def search_entity_values(
If not provided, results are not filtered by variants
empty_variants_only (bool, optional): Flag to search for values with empty variants only
If not provided, all variants are included
pagination_size (int, optional): No. of records to fetch data when paginating
shuffle (bool, optional): whether to shuffle the records randomly. Defaults to False
size (int, optional): No. of records to fetch data when paginating
If it is None, the results will not be paginated
pagination_from (int, optional): Offset to skip initial data (useful for pagination queries)
offset (int, optional): Offset to skip initial data (useful for pagination queries)
If it is None, the results will not be paginated
seed (int or None, optional): seed to initialize the random instance for shuffling. Defaults to None
Returns:
dict: total records (for pagination) and a list of individual records which match by the search filters
"""
values = None
total_records = None
if value_search_term or variant_search_term or empty_variants_only or pagination_size or pagination_from:
# TODO: If possible search, sample, paginate in single ES query instead of
# two (get_entity_unique_values + get_records_from_values)
# TODO: This is not the most optimal way to paginate/sample for a large number of values! Current approach fetches
# everything and then applies slicing in memory. It also relies on `get_entity_unique_values` always
# maintaining a fixed ordering of results
# Instead for ES, we can implement collapse by value -> sort -> provide from and size.
# https://www.elastic.co/guide/en/elasticsearch/reference/5.6/search-request-collapse.html
# Also read: https://www.elastic.co/guide/en/elasticsearch/reference/current/paginate-search-results.html
# https://www.elastic.co/guide/en/elasticsearch/reference/5.6/query-dsl-function-score-query.html#score-functions
is_search_query = value_search_term or variant_search_term or empty_variants_only
if is_search_query and shuffle:
raise ValueError('`shuffle=True` is not supported with following args: '
'[value_search_term, variant_search_term, empty_variants_only]')
has_pagination_args_without_shuffling = (not shuffle) and (offset or size)
if is_search_query or has_pagination_args_without_shuffling:
# Here we first figure out which values we need to return, sample them if needed (shuffling is not supported),
# and then fetch records for these filtered values
values = get_entity_unique_values(
entity_name=entity_name,
value_search_term=value_search_term,
variant_search_term=variant_search_term,
empty_variants_only=empty_variants_only,
)
total_records = len(values)
if pagination_size > 0 and pagination_from >= 0:
values = values[pagination_from:pagination_from + pagination_size]

records_dict = get_records_from_values(entity_name, values)
records_list = []
for value, variant_data in records_dict.items():
records_list.append({
"word": value,
"variants": variant_data,
})

if total_records is None:
values = _shuffle_sample_values(values=values, shuffle=False, seed=seed, size=size, offset=offset)
records_dict = get_records_from_values(entity_name, values=values)
else:
# Here we do the inverse - fetch all records first, shuffle and sample them if needed, then discard the rest
records_dict = get_records_from_values(entity_name, values=None)
values = sorted(records_dict.keys())
total_records = len(values)
values = _shuffle_sample_values(values=values, shuffle=shuffle, seed=seed, size=size, offset=offset)

records_list = [{"word": value, "variants": records_dict.get(value, {})} for value in values]
if not total_records:
total_records = len(records_list)

return {
Expand All @@ -221,6 +250,8 @@ def update_entity_records(entity_name, data):
replace_data = data.get('replace')

if replace_data:
# TODO: Delete everything for the `entity_name` without having to fetch values first!
# https://www.elastic.co/guide/en/elasticsearch/reference/5.6/docs-delete-by-query.html
values_to_delete = get_entity_unique_values(entity_name)
else:
values_to_delete = [record['word'] for record in records_to_delete]
Expand Down
7 changes: 7 additions & 0 deletions language_utilities/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,12 @@
SWAHILI_LANG = 'sw'
ARABIC_LANG = 'ar'

# ISO 639-1 codes for the newly supported languages.
# Fixed PEP 8 spacing on POLISH_LANG to match the sibling constants.
POLISH_LANG = 'pl'
TAGALOG_LANG = 'tl'
SWEDISH_LANG = 'sv'
FINNISH_LANG = 'fi'
PORTUGUESE_LANG = 'pt'
TURKISH_LANG = 'tr'

# language translation status
TRANSLATED_TEXT = 'translated_text'
Loading

0 comments on commit 82aecba

Please sign in to comment.