Skip to content

Commit

Permalink
Merge pull request #3414 from ingef/fix/order-indepdent-of-source
Browse files Browse the repository at this point in the history
don't concat searches, instead merge them by priority. Pushing up goo…
  • Loading branch information
awildturtok authored Apr 30, 2024
2 parents 5e03fd2 + c5e5ece commit 515ff09
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Iterators;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import jakarta.inject.Inject;
import jakarta.validation.Validator;
import lombok.Getter;
Expand Down Expand Up @@ -266,37 +268,23 @@ private List<FrontendValue> autocompleteTextFilter(SelectFilter<?> searchable, S
final Namespace namespace = namespaces.get(searchable.getDataset().getId());

// Note that FEValues is equals/hashcode only on value:
// The different sources might contain duplicate FEValue#values which we want to avoid as
// they are already sorted in terms of information weight by getSearchesFor
// The different sources might contain duplicate FEValue#values which we exploit:
// If a value is already present, it's from a search with higher priority.

// Also note: currently we are still issuing large search requests, but much smaller allocations at once, and querying only when the past is not sufficient
return namespace.getFilterSearch()
.getSearchesFor(searchable)
.stream()
.map(search -> createSourceSearchResult(search, Collections.singletonList(text), OptionalInt.empty()))
.flatMap(Collection::stream)
.distinct()
.collect(Collectors.toList());
final List<TrieSearch<FrontendValue>> searches = namespace.getFilterSearch().getSearchesFor(searchable);

final Object2LongMap<FrontendValue> overlayedWeights = new Object2LongOpenHashMap<>();

}
for (TrieSearch<FrontendValue> search : searches) {

/**
* Do a search with the supplied values.
*/
private static List<FrontendValue> createSourceSearchResult(TrieSearch<FrontendValue> search, Collection<String> values, OptionalInt numberOfTopItems) {
if (search == null) {
return Collections.emptyList();
final Object2LongMap<FrontendValue> itemWeights = search.collectWeights(List.of(text));

itemWeights.forEach(overlayedWeights::putIfAbsent);
}

final List<FrontendValue> result = search.findItems(values, numberOfTopItems.orElse(Integer.MAX_VALUE));
return TrieSearch.topItems(Integer.MAX_VALUE, overlayedWeights);

if (numberOfTopItems.isEmpty() && result.size() == Integer.MAX_VALUE) {
//TODO This looks odd, do we really expect QuickSearch to allocate that huge of a list for us?
log.warn("The quick search returned the maximum number of results ({}) which probably means not all possible results are returned.", Integer.MAX_VALUE);
}

return result;
}

public ResolvedConceptsResult resolveConceptElements(TreeConcept concept, List<String> conceptCodes) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@

import com.google.common.base.Strings;
import com.google.common.collect.Iterators;
import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import jakarta.validation.constraints.Min;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;

/**
* Trie based keyword search for autocompletion and resolving.
Expand Down Expand Up @@ -84,7 +85,25 @@ public TrieSearch(int ngramLength, String split) {
}

public List<T> findItems(Collection<String> queries, int limit) {
	// Score every candidate item across all queries, then hand back the
	// `limit` best-scoring ones (higher weight = better match).
	return topItems(limit, collectWeights(queries));
}

@NotNull
public static <T extends Comparable<T>> List<T> topItems(int limit, Object2LongMap<T> itemWeights) {
	// Sort items according to their weight, then limit.
	// Note that sorting is in descending order, meaning higher-scores are better.
	// comparingLong(...).reversed() yields the same ordering as
	// comparing(..., reverseOrder()) but avoids boxing every weight into a Long.
	// The sort is stable, so equal-weight items keep itemWeights' entry-set iteration order.
	// NOTE(review): the Comparable bound on T is currently unused — presumably intended
	// as a deterministic tie-breaker (.thenComparing(Map.Entry::getKey)); confirm before adding.
	return itemWeights.object2LongEntrySet()
					  .stream()
					  .sorted(Comparator.<Object2LongMap.Entry<T>>comparingLong(Object2LongMap.Entry::getLongValue).reversed())
					  .limit(limit)
					  .map(Map.Entry::getKey)
					  .collect(Collectors.toList());
}

public Object2LongMap<T> collectWeights(Collection<String> queries) {
final Object2LongMap<T> itemWeights = new Object2LongOpenHashMap<>();

// We are not guaranteed to have split queries incoming, so we normalize them for searching
queries = queries.stream().flatMap(this::split).collect(Collectors.toSet());
Expand Down Expand Up @@ -118,14 +137,7 @@ public List<T> findItems(Collection<String> queries, int limit) {
updateWeights(query, ngramHits, itemWeights, false);
}

// Sort items according to their weight, then limit.
// Note that sorting is in descending order, meaning higher-scores are better.
return itemWeights.object2LongEntrySet()
.stream()
.sorted(Comparator.comparing(Object2LongMap.Entry::getLongValue, Comparator.reverseOrder()))
.limit(limit)
.map(Map.Entry::getKey)
.collect(Collectors.toList());
return itemWeights;
}

/**
Expand Down Expand Up @@ -163,7 +175,7 @@ private void updateWeights(String query, final Map<String, List<T>> items, Objec

// We combine hits additively to favor items with multiple hits
for (T item : hits) {
itemWeights.put(item, itemWeights.getOrDefault(item, 1) + weight);
itemWeights.put(item, itemWeights.getOrDefault(item, 0) + weight);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,6 @@
import java.util.OptionalInt;
import java.util.Set;

import jakarta.ws.rs.client.Entity;
import jakarta.ws.rs.client.Invocation;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;

import com.bakdata.conquery.apiv1.FilterTemplate;
import com.bakdata.conquery.apiv1.frontend.FrontendValue;
import com.bakdata.conquery.integration.IntegrationTest;
Expand All @@ -35,6 +30,10 @@
import com.bakdata.conquery.resources.hierarchies.HierarchyHelper;
import com.bakdata.conquery.util.support.StandaloneSupport;
import com.github.powerlibraries.io.In;
import jakarta.ws.rs.client.Entity;
import jakarta.ws.rs.client.Invocation;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import lombok.extern.slf4j.Slf4j;

@Slf4j
Expand All @@ -46,7 +45,9 @@ public class FilterAutocompleteTest extends IntegrationTest.Simple implements Pr
"aab,lbl-2,ov-2",
"aaa,lbl-3 & lbl-4,ov-4",
"baaa,lbl-5,ov-5",
"b,lbl-6,ov-6"
"b,lbl-6,ov-6",
"female,female-label,female-option",
"male,male-label,male-option"
};

@Override
Expand All @@ -56,51 +57,9 @@ public Set<StandaloneSupport.Mode> forModes() {

@Override
public void execute(StandaloneSupport conquery) throws Exception {
//read test specification
final String testJson =
In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json")
.withUTF8()
.readAll();

final DatasetId dataset = conquery.getDataset().getId();

final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);

ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));

test.importRequiredData(conquery);

conquery.waitUntilWorkDone();

final CSVConfig csvConf = conquery.getConfig().getCsv();
final SelectFilter<?> filter = setupSearch(conquery);

final Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
final Connector connector = concept.getConnectors().iterator().next();
final SelectFilter<?> filter = (SelectFilter<?>) connector.getFilters().iterator().next();

// Copy search csv from resources to tmp folder.
final Path tmpCSv = Files.createTempFile("conquery_search", "csv");

Files.write(
tmpCSv,
String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(),
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE
);

final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel");

filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService));

final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder()
, AdminDatasetResource.class, "postprocessNamespace")
.buildFromMap(Map.of(DATASET, conquery.getDataset().getId()));

conquery.getClient().target(matchingStatsUri)
.request(MediaType.APPLICATION_JSON_TYPE)
.post(null)
.close();

conquery.waitUntilWorkDone();
final Concept<?> concept = filter.getConnector().getConcept();

final URI autocompleteUri =
HierarchyHelper.hierarchicalPath(
Expand Down Expand Up @@ -128,11 +87,11 @@ public void execute(StandaloneSupport conquery) throws Exception {

final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class);

// "aaa" occurs after "aab" due to it consisting only of duplicate entries.
// The empty string results from `No V*a*lue` and `..Def*au*lt..`
// "aaa" occurs after "aab" due to it consisting only of duplicate entries.
// The empty string results from `No V*a*lue` and `..Def*au*lt..`

assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue))
.containsExactly("a", "aab", "aaa", "" /* `No V*a*lue` :^) */, "baaa");
.containsExactly("a", "aab", "aaa", "male", "" /* `No V*a*lue` :^) */, "female", "baaa");

}
}
Expand All @@ -151,7 +110,7 @@ public void execute(StandaloneSupport conquery) throws Exception {

//check the resolved values
assertThat(resolvedFromValues.values().stream().map(FrontendValue::getValue))
.containsExactly("", "f", "fm");
.containsExactly("f", "female", "fm", "");
}
}

Expand All @@ -166,10 +125,60 @@ public void execute(StandaloneSupport conquery) throws Exception {
), MediaType.APPLICATION_JSON_TYPE))) {

final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class);
// This is probably the insertion order
// This is probably the insertion order
assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue))
.containsExactlyInAnyOrder("", "a", "aab", "aaa", "baaa", "b", "f", "m", "mf", "fm");
.containsExactlyInAnyOrder("", "aaa", "a", "aab", "b", "baaa", "female", "male", "f", "fm", "m", "mf");
}
}
}

/**
 * Sets up the integration environment for the autocomplete tests:
 * imports the test dataset, picks the first SelectFilter of the first concept,
 * attaches a CSV-backed search template to it, and triggers (and awaits)
 * namespace postprocessing so the search index is built.
 *
 * @param conquery the running standalone test support instance
 * @return the filter whose template now points at the generated search CSV
 * @throws Exception if reading the spec, importing data, or writing the CSV fails
 */
private static SelectFilter<?> setupSearch(StandaloneSupport conquery) throws Exception {
	// read test specification
	final String testJson =
			In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json")
			  .withUTF8()
			  .readAll();

	final DatasetId dataset = conquery.getDataset().getId();

	final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);

	ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));

	test.importRequiredData(conquery);

	conquery.waitUntilWorkDone();

	final CSVConfig csvConf = conquery.getConfig().getCsv();

	// The test spec is expected to contain exactly the concept/connector/filter under test,
	// so grabbing the first of each is sufficient here.
	final Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
	final Connector connector = concept.getConnectors().iterator().next();
	final SelectFilter<?> filter = (SelectFilter<?>) connector.getFilters().iterator().next();

	// Copy search csv from resources to tmp folder.
	final Path tmpCSv = Files.createTempFile("conquery_search", "csv");

	// Pin UTF-8 explicitly: the bare getBytes() overload uses the platform default
	// charset, which would make the test environment-dependent.
	Files.write(
			tmpCSv,
			String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(java.nio.charset.StandardCharsets.UTF_8),
			StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE
	);

	final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel");

	// Template renders "{{label}}" / "{{option}}" from the CSV columns for each value.
	filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService));

	// Trigger namespace postprocessing so the search structures are (re)built for the new template.
	final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder()
			, AdminDatasetResource.class, "postprocessNamespace")
									.buildFromMap(Map.of(DATASET, conquery.getDataset().getId()));

	conquery.getClient().target(matchingStatsUri)
			.request(MediaType.APPLICATION_JSON_TYPE)
			.post(null)
			.close();

	conquery.waitUntilWorkDone();

	return filter;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import com.bakdata.conquery.apiv1.frontend.FrontendValue;
import com.bakdata.conquery.util.search.Cursor;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test;

Expand All @@ -27,4 +30,15 @@ public void testCursor() {

}

@Test
public void mapInsertion() {
	final Object2LongMap<FrontendValue> weights = new Object2LongOpenHashMap<>();

	// FrontendValue is equals/hashCode on `value` only, so these two entries
	// collide and the second put overwrites the first.
	weights.put(new FrontendValue("a", "label1"), 1);
	weights.put(new FrontendValue("a", "label2"), 1);

	// canary for changes of EqualsAndHashcode behaviour
	assertThat(weights).hasSize(1);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@ public void anaconda() {

assertThat(results)
.describedAs("Query for %s".formatted(query))
.isEqualTo(List.of("Anaconda", "Condar", "Anaxonds", "Condor", "Honda", "Analysis", "Ananas", "Canada", "London"));
.isEqualTo(List.of("Anaconda", // Full match
"Condar", // 3 trigrams
"Condor", // 2 trigrams
"Honda", // 2 trigrams
"Anaxonds", // 1 trigram
"London", // 1 trigram
"Ananas", // 1 trigram
"Analysis", // 1 trigram
"Canada" // 1 trigram
));

}

Expand Down

0 comments on commit 515ff09

Please sign in to comment.