From 62f66b02625e79bccc4abb14a5cd9834d34f4d9e Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Mon, 29 Apr 2024 18:37:52 +0200 Subject: [PATCH 1/4] don't concat searches, instead merge them by priority. Pushing up good hits, from lower valued sources (i.e. columns) --- .../resources/api/ConceptsProcessor.java | 34 ++--- .../conquery/util/search/TrieSearch.java | 28 ++-- .../tests/FilterAutocompleteTest.java | 121 ++++++++++-------- 3 files changed, 96 insertions(+), 87 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java index e6dd64867e..d14342be9e 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java @@ -45,6 +45,8 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.Iterators; +import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap; +import it.unimi.dsi.fastutil.objects.Object2LongMap; import jakarta.inject.Inject; import jakarta.validation.Validator; import lombok.Getter; @@ -266,37 +268,23 @@ private List autocompleteTextFilter(SelectFilter searchable, S final Namespace namespace = namespaces.get(searchable.getDataset().getId()); // Note that FEValues is equals/hashcode only on value: - // The different sources might contain duplicate FEValue#values which we want to avoid as - // they are already sorted in terms of information weight by getSearchesFor + // The different sources might contain duplicate FEValue#values which we exploit: + // If a value is already present, it's from a search with higher priority. - // Also note: currently we are still issuing large search requests, but much smaller allocations at once, and querying only when the past is not sufficient - return namespace.getFilterSearch() - .getSearchesFor(searchable) - .stream() - .map(search -> createSourceSearchResult(search, Collections.singletonList(text), OptionalInt.empty())) - .flatMap(Collection::stream) - .distinct() - .collect(Collectors.toList()); + final List> searches = namespace.getFilterSearch().getSearchesFor(searchable); + final Object2LongMap overlayedWeights = new Object2LongAVLTreeMap<>(); - } + for (TrieSearch search : searches) { - /** - * Do a search with the supplied values. - */ - private static List createSourceSearchResult(TrieSearch search, Collection values, OptionalInt numberOfTopItems) { - if (search == null) { - return Collections.emptyList(); + final Object2LongMap itemWeights = search.collectWeights(List.of(text)); + + itemWeights.forEach(overlayedWeights::putIfAbsent); } - final List result = search.findItems(values, numberOfTopItems.orElse(Integer.MAX_VALUE)); + return TrieSearch.topItems(Integer.MAX_VALUE, overlayedWeights); - if (numberOfTopItems.isEmpty() && result.size() == Integer.MAX_VALUE) { - //TODO This looks odd, do we really expect QuickSearch to allocate that huge of a list for us? - log.warn("The quick search returned the maximum number of results ({}) which probably means not all possible results are returned.", Integer.MAX_VALUE); - } - return result; } public ResolvedConceptsResult resolveConceptElements(TreeConcept concept, List conceptCodes) { diff --git a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java index fced6ccfde..e5f6cc8fe9 100644 --- a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java +++ b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java @@ -27,6 +27,7 @@ import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections4.trie.PatriciaTrie; import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.NotNull; /** * Trie based keyword search for autocompletion and resolving. @@ -84,6 +85,24 @@ public TrieSearch(int ngramLength, String split) { } public List findItems(Collection queries, int limit) { + final Object2LongMap itemWeights = collectWeights(queries); + + return topItems(limit, itemWeights); + } + + @NotNull + public static > List topItems(int limit, Object2LongMap itemWeights) { + // Sort items according to their weight, then limit. + // Note that sorting is in descending order, meaning higher-scores are better. + return itemWeights.object2LongEntrySet() + .stream() + .sorted(Comparator.comparing(Object2LongMap.Entry::getLongValue, Comparator.reverseOrder())) + .limit(limit) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); + } + + public Object2LongMap collectWeights(Collection queries) { final Object2LongMap itemWeights = new Object2LongAVLTreeMap<>(); // We are not guaranteed to have split queries incoming, so we normalize them for searching @@ -118,14 +137,7 @@ public List findItems(Collection queries, int limit) { updateWeights(query, ngramHits, itemWeights, false); } - // Sort items according to their weight, then limit. - // Note that sorting is in descending order, meaning higher-scores are better. - return itemWeights.object2LongEntrySet() - .stream() - .sorted(Comparator.comparing(Object2LongMap.Entry::getLongValue, Comparator.reverseOrder())) - .limit(limit) - .map(Map.Entry::getKey) - .collect(Collectors.toList()); + return itemWeights; } /** diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java index 4f3222f920..b319591b5b 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java @@ -12,11 +12,6 @@ import java.util.OptionalInt; import java.util.Set; -import jakarta.ws.rs.client.Entity; -import jakarta.ws.rs.client.Invocation; -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; - import com.bakdata.conquery.apiv1.FilterTemplate; import com.bakdata.conquery.apiv1.frontend.FrontendValue; import com.bakdata.conquery.integration.IntegrationTest; @@ -35,6 +30,10 @@ import com.bakdata.conquery.resources.hierarchies.HierarchyHelper; import com.bakdata.conquery.util.support.StandaloneSupport; import com.github.powerlibraries.io.In; +import jakarta.ws.rs.client.Entity; +import jakarta.ws.rs.client.Invocation; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -46,7 +45,9 @@ public class FilterAutocompleteTest extends IntegrationTest.Simple implements Pr "aab,lbl-2,ov-2", "aaa,lbl-3 & lbl-4,ov-4", "baaa,lbl-5,ov-5", - "b,lbl-6,ov-6" + "b,lbl-6,ov-6", + "female,female-label,female-option", + "male,male-label,male-option" }; @Override @@ -56,51 +57,9 @@ public Set forModes() { @Override public void execute(StandaloneSupport conquery) throws Exception { - //read test specification - final String testJson = - In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json") - .withUTF8() - .readAll(); - - final DatasetId dataset = conquery.getDataset().getId(); - - final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson); - - ValidatorHelper.failOnError(log, conquery.getValidator().validate(test)); - - test.importRequiredData(conquery); - - conquery.waitUntilWorkDone(); - - final CSVConfig csvConf = conquery.getConfig().getCsv(); + final SelectFilter filter = setupSearch(conquery); - final Concept concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next(); - final Connector connector = concept.getConnectors().iterator().next(); - final SelectFilter filter = (SelectFilter) connector.getFilters().iterator().next(); - - // Copy search csv from resources to tmp folder. - final Path tmpCSv = Files.createTempFile("conquery_search", "csv"); - - Files.write( - tmpCSv, - String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(), - StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE - ); - - final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel"); - - filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService)); - - final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder() - , AdminDatasetResource.class, "postprocessNamespace") - .buildFromMap(Map.of(DATASET, conquery.getDataset().getId())); - - conquery.getClient().target(matchingStatsUri) - .request(MediaType.APPLICATION_JSON_TYPE) - .post(null) - .close(); - - conquery.waitUntilWorkDone(); + final Concept concept = filter.getConnector().getConcept(); final URI autocompleteUri = HierarchyHelper.hierarchicalPath( @@ -128,11 +87,11 @@ public void execute(StandaloneSupport conquery) throws Exception { final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class); - // "aaa" occurs after "aab" due to it consisting only of duplicate entries. - // The empty string results from `No V*a*lue` and `..Def*au*lt..` + // "aaa" occurs after "aab" due to it consisting only of duplicate entries. + // The empty string results from `No V*a*lue` and `..Def*au*lt..` assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue)) - .containsExactly("a", "aab", "aaa", "" /* `No V*a*lue` :^) */, "baaa"); + .containsExactly("a", "aab", "aaa", "" /* `No V*a*lue` :^) */, "female", "male", "baaa"); } } @@ -151,7 +110,7 @@ public void execute(StandaloneSupport conquery) throws Exception { //check the resolved values assertThat(resolvedFromValues.values().stream().map(FrontendValue::getValue)) - .containsExactly("", "f", "fm"); + .containsExactly("f", "female", "fm", ""); } } @@ -166,10 +125,60 @@ public void execute(StandaloneSupport conquery) throws Exception { ), MediaType.APPLICATION_JSON_TYPE))) { final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class); - // This is probably the insertion order + // This is probably the insertion order assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue)) - .containsExactlyInAnyOrder("", "a", "aab", "aaa", "baaa", "b", "f", "m", "mf", "fm"); + .containsExactlyInAnyOrder("", "aaa", "a", "aab", "b", "baaa", "female", "male", "f", "fm", "m", "mf"); } } } + + private static SelectFilter setupSearch(StandaloneSupport conquery) throws Exception { + //read test specification + final String testJson = + In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json") + .withUTF8() + .readAll(); + + final DatasetId dataset = conquery.getDataset().getId(); + + final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson); + + ValidatorHelper.failOnError(log, conquery.getValidator().validate(test)); + + test.importRequiredData(conquery); + + conquery.waitUntilWorkDone(); + + final CSVConfig csvConf = conquery.getConfig().getCsv(); + + final Concept concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next(); + final Connector connector = concept.getConnectors().iterator().next(); + final SelectFilter filter = (SelectFilter) connector.getFilters().iterator().next(); + + // Copy search csv from resources to tmp folder. + final Path tmpCSv = Files.createTempFile("conquery_search", "csv"); + + Files.write( + tmpCSv, + String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(), + StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE + ); + + final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel"); + + filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService)); + + final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder() + , AdminDatasetResource.class, "postprocessNamespace") + .buildFromMap(Map.of(DATASET, conquery.getDataset().getId())); + + conquery.getClient().target(matchingStatsUri) + .request(MediaType.APPLICATION_JSON_TYPE) + .post(null) + .close(); + + conquery.waitUntilWorkDone(); + + return filter; + } } From 795505a9a05fe3777e572184a03df0cc3027f4c6 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:41:47 +0200 Subject: [PATCH 2/4] don't use AVLTreeMap: It's usage of Comparable creates dupes. --- .../conquery/resources/api/ConceptsProcessor.java | 4 ++-- .../bakdata/conquery/util/search/TrieSearch.java | 4 ++-- .../resources/api/ConceptsProcessorTest.java | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java index d14342be9e..5570fe4a03 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ConceptsProcessor.java @@ -45,8 +45,8 @@ import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.Iterators; -import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap; import it.unimi.dsi.fastutil.objects.Object2LongMap; +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import jakarta.inject.Inject; import jakarta.validation.Validator; import lombok.Getter; @@ -273,7 +273,7 @@ private List autocompleteTextFilter(SelectFilter searchable, S final List> searches = namespace.getFilterSearch().getSearchesFor(searchable); - final Object2LongMap overlayedWeights = new Object2LongAVLTreeMap<>(); + final Object2LongMap overlayedWeights = new Object2LongOpenHashMap<>(); for (TrieSearch search : searches) { diff --git a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java index e5f6cc8fe9..e6ab695909 100644 --- a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java +++ b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java @@ -20,8 +20,8 @@ import com.google.common.base.Strings; import com.google.common.collect.Iterators; -import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap; import it.unimi.dsi.fastutil.objects.Object2LongMap; +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import jakarta.validation.constraints.Min; import lombok.ToString; import lombok.extern.slf4j.Slf4j; @@ -103,7 +103,7 @@ public static > List topItems(int limit, Object2LongM } public Object2LongMap collectWeights(Collection queries) { - final Object2LongMap itemWeights = new Object2LongAVLTreeMap<>(); + final Object2LongMap itemWeights = new Object2LongOpenHashMap<>(); // We are not guaranteed to have split queries incoming, so we normalize them for searching queries = queries.stream().flatMap(this::split).collect(Collectors.toSet()); diff --git a/backend/src/test/java/com/bakdata/conquery/resources/api/ConceptsProcessorTest.java b/backend/src/test/java/com/bakdata/conquery/resources/api/ConceptsProcessorTest.java index 93abc3d8b6..7877fe13b3 100644 --- a/backend/src/test/java/com/bakdata/conquery/resources/api/ConceptsProcessorTest.java +++ b/backend/src/test/java/com/bakdata/conquery/resources/api/ConceptsProcessorTest.java @@ -5,7 +5,10 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import com.bakdata.conquery.apiv1.frontend.FrontendValue; import com.bakdata.conquery.util.search.Cursor; +import it.unimi.dsi.fastutil.objects.Object2LongMap; +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import lombok.extern.slf4j.Slf4j; import org.junit.jupiter.api.Test; @@ -27,4 +30,15 @@ public void testCursor() { } + @Test + public void mapInsertion() { + Object2LongMap map = new Object2LongOpenHashMap<>(); + + map.put(new FrontendValue("a", "label1"), 1); + map.put(new FrontendValue("a", "label2"), 1); + + // canary for changes of EqualsAndHashcode behaviour + assertThat(map).hasSize(1); + } + } \ No newline at end of file From c9581e453003c4b2ed793cf43c2d7007e944d522 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:10:20 +0200 Subject: [PATCH 3/4] fixes ordering of values now not being alphabetic with identical weights --- .../com/bakdata/conquery/util/search/TrieSearch.java | 2 +- .../bakdata/conquery/util/search/QuickSearchTest.java | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java index e6ab695909..4e5fc6c077 100644 --- a/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java +++ b/backend/src/main/java/com/bakdata/conquery/util/search/TrieSearch.java @@ -175,7 +175,7 @@ private void updateWeights(String query, final Map> items, Objec // We combine hits additively to favor items with multiple hits for (T item : hits) { - itemWeights.put(item, itemWeights.getOrDefault(item, 1) + weight); + itemWeights.put(item, itemWeights.getOrDefault(item, 0) + weight); } } } diff --git a/backend/src/test/java/com/bakdata/conquery/util/search/QuickSearchTest.java b/backend/src/test/java/com/bakdata/conquery/util/search/QuickSearchTest.java index 5fabc36c61..6a289971c0 100644 --- a/backend/src/test/java/com/bakdata/conquery/util/search/QuickSearchTest.java +++ b/backend/src/test/java/com/bakdata/conquery/util/search/QuickSearchTest.java @@ -24,7 +24,16 @@ public void anaconda() { assertThat(results) .describedAs("Query for %s".formatted(query)) - .isEqualTo(List.of("Anaconda", "Condar", "Anaxonds", "Condor", "Honda", "Analysis", "Ananas", "Canada", "London")); + .isEqualTo(List.of("Anaconda", // Full match + "Condar", // 3 trigrams + "Condor", // 2 trigrams + "Honda", // 2 trigrams + "Anaxonds", // 1 trigram + "London", // 1 trigram + "Ananas", // 1 trigram + "Analysis", // 1 trigram + "Canada" // 1 trigram + )); } From c5e5ecea94fe4cf85f6e1c376f5d2dcaefbc9f28 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:01:42 +0200 Subject: [PATCH 4/4] fixes order of prior alphabetically order entries --- .../conquery/integration/tests/FilterAutocompleteTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java index b319591b5b..e05a0b4acd 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/FilterAutocompleteTest.java @@ -91,7 +91,7 @@ public void execute(StandaloneSupport conquery) throws Exception { // The empty string results from `No V*a*lue` and `..Def*au*lt..` assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue)) - .containsExactly("a", "aab", "aaa", "" /* `No V*a*lue` :^) */, "female", "male", "baaa"); + .containsExactly("a", "aab", "aaa", "male", "" /* `No V*a*lue` :^) */, "female", "baaa"); } }