don't concat searches, instead merge them by priority. Pushing up goo… #3414

Merged
merged 4 commits on Apr 30, 2024
@@ -45,6 +45,8 @@
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Iterators;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import jakarta.inject.Inject;
import jakarta.validation.Validator;
import lombok.Getter;
@@ -266,37 +268,23 @@ private List<FrontendValue> autocompleteTextFilter(SelectFilter<?> searchable, S
final Namespace namespace = namespaces.get(searchable.getDataset().getId());

// Note that FEValues is equals/hashcode only on value:
// The different sources might contain duplicate FEValue#values which we want to avoid as
// they are already sorted in terms of information weight by getSearchesFor
// The different sources might contain duplicate FEValue#values which we exploit:
// If a value is already present, it's from a search with higher priority.

// Also note: currently we are still issuing large search requests, but much smaller allocations at once, and querying only when the past is not sufficient
return namespace.getFilterSearch()
.getSearchesFor(searchable)
.stream()
.map(search -> createSourceSearchResult(search, Collections.singletonList(text), OptionalInt.empty()))
.flatMap(Collection::stream)
.distinct()
.collect(Collectors.toList());
final List<TrieSearch<FrontendValue>> searches = namespace.getFilterSearch().getSearchesFor(searchable);

final Object2LongMap<FrontendValue> overlayedWeights = new Object2LongOpenHashMap<>();

}
for (TrieSearch<FrontendValue> search : searches) {

/**
* Do a search with the supplied values.
*/
private static List<FrontendValue> createSourceSearchResult(TrieSearch<FrontendValue> search, Collection<String> values, OptionalInt numberOfTopItems) {
if (search == null) {
return Collections.emptyList();
final Object2LongMap<FrontendValue> itemWeights = search.collectWeights(List.of(text));

itemWeights.forEach(overlayedWeights::putIfAbsent);
}

final List<FrontendValue> result = search.findItems(values, numberOfTopItems.orElse(Integer.MAX_VALUE));
return TrieSearch.topItems(Integer.MAX_VALUE, overlayedWeights);

if (numberOfTopItems.isEmpty() && result.size() == Integer.MAX_VALUE) {
//TODO This looks odd, do we really expect QuickSearch to allocate that huge of a list for us?
log.warn("The quick search returned the maximum number of results ({}) which probably means not all possible results are returned.", Integer.MAX_VALUE);
}

return result;
}
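
The hunk above replaces the old concatenate-and-distinct pipeline: getSearchesFor returns the searches in priority order, collectWeights gathers each search's weights, and putIfAbsent keeps the weight from the first (highest-priority) search that produced a value. A minimal sketch of that overlay, using plain JDK maps and made-up weights rather than Conquery's actual types:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

class PriorityMergeSketch {
	public static void main(String[] args) {
		// Hypothetical weight maps, ordered from highest- to lowest-priority search.
		List<Map<String, Long>> searchResults = List.of(
				Map.of("female", 10L),
				Map.of("female", 3L, "male", 2L)
		);

		Map<String, Long> overlayedWeights = new LinkedHashMap<>();

		for (Map<String, Long> itemWeights : searchResults) {
			// putIfAbsent: a value seen earlier came from a higher-priority search and keeps its weight.
			itemWeights.forEach(overlayedWeights::putIfAbsent);
		}

		System.out.println(overlayedWeights); // {female=10, male=2}
	}
}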

public ResolvedConceptsResult resolveConceptElements(TreeConcept concept, List<String> conceptCodes) {
@@ -20,13 +20,14 @@

import com.google.common.base.Strings;
import com.google.common.collect.Iterators;
import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import jakarta.validation.constraints.Min;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;

/**
* Trie based keyword search for autocompletion and resolving.
@@ -84,7 +85,25 @@ public TrieSearch(int ngramLength, String split) {
}

public List<T> findItems(Collection<String> queries, int limit) {
final Object2LongMap<T> itemWeights = new Object2LongAVLTreeMap<>();
final Object2LongMap<T> itemWeights = collectWeights(queries);

return topItems(limit, itemWeights);
}

@NotNull
public static <T extends Comparable<T>> List<T> topItems(int limit, Object2LongMap<T> itemWeights) {
// Sort items according to their weight, then limit.
// Note that sorting is in descending order, meaning higher-scores are better.
return itemWeights.object2LongEntrySet()
.stream()
.sorted(Comparator.comparing(Object2LongMap.Entry::getLongValue, Comparator.reverseOrder()))
.limit(limit)
.map(Map.Entry::getKey)
.collect(Collectors.toList());
}

public Object2LongMap<T> collectWeights(Collection<String> queries) {
final Object2LongMap<T> itemWeights = new Object2LongOpenHashMap<>();

// We are not guaranteed to have split queries incoming, so we normalize them for searching
queries = queries.stream().flatMap(this::split).collect(Collectors.toSet());
@@ -118,14 +137,7 @@ public List<T> findItems(Collection<String> queries, int limit) {
updateWeights(query, ngramHits, itemWeights, false);
}

// Sort items according to their weight, then limit.
// Note that sorting is in descending order, meaning higher-scores are better.
return itemWeights.object2LongEntrySet()
.stream()
.sorted(Comparator.comparing(Object2LongMap.Entry::getLongValue, Comparator.reverseOrder()))
.limit(limit)
.map(Map.Entry::getKey)
.collect(Collectors.toList());
return itemWeights;
}
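
findItems is now split into collectWeights and topItems, so existing callers keep the combined behaviour while ConceptsProcessor can merge several weight maps before ranking. A small sketch of the static ranking step; the String items and weights are invented, and the TrieSearch import path is assumed from the util.search imports in the tests further down:

import java.util.List;

import com.bakdata.conquery.util.search.TrieSearch; // assumed package
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;

class TopItemsSketch {
	public static void main(String[] args) {
		// Weights as collectWeights would return them, built by hand here.
		Object2LongMap<String> itemWeights = new Object2LongOpenHashMap<>();
		itemWeights.put("full match", 10L);
		itemWeights.put("partial match", 3L);
		itemWeights.put("weak match", 1L);

		// Descending sort by weight, then limit.
		List<String> top = TrieSearch.topItems(2, itemWeights);
		System.out.println(top); // [full match, partial match]
	}
}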

/**
@@ -163,7 +175,7 @@ private void updateWeights(String query, final Map<String, List<T>> items, Objec

// We combine hits additively to favor items with multiple hits
for (T item : hits) {
itemWeights.put(item, itemWeights.getOrDefault(item, 1) + weight);
itemWeights.put(item, itemWeights.getOrDefault(item, 0) + weight);
}
}
}
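
The default in updateWeights changes from 1 to 0, so an item's score is now exactly the sum of its hit weights rather than that sum plus one. A tiny sketch of the additive accumulation, with made-up weights:

import java.util.HashMap;
import java.util.Map;

class AdditiveWeightSketch {
	public static void main(String[] args) {
		Map<String, Long> itemWeights = new HashMap<>();
		long[] hitWeights = {3L, 2L};

		for (long weight : hitWeights) {
			// Same effect as getOrDefault(item, 0) + weight: start from 0 and add each hit.
			itemWeights.merge("item", weight, Long::sum);
		}

		System.out.println(itemWeights.get("item")); // 5; the old default of 1 would have yielded 6
	}
}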
@@ -12,11 +12,6 @@
import java.util.OptionalInt;
import java.util.Set;

import jakarta.ws.rs.client.Entity;
import jakarta.ws.rs.client.Invocation;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;

import com.bakdata.conquery.apiv1.FilterTemplate;
import com.bakdata.conquery.apiv1.frontend.FrontendValue;
import com.bakdata.conquery.integration.IntegrationTest;
@@ -35,6 +30,10 @@
import com.bakdata.conquery.resources.hierarchies.HierarchyHelper;
import com.bakdata.conquery.util.support.StandaloneSupport;
import com.github.powerlibraries.io.In;
import jakarta.ws.rs.client.Entity;
import jakarta.ws.rs.client.Invocation;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@@ -46,7 +45,9 @@ public class FilterAutocompleteTest extends IntegrationTest.Simple implements Pr
"aab,lbl-2,ov-2",
"aaa,lbl-3 & lbl-4,ov-4",
"baaa,lbl-5,ov-5",
"b,lbl-6,ov-6"
"b,lbl-6,ov-6",
"female,female-label,female-option",
"male,male-label,male-option"
};

@Override
@@ -56,51 +57,9 @@ public Set<StandaloneSupport.Mode> forModes() {

@Override
public void execute(StandaloneSupport conquery) throws Exception {
//read test specification
final String testJson =
In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json")
.withUTF8()
.readAll();

final DatasetId dataset = conquery.getDataset().getId();

final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);

ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));

test.importRequiredData(conquery);

conquery.waitUntilWorkDone();

final CSVConfig csvConf = conquery.getConfig().getCsv();
final SelectFilter<?> filter = setupSearch(conquery);

final Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
final Connector connector = concept.getConnectors().iterator().next();
final SelectFilter<?> filter = (SelectFilter<?>) connector.getFilters().iterator().next();

// Copy search csv from resources to tmp folder.
final Path tmpCSv = Files.createTempFile("conquery_search", "csv");

Files.write(
tmpCSv,
String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(),
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE
);

final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel");

filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService));

final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder()
, AdminDatasetResource.class, "postprocessNamespace")
.buildFromMap(Map.of(DATASET, conquery.getDataset().getId()));

conquery.getClient().target(matchingStatsUri)
.request(MediaType.APPLICATION_JSON_TYPE)
.post(null)
.close();

conquery.waitUntilWorkDone();
final Concept<?> concept = filter.getConnector().getConcept();

final URI autocompleteUri =
HierarchyHelper.hierarchicalPath(
@@ -128,11 +87,11 @@ public void execute(StandaloneSupport conquery) throws Exception {

final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class);

// "aaa" occurs after "aab" due to it consisting only of duplicate entries.
// The empty string results from `No V*a*lue` and `..Def*au*lt..`
// "aaa" occurs after "aab" due to it consisting only of duplicate entries.
// The empty string results from `No V*a*lue` and `..Def*au*lt..`

assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue))
.containsExactly("a", "aab", "aaa", "" /* `No V*a*lue` :^) */, "baaa");
.containsExactly("a", "aab", "aaa", "male", "" /* `No V*a*lue` :^) */, "female", "baaa");

}
}
@@ -151,7 +110,7 @@ public void execute(StandaloneSupport conquery) throws Exception {

//check the resolved values
assertThat(resolvedFromValues.values().stream().map(FrontendValue::getValue))
.containsExactly("", "f", "fm");
.containsExactly("f", "female", "fm", "");
}
}

@@ -166,10 +125,60 @@ public void execute(StandaloneSupport conquery) throws Exception {
), MediaType.APPLICATION_JSON_TYPE))) {

final ConceptsProcessor.AutoCompleteResult resolvedFromCsv = fromCsvResponse.readEntity(ConceptsProcessor.AutoCompleteResult.class);
// This is probably the insertion order
assertThat(resolvedFromCsv.values().stream().map(FrontendValue::getValue))
.containsExactlyInAnyOrder("", "a", "aab", "aaa", "baaa", "b", "f", "m", "mf", "fm");
.containsExactlyInAnyOrder("", "aaa", "a", "aab", "b", "baaa", "female", "male", "f", "fm", "m", "mf");
}
}
}

private static SelectFilter<?> setupSearch(StandaloneSupport conquery) throws Exception {
//read test specification
final String testJson =
In.resource("/tests/query/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY/MULTI_SELECT_DATE_RESTRICTION_OR_CONCEPT_QUERY.test.json")
.withUTF8()
.readAll();

final DatasetId dataset = conquery.getDataset().getId();

final ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson);

ValidatorHelper.failOnError(log, conquery.getValidator().validate(test));

test.importRequiredData(conquery);

conquery.waitUntilWorkDone();

final CSVConfig csvConf = conquery.getConfig().getCsv();

final Concept<?> concept = conquery.getNamespace().getStorage().getAllConcepts().iterator().next();
final Connector connector = concept.getConnectors().iterator().next();
final SelectFilter<?> filter = (SelectFilter<?>) connector.getFilters().iterator().next();

// Copy search csv from resources to tmp folder.
final Path tmpCSv = Files.createTempFile("conquery_search", "csv");

Files.write(
tmpCSv,
String.join(csvConf.getLineSeparator(), RAW_LINES).getBytes(),
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE, StandardOpenOption.WRITE
);

final IndexService indexService = new IndexService(conquery.getConfig().getCsv().createCsvParserSettings(), "emptyDefaultLabel");

filter.setTemplate(new FilterTemplate(conquery.getDataset(), "test", tmpCSv.toUri(), "id", "{{label}}", "Hello this is {{option}}", 2, true, indexService));

final URI matchingStatsUri = HierarchyHelper.hierarchicalPath(conquery.defaultAdminURIBuilder()
, AdminDatasetResource.class, "postprocessNamespace")
.buildFromMap(Map.of(DATASET, conquery.getDataset().getId()));

conquery.getClient().target(matchingStatsUri)
.request(MediaType.APPLICATION_JSON_TYPE)
.post(null)
.close();

conquery.waitUntilWorkDone();

return filter;
}
}
@@ -5,7 +5,10 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import com.bakdata.conquery.apiv1.frontend.FrontendValue;
import com.bakdata.conquery.util.search.Cursor;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test;

@@ -27,4 +30,15 @@ public void testCursor() {

}

@Test
public void mapInsertion() {
Object2LongMap<FrontendValue> map = new Object2LongOpenHashMap<>();

map.put(new FrontendValue("a", "label1"), 1);
map.put(new FrontendValue("a", "label2"), 1);

// canary for changes of EqualsAndHashcode behaviour
assertThat(map).hasSize(1);
}

}
@@ -24,7 +24,16 @@ public void anaconda() {

assertThat(results)
.describedAs("Query for %s".formatted(query))
.isEqualTo(List.of("Anaconda", "Condar", "Anaxonds", "Condor", "Honda", "Analysis", "Ananas", "Canada", "London"));
.isEqualTo(List.of("Anaconda", // Full match
"Condar", // 3 trigrams
"Condor", // 2 trigrams
"Honda", // 2 trigrams
"Anaxonds", // 1 trigram
"London", // 1 trigram
"Ananas", // 1 trigram
"Analysis", // 1 trigram
"Canada" // 1 trigram
));

}
