Skip to content

Commit

Permalink
fix: OR-1972 update elastic search for duplicates
Browse files (browse the repository at this point in the history)
  • Loading branch information
Jan Lesage committed Nov 23, 2023
1 parent b157237 commit e193521
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 23 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,24 +23,32 @@ public SearchDuplicateVerenigingDetectionService(IElasticClient client)
/// <summary>
/// Looks up <c>DuplicateDetectionDocument</c> entries whose name fuzzily matches
/// <paramref name="naam"/> and that satisfy at least one of the gemeente/postcode clauses
/// built from the addresses of <paramref name="locaties"/>.
/// </summary>
/// <param name="naam">Name that is matched against the indexed <c>Naam</c> field.</param>
/// <param name="locaties">Candidate locations; locations without an address are ignored.</param>
/// <returns>
/// The hits mapped through <c>ToDuplicateVereniging</c> (the query asks for at most 50);
/// an empty collection when none of the locations has an address.
/// </returns>
public async Task<IReadOnlyCollection<DuplicaatVereniging>> GetDuplicates(VerenigingsNaam naam, Locatie[] locaties)
{
    var locatiesMetAdres = locaties.Where(l => l.Adres is not null).ToArray();

    // Without an address there is no gemeente/postcode to filter on, so nothing can match.
    if (locatiesMetAdres.Length == 0) return Array.Empty<DuplicaatVereniging>();

    var postcodes = locatiesMetAdres.Select(l => l.Adres!.Postcode).ToArray();
    var gemeentes = locatiesMetAdres.Select(l => l.Adres!.Gemeente).ToArray();

    // Refresh before searching; awaited so the calling thread is not blocked inside this async method.
    // NOTE(review): presumably done so freshly indexed documents are visible to the search —
    // confirm the per-call refresh cost is acceptable.
    await _client.Indices.RefreshAsync(new RefreshRequest());

    var searchResponse =
        await _client
           .SearchAsync<DuplicateDetectionDocument>(
                s => s
                    .Size(50)
                    .Query(
                         q => q.Bool(
                             b => b.Must(must => must
                                     .Match(m => FuzzyMatchOpNaam(m, path: f => f.Naam, naam))
                                   )
                                   // The name match is mandatory; of the two location clauses at least one must hit.
                                  .Filter(f => f.Bool(
                                       fb => fb.Should(
                                                 MatchGemeente(gemeentes),
                                                 MatchPostcode(postcodes)
                                              )
                                              .MinimumShouldMatch(1))))));

    return searchResponse.Documents.Select(ToDuplicateVereniging).ToArray();
}

private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchPostcode(string[] postcodes)
Expand All @@ -67,9 +75,9 @@ private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryC
.Query(nq => nq
.Match(m =>
FuzzyMatchOpNaam(m,
f => f.Locaties
.First()
.Gemeente, string.Join(
path: f => f.Locaties
.First()
.Gemeente, string.Join(
separator: " ",
gemeentes))
)
Expand All @@ -81,15 +89,12 @@ private static MatchQueryDescriptor<DuplicateDetectionDocument> FuzzyMatchOpNaam
MatchQueryDescriptor<DuplicateDetectionDocument> m,
Expression<Func<DuplicateDetectionDocument, string>> path,
string query)
{
return m
.Field(path)
.Query(query)
.Analyzer(DuplicateDetectionDocumentMapping
.DuplicateAnalyzer)
.Fuzziness(Fuzziness.Auto) // Assumes this analyzer applies lowercase and asciifolding
.MinimumShouldMatch("90%");
}
=> m
.Field(path)
.Query(query)
.Analyzer(DuplicateDetectionDocumentMapping.DuplicateAnalyzer)
.Fuzziness(Fuzziness.Auto) // Assumes this analyzer applies lowercase and asciifolding
.MinimumShouldMatch("70%");

private static DuplicaatVereniging ToDuplicateVereniging(DuplicateDetectionDocument document)
=> new(
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -31,7 +31,7 @@ private static AnalyzersDescriptor AddDuplicateDetectionAnalyzer(AnalyzersDescri
=> ad.Custom(DuplicateDetectionDocumentMapping.DuplicateAnalyzer,
selector: ca
=> ca
.Tokenizer("standard")
.Filters("lowercase", "asciifolding", "dutch_stop")
.Tokenizer("lowercase")
.Filters("asciifolding", "dutch_stop")
);
}

0 comments on commit e193521

Please sign in to comment.