Skip to content

Commit

Permalink
Add benchmark project, some small optimizations (#39)
Browse files Browse the repository at this point in the history
* Add benchmark project, some small optimizations

* Use .NET 8 SDK in build pipeline

* Update unit test project to .NET 8

* Make GetProfile protected internal
  • Loading branch information
paulirwin authored Aug 12, 2024
1 parent f2b0321 commit 248ef9e
Show file tree
Hide file tree
Showing 10 changed files with 161 additions and 21 deletions.
117 changes: 117 additions & 0 deletions F23.StringSimilarity.Benchmarks/Benchmarks.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
using BenchmarkDotNet.Attributes;

namespace F23.StringSimilarity.Benchmarks;

/// <summary>
/// BenchmarkDotNet suite with one benchmark per string-similarity algorithm.
/// Every benchmark constructs a fresh algorithm instance and computes a single
/// distance between the same fixed pair of short words, so the reported time
/// and allocations include the construction of the instance.
/// </summary>
[MemoryDiagnoser]
public class Benchmarks
{
    // Fixed input pair shared by every benchmark in this class.
    private const string Source = "hello";
    private const string Target = "world";

    /// <summary>Construct a Cosine instance and compute one distance.</summary>
    [Benchmark]
    public void Cosine() => _ = new Cosine().Distance(Source, Target);

    /// <summary>Construct a Damerau instance and compute one distance.</summary>
    [Benchmark]
    public void Damerau() => _ = new Damerau().Distance(Source, Target);

    /// <summary>Construct a Jaccard instance and compute one distance.</summary>
    [Benchmark]
    public void Jaccard() => _ = new Jaccard().Distance(Source, Target);

    /// <summary>Construct a JaroWinkler instance and compute one distance.</summary>
    [Benchmark]
    public void JaroWinkler() => _ = new JaroWinkler().Distance(Source, Target);

    /// <summary>Construct a Levenshtein instance and compute one distance.</summary>
    [Benchmark]
    public void Levenshtein() => _ = new Levenshtein().Distance(Source, Target);

    /// <summary>Construct a LongestCommonSubsequence instance and compute one distance.</summary>
    [Benchmark]
    public void LongestCommonSubsequence() => _ = new LongestCommonSubsequence().Distance(Source, Target);

    /// <summary>Construct a MetricLCS instance and compute one distance.</summary>
    [Benchmark]
    public void MetricLCS() => _ = new MetricLCS().Distance(Source, Target);

    /// <summary>Construct an NGram instance and compute one distance.</summary>
    [Benchmark]
    public void NGram() => _ = new NGram().Distance(Source, Target);

    /// <summary>Construct a NormalizedLevenshtein instance and compute one distance.</summary>
    [Benchmark]
    public void NormalizedLevenshtein() => _ = new NormalizedLevenshtein().Distance(Source, Target);

    /// <summary>Construct an OptimalStringAlignment instance and compute one distance.</summary>
    [Benchmark]
    public void OptimalStringAlignment() => _ = new OptimalStringAlignment().Distance(Source, Target);

    /// <summary>Construct a QGram instance and compute one distance.</summary>
    [Benchmark]
    public void QGram() => _ = new QGram().Distance(Source, Target);

    /// <summary>Construct a RatcliffObershelp instance and compute one distance.</summary>
    [Benchmark]
    public void RatcliffObershelp() => _ = new RatcliffObershelp().Distance(Source, Target);

    /// <summary>Construct a SorensenDice instance and compute one distance.</summary>
    [Benchmark]
    public void SorensenDice() => _ = new SorensenDice().Distance(Source, Target);

    /// <summary>
    /// Construct a WeightedLevenshtein instance backed by the example
    /// substitution-cost table below and compute one distance.
    /// </summary>
    [Benchmark]
    public void WeightedLevenshtein() =>
        _ = new WeightedLevenshtein(new ExampleCharSub()).Distance(Source, Target);

    /// <summary>
    /// Sample substitution-cost provider used only by the WeightedLevenshtein benchmark.
    /// </summary>
    private class ExampleCharSub : ICharacterSubstitution
    {
        /// <summary>
        /// Cost of substituting <paramref name="c1"/> with <paramref name="c2"/>.
        /// </summary>
        /// <returns>
        /// 0.5 when replacing 't' with 'r' (the two keys sit next to each other
        /// on a keyboard, so that substitution is treated as cheaper); 1.0 for
        /// every other pair.
        /// </returns>
        public double Cost(char c1, char c2) =>
            c1 == 't' && c2 == 'r' ? 0.5 : 1.0;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Benchmark project: built as an executable targeting net8.0, with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<!-- BenchmarkDotNet supplies the benchmark attributes and the runner used by this project's sources. -->
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
</ItemGroup>

<!-- The string-similarity library whose algorithms are being benchmarked. -->
<ItemGroup>
<ProjectReference Include="..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
</ItemGroup>

</Project>
4 changes: 4 additions & 0 deletions F23.StringSimilarity.Benchmarks/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
using BenchmarkDotNet.Running;
using F23.StringSimilarity.Benchmarks;

// Program entry point (top-level statement): hand the Benchmarks class to the BenchmarkDotNet runner.
BenchmarkRunner.Run<Benchmarks>();
6 changes: 6 additions & 0 deletions F23.StringSimilarity.sln
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity", "src
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Tests", "test\F23.StringSimilarity.Tests\F23.StringSimilarity.Tests.csproj", "{68F339E6-278F-4B04-A6ED-422AAD30591F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Benchmarks", "F23.StringSimilarity.Benchmarks\F23.StringSimilarity.Benchmarks.csproj", "{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -21,6 +23,10 @@ Global
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.Build.0 = Release|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
10 changes: 5 additions & 5 deletions src/F23.StringSimilarity/Jaccard.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

using System;
using System.Collections.Generic;
using System.Linq;
using F23.StringSimilarity.Interfaces;

// ReSharper disable LoopCanBeConvertedToQuery
Expand Down Expand Up @@ -83,14 +84,13 @@ public double Similarity(string s1, string s2)
var profile1 = GetProfile(s1);
var profile2 = GetProfile(s2);

var union = new HashSet<string>();
union.UnionWith(profile1.Keys);
union.UnionWith(profile2.Keys);
// SSNET Specific: use LINQ for more optimal distinct count
var unionCount = profile1.Keys.Concat(profile2.Keys).Distinct().Count();

int inter = profile1.Keys.Count + profile2.Keys.Count
- union.Count;
- unionCount;

return 1.0 * inter / union.Count;
return 1.0 * inter / unionCount;
}


Expand Down
6 changes: 2 additions & 4 deletions src/F23.StringSimilarity/Levenshtein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
// create two work vectors of integer distances
int[] v0 = new int[s2.Length + 1];
int[] v1 = new int[s2.Length + 1];
int[] vtemp;
// SSNET: removed unneeded int[] vtemp;

// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
Expand Down Expand Up @@ -155,9 +155,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
// System.arraycopy(v1, 0, v0, 0, v0.length);

// Flip references to current and previous row
vtemp = v0;
v0 = v1;
v1 = vtemp;
(v0, v1) = (v1, v0); // SSNET specific: Swap v0 and v1 using tuples
}

return v0[s2.Length];
Expand Down
6 changes: 2 additions & 4 deletions src/F23.StringSimilarity/NGram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ public double Distance(string s0, string s1)
char[] sa = new char[sl + n - 1];
float[] p; // 'previous' cost array, horizontally
float[] d; // Cost array, horizontally
float[] d2; // Placeholder to assist in swapping p and d
// SSNET removed unneeded: float[] d2; // Placeholder to assist in swapping p and d

// Construct sa with prefix
for (int i1 = 0; i1 < sa.Length; i1++)
Expand Down Expand Up @@ -172,9 +172,7 @@ public double Distance(string s0, string s1)
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
}
// Copy current distance counts to 'previous row' distance counts
d2 = p;
p = d;
d = d2;
(p, d) = (d, p); // SSNET specific: swap p and d using tuples
}

// Our last action in the above loop was to switch d and p, so p now
Expand Down
6 changes: 3 additions & 3 deletions src/F23.StringSimilarity/ShingleBased.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public abstract class ShingleBased
/// <summary>
/// Pattern for finding multiple following spaces
/// </summary>
private static readonly Regex SPACE_REG = new Regex("\\s+");
private static readonly Regex SPACE_REG = new Regex("\\s+", RegexOptions.Compiled);

/// <summary>
/// </summary>
Expand All @@ -59,7 +59,7 @@ protected ShingleBased(int k)

protected ShingleBased() : this(DEFAULT_K) { }

public IDictionary<string, int> GetProfile(string s)
protected internal Dictionary<string, int> GetProfile(string s)
{
var shingles = new Dictionary<string, int>();

Expand All @@ -79,7 +79,7 @@ public IDictionary<string, int> GetProfile(string s)
}
}

return new ReadOnlyDictionary<string, int>(shingles);
return shingles;
}
}
}
7 changes: 3 additions & 4 deletions src/F23.StringSimilarity/WeightedLevenshtein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/

using System;
using System.Threading;
using F23.StringSimilarity.Interfaces;
// ReSharper disable SuggestVarOrType_Elsewhere
// ReSharper disable TooWideLocalVariableScope
Expand Down Expand Up @@ -112,7 +113,7 @@ public double Distance(string s1, string s2, double limit)
// create two work vectors of floating point (i.e. weighted) distances
double[] v0 = new double[s2.Length + 1];
double[] v1 = new double[s2.Length + 1];
double[] vtemp;
// SSNET: removed unneeded double[] vtemp;

// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
Expand Down Expand Up @@ -166,9 +167,7 @@ public double Distance(string s1, string s2, double limit)
// copy v1 (current row) to v0 (previous row) for next iteration
// System.arraycopy(v1, 0, v0, 0, v0.length);
// Flip references to current and previous row
vtemp = v0;
v0 = v1;
v1 = vtemp;
(v0, v1) = (v1, v0); // SSNET Specific: Swap references using tuples instead of temporary
}

return v0[s2.Length];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@
<ProjectReference Include="..\..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
</ItemGroup>

</Project>
</Project>

0 comments on commit 248ef9e

Please sign in to comment.