diff --git a/F23.StringSimilarity.Benchmarks/Benchmarks.cs b/F23.StringSimilarity.Benchmarks/Benchmarks.cs
new file mode 100644
index 0000000..bc750f9
--- /dev/null
+++ b/F23.StringSimilarity.Benchmarks/Benchmarks.cs
@@ -0,0 +1,86 @@
+using BenchmarkDotNet.Attributes;
+
+namespace F23.StringSimilarity.Benchmarks;
+
+/// <summary>
+/// BenchmarkDotNet suite that measures a single <c>Distance</c> call for each
+/// string-similarity algorithm, always on the same pair of short inputs.
+/// </summary>
+/// <remarks>
+/// Each benchmark constructs its algorithm inside the measured method, so the
+/// construction cost is part of the reported time and allocations. Every method
+/// returns the computed distance: BenchmarkDotNet consumes returned values, which
+/// keeps the JIT from discarding the call as dead code.
+/// </remarks>
+[MemoryDiagnoser]
+public class Benchmarks
+{
+    /// <summary>First argument of every measured <c>Distance</c> call.</summary>
+    private const string Left = "hello";
+
+    /// <summary>Second argument of every measured <c>Distance</c> call.</summary>
+    private const string Right = "world";
+
+    [Benchmark]
+    public double Cosine() => new Cosine().Distance(Left, Right);
+
+    [Benchmark]
+    public double Damerau() => new Damerau().Distance(Left, Right);
+
+    [Benchmark]
+    public double Jaccard() => new Jaccard().Distance(Left, Right);
+
+    [Benchmark]
+    public double JaroWinkler() => new JaroWinkler().Distance(Left, Right);
+
+    [Benchmark]
+    public double Levenshtein() => new Levenshtein().Distance(Left, Right);
+
+    [Benchmark]
+    public double LongestCommonSubsequence() => new LongestCommonSubsequence().Distance(Left, Right);
+
+    [Benchmark]
+    public double MetricLCS() => new MetricLCS().Distance(Left, Right);
+
+    [Benchmark]
+    public double NGram() => new NGram().Distance(Left, Right);
+
+    [Benchmark]
+    public double NormalizedLevenshtein() => new NormalizedLevenshtein().Distance(Left, Right);
+
+    [Benchmark]
+    public double OptimalStringAlignment() => new OptimalStringAlignment().Distance(Left, Right);
+
+    [Benchmark]
+    public double QGram() => new QGram().Distance(Left, Right);
+
+    [Benchmark]
+    public double RatcliffObershelp() => new RatcliffObershelp().Distance(Left, Right);
+
+    [Benchmark]
+    public double SorensenDice() => new SorensenDice().Distance(Left, Right);
+
+    [Benchmark]
+    public double WeightedLevenshtein() =>
+        new WeightedLevenshtein(new ExampleCharSub()).Distance(Left, Right);
+
+    /// <summary>
+    /// Sample substitution-cost table handed to the <c>WeightedLevenshtein</c> benchmark.
+    /// </summary>
+    private sealed class ExampleCharSub : ICharacterSubstitution
+    {
+        /// <summary>Cost of substituting <paramref name="c1"/> with <paramref name="c2"/>.</summary>
+        /// <returns>0.5 for the pair ('t', 'r'); 1.0 for every other pair.</returns>
+        public double Cost(char c1, char c2)
+        {
+            // 't' and 'r' are adjacent on a keyboard, so this substitution is treated as cheaper.
+            if (c1 == 't' && c2 == 'r')
+            {
+                return 0.5;
+            }
+
+            // Every other substitution costs 1.0.
+            return 1.0;
+        }
+    }
+}
diff --git a/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj b/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj
new file mode 100644
index 0000000..9d869a8
--- /dev/null
+++ b/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj
@@ -0,0 +1,18 @@
+
+
+
+ Exe
+ net8.0
+ enable
+ enable
+
+
+
+
+
+
+
+
+
+
+
diff --git a/F23.StringSimilarity.Benchmarks/Program.cs b/F23.StringSimilarity.Benchmarks/Program.cs
new file mode 100644
index 0000000..5cf4d5b
--- /dev/null
+++ b/F23.StringSimilarity.Benchmarks/Program.cs
@@ -0,0 +1,4 @@
+using BenchmarkDotNet.Running;
+using F23.StringSimilarity.Benchmarks;
+
+BenchmarkRunner.Run<Benchmarks>(); // Runs every [Benchmark] method declared on the Benchmarks class.
diff --git a/F23.StringSimilarity.sln b/F23.StringSimilarity.sln
index 4f760e0..c9a10d0 100644
--- a/F23.StringSimilarity.sln
+++ b/F23.StringSimilarity.sln
@@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity", "src
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Tests", "test\F23.StringSimilarity.Tests\F23.StringSimilarity.Tests.csproj", "{68F339E6-278F-4B04-A6ED-422AAD30591F}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Benchmarks", "F23.StringSimilarity.Benchmarks\F23.StringSimilarity.Benchmarks.csproj", "{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -21,6 +23,10 @@ Global
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.Build.0 = Release|Any CPU
+ {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/src/F23.StringSimilarity/Jaccard.cs b/src/F23.StringSimilarity/Jaccard.cs
index 8bd8ca5..433de1d 100644
--- a/src/F23.StringSimilarity/Jaccard.cs
+++ b/src/F23.StringSimilarity/Jaccard.cs
@@ -24,6 +24,7 @@
using System;
using System.Collections.Generic;
+using System.Linq;
using F23.StringSimilarity.Interfaces;
// ReSharper disable LoopCanBeConvertedToQuery
@@ -83,14 +84,13 @@ public double Similarity(string s1, string s2)
var profile1 = GetProfile(s1);
var profile2 = GetProfile(s2);
- var union = new HashSet();
- union.UnionWith(profile1.Keys);
- union.UnionWith(profile2.Keys);
+ // SSNET Specific: use LINQ for more optimal distinct count
+ var unionCount = profile1.Keys.Concat(profile2.Keys).Distinct().Count();
int inter = profile1.Keys.Count + profile2.Keys.Count
- - union.Count;
+ - unionCount;
- return 1.0 * inter / union.Count;
+ return 1.0 * inter / unionCount;
}
diff --git a/src/F23.StringSimilarity/Levenshtein.cs b/src/F23.StringSimilarity/Levenshtein.cs
index 32cfe2c..0ccf97d 100644
--- a/src/F23.StringSimilarity/Levenshtein.cs
+++ b/src/F23.StringSimilarity/Levenshtein.cs
@@ -109,7 +109,7 @@ public double Distance(ReadOnlySpan s1, ReadOnlySpan s2, int limit)
// create two work vectors of integer distances
int[] v0 = new int[s2.Length + 1];
int[] v1 = new int[s2.Length + 1];
- int[] vtemp;
+ // SSNET: removed unneeded int[] vtemp;
// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s
@@ -155,9 +155,7 @@ public double Distance(ReadOnlySpan s1, ReadOnlySpan s2, int limit)
// System.arraycopy(v1, 0, v0, 0, v0.length);
// Flip references to current and previous row
- vtemp = v0;
- v0 = v1;
- v1 = vtemp;
+ (v0, v1) = (v1, v0); // SSNET specific: Swap v0 and v1 using tuples
}
return v0[s2.Length];
diff --git a/src/F23.StringSimilarity/NGram.cs b/src/F23.StringSimilarity/NGram.cs
index 35d46e0..34739e2 100644
--- a/src/F23.StringSimilarity/NGram.cs
+++ b/src/F23.StringSimilarity/NGram.cs
@@ -103,7 +103,7 @@ public double Distance(string s0, string s1)
char[] sa = new char[sl + n - 1];
float[] p; // 'previous' cost array, horizontally
float[] d; // Cost array, horizontally
- float[] d2; // Placeholder to assist in swapping p and d
+ // SSNET removed unneeded: float[] d2; // Placeholder to assist in swapping p and d
// Construct sa with prefix
for (int i1 = 0; i1 < sa.Length; i1++)
@@ -172,9 +172,7 @@ public double Distance(string s0, string s1)
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
}
// Copy current distance counts to 'previous row' distance counts
- d2 = p;
- p = d;
- d = d2;
+ (p, d) = (d, p); // SSNET specific: swap p and d using tuples
}
// Our last action in the above loop was to switch d and p, so p now
diff --git a/src/F23.StringSimilarity/ShingleBased.cs b/src/F23.StringSimilarity/ShingleBased.cs
index a5ef275..2b2e70f 100644
--- a/src/F23.StringSimilarity/ShingleBased.cs
+++ b/src/F23.StringSimilarity/ShingleBased.cs
@@ -41,7 +41,7 @@ public abstract class ShingleBased
///
/// Pattern for finding multiple following spaces
///
- private static readonly Regex SPACE_REG = new Regex("\\s+");
+ private static readonly Regex SPACE_REG = new Regex("\\s+", RegexOptions.Compiled);
///
///
@@ -59,7 +59,7 @@ protected ShingleBased(int k)
protected ShingleBased() : this(DEFAULT_K) { }
- public IDictionary GetProfile(string s)
+ protected internal Dictionary GetProfile(string s)
{
var shingles = new Dictionary();
@@ -79,7 +79,7 @@ public IDictionary GetProfile(string s)
}
}
- return new ReadOnlyDictionary(shingles);
+ return shingles;
}
}
}
diff --git a/src/F23.StringSimilarity/WeightedLevenshtein.cs b/src/F23.StringSimilarity/WeightedLevenshtein.cs
index 4aa9a43..0ac576d 100644
--- a/src/F23.StringSimilarity/WeightedLevenshtein.cs
+++ b/src/F23.StringSimilarity/WeightedLevenshtein.cs
@@ -23,6 +23,7 @@
*/
using System;
+using System.Threading;
using F23.StringSimilarity.Interfaces;
// ReSharper disable SuggestVarOrType_Elsewhere
// ReSharper disable TooWideLocalVariableScope
@@ -112,7 +113,7 @@ public double Distance(string s1, string s2, double limit)
// create two work vectors of floating point (i.e. weighted) distances
double[] v0 = new double[s2.Length + 1];
double[] v1 = new double[s2.Length + 1];
- double[] vtemp;
+ // SSNET: removed unneeded double[] vtemp;
// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
@@ -166,9 +167,7 @@ public double Distance(string s1, string s2, double limit)
// copy v1 (current row) to v0 (previous row) for next iteration
// System.arraycopy(v1, 0, v0, 0, v0.length);
// Flip references to current and previous row
- vtemp = v0;
- v0 = v1;
- v1 = vtemp;
+ (v0, v1) = (v1, v0); // SSNET Specific: Swap references using tuples instead of temporary
}
return v0[s2.Length];
diff --git a/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj b/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj
index 7d90b77..b74e697 100644
--- a/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj
+++ b/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj
@@ -30,4 +30,4 @@
-
\ No newline at end of file
+