diff --git a/F23.StringSimilarity.Benchmarks/Benchmarks.cs b/F23.StringSimilarity.Benchmarks/Benchmarks.cs new file mode 100644 index 0000000..bc750f9 --- /dev/null +++ b/F23.StringSimilarity.Benchmarks/Benchmarks.cs @@ -0,0 +1,117 @@ +using BenchmarkDotNet.Attributes; + +namespace F23.StringSimilarity.Benchmarks; + +[MemoryDiagnoser] +public class Benchmarks +{ + [Benchmark] + public void Cosine() + { + var cosine = new Cosine(); + _ = cosine.Distance("hello", "world"); + } + + [Benchmark] + public void Damerau() + { + var damerau = new Damerau(); + _ = damerau.Distance("hello", "world"); + } + + [Benchmark] + public void Jaccard() + { + var jaccard = new Jaccard(); + _ = jaccard.Distance("hello", "world"); + } + + [Benchmark] + public void JaroWinkler() + { + var jaro = new JaroWinkler(); + _ = jaro.Distance("hello", "world"); + } + + [Benchmark] + public void Levenshtein() + { + var levenshtein = new Levenshtein(); + _ = levenshtein.Distance("hello", "world"); + } + + [Benchmark] + public void LongestCommonSubsequence() + { + var lcs = new LongestCommonSubsequence(); + _ = lcs.Distance("hello", "world"); + } + + [Benchmark] + public void MetricLCS() + { + var metricLcs = new MetricLCS(); + _ = metricLcs.Distance("hello", "world"); + } + + [Benchmark] + public void NGram() + { + var ngram = new NGram(); + _ = ngram.Distance("hello", "world"); + } + + [Benchmark] + public void NormalizedLevenshtein() + { + var normalizedLevenshtein = new NormalizedLevenshtein(); + _ = normalizedLevenshtein.Distance("hello", "world"); + } + + [Benchmark] + public void OptimalStringAlignment() + { + var osa = new OptimalStringAlignment(); + _ = osa.Distance("hello", "world"); + } + + [Benchmark] + public void QGram() + { + var qGram = new QGram(); + _ = qGram.Distance("hello", "world"); + } + + [Benchmark] + public void RatcliffObershelp() + { + var ratcliffObershelp = new RatcliffObershelp(); + _ = ratcliffObershelp.Distance("hello", "world"); + } + + [Benchmark] + public void SorensenDice() + { + var sorensenDice = new SorensenDice(); + _ = sorensenDice.Distance("hello", "world"); + } + + [Benchmark] + public void WeightedLevenshtein() + { + var weightedLevenshtein = new WeightedLevenshtein(new ExampleCharSub()); + _ = weightedLevenshtein.Distance("hello", "world"); + } + + private class ExampleCharSub : ICharacterSubstitution + { + public double Cost(char c1, char c2) + { + // The cost for substituting 't' and 'r' is considered smaller as these 2 are located next to each other on a keyboard + if (c1 == 't' && c2 == 'r') return 0.5; + + // For most cases, the cost of substituting 2 characters is 1.0 + return 1.0; + } + } +} diff --git a/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj b/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj new file mode 100644 index 0000000..9d869a8 --- /dev/null +++ b/F23.StringSimilarity.Benchmarks/F23.StringSimilarity.Benchmarks.csproj @@ -0,0 +1,18 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + diff --git a/F23.StringSimilarity.Benchmarks/Program.cs b/F23.StringSimilarity.Benchmarks/Program.cs new file mode 100644 index 0000000..5cf4d5b --- /dev/null +++ b/F23.StringSimilarity.Benchmarks/Program.cs @@ -0,0 +1,4 @@ +using BenchmarkDotNet.Running; +using F23.StringSimilarity.Benchmarks; + +BenchmarkRunner.Run(); diff --git a/F23.StringSimilarity.sln b/F23.StringSimilarity.sln index 4f760e0..c9a10d0 100644 --- a/F23.StringSimilarity.sln +++ b/F23.StringSimilarity.sln @@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity", "src EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Tests", "test\F23.StringSimilarity.Tests\F23.StringSimilarity.Tests.csproj", "{68F339E6-278F-4B04-A6ED-422AAD30591F}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Benchmarks", "F23.StringSimilarity.Benchmarks\F23.StringSimilarity.Benchmarks.csproj", "{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -21,6 +23,10 @@ Global {68F339E6-278F-4B04-A6ED-422AAD30591F}.Debug|Any CPU.Build.0 = Debug|Any CPU {68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.ActiveCfg = Release|Any CPU {68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.Build.0 = Release|Any CPU + {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/F23.StringSimilarity/Jaccard.cs b/src/F23.StringSimilarity/Jaccard.cs index 8bd8ca5..433de1d 100644 --- a/src/F23.StringSimilarity/Jaccard.cs +++ b/src/F23.StringSimilarity/Jaccard.cs @@ -24,6 +24,7 @@ using System; using System.Collections.Generic; +using System.Linq; using F23.StringSimilarity.Interfaces; // ReSharper disable LoopCanBeConvertedToQuery @@ -83,14 +84,13 @@ public double Similarity(string s1, string s2) var profile1 = GetProfile(s1); var profile2 = GetProfile(s2); - var union = new HashSet(); - union.UnionWith(profile1.Keys); - union.UnionWith(profile2.Keys); + // SSNET Specific: use LINQ for more optimal distinct count + var unionCount = profile1.Keys.Concat(profile2.Keys).Distinct().Count(); int inter = profile1.Keys.Count + profile2.Keys.Count - - union.Count; + - unionCount; - return 1.0 * inter / union.Count; + return 1.0 * inter / unionCount; } diff --git a/src/F23.StringSimilarity/Levenshtein.cs b/src/F23.StringSimilarity/Levenshtein.cs index 32cfe2c..0ccf97d 100644 --- a/src/F23.StringSimilarity/Levenshtein.cs +++ b/src/F23.StringSimilarity/Levenshtein.cs @@ -109,7 +109,7 @@ public double Distance(ReadOnlySpan s1, ReadOnlySpan s2, int limit) // create two work vectors of integer distances int[] v0 = new int[s2.Length + 1]; int[] v1 = new int[s2.Length + 1]; - int[] vtemp; + // SSNET: removed unneeded int[] vtemp; // initialize v0 (the previous row of distances) // this row is A[0][i]: edit distance for an empty s @@ -155,9 +155,7 @@ public double Distance(ReadOnlySpan s1, ReadOnlySpan s2, int limit) // System.arraycopy(v1, 0, v0, 0, v0.length); // Flip references to current and previous row - vtemp = v0; - v0 = v1; - v1 = vtemp; + (v0, v1) = (v1, v0); // SSNET specific: Swap v0 and v1 using tuples } return v0[s2.Length]; diff --git a/src/F23.StringSimilarity/NGram.cs b/src/F23.StringSimilarity/NGram.cs index 35d46e0..34739e2 100644 --- a/src/F23.StringSimilarity/NGram.cs +++ b/src/F23.StringSimilarity/NGram.cs @@ -103,7 +103,7 @@ public double Distance(string s0, string s1) char[] sa = new char[sl + n - 1]; float[] p; // 'previous' cost array, horizontally float[] d; // Cost array, horizontally - float[] d2; // Placeholder to assist in swapping p and d + // SSNET removed unneeded: float[] d2; // Placeholder to assist in swapping p and d // Construct sa with prefix for (int i1 = 0; i1 < sa.Length; i1++) @@ -172,9 +172,7 @@ public double Distance(string s0, string s1) d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec); } // Copy current distance counts to 'previous row' distance counts - d2 = p; - p = d; - d = d2; + (p, d) = (d, p); // SSNET specific: swap p and d using tuples } // Our last action in the above loop was to switch d and p, so p now diff --git a/src/F23.StringSimilarity/ShingleBased.cs b/src/F23.StringSimilarity/ShingleBased.cs index a5ef275..2b2e70f 100644 --- a/src/F23.StringSimilarity/ShingleBased.cs +++ b/src/F23.StringSimilarity/ShingleBased.cs @@ -41,7 +41,7 @@ public abstract class ShingleBased /// /// Pattern for finding multiple following spaces /// - private static readonly Regex SPACE_REG = new Regex("\\s+"); + private static readonly Regex SPACE_REG = new Regex("\\s+", RegexOptions.Compiled); /// /// @@ -59,7 +59,7 @@ protected ShingleBased(int k) protected ShingleBased() : this(DEFAULT_K) { } - public IDictionary GetProfile(string s) + protected internal Dictionary GetProfile(string s) { var shingles = new Dictionary(); @@ -79,7 +79,7 @@ public IDictionary GetProfile(string s) } } - return new ReadOnlyDictionary(shingles); + return shingles; } } } diff --git a/src/F23.StringSimilarity/WeightedLevenshtein.cs b/src/F23.StringSimilarity/WeightedLevenshtein.cs index 4aa9a43..0ac576d 100644 --- a/src/F23.StringSimilarity/WeightedLevenshtein.cs +++ b/src/F23.StringSimilarity/WeightedLevenshtein.cs @@ -23,6 +23,7 @@ */ using System; +using System.Threading; using F23.StringSimilarity.Interfaces; // ReSharper disable SuggestVarOrType_Elsewhere // ReSharper disable TooWideLocalVariableScope @@ -112,7 +113,7 @@ public double Distance(string s1, string s2, double limit) // create two work vectors of floating point (i.e. weighted) distances double[] v0 = new double[s2.Length + 1]; double[] v1 = new double[s2.Length + 1]; - double[] vtemp; + // SSNET: removed unneeded double[] vtemp; // initialize v0 (the previous row of distances) // this row is A[0][i]: edit distance for an empty s1 @@ -166,9 +167,7 @@ public double Distance(string s1, string s2, double limit) // copy v1 (current row) to v0 (previous row) for next iteration // System.arraycopy(v1, 0, v0, 0, v0.length); // Flip references to current and previous row - vtemp = v0; - v0 = v1; - v1 = vtemp; + (v0, v1) = (v1, v0); // SSNET Specific: Swap references using tuples instead of temporary } return v0[s2.Length]; diff --git a/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj b/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj index 7d90b77..b74e697 100644 --- a/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj +++ b/test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj @@ -30,4 +30,4 @@ - \ No newline at end of file +