-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstringdiff.lua
132 lines (102 loc) · 4.19 KB
/
stringdiff.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
--!strict
--[[
String similarity metric system / splinestein diffing algorithm.
Free to use!
]]
local StringDiff = {}
function StringDiff.longest_word(a: string, b: string) : (string, string)
-- Commutative; makes a the longest string.
if #a < #b then
a, b = b, a
end
return a, b
end
function StringDiff.longest_number(a: number, b: number) : (number, number)
-- Commutative; makes a the longest number.
if a < b then
a, b = b, a
end
return a, b
end
function StringDiff.longest_common_match(a: string, b: string) : (string, number)
-- Create an empty table to store the matches & count all matches.
local matches: {[number]: string} = {}
local all_match_count = 0
-- Iterate over the characters in b for each character in a.
for i = 1, #a do
for j = 1, #b do
-- Create an empty table to store the current match and iterate over the characters, starting at index i.
local current_match: {[number]: string} = {}
for k = i, #a do
-- Check if the character at index k in a is the same as the character at index j in b.
if string.sub(a, k, k) == string.sub(b, j, j) then
-- If the characters are the same, add the character to the current match table.
table.insert(current_match, string.sub(a, k, k))
-- Increment j to compare the next character in b.
j += 1
else
-- If the characters are different, break the inner loop.
break
end
end
-- Skip empty matches, only store those we found.
if #table.concat(current_match) > 0 then
all_match_count += 1
end
-- Update the longest match if the current one is longer.
if #current_match > #matches then
matches = current_match
end
end
end
-- Return both the longest match and the count.
return table.concat(matches), all_match_count
end
function StringDiff.perfect_common_match_calc(sentence: string) : number
--[[
In an ideal scenario, how many common matches would be made?
If both strings are "aaabb" it's (equation):
(how many a's * how many a's) + unique letter (how many b's * how many b's) + ... = 13
]]
local counts: {[string]: number} = {}
for i = 1, #sentence do
local letter: string = string.sub(sentence, i, i)
if counts[letter] then
counts[letter] = counts[letter] + 1
else
counts[letter] = 1
end
end
local total_matches = 0
for _, count in pairs(counts) do
total_matches += count * count
end
return total_matches
end
function StringDiff.compare(a: string, b: string) : (number, string)
-- Core functionality:
local a, b = StringDiff.longest_word(a, b)
-- Just no point in comparing such small strings. Lift restriction if you feel like it.
if #a < 1 then
local errmsg = "Strings are too short to compare."
warn(errmsg)
return 0, "Strings are too short to compare."
end
local perfect_match_count = StringDiff.perfect_common_match_calc(a)
local longest_match, all_match_count = StringDiff.longest_common_match(a, b)
-- Core ratio logic calculation:
local matches_length_compared = (#longest_match / #a) * 100 -- How big of a portion does the largest match take out of the sentence?
local match_count_similarity = (all_match_count / perfect_match_count) * 100 -- How many matches could've been made in an ideal perfect scenario?
local string_length_similarity = (#b / #a) * 100 -- How long both a and b are compared to each other?
-- Finalization:
local average_count_match_ratio = (matches_length_compared + match_count_similarity) / 2 -- Average out most important percentages, match length portion and count similarity for best result.
-- We want to calculate how much we should change the ratio.
local a_calc, b_calc = StringDiff.longest_number(average_count_match_ratio, string_length_similarity)
local ratio = average_count_match_ratio * (((b_calc / a_calc) * 100) / 100)
--[[ In a scenario where A and B are "Hey" and "How", you might think their general appearance,
is similar in that they are both three letters long and contain only lowercase letters.
However, this type of superficial similarity is not necessarily indicative of deeper similarity in meaning or usage.
Hence why I skip extra string_length_similarity checks that I could possibily utilize in the ratio. ]]
return ratio, longest_match
end
return StringDiff