-
Notifications
You must be signed in to change notification settings - Fork 4
/
lcss.go
218 lines (209 loc) · 7.25 KB
/
lcss.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
package lcss
import "bytes"
// LongestCommonSubstring returns the longest substring which is present in all the given strings.
// https://en.wikipedia.org/wiki/Longest_common_substring_problem
// Not to be confused with the Longest Common Subsequence.
// Complexity:
// * time: sum of `n_i*log(n_i)` where `n_i` is the length of each string.
// * space: sum of `n_i`.
// Returns a byte slice which is never a nil.
//
// ### Algorithm.
// We build suffix arrays for each of the passed string and then follow the same procedure
// as in merge sort: pick the least suffix in the lexicographical order. It is possible
// because the suffix arrays are already sorted.
// We record the last encountered suffixes from each of the strings and measure the longest
// common prefix of those at each "merge sort" step.
// The string comparisons are optimized by maintaining the char-level prefix tree of the "heads"
// of the suffix array sequences.
func LongestCommonSubstring(strs ...[]byte) []byte {
strslen := len(strs)
if strslen == 0 {
return []byte{}
}
if strslen == 1 {
return strs[0]
}
suffixes := make([][]int, strslen)
for i, str := range strs {
suffixes[i] = Qsufsort(str) // stdlib's qsufsort
}
return LongestCommonSubstringWithSuffixArrays(strs, suffixes)
}
// LongestCommonSubstringWithSuffixArrays returns the longest substring which is present in all
// the given strings. The corresponding suffix arrays must be precomputed with `lcss.Qsufsort`.
// https://en.wikipedia.org/wiki/Longest_common_substring_problem
// Not to be confused with the Longest Common Subsequence.
// Complexity:
// * time: sum of the string lengths.
// * space: sum of the string lengths.
// Returns a byte slice which is never a nil.
//
// ### Algorithm.
// We follow the same procedure as in merge sort: pick the least suffix in the lexicographical
// order. It is possible because the suffix arrays are already sorted.
// We record the last encountered suffixes from each of the strings and measure the longest
// common prefix of those at each "merge sort" step. The string comparisons are optimized by
// maintaining the char-level prefix tree of the "heads" of the suffix array sequences.
func LongestCommonSubstringWithSuffixArrays(strs [][]byte, suffixes [][]int) []byte {
strslen := len(strs)
if strslen != len(suffixes) {
panic("the suffix array must be computed with lcss.Qsufsort for each string")
}
if strslen == 0 {
return []byte{}
}
if strslen == 1 {
return strs[0]
}
minstrlen := len(strs[0]) // minimum length of the strings
for i, str := range strs {
if len(str) != len(suffixes[i]) {
panic("each suffix array must be exactly the same length as the corresponding string")
}
if minstrlen > len(str) {
minstrlen = len(str)
}
}
heads := make([]int, strslen) // position in each suffix array
boilerplate := make([][]byte, strslen) // existing suffixes in the tree
boiling := 0 // indicates how many distinct suffix arrays are presented in `boilerplate`
var root charNode // the character tree built on the strings from `boilerplate`
lcs := []byte{} // our function's return value, `var lcss []byte` does *not* work
for {
mini := -1
var minSuffixStr []byte
for i, head := range heads {
if head >= len(suffixes[i]) {
// this suffix array has been scanned till the end
continue
}
suffix := strs[i][suffixes[i][head]:]
if minSuffixStr == nil {
// initialize
mini = i
minSuffixStr = suffix
} else if bytes.Compare(minSuffixStr, suffix) > 0 {
// the current suffix is the smallest in the lexicographical order
mini = i
minSuffixStr = suffix
}
}
if mini == -1 {
// all heads exhausted
break
}
if boilerplate[mini] != nil {
// if we already have a suffix from this string, replace it with the new one
root.Remove(boilerplate[mini])
} else {
// we track the number of distinct strings which have been touched
// when `boiling` becomes strslen we can start measuring the longest common prefix
boiling++
}
boilerplate[mini] = minSuffixStr
root.Add(minSuffixStr)
heads[mini]++
if boiling == strslen && root.LongestCommonPrefixLength() > len(lcs) {
// all heads > 0, the current common prefix of the suffixes is the longest
lcs = root.LongestCommonPrefix()
if len(lcs) == minstrlen {
// early exit - we will never find a longer substring
break
}
}
}
return lcs
}
// charNode builds a tree of individual characters.
// `used` is the counter for collecting garbage: those nodes which have `used`=0 are removed.
// The root charNode always remains intact apart from `children`.
// The tree supports 4 operations:
// 1. Add() a new string.
// 2. Remove() an existing string which was previously Add()-ed.
// 3. LongestCommonPrefixLength().
// 4. LongestCommonPrefix().
type charNode struct {
char byte
children []charNode
used int
}
// Add includes a new string into the tree. We start from the root and
// increment `used` of all the nodes we visit.
func (cn *charNode) Add(str []byte) {
head := cn
for i, char := range str {
found := false
for j, child := range head.children {
if child.char == char {
head.children[j].used++
head = &head.children[j] // -> child
found = true
break
}
}
if !found {
// add the missing nodes one by one
for _, char = range str[i:] {
head.children = append(head.children, charNode{char: char, children: nil, used: 1})
head = &head.children[len(head.children)-1]
}
break
}
}
}
// Remove excludes a node which was previously Add()-ed.
// We start from the root and decrement `used` of all the nodes we visit.
// If there is a node with `used`=0, we erase it from the parent's list of children
// and stop traversing the tree.
func (cn *charNode) Remove(str []byte) {
stop := false
head := cn
for _, char := range str {
for j, child := range head.children {
if child.char != char {
continue
}
head.children[j].used--
var parent *charNode
head, parent = &head.children[j], head // shift to the child
if head.used == 0 {
parent.children = append(parent.children[:j], parent.children[j+1:]...)
// we can skip deleting the rest of the nodes - they have been already discarded
stop = true
}
break
}
if stop {
break
}
}
}
// LongestCommonPrefixLength returns the length of the longest common prefix of the strings
// which are stored in the tree. We visit the children recursively starting from the root and
// stop if `used` value decreases or there is more than one child.
func (cn charNode) LongestCommonPrefixLength() int {
var result int
for head := cn; len(head.children) == 1 && head.children[0].used >= head.used; head = head.children[0] {
result++
}
return result
}
// LongestCommonPrefix returns the longest common prefix of the strings
// which are stored in the tree. We compute the length by calling LongestCommonPrefixLength()
// and then record the characters which we visit along the way from the root to the last node.
func (cn charNode) LongestCommonPrefix() []byte {
result := make([]byte, cn.LongestCommonPrefixLength())
if len(result) == 0 {
return result
}
var i int
for head := cn.children[0]; ; head = head.children[0] {
result[i] = head.char
i++
if i == len(result) {
break
}
}
return result
}