forked from ctSkennerton/minced
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SearchUtil.java
168 lines (139 loc) · 4.41 KB
/
SearchUtil.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class SearchUtil
{
// Boyer Moore algorithm copyright by Michael Lecuyer 1998. Slight modification below.
private static final int MAXCHAR = 256; // Maximum chars in character set.
private byte pat[]; // Byte representation of pattern
private int patLen;
private int partial; // Bytes of a partial match found at the end of a text buffer
private int skip[]; // Internal BM table
private int d[]; // Internal BM table
SearchUtil()
{
skip = new int[MAXCHAR];
d = null;
}
public int boyer_mooreSearch(String text, String pattern)
{
byte[] byteText = text.getBytes();
compile(pattern);
return search(byteText, 0, text.length());
}
public int linearSsearch(String text, String pattern)
{
int textLength = text.length();
int patternLength = pattern.length();
for(int i = 0; i <= textLength - patternLength; i++)
{ String subPattern = text.substring(i, i + patternLength);
if (pattern.equals(subPattern))
{ return i;
}
}
return -1;
}
public void compile(String pattern)
{
pat = pattern.getBytes();
patLen = pat.length;
int j, k, m, t, t1, q, q1;
int f[] = new int[patLen];
d = new int[patLen];
m = patLen;
for (k = 0; k < MAXCHAR; k++)
skip[k] = m;
for (k = 1; k <= m; k++)
{
d[k-1] = (m << 1) - k;
skip[pat[k-1]] = m - k;
}
t = m + 1;
for (j = m; j > 0; j--)
{
f[j-1] = t;
while (t <= m && pat[j-1] != pat[t-1])
{
d[t-1] = (d[t-1] < m - j) ? d[t-1] : m - j;
t = f[t-1];
}
t--;
}
q = t;
t = m + 1 - q;
q1 = 1;
t1 = 0;
for (j = 1; j <= t; j++)
{
f[j-1] = t1;
while (t1 >= 1 && pat[j-1] != pat[t1-1])
t1 = f[t1-1];
t1++;
}
while (q < m)
{
for (k = q1; k <= q; k++)
d[k-1] = (d[k-1] < m + q - k) ? d[k-1] : m + q - k;
q1 = q + 1;
q = q + t - f[t-1];
t = f[t-1];
}
}
/**
* Search for the compiled pattern in the given text.
* A side effect of the search is the notion of a partial
* match at the end of the searched buffer.
* This partial match is helpful in searching text files when
* the entire file doesn't fit into memory.
*
* @param text Buffer containing the text
* @param start Start position for search
* @param length Length of text in the buffer to be searched.
*
* @return position in buffer where the pattern was found.
* @see patialMatch
*/
public int search(byte text[], int start, int length)
{
int textLen = length + start;
partial = -1; // assume no partial match
if (d == null)
return -1; // no pattern compiled, nothing matches.
int m = patLen;
if (m == 0)
return 0;
int k, j = 0;
int max = 0; // used in calculation of partial match. Max distand we jumped.
for (k = start + m - 1; k < textLen;)
{
for (j = m - 1; j >= 0 && text[k] == pat[j]; j--)
k--;
if (j == -1)
return k + 1;
int z = skip[text[k]];
max = (z > d[j]) ? z : d[j];
k += max;
}
if (k >= textLen && j > 0) // if we're near end of buffer --
{
partial = k - max - 1;
return -1; // not a real match
}
return -1; // No match
}
/**
* Returns the position at the end of the text buffer where a partial match was found.
* <P>
* In many case where a full text search of a large amount of data
* precludes access to the entire file or stream the search algorithm
* will note where the final partial match occurs.
* After an entire buffer has been searched for full matches calling
* this method will reveal if a potential match appeared at the end.
* This information can be used to patch together the partial match
* with the next buffer of data to determine if a real match occurred.
*
* @return -1 the number of bytes that formed a partial match, -1 if no
* partial match
*/
public int partialMatch()
{
return partial;
}
}