forked from t3nsor/codebook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
aho-corasick.cpp
101 lines (76 loc) · 2.17 KB
/
aho-corasick.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Complexity: O(N + M + occ)
// N: text length
// M: sum of keywords length
// occ: occurences of keywords in text
// Max number of states in the matching machine.
// Should be equal to the sum of the length of all keywords.
const int MAXS = 500;
// Maximum number of characters in input alphabet
const int MAXC = 26;
int out[MAXS];
int f[MAXS];
int g[MAXS][MAXC];
int buildMatchingMachine(string arr[], int k) {
memset(out, 0, sizeof out);
memset(g, -1, sizeof g);
int states = 1;
for (int i = 0; i < k; ++i) {
const string &word = arr[i];
int currentState = 0;
for (int j = 0; j < word.size(); ++j) {
int ch = word[j] - 'a';
if (g[currentState][ch] == -1)
g[currentState][ch] = states++;
currentState = g[currentState][ch];
}
out[currentState] |= (1 << i);
}
for (int ch = 0; ch < MAXC; ++ch)
if (g[0][ch] == -1)
g[0][ch] = 0;
memset(f, -1, sizeof f);
queue<int> q;
for (int ch = 0; ch < MAXC; ++ch) {
if (g[0][ch] != 0) {
f[g[0][ch]] = 0;
q.push(g[0][ch]);
}
}
while (!q.empty()) {
int state = q.front(); q.pop();
for (int ch = 0; ch < MAXC; ++ch) {
if (g[state][ch] != -1) {
int failure = f[state];
while (g[failure][ch] == -1)
failure = f[failure];
failure = g[failure][ch];
f[g[state][ch]] = failure;
out[g[state][ch]] |= out[failure];
q.push(g[state][ch]);
}
}
}
return states;
}
int findNextState(int currentState, char nextInput) {
int answer = currentState;
int ch = nextInput - 'a';
while (g[answer][ch] == -1)
answer = f[answer];
return g[answer][ch];
}
void searchWords(string arr[], int k, string text) {
buildMatchingMachine(arr, k);
int currentState = 0;
for (int i = 0; i < text.size(); ++i) {
currentState = findNextState(currentState, text[i]);
if (out[currentState] == 0)
continue;
for (int j = 0; j < k; ++j) {
if (out[currentState] & (1 << j)) {
cout << "Word " << arr[j] << " appears from "
<< i - arr[j].size() + 1 << " to " << i << endl;
}
}
}
}