AC自动机及多模式匹配

  • 在接触AC自动机之前,只仅仅掌握单模式匹配的算法:比如KMP、BMH等算法;经过优化后,KMP和BMH都具有线性时间复杂度,而实际情况下,一般的匹配问题BMH具有亚线性的表现。而昨天接触的AC自动机则是一种结合了字典树和KMP的一种算法,使得在多模式匹配下,时间复杂度达到$O(Σmi + n)$,其中n为原串长度,mi为第i个模式串的长度;
  • 匹配过程中类似于KMP,原串不走回头路,利用之前已经匹配过的结果来构造特殊的字典树从而形成AC自动机;
  • 创建自动机的过程中,最为重要的是fail指针的构造;我是从这篇文章中学会的,AC自动机算法详解;fail指针的作用类似于KMP中的next数组;
  • 我的这个模板中并没有考虑对自动机的优化, 比如ptr->fail->next[i]ptr->next[i]若同时不存在, 则ptr->fail其实是可以直接指向ptr->fail->fail的原因很简单, 因为ptr->next[i]发生失配时, ptr = ptr->fail, 此时肯定仍然失配, 需要继续ptr->fail,当然优化的代价是增加对存储空间的占用, fail需要变为vector<trieTreeNode*> fail,每个字母都应对应一个fail指针。
1
////////////////////////////////////////////////////////////////////////////////
2
/*
3
1 const vector<string> &patterns: several pattern strings;
4
2 const string &s: original strings;
5
3 vector<string> &answer: the patterns which are matched in the original strings;
6
4 return the number of patterns which are matched.
7
*/
8
////////////////////////////////////////////////////////////////////////////////
9
#include <iostream>
10
#include <vector>
11
#include <queue>
12
#include <string>
13
#include <unordered_set>
14
#define ALPH_NUM 26
15
using namespace std;
16
17
struct trieTreeNode {
18
    vector<trieTreeNode*> next;
19
    bool mark;
20
    trieTreeNode *fail;
21
    trieTreeNode(): next(26, nullptr), mark(false), fail(nullptr) {}
22
};
23
24
trieTreeNode *createAcAutomation(const vector<string> &patterns);
25
int findPatterns(vector<string> &answer, trieTreeNode *root, const string &s);
26
void makeFoundPatterns(vector<string> &answer, unordered_set<trieTreeNode*> &save, trieTreeNode *root, string pattern);
27
inline char turn_char(int index);
28
int multiPatternsMatchingByAcAutomation(const vector<string> &patterns, const string &s, vector<string> &answer);
29
30
trieTreeNode *createAcAutomation(const vector<string> &patterns) {
31
    trieTreeNode *root = new trieTreeNode(), *ptr, *cur;
32
    for (int i = 0; i != patterns.size(); ++i) {
33
        cur = root;
34
        for (int k = 0; k != patterns[i].size(); ++k) {
35
            int index = patterns[i][k] - 'a';
36
            if (!cur->next[index])
37
                cur->next[index] = new trieTreeNode();
38
            cur = cur->next[index];
39
        }
40
        cur->mark = true;
41
    }
42
    queue<trieTreeNode*> makeFail;
43
    makeFail.push(root);
44
    while (!makeFail.empty()) {
45
        cur = makeFail.front(); makeFail.pop();
46
        for (int i = 0; i != ALPH_NUM; ++i) {
47
            if (cur->next[i]) {
48
                for (ptr = cur->fail; ptr && !ptr->next[i]; ptr = ptr->fail);
49
                cur->next[i]->fail = ptr ? ptr->next[i] : root;
50
                makeFail.push(cur->next[i]);
51
            }
52
        }
53
    }
54
    return root;
55
}
56
57
int findPatterns(vector<string> &answer, trieTreeNode *root, const string &s) {
58
    int count = 0;
59
    string pattern;
60
    unordered_set<trieTreeNode*> save;
61
    trieTreeNode *cur = root;
62
    for (int i = 0; i != s.size(); ) {
63
        int index = s[i] - 'a';
64
        if (!cur) {
65
            cur = root; ++i;
66
        }
67
        else if (cur->next[index]) {
68
            cur = cur->next[index];
69
            if (cur->mark) {
70
                ++count;
71
                save.insert(cur);
72
            }
73
            ++i;
74
        }
75
        else {
76
            cur = cur->fail;
77
            if (cur && cur->mark) {
78
                ++count;
79
                save.insert(cur);
80
            }
81
        }
82
    }
83
    makeFoundPatterns(answer, save, root, pattern);
84
    return count;
85
}
86
87
void makeFoundPatterns(vector<string> &answer, unordered_set<trieTreeNode*> &save,
88
    trieTreeNode *root, string pattern) {
89
    unordered_set<trieTreeNode*>::iterator it = save.find(root);
90
    if (it != save.end())
91
        answer.push_back(pattern);
92
    for (int i = 0; i != ALPH_NUM; ++i) {
93
        if (root->next[i]) {
94
            string t(pattern);
95
            t.push_back(turn_char(i));
96
            makeFoundPatterns(answer, save, root->next[i], t);
97
        }
98
    }
99
}
100
101
inline char turn_char(int index) {
102
    return 'a' + index;
103
}
104
105
int multiPatternsMatchingByAcAutomation(const vector<string> &patterns, const string &s, vector<string> &answer) {
106
    trieTreeNode *root = createAcAutomation(patterns);
107
    return findPatterns(answer, root, s);
108
}
Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×