模式搜索KMP算法的C++实现

308 阅读2分钟

Knuth-Morris-Pratt Algorithm in C++

KMP Algorithm for Pattern Searching - GeeksforGeeks | Knuth-Morris-Pratt Algorithm

Given a text txt[0..n-1] and a pattern pat[0..m-1], write a function search(char pat[], char txt[]) that prints all occurrences of pat[] in txt[]. You may assume that n > m.

The worst case complexity of the Naive algorithm is O(m(n-m+1)). The time complexity of KMP algorithm is O(n) in the worst case.

findPrefix(pattern, m, prefArray)

构造数组 lps:lps[i] 表示子串 pattern[0..i] 中既是真前缀又是后缀的最长子串的长度;也就是说,以 pattern[i] 结尾、长度为 lps[i] 的子串与 pattern 开头的 lps[i] 个字符完全相同。

Examples of lps[] construction:
For the pattern “AAAA”, 
lps[] is [0, 1, 2, 3]

For the pattern “ABCDE”, 
lps[] is [0, 0, 0, 0, 0]

For the pattern “AABAACAABAA”, 
lps[] is [0, 1, 0, 1, 2, 0, 1, 2, 3, 4, 5]

For the pattern “AAACAAAAAC”, 
lps[] is [0, 1, 2, 0, 1, 2, 3, 3, 3, 4] 

For the pattern “AAABAAA”, 
lps[] is [0, 1, 2, 0, 1, 2, 3]

求lps的伪代码:

findPrefix(pattern, m, prefArray)
Input − The pattern, the length of pattern and an array to store prefix location
Output − The array to store where prefixes are located
Begin
   length := 0
   prefArray[0] := 0

   for each character index 'i' of pattern from 1 to m - 1, do
      if pattern[i] = pattern[length], then
         increase length by 1
         prefArray[i] := length
      else
         if length ≠ 0 then
            length := prefArray[length - 1]
            decrease i by 1
         else
            prefArray[i] := 0
   done
End

kmpAlgorithm(text, pattern)

在获得lps数组后求匹配字符串的伪代码

kmpAlgorithm(text, pattern)
Input: The main text, and the pattern, which will be searched
Output − The location where patterns are found
Begin
   n := size of text
   m := size of pattern
   call findPrefix(pattern, m, prefArray)
   i := 0
   j := 0

   while i < n, do
      if text[i] = pattern[j], then
         increase i and j by 1
      if j = m, then
         print the location (i-j) as there is the pattern
         j := prefArray[j-1]
      else if i < n AND pattern[j] ≠ text[i] then
         if j ≠ 0 then
            j := prefArray[j - 1]
         else
            increase i by 1
   done
End

例子:

txt = "ABABDABACDABABCABAB";
pattern = "ABABCABAB"
lps={0,0,1,2,0,1,2,3,4}

i=0,j=0
ABABDABACDABABCABAB
ABABCABAB

...

i=3,j=3
ABABDABACDABABCABAB
ABABCABAB

i=4,j=4
ABABDABACDABABCABAB
ABABCABAB
0,0,1,2,0,1,2,3,4
0,1,2,3,4,5,6,7,8

i=4,j = lps[j - 1]=lps[3]=2
ABABDABACDABABCABAB
ABABCABAB
0,0,1,2,0,1,2,3,4
0,1,2,3,4,5,6,7,8

i=4,j = lps[j - 1]=lps[1]=0
ABABDABACDABABCABAB
ABABCABAB
0,0,1,2,0,1,2,3,4
0,1,2,3,4,5,6,7,8

i=i+1=5,j = 0
ABABDABACDABABCABAB
ABABCABAB
0,0,1,2,0,1,2,3,4
0,1,2,3,4,5,6,7,8

...

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// C++ program for implementation of KMP pattern searching 
// algorithm 

// Forward declaration: fills lps[0..M-1] with, for each i, the length of the
// longest proper prefix of pat[0..i] that is also a suffix of pat[0..i].
void computeLPSArray(char* pat, int M, int* lps);

// Prints the starting index of every occurrence of pat[] in txt[]
// (overlapping occurrences included) using the KMP algorithm.
//
// pat, txt: NUL-terminated C strings.
// Output:   one "Found pattern at index %d " fragment per match, written to
//           stdout in increasing index order.
// An empty pattern is treated as having no occurrences and prints nothing.
void KMPSearch(char* pat, char* txt)
{
    const int M = static_cast<int>(std::strlen(pat));
    const int N = static_cast<int>(std::strlen(txt));

    // With M == 0 there is no lps[0] to fill and the `j == M` branch below
    // would read lps[-1]; bail out before touching the table.
    if (M == 0)
        return;

    // Failure table for the pattern. A vector is used because M is only
    // known at run time, and it releases its storage on every return path.
    std::vector<int> lps(M);

    // Preprocess the pattern (calculate the lps[] table).
    computeLPSArray(pat, M, lps.data());

    int i = 0; // index into txt[]
    int j = 0; // index into pat[] == number of pattern chars matched so far
    while (i < N) {
        if (pat[j] == txt[i]) {
            j++;
            i++;
        }

        if (j == M) {
            // Whole pattern matched, ending just before txt[i].
            std::printf("Found pattern at index %d ", i - j);
            // Keep the longest border so overlapping matches are found too.
            j = lps[j - 1];
        }

        // Mismatch after j matched characters.
        else if (i < N && pat[j] != txt[i]) {
            // The first lps[j-1] characters are known to match already, so
            // resume comparing from there instead of restarting at 0.
            if (j != 0)
                j = lps[j - 1];
            else
                i = i + 1; // nothing matched: advance in the text
        }
    }
}

// Fills lps[0..M-1] for the given pattern pat[0..M-1].
//
// lps[i] is the length of the longest proper prefix of pat[0..i] that is
// also a suffix of pat[0..i].
//
// pat: pattern characters; only pat[0..M-1] are read.
// M:   number of pattern characters; when M <= 0 nothing is written.
// lps: output buffer with room for at least M ints.
void computeLPSArray(char* pat, int M, int* lps)
{
    // An empty pattern has no table entries; writing lps[0] would be an
    // out-of-bounds store into a zero-length buffer.
    if (M <= 0)
        return;

    lps[0] = 0; // a single character has no proper prefix

    // Length of the border of pat[0..i-1] currently being extended.
    int len = 0;

    // i only advances once lps[i] has been decided.
    for (int i = 1; i < M;) {
        if (pat[i] == pat[len]) {
            // The current border grows by one character.
            lps[i++] = ++len;
        } else if (len != 0) {
            // Fall back to the next shorter border and retry the same i,
            // e.g. "AAACAAAA" at i = 7.
            len = lps[len - 1];
        } else {
            // No border can be extended by pat[i].
            lps[i++] = 0;
        }
    }
}

// Demo entry point: runs the KMP search once on a fixed text/pattern pair.
int main()
{
    // LPS table for this pattern: {0,0,1,2,0,1,2,3,4}
    char pattern[] = "ABABCABAB";
    char text[] = "ABABDABACDABABCABAB";

    KMPSearch(pattern, text);
    return 0;
}

编程题 | leetcode 214. 最短回文串

给定一个字符串 s,你可以通过在字符串前面添加字符将其转换为回文串。找到并返回可以用这种方式转换的最短回文串。

思路简述:求s1="oonjk"的翻转字符串s2="kjnoo",求s2的后缀与s1的前缀的匹配。即串接字符串s=s1+"#"+s2(原字符串s中不包含"#"字符),应用求lps的算法。

// LeetCode 214: shortest palindrome obtainable by adding characters in
// front of s.
class Solution {
public:
    // Returns the shortest palindrome that has s as a suffix.
    //
    // Builds t = s + "#" + reverse(s) and computes the KMP lps table of t.
    // t is itself a palindrome, so each border of t is a palindromic prefix
    // of t; the longest border of length <= |s| is therefore the longest
    // palindromic prefix of s. The characters of reverse(s) not covered by
    // that prefix are what must be prepended.
    std::string shortestPalindrome(std::string s) {
        const std::size_t sl = s.size();
        std::string s2(s);
        std::reverse(s2.begin(), s2.end());
        s += "#" + s2;

        // Standard lps construction over the concatenated string.
        std::vector<std::size_t> lps(s.size(), 0);
        std::size_t len = 0;
        std::size_t i = 1;
        while (i < s.size()) {
            if (s[i] == s[len]) {
                lps[i++] = ++len;
            } else if (len != 0) {
                len = lps[len - 1];
            } else {
                ++i;
            }
        }

        // When s itself contains '#', the longest border can run past the
        // separator and exceed |s|. Follow the failure links down to the
        // longest border that fits inside s, so `sl - keep` cannot wrap.
        std::size_t keep = lps.back();
        while (keep > sl) {
            keep = lps[keep - 1];
        }

        // First (sl - keep) characters of reverse(s), followed by the
        // original s.
        return s.substr(sl + 1, sl - keep) + s.substr(0, sl);
    }
};

又解:

// LeetCode 214 (alternative): same idea, using the "next" form of the KMP
// failure table, where next[0] == -1.
class Solution {
public:
    // Returns the shortest palindrome that has s as a suffix.
    std::string shortestPalindrome(std::string s) {
        std::string s2(s);
        std::reverse(s2.begin(), s2.end());
        const std::string s_new = s + "#" + s2;
        const std::size_t total = s_new.size();

        // next[j] is the length of the longest proper prefix of
        // s_new[0..j-1] that is also its suffix, i.e. the index of the
        // character that follows that prefix. next[0] is the -1 sentinel.
        std::vector<int> next(total + 1, 0);
        next[0] = -1;

        int i = -1;        // prefix cursor, -1 means "no prefix to extend"
        std::size_t j = 0; // suffix cursor
        while (j < total) {
            if (i == -1 || s_new[i] == s_new[j]) {
                ++i;
                ++j;
                next[j] = i;
            } else {
                i = next[i]; // fall back to the next shorter border
            }
        }

        // s_new is a palindrome, so each of its borders is a palindromic
        // prefix. When s contains '#', the longest border can be longer
        // than s; follow the failure links down to the longest one that
        // fits, so the subtraction below cannot wrap around.
        const int n = static_cast<int>(s.size());
        int keep = next[total];
        while (keep > n) {
            keep = next[keep];
        }

        // Part of reverse(s) not covered by the palindromic prefix,
        // followed by s.
        return s_new.substr(s.size() + 1, s.size() - static_cast<std::size_t>(keep)) + s;
    }
};