《swift-algorithm-club》——算法/字符串查找(匹配)

376 阅读4分钟

字符串搜索

暴力搜索(Brute-Force String Search)

// Input: 
let s = "Hello, World"
s.indexOf("World")

// Output:
<String.Index?> 7

// Input:
let animals = "🐶🐔🐷🐮🐱"
animals.indexOf("🐮")

// Output:
<String.Index?> 6

这是一个暴力实现

extension String {
  /// Finds the first occurrence of `pattern` with a brute-force scan.
  ///
  /// Every index of the receiver is tried as a candidate start, and the
  /// pattern is compared character by character from there.
  ///
  /// - Parameter pattern: The text to look for.
  /// - Returns: The index at which the first match begins, or `nil` when the
  ///   pattern does not occur. An empty pattern matches at `startIndex` of a
  ///   non-empty string.
  func indexOf(_ pattern: String) -> String.Index? {
    for start in indices {
      var cursor = start
      var matched = true
      for p in pattern.indices {
        // Running off the end of the text, or hitting a differing character,
        // rules out this starting position.
        if cursor == endIndex || self[cursor] != pattern[p] {
          matched = false
          break
        }
        cursor = index(after: cursor)
      }
      if matched {
        return start
      }
    }
    return nil
  }
}

Boyer-Moore算法(Boyer-Moore)

举个栗子更直观

source string:  Hello, World
search pattern: World
                    ^

o 与 d 不匹配,但 o 确实出现在匹配模式中,因此我们可以向前跳过3个位置。

source string:  Hello, World
search pattern:    World
                       ^

W 与 d 不匹配,但 W 确实出现在匹配模式中,因此我们可以向前跳过4个位置。

source string:  Hello, World
search pattern:        World
                           ^

这次两个字符串可能匹配,我们从后向前做暴力验证。

extension String {
  /// Finds the first occurrence of `pattern` using a Boyer-Moore style search.
  ///
  /// The pattern is aligned against the text and compared from its last
  /// character backwards. When the text character under the pattern's last
  /// position does not match, a skip table tells how far the pattern can be
  /// shifted without missing a match.
  ///
  /// - Parameter pattern: The text to look for.
  /// - Returns: The index at which the first match begins, or `nil` when the
  ///   pattern is empty, longer than the receiver, or does not occur.
  func index(of pattern: String) -> Index? {
    // Cache the length of the search pattern because we're going to
    // use it a few times and it's expensive to calculate.
    let patternLength = pattern.count
    guard patternLength > 0, patternLength <= count else { return nil }

    // Make the skip table. For every character of the pattern it stores the
    // distance from its right-most occurrence to the end of the pattern.
    var skipTable = [Character: Int]()
    for (offset, character) in pattern.enumerated() {
      skipTable[character] = patternLength - offset - 1
    }

    // This points at the last character in the pattern.
    let p = pattern.index(before: pattern.endIndex)
    let lastChar = pattern[p]

    // The pattern is scanned right-to-left, so skip ahead in the string by
    // the length of the pattern. (Minus 1 because startIndex already points
    // at the first character in the source string.)
    var i = index(startIndex, offsetBy: patternLength - 1)

    // This is a helper function that steps backwards through both strings
    // until we find a character that doesn't match, or until we've reached
    // the beginning of the pattern.
    func backwards() -> Index? {
      var q = p
      var j = i
      while q > pattern.startIndex {
        j = index(before: j)
        // `q` belongs to `pattern`, so it must be moved with the pattern's
        // own index arithmetic, not the receiver's.
        q = pattern.index(before: q)
        if self[j] != pattern[q] { return nil }
      }
      return j
    }

    // The main loop. Keep going until the end of the string is reached.
    while i < endIndex {
      let c = self[i]

      // Does the current character match the last character from the pattern?
      if c == lastChar {

        // There is a possible match. Do a brute-force search backwards.
        if let k = backwards() { return k }

        // If no match, we can only safely skip one character ahead.
        i = index(after: i)
      } else {
        // The characters are not equal, so skip ahead. The amount to skip is
        // determined by the skip table. If the character is not present in the
        // pattern, we can skip ahead by the full pattern length. However, if
        // the character *is* present in the pattern, there may be a match up
        // ahead and we can't skip as far.
        i = index(i, offsetBy: skipTable[c] ?? patternLength, limitedBy: endIndex) ?? endIndex
      }
    }
    return nil
  }
}

上面算法的一个变体是 Boyer-Moore-Horspool 算法

extension String {
  /// Finds the first occurrence of `pattern` using the Boyer-Moore-Horspool
  /// variant of the search.
  ///
  /// The text character aligned with the pattern's last position always
  /// decides the shift, whether or not it matched the last pattern character.
  ///
  /// - Parameter pattern: The text to look for.
  /// - Returns: The index at which the first match begins, or `nil` when the
  ///   pattern is empty, longer than the receiver, or does not occur.
  func index(of pattern: String) -> Index? {
    // Cache the length of the search pattern because we're going to
    // use it a few times and it's expensive to calculate.
    let patternLength = pattern.count
    guard patternLength > 0, patternLength <= count else { return nil }

    // Make the skip table. For every character of the pattern it stores the
    // distance from its right-most occurrence to the end of the pattern.
    var skipTable = [Character: Int]()
    for (offset, character) in pattern.enumerated() {
      skipTable[character] = patternLength - offset - 1
    }

    // This points at the last character in the pattern.
    let p = pattern.index(before: pattern.endIndex)
    let lastChar = pattern[p]

    // The pattern is scanned right-to-left, so skip ahead in the string by
    // the length of the pattern. (Minus 1 because startIndex already points
    // at the first character in the source string.)
    var i = index(startIndex, offsetBy: patternLength - 1)

    // This is a helper function that steps backwards through both strings
    // until we find a character that doesn't match, or until we've reached
    // the beginning of the pattern.
    func backwards() -> Index? {
      var q = p
      var j = i
      while q > pattern.startIndex {
        j = index(before: j)
        // `q` belongs to `pattern`, so it must be moved with the pattern's
        // own index arithmetic, not the receiver's.
        q = pattern.index(before: q)
        if self[j] != pattern[q] { return nil }
      }
      return j
    }

    // The main loop. Keep going until the end of the string is reached.
    while i < endIndex {
      let c = self[i]

      // Does the current character match the last character from the pattern?
      if c == lastChar {

        // There is a possible match. Do a brute-force search backwards.
        if let k = backwards() { return k }

        // Ensure we jump at least one character: the last pattern character
        // is in the skip table with a distance of 0 (`skipTable[lastChar] == 0`),
        // which on its own would never advance.
        let jumpOffset = max(skipTable[c] ?? patternLength, 1)
        i = index(i, offsetBy: jumpOffset, limitedBy: endIndex) ?? endIndex
      } else {
        // The characters are not equal, so skip ahead. The amount to skip is
        // determined by the skip table. If the character is not present in the
        // pattern, we can skip ahead by the full pattern length. However, if
        // the character *is* present in the pattern, there may be a match up
        // ahead and we can't skip as far.
        i = index(i, offsetBy: skipTable[c] ?? patternLength, limitedBy: endIndex) ?? endIndex
      }
    }
    return nil
  }
}

KMP算法(Knuth-Morris-Pratt)

该算法的思想和暴力搜索没有太大区别,只是在发生不匹配时,借助预处理得到的数组,进行更大的位移。

z[i] 和 suffixPrefix[i] 的映射关系

// Excerpt: converts the Z-array `zeta` of the pattern into the `suffixPrefix`
// table. Pattern positions are visited from right to left; each Z value is
// stored at index `patternIndex + zeta[patternIndex] - 1`, i.e. the last
// position covered by the prefix match that starts at `patternIndex`. Because
// of the reversed order, a smaller `patternIndex` that lands on the same slot
// overwrites the value written earlier.
for patternIndex in (1 ..< patternLength).reversed() {
  textIndex = patternIndex + zeta![patternIndex] - 1
  suffixPrefix[textIndex] = zeta![patternIndex]
}
extension String {
  /// Finds every occurrence of `ptnr` using the Knuth-Morris-Pratt algorithm.
  ///
  /// The shift table is derived from the Z-array of the pattern (computed by
  /// `ZetaAlgorithm(ptrn:)`), after which the text is scanned left to right.
  ///
  /// - Parameter ptnr: The pattern to search for.
  /// - Returns: The character offsets at which the pattern starts, or `nil`
  ///   when the pattern is empty or never occurs.
  func indexOf(ptnr: String) -> [Int]? {
    let text = Array(self)
    let pattern = Array(ptnr)

    let textLength = text.count
    let patternLength = pattern.count

    /* Pre-processing stage: computing the table for the shifts (through Z-Algorithm) */
    // ZetaAlgorithm only returns nil for an empty pattern, which is rejected
    // here as well, so binding the result replaces the forced unwraps.
    guard patternLength > 0, let zeta = ZetaAlgorithm(ptrn: ptnr) else {
      return nil
    }

    // Store each Z value at the last position its prefix match covers.
    // Going right to left lets a smaller start overwrite a larger one that
    // lands on the same slot.
    var suffixPrefix = [Int](repeating: 0, count: patternLength)
    for start in (1 ..< patternLength).reversed() {
      let lastCovered = start + zeta[start] - 1
      suffixPrefix[lastCovered] = zeta[start]
    }

    /* Search stage: scanning the text for pattern matching */
    var textIndex = 0
    var patternIndex = 0
    var indexes = [Int]()

    // Continue while the rest of the pattern still fits in the remaining text.
    while textIndex + (patternLength - patternIndex - 1) < textLength {

      while patternIndex < patternLength && text[textIndex] == pattern[patternIndex] {
        textIndex += 1
        patternIndex += 1
      }

      if patternIndex == patternLength {
        indexes.append(textIndex - patternIndex)
      }

      if patternIndex == 0 {
        // Mismatch on the very first pattern character: move on in the text.
        textIndex += 1
      } else {
        // Reuse the part of the pattern that is already known to match.
        patternIndex = suffixPrefix[patternIndex - 1]
      }
    }

    return indexes.isEmpty ? nil : indexes
  }
}

Rabin-Karp(Rabin-Karp)

首先哈希 匹配模式,然后就和暴力差不多,按序比较散列值,可以借助前一个散列值快速计算当前散列值。如果散列值相等,再做暴力验证,防止假阳性。

/// Searches `text` for `pattern` using the Rabin-Karp rolling-hash algorithm.
///
/// The hash of the pattern is compared with the hash of each window of the
/// text; a window is only compared element by element when the hashes agree,
/// which rules out false positives from hash collisions.
///
/// Relies on `Character.asInt`, `hash(array:)` and
/// `nextHash(prevHash:dropped:added:patternSize:)`, which are defined
/// elsewhere.
///
/// - Parameters:
///   - text: The text to search in.
///   - pattern: The pattern to look for.
/// - Returns: The offset of the first match, or `-1` when there is none
///   (including when `pattern` is empty or longer than `text`).
public func search(text: String, pattern: String) -> Int {
  // convert to array of ints
  let patternArray = pattern.compactMap { $0.asInt }
  let textArray = text.compactMap { $0.asInt }

  // An empty pattern has no window to hash, and a pattern longer than the
  // text cannot match.
  guard !patternArray.isEmpty, textArray.count >= patternArray.count else {
    return -1
  }

  let patternHash = hash(array: patternArray)
  var endIdx = patternArray.count - 1
  let firstChars = Array(textArray[0...endIdx])
  let firstHash = hash(array: firstChars)

  // Verify this was not a hash collision
  if patternHash == firstHash, firstChars == patternArray {
    return 0
  }

  // When text and pattern have the same length there is no further window,
  // and `1...0` would be an invalid range.
  let lastStart = textArray.count - patternArray.count
  guard lastStart >= 1 else {
    return -1
  }

  var prevHash = firstHash
  // Now slide the window across the text to be searched
  for idx in 1...lastStart {
    endIdx = idx + (patternArray.count - 1)
    let windowHash = nextHash(
      prevHash: prevHash,
      dropped: textArray[idx - 1],
      added: textArray[endIdx],
      patternSize: patternArray.count - 1
    )

    // Only materialize and compare the window when the hashes agree.
    if windowHash == patternHash, Array(textArray[idx...endIdx]) == patternArray {
      return idx
    }

    prevHash = windowHash
  }

  return -1
}

最长公共子序列(Longest Common Subsequence)

首先,注意最长公共子序列与最长公共子串的区别(子序列不需要字符连续)。

动态规划的方法

/// Builds the dynamic-programming table of longest-common-subsequence lengths
/// between the receiver and `other`.
///
/// Meant to live inside a `String` extension (it reads `self`).
///
/// - Parameter other: The string to compare the receiver with.
/// - Returns: A `(self.count + 1) x (other.count + 1)` matrix in which
///   `matrix[i][j]` is the LCS length of the first `i` characters of the
///   receiver and the first `j` characters of `other`.
func lcsLength(_ other: String) -> [[Int]] {
  // Row 0 and column 0 stand for the empty prefix and stay at zero.
  let emptyRow = [Int](repeating: 0, count: other.count + 1)
  var matrix = [[Int]](repeating: emptyRow, count: self.count + 1)

  for (i, selfChar) in self.enumerated() {
    for (j, otherChar) in other.enumerated() {
      if otherChar == selfChar {
        // Common char found, add 1 to the highest LCS found so far.
        matrix[i + 1][j + 1] = matrix[i][j] + 1
      } else {
        // Not a match, propagate the highest LCS length found so far.
        matrix[i + 1][j + 1] = max(matrix[i][j + 1], matrix[i + 1][j])
      }
    }
  }
  return matrix
}

回溯找到实际的子序列

/// Walks the LCS length table from its bottom-right corner back to the origin
/// and collects the characters that make up the common subsequence.
///
/// Meant to be nested where both `self` and `other` (the second string) are
/// in scope, since it reads both.
///
/// - Parameter matrix: The table produced by `lcsLength(_:)`.
/// - Returns: The longest common subsequence that the walk recovers.
func backtrack(_ matrix: [[Int]]) -> String {
  var i = self.count
  var j = other.count

  // Tracks the position in `self` that corresponds to row `i`.
  var charInSequence = self.endIndex

  var lcs = String()

  while i >= 1 && j >= 1 {
    // Indicates propagation without change: no new char was added to lcs.
    if matrix[i][j] == matrix[i][j - 1] {
      j -= 1
    }
    // Indicates propagation without change: no new char was added to lcs.
    else if matrix[i][j] == matrix[i - 1][j] {
      i -= 1
      charInSequence = self.index(before: charInSequence)
    }
    // Value on the left and above are different than current cell.
    // This means 1 was added to lcs length.
    else {
      i -= 1
      j -= 1
      charInSequence = self.index(before: charInSequence)
      lcs.append(self[charInSequence])
    }
  }

  // Characters were collected from the end, so flip them back.
  return String(lcs.reversed())
}

合在一起

extension String {
  /// Returns a longest common subsequence of the receiver and `other`.
  ///
  /// A subsequence keeps the relative order of characters but does not need
  /// them to be contiguous. The result is built with dynamic programming: a
  /// table of LCS lengths is filled first, then walked backwards to recover
  /// the characters.
  ///
  /// - Parameter other: The string to compare the receiver with.
  /// - Returns: A longest common subsequence, or `""` when the strings share
  ///   no characters.
  public func longestCommonSubsequence(_ other: String) -> String {

    // Fills the table: matrix[i][j] is the LCS length of the first `i`
    // characters of `self` and the first `j` characters of `other`.
    func lcsLength(_ other: String) -> [[Int]] {
      let emptyRow = [Int](repeating: 0, count: other.count + 1)
      var matrix = [[Int]](repeating: emptyRow, count: self.count + 1)

      for (i, selfChar) in self.enumerated() {
        for (j, otherChar) in other.enumerated() {
          if otherChar == selfChar {
            // Common char found, add 1 to the highest LCS found so far.
            matrix[i + 1][j + 1] = matrix[i][j] + 1
          } else {
            // Not a match, propagate the highest LCS length found so far.
            matrix[i + 1][j + 1] = max(matrix[i][j + 1], matrix[i + 1][j])
          }
        }
      }
      return matrix
    }

    // Walks the table from the bottom-right corner and collects the
    // characters of the subsequence in reverse order.
    func backtrack(_ matrix: [[Int]]) -> String {
      var i = self.count
      var j = other.count

      // Tracks the position in `self` that corresponds to row `i`.
      var charInSequence = self.endIndex

      var lcs = String()

      while i >= 1 && j >= 1 {
        if matrix[i][j] == matrix[i][j - 1] {
          // Propagated from the left: no new char was added to lcs.
          j -= 1
        } else if matrix[i][j] == matrix[i - 1][j] {
          // Propagated from above: no new char was added to lcs.
          i -= 1
          charInSequence = self.index(before: charInSequence)
        } else {
          // Left and above both differ from this cell, so 1 was added here.
          i -= 1
          j -= 1
          charInSequence = self.index(before: charInSequence)
          lcs.append(self[charInSequence])
        }
      }

      return String(lcs.reversed())
    }

    return backtrack(lcsLength(other))
  }
}
let a = "ABCBX"
let b = "ABDCAB"
a.longestCommonSubsequence(b)  //  "ABCB"

let c = "KLMK"
a.longestCommonSubsequence(c)   // "" (no common subsequence)

"Hello world".longestCommonSubsequence("Bonjour le monde")  // "oorld"

Z-算法(Z-Algorithm)

记字符串s的长度为n。

Z算法需要维护一对值,记为left和right,简记为L和R。L和R满足s[L,R]为s串的前缀。当i为1的时候,暴力比较s[0,n-1]与s[1,n-1]可得此时的L与R,同时也得到了z[1],即suffix(1)与s本身的LCP。

假设计算至i-1,我们已经得到了当前的L与R,同时也得到了z[1]到z[i-1]的值,现在需要计算z[i]与新的L和R。

1.假设i>R,则说明不存在一个结束于i或者i之后的串,同时这个串本身也为s的一个前缀,否则R不应该小于i。对于这种情况,需要重新计算新的L与R,令L=R=i,暴力比较s与suffix(i),得到z[i]=R-i+1=R-L+1。

2.此时i<=R,令k=i-L,可以断言z[i]>=min(z[k],R-i+1)。因为根据L与R的含义,此时我们可以将L到R视作为字符串的前缀,那么i相对于L的偏移量为k。

如果z[k]<R-i+1,则z[i]必然等于z[k],基于此时,s[k,k+z[k]-1]是s[i,R]的一个前缀,同时在这种情况下L与R不变。

如果z[k]>=R-i+1,根据R的含义可知s[R+1]!=s[R-L+1],z[k]中大于R-i+1的匹配信息因为s[R+1]!=s[R-L+1]而无效,但这并不意味着s[R+1]!=s[R-i+1],此时根据z[k]可以断言z[i]至少是R-i+1,是否可以更大需要再进行计算,令L=i,更新R值,并得到此时的z[i]。

Z-算法作为模式预处理器,计算Z数组

/// Computes the Z-array of `ptrn`.
///
/// For `k >= 1`, `zeta[k]` is the length of the longest run of characters
/// starting at position `k` that is also a prefix of the pattern. `zeta[0]`
/// is never written and stays 0.
///
/// - Parameter ptrn: The string to pre-process.
/// - Returns: The Z-array (one entry per character), or `nil` when `ptrn` is
///   empty.
func ZetaAlgorithm(ptrn: String) -> [Int]? {
  let pattern = Array(ptrn)
  let patternLength = pattern.count

  guard patternLength > 0 else {
    return nil
  }

  var zeta = [Int](repeating: 0, count: patternLength)

  // `left ... right` is the most recently recorded Z-box: a stretch of the
  // pattern known to equal a prefix of the pattern.
  var left = 0
  var right = 0

  for k in 1 ..< patternLength {
    if k > right { // Outside a Z-box: compare the characters until mismatch
      var matched = 0
      while k + matched < patternLength && pattern[k + matched] == pattern[matched] {
        matched += 1
      }

      zeta[k] = matched

      if matched > 0 {
        left = k
        right = k + matched - 1
      }
    } else { // Inside a Z-box
      // Position `k` mirrors position `k - left` of the prefix.
      let mirrored = zeta[k - left]
      let betaLength = right - k + 1

      if mirrored < betaLength { // Entirely inside a Z-box: we can use the values computed before
        zeta[k] = mirrored
      } else { // Not entirely inside a Z-box: we must proceed with comparisons too
        // The first `betaLength` characters are already known to match, so
        // continue comparing right after the current box.
        var prefixIndex = betaLength
        var scanIndex = right + 1

        while scanIndex < patternLength && pattern[prefixIndex] == pattern[scanIndex] {
          prefixIndex += 1
          scanIndex += 1
        }

        zeta[k] = scanIndex - k
        left = k
        right = scanIndex - 1
      }
    }
  }
  return zeta
}

Z-算法作为字符串查找算法

extension String {
  /// Finds every occurrence of `pattern` by running the Z-algorithm on the
  /// concatenation `pattern + "💲" + self`.
  ///
  /// - Parameter pattern: The pattern to search for.
  /// - Returns: The character offsets in the receiver at which the pattern
  ///   starts, or `nil` when the pattern is empty or never occurs.
  func indexsOf(pattern: String) -> [Int]? {
    let patternLength = pattern.count

    /* Let's calculate the Z-Algorithm on the concatenation of pattern and text */
    // An empty pattern would make every zero entry look like a match.
    guard patternLength > 0,
          let zeta = ZetaAlgorithm(ptrn: pattern + "💲" + self) else {
      return nil
    }

    // Slots 0 ... patternLength belong to the pattern and the separator; the
    // text starts right after them.
    let firstTextSlot = patternLength + 1
    guard zeta.count > firstTextSlot else {
      return nil
    }

    var indexes = [Int]()

    /* Scan the zeta array to find matched patterns */
    // `>=` rather than `==`: if the text itself contains the separator, the
    // prefix match can run past the pattern and still be a valid hit.
    for i in firstTextSlot ..< zeta.count where zeta[i] >= patternLength {
      indexes.append(i - firstTextSlot)
    }

    return indexes.isEmpty ? nil : indexes
  }
}