字符串搜索
暴力搜索(Brute-Force String Search)
// Input:
let s = "Hello, World"
s.indexOf("World")
// Output:
<String.Index?> 7
// Input:
let animals = "🐶🐔🐷🐮🐱"
animals.indexOf("🐮")
// Output:
<String.Index?> 6
这是一个暴力实现
extension String {
    /// Returns the index of the first occurrence of `pattern` in this string,
    /// found by brute force: every position of the receiver is tried as a
    /// candidate start and the pattern is compared character by character.
    ///
    /// - Parameter pattern: The text to look for.
    /// - Returns: The index where the first match begins, or `nil` when the
    ///   pattern does not occur. An empty pattern matches at `startIndex` of
    ///   a non-empty string; on an empty string the result is always `nil`.
    func indexOf(_ pattern: String) -> String.Index? {
        for start in indices {
            var cursor = start
            var found = true
            for p in pattern.indices {
                // Running off the end of the receiver, or hitting a differing
                // character, rules out this candidate start.
                if cursor == endIndex || self[cursor] != pattern[p] {
                    found = false
                    break
                }
                cursor = index(after: cursor)
            }
            if found {
                return start
            }
        }
        return nil
    }
}
Boyer-Moore算法(Boyer-Moore)
举个栗子更直观
source string:  Hello, World
search pattern: World
                    ^
o 和 d 不匹配,但 o 确实出现在匹配模式中,我们跳过3个位置
source string:  Hello, World
search pattern:    World
                       ^
W 和 d 不匹配,但 W 确实出现在匹配模式中,我们跳过4个位置。
source string:  Hello, World
search pattern:        World
                           ^
这次两个字符串可能匹配,我们从后向前做暴力验证。
extension String {
    /// Returns the index of the first occurrence of `pattern` in this string
    /// using a Boyer-Moore style search (bad-character skip table only).
    ///
    /// - Parameter pattern: The text to look for.
    /// - Returns: The index where the first match begins, or `nil` when the
    ///   pattern is empty, longer than the receiver, or not present.
    func index(of pattern: String) -> Index? {
        // Cache the length of the search pattern because we're going to
        // use it a few times and it's expensive to calculate.
        let patternLength = pattern.count
        guard patternLength > 0, patternLength <= count else { return nil }

        // Make the skip table. This table determines how far we skip ahead
        // when a character from the pattern is found.
        var skipTable = [Character: Int]()
        for (i, c) in pattern.enumerated() {
            skipTable[c] = patternLength - i - 1
        }

        // This points at the last character in the pattern.
        let p = pattern.index(before: pattern.endIndex)
        let lastChar = pattern[p]

        // The pattern is scanned right-to-left, so skip ahead in the string by
        // the length of the pattern. (Minus 1 because startIndex already points
        // at the first character in the source string.)
        var i = index(startIndex, offsetBy: patternLength - 1)

        // Steps backwards through both strings until a character doesn't
        // match, or until the beginning of the pattern has been reached.
        // Returns the start of the match in the receiver, or nil.
        func backwards() -> Index? {
            var q = p
            var j = i
            while q > pattern.startIndex {
                j = index(before: j)
                // `q` is an index into `pattern`, so it must be moved by
                // `pattern`, not by the receiver.
                q = pattern.index(before: q)
                if self[j] != pattern[q] { return nil }
            }
            return j
        }

        // The main loop. Keep going until the end of the string is reached.
        while i < endIndex {
            let c = self[i]

            // Does the current character match the last character from the pattern?
            if c == lastChar {
                // There is a possible match. Do a brute-force search backwards.
                if let k = backwards() { return k }

                // If no match, we can only safely skip one character ahead.
                i = index(after: i)
            } else {
                // The characters are not equal, so skip ahead. The amount to skip is
                // determined by the skip table. If the character is not present in the
                // pattern, we can skip ahead by the full pattern length. However, if
                // the character *is* present in the pattern, there may be a match up
                // ahead and we can't skip as far.
                i = index(i, offsetBy: skipTable[c] ?? patternLength, limitedBy: endIndex) ?? endIndex
            }
        }
        return nil
    }
}
上面算法的一个变体是 Boyer-Moore-Horspool 算法
extension String {
    /// Returns the index of the first occurrence of `pattern` in this string
    /// using the Boyer-Moore-Horspool search.
    ///
    /// The skip table is built from every pattern character except the last
    /// one, so that after a failed verification the window can jump by the
    /// distance to the previous occurrence of the last character instead of
    /// by a single position.
    ///
    /// - Parameter pattern: The text to look for.
    /// - Returns: The index where the first match begins, or `nil` when the
    ///   pattern is empty, longer than the receiver, or not present.
    func index(of pattern: String) -> Index? {
        // Cache the length of the search pattern because we're going to
        // use it a few times and it's expensive to calculate.
        let patternLength = pattern.count
        guard patternLength > 0, patternLength <= count else { return nil }

        // Make the skip table. This table determines how far we skip ahead
        // when a character from the pattern is found. The last pattern
        // character is deliberately left out: including it would always
        // store 0 for it and reduce the post-mismatch jump to one position.
        var skipTable = [Character: Int]()
        for (i, c) in pattern.dropLast().enumerated() {
            skipTable[c] = patternLength - i - 1
        }

        // This points at the last character in the pattern.
        let p = pattern.index(before: pattern.endIndex)
        let lastChar = pattern[p]

        // The pattern is scanned right-to-left, so skip ahead in the string by
        // the length of the pattern. (Minus 1 because startIndex already points
        // at the first character in the source string.)
        var i = index(startIndex, offsetBy: patternLength - 1)

        // Steps backwards through both strings until a character doesn't
        // match, or until the beginning of the pattern has been reached.
        // Returns the start of the match in the receiver, or nil.
        func backwards() -> Index? {
            var q = p
            var j = i
            while q > pattern.startIndex {
                j = index(before: j)
                // `q` is an index into `pattern`, so it must be moved by
                // `pattern`, not by the receiver.
                q = pattern.index(before: q)
                if self[j] != pattern[q] { return nil }
            }
            return j
        }

        // The main loop. Keep going until the end of the string is reached.
        while i < endIndex {
            let c = self[i]

            // Does the current character match the last character from the pattern?
            if c == lastChar {
                // There is a possible match. Do a brute-force search backwards.
                if let k = backwards() { return k }

                // Always jump at least one character so the loop makes progress.
                let jumpOffset = max(skipTable[c] ?? patternLength, 1)
                i = index(i, offsetBy: jumpOffset, limitedBy: endIndex) ?? endIndex
            } else {
                // The characters are not equal, so skip ahead. The amount to skip is
                // determined by the skip table. If the character is not present in the
                // pattern, we can skip ahead by the full pattern length. However, if
                // the character *is* present in the pattern, there may be a match up
                // ahead and we can't skip as far.
                i = index(i, offsetBy: skipTable[c] ?? patternLength, limitedBy: endIndex) ?? endIndex
            }
        }
        return nil
    }
}
KMP算法(Knuth-Morris-Pratt)
该算法的思想和暴力搜索没有太大区别,只是在发生不匹配时,借助预处理得到的数组,进行更大的位移。
z[i] 和 suffixPrefix[i] 的映射关系
// Excerpt from the KMP pre-processing stage: converts the Z-array (`zeta`)
// into the `suffixPrefix` table. Positions are visited from the last one
// down to 1, so a value written for a smaller `patternIndex` overwrites
// one written earlier for a larger `patternIndex` at the same slot.
for patternIndex in (1 ..< patternLength).reversed() {
// Slot at which the Z-value of `patternIndex` is stored: the position
// `zeta[patternIndex] - 1` characters after `patternIndex`.
textIndex = patternIndex + zeta![patternIndex] - 1
suffixPrefix[textIndex] = zeta![patternIndex]
}
extension String {
    /// Finds every occurrence of `ptnr` in this string with the
    /// Knuth-Morris-Pratt algorithm.
    ///
    /// The shift table is derived from the Z-array of the pattern, computed
    /// by `ZetaAlgorithm(ptrn:)`.
    ///
    /// - Parameter ptnr: The pattern to look for.
    /// - Returns: The character offsets (0-based, counted in `Character`s)
    ///   at which a match starts, in ascending order, or `nil` when the
    ///   pattern is empty or does not occur.
    func indexOf(ptnr: String) -> [Int]? {
        let text = Array(self)
        let pattern = Array(ptnr)

        let textLength = text.count
        let patternLength = pattern.count

        /* Pre-processing stage: computing the table for the shifts (through Z-Algorithm) */
        guard patternLength > 0, let zeta = ZetaAlgorithm(ptrn: ptnr) else {
            return nil
        }

        var suffixPrefix = [Int](repeating: 0, count: patternLength)
        // Visit positions from right to left so that a value written for a
        // smaller position overwrites one written for a larger position.
        for position in (1 ..< patternLength).reversed() {
            let boxEnd = position + zeta[position] - 1
            suffixPrefix[boxEnd] = zeta[position]
        }

        /* Search stage: scanning the text for pattern matching */
        var indexes = [Int]()
        var textIndex = 0
        var patternIndex = 0

        // Continue while the part of the pattern still to be matched fits
        // into the remaining text.
        while textIndex + (patternLength - patternIndex - 1) < textLength {
            while patternIndex < patternLength && text[textIndex] == pattern[patternIndex] {
                textIndex += 1
                patternIndex += 1
            }

            if patternIndex == patternLength {
                indexes.append(textIndex - patternIndex)
            }

            if patternIndex == 0 {
                // Nothing matched at this position: move on by one character.
                textIndex += 1
            } else {
                // Resume from the pattern position stored in the table for
                // the part that has already matched.
                patternIndex = suffixPrefix[patternIndex - 1]
            }
        }

        return indexes.isEmpty ? nil : indexes
    }
}
Rabin-Karp(Rabin-Karp)
首先哈希 匹配模式,然后就和暴力差不多,按序比较散列值,可以借助前一个散列值快速计算当前散列值。如果散列值相等,再做暴力验证,防止假阳性。
/// Searches `text` for the first occurrence of `pattern` with the
/// Rabin-Karp rolling-hash algorithm.
///
/// Both strings are converted to integer arrays through `asInt`; the hash
/// of the pattern is compared against the hash of each window of the text,
/// and equal hashes are confirmed by comparing the actual contents.
///
/// - Parameters:
///   - text: The string to search in.
///   - pattern: The string to search for.
/// - Returns: The 0-based offset of the first match, or -1 when the pattern
///   is empty, longer than the text, or not present.
public func search(text: String, pattern: String) -> Int {
    // convert to array of ints
    let patternArray = pattern.compactMap { $0.asInt }
    let textArray = text.compactMap { $0.asInt }
    let patternSize = patternArray.count

    // An empty pattern would otherwise form the invalid range 0...(-1).
    guard patternSize > 0, textArray.count >= patternSize else {
        return -1
    }

    let patternHash = hash(array: patternArray)
    var endIdx = patternSize - 1
    let firstChars = Array(textArray[0...endIdx])
    let firstHash = hash(array: firstChars)

    // Verify the contents too, in case this was a hash collision.
    if patternHash == firstHash, firstChars == patternArray {
        return 0
    }

    // When text and pattern have the same length there is no further window,
    // and the range 1...0 below would trap.
    let lastStart = textArray.count - patternSize
    guard lastStart >= 1 else {
        return -1
    }

    var prevHash = firstHash
    // Now slide the window across the text to be searched
    for idx in 1...lastStart {
        endIdx = idx + (patternSize - 1)
        let windowHash = nextHash(
            prevHash: prevHash,
            dropped: textArray[idx - 1],
            added: textArray[endIdx],
            patternSize: patternSize - 1
        )

        // Materialize the window only when the hashes agree.
        if windowHash == patternHash, patternArray == Array(textArray[idx...endIdx]) {
            return idx
        }

        prevHash = windowHash
    }

    return -1
}
最长公共子序列(Longest Common Subsequence)
首先,注意,最长公共子序列 与 最长公共子串 的区别 (子序列不需要字符连续)
动态规划的方法
/// Builds the dynamic-programming table for the longest common subsequence
/// of the receiver and `other`.
///
/// Written to be nested inside a `String` method, since it reads `self`.
///
/// - Parameter other: The string to compare the receiver with.
/// - Returns: A `(self.count + 1) x (other.count + 1)` matrix in which
///   `matrix[i + 1][j + 1]` is the LCS length of the first `i + 1`
///   characters of the receiver and the first `j + 1` characters of `other`.
///   Row 0 and column 0 stay 0.
func lcsLength(_ other: String) -> [[Int]] {
    var matrix = [[Int]](
        repeating: [Int](repeating: 0, count: other.count + 1),
        count: self.count + 1
    )

    for (i, selfChar) in self.enumerated() {
        for (j, otherChar) in other.enumerated() {
            if otherChar == selfChar {
                // Common char found, add 1 to highest lcs found so far.
                matrix[i + 1][j + 1] = matrix[i][j] + 1
            } else {
                // Not a match, propagate highest lcs length found so far.
                matrix[i + 1][j + 1] = max(matrix[i][j + 1], matrix[i + 1][j])
            }
        }
    }

    return matrix
}
回溯找到实际的子序列
/// Walks the LCS length table from its bottom-right corner back towards the
/// origin and collects the characters of the longest common subsequence.
///
/// Written to be nested inside a `String` method that has a parameter named
/// `other`, since it reads both `self` and `other`.
///
/// - Parameter matrix: The table produced by `lcsLength(_:)` for the
///   receiver and `other`.
/// - Returns: The longest common subsequence, in original order.
func backtrack(_ matrix: [[Int]]) -> String {
    var i = self.count
    var j = other.count

    // Index into the receiver that is kept in step with row `i`.
    var charInSequence = self.endIndex

    var lcs = String()

    while i >= 1 && j >= 1 {
        // Indicates propagation without change: no new char was added to lcs.
        if matrix[i][j] == matrix[i][j - 1] {
            j -= 1
        }
        // Indicates propagation without change: no new char was added to lcs.
        else if matrix[i][j] == matrix[i - 1][j] {
            i -= 1
            charInSequence = self.index(before: charInSequence)
        }
        // Value on the left and above are different than current cell.
        // This means 1 was added to lcs length.
        else {
            i -= 1
            j -= 1
            charInSequence = self.index(before: charInSequence)
            lcs.append(self[charInSequence])
        }
    }

    // Characters were collected from the end, so restore the original order.
    return String(lcs.reversed())
}
合在一起
extension String {
/// Returns the longest common subsequence of the receiver and `other`.
///
/// Outline only: the two nested helpers are elided with `...` here.
/// `lcsLength` builds the length table and `backtrack` reads the
/// subsequence back out of it.
public func longestCommonSubsequence(_ other: String) -> String {
// Builds the table of LCS lengths (body elided).
func lcsLength(_ other: String) -> [[Int]] {
...
}
// Recovers the subsequence from the table (body elided).
func backtrack(_ matrix: [[Int]]) -> String {
...
}
return backtrack(lcsLength(other))
}
}
// Usage examples for longestCommonSubsequence(_:); the trailing comment on
// each call shows the expected result.
let a = "ABCBX"
let b = "ABDCAB"
a.longestCommonSubsequence(b) // "ABCB"
let c = "KLMK"
a.longestCommonSubsequence(c) // "" (no common subsequence)
"Hello world".longestCommonSubsequence("Bonjour le monde") // "oorld"
Z-算法(Z-Algorithm)
记字符串s的长度为n。
Z算法需要维护一对值,记为left和right,简记为L和R。L和R满足s[L,R]为s串的前缀。当i为1的时候,暴力比较s[0,n-1]与s[1,n-1]可得此时的L与R,同时也得到了z[1],即suffix(1)与s本身的LCP。
假设计算至i-1,我们已经得到了当前的L与R,同时也得到了z[1]到z[i-1]的值,现在需要计算z[i]与新的L和R。
1.假设i>R,则说明不存在一个结束于i或者i之后的串,同时这个串本身也为s的一个前缀,否则R不应该小于i。对于这种情况,需要重新计算新的L与R,令L=R=i,暴力比较s与suffix(i),得到z[i]=R-i+1=R-L+1。
2.此时i<=R,令k=i-L,可以断言z[i]>=min(z[k],R-i+1)。因为根据L与R的含义,此时我们可以将L到R视作为字符串的前缀,那么i相对于L的偏移量为k。
如果z[k]<R-i+1,则z[i]必然等于z[k],基于此时,s[k,k+z[k]-1]是s[i,R]的一个前缀,同时在这种情况下L与R不变。
如果z[k]>=R-i+1,根据R的含义可知s[R+1]!=s[R-L+1],z[k]中大于R-i+1的匹配信息因为s[R+1]!=s[R-L+1]而无效,但这并不意味着s[R+1]!=s[R-i+1],此时根据z[k]可以断言z[i]至少是R-i+1,是否可以更大需要再进行计算,令L=i,更新R值,并得到此时的z[i]。
Z-算法作为模式预处理器,计算Z数组
/// Computes the Z-array of `ptrn`.
///
/// For every position `k >= 1`, `zeta[k]` is the length of the longest
/// substring starting at `k` that is also a prefix of `ptrn`. Position 0 is
/// never written and stays 0.
///
/// - Parameter ptrn: The string to pre-process.
/// - Returns: The Z-array, one entry per `Character`, or `nil` when `ptrn`
///   is empty.
func ZetaAlgorithm(ptrn: String) -> [Int]? {
    let pattern = Array(ptrn)
    let patternLength = pattern.count

    guard patternLength > 0 else {
        return nil
    }

    var zeta = [Int](repeating: 0, count: patternLength)

    // Bounds of the current Z-box: pattern[left...right] equals a prefix
    // of the pattern.
    var left = 0
    var right = 0

    for k in 1 ..< patternLength {
        if k > right { // Outside a Z-box: compare the characters until mismatch
            var matched = 0
            while k + matched < patternLength && pattern[k + matched] == pattern[matched] {
                matched += 1
            }
            zeta[k] = matched

            if matched > 0 {
                left = k
                right = k + matched - 1
            }
        } else { // Inside a Z-box
            // Z-value of the position in the prefix that mirrors `k`.
            let mirrored = zeta[k - left]
            // Number of characters from `k` to the end of the Z-box.
            let betaLength = right - k + 1

            if mirrored < betaLength { // Entirely inside a Z-box: we can use the values computed before
                zeta[k] = mirrored
            } else { // Not entirely inside a Z-box: we must proceed with comparisons too
                var prefixIndex = betaLength
                var scanIndex = right + 1

                while scanIndex < patternLength && pattern[prefixIndex] == pattern[scanIndex] {
                    prefixIndex += 1
                    scanIndex += 1
                }

                zeta[k] = scanIndex - k
                left = k
                right = scanIndex - 1
            }
        }
    }
    return zeta
}
Z-算法作为字符串查找算法
extension String {
    /// Finds every occurrence of `pattern` in this string by running the
    /// Z-algorithm on `pattern + "💲" + self`.
    ///
    /// A position in the text part whose Z-value is at least the pattern
    /// length starts a copy of the pattern. Using `>=` rather than `==`
    /// keeps matches that are followed in the text by the separator itself.
    ///
    /// - Parameter pattern: The pattern to look for.
    /// - Returns: The character offsets (0-based, counted in `Character`s)
    ///   at which a match starts, in ascending order, or `nil` when the
    ///   pattern is empty or does not occur.
    func indexsOf(pattern: String) -> [Int]? {
        let patternLength = pattern.count

        // An empty pattern would make every Z-value "match" and produce
        // meaningless offsets (including -1), so it is rejected up front.
        guard patternLength > 0 else {
            return nil
        }

        /* Let's calculate the Z-Algorithm on the concatenation of pattern and text */
        guard let zeta = ZetaAlgorithm(ptrn: pattern + "💲" + self) else {
            return nil
        }

        // Offset of the first text character inside the concatenation:
        // the pattern plus the one-character separator.
        let textStart = patternLength + 1

        /* Scan the text part of the zeta array to find matched patterns */
        var indexes = [Int]()
        for i in zeta.indices.dropFirst(textStart) where zeta[i] >= patternLength {
            indexes.append(i - textStart)
        }

        return indexes.isEmpty ? nil : indexes
    }
}