search.mx raw

   1  // Copyright 2012 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package bytes
   6  
   7  // stringFinder efficiently finds strings in a source text. It's implemented
   8  // using the Boyer-Moore string search algorithm:
   9  // https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
  10  // https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
  11  // document uses 1-based indexing)
  12  type stringFinder struct {
  13  	// pattern is the string that we are searching for in the text.
  14  	pattern []byte
  15  
  16  	// badCharSkip[b] contains the distance between the last byte of pattern
  17  	// and the rightmost occurrence of b in pattern. If b is not in pattern,
  18  	// badCharSkip[b] is len(pattern).
  19  	//
  20  	// Whenever a mismatch is found with byte b in the text, we can safely
  21  	// shift the matching frame at least badCharSkip[b] until the next time
  22  	// the matching char could be in alignment.
  23  	badCharSkip [256]int
  24  
  25  	// goodSuffixSkip[i] defines how far we can shift the matching frame given
  26  	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
  27  	// not. There are two cases to consider:
  28  	//
  29  	// 1. The matched suffix occurs elsewhere in pattern (with a different
  30  	// byte preceding it that we might possibly match). In this case, we can
  31  	// shift the matching frame to align with the next suffix chunk. For
  32  	// example, the pattern "mississi" has the suffix "issi" next occurring
  33  	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
  34  	// shift+len(suffix) == 3+4 == 7.
  35  	//
  36  	// 2. If the matched suffix does not occur elsewhere in pattern, then the
  37  	// matching frame may share part of its prefix with the end of the
  38  	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
  39  	// to shift the frame to align this portion of the prefix to the
  40  	// suffix. For example, in the pattern "abcxxxabc", when the first
  41  	// mismatch from the back is found to be in position 3, the matching
  42  	// suffix "xxabc" is not found elsewhere in the pattern. However, its
  43  	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
  44  	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
  45  	goodSuffixSkip []int
  46  }
  47  
  48  func makeStringFinder(pattern []byte) *stringFinder {
  49  	f := &stringFinder{
  50  		pattern:        pattern,
  51  		goodSuffixSkip: []int{:len(pattern)},
  52  	}
  53  	// last is the index of the last character in the pattern.
  54  	last := len(pattern) - 1
  55  
  56  	// Build bad character table.
  57  	// Bytes not in the pattern can skip one pattern's length.
  58  	for i := range f.badCharSkip {
  59  		f.badCharSkip[i] = len(pattern)
  60  	}
  61  	// The loop condition is < instead of <= so that the last byte does not
  62  	// have a zero distance to itself. Finding this byte out of place implies
  63  	// that it is not in the last position.
  64  	for i := 0; i < last; i++ {
  65  		f.badCharSkip[pattern[i]] = last - i
  66  	}
  67  
  68  	// Build good suffix table.
  69  	// First pass: set each value to the next index which starts a prefix of
  70  	// pattern.
  71  	lastPrefix := last
  72  	for i := last; i >= 0; i-- {
  73  		if HasPrefix(pattern, pattern[i+1:]) {
  74  			lastPrefix = i + 1
  75  		}
  76  		// lastPrefix is the shift, and (last-i) is len(suffix).
  77  		f.goodSuffixSkip[i] = lastPrefix + last - i
  78  	}
  79  	// Second pass: find repeats of pattern's suffix starting from the front.
  80  	for i := 0; i < last; i++ {
  81  		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
  82  		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
  83  			// (last-i) is the shift, and lenSuffix is len(suffix).
  84  			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
  85  		}
  86  	}
  87  
  88  	return f
  89  }
  90  
  91  func longestCommonSuffix(a, b []byte) (i int) {
  92  	for ; i < len(a) && i < len(b); i++ {
  93  		if a[len(a)-1-i] != b[len(b)-1-i] {
  94  			break
  95  		}
  96  	}
  97  	return
  98  }
  99  
 100  // next returns the index in text of the first occurrence of the pattern. If
 101  // the pattern is not found, it returns -1.
 102  func (f *stringFinder) next(text []byte) int {
 103  	i := len(f.pattern) - 1
 104  	for i < len(text) {
 105  		// Compare backwards from the end until the first unmatching character.
 106  		j := len(f.pattern) - 1
 107  		for j >= 0 && text[i] == f.pattern[j] {
 108  			i--
 109  			j--
 110  		}
 111  		if j < 0 {
 112  			return i + 1 // match
 113  		}
 114  		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
 115  	}
 116  	return -1
 117  }
 118