edit.mx raw

   1  // Package fuzzy provides edit distance and fuzzy string matching for
   2  // translation fallback when exact lattice lookups fail.
   3  package fuzzy
   4  
   5  // DL computes Damerau-Levenshtein distance between a and b.
   6  // Handles: insertions, deletions, substitutions, and adjacent transpositions.
   7  // Transposition adds 1 instead of 2 (standard Levenshtein), catching the
   8  // most common typo pattern ("teh"→"the", "goverment"→"government").
   9  //
  10  // Returns edit distance as int. Returns MaxDist if either string is empty
  11  // and the other is not, or if distance exceeds MaxDist early (future opt).
  12  func DL(a, b string) int {
  13  	la := runeLen(a)
  14  	lb := runeLen(b)
  15  	if la == 0 {
  16  		return lb
  17  	}
  18  	if lb == 0 {
  19  		return la
  20  	}
  21  
  22  	// Convert to rune slices for correct Unicode handling.
  23  	ra := toRunes(a)
  24  	rb := toRunes(b)
  25  
  26  	// Standard DP matrix. d[i][j] = DL distance of ra[:i] vs rb[:j].
  27  	// Row optimization: only keep current and previous two rows.
  28  	prev2 := []int{:lb + 1}
  29  	prev1 := []int{:lb + 1}
  30  	curr  := []int{:lb + 1}
  31  
  32  	for j := 0; j <= lb; j++ {
  33  		prev2[j] = j
  34  	}
  35  
  36  	for i := 1; i <= la; i++ {
  37  		curr[0] = i
  38  		for j := 1; j <= lb; j++ {
  39  			cost := 1
  40  			if ra[i-1] == rb[j-1] {
  41  				cost = 0
  42  			}
  43  			del := prev1[j] + 1
  44  			ins := curr[j-1] + 1
  45  			sub := prev1[j-1] + cost
  46  			curr[j] = minInt(del, minInt(ins, sub))
  47  
  48  			// Transposition: check if ra[i-1]==rb[j-2] and ra[i-2]==rb[j-1].
  49  			if i > 1 && j > 1 && ra[i-1] == rb[j-2] && ra[i-2] == rb[j-1] {
  50  				trans := prev2[j-2] + cost
  51  				if trans < curr[j] {
  52  					curr[j] = trans
  53  				}
  54  			}
  55  		}
  56  		// Rotate rows.
  57  		prev2, prev1, curr = prev1, curr, prev2
  58  	}
  59  	return prev1[lb]
  60  }
  61  
  62  func minInt(a, b int) int {
  63  	if a < b {
  64  		return a
  65  	}
  66  	return b
  67  }
  68  
  69  // runeLen returns the number of Unicode codepoints in s.
  70  func runeLen(s string) int {
  71  	n := 0
  72  	for i := 0; i < len(s); {
  73  		i += utf8Len(s[i])
  74  		n++
  75  	}
  76  	return n
  77  }
  78  
  79  // toRunes converts s to a slice of rune values (codepoints).
  80  func toRunes(s string) []rune {
  81  	out := []rune{:0:runeLen(s)}
  82  	i := 0
  83  	for i < len(s) {
  84  		r, size := decodeRune(s, i)
  85  		out = append(out, r)
  86  		i += size
  87  	}
  88  	return out
  89  }
  90  
  91  func utf8Len(b byte) int {
  92  	switch {
  93  	case b < 0x80:
  94  		return 1
  95  	case b < 0xE0:
  96  		return 2
  97  	case b < 0xF0:
  98  		return 3
  99  	default:
 100  		return 4
 101  	}
 102  }
 103  
 104  func decodeRune(s string, i int) (rune, int) {
 105  	b := s[i]
 106  	if b < 0x80 {
 107  		return rune(b), 1
 108  	}
 109  	if b < 0xE0 {
 110  		if i+1 >= len(s) {
 111  			return 0xFFFD, 1
 112  		}
 113  		return rune(b&0x1F)<<6 | rune(s[i+1]&0x3F), 2
 114  	}
 115  	if b < 0xF0 {
 116  		if i+2 >= len(s) {
 117  			return 0xFFFD, 1
 118  		}
 119  		return rune(b&0x0F)<<12 | rune(s[i+1]&0x3F)<<6 | rune(s[i+2]&0x3F), 3
 120  	}
 121  	if i+3 >= len(s) {
 122  		return 0xFFFD, 1
 123  	}
 124  	return rune(b&0x07)<<18 | rune(s[i+1]&0x3F)<<12 | rune(s[i+2]&0x3F)<<6 | rune(s[i+3]&0x3F), 4
 125  }
 126