// Package fuzzy provides edit distance and fuzzy string matching for // translation fallback when exact lattice lookups fail. package fuzzy // DL computes Damerau-Levenshtein distance between a and b. // Handles: insertions, deletions, substitutions, and adjacent transpositions. // Transposition adds 1 instead of 2 (standard Levenshtein), catching the // most common typo pattern ("teh"→"the", "goverment"→"government"). // // Returns edit distance as int. Returns MaxDist if either string is empty // and the other is not, or if distance exceeds MaxDist early (future opt). func DL(a, b string) int { la := runeLen(a) lb := runeLen(b) if la == 0 { return lb } if lb == 0 { return la } // Convert to rune slices for correct Unicode handling. ra := toRunes(a) rb := toRunes(b) // Standard DP matrix. d[i][j] = DL distance of ra[:i] vs rb[:j]. // Row optimization: only keep current and previous two rows. prev2 := []int{:lb + 1} prev1 := []int{:lb + 1} curr := []int{:lb + 1} for j := 0; j <= lb; j++ { prev2[j] = j } for i := 1; i <= la; i++ { curr[0] = i for j := 1; j <= lb; j++ { cost := 1 if ra[i-1] == rb[j-1] { cost = 0 } del := prev1[j] + 1 ins := curr[j-1] + 1 sub := prev1[j-1] + cost curr[j] = minInt(del, minInt(ins, sub)) // Transposition: check if ra[i-1]==rb[j-2] and ra[i-2]==rb[j-1]. if i > 1 && j > 1 && ra[i-1] == rb[j-2] && ra[i-2] == rb[j-1] { trans := prev2[j-2] + cost if trans < curr[j] { curr[j] = trans } } } // Rotate rows. prev2, prev1, curr = prev1, curr, prev2 } return prev1[lb] } func minInt(a, b int) int { if a < b { return a } return b } // runeLen returns the number of Unicode codepoints in s. func runeLen(s string) int { n := 0 for i := 0; i < len(s); { i += utf8Len(s[i]) n++ } return n } // toRunes converts s to a slice of rune values (codepoints). func toRunes(s string) []rune { out := []rune{:0:runeLen(s)} i := 0 for i < len(s) { r, size := decodeRune(s, i) out = append(out, r) i += size } return out } func utf8Len(b byte) int { switch { case b < 0x80: return 1 case b < 0xE0: return 2 case b < 0xF0: return 3 default: return 4 } } func decodeRune(s string, i int) (rune, int) { b := s[i] if b < 0x80 { return rune(b), 1 } if b < 0xE0 { if i+1 >= len(s) { return 0xFFFD, 1 } return rune(b&0x1F)<<6 | rune(s[i+1]&0x3F), 2 } if b < 0xF0 { if i+2 >= len(s) { return 0xFFFD, 1 } return rune(b&0x0F)<<12 | rune(s[i+1]&0x3F)<<6 | rune(s[i+2]&0x3F), 3 } if i+3 >= len(s) { return 0xFFFD, 1 } return rune(b&0x07)<<18 | rune(s[i+1]&0x3F)<<12 | rune(s[i+2]&0x3F)<<6 | rune(s[i+3]&0x3F), 4 }