edit.mx raw
1 // Package fuzzy provides edit distance and fuzzy string matching for
2 // translation fallback when exact lattice lookups fail.
3 package fuzzy
4
5 // DL computes Damerau-Levenshtein distance between a and b.
6 // Handles: insertions, deletions, substitutions, and adjacent transpositions.
7 // Transposition adds 1 instead of 2 (standard Levenshtein), catching the
8 // most common typo pattern ("teh"→"the", "goverment"→"government").
9 //
10 // Returns edit distance as int. Returns MaxDist if either string is empty
11 // and the other is not, or if distance exceeds MaxDist early (future opt).
12 func DL(a, b string) int {
13 la := runeLen(a)
14 lb := runeLen(b)
15 if la == 0 {
16 return lb
17 }
18 if lb == 0 {
19 return la
20 }
21
22 // Convert to rune slices for correct Unicode handling.
23 ra := toRunes(a)
24 rb := toRunes(b)
25
26 // Standard DP matrix. d[i][j] = DL distance of ra[:i] vs rb[:j].
27 // Row optimization: only keep current and previous two rows.
28 prev2 := []int{:lb + 1}
29 prev1 := []int{:lb + 1}
30 curr := []int{:lb + 1}
31
32 for j := 0; j <= lb; j++ {
33 prev2[j] = j
34 }
35
36 for i := 1; i <= la; i++ {
37 curr[0] = i
38 for j := 1; j <= lb; j++ {
39 cost := 1
40 if ra[i-1] == rb[j-1] {
41 cost = 0
42 }
43 del := prev1[j] + 1
44 ins := curr[j-1] + 1
45 sub := prev1[j-1] + cost
46 curr[j] = minInt(del, minInt(ins, sub))
47
48 // Transposition: check if ra[i-1]==rb[j-2] and ra[i-2]==rb[j-1].
49 if i > 1 && j > 1 && ra[i-1] == rb[j-2] && ra[i-2] == rb[j-1] {
50 trans := prev2[j-2] + cost
51 if trans < curr[j] {
52 curr[j] = trans
53 }
54 }
55 }
56 // Rotate rows.
57 prev2, prev1, curr = prev1, curr, prev2
58 }
59 return prev1[lb]
60 }
61
62 func minInt(a, b int) int {
63 if a < b {
64 return a
65 }
66 return b
67 }
68
69 // runeLen returns the number of Unicode codepoints in s.
70 func runeLen(s string) int {
71 n := 0
72 for i := 0; i < len(s); {
73 i += utf8Len(s[i])
74 n++
75 }
76 return n
77 }
78
79 // toRunes converts s to a slice of rune values (codepoints).
80 func toRunes(s string) []rune {
81 out := []rune{:0:runeLen(s)}
82 i := 0
83 for i < len(s) {
84 r, size := decodeRune(s, i)
85 out = append(out, r)
86 i += size
87 }
88 return out
89 }
90
91 func utf8Len(b byte) int {
92 switch {
93 case b < 0x80:
94 return 1
95 case b < 0xE0:
96 return 2
97 case b < 0xF0:
98 return 3
99 default:
100 return 4
101 }
102 }
103
104 func decodeRune(s string, i int) (rune, int) {
105 b := s[i]
106 if b < 0x80 {
107 return rune(b), 1
108 }
109 if b < 0xE0 {
110 if i+1 >= len(s) {
111 return 0xFFFD, 1
112 }
113 return rune(b&0x1F)<<6 | rune(s[i+1]&0x3F), 2
114 }
115 if b < 0xF0 {
116 if i+2 >= len(s) {
117 return 0xFFFD, 1
118 }
119 return rune(b&0x0F)<<12 | rune(s[i+1]&0x3F)<<6 | rune(s[i+2]&0x3F), 3
120 }
121 if i+3 >= len(s) {
122 return 0xFFFD, 1
123 }
124 return rune(b&0x07)<<18 | rune(s[i+1]&0x3F)<<12 | rune(s[i+2]&0x3F)<<6 | rune(s[i+3]&0x3F), 4
125 }
126