1 package fuzzy
2 3 // WordIndex groups words by codepoint length for efficient fuzzy lookup.
4 // DL distance ≤ maxDist requires |len(a) - len(b)| ≤ maxDist, so only
5 // length buckets within maxDist of the query need to be scanned.
6 //
7 // Build: O(n) — single pass, no map, no GC pressure.
8 // Query: O(candidates × DL_cost) where candidates = words in length buckets
9 // ± maxDist. For English (avg len 8) at maxDist=2: ~100-200K candidates,
10 // DL_cost=O(64) → ~50-100ms. Acceptable for interactive CLI use.
11 type WordIndex struct {
12 Words []string // all words, stable allocation
13 Lengths []int // codepoint length of Words[i]
14 Buckets [][]int // Buckets[L] = indices into Words with codepoint length L
15 MaxLen int
16 }
17 18 // Match holds a single fuzzy match result.
19 type Match struct {
20 Word string
21 Dist int
22 }
23 24 // Build constructs a WordIndex from a slice of words.
25 func Build(words []string) *WordIndex {
26 if len(words) == 0 {
27 return &WordIndex{}
28 }
29 30 // Compute codepoint lengths and find max.
31 lengths := []int{:len(words):len(words)}
32 maxLen := 0
33 for i, w := range words {
34 l := cpLen(w)
35 lengths[i] = l
36 if l > maxLen {
37 maxLen = l
38 }
39 }
40 41 buckets := [][]int{:maxLen+1}
42 for i, l := range lengths {
43 buckets[l] = append(buckets[l], i)
44 }
45 46 return &WordIndex{
47 Words: words,
48 Lengths: lengths,
49 Buckets: buckets,
50 MaxLen: maxLen,
51 }
52 }
53 54 // Search returns all words within maxDist DL edits of query, sorted by distance.
55 func (idx *WordIndex) Search(query string, maxDist int) []Match {
56 if idx == nil || len(idx.Words) == 0 {
57 return nil
58 }
59 60 qLen := cpLen(query)
61 lo := qLen - maxDist
62 if lo < 0 {
63 lo = 0
64 }
65 hi := qLen + maxDist
66 if hi > idx.MaxLen {
67 hi = idx.MaxLen
68 }
69 70 var results []Match
71 for l := lo; l <= hi; l++ {
72 for _, wi := range idx.Buckets[l] {
73 d := DL(query, idx.Words[wi])
74 if d <= maxDist {
75 results = append(results, Match{idx.Words[wi], d})
76 }
77 }
78 }
79 80 // Sort by distance ascending.
81 for i := 1; i < len(results); i++ {
82 key := results[i]
83 j := i - 1
84 for j >= 0 && results[j].Dist > key.Dist {
85 results[j+1] = results[j]
86 j--
87 }
88 results[j+1] = key
89 }
90 return results
91 }
92 93 // cpLen returns the number of Unicode codepoints in s.
94 func cpLen(s string) int {
95 n := 0
96 for i := 0; i < len(s); {
97 b := s[i]
98 switch {
99 case b < 0x80:
100 i++
101 case b < 0xE0:
102 i += 2
103 case b < 0xF0:
104 i += 3
105 default:
106 i += 4
107 }
108 if i > len(s) {
109 i = len(s)
110 }
111 n++
112 }
113 return n
114 }
115