package fuzzy // WordIndex groups words by codepoint length for efficient fuzzy lookup. // DL distance ≤ maxDist requires |len(a) - len(b)| ≤ maxDist, so only // length buckets within maxDist of the query need to be scanned. // // Build: O(n) — single pass, no map, no GC pressure. // Query: O(candidates × DL_cost) where candidates = words in length buckets // ± maxDist. For English (avg len 8) at maxDist=2: ~100-200K candidates, // DL_cost=O(64) → ~50-100ms. Acceptable for interactive CLI use. type WordIndex struct { Words []string // all words, stable allocation Lengths []int // codepoint length of Words[i] Buckets [][]int // Buckets[L] = indices into Words with codepoint length L MaxLen int } // Match holds a single fuzzy match result. type Match struct { Word string Dist int } // Build constructs a WordIndex from a slice of words. func Build(words []string) *WordIndex { if len(words) == 0 { return &WordIndex{} } // Compute codepoint lengths and find max. lengths := []int{:len(words):len(words)} maxLen := 0 for i, w := range words { l := cpLen(w) lengths[i] = l if l > maxLen { maxLen = l } } buckets := [][]int{:maxLen+1} for i, l := range lengths { buckets[l] = append(buckets[l], i) } return &WordIndex{ Words: words, Lengths: lengths, Buckets: buckets, MaxLen: maxLen, } } // Search returns all words within maxDist DL edits of query, sorted by distance. func (idx *WordIndex) Search(query string, maxDist int) []Match { if idx == nil || len(idx.Words) == 0 { return nil } qLen := cpLen(query) lo := qLen - maxDist if lo < 0 { lo = 0 } hi := qLen + maxDist if hi > idx.MaxLen { hi = idx.MaxLen } var results []Match for l := lo; l <= hi; l++ { for _, wi := range idx.Buckets[l] { d := DL(query, idx.Words[wi]) if d <= maxDist { results = append(results, Match{idx.Words[wi], d}) } } } // Sort by distance ascending. for i := 1; i < len(results); i++ { key := results[i] j := i - 1 for j >= 0 && results[j].Dist > key.Dist { results[j+1] = results[j] j-- } results[j+1] = key } return results } // cpLen returns the number of Unicode codepoints in s. func cpLen(s string) int { n := 0 for i := 0; i < len(s); { b := s[i] switch { case b < 0x80: i++ case b < 0xE0: i += 2 case b < 0xF0: i += 3 default: i += 4 } if i > len(s) { i = len(s) } n++ } return n }