bktree.mx raw

   1  package fuzzy
   2  
   3  // WordIndex groups words by codepoint length for efficient fuzzy lookup.
   4  // DL distance ≤ maxDist requires |len(a) - len(b)| ≤ maxDist, so only
   5  // length buckets within maxDist of the query need to be scanned.
   6  //
   7  // Build: O(n) — single pass, no map, no GC pressure.
   8  // Query: O(candidates × DL_cost) where candidates = words in length buckets
   9  // ± maxDist. For English (avg len 8) at maxDist=2: ~100-200K candidates,
  10  // DL_cost=O(64) → ~50-100ms. Acceptable for interactive CLI use.
  11  type WordIndex struct {
  12  	Words   []string   // all words, stable allocation
  13  	Lengths []int      // codepoint length of Words[i]
  14  	Buckets [][]int    // Buckets[L] = indices into Words with codepoint length L
  15  	MaxLen  int
  16  }
  17  
  18  // Match holds a single fuzzy match result.
  19  type Match struct {
  20  	Word string
  21  	Dist int
  22  }
  23  
  24  // Build constructs a WordIndex from a slice of words.
  25  func Build(words []string) *WordIndex {
  26  	if len(words) == 0 {
  27  		return &WordIndex{}
  28  	}
  29  
  30  	// Compute codepoint lengths and find max.
  31  	lengths := []int{:len(words):len(words)}
  32  	maxLen := 0
  33  	for i, w := range words {
  34  		l := cpLen(w)
  35  		lengths[i] = l
  36  		if l > maxLen {
  37  			maxLen = l
  38  		}
  39  	}
  40  
  41  	buckets := [][]int{:maxLen+1}
  42  	for i, l := range lengths {
  43  		buckets[l] = append(buckets[l], i)
  44  	}
  45  
  46  	return &WordIndex{
  47  		Words:   words,
  48  		Lengths: lengths,
  49  		Buckets: buckets,
  50  		MaxLen:  maxLen,
  51  	}
  52  }
  53  
  54  // Search returns all words within maxDist DL edits of query, sorted by distance.
  55  func (idx *WordIndex) Search(query string, maxDist int) []Match {
  56  	if idx == nil || len(idx.Words) == 0 {
  57  		return nil
  58  	}
  59  
  60  	qLen := cpLen(query)
  61  	lo := qLen - maxDist
  62  	if lo < 0 {
  63  		lo = 0
  64  	}
  65  	hi := qLen + maxDist
  66  	if hi > idx.MaxLen {
  67  		hi = idx.MaxLen
  68  	}
  69  
  70  	var results []Match
  71  	for l := lo; l <= hi; l++ {
  72  		for _, wi := range idx.Buckets[l] {
  73  			d := DL(query, idx.Words[wi])
  74  			if d <= maxDist {
  75  				results = append(results, Match{idx.Words[wi], d})
  76  			}
  77  		}
  78  	}
  79  
  80  	// Sort by distance ascending.
  81  	for i := 1; i < len(results); i++ {
  82  		key := results[i]
  83  		j := i - 1
  84  		for j >= 0 && results[j].Dist > key.Dist {
  85  			results[j+1] = results[j]
  86  			j--
  87  		}
  88  		results[j+1] = key
  89  	}
  90  	return results
  91  }
  92  
  93  // cpLen returns the number of Unicode codepoints in s.
  94  func cpLen(s string) int {
  95  	n := 0
  96  	for i := 0; i < len(s); {
  97  		b := s[i]
  98  		switch {
  99  		case b < 0x80:
 100  			i++
 101  		case b < 0xE0:
 102  			i += 2
 103  		case b < 0xF0:
 104  			i += 3
 105  		default:
 106  			i += 4
 107  		}
 108  		if i > len(s) {
 109  			i = len(s)
 110  		}
 111  		n++
 112  	}
 113  	return n
 114  }
 115