tokenize.go raw

   1  //go:build !(js && wasm)
   2  
   3  package database
   4  
   5  import (
   6  	"strings"
   7  	"unicode"
   8  	"unicode/utf8"
   9  
  10  	sha "github.com/minio/sha256-simd"
  11  )
  12  
  13  // stopWords contains common English function words that are too frequent to be
  14  // useful in search results. Filtering these reduces index size and prevents
  15  // high-fanout Word nodes that slow down queries.
  16  var stopWords = map[string]struct{}{
  17  	"an": {}, "as": {}, "at": {}, "be": {}, "by": {},
  18  	"do": {}, "he": {}, "if": {}, "in": {}, "is": {},
  19  	"it": {}, "me": {}, "my": {}, "no": {}, "of": {},
  20  	"on": {}, "or": {}, "so": {}, "to": {}, "up": {},
  21  	"us": {}, "we": {},
  22  	"and": {}, "are": {}, "but": {}, "for": {}, "had": {},
  23  	"has": {}, "her": {}, "him": {}, "his": {}, "how": {},
  24  	"its": {}, "not": {}, "our": {}, "the": {}, "too": {},
  25  	"was": {}, "who": {}, "you": {},
  26  	"all": {}, "can": {}, "did": {}, "got": {}, "she": {},
  27  	"been": {}, "does": {}, "each": {}, "from": {},
  28  	"have": {}, "just": {}, "more": {}, "much": {},
  29  	"must": {}, "only": {}, "some": {}, "such": {}, "than": {},
  30  	"that": {}, "them": {}, "then": {}, "they": {}, "this": {},
  31  	"very": {}, "were": {}, "what": {}, "when": {}, "will": {},
  32  	"with": {}, "your": {},
  33  	"about": {}, "could": {}, "other": {}, "their": {}, "there": {},
  34  	"these": {}, "those": {}, "which": {}, "would": {},
  35  }
  36  
  37  // TokenWords extracts unique word tokens from content, returning both the
  38  // normalized word text and its 8-byte truncated SHA-256 hash.
  39  // Rules:
  40  // - Unicode-aware: words are sequences of letters or numbers.
  41  // - Lowercased using unicode case mapping.
  42  // - Ignore URLs (starting with http://, https://, www., or containing "://").
  43  // - Ignore nostr: URIs and #[n] mentions.
  44  // - Ignore words shorter than 2 runes.
  45  // - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
  46  func TokenWords(content []byte) []WordToken {
  47  	s := string(content)
  48  	var out []WordToken
  49  	seen := make(map[string]struct{})
  50  
  51  	i := 0
  52  	for i < len(s) {
  53  		r, size := rune(s[i]), 1
  54  		if r >= 0x80 {
  55  			r, size = utf8.DecodeRuneInString(s[i:])
  56  		}
  57  
  58  		// Skip whitespace
  59  		if unicode.IsSpace(r) {
  60  			i += size
  61  			continue
  62  		}
  63  
  64  		// Skip URLs and schemes
  65  		if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
  66  			i = skipUntilSpace(s, i)
  67  			continue
  68  		}
  69  		// If token contains "://" ahead, treat as URL and skip to space
  70  		if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
  71  			// Only if it's at start of token
  72  			before := s[i : i+j]
  73  			if len(before) == 0 || allAlphaNum(before) {
  74  				i = skipUntilSpace(s, i)
  75  				continue
  76  			}
  77  		}
  78  		// Skip #[n] mentions
  79  		if r == '#' && i+size < len(s) && s[i+size] == '[' {
  80  			end := strings.IndexByte(s[i:], ']')
  81  			if end >= 0 {
  82  				i += end + 1
  83  				continue
  84  			}
  85  		}
  86  
  87  		// Collect a word
  88  		start := i
  89  		var runes []rune
  90  		for i < len(s) {
  91  			r2, size2 := rune(s[i]), 1
  92  			if r2 >= 0x80 {
  93  				r2, size2 = utf8.DecodeRuneInString(s[i:])
  94  			}
  95  			if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
  96  				// Normalize decorative unicode (small caps, fraktur) to ASCII
  97  				// before lowercasing for consistent indexing
  98  				runes = append(runes, unicode.ToLower(normalizeRune(r2)))
  99  				i += size2
 100  				continue
 101  			}
 102  			break
 103  		}
 104  		// If we didn't consume any rune for a word, advance by one rune to avoid stalling
 105  		if i == start {
 106  			_, size2 := utf8.DecodeRuneInString(s[i:])
 107  			i += size2
 108  			continue
 109  		}
 110  		if len(runes) >= 2 {
 111  			w := string(runes)
 112  			// Exclude 64-char hex strings
 113  			if isHex64(w) {
 114  				continue
 115  			}
 116  			// Exclude common stop words
 117  			if _, ok := stopWords[w]; ok {
 118  				continue
 119  			}
 120  			if _, ok := seen[w]; !ok {
 121  				seen[w] = struct{}{}
 122  				h := sha.Sum256([]byte(w))
 123  				out = append(out, WordToken{Word: w, Hash: h[:8]})
 124  			}
 125  		}
 126  	}
 127  	return out
 128  }
 129  
 130  // TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
 131  // This is a convenience wrapper around TokenWords that returns only the hashes.
 132  func TokenHashes(content []byte) [][]byte {
 133  	tokens := TokenWords(content)
 134  	out := make([][]byte, len(tokens))
 135  	for i, t := range tokens {
 136  		out[i] = t.Hash
 137  	}
 138  	return out
 139  }
 140  
 141  func hasPrefixFold(s, prefix string) bool {
 142  	if len(s) < len(prefix) {
 143  		return false
 144  	}
 145  	for i := 0; i < len(prefix); i++ {
 146  		c := s[i]
 147  		p := prefix[i]
 148  		if c == p {
 149  			continue
 150  		}
 151  		// ASCII case-insensitive
 152  		if 'A' <= c && c <= 'Z' {
 153  			c = c - 'A' + 'a'
 154  		}
 155  		if 'A' <= p && p <= 'Z' {
 156  			p = p - 'A' + 'a'
 157  		}
 158  		if c != p {
 159  			return false
 160  		}
 161  	}
 162  	return true
 163  }
 164  
 165  func skipUntilSpace(s string, i int) int {
 166  	for i < len(s) {
 167  		r, size := rune(s[i]), 1
 168  		if r >= 0x80 {
 169  			r, size = utf8.DecodeRuneInString(s[i:])
 170  		}
 171  		if unicode.IsSpace(r) {
 172  			return i
 173  		}
 174  		i += size
 175  	}
 176  	return i
 177  }
 178  
 179  func allAlphaNum(s string) bool {
 180  	for _, r := range s {
 181  		if !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
 182  			return false
 183  		}
 184  	}
 185  	return true
 186  }
 187  
 188  func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }
 189  
 190  // isHex64 returns true if s is exactly 64 hex characters (0-9, a-f)
 191  func isHex64(s string) bool {
 192  	if len(s) != 64 {
 193  		return false
 194  	}
 195  	for i := 0; i < 64; i++ {
 196  		c := s[i]
 197  		if c >= '0' && c <= '9' {
 198  			continue
 199  		}
 200  		if c >= 'a' && c <= 'f' {
 201  			continue
 202  		}
 203  		if c >= 'A' && c <= 'F' {
 204  			continue
 205  		}
 206  		return false
 207  	}
 208  	return true
 209  }
 210