tokenize_wasm.go raw

   1  //go:build js && wasm
   2  
   3  package database
   4  
   5  import (
   6  	"crypto/sha256"
   7  	"strings"
   8  	"unicode"
   9  	"unicode/utf8"
  10  )
  11  
  12  // stopWords contains common English function words that are too frequent to be
  13  // useful in search results. Filtering these reduces index size and prevents
  14  // high-fanout Word nodes that slow down queries.
  15  var stopWords = map[string]struct{}{
  16  	"an": {}, "as": {}, "at": {}, "be": {}, "by": {},
  17  	"do": {}, "he": {}, "if": {}, "in": {}, "is": {},
  18  	"it": {}, "me": {}, "my": {}, "no": {}, "of": {},
  19  	"on": {}, "or": {}, "so": {}, "to": {}, "up": {},
  20  	"us": {}, "we": {},
  21  	"and": {}, "are": {}, "but": {}, "for": {}, "had": {},
  22  	"has": {}, "her": {}, "him": {}, "his": {}, "how": {},
  23  	"its": {}, "not": {}, "our": {}, "the": {}, "too": {},
  24  	"was": {}, "who": {}, "you": {},
  25  	"all": {}, "can": {}, "did": {}, "got": {}, "she": {},
  26  	"been": {}, "does": {}, "each": {}, "from": {},
  27  	"have": {}, "just": {}, "more": {}, "much": {},
  28  	"must": {}, "only": {}, "some": {}, "such": {}, "than": {},
  29  	"that": {}, "them": {}, "then": {}, "they": {}, "this": {},
  30  	"very": {}, "were": {}, "what": {}, "when": {}, "will": {},
  31  	"with": {}, "your": {},
  32  	"about": {}, "could": {}, "other": {}, "their": {}, "there": {},
  33  	"these": {}, "those": {}, "which": {}, "would": {},
  34  }
  35  
  36  // TokenWords extracts unique word tokens from content, returning both the
  37  // normalized word text and its 8-byte truncated SHA-256 hash.
  38  // Rules:
  39  // - Unicode-aware: words are sequences of letters or numbers.
  40  // - Lowercased using unicode case mapping.
  41  // - Ignore URLs (starting with http://, https://, www., or containing "://").
  42  // - Ignore nostr: URIs and #[n] mentions.
  43  // - Ignore words shorter than 2 runes.
  44  // - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
  45  func TokenWords(content []byte) []WordToken {
  46  	s := string(content)
  47  	var out []WordToken
  48  	seen := make(map[string]struct{})
  49  
  50  	i := 0
  51  	for i < len(s) {
  52  		r, size := rune(s[i]), 1
  53  		if r >= 0x80 {
  54  			r, size = utf8.DecodeRuneInString(s[i:])
  55  		}
  56  
  57  		// Skip whitespace
  58  		if unicode.IsSpace(r) {
  59  			i += size
  60  			continue
  61  		}
  62  
  63  		// Skip URLs and schemes
  64  		if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
  65  			i = skipUntilSpace(s, i)
  66  			continue
  67  		}
  68  		// If token contains "://" ahead, treat as URL and skip to space
  69  		if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
  70  			// Only if it's at start of token
  71  			before := s[i : i+j]
  72  			if len(before) == 0 || allAlphaNum(before) {
  73  				i = skipUntilSpace(s, i)
  74  				continue
  75  			}
  76  		}
  77  		// Skip #[n] mentions
  78  		if r == '#' && i+size < len(s) && s[i+size] == '[' {
  79  			end := strings.IndexByte(s[i:], ']')
  80  			if end >= 0 {
  81  				i += end + 1
  82  				continue
  83  			}
  84  		}
  85  
  86  		// Collect a word
  87  		start := i
  88  		var runes []rune
  89  		for i < len(s) {
  90  			r2, size2 := rune(s[i]), 1
  91  			if r2 >= 0x80 {
  92  				r2, size2 = utf8.DecodeRuneInString(s[i:])
  93  			}
  94  			if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
  95  				// Normalize decorative unicode (small caps, fraktur) to ASCII
  96  				// before lowercasing for consistent indexing
  97  				runes = append(runes, unicode.ToLower(normalizeRune(r2)))
  98  				i += size2
  99  				continue
 100  			}
 101  			break
 102  		}
 103  		// If we didn't consume any rune for a word, advance by one rune to avoid stalling
 104  		if i == start {
 105  			_, size2 := utf8.DecodeRuneInString(s[i:])
 106  			i += size2
 107  			continue
 108  		}
 109  		if len(runes) >= 2 {
 110  			w := string(runes)
 111  			// Exclude 64-char hex strings
 112  			if isHex64(w) {
 113  				continue
 114  			}
 115  			// Exclude common stop words
 116  			if _, ok := stopWords[w]; ok {
 117  				continue
 118  			}
 119  			if _, ok := seen[w]; !ok {
 120  				seen[w] = struct{}{}
 121  				h := sha256.Sum256([]byte(w))
 122  				out = append(out, WordToken{Word: w, Hash: h[:8]})
 123  			}
 124  		}
 125  	}
 126  	return out
 127  }
 128  
 129  // TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
 130  // This is a convenience wrapper around TokenWords that returns only the hashes.
 131  func TokenHashes(content []byte) [][]byte {
 132  	tokens := TokenWords(content)
 133  	out := make([][]byte, len(tokens))
 134  	for i, t := range tokens {
 135  		out[i] = t.Hash
 136  	}
 137  	return out
 138  }
 139  
 140  func hasPrefixFold(s, prefix string) bool {
 141  	if len(s) < len(prefix) {
 142  		return false
 143  	}
 144  	for i := 0; i < len(prefix); i++ {
 145  		c := s[i]
 146  		p := prefix[i]
 147  		if c == p {
 148  			continue
 149  		}
 150  		// ASCII case-insensitive
 151  		if 'A' <= c && c <= 'Z' {
 152  			c = c - 'A' + 'a'
 153  		}
 154  		if 'A' <= p && p <= 'Z' {
 155  			p = p - 'A' + 'a'
 156  		}
 157  		if c != p {
 158  			return false
 159  		}
 160  	}
 161  	return true
 162  }
 163  
 164  func skipUntilSpace(s string, i int) int {
 165  	for i < len(s) {
 166  		r, size := rune(s[i]), 1
 167  		if r >= 0x80 {
 168  			r, size = utf8.DecodeRuneInString(s[i:])
 169  		}
 170  		if unicode.IsSpace(r) {
 171  			return i
 172  		}
 173  		i += size
 174  	}
 175  	return i
 176  }
 177  
 178  func allAlphaNum(s string) bool {
 179  	for _, r := range s {
 180  		if !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
 181  			return false
 182  		}
 183  	}
 184  	return true
 185  }
 186  
 187  func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }
 188  
 189  // isHex64 returns true if s is exactly 64 hex characters (0-9, a-f)
 190  func isHex64(s string) bool {
 191  	if len(s) != 64 {
 192  		return false
 193  	}
 194  	for i := 0; i < 64; i++ {
 195  		c := s[i]
 196  		if c >= '0' && c <= '9' {
 197  			continue
 198  		}
 199  		if c >= 'a' && c <= 'f' {
 200  			continue
 201  		}
 202  		if c >= 'A' && c <= 'F' {
 203  			continue
 204  		}
 205  		return false
 206  	}
 207  	return true
 208  }
 209