tokenize_wasm.go raw
1 //go:build js && wasm
2
3 package database
4
5 import (
6 "crypto/sha256"
7 "strings"
8 "unicode"
9 "unicode/utf8"
10 )
11
// stopWords contains common English function words that are too frequent to be
// useful in search results. Filtering these reduces index size and prevents
// high-fanout Word nodes that slow down queries.
var stopWords = func() map[string]struct{} {
	// Flat word list, grouped by length; folded into a set once at init.
	list := []string{
		"an", "as", "at", "be", "by", "do", "he", "if", "in", "is",
		"it", "me", "my", "no", "of", "on", "or", "so", "to", "up",
		"us", "we",
		"and", "are", "but", "for", "had", "has", "her", "him", "his",
		"how", "its", "not", "our", "the", "too", "was", "who", "you",
		"all", "can", "did", "got", "she",
		"been", "does", "each", "from", "have", "just", "more", "much",
		"must", "only", "some", "such", "than", "that", "them", "then",
		"they", "this", "very", "were", "what", "when", "will", "with",
		"your",
		"about", "could", "other", "their", "there", "these", "those",
		"which", "would",
	}
	set := make(map[string]struct{}, len(list))
	for _, w := range list {
		set[w] = struct{}{}
	}
	return set
}()
35
36 // TokenWords extracts unique word tokens from content, returning both the
37 // normalized word text and its 8-byte truncated SHA-256 hash.
38 // Rules:
39 // - Unicode-aware: words are sequences of letters or numbers.
40 // - Lowercased using unicode case mapping.
41 // - Ignore URLs (starting with http://, https://, www., or containing "://").
42 // - Ignore nostr: URIs and #[n] mentions.
43 // - Ignore words shorter than 2 runes.
44 // - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
45 func TokenWords(content []byte) []WordToken {
46 s := string(content)
47 var out []WordToken
48 seen := make(map[string]struct{})
49
50 i := 0
51 for i < len(s) {
52 r, size := rune(s[i]), 1
53 if r >= 0x80 {
54 r, size = utf8.DecodeRuneInString(s[i:])
55 }
56
57 // Skip whitespace
58 if unicode.IsSpace(r) {
59 i += size
60 continue
61 }
62
63 // Skip URLs and schemes
64 if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
65 i = skipUntilSpace(s, i)
66 continue
67 }
68 // If token contains "://" ahead, treat as URL and skip to space
69 if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
70 // Only if it's at start of token
71 before := s[i : i+j]
72 if len(before) == 0 || allAlphaNum(before) {
73 i = skipUntilSpace(s, i)
74 continue
75 }
76 }
77 // Skip #[n] mentions
78 if r == '#' && i+size < len(s) && s[i+size] == '[' {
79 end := strings.IndexByte(s[i:], ']')
80 if end >= 0 {
81 i += end + 1
82 continue
83 }
84 }
85
86 // Collect a word
87 start := i
88 var runes []rune
89 for i < len(s) {
90 r2, size2 := rune(s[i]), 1
91 if r2 >= 0x80 {
92 r2, size2 = utf8.DecodeRuneInString(s[i:])
93 }
94 if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
95 // Normalize decorative unicode (small caps, fraktur) to ASCII
96 // before lowercasing for consistent indexing
97 runes = append(runes, unicode.ToLower(normalizeRune(r2)))
98 i += size2
99 continue
100 }
101 break
102 }
103 // If we didn't consume any rune for a word, advance by one rune to avoid stalling
104 if i == start {
105 _, size2 := utf8.DecodeRuneInString(s[i:])
106 i += size2
107 continue
108 }
109 if len(runes) >= 2 {
110 w := string(runes)
111 // Exclude 64-char hex strings
112 if isHex64(w) {
113 continue
114 }
115 // Exclude common stop words
116 if _, ok := stopWords[w]; ok {
117 continue
118 }
119 if _, ok := seen[w]; !ok {
120 seen[w] = struct{}{}
121 h := sha256.Sum256([]byte(w))
122 out = append(out, WordToken{Word: w, Hash: h[:8]})
123 }
124 }
125 }
126 return out
127 }
128
129 // TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
130 // This is a convenience wrapper around TokenWords that returns only the hashes.
131 func TokenHashes(content []byte) [][]byte {
132 tokens := TokenWords(content)
133 out := make([][]byte, len(tokens))
134 for i, t := range tokens {
135 out[i] = t.Hash
136 }
137 return out
138 }
139
// hasPrefixFold reports whether s begins with prefix, comparing bytes
// case-insensitively for ASCII letters only (no Unicode folding).
func hasPrefixFold(s, prefix string) bool {
	if len(s) < len(prefix) {
		return false
	}
	for i := 0; i < len(prefix); i++ {
		a, b := s[i], prefix[i]
		// Fold ASCII uppercase to lowercase on both sides before comparing.
		if 'A' <= a && a <= 'Z' {
			a += 'a' - 'A'
		}
		if 'A' <= b && b <= 'Z' {
			b += 'a' - 'A'
		}
		if a != b {
			return false
		}
	}
	return true
}
163
// skipUntilSpace returns the byte index of the first Unicode whitespace rune
// at or after position i in s, or len(s) if there is none.
func skipUntilSpace(s string, i int) int {
	for i < len(s) {
		r, size := utf8.DecodeRuneInString(s[i:])
		if unicode.IsSpace(r) {
			break
		}
		i += size
	}
	return i
}
177
// allAlphaNum reports whether every rune in s is a Unicode letter or number.
// The empty string vacuously satisfies this.
func allAlphaNum(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) {
			continue
		}
		if unicode.IsNumber(r) {
			continue
		}
		return false
	}
	return true
}
186
// isWordStart reports whether r can begin a word token: any Unicode letter or number.
func isWordStart(r rune) bool { return unicode.IsNumber(r) || unicode.IsLetter(r) }
188
// isHex64 reports whether s is exactly 64 hexadecimal characters
// (0-9, a-f, A-F).
func isHex64(s string) bool {
	if len(s) != 64 {
		return false
	}
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		case 'A' <= c && c <= 'F':
		default:
			return false
		}
	}
	return true
}
209