tokenize.go raw
1 //go:build !(js && wasm)
2
3 package database
4
5 import (
6 "strings"
7 "unicode"
8 "unicode/utf8"
9
10 sha "github.com/minio/sha256-simd"
11 )
12
// stopWords is the set of common English function words that are too frequent
// to be useful in search results. Filtering these reduces index size and
// prevents high-fanout Word nodes that slow down queries.
var stopWords = func() map[string]struct{} {
	list := []string{
		"an", "as", "at", "be", "by", "do", "he", "if", "in", "is",
		"it", "me", "my", "no", "of", "on", "or", "so", "to", "up",
		"us", "we",
		"and", "are", "but", "for", "had", "has", "her", "him", "his", "how",
		"its", "not", "our", "the", "too", "was", "who", "you", "all", "can",
		"did", "got", "she",
		"been", "does", "each", "from", "have", "just", "more", "much",
		"must", "only", "some", "such", "than", "that", "them", "then",
		"they", "this", "very", "were", "what", "when", "will", "with",
		"your",
		"about", "could", "other", "their", "there", "these", "those",
		"which", "would",
	}
	set := make(map[string]struct{}, len(list))
	for _, w := range list {
		set[w] = struct{}{}
	}
	return set
}()
36
37 // TokenWords extracts unique word tokens from content, returning both the
38 // normalized word text and its 8-byte truncated SHA-256 hash.
39 // Rules:
40 // - Unicode-aware: words are sequences of letters or numbers.
41 // - Lowercased using unicode case mapping.
42 // - Ignore URLs (starting with http://, https://, www., or containing "://").
43 // - Ignore nostr: URIs and #[n] mentions.
44 // - Ignore words shorter than 2 runes.
45 // - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
46 func TokenWords(content []byte) []WordToken {
47 s := string(content)
48 var out []WordToken
49 seen := make(map[string]struct{})
50
51 i := 0
52 for i < len(s) {
53 r, size := rune(s[i]), 1
54 if r >= 0x80 {
55 r, size = utf8.DecodeRuneInString(s[i:])
56 }
57
58 // Skip whitespace
59 if unicode.IsSpace(r) {
60 i += size
61 continue
62 }
63
64 // Skip URLs and schemes
65 if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
66 i = skipUntilSpace(s, i)
67 continue
68 }
69 // If token contains "://" ahead, treat as URL and skip to space
70 if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
71 // Only if it's at start of token
72 before := s[i : i+j]
73 if len(before) == 0 || allAlphaNum(before) {
74 i = skipUntilSpace(s, i)
75 continue
76 }
77 }
78 // Skip #[n] mentions
79 if r == '#' && i+size < len(s) && s[i+size] == '[' {
80 end := strings.IndexByte(s[i:], ']')
81 if end >= 0 {
82 i += end + 1
83 continue
84 }
85 }
86
87 // Collect a word
88 start := i
89 var runes []rune
90 for i < len(s) {
91 r2, size2 := rune(s[i]), 1
92 if r2 >= 0x80 {
93 r2, size2 = utf8.DecodeRuneInString(s[i:])
94 }
95 if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
96 // Normalize decorative unicode (small caps, fraktur) to ASCII
97 // before lowercasing for consistent indexing
98 runes = append(runes, unicode.ToLower(normalizeRune(r2)))
99 i += size2
100 continue
101 }
102 break
103 }
104 // If we didn't consume any rune for a word, advance by one rune to avoid stalling
105 if i == start {
106 _, size2 := utf8.DecodeRuneInString(s[i:])
107 i += size2
108 continue
109 }
110 if len(runes) >= 2 {
111 w := string(runes)
112 // Exclude 64-char hex strings
113 if isHex64(w) {
114 continue
115 }
116 // Exclude common stop words
117 if _, ok := stopWords[w]; ok {
118 continue
119 }
120 if _, ok := seen[w]; !ok {
121 seen[w] = struct{}{}
122 h := sha.Sum256([]byte(w))
123 out = append(out, WordToken{Word: w, Hash: h[:8]})
124 }
125 }
126 }
127 return out
128 }
129
130 // TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
131 // This is a convenience wrapper around TokenWords that returns only the hashes.
132 func TokenHashes(content []byte) [][]byte {
133 tokens := TokenWords(content)
134 out := make([][]byte, len(tokens))
135 for i, t := range tokens {
136 out[i] = t.Hash
137 }
138 return out
139 }
140
// hasPrefixFold reports whether s begins with prefix, comparing ASCII
// letters case-insensitively. Non-ASCII bytes are compared verbatim
// (deliberately narrower than strings.EqualFold, which applies full
// Unicode folding).
func hasPrefixFold(s, prefix string) bool {
	if len(s) < len(prefix) {
		return false
	}
	lower := func(b byte) byte {
		if 'A' <= b && b <= 'Z' {
			return b + ('a' - 'A')
		}
		return b
	}
	for k := 0; k < len(prefix); k++ {
		if lower(s[k]) != lower(prefix[k]) {
			return false
		}
	}
	return true
}
164
// skipUntilSpace returns the byte index of the first whitespace rune at or
// after i, or len(s) if none remains. Invalid UTF-8 bytes decode as
// utf8.RuneError (size 1), which is not whitespace, so they are stepped over.
func skipUntilSpace(s string, i int) int {
	if j := strings.IndexFunc(s[i:], unicode.IsSpace); j >= 0 {
		return i + j
	}
	return len(s)
}
178
// allAlphaNum reports whether every rune in s is a Unicode letter or number.
// The empty string vacuously qualifies.
func allAlphaNum(s string) bool {
	nonAlphaNum := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}
	return strings.IndexFunc(s, nonAlphaNum) < 0
}
187
188 func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }
189
// isHex64 reports whether s is exactly 64 hexadecimal characters
// (0-9, a-f, A-F) — the shape of a hex-encoded 32-byte ID or pubkey.
func isHex64(s string) bool {
	if len(s) != 64 {
		return false
	}
	for k := 0; k < len(s); k++ {
		switch c := s[k]; {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		case 'A' <= c && c <= 'F':
		default:
			return false
		}
	}
	return true
}
210