//go:build !(js && wasm)

package database

import (
	"bytes"
	"strings"
	"testing"
)
11 func TestTokenWords_StopWords(t *testing.T) {
12 tokens := TokenWords([]byte("the quick brown fox and the lazy dog"))
13 words := make(map[string]bool)
14 for _, tok := range tokens {
15 words[tok.Word] = true
16 }
17
18 // "the" and "and" are stop words
19 if words["the"] {
20 t.Error("stop word 'the' should be filtered")
21 }
22 if words["and"] {
23 t.Error("stop word 'and' should be filtered")
24 }
25
26 // content words should remain
27 for _, w := range []string{"quick", "brown", "fox", "lazy", "dog"} {
28 if !words[w] {
29 t.Errorf("expected word %q to be present", w)
30 }
31 }
32
33 if len(tokens) != 5 {
34 t.Errorf("expected 5 tokens, got %d", len(tokens))
35 }
36 }
37
38 func TestTokenWords_AllStopWords(t *testing.T) {
39 // Input consisting entirely of stop words should produce zero tokens
40 tokens := TokenWords([]byte("the and for with from that this they"))
41 if len(tokens) != 0 {
42 t.Errorf("expected 0 tokens for all-stop-word input, got %d", len(tokens))
43 }
44 }
45
46 func TestTokenWords_EmptyContent(t *testing.T) {
47 tokens := TokenWords([]byte(""))
48 if len(tokens) != 0 {
49 t.Errorf("expected 0 tokens for empty content, got %d", len(tokens))
50 }
51
52 tokens = TokenWords(nil)
53 if len(tokens) != 0 {
54 t.Errorf("expected 0 tokens for nil content, got %d", len(tokens))
55 }
56 }
57
58 func TestTokenWords_OnlyURLs(t *testing.T) {
59 tokens := TokenWords([]byte("https://example.com http://test.org www.foo.bar"))
60 if len(tokens) != 0 {
61 t.Errorf("expected 0 tokens for URL-only content, got %d", len(tokens))
62 }
63 }
64
65 func TestTokenWords_NumbersIncluded(t *testing.T) {
66 tokens := TokenWords([]byte("42 1337 99"))
67 words := make(map[string]bool)
68 for _, tok := range tokens {
69 words[tok.Word] = true
70 }
71
72 // "42" and "99" are 2 chars, should be included
73 if !words["42"] {
74 t.Error("expected '42' to be tokenized")
75 }
76 if !words["99"] {
77 t.Error("expected '99' to be tokenized")
78 }
79 if !words["1337"] {
80 t.Error("expected '1337' to be tokenized")
81 }
82 }
83
84 func TestTokenWords_LongContent(t *testing.T) {
85 // Generate a large input to ensure no panic or infinite loop
86 var sb strings.Builder
87 for i := 0; i < 10000; i++ {
88 sb.WriteString("word ")
89 }
90 tokens := TokenWords([]byte(sb.String()))
91
92 // "word" is deduplicated so only 1 unique token
93 if len(tokens) != 1 {
94 t.Errorf("expected 1 unique token from repeated word, got %d", len(tokens))
95 }
96 if tokens[0].Word != "word" {
97 t.Errorf("expected 'word', got %q", tokens[0].Word)
98 }
99 }
100
101 func TestTokenWords_MixedUnicode(t *testing.T) {
102 // CJK characters are letters and should be tokenized if 2+ runes
103 // Single CJK chars are filtered by min-length
104 tokens := TokenWords([]byte("hello 世界 café"))
105 words := make(map[string]bool)
106 for _, tok := range tokens {
107 words[tok.Word] = true
108 }
109
110 if !words["hello"] {
111 t.Error("expected 'hello'")
112 }
113 // "世界" is 2 runes, both are letters — should be included
114 if !words["世界"] {
115 t.Error("expected '世界' (2 CJK characters)")
116 }
117 // "café" normalizes to lowercase
118 if !words["café"] {
119 t.Error("expected 'café'")
120 }
121 }
122
123 func TestTokenWords_Hex64Exclusion(t *testing.T) {
124 hex := "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789"
125 content := "before " + hex + " after"
126 tokens := TokenWords([]byte(content))
127 words := make(map[string]bool)
128 for _, tok := range tokens {
129 words[tok.Word] = true
130 }
131
132 if words[hex] {
133 t.Error("64-char hex string should be excluded")
134 }
135 if !words["before"] {
136 t.Error("expected 'before'")
137 }
138 if !words["after"] {
139 t.Error("expected 'after'")
140 }
141 }
142
143 func TestTokenWords_NostrURIExclusion(t *testing.T) {
144 tokens := TokenWords([]byte("check nostr:npub1abc123def456 now"))
145 words := make(map[string]bool)
146 for _, tok := range tokens {
147 words[tok.Word] = true
148 }
149
150 if !words["check"] {
151 t.Error("expected 'check'")
152 }
153 if !words["now"] {
154 t.Error("expected 'now'")
155 }
156 // Nothing from the nostr: URI should appear
157 if words["npub1abc123def456"] {
158 t.Error("nostr: URI content should be excluded")
159 }
160 }
161
162 func TestTokenWords_MentionExclusion(t *testing.T) {
163 tokens := TokenWords([]byte("see #[0] here #[12] done"))
164 words := make(map[string]bool)
165 for _, tok := range tokens {
166 words[tok.Word] = true
167 }
168
169 if !words["see"] {
170 t.Error("expected 'see'")
171 }
172 if !words["here"] {
173 t.Error("expected 'here'")
174 }
175 if !words["done"] {
176 t.Error("expected 'done'")
177 }
178 }
179
180 func TestTokenWords_URLSchemes(t *testing.T) {
181 // Various URL-like schemes with "://" should be skipped
182 tokens := TokenWords([]byte("visit ftp://files.example.com and wss://relay.damus.io"))
183 words := make(map[string]bool)
184 for _, tok := range tokens {
185 words[tok.Word] = true
186 }
187
188 if words["ftp"] || words["files"] || words["example"] {
189 t.Error("ftp:// URL content should be excluded")
190 }
191 if words["wss"] || words["relay"] || words["damus"] {
192 t.Error("wss:// URL content should be excluded")
193 }
194 // "visit" should survive; "and" is a stop word
195 if !words["visit"] {
196 t.Error("expected 'visit'")
197 }
198 }
199
200 func TestTokenWords_MinLengthBoundary(t *testing.T) {
201 // 1-rune words are filtered, 2-rune words are kept (unless stop words)
202 tokens := TokenWords([]byte("I go ok hi"))
203 words := make(map[string]bool)
204 for _, tok := range tokens {
205 words[tok.Word] = true
206 }
207
208 // "I" is 1 rune — filtered by length
209 if words["i"] {
210 t.Error("single-rune 'I' should be filtered by min-length")
211 }
212 // "go" is 2 runes but not a stop word — should be present
213 if !words["go"] {
214 t.Error("expected 'go' (2 runes, not a stop word)")
215 }
216 // "ok" is 2 runes, not a stop word
217 if !words["ok"] {
218 t.Error("expected 'ok'")
219 }
220 // "hi" is 2 runes, not a stop word
221 if !words["hi"] {
222 t.Error("expected 'hi'")
223 }
224 }
225
226 func TestTokenWords_Deduplication(t *testing.T) {
227 tokens := TokenWords([]byte("Bitcoin bitcoin BITCOIN"))
228 if len(tokens) != 1 {
229 t.Fatalf("expected 1 deduplicated token, got %d", len(tokens))
230 }
231 if tokens[0].Word != "bitcoin" {
232 t.Errorf("expected 'bitcoin', got %q", tokens[0].Word)
233 }
234 }
235
236 func TestTokenWords_WhitespaceVariants(t *testing.T) {
237 // Tabs, newlines, and other whitespace should split tokens
238 tokens := TokenWords([]byte("hello\tworld\nnew\rline"))
239 words := make(map[string]bool)
240 for _, tok := range tokens {
241 words[tok.Word] = true
242 }
243
244 for _, w := range []string{"hello", "world", "new", "line"} {
245 if !words[w] {
246 t.Errorf("expected word %q after whitespace splitting", w)
247 }
248 }
249 }
250
251 func TestTokenWords_PunctuationSplit(t *testing.T) {
252 // Punctuation should split words, not be included in them
253 tokens := TokenWords([]byte("hello, world! foo-bar baz.qux"))
254 words := make(map[string]bool)
255 for _, tok := range tokens {
256 words[tok.Word] = true
257 }
258
259 for _, w := range []string{"hello", "world", "foo", "bar", "baz", "qux"} {
260 if !words[w] {
261 t.Errorf("expected word %q after punctuation split", w)
262 }
263 }
264 }
265
266 func TestTokenHashes_MatchesTokenWords(t *testing.T) {
267 content := []byte("bitcoin lightning network relay")
268 tokens := TokenWords(content)
269 hashes := TokenHashes(content)
270
271 if len(tokens) != len(hashes) {
272 t.Fatalf("TokenWords returned %d, TokenHashes returned %d", len(tokens), len(hashes))
273 }
274
275 for i, tok := range tokens {
276 if !bytes.Equal(tok.Hash, hashes[i]) {
277 t.Errorf("hash mismatch at index %d: TokenWords=%x, TokenHashes=%x",
278 i, tok.Hash, hashes[i])
279 }
280 }
281 }
282
283 func TestTokenWords_StopWordsList(t *testing.T) {
284 // Verify a representative sample of stop words are actually filtered
285 samples := []string{
286 "the", "and", "for", "with", "from", "that", "this",
287 "they", "were", "will", "been", "have", "about",
288 "which", "would", "their", "there", "these", "those",
289 "is", "in", "of", "to", "or", "an", "as", "at",
290 }
291 for _, sw := range samples {
292 tokens := TokenWords([]byte(sw))
293 if len(tokens) != 0 {
294 t.Errorf("stop word %q should produce 0 tokens, got %d", sw, len(tokens))
295 }
296 }
297 }
298