// tokenize_test.go

   1  //go:build !(js && wasm)
   2  
   3  package database
   4  
   5  import (
   6  	"bytes"
   7  	"strings"
   8  	"testing"
   9  )
  10  
  11  func TestTokenWords_StopWords(t *testing.T) {
  12  	tokens := TokenWords([]byte("the quick brown fox and the lazy dog"))
  13  	words := make(map[string]bool)
  14  	for _, tok := range tokens {
  15  		words[tok.Word] = true
  16  	}
  17  
  18  	// "the" and "and" are stop words
  19  	if words["the"] {
  20  		t.Error("stop word 'the' should be filtered")
  21  	}
  22  	if words["and"] {
  23  		t.Error("stop word 'and' should be filtered")
  24  	}
  25  
  26  	// content words should remain
  27  	for _, w := range []string{"quick", "brown", "fox", "lazy", "dog"} {
  28  		if !words[w] {
  29  			t.Errorf("expected word %q to be present", w)
  30  		}
  31  	}
  32  
  33  	if len(tokens) != 5 {
  34  		t.Errorf("expected 5 tokens, got %d", len(tokens))
  35  	}
  36  }
  37  
  38  func TestTokenWords_AllStopWords(t *testing.T) {
  39  	// Input consisting entirely of stop words should produce zero tokens
  40  	tokens := TokenWords([]byte("the and for with from that this they"))
  41  	if len(tokens) != 0 {
  42  		t.Errorf("expected 0 tokens for all-stop-word input, got %d", len(tokens))
  43  	}
  44  }
  45  
  46  func TestTokenWords_EmptyContent(t *testing.T) {
  47  	tokens := TokenWords([]byte(""))
  48  	if len(tokens) != 0 {
  49  		t.Errorf("expected 0 tokens for empty content, got %d", len(tokens))
  50  	}
  51  
  52  	tokens = TokenWords(nil)
  53  	if len(tokens) != 0 {
  54  		t.Errorf("expected 0 tokens for nil content, got %d", len(tokens))
  55  	}
  56  }
  57  
  58  func TestTokenWords_OnlyURLs(t *testing.T) {
  59  	tokens := TokenWords([]byte("https://example.com http://test.org www.foo.bar"))
  60  	if len(tokens) != 0 {
  61  		t.Errorf("expected 0 tokens for URL-only content, got %d", len(tokens))
  62  	}
  63  }
  64  
  65  func TestTokenWords_NumbersIncluded(t *testing.T) {
  66  	tokens := TokenWords([]byte("42 1337 99"))
  67  	words := make(map[string]bool)
  68  	for _, tok := range tokens {
  69  		words[tok.Word] = true
  70  	}
  71  
  72  	// "42" and "99" are 2 chars, should be included
  73  	if !words["42"] {
  74  		t.Error("expected '42' to be tokenized")
  75  	}
  76  	if !words["99"] {
  77  		t.Error("expected '99' to be tokenized")
  78  	}
  79  	if !words["1337"] {
  80  		t.Error("expected '1337' to be tokenized")
  81  	}
  82  }
  83  
  84  func TestTokenWords_LongContent(t *testing.T) {
  85  	// Generate a large input to ensure no panic or infinite loop
  86  	var sb strings.Builder
  87  	for i := 0; i < 10000; i++ {
  88  		sb.WriteString("word ")
  89  	}
  90  	tokens := TokenWords([]byte(sb.String()))
  91  
  92  	// "word" is deduplicated so only 1 unique token
  93  	if len(tokens) != 1 {
  94  		t.Errorf("expected 1 unique token from repeated word, got %d", len(tokens))
  95  	}
  96  	if tokens[0].Word != "word" {
  97  		t.Errorf("expected 'word', got %q", tokens[0].Word)
  98  	}
  99  }
 100  
 101  func TestTokenWords_MixedUnicode(t *testing.T) {
 102  	// CJK characters are letters and should be tokenized if 2+ runes
 103  	// Single CJK chars are filtered by min-length
 104  	tokens := TokenWords([]byte("hello 世界 café"))
 105  	words := make(map[string]bool)
 106  	for _, tok := range tokens {
 107  		words[tok.Word] = true
 108  	}
 109  
 110  	if !words["hello"] {
 111  		t.Error("expected 'hello'")
 112  	}
 113  	// "世界" is 2 runes, both are letters — should be included
 114  	if !words["世界"] {
 115  		t.Error("expected '世界' (2 CJK characters)")
 116  	}
 117  	// "café" normalizes to lowercase
 118  	if !words["café"] {
 119  		t.Error("expected 'café'")
 120  	}
 121  }
 122  
 123  func TestTokenWords_Hex64Exclusion(t *testing.T) {
 124  	hex := "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789"
 125  	content := "before " + hex + " after"
 126  	tokens := TokenWords([]byte(content))
 127  	words := make(map[string]bool)
 128  	for _, tok := range tokens {
 129  		words[tok.Word] = true
 130  	}
 131  
 132  	if words[hex] {
 133  		t.Error("64-char hex string should be excluded")
 134  	}
 135  	if !words["before"] {
 136  		t.Error("expected 'before'")
 137  	}
 138  	if !words["after"] {
 139  		t.Error("expected 'after'")
 140  	}
 141  }
 142  
 143  func TestTokenWords_NostrURIExclusion(t *testing.T) {
 144  	tokens := TokenWords([]byte("check nostr:npub1abc123def456 now"))
 145  	words := make(map[string]bool)
 146  	for _, tok := range tokens {
 147  		words[tok.Word] = true
 148  	}
 149  
 150  	if !words["check"] {
 151  		t.Error("expected 'check'")
 152  	}
 153  	if !words["now"] {
 154  		t.Error("expected 'now'")
 155  	}
 156  	// Nothing from the nostr: URI should appear
 157  	if words["npub1abc123def456"] {
 158  		t.Error("nostr: URI content should be excluded")
 159  	}
 160  }
 161  
 162  func TestTokenWords_MentionExclusion(t *testing.T) {
 163  	tokens := TokenWords([]byte("see #[0] here #[12] done"))
 164  	words := make(map[string]bool)
 165  	for _, tok := range tokens {
 166  		words[tok.Word] = true
 167  	}
 168  
 169  	if !words["see"] {
 170  		t.Error("expected 'see'")
 171  	}
 172  	if !words["here"] {
 173  		t.Error("expected 'here'")
 174  	}
 175  	if !words["done"] {
 176  		t.Error("expected 'done'")
 177  	}
 178  }
 179  
 180  func TestTokenWords_URLSchemes(t *testing.T) {
 181  	// Various URL-like schemes with "://" should be skipped
 182  	tokens := TokenWords([]byte("visit ftp://files.example.com and wss://relay.damus.io"))
 183  	words := make(map[string]bool)
 184  	for _, tok := range tokens {
 185  		words[tok.Word] = true
 186  	}
 187  
 188  	if words["ftp"] || words["files"] || words["example"] {
 189  		t.Error("ftp:// URL content should be excluded")
 190  	}
 191  	if words["wss"] || words["relay"] || words["damus"] {
 192  		t.Error("wss:// URL content should be excluded")
 193  	}
 194  	// "visit" should survive; "and" is a stop word
 195  	if !words["visit"] {
 196  		t.Error("expected 'visit'")
 197  	}
 198  }
 199  
 200  func TestTokenWords_MinLengthBoundary(t *testing.T) {
 201  	// 1-rune words are filtered, 2-rune words are kept (unless stop words)
 202  	tokens := TokenWords([]byte("I go ok hi"))
 203  	words := make(map[string]bool)
 204  	for _, tok := range tokens {
 205  		words[tok.Word] = true
 206  	}
 207  
 208  	// "I" is 1 rune — filtered by length
 209  	if words["i"] {
 210  		t.Error("single-rune 'I' should be filtered by min-length")
 211  	}
 212  	// "go" is 2 runes but not a stop word — should be present
 213  	if !words["go"] {
 214  		t.Error("expected 'go' (2 runes, not a stop word)")
 215  	}
 216  	// "ok" is 2 runes, not a stop word
 217  	if !words["ok"] {
 218  		t.Error("expected 'ok'")
 219  	}
 220  	// "hi" is 2 runes, not a stop word
 221  	if !words["hi"] {
 222  		t.Error("expected 'hi'")
 223  	}
 224  }
 225  
 226  func TestTokenWords_Deduplication(t *testing.T) {
 227  	tokens := TokenWords([]byte("Bitcoin bitcoin BITCOIN"))
 228  	if len(tokens) != 1 {
 229  		t.Fatalf("expected 1 deduplicated token, got %d", len(tokens))
 230  	}
 231  	if tokens[0].Word != "bitcoin" {
 232  		t.Errorf("expected 'bitcoin', got %q", tokens[0].Word)
 233  	}
 234  }
 235  
 236  func TestTokenWords_WhitespaceVariants(t *testing.T) {
 237  	// Tabs, newlines, and other whitespace should split tokens
 238  	tokens := TokenWords([]byte("hello\tworld\nnew\rline"))
 239  	words := make(map[string]bool)
 240  	for _, tok := range tokens {
 241  		words[tok.Word] = true
 242  	}
 243  
 244  	for _, w := range []string{"hello", "world", "new", "line"} {
 245  		if !words[w] {
 246  			t.Errorf("expected word %q after whitespace splitting", w)
 247  		}
 248  	}
 249  }
 250  
 251  func TestTokenWords_PunctuationSplit(t *testing.T) {
 252  	// Punctuation should split words, not be included in them
 253  	tokens := TokenWords([]byte("hello, world! foo-bar baz.qux"))
 254  	words := make(map[string]bool)
 255  	for _, tok := range tokens {
 256  		words[tok.Word] = true
 257  	}
 258  
 259  	for _, w := range []string{"hello", "world", "foo", "bar", "baz", "qux"} {
 260  		if !words[w] {
 261  			t.Errorf("expected word %q after punctuation split", w)
 262  		}
 263  	}
 264  }
 265  
 266  func TestTokenHashes_MatchesTokenWords(t *testing.T) {
 267  	content := []byte("bitcoin lightning network relay")
 268  	tokens := TokenWords(content)
 269  	hashes := TokenHashes(content)
 270  
 271  	if len(tokens) != len(hashes) {
 272  		t.Fatalf("TokenWords returned %d, TokenHashes returned %d", len(tokens), len(hashes))
 273  	}
 274  
 275  	for i, tok := range tokens {
 276  		if !bytes.Equal(tok.Hash, hashes[i]) {
 277  			t.Errorf("hash mismatch at index %d: TokenWords=%x, TokenHashes=%x",
 278  				i, tok.Hash, hashes[i])
 279  		}
 280  	}
 281  }
 282  
 283  func TestTokenWords_StopWordsList(t *testing.T) {
 284  	// Verify a representative sample of stop words are actually filtered
 285  	samples := []string{
 286  		"the", "and", "for", "with", "from", "that", "this",
 287  		"they", "were", "will", "been", "have", "about",
 288  		"which", "would", "their", "there", "these", "those",
 289  		"is", "in", "of", "to", "or", "an", "as", "at",
 290  	}
 291  	for _, sw := range samples {
 292  		tokens := TokenWords([]byte(sw))
 293  		if len(tokens) != 0 {
 294  			t.Errorf("stop word %q should produce 0 tokens, got %d", sw, len(tokens))
 295  		}
 296  	}
 297  }
 298