//go:build !(js && wasm)

package database

import (
	"bytes"
	"strings"
	"testing"
)

// TestTokenWords_StopWords verifies that stop words embedded in an ordinary
// sentence are dropped while every content word is emitted exactly once.
func TestTokenWords_StopWords(t *testing.T) {
	got := TokenWords([]byte("the quick brown fox and the lazy dog"))

	seen := make(map[string]bool, len(got))
	for i := range got {
		seen[got[i].Word] = true
	}

	// Stop words must not survive tokenization.
	for _, c := range []struct{ word, msg string }{
		{"the", "stop word 'the' should be filtered"},
		{"and", "stop word 'and' should be filtered"},
	} {
		if seen[c.word] {
			t.Error(c.msg)
		}
	}

	// Every remaining content word must be present.
	content := []string{"quick", "brown", "fox", "lazy", "dog"}
	for _, want := range content {
		if seen[want] {
			continue
		}
		t.Errorf("expected word %q to be present", want)
	}

	// Exactly the five content words, nothing else.
	if n := len(got); n != 5 {
		t.Errorf("expected 5 tokens, got %d", n)
	}
}

// TestTokenWords_AllStopWords verifies that content made up solely of stop
// words yields no tokens at all.
func TestTokenWords_AllStopWords(t *testing.T) {
	input := []byte("the and for with from that this they")
	if n := len(TokenWords(input)); n != 0 {
		t.Errorf("expected 0 tokens for all-stop-word input, got %d", n)
	}
}

// TestTokenWords_EmptyContent verifies that both an empty byte slice and a nil
// slice produce zero tokens.
func TestTokenWords_EmptyContent(t *testing.T) {
	// Empty, non-nil input.
	if n := len(TokenWords([]byte(""))); n != 0 {
		t.Errorf("expected 0 tokens for empty content, got %d", n)
	}

	// Nil input.
	if n := len(TokenWords(nil)); n != 0 {
		t.Errorf("expected 0 tokens for nil content, got %d", n)
	}
}

// TestTokenWords_OnlyURLs verifies that content consisting only of URLs
// (https://, http:// and www. forms) yields no tokens.
func TestTokenWords_OnlyURLs(t *testing.T) {
	input := []byte("https://example.com http://test.org www.foo.bar")
	if n := len(TokenWords(input)); n != 0 {
		t.Errorf("expected 0 tokens for URL-only content, got %d", n)
	}
}

// TestTokenWords_NumbersIncluded verifies that purely numeric words are
// tokenized, including ones that are only two characters long.
func TestTokenWords_NumbersIncluded(t *testing.T) {
	seen := make(map[string]struct{})
	for _, tok := range TokenWords([]byte("42 1337 99")) {
		seen[tok.Word] = struct{}{}
	}

	// "42" and "99" are two characters long and are expected to be kept;
	// "1337" is a longer numeric word.
	for _, c := range []struct{ word, msg string }{
		{"42", "expected '42' to be tokenized"},
		{"99", "expected '99' to be tokenized"},
		{"1337", "expected '1337' to be tokenized"},
	} {
		if _, ok := seen[c.word]; !ok {
			t.Error(c.msg)
		}
	}
}

// TestTokenWords_LongContent feeds a large, highly repetitive input to the
// tokenizer to make sure it terminates without panicking and that the repeated
// word collapses to a single token.
func TestTokenWords_LongContent(t *testing.T) {
	// 10000 repetitions of the same word, each followed by a space.
	input := strings.Repeat("word ", 10000)
	tokens := TokenWords([]byte(input))

	// "word" is deduplicated so only 1 unique token is expected. This must be
	// fatal: the index expression below would otherwise panic on an empty
	// result instead of reporting a clean test failure.
	if len(tokens) != 1 {
		t.Fatalf("expected 1 unique token from repeated word, got %d", len(tokens))
	}
	if tokens[0].Word != "word" {
		t.Errorf("expected 'word', got %q", tokens[0].Word)
	}
}

// TestTokenWords_MixedUnicode verifies that ASCII, CJK and accented Latin words
// in the same content are all tokenized.
func TestTokenWords_MixedUnicode(t *testing.T) {
	got := TokenWords([]byte("hello 世界 café"))

	seen := make(map[string]bool, len(got))
	for i := range got {
		seen[got[i].Word] = true
	}
	has := func(w string) bool { return seen[w] }

	// Plain ASCII word.
	if !has("hello") {
		t.Error("expected 'hello'")
	}
	// Two CJK letters form a word that is two runes long.
	if !has("世界") {
		t.Error("expected '世界' (2 CJK characters)")
	}
	// A word containing a non-ASCII letter must be emitted intact.
	if !has("café") {
		t.Error("expected 'café'")
	}
}

// TestTokenWords_Hex64Exclusion verifies that a 64-character hexadecimal string
// is not emitted as a token, while the ordinary words around it still are.
func TestTokenWords_Hex64Exclusion(t *testing.T) {
	const hex = "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789"

	seen := make(map[string]struct{})
	for _, tok := range TokenWords([]byte("before " + hex + " after")) {
		seen[tok.Word] = struct{}{}
	}
	has := func(w string) bool {
		_, ok := seen[w]
		return ok
	}

	if has(hex) {
		t.Error("64-char hex string should be excluded")
	}
	// Neighbouring words must be unaffected by the exclusion.
	if !has("before") {
		t.Error("expected 'before'")
	}
	if !has("after") {
		t.Error("expected 'after'")
	}
}

// TestTokenWords_NostrURIExclusion verifies that a nostr: URI embedded in
// content contributes no tokens, while the words surrounding it are kept.
func TestTokenWords_NostrURIExclusion(t *testing.T) {
	tokens := TokenWords([]byte("check nostr:npub1abc123def456 now"))
	words := make(map[string]bool, len(tokens))
	for _, tok := range tokens {
		words[tok.Word] = true
	}

	// Words on either side of the URI must survive.
	if !words["check"] {
		t.Error("expected 'check'")
	}
	if !words["now"] {
		t.Error("expected 'now'")
	}

	// Nothing from the nostr: URI should appear: neither the scheme word
	// nor the payload that follows the colon.
	if words["nostr"] {
		t.Error("nostr: URI scheme should be excluded")
	}
	if words["npub1abc123def456"] {
		t.Error("nostr: URI content should be excluded")
	}
}

// TestTokenWords_MentionExclusion verifies that #[n] mention markers contribute
// no tokens, while the ordinary words around them are kept.
func TestTokenWords_MentionExclusion(t *testing.T) {
	tokens := TokenWords([]byte("see #[0] here #[12] done"))
	words := make(map[string]bool, len(tokens))
	for _, tok := range tokens {
		words[tok.Word] = true
	}

	// Words surrounding the mentions must survive.
	if !words["see"] {
		t.Error("expected 'see'")
	}
	if !words["here"] {
		t.Error("expected 'here'")
	}
	if !words["done"] {
		t.Error("expected 'done'")
	}

	// The indices inside the mention markers must not leak out as tokens.
	// "12" is the decisive case: two-digit numbers are otherwise tokenized
	// (see TestTokenWords_NumbersIncluded), so only mention exclusion can
	// keep it out. "0" is checked as well for completeness.
	if words["0"] {
		t.Error("mention index '0' should be excluded")
	}
	if words["12"] {
		t.Error("mention index '12' should be excluded")
	}
}

// TestTokenWords_URLSchemes verifies that URL-like tokens using schemes other
// than http(s) — anything containing "://" — are skipped entirely.
func TestTokenWords_URLSchemes(t *testing.T) {
	got := TokenWords([]byte("visit ftp://files.example.com and wss://relay.damus.io"))

	seen := make(map[string]bool, len(got))
	for i := range got {
		seen[got[i].Word] = true
	}
	// anyOf reports whether at least one of the candidates was tokenized.
	anyOf := func(candidates ...string) bool {
		for _, c := range candidates {
			if seen[c] {
				return true
			}
		}
		return false
	}

	if anyOf("ftp", "files", "example") {
		t.Error("ftp:// URL content should be excluded")
	}
	if anyOf("wss", "relay", "damus") {
		t.Error("wss:// URL content should be excluded")
	}
	// The plain word must survive; "and" is a stop word and is not checked.
	if !seen["visit"] {
		t.Error("expected 'visit'")
	}
}

// TestTokenWords_MinLengthBoundary probes the minimum word length: one-rune
// words are dropped, two-rune words that are not stop words are kept.
func TestTokenWords_MinLengthBoundary(t *testing.T) {
	seen := make(map[string]struct{})
	for _, tok := range TokenWords([]byte("I go ok hi")) {
		seen[tok.Word] = struct{}{}
	}
	has := func(w string) bool {
		_, ok := seen[w]
		return ok
	}

	// "I" is a single rune, so it falls below the minimum length. The lookup
	// uses the lowercase form.
	if has("i") {
		t.Error("single-rune 'I' should be filtered by min-length")
	}

	// Each of these is two runes long and not a stop word.
	for _, c := range []struct{ word, msg string }{
		{"go", "expected 'go' (2 runes, not a stop word)"},
		{"ok", "expected 'ok'"},
		{"hi", "expected 'hi'"},
	} {
		if !has(c.word) {
			t.Error(c.msg)
		}
	}
}

// TestTokenWords_Deduplication verifies that case variants of the same word
// collapse into one lowercase token.
func TestTokenWords_Deduplication(t *testing.T) {
	got := TokenWords([]byte("Bitcoin bitcoin BITCOIN"))

	// Fatal on a count mismatch so the index below is always safe.
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 deduplicated token, got %d", n)
	}
	if word := got[0].Word; word != "bitcoin" {
		t.Errorf("expected 'bitcoin', got %q", word)
	}
}

// TestTokenWords_WhitespaceVariants verifies that tab, newline and carriage
// return each act as a word separator.
func TestTokenWords_WhitespaceVariants(t *testing.T) {
	got := TokenWords([]byte("hello\tworld\nnew\rline"))

	seen := make(map[string]bool, len(got))
	for i := range got {
		seen[got[i].Word] = true
	}

	expected := []string{"hello", "world", "new", "line"}
	for _, want := range expected {
		if seen[want] {
			continue
		}
		t.Errorf("expected word %q after whitespace splitting", want)
	}
}

// TestTokenWords_PunctuationSplit verifies that punctuation (comma, exclamation
// mark, hyphen, period) separates words instead of becoming part of them.
func TestTokenWords_PunctuationSplit(t *testing.T) {
	seen := make(map[string]struct{})
	for _, tok := range TokenWords([]byte("hello, world! foo-bar baz.qux")) {
		seen[tok.Word] = struct{}{}
	}

	expected := []string{"hello", "world", "foo", "bar", "baz", "qux"}
	for _, want := range expected {
		if _, ok := seen[want]; !ok {
			t.Errorf("expected word %q after punctuation split", want)
		}
	}
}

// TestTokenHashes_MatchesTokenWords verifies that TokenHashes and TokenWords
// agree for the same content: same number of results, and the hash at each
// position is byte-for-byte equal.
func TestTokenHashes_MatchesTokenWords(t *testing.T) {
	input := []byte("bitcoin lightning network relay")
	words, sums := TokenWords(input), TokenHashes(input)

	// A length mismatch makes the positional comparison meaningless.
	if nw, nh := len(words), len(sums); nw != nh {
		t.Fatalf("TokenWords returned %d, TokenHashes returned %d", nw, nh)
	}

	for idx := range words {
		if bytes.Equal(words[idx].Hash, sums[idx]) {
			continue
		}
		t.Errorf("hash mismatch at index %d: TokenWords=%x, TokenHashes=%x",
			idx, words[idx].Hash, sums[idx])
	}
}

// TestTokenWords_StopWordsList tokenizes a representative sample of stop words
// one at a time and verifies that each produces no tokens.
func TestTokenWords_StopWordsList(t *testing.T) {
	stopWords := [...]string{
		"the", "and", "for", "with", "from", "that", "this",
		"they", "were", "will", "been", "have", "about",
		"which", "would", "their", "there", "these", "those",
		"is", "in", "of", "to", "or", "an", "as", "at",
	}

	for i := range stopWords {
		word := stopWords[i]
		n := len(TokenWords([]byte(word)))
		if n == 0 {
			continue
		}
		t.Errorf("stop word %q should produce 0 tokens, got %d", word, n)
	}
}