unicode_normalize_test.go raw
1 //go:build !(js && wasm)
2
3 package database
4
5 import (
6 "bytes"
7 "testing"
8 )
9
10 func TestNormalizeRune(t *testing.T) {
11 tests := []struct {
12 name string
13 input rune
14 expected rune
15 }{
16 // Small caps
17 {"small cap A", 'ᴀ', 'a'},
18 {"small cap B", 'ʙ', 'b'},
19 {"small cap C", 'ᴄ', 'c'},
20 {"small cap D", 'ᴅ', 'd'},
21 {"small cap E", 'ᴇ', 'e'},
22 {"small cap F", 'ꜰ', 'f'},
23 {"small cap G", 'ɢ', 'g'},
24 {"small cap H", 'ʜ', 'h'},
25 {"small cap I", 'ɪ', 'i'},
26 {"small cap J", 'ᴊ', 'j'},
27 {"small cap K", 'ᴋ', 'k'},
28 {"small cap L", 'ʟ', 'l'},
29 {"small cap M", 'ᴍ', 'm'},
30 {"small cap N", 'ɴ', 'n'},
31 {"small cap O", 'ᴏ', 'o'},
32 {"small cap P", 'ᴘ', 'p'},
33 {"small cap Q (ogonek)", 'ǫ', 'q'},
34 {"small cap R", 'ʀ', 'r'},
35 {"small cap S", 'ꜱ', 's'},
36 {"small cap T", 'ᴛ', 't'},
37 {"small cap U", 'ᴜ', 'u'},
38 {"small cap V", 'ᴠ', 'v'},
39 {"small cap W", 'ᴡ', 'w'},
40 {"small cap Y", 'ʏ', 'y'},
41 {"small cap Z", 'ᴢ', 'z'},
42
43 // Fraktur lowercase
44 {"fraktur lower a", '𝔞', 'a'},
45 {"fraktur lower b", '𝔟', 'b'},
46 {"fraktur lower c", '𝔠', 'c'},
47 {"fraktur lower d", '𝔡', 'd'},
48 {"fraktur lower e", '𝔢', 'e'},
49 {"fraktur lower f", '𝔣', 'f'},
50 {"fraktur lower g", '𝔤', 'g'},
51 {"fraktur lower h", '𝔥', 'h'},
52 {"fraktur lower i", '𝔦', 'i'},
53 {"fraktur lower j", '𝔧', 'j'},
54 {"fraktur lower k", '𝔨', 'k'},
55 {"fraktur lower l", '𝔩', 'l'},
56 {"fraktur lower m", '𝔪', 'm'},
57 {"fraktur lower n", '𝔫', 'n'},
58 {"fraktur lower o", '𝔬', 'o'},
59 {"fraktur lower p", '𝔭', 'p'},
60 {"fraktur lower q", '𝔮', 'q'},
61 {"fraktur lower r", '𝔯', 'r'},
62 {"fraktur lower s", '𝔰', 's'},
63 {"fraktur lower t", '𝔱', 't'},
64 {"fraktur lower u", '𝔲', 'u'},
65 {"fraktur lower v", '𝔳', 'v'},
66 {"fraktur lower w", '𝔴', 'w'},
67 {"fraktur lower x", '𝔵', 'x'},
68 {"fraktur lower y", '𝔶', 'y'},
69 {"fraktur lower z", '𝔷', 'z'},
70
71 // Fraktur uppercase (main range)
72 {"fraktur upper A", '𝔄', 'a'},
73 {"fraktur upper B", '𝔅', 'b'},
74 {"fraktur upper D", '𝔇', 'd'},
75 {"fraktur upper E", '𝔈', 'e'},
76 {"fraktur upper F", '𝔉', 'f'},
77 {"fraktur upper G", '𝔊', 'g'},
78 {"fraktur upper J", '𝔍', 'j'},
79 {"fraktur upper K", '𝔎', 'k'},
80 {"fraktur upper L", '𝔏', 'l'},
81 {"fraktur upper M", '𝔐', 'm'},
82 {"fraktur upper N", '𝔑', 'n'},
83 {"fraktur upper O", '𝔒', 'o'},
84 {"fraktur upper P", '𝔓', 'p'},
85 {"fraktur upper Q", '𝔔', 'q'},
86 {"fraktur upper S", '𝔖', 's'},
87 {"fraktur upper T", '𝔗', 't'},
88 {"fraktur upper U", '𝔘', 'u'},
89 {"fraktur upper V", '𝔙', 'v'},
90 {"fraktur upper W", '𝔚', 'w'},
91 {"fraktur upper X", '𝔛', 'x'},
92 {"fraktur upper Y", '𝔜', 'y'},
93
94 // Fraktur uppercase (Letterlike Symbols block)
95 {"fraktur upper C (letterlike)", 'ℭ', 'c'},
96 {"fraktur upper H (letterlike)", 'ℌ', 'h'},
97 {"fraktur upper I (letterlike)", 'ℑ', 'i'},
98 {"fraktur upper R (letterlike)", 'ℜ', 'r'},
99 {"fraktur upper Z (letterlike)", 'ℨ', 'z'},
100
101 // Regular ASCII should pass through unchanged
102 {"regular lowercase a", 'a', 'a'},
103 {"regular lowercase z", 'z', 'z'},
104 {"regular uppercase A", 'A', 'A'},
105 {"regular digit 5", '5', '5'},
106
107 // Other unicode should pass through unchanged
108 {"cyrillic д", 'д', 'д'},
109 {"greek α", 'α', 'α'},
110 {"emoji", '🎉', '🎉'},
111 }
112
113 for _, tt := range tests {
114 t.Run(tt.name, func(t *testing.T) {
115 result := normalizeRune(tt.input)
116 if result != tt.expected {
117 t.Errorf("normalizeRune(%q) = %q, want %q", tt.input, result, tt.expected)
118 }
119 })
120 }
121 }
122
123 func TestHasDecorativeUnicode(t *testing.T) {
124 tests := []struct {
125 name string
126 input string
127 expected bool
128 }{
129 {"plain ASCII", "hello world", false},
130 {"small caps word", "ᴅᴇᴀᴛʜ", true},
131 {"fraktur lowercase", "𝔥𝔢𝔩𝔩𝔬", true},
132 {"fraktur uppercase", "𝔇𝔈𝔄𝔗ℌ", true},
133 {"mixed with ASCII", "hello ᴡᴏʀʟᴅ", true},
134 {"single small cap", "aᴀa", true},
135 {"cyrillic (no normalize)", "привет", false},
136 {"empty string", "", false},
137 {"letterlike fraktur C", "ℭool", true},
138 }
139
140 for _, tt := range tests {
141 t.Run(tt.name, func(t *testing.T) {
142 result := hasDecorativeUnicode(tt.input)
143 if result != tt.expected {
144 t.Errorf("hasDecorativeUnicode(%q) = %v, want %v", tt.input, result, tt.expected)
145 }
146 })
147 }
148 }
149
150 func TestTokenHashesNormalization(t *testing.T) {
151 // All three representations should produce the same hash
152 ascii := TokenHashes([]byte("death"))
153 smallCaps := TokenHashes([]byte("ᴅᴇᴀᴛʜ"))
154 frakturLower := TokenHashes([]byte("𝔡𝔢𝔞𝔱𝔥"))
155 frakturUpper := TokenHashes([]byte("𝔇𝔈𝔄𝔗ℌ"))
156
157 if len(ascii) != 1 {
158 t.Fatalf("expected 1 hash for 'death', got %d", len(ascii))
159 }
160 if len(smallCaps) != 1 {
161 t.Fatalf("expected 1 hash for small caps, got %d", len(smallCaps))
162 }
163 if len(frakturLower) != 1 {
164 t.Fatalf("expected 1 hash for fraktur lower, got %d", len(frakturLower))
165 }
166 if len(frakturUpper) != 1 {
167 t.Fatalf("expected 1 hash for fraktur upper, got %d", len(frakturUpper))
168 }
169
170 // All should match the ASCII version
171 if !bytes.Equal(ascii[0], smallCaps[0]) {
172 t.Errorf("small caps hash differs from ASCII\nASCII: %x\nsmall caps: %x", ascii[0], smallCaps[0])
173 }
174 if !bytes.Equal(ascii[0], frakturLower[0]) {
175 t.Errorf("fraktur lower hash differs from ASCII\nASCII: %x\nfraktur lower: %x", ascii[0], frakturLower[0])
176 }
177 if !bytes.Equal(ascii[0], frakturUpper[0]) {
178 t.Errorf("fraktur upper hash differs from ASCII\nASCII: %x\nfraktur upper: %x", ascii[0], frakturUpper[0])
179 }
180 }
181
182 func TestTokenHashesMixedContent(t *testing.T) {
183 // Test that mixed content normalizes correctly
184 // "the" is a stop word and should be filtered, leaving "quick", "brown", "fox"
185 content := []byte("ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox")
186 hashes := TokenHashes(content)
187
188 // Should get: "quick", "brown", "fox" (3 unique words; "the" is a stop word)
189 if len(hashes) != 3 {
190 t.Errorf("expected 3 hashes from mixed content (stop word 'the' filtered), got %d", len(hashes))
191 }
192
193 // Verify "brown" matches between decorated and plain
194 brownPlain := TokenHashes([]byte("brown"))
195 brownDecorated := TokenHashes([]byte("𝔟𝔯𝔬𝔴𝔫"))
196 if !bytes.Equal(brownPlain[0], brownDecorated[0]) {
197 t.Errorf("'brown' hash mismatch: plain=%x, decorated=%x", brownPlain[0], brownDecorated[0])
198 }
199 }
200