unicode_normalize_test.go raw

   1  //go:build !(js && wasm)
   2  
   3  package database
   4  
   5  import (
   6  	"bytes"
   7  	"testing"
   8  )
   9  
  10  func TestNormalizeRune(t *testing.T) {
  11  	tests := []struct {
  12  		name     string
  13  		input    rune
  14  		expected rune
  15  	}{
  16  		// Small caps
  17  		{"small cap A", 'ᴀ', 'a'},
  18  		{"small cap B", 'ʙ', 'b'},
  19  		{"small cap C", 'ᴄ', 'c'},
  20  		{"small cap D", 'ᴅ', 'd'},
  21  		{"small cap E", 'ᴇ', 'e'},
  22  		{"small cap F", 'ꜰ', 'f'},
  23  		{"small cap G", 'ɢ', 'g'},
  24  		{"small cap H", 'ʜ', 'h'},
  25  		{"small cap I", 'ɪ', 'i'},
  26  		{"small cap J", 'ᴊ', 'j'},
  27  		{"small cap K", 'ᴋ', 'k'},
  28  		{"small cap L", 'ʟ', 'l'},
  29  		{"small cap M", 'ᴍ', 'm'},
  30  		{"small cap N", 'ɴ', 'n'},
  31  		{"small cap O", 'ᴏ', 'o'},
  32  		{"small cap P", 'ᴘ', 'p'},
  33  		{"small cap Q (ogonek)", 'ǫ', 'q'},
  34  		{"small cap R", 'ʀ', 'r'},
  35  		{"small cap S", 'ꜱ', 's'},
  36  		{"small cap T", 'ᴛ', 't'},
  37  		{"small cap U", 'ᴜ', 'u'},
  38  		{"small cap V", 'ᴠ', 'v'},
  39  		{"small cap W", 'ᴡ', 'w'},
  40  		{"small cap Y", 'ʏ', 'y'},
  41  		{"small cap Z", 'ᴢ', 'z'},
  42  
  43  		// Fraktur lowercase
  44  		{"fraktur lower a", '𝔞', 'a'},
  45  		{"fraktur lower b", '𝔟', 'b'},
  46  		{"fraktur lower c", '𝔠', 'c'},
  47  		{"fraktur lower d", '𝔡', 'd'},
  48  		{"fraktur lower e", '𝔢', 'e'},
  49  		{"fraktur lower f", '𝔣', 'f'},
  50  		{"fraktur lower g", '𝔤', 'g'},
  51  		{"fraktur lower h", '𝔥', 'h'},
  52  		{"fraktur lower i", '𝔦', 'i'},
  53  		{"fraktur lower j", '𝔧', 'j'},
  54  		{"fraktur lower k", '𝔨', 'k'},
  55  		{"fraktur lower l", '𝔩', 'l'},
  56  		{"fraktur lower m", '𝔪', 'm'},
  57  		{"fraktur lower n", '𝔫', 'n'},
  58  		{"fraktur lower o", '𝔬', 'o'},
  59  		{"fraktur lower p", '𝔭', 'p'},
  60  		{"fraktur lower q", '𝔮', 'q'},
  61  		{"fraktur lower r", '𝔯', 'r'},
  62  		{"fraktur lower s", '𝔰', 's'},
  63  		{"fraktur lower t", '𝔱', 't'},
  64  		{"fraktur lower u", '𝔲', 'u'},
  65  		{"fraktur lower v", '𝔳', 'v'},
  66  		{"fraktur lower w", '𝔴', 'w'},
  67  		{"fraktur lower x", '𝔵', 'x'},
  68  		{"fraktur lower y", '𝔶', 'y'},
  69  		{"fraktur lower z", '𝔷', 'z'},
  70  
  71  		// Fraktur uppercase (main range)
  72  		{"fraktur upper A", '𝔄', 'a'},
  73  		{"fraktur upper B", '𝔅', 'b'},
  74  		{"fraktur upper D", '𝔇', 'd'},
  75  		{"fraktur upper E", '𝔈', 'e'},
  76  		{"fraktur upper F", '𝔉', 'f'},
  77  		{"fraktur upper G", '𝔊', 'g'},
  78  		{"fraktur upper J", '𝔍', 'j'},
  79  		{"fraktur upper K", '𝔎', 'k'},
  80  		{"fraktur upper L", '𝔏', 'l'},
  81  		{"fraktur upper M", '𝔐', 'm'},
  82  		{"fraktur upper N", '𝔑', 'n'},
  83  		{"fraktur upper O", '𝔒', 'o'},
  84  		{"fraktur upper P", '𝔓', 'p'},
  85  		{"fraktur upper Q", '𝔔', 'q'},
  86  		{"fraktur upper S", '𝔖', 's'},
  87  		{"fraktur upper T", '𝔗', 't'},
  88  		{"fraktur upper U", '𝔘', 'u'},
  89  		{"fraktur upper V", '𝔙', 'v'},
  90  		{"fraktur upper W", '𝔚', 'w'},
  91  		{"fraktur upper X", '𝔛', 'x'},
  92  		{"fraktur upper Y", '𝔜', 'y'},
  93  
  94  		// Fraktur uppercase (Letterlike Symbols block)
  95  		{"fraktur upper C (letterlike)", 'ℭ', 'c'},
  96  		{"fraktur upper H (letterlike)", 'ℌ', 'h'},
  97  		{"fraktur upper I (letterlike)", 'ℑ', 'i'},
  98  		{"fraktur upper R (letterlike)", 'ℜ', 'r'},
  99  		{"fraktur upper Z (letterlike)", 'ℨ', 'z'},
 100  
 101  		// Regular ASCII should pass through unchanged
 102  		{"regular lowercase a", 'a', 'a'},
 103  		{"regular lowercase z", 'z', 'z'},
 104  		{"regular uppercase A", 'A', 'A'},
 105  		{"regular digit 5", '5', '5'},
 106  
 107  		// Other unicode should pass through unchanged
 108  		{"cyrillic д", 'д', 'д'},
 109  		{"greek α", 'α', 'α'},
 110  		{"emoji", '🎉', '🎉'},
 111  	}
 112  
 113  	for _, tt := range tests {
 114  		t.Run(tt.name, func(t *testing.T) {
 115  			result := normalizeRune(tt.input)
 116  			if result != tt.expected {
 117  				t.Errorf("normalizeRune(%q) = %q, want %q", tt.input, result, tt.expected)
 118  			}
 119  		})
 120  	}
 121  }
 122  
 123  func TestHasDecorativeUnicode(t *testing.T) {
 124  	tests := []struct {
 125  		name     string
 126  		input    string
 127  		expected bool
 128  	}{
 129  		{"plain ASCII", "hello world", false},
 130  		{"small caps word", "ᴅᴇᴀᴛʜ", true},
 131  		{"fraktur lowercase", "𝔥𝔢𝔩𝔩𝔬", true},
 132  		{"fraktur uppercase", "𝔇𝔈𝔄𝔗ℌ", true},
 133  		{"mixed with ASCII", "hello ᴡᴏʀʟᴅ", true},
 134  		{"single small cap", "aᴀa", true},
 135  		{"cyrillic (no normalize)", "привет", false},
 136  		{"empty string", "", false},
 137  		{"letterlike fraktur C", "ℭool", true},
 138  	}
 139  
 140  	for _, tt := range tests {
 141  		t.Run(tt.name, func(t *testing.T) {
 142  			result := hasDecorativeUnicode(tt.input)
 143  			if result != tt.expected {
 144  				t.Errorf("hasDecorativeUnicode(%q) = %v, want %v", tt.input, result, tt.expected)
 145  			}
 146  		})
 147  	}
 148  }
 149  
 150  func TestTokenHashesNormalization(t *testing.T) {
 151  	// All three representations should produce the same hash
 152  	ascii := TokenHashes([]byte("death"))
 153  	smallCaps := TokenHashes([]byte("ᴅᴇᴀᴛʜ"))
 154  	frakturLower := TokenHashes([]byte("𝔡𝔢𝔞𝔱𝔥"))
 155  	frakturUpper := TokenHashes([]byte("𝔇𝔈𝔄𝔗ℌ"))
 156  
 157  	if len(ascii) != 1 {
 158  		t.Fatalf("expected 1 hash for 'death', got %d", len(ascii))
 159  	}
 160  	if len(smallCaps) != 1 {
 161  		t.Fatalf("expected 1 hash for small caps, got %d", len(smallCaps))
 162  	}
 163  	if len(frakturLower) != 1 {
 164  		t.Fatalf("expected 1 hash for fraktur lower, got %d", len(frakturLower))
 165  	}
 166  	if len(frakturUpper) != 1 {
 167  		t.Fatalf("expected 1 hash for fraktur upper, got %d", len(frakturUpper))
 168  	}
 169  
 170  	// All should match the ASCII version
 171  	if !bytes.Equal(ascii[0], smallCaps[0]) {
 172  		t.Errorf("small caps hash differs from ASCII\nASCII:      %x\nsmall caps: %x", ascii[0], smallCaps[0])
 173  	}
 174  	if !bytes.Equal(ascii[0], frakturLower[0]) {
 175  		t.Errorf("fraktur lower hash differs from ASCII\nASCII:         %x\nfraktur lower: %x", ascii[0], frakturLower[0])
 176  	}
 177  	if !bytes.Equal(ascii[0], frakturUpper[0]) {
 178  		t.Errorf("fraktur upper hash differs from ASCII\nASCII:         %x\nfraktur upper: %x", ascii[0], frakturUpper[0])
 179  	}
 180  }
 181  
 182  func TestTokenHashesMixedContent(t *testing.T) {
 183  	// Test that mixed content normalizes correctly
 184  	// "the" is a stop word and should be filtered, leaving "quick", "brown", "fox"
 185  	content := []byte("ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox")
 186  	hashes := TokenHashes(content)
 187  
 188  	// Should get: "quick", "brown", "fox" (3 unique words; "the" is a stop word)
 189  	if len(hashes) != 3 {
 190  		t.Errorf("expected 3 hashes from mixed content (stop word 'the' filtered), got %d", len(hashes))
 191  	}
 192  
 193  	// Verify "brown" matches between decorated and plain
 194  	brownPlain := TokenHashes([]byte("brown"))
 195  	brownDecorated := TokenHashes([]byte("𝔟𝔯𝔬𝔴𝔫"))
 196  	if !bytes.Equal(brownPlain[0], brownDecorated[0]) {
 197  		t.Errorf("'brown' hash mismatch: plain=%x, decorated=%x", brownPlain[0], brownDecorated[0])
 198  	}
 199  }
 200