1 //go:build !(js && wasm)
2 3 package database
4 5 // normalizeRune maps decorative unicode characters (small caps, fraktur) back to
6 // their ASCII equivalents for consistent word indexing. This ensures that text
7 // written with decorative alphabets (e.g., "ᴅᴇᴀᴛʜ" or "𝔇𝔢𝔞𝔱𝔥") indexes the same
8 // as regular ASCII ("death").
9 //
10 // Character sets normalized:
11 // - Small Caps (used for DEATH-style text in Terry Pratchett tradition)
12 // - Mathematical Fraktur lowercase (𝔞-𝔷)
13 // - Mathematical Fraktur uppercase (𝔄-ℨ, including Letterlike Symbols block exceptions)
14 func normalizeRune(r rune) rune {
15 // Check small caps first (scattered codepoints)
16 if mapped, ok := smallCapsToASCII[r]; ok {
17 return mapped
18 }
19 20 // Check fraktur lowercase: U+1D51E to U+1D537 (contiguous range)
21 if r >= 0x1D51E && r <= 0x1D537 {
22 return 'a' + (r - 0x1D51E)
23 }
24 25 // Check fraktur uppercase main range: U+1D504 to U+1D51C (with gaps)
26 if r >= 0x1D504 && r <= 0x1D51C {
27 if mapped, ok := frakturUpperToASCII[r]; ok {
28 return mapped
29 }
30 }
31 32 // Check fraktur uppercase exceptions from Letterlike Symbols block
33 if mapped, ok := frakturLetterlikeToASCII[r]; ok {
34 return mapped
35 }
36 37 return r
38 }
// smallCapsToASCII is the lookup table from Latin small-capital letters to
// the lowercase ASCII letter each one represents.
//
// Unicode never allocated the small capitals as a single run, so the entries
// are grouped here by the block their code point belongs to (Latin Extended-B,
// IPA Extensions, Phonetic Extensions, Latin Extended-D) and sorted by code
// point within each group. X has no entry: no small cap X exists in standard
// use.
var smallCapsToASCII = map[rune]rune{
	// Latin Extended-B
	'ǫ': 'q', // U+01EB LATIN SMALL LETTER O WITH OGONEK, used as a stand-in for small cap Q
	// NOTE(review): Unicode 11 added U+A7AF LATIN LETTER SMALL CAPITAL Q, which
	// is not mapped here - confirm whether it should be.

	// IPA Extensions
	'ɢ': 'g', // U+0262 LATIN LETTER SMALL CAPITAL G
	'ɪ': 'i', // U+026A LATIN LETTER SMALL CAPITAL I
	'ɴ': 'n', // U+0274 LATIN LETTER SMALL CAPITAL N
	'ʀ': 'r', // U+0280 LATIN LETTER SMALL CAPITAL R
	'ʏ': 'y', // U+028F LATIN LETTER SMALL CAPITAL Y
	'ʙ': 'b', // U+0299 LATIN LETTER SMALL CAPITAL B
	'ʜ': 'h', // U+029C LATIN LETTER SMALL CAPITAL H
	'ʟ': 'l', // U+029F LATIN LETTER SMALL CAPITAL L

	// Phonetic Extensions
	'ᴀ': 'a', // U+1D00 LATIN LETTER SMALL CAPITAL A
	'ᴄ': 'c', // U+1D04 LATIN LETTER SMALL CAPITAL C
	'ᴅ': 'd', // U+1D05 LATIN LETTER SMALL CAPITAL D
	'ᴇ': 'e', // U+1D07 LATIN LETTER SMALL CAPITAL E
	'ᴊ': 'j', // U+1D0A LATIN LETTER SMALL CAPITAL J
	'ᴋ': 'k', // U+1D0B LATIN LETTER SMALL CAPITAL K
	'ᴍ': 'm', // U+1D0D LATIN LETTER SMALL CAPITAL M
	'ᴏ': 'o', // U+1D0F LATIN LETTER SMALL CAPITAL O
	'ᴘ': 'p', // U+1D18 LATIN LETTER SMALL CAPITAL P
	'ᴛ': 't', // U+1D1B LATIN LETTER SMALL CAPITAL T
	'ᴜ': 'u', // U+1D1C LATIN LETTER SMALL CAPITAL U
	'ᴠ': 'v', // U+1D20 LATIN LETTER SMALL CAPITAL V
	'ᴡ': 'w', // U+1D21 LATIN LETTER SMALL CAPITAL W
	'ᴢ': 'z', // U+1D22 LATIN LETTER SMALL CAPITAL Z

	// Latin Extended-D
	'ꜰ': 'f', // U+A730 LATIN LETTER SMALL CAPITAL F
	'ꜱ': 's', // U+A731 LATIN LETTER SMALL CAPITAL S
}
// frakturUpperToASCII is the lookup table from Mathematical Fraktur capital
// letters to lowercase ASCII.
//
// It covers the run U+1D504-U+1D51C only. C, H, I, R and Z are absent because
// their Fraktur capitals live in the Letterlike Symbols block instead; the
// slots skipped inside this run are marked inline. Keys are spelled as code
// point escapes, with the glyph shown in the trailing comment.
var frakturUpperToASCII = map[rune]rune{
	'\U0001D504': 'a', // 𝔄 MATHEMATICAL FRAKTUR CAPITAL A
	'\U0001D505': 'b', // 𝔅 MATHEMATICAL FRAKTUR CAPITAL B
	// U+1D506 unmapped: C is at U+212D (Letterlike Symbols)
	'\U0001D507': 'd', // 𝔇 MATHEMATICAL FRAKTUR CAPITAL D
	'\U0001D508': 'e', // 𝔈 MATHEMATICAL FRAKTUR CAPITAL E
	'\U0001D509': 'f', // 𝔉 MATHEMATICAL FRAKTUR CAPITAL F
	'\U0001D50A': 'g', // 𝔊 MATHEMATICAL FRAKTUR CAPITAL G
	// U+1D50B unmapped: H is at U+210C (Letterlike Symbols)
	// U+1D50C unmapped: I is at U+2111 (Letterlike Symbols)
	'\U0001D50D': 'j', // 𝔍 MATHEMATICAL FRAKTUR CAPITAL J
	'\U0001D50E': 'k', // 𝔎 MATHEMATICAL FRAKTUR CAPITAL K
	'\U0001D50F': 'l', // 𝔏 MATHEMATICAL FRAKTUR CAPITAL L
	'\U0001D510': 'm', // 𝔐 MATHEMATICAL FRAKTUR CAPITAL M
	'\U0001D511': 'n', // 𝔑 MATHEMATICAL FRAKTUR CAPITAL N
	'\U0001D512': 'o', // 𝔒 MATHEMATICAL FRAKTUR CAPITAL O
	'\U0001D513': 'p', // 𝔓 MATHEMATICAL FRAKTUR CAPITAL P
	'\U0001D514': 'q', // 𝔔 MATHEMATICAL FRAKTUR CAPITAL Q
	// U+1D515 unmapped: R is at U+211C (Letterlike Symbols)
	'\U0001D516': 's', // 𝔖 MATHEMATICAL FRAKTUR CAPITAL S
	'\U0001D517': 't', // 𝔗 MATHEMATICAL FRAKTUR CAPITAL T
	'\U0001D518': 'u', // 𝔘 MATHEMATICAL FRAKTUR CAPITAL U
	'\U0001D519': 'v', // 𝔙 MATHEMATICAL FRAKTUR CAPITAL V
	'\U0001D51A': 'w', // 𝔚 MATHEMATICAL FRAKTUR CAPITAL W
	'\U0001D51B': 'x', // 𝔛 MATHEMATICAL FRAKTUR CAPITAL X
	'\U0001D51C': 'y', // 𝔜 MATHEMATICAL FRAKTUR CAPITAL Y
	// Z is at U+2128 (Letterlike Symbols)
}
// frakturLetterlikeToASCII is the lookup table for the five Fraktur capitals
// that sit in the Letterlike Symbols block (U+2100-U+214F) rather than in
// Mathematical Alphanumeric Symbols. Each maps to lowercase ASCII. Entries are
// ordered by code point and keyed by escape, with the glyph shown in the
// trailing comment.
var frakturLetterlikeToASCII = map[rune]rune{
	'\u210C': 'h', // ℌ BLACK-LETTER CAPITAL H
	'\u2111': 'i', // ℑ BLACK-LETTER CAPITAL I
	'\u211C': 'r', // ℜ BLACK-LETTER CAPITAL R
	'\u2128': 'z', // ℨ BLACK-LETTER CAPITAL Z
	'\u212D': 'c', // ℭ BLACK-LETTER CAPITAL C
}
112 113 // hasDecorativeUnicode checks if text contains any small caps or fraktur characters
114 // that would need normalization. Used by migration to identify events needing re-indexing.
115 func hasDecorativeUnicode(s string) bool {
116 for _, r := range s {
117 // Check small caps
118 if _, ok := smallCapsToASCII[r]; ok {
119 return true
120 }
121 // Check fraktur lowercase range
122 if r >= 0x1D51E && r <= 0x1D537 {
123 return true
124 }
125 // Check fraktur uppercase range
126 if r >= 0x1D504 && r <= 0x1D51C {
127 return true
128 }
129 // Check letterlike symbols fraktur
130 if _, ok := frakturLetterlikeToASCII[r]; ok {
131 return true
132 }
133 }
134 return false
135 }
136