unicode_normalize.go raw

   1  //go:build !(js && wasm)
   2  
   3  package database
   4  
   5  // normalizeRune maps decorative unicode characters (small caps, fraktur) back to
   6  // their ASCII equivalents for consistent word indexing. This ensures that text
   7  // written with decorative alphabets (e.g., "ᴅᴇᴀᴛʜ" or "𝔇𝔢𝔞𝔱𝔥") indexes the same
   8  // as regular ASCII ("death").
   9  //
  10  // Character sets normalized:
  11  // - Small Caps (used for DEATH-style text in Terry Pratchett tradition)
  12  // - Mathematical Fraktur lowercase (𝔞-𝔷)
  13  // - Mathematical Fraktur uppercase (𝔄-ℨ, including Letterlike Symbols block exceptions)
  14  func normalizeRune(r rune) rune {
  15  	// Check small caps first (scattered codepoints)
  16  	if mapped, ok := smallCapsToASCII[r]; ok {
  17  		return mapped
  18  	}
  19  
  20  	// Check fraktur lowercase: U+1D51E to U+1D537 (contiguous range)
  21  	if r >= 0x1D51E && r <= 0x1D537 {
  22  		return 'a' + (r - 0x1D51E)
  23  	}
  24  
  25  	// Check fraktur uppercase main range: U+1D504 to U+1D51C (with gaps)
  26  	if r >= 0x1D504 && r <= 0x1D51C {
  27  		if mapped, ok := frakturUpperToASCII[r]; ok {
  28  			return mapped
  29  		}
  30  	}
  31  
  32  	// Check fraktur uppercase exceptions from Letterlike Symbols block
  33  	if mapped, ok := frakturLetterlikeToASCII[r]; ok {
  34  		return mapped
  35  	}
  36  
  37  	return r
  38  }
  39  
  40  // smallCapsToASCII maps small capital letters to lowercase ASCII.
  41  // These are scattered across multiple Unicode blocks (IPA Extensions,
  42  // Phonetic Extensions, Latin Extended-D).
  43  var smallCapsToASCII = map[rune]rune{
  44  	'ᴀ': 'a', // U+1D00 LATIN LETTER SMALL CAPITAL A
  45  	'ʙ': 'b', // U+0299 LATIN LETTER SMALL CAPITAL B
  46  	'ᴄ': 'c', // U+1D04 LATIN LETTER SMALL CAPITAL C
  47  	'ᴅ': 'd', // U+1D05 LATIN LETTER SMALL CAPITAL D
  48  	'ᴇ': 'e', // U+1D07 LATIN LETTER SMALL CAPITAL E
  49  	'ꜰ': 'f', // U+A730 LATIN LETTER SMALL CAPITAL F
  50  	'ɢ': 'g', // U+0262 LATIN LETTER SMALL CAPITAL G
  51  	'ʜ': 'h', // U+029C LATIN LETTER SMALL CAPITAL H
  52  	'ɪ': 'i', // U+026A LATIN LETTER SMALL CAPITAL I
  53  	'ᴊ': 'j', // U+1D0A LATIN LETTER SMALL CAPITAL J
  54  	'ᴋ': 'k', // U+1D0B LATIN LETTER SMALL CAPITAL K
  55  	'ʟ': 'l', // U+029F LATIN LETTER SMALL CAPITAL L
  56  	'ᴍ': 'm', // U+1D0D LATIN LETTER SMALL CAPITAL M
  57  	'ɴ': 'n', // U+0274 LATIN LETTER SMALL CAPITAL N
  58  	'ᴏ': 'o', // U+1D0F LATIN LETTER SMALL CAPITAL O
  59  	'ᴘ': 'p', // U+1D18 LATIN LETTER SMALL CAPITAL P
  60  	'ǫ': 'q', // U+01EB LATIN SMALL LETTER O WITH OGONEK (no true small cap Q)
  61  	'ʀ': 'r', // U+0280 LATIN LETTER SMALL CAPITAL R
  62  	'ꜱ': 's', // U+A731 LATIN LETTER SMALL CAPITAL S
  63  	'ᴛ': 't', // U+1D1B LATIN LETTER SMALL CAPITAL T
  64  	'ᴜ': 'u', // U+1D1C LATIN LETTER SMALL CAPITAL U
  65  	'ᴠ': 'v', // U+1D20 LATIN LETTER SMALL CAPITAL V
  66  	'ᴡ': 'w', // U+1D21 LATIN LETTER SMALL CAPITAL W
  67  	// Note: no small cap X exists in standard use
  68  	'ʏ': 'y', // U+028F LATIN LETTER SMALL CAPITAL Y
  69  	'ᴢ': 'z', // U+1D22 LATIN LETTER SMALL CAPITAL Z
  70  }
  71  
  72  // frakturUpperToASCII maps Mathematical Fraktur uppercase letters to lowercase ASCII.
  73  // The main range U+1D504-U+1D51C has gaps where C, H, I, R, Z use Letterlike Symbols.
  74  var frakturUpperToASCII = map[rune]rune{
  75  	'𝔄': 'a', // U+1D504 MATHEMATICAL FRAKTUR CAPITAL A
  76  	'𝔅': 'b', // U+1D505 MATHEMATICAL FRAKTUR CAPITAL B
  77  	// C is at U+212D (Letterlike Symbols)
  78  	'𝔇': 'd', // U+1D507 MATHEMATICAL FRAKTUR CAPITAL D
  79  	'𝔈': 'e', // U+1D508 MATHEMATICAL FRAKTUR CAPITAL E
  80  	'𝔉': 'f', // U+1D509 MATHEMATICAL FRAKTUR CAPITAL F
  81  	'𝔊': 'g', // U+1D50A MATHEMATICAL FRAKTUR CAPITAL G
  82  	// H is at U+210C (Letterlike Symbols)
  83  	// I is at U+2111 (Letterlike Symbols)
  84  	'𝔍': 'j', // U+1D50D MATHEMATICAL FRAKTUR CAPITAL J
  85  	'𝔎': 'k', // U+1D50E MATHEMATICAL FRAKTUR CAPITAL K
  86  	'𝔏': 'l', // U+1D50F MATHEMATICAL FRAKTUR CAPITAL L
  87  	'𝔐': 'm', // U+1D510 MATHEMATICAL FRAKTUR CAPITAL M
  88  	'𝔑': 'n', // U+1D511 MATHEMATICAL FRAKTUR CAPITAL N
  89  	'𝔒': 'o', // U+1D512 MATHEMATICAL FRAKTUR CAPITAL O
  90  	'𝔓': 'p', // U+1D513 MATHEMATICAL FRAKTUR CAPITAL P
  91  	'𝔔': 'q', // U+1D514 MATHEMATICAL FRAKTUR CAPITAL Q
  92  	// R is at U+211C (Letterlike Symbols)
  93  	'𝔖': 's', // U+1D516 MATHEMATICAL FRAKTUR CAPITAL S
  94  	'𝔗': 't', // U+1D517 MATHEMATICAL FRAKTUR CAPITAL T
  95  	'𝔘': 'u', // U+1D518 MATHEMATICAL FRAKTUR CAPITAL U
  96  	'𝔙': 'v', // U+1D519 MATHEMATICAL FRAKTUR CAPITAL V
  97  	'𝔚': 'w', // U+1D51A MATHEMATICAL FRAKTUR CAPITAL W
  98  	'𝔛': 'x', // U+1D51B MATHEMATICAL FRAKTUR CAPITAL X
  99  	'𝔜': 'y', // U+1D51C MATHEMATICAL FRAKTUR CAPITAL Y
 100  	// Z is at U+2128 (Letterlike Symbols)
 101  }
 102  
 103  // frakturLetterlikeToASCII maps the Fraktur characters that live in the
 104  // Letterlike Symbols block (U+2100-U+214F) rather than Mathematical Alphanumeric Symbols.
 105  var frakturLetterlikeToASCII = map[rune]rune{
 106  	'ℭ': 'c', // U+212D BLACK-LETTER CAPITAL C
 107  	'ℌ': 'h', // U+210C BLACK-LETTER CAPITAL H
 108  	'ℑ': 'i', // U+2111 BLACK-LETTER CAPITAL I
 109  	'ℜ': 'r', // U+211C BLACK-LETTER CAPITAL R
 110  	'ℨ': 'z', // U+2128 BLACK-LETTER CAPITAL Z
 111  }
 112  
 113  // hasDecorativeUnicode checks if text contains any small caps or fraktur characters
 114  // that would need normalization. Used by migration to identify events needing re-indexing.
 115  func hasDecorativeUnicode(s string) bool {
 116  	for _, r := range s {
 117  		// Check small caps
 118  		if _, ok := smallCapsToASCII[r]; ok {
 119  			return true
 120  		}
 121  		// Check fraktur lowercase range
 122  		if r >= 0x1D51E && r <= 0x1D537 {
 123  			return true
 124  		}
 125  		// Check fraktur uppercase range
 126  		if r >= 0x1D504 && r <= 0x1D51C {
 127  			return true
 128  		}
 129  		// Check letterlike symbols fraktur
 130  		if _, ok := frakturLetterlikeToASCII[r]; ok {
 131  			return true
 132  		}
 133  	}
 134  	return false
 135  }
 136