1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package norm
6 7 import "encoding/binary"
8 9 // This file contains Form-specific logic and wrappers for data in tables.go.
10 11 // Rune info is stored in a separate trie per composing form. A composing form
12 // and its corresponding decomposing form share the same trie. Each trie maps
13 // a rune to a uint16. The values take two forms. For v >= 0x8000:
//	bits
//	15:    1 (inverse of NFD_QC bit of qcInfo)
//	13..8: qcInfo (see below). isYesD is always true (no decomposition).
//	 7..0: ccc (compressed CCC value).
18 // For v < 0x8000, the respective rune has a decomposition and v is an index
19 // into a byte array of UTF-8 decomposition sequences and additional info and
20 // has the form:
21 // <header> <decomp_byte>* [<tccc> [<lccc>]]
22 // The header contains the number of bytes in the decomposition (excluding this
23 // length byte), with 33 mapped to 31 to fit in 5 bits.
// (If any 31- or 32-byte decompositions come along, we could switch to using
// a general lookup table as long as there are at most 32 distinct lengths.)
26 // The three most significant bits of this length byte correspond
27 // to bit 5, 4, and 3 of qcInfo (see below). The byte sequence itself starts at v+1.
28 // The byte sequence is followed by a trailing and leading CCC if the values
29 // for these are not zero. The value of v determines which ccc are appended
30 // to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
32 // there is an additional leading ccc. The value of tccc itself is the
33 // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
34 // are the number of trailing non-starters.
// Bit masks used when decoding qcInfo values and decomposition header bytes.
const (
	// qcInfoMask keeps only the six quick check bits of a qcInfo.
	qcInfoMask = 1<<6 - 1

	// headerLenMask selects the five low bits of a header byte, which hold
	// the decomposition length (a stored 31 stands for 33).
	headerLenMask = 1<<5 - 1

	// headerFlagsMask selects the remaining three high bits of a header
	// byte, which hold qcInfo bits.
	headerFlagsMask = 0xFF &^ headerLenMask
)
// Properties provides access to normalization properties of a rune.
//
// The ccc and tccc fields hold compressed values: the accessor methods
// translate them through the ccc table before returning them.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // position of the decomposition header in decomps; 0 means no decomposition
}
// functions dispatchable per form

// lookupFunc is the signature shared by lookupInfoNFC and lookupInfoNFKC:
// given an input b and an index i into it, it returns the Properties found
// there.
type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form       // the normalization form this entry describes
	composing, compatibility bool       // form type
	info                     lookupFunc // property lookup used for this form
	nextMain                 iterFunc   // NOTE(review): presumably the main iteration step for this form; iterFunc is declared elsewhere
}
// formTable lists the formInfo for NFC, NFD, NFKC and NFKD, in that order.
// A decomposing form uses the same lookup function as its composing
// counterpart (NFD uses lookupInfoNFC, NFKD uses lookupInfoNFKC); the two
// differ only in the composing flag and the nextMain function.
//
// NOTE(review): the table is presumably indexed by Form value elsewhere, so
// the order of the entries should not be changed without checking callers.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
89 90 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
91 // unexpected behavior for the user. For example, in NFD, there is a boundary
92 // after 'a'. However, 'a' might combine with modifiers, so from the application's
93 // perspective it is not a good boundary. We will therefore always use the
94 // boundaries for the combining variants.
95 96 // BoundaryBefore returns true if this rune starts a new segment and
97 // cannot combine with any rune on the left.
98 func (p Properties) BoundaryBefore() bool {
99 if p.ccc == 0 && !p.combinesBackward() {
100 return true
101 }
102 // We assume that the CCC of the first character in a decomposition
103 // is always non-zero if different from info.ccc and that we can return
104 // false at this point. This is verified by maketables.
105 return false
106 }
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current check is deliberately strict: a boundary is reported only
// when isInert holds for the rune.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization. (isInert additionally requires the leading
// ccc to be zero.)
type qcInfo uint8
125 126 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
127 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
128 129 func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
130 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
131 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
132 133 func (p Properties) isInert() bool {
134 return p.flags&qcInfoMask == 0 && p.ccc == 0
135 }
136 137 func (p Properties) multiSegment() bool {
138 return p.index >= firstMulti && p.index < endMulti
139 }
// nLeadingNonStarters returns the number of leading non-starters stored in
// the nLead field.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
144 145 func (p Properties) nTrailingNonStarters() uint8 {
146 return uint8(p.flags & 0x03)
147 }
148 149 // Decomposition returns the decomposition for the underlying rune
150 // or nil if there is none.
151 func (p Properties) Decomposition() []byte {
152 // TODO: create the decomposition for Hangul?
153 if p.index == 0 {
154 return nil
155 }
156 i := p.index
157 n := decomps[i] & headerLenMask
158 if n == 31 {
159 n = 33
160 }
161 i++
162 return decomps[i : i+uint16(n)]
163 }
// Size returns the length of UTF-8 encoding of the rune.
// The value is the stored size field widened to an int.
func (p Properties) Size() int {
	return int(p.size)
}
169 170 // CCC returns the canonical combining class of the underlying rune.
171 func (p Properties) CCC() uint8 {
172 if p.index >= firstCCCZeroExcept {
173 return 0
174 }
175 return ccc[p.ccc]
176 }
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
//
// The stored ccc field is used as an index into the ccc table.
func (p Properties) LeadCCC() uint8 {
	return ccc[p.ccc]
}
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
//
// The stored tccc field is used as an index into the ccc table.
func (p Properties) TrailCCC() uint8 {
	return ccc[p.tccc]
}
189 190 func buildRecompMap() {
191 recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
192 var buf [8]byte
193 for i := 0; i < len(recompMapPacked); i += 8 {
194 copy(buf[:], recompMapPacked[i:i+8])
195 key := binary.BigEndian.Uint32(buf[:4])
196 val := binary.BigEndian.Uint32(buf[4:])
197 recompMap[key] = rune(val)
198 }
199 }
200 201 // Recomposition
202 // We use 32-bit keys instead of 64-bit for the two codepoint keys.
203 // This clips off the bits of three entries, but we know this will not
204 // result in a collision. In the unlikely event that changes to
205 // UnicodeData.txt introduce collisions, the compiler will catch it.
206 // Note that the recomposition map for NFC and NFKC are identical.
207 208 // combine returns the combined rune or 0 if it doesn't exist.
209 //
210 // The caller is responsible for calling
211 // recompMapOnce.Do(buildRecompMap) sometime before this is called.
212 func combine(a, b rune) rune {
213 key := uint32(uint16(a))<<16 + uint32(uint16(b))
214 if recompMap == nil {
215 panic("caller error") // see func comment
216 }
217 return recompMap[key]
218 }
219 220 func lookupInfoNFC(b input, i int) Properties {
221 v, sz := b.charinfoNFC(i)
222 return compInfo(v, sz)
223 }
224 225 func lookupInfoNFKC(b input, i int) Properties {
226 v, sz := b.charinfoNFKC(i)
227 return compInfo(v, sz)
228 }
229 230 // Properties returns properties for the first rune in s.
231 func (f Form) Properties(s []byte) Properties {
232 if f == NFC || f == NFD {
233 return compInfo(nfcData.lookup(s))
234 }
235 return compInfo(nfkcData.lookup(s))
236 }
237 238 // PropertiesString returns properties for the first rune in s.
239 func (f Form) PropertiesString(s string) Properties {
240 if f == NFC || f == NFD {
241 return compInfo(nfcData.lookupString(s))
242 }
243 return compInfo(nfkcData.lookupString(s))
244 }
245 246 // compInfo converts the information contained in v and sz
247 // to a Properties. See the comment at the top of the file
248 // for more information on the format.
249 func compInfo(v uint16, sz int) Properties {
250 if v == 0 {
251 return Properties{size: uint8(sz)}
252 } else if v >= 0x8000 {
253 p := Properties{
254 size: uint8(sz),
255 ccc: uint8(v),
256 tccc: uint8(v),
257 flags: qcInfo(v >> 8),
258 }
259 if p.ccc > 0 || p.combinesBackward() {
260 p.nLead = uint8(p.flags & 0x3)
261 }
262 return p
263 }
264 // has decomposition
265 h := decomps[v]
266 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
267 p := Properties{size: uint8(sz), flags: f, index: v}
268 if v >= firstCCC {
269 n := uint16(h & headerLenMask)
270 if n == 31 {
271 n = 33
272 }
273 v += n + 1
274 c := decomps[v]
275 p.tccc = c >> 2
276 p.flags |= qcInfo(c & 0x3)
277 if v >= firstLeadingCCC {
278 p.nLead = c & 0x3
279 if v >= firstStarterWithNLead {
280 // We were tricked. Remove the decomposition.
281 p.flags &= 0x03
282 p.index = 0
283 return p
284 }
285 p.ccc = decomps[v+1]
286 }
287 }
288 return p
289 }
290