forminfo.go raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package norm
   6  
   7  import "encoding/binary"
   8  
   9  // This file contains Form-specific logic and wrappers for data in tables.go.
  10  
  11  // Rune info is stored in a separate trie per composing form. A composing form
  12  // and its corresponding decomposing form share the same trie.  Each trie maps
  13  // a rune to a uint16. The values take two forms.  For v >= 0x8000:
  14  //   bits
  15  //   15:    1 (inverse of NFD_QC bit of qcInfo)
  16  //   12..7: qcInfo (see below). isYesD is always true (no decomposition).
  17  //    6..0: ccc (compressed CCC value).
  18  // For v < 0x8000, the respective rune has a decomposition and v is an index
  19  // into a byte array of UTF-8 decomposition sequences and additional info and
  20  // has the form:
  21  //    <header> <decomp_byte>* [<tccc> [<lccc>]]
  22  // The header contains the number of bytes in the decomposition (excluding this
  23  // length byte), with 33 mapped to 31 to fit in 5 bits.
  24  // (If any 31- or 32-byte decompositions come along, we could switch to using
  25  // use a general lookup table as long as there are at most 32 distinct lengths.)
  26  // The three most significant bits of this length byte correspond
  27  // to bit 5, 4, and 3 of qcInfo (see below).  The byte sequence itself starts at v+1.
  28  // The byte sequence is followed by a trailing and leading CCC if the values
  29  // for these are not zero.  The value of v determines which ccc are appended
  30  // to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
  31  // the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
  32  // there is an additional leading ccc. The value of tccc itself is the
  33  // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
  34  // are the number of trailing non-starters.
  35  
  36  const (
  37  	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
  38  	headerLenMask   = 0x1F // extract the length value from the header byte (31 => 33)
  39  	headerFlagsMask = 0xE0 // extract the qcInfo bits from the header byte
  40  )
  41  
  42  // Properties provides access to normalization properties of a rune.
  43  type Properties struct {
  44  	pos   uint8  // start position in reorderBuffer; used in composition.go
  45  	size  uint8  // length of UTF-8 encoding of this rune
  46  	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
  47  	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
  48  	nLead uint8  // number of leading non-starters.
  49  	flags qcInfo // quick check flags
  50  	index uint16
  51  }
  52  
  53  // functions dispatchable per form
  54  type lookupFunc func(b input, i int) Properties
  55  
  56  // formInfo holds Form-specific functions and tables.
  57  type formInfo struct {
  58  	form                     Form
  59  	composing, compatibility bool // form type
  60  	info                     lookupFunc
  61  	nextMain                 iterFunc
  62  }
  63  
  64  var formTable = []*formInfo{{
  65  	form:          NFC,
  66  	composing:     true,
  67  	compatibility: false,
  68  	info:          lookupInfoNFC,
  69  	nextMain:      nextComposed,
  70  }, {
  71  	form:          NFD,
  72  	composing:     false,
  73  	compatibility: false,
  74  	info:          lookupInfoNFC,
  75  	nextMain:      nextDecomposed,
  76  }, {
  77  	form:          NFKC,
  78  	composing:     true,
  79  	compatibility: true,
  80  	info:          lookupInfoNFKC,
  81  	nextMain:      nextComposed,
  82  }, {
  83  	form:          NFKD,
  84  	composing:     false,
  85  	compatibility: true,
  86  	info:          lookupInfoNFKC,
  87  	nextMain:      nextDecomposed,
  88  }}
  89  
  90  // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
  91  // unexpected behavior for the user.  For example, in NFD, there is a boundary
  92  // after 'a'.  However, 'a' might combine with modifiers, so from the application's
  93  // perspective it is not a good boundary. We will therefore always use the
  94  // boundaries for the combining variants.
  95  
  96  // BoundaryBefore returns true if this rune starts a new segment and
  97  // cannot combine with any rune on the left.
  98  func (p Properties) BoundaryBefore() bool {
  99  	if p.ccc == 0 && !p.combinesBackward() {
 100  		return true
 101  	}
 102  	// We assume that the CCC of the first character in a decomposition
 103  	// is always non-zero if different from info.ccc and that we can return
 104  	// false at this point. This is verified by maketables.
 105  	return false
 106  }
 107  
 108  // BoundaryAfter returns true if runes cannot combine with or otherwise
 109  // interact with this or previous runes.
 110  func (p Properties) BoundaryAfter() bool {
 111  	// TODO: loosen these conditions.
 112  	return p.isInert()
 113  }
 114  
 115  // We pack quick check data in 6 bits:
 116  //
 117  //	5:    Combines forward  (0 == false, 1 == true)
 118  //	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
 119  //	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
 120  //	1..0: Number of trailing non-starters.
 121  //
 122  // When all 6 bits are zero, the character is inert, meaning it is never
 123  // influenced by normalization.
 124  type qcInfo uint8
 125  
 126  func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
 127  func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
 128  
 129  func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
 130  func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
 131  func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
 132  
 133  func (p Properties) isInert() bool {
 134  	return p.flags&qcInfoMask == 0 && p.ccc == 0
 135  }
 136  
 137  func (p Properties) multiSegment() bool {
 138  	return p.index >= firstMulti && p.index < endMulti
 139  }
 140  
 141  func (p Properties) nLeadingNonStarters() uint8 {
 142  	return p.nLead
 143  }
 144  
 145  func (p Properties) nTrailingNonStarters() uint8 {
 146  	return uint8(p.flags & 0x03)
 147  }
 148  
 149  // Decomposition returns the decomposition for the underlying rune
 150  // or nil if there is none.
 151  func (p Properties) Decomposition() []byte {
 152  	// TODO: create the decomposition for Hangul?
 153  	if p.index == 0 {
 154  		return nil
 155  	}
 156  	i := p.index
 157  	n := decomps[i] & headerLenMask
 158  	if n == 31 {
 159  		n = 33
 160  	}
 161  	i++
 162  	return decomps[i : i+uint16(n)]
 163  }
 164  
 165  // Size returns the length of UTF-8 encoding of the rune.
 166  func (p Properties) Size() int {
 167  	return int(p.size)
 168  }
 169  
 170  // CCC returns the canonical combining class of the underlying rune.
 171  func (p Properties) CCC() uint8 {
 172  	if p.index >= firstCCCZeroExcept {
 173  		return 0
 174  	}
 175  	return ccc[p.ccc]
 176  }
 177  
 178  // LeadCCC returns the CCC of the first rune in the decomposition.
 179  // If there is no decomposition, LeadCCC equals CCC.
 180  func (p Properties) LeadCCC() uint8 {
 181  	return ccc[p.ccc]
 182  }
 183  
 184  // TrailCCC returns the CCC of the last rune in the decomposition.
 185  // If there is no decomposition, TrailCCC equals CCC.
 186  func (p Properties) TrailCCC() uint8 {
 187  	return ccc[p.tccc]
 188  }
 189  
 190  func buildRecompMap() {
 191  	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
 192  	var buf [8]byte
 193  	for i := 0; i < len(recompMapPacked); i += 8 {
 194  		copy(buf[:], recompMapPacked[i:i+8])
 195  		key := binary.BigEndian.Uint32(buf[:4])
 196  		val := binary.BigEndian.Uint32(buf[4:])
 197  		recompMap[key] = rune(val)
 198  	}
 199  }
 200  
 201  // Recomposition
 202  // We use 32-bit keys instead of 64-bit for the two codepoint keys.
 203  // This clips off the bits of three entries, but we know this will not
 204  // result in a collision. In the unlikely event that changes to
 205  // UnicodeData.txt introduce collisions, the compiler will catch it.
 206  // Note that the recomposition map for NFC and NFKC are identical.
 207  
 208  // combine returns the combined rune or 0 if it doesn't exist.
 209  //
 210  // The caller is responsible for calling
 211  // recompMapOnce.Do(buildRecompMap) sometime before this is called.
 212  func combine(a, b rune) rune {
 213  	key := uint32(uint16(a))<<16 + uint32(uint16(b))
 214  	if recompMap == nil {
 215  		panic("caller error") // see func comment
 216  	}
 217  	return recompMap[key]
 218  }
 219  
 220  func lookupInfoNFC(b input, i int) Properties {
 221  	v, sz := b.charinfoNFC(i)
 222  	return compInfo(v, sz)
 223  }
 224  
 225  func lookupInfoNFKC(b input, i int) Properties {
 226  	v, sz := b.charinfoNFKC(i)
 227  	return compInfo(v, sz)
 228  }
 229  
 230  // Properties returns properties for the first rune in s.
 231  func (f Form) Properties(s []byte) Properties {
 232  	if f == NFC || f == NFD {
 233  		return compInfo(nfcData.lookup(s))
 234  	}
 235  	return compInfo(nfkcData.lookup(s))
 236  }
 237  
 238  // PropertiesString returns properties for the first rune in s.
 239  func (f Form) PropertiesString(s string) Properties {
 240  	if f == NFC || f == NFD {
 241  		return compInfo(nfcData.lookupString(s))
 242  	}
 243  	return compInfo(nfkcData.lookupString(s))
 244  }
 245  
 246  // compInfo converts the information contained in v and sz
 247  // to a Properties.  See the comment at the top of the file
 248  // for more information on the format.
 249  func compInfo(v uint16, sz int) Properties {
 250  	if v == 0 {
 251  		return Properties{size: uint8(sz)}
 252  	} else if v >= 0x8000 {
 253  		p := Properties{
 254  			size:  uint8(sz),
 255  			ccc:   uint8(v),
 256  			tccc:  uint8(v),
 257  			flags: qcInfo(v >> 8),
 258  		}
 259  		if p.ccc > 0 || p.combinesBackward() {
 260  			p.nLead = uint8(p.flags & 0x3)
 261  		}
 262  		return p
 263  	}
 264  	// has decomposition
 265  	h := decomps[v]
 266  	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
 267  	p := Properties{size: uint8(sz), flags: f, index: v}
 268  	if v >= firstCCC {
 269  		n := uint16(h & headerLenMask)
 270  		if n == 31 {
 271  			n = 33
 272  		}
 273  		v += n + 1
 274  		c := decomps[v]
 275  		p.tccc = c >> 2
 276  		p.flags |= qcInfo(c & 0x3)
 277  		if v >= firstLeadingCCC {
 278  			p.nLead = c & 0x3
 279  			if v >= firstStarterWithNLead {
 280  				// We were tricked. Remove the decomposition.
 281  				p.flags &= 0x03
 282  				p.index = 0
 283  				return p
 284  			}
 285  			p.ccc = decomps[v+1]
 286  		}
 287  	}
 288  	return p
 289  }
 290