prop.mx raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package bidi
   6  
   7  import "unicode/utf8"
   8  
// Properties provides access to BiDi properties of runes.
//
// A Properties value is produced by Lookup, LookupString or LookupRune; the
// zero value is what those functions return for ill-formed or incomplete
// input.
type Properties struct {
	// entry is the value looked up for the rune. Its low four bits hold the
	// compact class (see Class) and its high four bits are non-zero for
	// brackets (see IsBracket).
	entry uint8

	// last is the final byte of the rune's UTF-8 encoding. The lookup
	// functions record it only for three-byte encodings; Class uses its low
	// four bits to tell the individual control characters apart.
	last uint8
}
  14  
// trie is the package-level trie consulted by the lookup functions to obtain
// the value for the final byte of a multi-byte UTF-8 encoding.
// NOTE(review): the meaning of the 0 argument is defined by newBidiTrie,
// which is not visible in this file.
var trie = newBidiTrie(0)
  16  
  17  // TODO: using this for bidirule reduces the running time by about 5%. Consider
  18  // if this is worth exposing or if we can find a way to speed up the Class
  19  // method.
  20  //
  21  // // CompactClass is like Class, but maps all of the BiDi control classes
  22  // // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
  23  // func (p Properties) CompactClass() Class {
  24  // 	return Class(p.entry & 0x0F)
  25  // }
  26  
  27  // Class returns the Bidi class for p.
  28  func (p Properties) Class() Class {
  29  	c := Class(p.entry & 0x0F)
  30  	if c == Control {
  31  		c = controlByteToClass[p.last&0xF]
  32  	}
  33  	return c
  34  }
  35  
  36  // IsBracket reports whether the rune is a bracket.
  37  func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
  38  
  39  // IsOpeningBracket reports whether the rune is an opening bracket.
  40  // IsBracket must return true.
  41  func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
  42  
  43  // TODO: find a better API and expose.
  44  func (p Properties) reverseBracket(r rune) rune {
  45  	return xorMasks[p.entry>>xorMaskShift] ^ r
  46  }
  47  
  48  var controlByteToClass = [16]Class{
  49  	0xD: LRO, // U+202D LeftToRightOverride,
  50  	0xE: RLO, // U+202E RightToLeftOverride,
  51  	0xA: LRE, // U+202A LeftToRightEmbedding,
  52  	0xB: RLE, // U+202B RightToLeftEmbedding,
  53  	0xC: PDF, // U+202C PopDirectionalFormat,
  54  	0x6: LRI, // U+2066 LeftToRightIsolate,
  55  	0x7: RLI, // U+2067 RightToLeftIsolate,
  56  	0x8: FSI, // U+2068 FirstStrongIsolate,
  57  	0x9: PDI, // U+2069 PopDirectionalIsolate,
  58  }
  59  
  60  // LookupRune returns properties for r.
  61  func LookupRune(r rune) (p Properties, size int) {
  62  	var buf [4]byte
  63  	n := utf8.EncodeRune(buf[:], r)
  64  	return Lookup(buf[:n])
  65  }
  66  
  67  // TODO: these lookup methods are based on the generated trie code. The returned
  68  // sizes have slightly different semantics from the generated code, in that it
  69  // always returns size==1 for an illegal UTF-8 byte (instead of the length
  70  // of the maximum invalid subsequence). Most Transformers, like unicode/norm,
  71  // leave invalid UTF-8 untouched, in which case it has performance benefits to
  72  // do so (without changing the semantics). Bidi requires the semantics used here
  73  // for the bidirule implementation to be compatible with the Go semantics.
// They ultimately should perhaps be adopted by all trie implementations, for
// convenience's sake.
  76  // This unrolled code also boosts performance of the secure/bidirule package by
  77  // about 30%.
  78  // So, to remove this code:
  79  //   - add option to trie generator to define return type.
  80  //   - always return 1 byte size for ill-formed UTF-8 runes.
  81  
  82  // Lookup returns properties for the first rune in s and the width in bytes of
  83  // its encoding. The size will be 0 if s does not hold enough bytes to complete
  84  // the encoding.
  85  func Lookup(s []byte) (p Properties, sz int) {
  86  	c0 := s[0]
  87  	switch {
  88  	case c0 < 0x80: // is ASCII
  89  		return Properties{entry: bidiValues[c0]}, 1
  90  	case c0 < 0xC2:
  91  		return Properties{}, 1
  92  	case c0 < 0xE0: // 2-byte UTF-8
  93  		if len(s) < 2 {
  94  			return Properties{}, 0
  95  		}
  96  		i := bidiIndex[c0]
  97  		c1 := s[1]
  98  		if c1 < 0x80 || 0xC0 <= c1 {
  99  			return Properties{}, 1
 100  		}
 101  		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
 102  	case c0 < 0xF0: // 3-byte UTF-8
 103  		if len(s) < 3 {
 104  			return Properties{}, 0
 105  		}
 106  		i := bidiIndex[c0]
 107  		c1 := s[1]
 108  		if c1 < 0x80 || 0xC0 <= c1 {
 109  			return Properties{}, 1
 110  		}
 111  		o := uint32(i)<<6 + uint32(c1)
 112  		i = bidiIndex[o]
 113  		c2 := s[2]
 114  		if c2 < 0x80 || 0xC0 <= c2 {
 115  			return Properties{}, 1
 116  		}
 117  		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
 118  	case c0 < 0xF8: // 4-byte UTF-8
 119  		if len(s) < 4 {
 120  			return Properties{}, 0
 121  		}
 122  		i := bidiIndex[c0]
 123  		c1 := s[1]
 124  		if c1 < 0x80 || 0xC0 <= c1 {
 125  			return Properties{}, 1
 126  		}
 127  		o := uint32(i)<<6 + uint32(c1)
 128  		i = bidiIndex[o]
 129  		c2 := s[2]
 130  		if c2 < 0x80 || 0xC0 <= c2 {
 131  			return Properties{}, 1
 132  		}
 133  		o = uint32(i)<<6 + uint32(c2)
 134  		i = bidiIndex[o]
 135  		c3 := s[3]
 136  		if c3 < 0x80 || 0xC0 <= c3 {
 137  			return Properties{}, 1
 138  		}
 139  		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
 140  	}
 141  	// Illegal rune
 142  	return Properties{}, 1
 143  }
 144  
 145  // LookupString returns properties for the first rune in s and the width in
 146  // bytes of its encoding. The size will be 0 if s does not hold enough bytes to
 147  // complete the encoding.
 148  func LookupString(s []byte) (p Properties, sz int) {
 149  	c0 := s[0]
 150  	switch {
 151  	case c0 < 0x80: // is ASCII
 152  		return Properties{entry: bidiValues[c0]}, 1
 153  	case c0 < 0xC2:
 154  		return Properties{}, 1
 155  	case c0 < 0xE0: // 2-byte UTF-8
 156  		if len(s) < 2 {
 157  			return Properties{}, 0
 158  		}
 159  		i := bidiIndex[c0]
 160  		c1 := s[1]
 161  		if c1 < 0x80 || 0xC0 <= c1 {
 162  			return Properties{}, 1
 163  		}
 164  		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
 165  	case c0 < 0xF0: // 3-byte UTF-8
 166  		if len(s) < 3 {
 167  			return Properties{}, 0
 168  		}
 169  		i := bidiIndex[c0]
 170  		c1 := s[1]
 171  		if c1 < 0x80 || 0xC0 <= c1 {
 172  			return Properties{}, 1
 173  		}
 174  		o := uint32(i)<<6 + uint32(c1)
 175  		i = bidiIndex[o]
 176  		c2 := s[2]
 177  		if c2 < 0x80 || 0xC0 <= c2 {
 178  			return Properties{}, 1
 179  		}
 180  		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
 181  	case c0 < 0xF8: // 4-byte UTF-8
 182  		if len(s) < 4 {
 183  			return Properties{}, 0
 184  		}
 185  		i := bidiIndex[c0]
 186  		c1 := s[1]
 187  		if c1 < 0x80 || 0xC0 <= c1 {
 188  			return Properties{}, 1
 189  		}
 190  		o := uint32(i)<<6 + uint32(c1)
 191  		i = bidiIndex[o]
 192  		c2 := s[2]
 193  		if c2 < 0x80 || 0xC0 <= c2 {
 194  			return Properties{}, 1
 195  		}
 196  		o = uint32(i)<<6 + uint32(c2)
 197  		i = bidiIndex[o]
 198  		c3 := s[3]
 199  		if c3 < 0x80 || 0xC0 <= c3 {
 200  			return Properties{}, 1
 201  		}
 202  		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
 203  	}
 204  	// Illegal rune
 205  	return Properties{}, 1
 206  }
 207