bidirule.mx raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package bidirule implements the Bidi Rule defined by RFC 5893.
   6  //
   7  // This package is under development. The API may change without notice and
   8  // without preserving backward compatibility.
   9  package bidirule
  10  
  11  import (
  12  	"errors"
  13  	"unicode/utf8"
  14  
  15  	"golang.org/x/text/transform"
  16  	"golang.org/x/text/unicode/bidi"
  17  )
  18  
  19  // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
  20  // Internationalized Domain Names for Applications (IDNA)
  21  //
  22  // A label is an individual component of a domain name.  Labels are usually
  23  // shown separated by dots; for example, the domain name "www.example.com" is
  24  // composed of three labels: "www", "example", and "com".
  25  //
  26  // An RTL label is a label that contains at least one character of class R, AL,
  27  // or AN. An LTR label is any label that is not an RTL label.
  28  //
  29  // A "Bidi domain name" is a domain name that contains at least one RTL label.
  30  //
  31  //  The following guarantees can be made based on the above:
  32  //
  33  //  o  In a domain name consisting of only labels that satisfy the rule,
  34  //     the requirements of Section 3 are satisfied.  Note that even LTR
  35  //     labels and pure ASCII labels have to be tested.
  36  //
  37  //  o  In a domain name consisting of only LDH labels (as defined in the
  38  //     Definitions document [RFC5890]) and labels that satisfy the rule,
  39  //     the requirements of Section 3 are satisfied as long as a label
  40  //     that starts with an ASCII digit does not come after a
  41  //     right-to-left label.
  42  //
  43  //  No guarantee is given for other combinations.
  44  
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
// It is the error reported by Transformer.Span, and therefore by
// Transformer.Transform, when the input violates the rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
  47  
// ruleState is a state of the label-scanning state machine. Its values are
// used as indices into the transitions table.
type ruleState uint8

const (
	// ruleInitial is the zero value: no character has been consumed yet.
	ruleInitial ruleState = iota
	// ruleLTR is an LTR label whose most recent character may not end it.
	ruleLTR
	// ruleLTRFinal is an LTR label whose most recent character may end it.
	ruleLTRFinal
	// ruleRTL is an RTL label whose most recent character may not end it.
	ruleRTL
	// ruleRTLFinal is an RTL label whose most recent character may end it.
	ruleRTLFinal
	// ruleInvalid is entered when no transition matches; it is never left.
	ruleInvalid
)
  58  
// ruleTransition is one edge of the state machine: when the bit for the
// current character's Bidi class is set in mask, the machine moves to next.
type ruleTransition struct {
	// next is the state entered when the transition is taken.
	next ruleState
	// mask has bit (1 << class) set for every Bidi class that triggers the
	// transition.
	mask uint16
}
  63  
// transitions maps each ruleState to the two transitions that can be taken
// from it. The scanner tries entry 0 first and entry 1 second; if neither
// mask contains the bit of the current character's class, the state becomes
// ruleInvalid. The bracketed numbers in the comments below refer to the
// numbered conditions of the Bidi Rule in RFC 5893.
var transitions = [...][2]ruleTransition{
	// [2.1] The first character must be a character with Bidi property L, R, or
	// AL. If it has the R or AL property, it is an RTL label; if it has the L
	// property, it is an LTR label.
	ruleInitial: {
		{ruleLTRFinal, 1 << bidi.L},
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
	},
	ruleRTL: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3]
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleRTLFinal: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3] and NSM.
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	ruleLTR: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6].
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleLTRFinal: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6].
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	// A zero mask matches no class, so once ruleInvalid is entered every
	// further character falls through to the "no transition" case.
	ruleInvalid: {
		{ruleInvalid, 0},
		{ruleInvalid, 0},
	},
}
 121  
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa.
//
// exclusiveRTL holds the two class bits that must never both be set in
// Transformer.seen; the scanner rejects the input as soon as they are.
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
 125  
 126  // From RFC 5893
 127  // An RTL label is a label that contains at least one character of type
 128  // R, AL, or AN.
 129  //
 130  // An LTR label is any label that is not an RTL label.
 131  
 132  // Direction reports the direction of the given label as defined by RFC 5893.
 133  // The Bidi Rule does not have to be applied to labels of the category
 134  // LeftToRight.
 135  func Direction(b []byte) bidi.Direction {
 136  	for i := 0; i < len(b); {
 137  		e, sz := bidi.Lookup(b[i:])
 138  		if sz == 0 {
 139  			i++
 140  		}
 141  		c := e.Class()
 142  		if c == bidi.R || c == bidi.AL || c == bidi.AN {
 143  			return bidi.RightToLeft
 144  		}
 145  		i += sz
 146  	}
 147  	return bidi.LeftToRight
 148  }
 149  
 150  // DirectionString reports the direction of the given label as defined by RFC
 151  // 5893. The Bidi Rule does not have to be applied to labels of the category
 152  // LeftToRight.
 153  func DirectionString(s []byte) bidi.Direction {
 154  	for i := 0; i < len(s); {
 155  		e, sz := bidi.LookupString(s[i:])
 156  		if sz == 0 {
 157  			i++
 158  			continue
 159  		}
 160  		c := e.Class()
 161  		if c == bidi.R || c == bidi.AL || c == bidi.AN {
 162  			return bidi.RightToLeft
 163  		}
 164  		i += sz
 165  	}
 166  	return bidi.LeftToRight
 167  }
 168  
 169  // Valid reports whether b conforms to the BiDi rule.
 170  func Valid(b []byte) bool {
 171  	var t Transformer
 172  	if n, ok := t.advance(b); !ok || n < len(b) {
 173  		return false
 174  	}
 175  	return t.isFinal()
 176  }
 177  
 178  // ValidString reports whether s conforms to the BiDi rule.
 179  func ValidString(s []byte) bool {
 180  	var t Transformer
 181  	if n, ok := t.advanceString(s); !ok || n < len(s) {
 182  		return false
 183  	}
 184  	return t.isFinal()
 185  }
 186  
 187  // New returns a Transformer that verifies that input adheres to the Bidi Rule.
 188  func New() *Transformer {
 189  	return &Transformer{}
 190  }
 191  
// Transformer implements transform.Transformer (see Transform and Reset) and
// additionally provides Span. Its zero value is ready to use; it carries state
// between calls and must be Reset before being reused for another label.
type Transformer struct {
	// state is the current state of the rule state machine.
	state ruleState
	// NOTE(review): hasRTL is not read or written by any code visible in
	// this file — confirm whether it is still needed.
	hasRTL bool
	// seen accumulates bit (1 << class) for every Bidi class observed so far.
	seen uint16
}
 198  
 199  // A rule can only be violated for "Bidi Domain names", meaning if one of the
 200  // following categories has been observed.
 201  func (t *Transformer) isRTL() bool {
 202  	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
 203  	return t.seen&isRTL != 0
 204  }
 205  
 206  // Reset implements transform.Transformer.
 207  func (t *Transformer) Reset() { *t = Transformer{} }
 208  
 209  // Transform implements transform.Transformer. This Transformer has state and
 210  // needs to be reset between uses.
 211  func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 212  	if len(dst) < len(src) {
 213  		src = src[:len(dst)]
 214  		atEOF = false
 215  		err = transform.ErrShortDst
 216  	}
 217  	n, err1 := t.Span(src, atEOF)
 218  	copy(dst, src[:n])
 219  	if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
 220  		err = err1
 221  	}
 222  	return n, n, err
 223  }
 224  
 225  // Span returns the first n bytes of src that conform to the Bidi rule.
 226  func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
 227  	if t.state == ruleInvalid && t.isRTL() {
 228  		return 0, ErrInvalid
 229  	}
 230  	n, ok := t.advance(src)
 231  	switch {
 232  	case !ok:
 233  		err = ErrInvalid
 234  	case n < len(src):
 235  		if !atEOF {
 236  			err = transform.ErrShortSrc
 237  			break
 238  		}
 239  		err = ErrInvalid
 240  	case !t.isFinal():
 241  		err = ErrInvalid
 242  	}
 243  	return n, err
 244  }
 245  
 246  // Precomputing the ASCII values decreases running time for the ASCII fast path
 247  // by about 30%.
 248  var asciiTable [128]bidi.Properties
 249  
 250  func init() {
 251  	for i := range asciiTable {
 252  		p, _ := bidi.LookupRune(rune(i))
 253  		asciiTable[i] = p
 254  	}
 255  }
 256  
 257  func (t *Transformer) advance(s []byte) (n int, ok bool) {
 258  	var e bidi.Properties
 259  	var sz int
 260  	for n < len(s) {
 261  		if s[n] < utf8.RuneSelf {
 262  			e, sz = asciiTable[s[n]], 1
 263  		} else {
 264  			e, sz = bidi.Lookup(s[n:])
 265  			if sz <= 1 {
 266  				if sz == 1 {
 267  					// We always consider invalid UTF-8 to be invalid, even if
 268  					// the string has not yet been determined to be RTL.
 269  					// TODO: is this correct?
 270  					return n, false
 271  				}
 272  				return n, true // incomplete UTF-8 encoding
 273  			}
 274  		}
 275  		// TODO: using CompactClass would result in noticeable speedup.
 276  		// See unicode/bidi/prop.go:Properties.CompactClass.
 277  		c := uint16(1 << e.Class())
 278  		t.seen |= c
 279  		if t.seen&exclusiveRTL == exclusiveRTL {
 280  			t.state = ruleInvalid
 281  			return n, false
 282  		}
 283  		switch tr := transitions[t.state]; {
 284  		case tr[0].mask&c != 0:
 285  			t.state = tr[0].next
 286  		case tr[1].mask&c != 0:
 287  			t.state = tr[1].next
 288  		default:
 289  			t.state = ruleInvalid
 290  			if t.isRTL() {
 291  				return n, false
 292  			}
 293  		}
 294  		n += sz
 295  	}
 296  	return n, true
 297  }
 298  
 299  func (t *Transformer) advanceString(s []byte) (n int, ok bool) {
 300  	var e bidi.Properties
 301  	var sz int
 302  	for n < len(s) {
 303  		if s[n] < utf8.RuneSelf {
 304  			e, sz = asciiTable[s[n]], 1
 305  		} else {
 306  			e, sz = bidi.LookupString(s[n:])
 307  			if sz <= 1 {
 308  				if sz == 1 {
 309  					return n, false // invalid UTF-8
 310  				}
 311  				return n, true // incomplete UTF-8 encoding
 312  			}
 313  		}
 314  		// TODO: using CompactClass results in noticeable speedup.
 315  		// See unicode/bidi/prop.go:Properties.CompactClass.
 316  		c := uint16(1 << e.Class())
 317  		t.seen |= c
 318  		if t.seen&exclusiveRTL == exclusiveRTL {
 319  			t.state = ruleInvalid
 320  			return n, false
 321  		}
 322  		switch tr := transitions[t.state]; {
 323  		case tr[0].mask&c != 0:
 324  			t.state = tr[0].next
 325  		case tr[1].mask&c != 0:
 326  			t.state = tr[1].next
 327  		default:
 328  			t.state = ruleInvalid
 329  			if t.isRTL() {
 330  				return n, false
 331  			}
 332  		}
 333  		n += sz
 334  	}
 335  	return n, true
 336  }
 337