parse.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package language
   6  
   7  import (
   8  	"bytes"
   9  	"errors"
  10  	"fmt"
  11  	"sort"
  12  
  13  	"golang.org/x/text/internal/tag"
  14  )
  15  
  16  // isAlpha returns true if the byte is not a digit.
  17  // b must be an ASCII letter or digit.
  18  func isAlpha(b byte) bool {
  19  	return b > '9'
  20  }
  21  
  22  // isAlphaNum returns true if the string contains only ASCII letters or digits.
  23  func isAlphaNum(s []byte) bool {
  24  	for _, c := range s {
  25  		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
  26  			return false
  27  		}
  28  	}
  29  	return true
  30  }
  31  
// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// The scanner records it for tokens that are empty, longer than 8 bytes or
// contain non-alphanumeric bytes, and it replaces a previously recorded
// non-syntax error.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")

// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
  40  
  41  // ValueError is returned by any of the parsing functions when the
  42  // input is well-formed but the respective subtag is not recognized
  43  // as a valid value.
  44  type ValueError struct {
  45  	v [8]byte
  46  }
  47  
  48  // NewValueError creates a new ValueError.
  49  func NewValueError(tag []byte) ValueError {
  50  	var e ValueError
  51  	copy(e.v[:], tag)
  52  	return e
  53  }
  54  
  55  func (e ValueError) tag() []byte {
  56  	n := bytes.IndexByte(e.v[:], 0)
  57  	if n == -1 {
  58  		n = 8
  59  	}
  60  	return e.v[:n]
  61  }
  62  
  63  // Error implements the error interface.
  64  func (e ValueError) Error() string {
  65  	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
  66  }
  67  
  68  // Subtag returns the subtag for which the error occurred.
  69  func (e ValueError) Subtag() string {
  70  	return string(e.tag())
  71  }
  72  
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
// The scanner normalizes its buffer in place while scanning: '_' is rewritten
// to '-' on initialization and malformed tokens are removed from the buffer.
type scanner struct {
	// b is the input buffer. It is modified by the scanner routines.
	b []byte
	// bytes is inline storage that makeScannerString uses for b when the
	// input fits.
	bytes [max99thPercentileSize]byte
	// token is the current token, or nil if scan found no further token.
	token []byte
	start int // start position of the current token
	end   int // end position of the current token
	next  int // next point for scan
	// err is the error recorded by setError.
	err error
	// done is set by scan once the end of the buffer has been reached.
	done bool
}
  84  
  85  func makeScannerString(s string) scanner {
  86  	scan := scanner{}
  87  	if len(s) <= len(scan.bytes) {
  88  		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
  89  	} else {
  90  		scan.b = []byte(s)
  91  	}
  92  	scan.init()
  93  	return scan
  94  }
  95  
  96  // makeScanner returns a scanner using b as the input buffer.
  97  // b is not copied and may be modified by the scanner routines.
  98  func makeScanner(b []byte) scanner {
  99  	scan := scanner{b: b}
 100  	scan.init()
 101  	return scan
 102  }
 103  
 104  func (s *scanner) init() {
 105  	for i, c := range s.b {
 106  		if c == '_' {
 107  			s.b[i] = '-'
 108  		}
 109  	}
 110  	s.scan()
 111  }
 112  
 113  // restToLower converts the string between start and end to lower case.
 114  func (s *scanner) toLower(start, end int) {
 115  	for i := start; i < end; i++ {
 116  		c := s.b[i]
 117  		if 'A' <= c && c <= 'Z' {
 118  			s.b[i] += 'a' - 'A'
 119  		}
 120  	}
 121  }
 122  
 123  func (s *scanner) setError(e error) {
 124  	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
 125  		s.err = e
 126  	}
 127  }
 128  
// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
// The bytes inside the resized range are not initialized by this method; the
// caller is expected to copy the new contents to s.b[s.start:].
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		// diff is positive when growing and negative when shrinking.
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			// Not enough capacity: allocate a new buffer and carry over the
			// part before the range.
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			// Enough capacity: reslice the existing buffer.
			b = s.b[:n]
		}
		// Move everything after the old range to just after the new range.
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		// Keep next at the same distance from the end of the token.
		s.next = end + (s.next - s.end)
		s.end = end
	}
}
 149  
 150  // replace replaces the current token with repl.
 151  func (s *scanner) replace(repl string) {
 152  	s.resizeRange(s.start, s.end, len(repl))
 153  	copy(s.b[s.start:], repl)
 154  }
 155  
// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
// The error e is recorded through setError before the token is removed.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		// Token at the very start of the buffer: drop everything before
		// s.next, i.e. the token and the separator that follows it.
		s.b = s.b[:+copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		// Drop the token together with the byte preceding it by moving the
		// remainder (from s.end on) down to s.start-1.
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	// Resume scanning at the position where the removed token started.
	s.next = s.start
}
 169  
 170  // deleteRange removes the given range from s.b before the current token.
 171  func (s *scanner) deleteRange(start, end int) {
 172  	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
 173  	diff := end - start
 174  	s.next -= diff
 175  	s.start -= diff
 176  	s.end -= diff
 177  }
 178  
// scan parses the next token of a BCP 47 string.  Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
// On return s.token holds the new token, or nil if the input is exhausted,
// in which case s.done is set.
func (s *scanner) scan() (end int) {
	// The return value is the end of the token that was current on entry.
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			// No further separator: the token runs to the end of the buffer.
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		// Empty tokens are rejected along with oversized and non-alphanumeric
		// ones.
		if i < 1 || i > 8 || !isAlphaNum(token) {
			// gobble resets s.next to s.start, so the loop retries at the
			// same position with the bad token removed.
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	// End of input: a dangling separator is a syntax error and is stripped.
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}
 211  
 212  // acceptMinSize parses multiple tokens of the given size or greater.
 213  // It returns the end position of the last token consumed.
 214  func (s *scanner) acceptMinSize(min int) (end int) {
 215  	end = s.end
 216  	s.scan()
 217  	for ; len(s.token) >= min; s.scan() {
 218  		end = s.end
 219  	}
 220  	return end
 221  }
 222  
 223  // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
 224  // failed it returns an error and any part of the tag that could be parsed.
 225  // If parsing succeeded but an unknown value was found, it returns
 226  // ValueError. The Tag returned in this case is just stripped of the unknown
 227  // value. All other values are preserved. It accepts tags in the BCP 47 format
 228  // and extensions to this standard defined in
 229  // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 230  func Parse(s string) (t Tag, err error) {
 231  	// TODO: consider supporting old-style locale key-value pairs.
 232  	if s == "" {
 233  		return Und, ErrSyntax
 234  	}
 235  	defer func() {
 236  		if recover() != nil {
 237  			t = Und
 238  			err = ErrSyntax
 239  			return
 240  		}
 241  	}()
 242  	if len(s) <= maxAltTaglen {
 243  		b := [maxAltTaglen]byte{}
 244  		for i, c := range s {
 245  			// Generating invalid UTF-8 is okay as it won't match.
 246  			if 'A' <= c && c <= 'Z' {
 247  				c += 'a' - 'A'
 248  			} else if c == '_' {
 249  				c = '-'
 250  			}
 251  			b[i] = byte(c)
 252  		}
 253  		if t, ok := grandfathered(b); ok {
 254  			return t, nil
 255  		}
 256  	}
 257  	scan := makeScannerString(s)
 258  	return parse(&scan, s)
 259  }
 260  
// parse parses the tag held by scan, whose first token must already have been
// scanned, and returns the resulting Tag together with the error recorded on
// the scanner. s is the original input; it (or a prefix of it) is reused as
// the Tag's string when tag.Compare reports it equal to the normalized
// buffer, otherwise the buffer is converted to a new string.
func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		scan.toLower(0, len(scan.b))
		// A tag may only start with a single-character token if it is 'x'.
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		// A first token of four or more characters is rejected.
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan, true)
		if n := len(scan.token); n == 1 {
			// A single-character token introduces the extensions.
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			// Input remains that is neither part of the tag nor an
			// extension: flag it and cut it off.
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		// The buffer extends beyond the position recorded in pVariant, so
		// the Tag needs a string.
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		// Nothing follows the position recorded in pVariant: no string is
		// stored and both offsets are reset.
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}
 296  
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
// Errors are not returned directly; they are recorded on scan. Subtags whose
// lookup yields a zero ID are removed from the scanner's buffer.
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	// Rewrite the language token with the String form of the resolved ID.
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	// Three-character tokens starting with a letter are treated as extlangs.
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		if doNorm {
			lang, e := getLangID(scan.token)
			if lang != 0 {
				// Overwrite the language at langStart with the extlang and
				// move scan.start so that gobble removes what follows it.
				t.LangID = lang
				langStr := lang.String()
				copy(scan.b[langStart:], langStr)
				scan.b[langStart+len(langStr)] = '-'
				scan.start = langStart + len(langStr) + 1
			}
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A four-character token starting with a letter is taken as the script.
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A token of two or three characters is taken as the region.
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	// Everything from the current token onwards is lower-cased.
	scan.toLower(scan.start, len(scan.b))
	// Note that end is converted to a byte here.
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
 346  
// separator is the single '-' byte used with bytes.Join when subtags are
// reassembled.
var separator = []byte{'-'}
 348  
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
// Tokens of four or more characters that are not found in variantIndex are
// removed from the buffer and recorded as a ValueError. If the variants are
// not in strictly increasing variantIndex order they are sorted and rewritten
// in the buffer. end is the end position before the variants; the returned
// value is the end position after them. The parameter t is not used.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	// Fixed-size backing arrays avoid allocation for up to four variants.
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	// last is the highest variant index seen while the input is still sorted.
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				// Out of order or repeated: sorting is required.
				needSort = true
				// There is no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		// Compact the sorted slices in place, keeping one entry per index.
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			// Write the sorted, deduplicated variants back over the
			// original range.
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
 410  
 411  type variantsSort struct {
 412  	i []uint8
 413  	v [][]byte
 414  }
 415  
 416  func (s variantsSort) Len() int {
 417  	return len(s.i)
 418  }
 419  
 420  func (s variantsSort) Swap(i, j int) {
 421  	s.i[i], s.i[j] = s.i[j], s.i[i]
 422  	s.v[i], s.v[j] = s.v[j], s.v[i]
 423  }
 424  
 425  func (s variantsSort) Less(i, j int) bool {
 426  	return s.i[i] < s.i[j]
 427  }
 428  
 429  type bytesSort struct {
 430  	b [][]byte
 431  	n int // first n bytes to compare
 432  }
 433  
 434  func (b bytesSort) Len() int {
 435  	return len(b.b)
 436  }
 437  
 438  func (b bytesSort) Swap(i, j int) {
 439  	b.b[i], b.b[j] = b.b[j], b.b[i]
 440  }
 441  
 442  func (b bytesSort) Less(i, j int) bool {
 443  	for k := 0; k < b.n; k++ {
 444  		if b.b[i][k] == b.b[j][k] {
 445  			continue
 446  		}
 447  		return b.b[i][k] < b.b[j][k]
 448  	}
 449  	return false
 450  }
 451  
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
// Extensions are sorted by their first byte, with a private-use ('x')
// extension always placed last. Extensions that are too short are flagged
// with ErrSyntax and left out.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		// An extension needs at least 3 bytes, and at least 4 unless it is
		// private use.
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			// Fast path: this is the first extension and it is either
			// private use or the scanner has reached the end of the buffer,
			// so nothing needs to be reordered.
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			// Private use ends the list of extensions.
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	// Rebuild the extension part of the buffer from the collected pieces.
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
 491  
// parseExtension parses a single extension and returns the position of
// the extension end.
// On entry the current token must be the extension's single-character
// singleton. The 'u' and 't' extensions get dedicated handling; 'x' accepts
// tokens of any length and all other singletons accept tokens of length 2 or
// more.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
		// Attributes: tokens longer than 2 that directly follow the singleton.
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			// NOTE(review): last starts out empty, so for the first attribute
			// bytes.Compare returns 1 and this branch is always taken; the
			// attributes are therefore always re-sorted. Confirm whether
			// "!= 1" was intended.
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				// Sort on the first three bytes and write the attributes
				// back over their original range.
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Scan key-type sequences. A key is of length 2 and may be followed
		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			end = scan.end
			// Consume the type subtags of this key. The end < scan.end test
			// stops the loop when the scanner made no progress.
			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
				end = scan.end
			}
			// TODO: check key value validity
			if bytes.Compare(key, last) != 1 || scan.err != nil {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart := scan.start
					end = scan.end
					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
						end = scan.end
					}
					// Each entry spans a key together with its types.
					keys = append(keys, scan.b[keyStart:end])
				}
				// Stable sort on the two-byte key, so entries with the same
				// key keep their input order.
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					// Keep the first entry for each key; a later entry with
					// the same key but different contents is an error.
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					// Dropping duplicates made the result shorter: remove
					// the surplus bytes from the buffer.
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't': // https://www.ietf.org/rfc/rfc6497.txt
		scan.scan()
		// A token of 2 or 3 characters whose second byte is a letter starts
		// an embedded language tag, which is parsed without extlang
		// normalization and then lower-cased.
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan, false)
			scan.toLower(start, end)
		}
		// Fields: a two-character key whose second byte is a digit, followed
		// by tokens of length 3 or more.
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}
 579  
 580  // getExtension returns the name, body and end position of the extension.
 581  func getExtension(s string, p int) (end int, ext string) {
 582  	if s[p] == '-' {
 583  		p++
 584  	}
 585  	if s[p] == 'x' {
 586  		return len(s), s[p:]
 587  	}
 588  	end = nextExtension(s, p)
 589  	return end, s[p:end]
 590  }
 591  
 592  // nextExtension finds the next extension within the string, searching
 593  // for the -<char>- pattern from position p.
 594  // In the fast majority of cases, language tags will have at most
 595  // one extension and extensions tend to be small.
 596  func nextExtension(s string, p int) int {
 597  	for n := len(s) - 3; p < n; {
 598  		if s[p] == '-' {
 599  			if s[p+2] == '-' {
 600  				return p
 601  			}
 602  			p += 3
 603  		} else {
 604  			p++
 605  		}
 606  	}
 607  	return len(s)
 608  }
 609