lookup.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package language
   6  
   7  import (
   8  	"bytes"
   9  	"fmt"
  10  	"sort"
  11  	"strconv"
  12  
  13  	"golang.org/x/text/internal/tag"
  14  )
  15  
  16  // findIndex tries to find the given tag in idx and returns a standardized error
  17  // if it could not be found.
  18  func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
  19  	if !tag.FixCase(form, key) {
  20  		return 0, ErrSyntax
  21  	}
  22  	i := idx.Index(key)
  23  	if i == -1 {
  24  		return 0, NewValueError(key)
  25  	}
  26  	return i, nil
  27  }
  28  
  29  func searchUint(imap []uint16, key uint16) int {
  30  	return sort.Search(len(imap), func(i int) bool {
  31  		return imap[i] >= key
  32  	})
  33  }
  34  
// Language is the compact numeric ID of a base language subtag.
//
// As the String method shows, 0 renders as "und", IDs below
// langNoIndexOffset are positions in the lang table, and IDs at or above
// langNoIndexOffset hold a three-letter code encoded in base 26.
type Language uint16
  36  
  37  // getLangID returns the langID of s if s is a canonical subtag
  38  // or langUnknown if s is not a canonical subtag.
  39  func getLangID(s []byte) (Language, error) {
  40  	if len(s) == 2 {
  41  		return getLangISO2(s)
  42  	}
  43  	return getLangISO3(s)
  44  }
  45  
  46  // TODO language normalization as well as the AliasMaps could be moved to the
  47  // higher level package, but it is a bit tricky to separate the generation.
  48  
  49  func (id Language) Canonicalize() (Language, AliasType) {
  50  	return normLang(id)
  51  }
  52  
  53  // normLang returns the mapped langID of id according to mapping m.
  54  func normLang(id Language) (Language, AliasType) {
  55  	k := sort.Search(len(AliasMap), func(i int) bool {
  56  		return AliasMap[i].From >= uint16(id)
  57  	})
  58  	if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
  59  		return Language(AliasMap[k].To), AliasTypes[k]
  60  	}
  61  	return id, AliasTypeUnknown
  62  }
  63  
  64  // getLangISO2 returns the langID for the given 2-letter ISO language code
  65  // or unknownLang if this does not exist.
  66  func getLangISO2(s []byte) (Language, error) {
  67  	if !tag.FixCase("zz", s) {
  68  		return 0, ErrSyntax
  69  	}
  70  	if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
  71  		return Language(i), nil
  72  	}
  73  	return 0, NewValueError(s)
  74  }
  75  
  76  const base = 'z' - 'a' + 1
  77  
  78  func strToInt(s []byte) uint {
  79  	v := uint(0)
  80  	for i := 0; i < len(s); i++ {
  81  		v *= base
  82  		v += uint(s[i] - 'a')
  83  	}
  84  	return v
  85  }
  86  
  87  // converts the given integer to the original ASCII string passed to strToInt.
  88  // len(s) must match the number of characters obtained.
  89  func intToStr(v uint, s []byte) {
  90  	for i := len(s) - 1; i >= 0; i-- {
  91  		s[i] = byte(v%base) + 'a'
  92  		v /= base
  93  	}
  94  }
  95  
  96  // getLangISO3 returns the langID for the given 3-letter ISO language code
  97  // or unknownLang if this does not exist.
  98  func getLangISO3(s []byte) (Language, error) {
  99  	if tag.FixCase("und", s) {
 100  		// first try to match canonical 3-letter entries
 101  		for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
 102  			if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
 103  				// We treat "und" as special and always translate it to "unspecified".
 104  				// Note that ZZ and Zzzz are private use and are not treated as
 105  				// unspecified by default.
 106  				id := Language(i)
 107  				if id == nonCanonicalUnd {
 108  					return 0, nil
 109  				}
 110  				return id, nil
 111  			}
 112  		}
 113  		if i := altLangISO3.Index(s); i != -1 {
 114  			return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
 115  		}
 116  		n := strToInt(s)
 117  		if langNoIndex[n/8]&(1<<(n%8)) != 0 {
 118  			return Language(n) + langNoIndexOffset, nil
 119  		}
 120  		// Check for non-canonical uses of ISO3.
 121  		for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
 122  			if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
 123  				return Language(i), nil
 124  			}
 125  		}
 126  		return 0, NewValueError(s)
 127  	}
 128  	return 0, ErrSyntax
 129  }
 130  
 131  // StringToBuf writes the string to b and returns the number of bytes
 132  // written.  cap(b) must be >= 3.
 133  func (id Language) StringToBuf(b []byte) int {
 134  	if id >= langNoIndexOffset {
 135  		intToStr(uint(id)-langNoIndexOffset, b[:3])
 136  		return 3
 137  	} else if id == 0 {
 138  		return copy(b, "und")
 139  	}
 140  	l := lang[id<<2:]
 141  	if l[3] == 0 {
 142  		return copy(b, l[:3])
 143  	}
 144  	return copy(b, l[:2])
 145  }
 146  
 147  // String returns the BCP 47 representation of the langID.
 148  // Use b as variable name, instead of id, to ensure the variable
 149  // used is consistent with that of Base in which this type is embedded.
 150  func (b Language) String() string {
 151  	if b == 0 {
 152  		return "und"
 153  	} else if b >= langNoIndexOffset {
 154  		b -= langNoIndexOffset
 155  		buf := [3]byte{}
 156  		intToStr(uint(b), buf[:])
 157  		return string(buf[:])
 158  	}
 159  	l := lang.Elem(int(b))
 160  	if l[3] == 0 {
 161  		return l[:3]
 162  	}
 163  	return l[:2]
 164  }
 165  
 166  // ISO3 returns the ISO 639-3 language code.
 167  func (b Language) ISO3() string {
 168  	if b == 0 || b >= langNoIndexOffset {
 169  		return b.String()
 170  	}
 171  	l := lang.Elem(int(b))
 172  	if l[3] == 0 {
 173  		return l[:3]
 174  	} else if l[2] == 0 {
 175  		return altLangISO3.Elem(int(l[3]))[:3]
 176  	}
 177  	// This allocation will only happen for 3-letter ISO codes
 178  	// that are non-canonical BCP 47 language identifiers.
 179  	return l[0:1] + l[2:4]
 180  }
 181  
 182  // IsPrivateUse reports whether this language code is reserved for private use.
 183  func (b Language) IsPrivateUse() bool {
 184  	return langPrivateStart <= b && b <= langPrivateEnd
 185  }
 186  
 187  // SuppressScript returns the script marked as SuppressScript in the IANA
 188  // language tag repository, or 0 if there is no such script.
 189  func (b Language) SuppressScript() Script {
 190  	if b < langNoIndexOffset {
 191  		return Script(suppressScript[b])
 192  	}
 193  	return 0
 194  }
 195  
// Region is the compact numeric ID of a region.
//
// As the String method shows, 0 is the unspecified region ("ZZ"), other IDs
// below isoRegionOffset render as a three-digit M.49 number, and IDs at or
// above isoRegionOffset are positions in regionISO once isoRegionOffset has
// been subtracted.
type Region uint16
 197  
 198  // getRegionID returns the region id for s if s is a valid 2-letter region code
 199  // or unknownRegion.
 200  func getRegionID(s []byte) (Region, error) {
 201  	if len(s) == 3 {
 202  		if isAlpha(s[0]) {
 203  			return getRegionISO3(s)
 204  		}
 205  		if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
 206  			return getRegionM49(int(i))
 207  		}
 208  	}
 209  	return getRegionISO2(s)
 210  }
 211  
 212  // getRegionISO2 returns the regionID for the given 2-letter ISO country code
 213  // or unknownRegion if this does not exist.
 214  func getRegionISO2(s []byte) (Region, error) {
 215  	i, err := findIndex(regionISO, s, "ZZ")
 216  	if err != nil {
 217  		return 0, err
 218  	}
 219  	return Region(i) + isoRegionOffset, nil
 220  }
 221  
 222  // getRegionISO3 returns the regionID for the given 3-letter ISO country code
 223  // or unknownRegion if this does not exist.
 224  func getRegionISO3(s []byte) (Region, error) {
 225  	if tag.FixCase("ZZZ", s) {
 226  		for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
 227  			if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
 228  				return Region(i) + isoRegionOffset, nil
 229  			}
 230  		}
 231  		for i := 0; i < len(altRegionISO3); i += 3 {
 232  			if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
 233  				return Region(altRegionIDs[i/3]), nil
 234  			}
 235  		}
 236  		return 0, NewValueError(s)
 237  	}
 238  	return 0, ErrSyntax
 239  }
 240  
 241  func getRegionM49(n int) (Region, error) {
 242  	if 0 < n && n <= 999 {
 243  		const (
 244  			searchBits = 7
 245  			regionBits = 9
 246  			regionMask = 1<<regionBits - 1
 247  		)
 248  		idx := n >> searchBits
 249  		buf := fromM49[m49Index[idx]:m49Index[idx+1]]
 250  		val := uint16(n) << regionBits // we rely on bits shifting out
 251  		i := sort.Search(len(buf), func(i int) bool {
 252  			return buf[i] >= val
 253  		})
 254  		if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
 255  			return Region(r & regionMask), nil
 256  		}
 257  	}
 258  	var e ValueError
 259  	fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
 260  	return 0, e
 261  }
 262  
 263  // normRegion returns a region if r is deprecated or 0 otherwise.
 264  // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
 265  // TODO: consider mapping split up regions to new most populous one (like CLDR).
 266  func normRegion(r Region) Region {
 267  	m := regionOldMap
 268  	k := sort.Search(len(m), func(i int) bool {
 269  		return m[i].From >= uint16(r)
 270  	})
 271  	if k < len(m) && m[k].From == uint16(r) {
 272  		return Region(m[k].To)
 273  	}
 274  	return 0
 275  }
 276  
// Bit flags for the per-region byte that Region.typ reads from regionTypes.
const (
	// iso3166UserAssigned is the flag tested by Region.IsPrivateUse.
	iso3166UserAssigned = 1 << iota
	// ccTLD is not referenced in this file; presumably it marks regions
	// whose code is also a country-code top-level domain — TODO confirm.
	ccTLD
	// bcp47Region is not referenced in this file; presumably it marks
	// regions that are valid BCP 47 region subtags — TODO confirm.
	bcp47Region
)
 282  
 283  func (r Region) typ() byte {
 284  	return regionTypes[r]
 285  }
 286  
 287  // String returns the BCP 47 representation for the region.
 288  // It returns "ZZ" for an unspecified region.
 289  func (r Region) String() string {
 290  	if r < isoRegionOffset {
 291  		if r == 0 {
 292  			return "ZZ"
 293  		}
 294  		return fmt.Sprintf("%03d", r.M49())
 295  	}
 296  	r -= isoRegionOffset
 297  	return regionISO.Elem(int(r))[:2]
 298  }
 299  
 300  // ISO3 returns the 3-letter ISO code of r.
 301  // Note that not all regions have a 3-letter ISO code.
 302  // In such cases this method returns "ZZZ".
 303  func (r Region) ISO3() string {
 304  	if r < isoRegionOffset {
 305  		return "ZZZ"
 306  	}
 307  	r -= isoRegionOffset
 308  	reg := regionISO.Elem(int(r))
 309  	switch reg[2] {
 310  	case 0:
 311  		return altRegionISO3[reg[3]:][:3]
 312  	case ' ':
 313  		return "ZZZ"
 314  	}
 315  	return reg[0:1] + reg[2:4]
 316  }
 317  
 318  // M49 returns the UN M.49 encoding of r, or 0 if this encoding
 319  // is not defined for r.
 320  func (r Region) M49() int {
 321  	return int(m49[r])
 322  }
 323  
 324  // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
 325  // may include private-use tags that are assigned by CLDR and used in this
 326  // implementation. So IsPrivateUse and IsCountry can be simultaneously true.
 327  func (r Region) IsPrivateUse() bool {
 328  	return r.typ()&iso3166UserAssigned != 0
 329  }
 330  
// Script is the compact numeric ID of a script.
//
// As the String method shows, 0 is the unspecified script ("Zzzz") and any
// other value is a position in the script table.
type Script uint16
 332  
 333  // getScriptID returns the script id for string s. It assumes that s
 334  // is of the format [A-Z][a-z]{3}.
 335  func getScriptID(idx tag.Index, s []byte) (Script, error) {
 336  	i, err := findIndex(idx, s, "Zzzz")
 337  	return Script(i), err
 338  }
 339  
 340  // String returns the script code in title case.
 341  // It returns "Zzzz" for an unspecified script.
 342  func (s Script) String() string {
 343  	if s == 0 {
 344  		return "Zzzz"
 345  	}
 346  	return script.Elem(int(s))
 347  }
 348  
 349  // IsPrivateUse reports whether this script code is reserved for private use.
 350  func (s Script) IsPrivateUse() bool {
 351  	return _Qaaa <= s && s <= _Qabx
 352  }
 353  
const (
	// maxAltTaglen is the length of "en-US-POSIX", which is as long as the
	// longest key in grandfatheredMap.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the array length of the grandfatheredMap keys.
	maxLen = maxAltTaglen
)
 358  
var (
	// grandfatheredMap holds a mapping from legacy and grandfathered tags to
	// their base language or index to more elaborate tag.
	//
	// Keys are written in lower case and are zero-padded to maxLen bytes.
	// A value >= 0 is used directly as a Language ID by grandfathered; a
	// negative value -k selects entry k-1 of altTags (see altTagIndex).
	grandfatheredMap = map[[maxLen]byte]int16{
		[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
		[maxLen]byte{'i', '-', 'a', 'm', 'i'}:                          _ami, // i-ami
		[maxLen]byte{'i', '-', 'b', 'n', 'n'}:                          _bnn, // i-bnn
		[maxLen]byte{'i', '-', 'h', 'a', 'k'}:                          _hak, // i-hak
		[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}:      _tlh, // i-klingon
		[maxLen]byte{'i', '-', 'l', 'u', 'x'}:                          _lb,  // i-lux
		[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}:           _nv,  // i-navajo
		[maxLen]byte{'i', '-', 'p', 'w', 'n'}:                          _pwn, // i-pwn
		[maxLen]byte{'i', '-', 't', 'a', 'o'}:                          _tao, // i-tao
		[maxLen]byte{'i', '-', 't', 'a', 'y'}:                          _tay, // i-tay
		[maxLen]byte{'i', '-', 't', 's', 'u'}:                          _tsu, // i-tsu
		[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}:                     _nb,  // no-bok
		[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}:                     _nn,  // no-nyn
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}:      _sfb, // sgn-BE-FR
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}:      _vgt, // sgn-BE-NL
		[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}:      _sgg, // sgn-CH-DE
		[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}:           _cmn, // zh-guoyu
		[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}:           _hak, // zh-hakka
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
		[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}:           _hsn, // zh-xiang

		// Grandfathered tags with no modern replacement will be converted as
		// follows:
		[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
		[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}:           -2, // en-GB-oed
		[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}:           -3, // i-default
		[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}:      -4, // i-enochian
		[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}:                     -5, // i-mingo
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}:                          -6, // zh-min

		// CLDR-specific tag.
		[maxLen]byte{'r', 'o', 'o', 't'}:                                    0,  // root
		[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX
	}

	// altTagIndex holds the byte offsets that delimit the entries of altTags:
	// entry k is altTags[altTagIndex[k]:altTagIndex[k+1]].
	altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}

	// altTags is the concatenation of the replacement tags for the negative
	// grandfatheredMap values, in order: "xtg-x-cel-gaulish",
	// "en-GB-oxendict", "en-x-i-default", "und-x-i-enochian",
	// "see-x-i-mingo", "nan-x-zh-min" and "en-US-u-va-posix".
	altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
)
 402  
 403  func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
 404  	if v, ok := grandfatheredMap[s]; ok {
 405  		if v < 0 {
 406  			return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
 407  		}
 408  		t.LangID = Language(v)
 409  		return t, true
 410  	}
 411  	return t, false
 412  }
 413