parse.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package language
   6  
   7  import (
   8  	"errors"
   9  	"sort"
  10  	"strconv"
  11  	"strings"
  12  
  13  	"golang.org/x/text/internal/language"
  14  )
  15  
  16  // ValueError is returned by any of the parsing functions when the
  17  // input is well-formed but the respective subtag is not recognized
  18  // as a valid value.
  19  type ValueError interface {
  20  	error
  21  
  22  	// Subtag returns the subtag for which the error occurred.
  23  	Subtag() string
  24  }
  25  
  26  // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  27  // failed it returns an error and any part of the tag that could be parsed.
  28  // If parsing succeeded but an unknown value was found, it returns
  29  // ValueError. The Tag returned in this case is just stripped of the unknown
  30  // value. All other values are preserved. It accepts tags in the BCP 47 format
  31  // and extensions to this standard defined in
  32  // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  33  // The resulting tag is canonicalized using the default canonicalization type.
  34  func Parse(s string) (t Tag, err error) {
  35  	return Default.Parse(s)
  36  }
  37  
  38  // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  39  // failed it returns an error and any part of the tag that could be parsed.
  40  // If parsing succeeded but an unknown value was found, it returns
  41  // ValueError. The Tag returned in this case is just stripped of the unknown
  42  // value. All other values are preserved. It accepts tags in the BCP 47 format
  43  // and extensions to this standard defined in
  44  // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  45  // The resulting tag is canonicalized using the canonicalization type c.
  46  func (c CanonType) Parse(s string) (t Tag, err error) {
  47  	defer func() {
  48  		if recover() != nil {
  49  			t = Tag{}
  50  			err = language.ErrSyntax
  51  		}
  52  	}()
  53  
  54  	tt, err := language.Parse(s)
  55  	if err != nil {
  56  		return makeTag(tt), err
  57  	}
  58  	tt, changed := canonicalize(c, tt)
  59  	if changed {
  60  		tt.RemakeString()
  61  	}
  62  	return makeTag(tt), nil
  63  }
  64  
  65  // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  66  // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  67  // Base, Script or Region or slice of type Variant or Extension is passed more
  68  // than once, the latter will overwrite the former. Variants and Extensions are
  69  // accumulated, but if two extensions of the same type are passed, the latter
  70  // will replace the former. For -u extensions, though, the key-type pairs are
  71  // added, where later values overwrite older ones. A Tag overwrites all former
  72  // values and typically only makes sense as the first argument. The resulting
  73  // tag is returned after canonicalizing using the Default CanonType. If one or
  74  // more errors are encountered, one of the errors is returned.
  75  func Compose(part ...interface{}) (t Tag, err error) {
  76  	return Default.Compose(part...)
  77  }
  78  
  79  // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  80  // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  81  // Base, Script or Region or slice of type Variant or Extension is passed more
  82  // than once, the latter will overwrite the former. Variants and Extensions are
  83  // accumulated, but if two extensions of the same type are passed, the latter
  84  // will replace the former. For -u extensions, though, the key-type pairs are
  85  // added, where later values overwrite older ones. A Tag overwrites all former
  86  // values and typically only makes sense as the first argument. The resulting
  87  // tag is returned after canonicalizing using CanonType c. If one or more errors
  88  // are encountered, one of the errors is returned.
  89  func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
  90  	defer func() {
  91  		if recover() != nil {
  92  			t = Tag{}
  93  			err = language.ErrSyntax
  94  		}
  95  	}()
  96  
  97  	var b language.Builder
  98  	if err = update(&b, part...); err != nil {
  99  		return und, err
 100  	}
 101  	b.Tag, _ = canonicalize(c, b.Tag)
 102  	return makeTag(b.Make()), err
 103  }
 104  
 105  var errInvalidArgument = errors.New("invalid Extension or Variant")
 106  
 107  func update(b *language.Builder, part ...interface{}) (err error) {
 108  	for _, x := range part {
 109  		switch v := x.(type) {
 110  		case Tag:
 111  			b.SetTag(v.tag())
 112  		case Base:
 113  			b.Tag.LangID = v.langID
 114  		case Script:
 115  			b.Tag.ScriptID = v.scriptID
 116  		case Region:
 117  			b.Tag.RegionID = v.regionID
 118  		case Variant:
 119  			if v.variant == "" {
 120  				err = errInvalidArgument
 121  				break
 122  			}
 123  			b.AddVariant(v.variant)
 124  		case Extension:
 125  			if v.s == "" {
 126  				err = errInvalidArgument
 127  				break
 128  			}
 129  			b.SetExt(v.s)
 130  		case []Variant:
 131  			b.ClearVariants()
 132  			for _, v := range v {
 133  				b.AddVariant(v.variant)
 134  			}
 135  		case []Extension:
 136  			b.ClearExtensions()
 137  			for _, e := range v {
 138  				b.SetExt(e.s)
 139  			}
 140  		// TODO: support parsing of raw strings based on morphology or just extensions?
 141  		case error:
 142  			if v != nil {
 143  				err = v
 144  			}
 145  		}
 146  	}
 147  	return
 148  }
 149  
 150  var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
 151  var errTagListTooLarge = errors.New("tag list exceeds max length")
 152  
 153  // ParseAcceptLanguage parses the contents of an Accept-Language header as
 154  // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
 155  // a list of corresponding quality weights. It is more permissive than RFC 2616
 156  // and may return non-nil slices even if the input is not valid.
 157  // The Tags will be sorted by highest weight first and then by first occurrence.
 158  // Tags with a weight of zero will be dropped. An error will be returned if the
 159  // input could not be parsed.
 160  func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
 161  	defer func() {
 162  		if recover() != nil {
 163  			tag = nil
 164  			q = nil
 165  			err = language.ErrSyntax
 166  		}
 167  	}()
 168  
 169  	if strings.Count(s, "-") > 1000 {
 170  		return nil, nil, errTagListTooLarge
 171  	}
 172  
 173  	var entry string
 174  	for s != "" {
 175  		if entry, s = split(s, ','); entry == "" {
 176  			continue
 177  		}
 178  
 179  		entry, weight := split(entry, ';')
 180  
 181  		// Scan the language.
 182  		t, err := Parse(entry)
 183  		if err != nil {
 184  			id, ok := acceptFallback[entry]
 185  			if !ok {
 186  				return nil, nil, err
 187  			}
 188  			t = makeTag(language.Tag{LangID: id})
 189  		}
 190  
 191  		// Scan the optional weight.
 192  		w := 1.0
 193  		if weight != "" {
 194  			weight = consume(weight, 'q')
 195  			weight = consume(weight, '=')
 196  			// consume returns the empty string when a token could not be
 197  			// consumed, resulting in an error for ParseFloat.
 198  			if w, err = strconv.ParseFloat(weight, 32); err != nil {
 199  				return nil, nil, errInvalidWeight
 200  			}
 201  			// Drop tags with a quality weight of 0.
 202  			if w <= 0 {
 203  				continue
 204  			}
 205  		}
 206  
 207  		tag = append(tag, t)
 208  		q = append(q, float32(w))
 209  	}
 210  	sort.Stable(&tagSort{tag, q})
 211  	return tag, q, nil
 212  }
 213  
 214  // consume removes a leading token c from s and returns the result or the empty
 215  // string if there is no such token.
 216  func consume(s string, c byte) string {
 217  	if s == "" || s[0] != c {
 218  		return ""
 219  	}
 220  	return strings.TrimSpace(s[1:])
 221  }
 222  
 223  func split(s string, c byte) (head, tail string) {
 224  	if i := strings.IndexByte(s, c); i >= 0 {
 225  		return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
 226  	}
 227  	return strings.TrimSpace(s), ""
 228  }
 229  
 230  // Add hack mapping to deal with a small number of cases that occur
 231  // in Accept-Language (with reasonable frequency).
 232  var acceptFallback = map[string]language.Language{
 233  	"english": _en,
 234  	"deutsch": _de,
 235  	"italian": _it,
 236  	"french":  _fr,
 237  	"*":       _mul, // defined in the spec to match all languages.
 238  }
 239  
 240  type tagSort struct {
 241  	tag []Tag
 242  	q   []float32
 243  }
 244  
 245  func (s *tagSort) Len() int {
 246  	return len(s.q)
 247  }
 248  
 249  func (s *tagSort) Less(i, j int) bool {
 250  	return s.q[i] > s.q[j]
 251  }
 252  
 253  func (s *tagSort) Swap(i, j int) {
 254  	s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
 255  	s.q[i], s.q[j] = s.q[j], s.q[i]
 256  }
 257