ianaindex.go raw

   1  // Copyright 2015 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:generate go run gen.go
   6  
   7  // Package ianaindex maps names to Encodings as specified by the IANA registry.
   8  // This includes both the MIME and IANA names.
   9  //
  10  // See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
  11  // more details.
  12  package ianaindex
  13  
  14  import (
  15  	"errors"
  16  	"sort"
  17  	"strings"
  18  
  19  	"golang.org/x/text/encoding"
  20  	"golang.org/x/text/encoding/charmap"
  21  	"golang.org/x/text/encoding/internal/identifier"
  22  	"golang.org/x/text/encoding/japanese"
  23  	"golang.org/x/text/encoding/korean"
  24  	"golang.org/x/text/encoding/simplifiedchinese"
  25  	"golang.org/x/text/encoding/traditionalchinese"
  26  	"golang.org/x/text/encoding/unicode"
  27  )
  28  
  29  // TODO: remove the "Status... incomplete" in the package doc comment.
  30  // TODO: allow users to specify their own aliases?
  31  // TODO: allow users to specify their own indexes?
  32  // TODO: allow canonicalizing names
  33  
  34  // NOTE: only use these top-level variables if we can get the linker to drop
  35  // the indexes when they are not used. Make them a function or perhaps only
  36  // support MIME otherwise.
  37  
  38  var (
  39  	// MIME is an index to map MIME names.
  40  	MIME *Index = mime
  41  
  42  	// IANA is an index that supports all names and aliases using IANA names as
  43  	// the canonical identifier.
  44  	IANA *Index = iana
  45  
  46  	// MIB is an index that associates the MIB display name with an Encoding.
  47  	MIB *Index = mib
  48  
  49  	mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
  50  	iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
  51  	mib  = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
  52  )
  53  
// Index maps names registered by IANA to Encodings.
// Currently different Indexes only differ in the names they return for
// encodings. In the future they may also differ in supported aliases.
type Index struct {
	names func(i int) string   // Name reported by Name for position i of toMIB
	toMIB []identifier.MIB     // Sorted slice of supported MIBs
	alias map[string]int       // Maps a name, as given or lowercased, to a position in enc
	enc   []encoding.Encoding // Encodings returned by Encoding; an entry may be nil
}
  63  
  64  var (
  65  	errInvalidName = errors.New("ianaindex: invalid encoding name")
  66  	errUnknown     = errors.New("ianaindex: unknown Encoding")
  67  	errUnsupported = errors.New("ianaindex: unsupported Encoding")
  68  )
  69  
  70  // Encoding returns an Encoding for IANA-registered names. Matching is
  71  // case-insensitive.
  72  //
  73  // If the provided name doesn't match a IANA-registered charset, an error is
  74  // returned. If the name matches a IANA-registered charset but isn't supported,
  75  // a nil encoding and a nil error are returned.
  76  func (x *Index) Encoding(name string) (encoding.Encoding, error) {
  77  	name = strings.TrimSpace(name)
  78  	// First try without lowercasing (possibly creating an allocation).
  79  	i, ok := x.alias[name]
  80  	if !ok {
  81  		i, ok = x.alias[strings.ToLower(name)]
  82  		if !ok {
  83  			return nil, errInvalidName
  84  		}
  85  	}
  86  	return x.enc[i], nil
  87  }
  88  
  89  // Name reports the canonical name of the given Encoding. It will return an
  90  // error if the e is not associated with a known encoding scheme.
  91  func (x *Index) Name(e encoding.Encoding) (string, error) {
  92  	id, ok := e.(identifier.Interface)
  93  	if !ok {
  94  		return "", errUnknown
  95  	}
  96  	mib, _ := id.ID()
  97  	if mib == 0 {
  98  		return "", errUnknown
  99  	}
 100  	v := findMIB(x.toMIB, mib)
 101  	if v == -1 {
 102  		return "", errUnsupported
 103  	}
 104  	return x.names(v), nil
 105  }
 106  
 107  // TODO: the coverage of this index is rather spotty. Allowing users to set
 108  // encodings would allow:
 109  // - users to increase coverage
// - allow a partially loaded set of encodings in case the user doesn't need
//   them all.
 112  // - write an OS-specific wrapper for supported encodings and set them.
 113  // The exact definition of Set depends a bit on if and how we want to let users
 114  // write their own Encoding implementations. Also, it is not possible yet to
 115  // only partially load the encodings without doing some refactoring. Until this
 116  // is solved, we might as well not support Set.
 117  // // Set sets the e to be used for the encoding scheme identified by name. Only
 118  // // canonical names may be used. An empty name assigns e to its internally
 119  // // associated encoding scheme.
 120  // func (x *Index) Set(name string, e encoding.Encoding) error {
 121  // 	panic("TODO: implement")
 122  // }
 123  
 124  func findMIB(x []identifier.MIB, mib identifier.MIB) int {
 125  	i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
 126  	if i < len(x) && x[i] == mib {
 127  		return i
 128  	}
 129  	return -1
 130  }
 131  
// maxMIMENameLen is the largest value of a name's first byte that mimeName
// and ianaName treat as an offset prefix rather than as part of the name.
// It is one less than the character '0', i.e. 47.
const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
 133  
 134  func mimeName(x int) string {
 135  	n := ianaNames[x]
 136  	// See gen.go for a description of the encoding.
 137  	if n[0] <= maxMIMENameLen {
 138  		return n[1:n[0]]
 139  	}
 140  	return n
 141  }
 142  
 143  func ianaName(x int) string {
 144  	n := ianaNames[x]
 145  	// See gen.go for a description of the encoding.
 146  	if n[0] <= maxMIMENameLen {
 147  		return n[n[0]:]
 148  	}
 149  	return n
 150  }
 151  
 152  func mibName(x int) string {
 153  	return mibNames[x]
 154  }
 155  
// encodings holds the Encoding for each supported charset, keyed by the encN
// index constants. Positions that are not listed keep the zero value, so
// Index.Encoding returns a nil Encoding together with a nil error for names
// that resolve to them. All three indexes (MIME, IANA and MIB) share this
// table.
//
// NOTE(review): the numeric suffix of each encN constant looks like the IANA
// MIBenum of the charset; the constants are defined outside this file, so
// confirm against the generated tables.
var encodings = [numIANA]encoding.Encoding{
	enc3:    asciiEnc,
	enc106:  unicode.UTF8,
	enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
	enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
	enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
	enc2028: charmap.CodePage037,
	enc2011: charmap.CodePage437,
	enc2009: charmap.CodePage850,
	enc2010: charmap.CodePage852,
	enc2046: charmap.CodePage855,
	enc2089: charmap.CodePage858,
	enc2048: charmap.CodePage860,
	enc2013: charmap.CodePage862,
	enc2050: charmap.CodePage863,
	enc2052: charmap.CodePage865,
	enc2086: charmap.CodePage866,
	enc2102: charmap.CodePage1047,
	enc2091: charmap.CodePage1140,
	enc4:    charmap.ISO8859_1,
	enc5:    charmap.ISO8859_2,
	enc6:    charmap.ISO8859_3,
	enc7:    charmap.ISO8859_4,
	enc8:    charmap.ISO8859_5,
	enc9:    charmap.ISO8859_6,
	enc81:   charmap.ISO8859_6E,
	enc82:   charmap.ISO8859_6I,
	enc10:   charmap.ISO8859_7,
	enc11:   charmap.ISO8859_8,
	enc84:   charmap.ISO8859_8E,
	enc85:   charmap.ISO8859_8I,
	enc12:   charmap.ISO8859_9,
	enc13:   charmap.ISO8859_10,
	enc109:  charmap.ISO8859_13,
	enc110:  charmap.ISO8859_14,
	enc111:  charmap.ISO8859_15,
	enc112:  charmap.ISO8859_16,
	enc2084: charmap.KOI8R,
	enc2088: charmap.KOI8U,
	enc2027: charmap.Macintosh,
	enc2109: charmap.Windows874,
	enc2250: charmap.Windows1250,
	enc2251: charmap.Windows1251,
	enc2252: charmap.Windows1252,
	enc2253: charmap.Windows1253,
	enc2254: charmap.Windows1254,
	enc2255: charmap.Windows1255,
	enc2256: charmap.Windows1256,
	enc2257: charmap.Windows1257,
	enc2258: charmap.Windows1258,
	enc18:   japanese.EUCJP,
	enc39:   japanese.ISO2022JP,
	enc17:   japanese.ShiftJIS,
	enc38:   korean.EUCKR,
	enc114:  simplifiedchinese.GB18030,
	enc113:  simplifiedchinese.GBK,
	enc2085: simplifiedchinese.HZGB2312,
	enc2026: traditionalchinese.Big5,
}
 215