encoding.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package encoding defines an interface for character encodings, such as Shift
   6  // JIS and Windows 1252, that can convert to and from UTF-8.
   7  //
   8  // Encoding implementations are provided in other packages, such as
   9  // golang.org/x/text/encoding/charmap and
  10  // golang.org/x/text/encoding/japanese.
  11  package encoding // import "golang.org/x/text/encoding"
  12  
  13  import (
  14  	"errors"
  15  	"io"
  16  	"strconv"
  17  	"unicode/utf8"
  18  
  19  	"golang.org/x/text/encoding/internal/identifier"
  20  	"golang.org/x/text/transform"
  21  )
  22  
  23  // TODO:
  24  // - There seems to be some inconsistency in when decoders return errors
  25  //   and when not. Also documentation seems to suggest they shouldn't return
  26  //   errors at all (except for UTF-16).
  27  // - Encoders seem to rely on or at least benefit from the input being in NFC
  28  //   normal form. Perhaps add an example how users could prepare their output.
  29  
  30  // Encoding is a character set encoding that can be transformed to and from
  31  // UTF-8.
  32  type Encoding interface {
  33  	// NewDecoder returns a Decoder.
  34  	NewDecoder() *Decoder
  35  
  36  	// NewEncoder returns an Encoder.
  37  	NewEncoder() *Encoder
  38  }
  39  
  40  // A Decoder converts bytes to UTF-8. It implements transform.Transformer.
  41  //
  42  // Transforming source bytes that are not of that encoding will not result in an
  43  // error per se. Each byte that cannot be transcoded will be represented in the
  44  // output by the UTF-8 encoding of '\uFFFD', the replacement rune.
  45  type Decoder struct {
  46  	transform.Transformer
  47  
  48  	// This forces external creators of Decoders to use names in struct
  49  	// initializers, allowing for future extendibility without having to break
  50  	// code.
  51  	_ struct{}
  52  }
  53  
  54  // Bytes converts the given encoded bytes to UTF-8. It returns the converted
  55  // bytes or nil, err if any error occurred.
  56  func (d *Decoder) Bytes(b []byte) ([]byte, error) {
  57  	b, _, err := transform.Bytes(d, b)
  58  	if err != nil {
  59  		return nil, err
  60  	}
  61  	return b, nil
  62  }
  63  
  64  // String converts the given encoded string to UTF-8. It returns the converted
  65  // string or "", err if any error occurred.
  66  func (d *Decoder) String(s string) (string, error) {
  67  	s, _, err := transform.String(d, s)
  68  	if err != nil {
  69  		return "", err
  70  	}
  71  	return s, nil
  72  }
  73  
  74  // Reader wraps another Reader to decode its bytes.
  75  //
  76  // The Decoder may not be used for any other operation as long as the returned
  77  // Reader is in use.
  78  func (d *Decoder) Reader(r io.Reader) io.Reader {
  79  	return transform.NewReader(r, d)
  80  }
  81  
  82  // An Encoder converts bytes from UTF-8. It implements transform.Transformer.
  83  //
  84  // Each rune that cannot be transcoded will result in an error. In this case,
  85  // the transform will consume all source byte up to, not including the offending
  86  // rune. Transforming source bytes that are not valid UTF-8 will be replaced by
  87  // `\uFFFD`. To return early with an error instead, use transform.Chain to
  88  // preprocess the data with a UTF8Validator.
  89  type Encoder struct {
  90  	transform.Transformer
  91  
  92  	// This forces external creators of Encoders to use names in struct
  93  	// initializers, allowing for future extendibility without having to break
  94  	// code.
  95  	_ struct{}
  96  }
  97  
  98  // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
  99  // any error occurred.
 100  func (e *Encoder) Bytes(b []byte) ([]byte, error) {
 101  	b, _, err := transform.Bytes(e, b)
 102  	if err != nil {
 103  		return nil, err
 104  	}
 105  	return b, nil
 106  }
 107  
 108  // String converts a string from UTF-8. It returns the converted string or
 109  // "", err if any error occurred.
 110  func (e *Encoder) String(s string) (string, error) {
 111  	s, _, err := transform.String(e, s)
 112  	if err != nil {
 113  		return "", err
 114  	}
 115  	return s, nil
 116  }
 117  
 118  // Writer wraps another Writer to encode its UTF-8 output.
 119  //
 120  // The Encoder may not be used for any other operation as long as the returned
 121  // Writer is in use.
 122  func (e *Encoder) Writer(w io.Writer) io.Writer {
 123  	return transform.NewWriter(w, e)
 124  }
 125  
 126  // ASCIISub is the ASCII substitute character, as recommended by
 127  // https://unicode.org/reports/tr36/#Text_Comparison
 128  const ASCIISub = '\x1a'
 129  
 130  // Nop is the nop encoding. Its transformed bytes are the same as the source
 131  // bytes; it does not replace invalid UTF-8 sequences.
 132  var Nop Encoding = nop{}
 133  
 134  type nop struct{}
 135  
 136  func (nop) NewDecoder() *Decoder {
 137  	return &Decoder{Transformer: transform.Nop}
 138  }
 139  func (nop) NewEncoder() *Encoder {
 140  	return &Encoder{Transformer: transform.Nop}
 141  }
 142  
 143  // Replacement is the replacement encoding. Decoding from the replacement
 144  // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
 145  // the replacement encoding yields the same as the source bytes except that
 146  // invalid UTF-8 is converted to '\uFFFD'.
 147  //
 148  // It is defined at http://encoding.spec.whatwg.org/#replacement
 149  var Replacement Encoding = replacement{}
 150  
 151  type replacement struct{}
 152  
 153  func (replacement) NewDecoder() *Decoder {
 154  	return &Decoder{Transformer: replacementDecoder{}}
 155  }
 156  
 157  func (replacement) NewEncoder() *Encoder {
 158  	return &Encoder{Transformer: replacementEncoder{}}
 159  }
 160  
 161  func (replacement) ID() (mib identifier.MIB, other string) {
 162  	return identifier.Replacement, ""
 163  }
 164  
 165  type replacementDecoder struct{ transform.NopResetter }
 166  
 167  func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 168  	if len(dst) < 3 {
 169  		return 0, 0, transform.ErrShortDst
 170  	}
 171  	if atEOF {
 172  		const fffd = "\ufffd"
 173  		dst[0] = fffd[0]
 174  		dst[1] = fffd[1]
 175  		dst[2] = fffd[2]
 176  		nDst = 3
 177  	}
 178  	return nDst, len(src), nil
 179  }
 180  
 181  type replacementEncoder struct{ transform.NopResetter }
 182  
 183  func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 184  	r, size := rune(0), 0
 185  
 186  	for ; nSrc < len(src); nSrc += size {
 187  		r = rune(src[nSrc])
 188  
 189  		// Decode a 1-byte rune.
 190  		if r < utf8.RuneSelf {
 191  			size = 1
 192  
 193  		} else {
 194  			// Decode a multi-byte rune.
 195  			r, size = utf8.DecodeRune(src[nSrc:])
 196  			if size == 1 {
 197  				// All valid runes of size 1 (those below utf8.RuneSelf) were
 198  				// handled above. We have invalid UTF-8 or we haven't seen the
 199  				// full character yet.
 200  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
 201  					err = transform.ErrShortSrc
 202  					break
 203  				}
 204  				r = '\ufffd'
 205  			}
 206  		}
 207  
 208  		if nDst+utf8.RuneLen(r) > len(dst) {
 209  			err = transform.ErrShortDst
 210  			break
 211  		}
 212  		nDst += utf8.EncodeRune(dst[nDst:], r)
 213  	}
 214  	return nDst, nSrc, err
 215  }
 216  
 217  // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
 218  // repertoire of the destination encoding with HTML escape sequences.
 219  //
 220  // This wrapper exists to comply to URL and HTML forms requiring a
 221  // non-terminating legacy encoder. The produced sequences may lead to data
 222  // loss as they are indistinguishable from legitimate input. To avoid this
 223  // issue, use UTF-8 encodings whenever possible.
 224  func HTMLEscapeUnsupported(e *Encoder) *Encoder {
 225  	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
 226  }
 227  
 228  // ReplaceUnsupported wraps encoders to replace source runes outside the
 229  // repertoire of the destination encoding with an encoding-specific
 230  // replacement.
 231  //
 232  // This wrapper is only provided for backwards compatibility and legacy
 233  // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
 234  func ReplaceUnsupported(e *Encoder) *Encoder {
 235  	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
 236  }
 237  
 238  type errorHandler struct {
 239  	*Encoder
 240  	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
 241  }
 242  
 243  // TODO: consider making this error public in some form.
 244  type repertoireError interface {
 245  	Replacement() byte
 246  }
 247  
 248  func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 249  	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
 250  	for err != nil {
 251  		rerr, ok := err.(repertoireError)
 252  		if !ok {
 253  			return nDst, nSrc, err
 254  		}
 255  		r, sz := utf8.DecodeRune(src[nSrc:])
 256  		n, ok := h.handler(dst[nDst:], r, rerr)
 257  		if !ok {
 258  			return nDst, nSrc, transform.ErrShortDst
 259  		}
 260  		err = nil
 261  		nDst += n
 262  		if nSrc += sz; nSrc < len(src) {
 263  			var dn, sn int
 264  			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
 265  			nDst += dn
 266  			nSrc += sn
 267  		}
 268  	}
 269  	return nDst, nSrc, err
 270  }
 271  
 272  func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 273  	buf := [8]byte{}
 274  	b := strconv.AppendUint(buf[:0], uint64(r), 10)
 275  	if n = len(b) + len("&#;"); n >= len(dst) {
 276  		return 0, false
 277  	}
 278  	dst[0] = '&'
 279  	dst[1] = '#'
 280  	dst[copy(dst[2:], b)+2] = ';'
 281  	return n, true
 282  }
 283  
 284  func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 285  	if len(dst) == 0 {
 286  		return 0, false
 287  	}
 288  	dst[0] = err.Replacement()
 289  	return 1, true
 290  }
 291  
 292  // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
 293  var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
 294  
 295  // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
 296  // input byte that is not valid UTF-8.
 297  var UTF8Validator transform.Transformer = utf8Validator{}
 298  
 299  type utf8Validator struct{ transform.NopResetter }
 300  
 301  func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 302  	n := len(src)
 303  	if n > len(dst) {
 304  		n = len(dst)
 305  	}
 306  	for i := 0; i < n; {
 307  		if c := src[i]; c < utf8.RuneSelf {
 308  			dst[i] = c
 309  			i++
 310  			continue
 311  		}
 312  		_, size := utf8.DecodeRune(src[i:])
 313  		if size == 1 {
 314  			// All valid runes of size 1 (those below utf8.RuneSelf) were
 315  			// handled above. We have invalid UTF-8 or we haven't seen the
 316  			// full character yet.
 317  			err = ErrInvalidUTF8
 318  			if !atEOF && !utf8.FullRune(src[i:]) {
 319  				err = transform.ErrShortSrc
 320  			}
 321  			return i, i, err
 322  		}
 323  		if i+size > len(dst) {
 324  			return i, i, transform.ErrShortDst
 325  		}
 326  		for ; size > 0; size-- {
 327  			dst[i] = src[i]
 328  			i++
 329  		}
 330  	}
 331  	if len(src) > len(dst) {
 332  		err = transform.ErrShortDst
 333  	}
 334  	return n, n, err
 335  }
 336