encodedword.mx raw

   1  // Copyright 2015 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package mime
   6  
import (
	"bytes"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"
)
  16  
  17  // A WordEncoder is an RFC 2047 encoded-word encoder.
  18  type WordEncoder byte
  19  
  20  const (
  21  	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
  22  	BEncoding = WordEncoder('b')
  23  	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
  24  	QEncoding = WordEncoder('q')
  25  )
  26  
  27  var (
  28  	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
  29  )
  30  
  31  // Encode returns the encoded-word form of s. If s is ASCII without special
  32  // characters, it is returned unchanged. The provided charset is the IANA
  33  // charset name of s. It is case insensitive.
  34  func (e WordEncoder) Encode(charset, s string) string {
  35  	if !needsEncoding(s) {
  36  		return s
  37  	}
  38  	return e.encodeWord(charset, s)
  39  }
  40  
  41  func needsEncoding(s string) bool {
  42  	for _, b := range s {
  43  		if (b < ' ' || b > '~') && b != '\t' {
  44  			return true
  45  		}
  46  	}
  47  	return false
  48  }
  49  
  50  // encodeWord encodes a string into an encoded-word.
  51  func (e WordEncoder) encodeWord(charset, s string) string {
  52  	var buf bytes.Buffer
  53  	// Could use a hint like len(s)*3, but that's not enough for cases
  54  	// with word splits and too much for simpler inputs.
  55  	// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
  56  	buf.Grow(48)
  57  
  58  	e.openWord(&buf, charset)
  59  	if e == BEncoding {
  60  		e.bEncode(&buf, charset, s)
  61  	} else {
  62  		e.qEncode(&buf, charset, s)
  63  	}
  64  	closeWord(&buf)
  65  
  66  	return buf.String()
  67  }
  68  
  69  const (
  70  	// The maximum length of an encoded-word is 75 characters.
  71  	// See RFC 2047, section 2.
  72  	maxEncodedWordLen = 75
  73  	// maxContentLen is how much content can be encoded, ignoring the header and
  74  	// 2-byte footer.
  75  	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
  76  )
  77  
  78  var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
  79  
  80  // bEncode encodes s using base64 encoding and writes it to buf.
  81  func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
  82  	w := base64.NewEncoder(base64.StdEncoding, buf)
  83  	// If the charset is not UTF-8 or if the content is short, do not bother
  84  	// splitting the encoded-word.
  85  	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
  86  		io.WriteString(w, s)
  87  		w.Close()
  88  		return
  89  	}
  90  
  91  	var currentLen, last, runeLen int
  92  	for i := 0; i < len(s); i += runeLen {
  93  		// Multi-byte characters must not be split across encoded-words.
  94  		// See RFC 2047, section 5.3.
  95  		_, runeLen = utf8.DecodeRuneInString(s[i:])
  96  
  97  		if currentLen+runeLen <= maxBase64Len {
  98  			currentLen += runeLen
  99  		} else {
 100  			io.WriteString(w, s[last:i])
 101  			w.Close()
 102  			e.splitWord(buf, charset)
 103  			last = i
 104  			currentLen = runeLen
 105  		}
 106  	}
 107  	io.WriteString(w, s[last:])
 108  	w.Close()
 109  }
 110  
 111  // qEncode encodes s using Q encoding and writes it to buf. It splits the
 112  // encoded-words when necessary.
 113  func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
 114  	// We only split encoded-words when the charset is UTF-8.
 115  	if !isUTF8(charset) {
 116  		writeQString(buf, s)
 117  		return
 118  	}
 119  
 120  	var currentLen, runeLen int
 121  	for i := 0; i < len(s); i += runeLen {
 122  		b := s[i]
 123  		// Multi-byte characters must not be split across encoded-words.
 124  		// See RFC 2047, section 5.3.
 125  		var encLen int
 126  		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
 127  			runeLen, encLen = 1, 1
 128  		} else {
 129  			_, runeLen = utf8.DecodeRuneInString(s[i:])
 130  			encLen = 3 * runeLen
 131  		}
 132  
 133  		if currentLen+encLen > maxContentLen {
 134  			e.splitWord(buf, charset)
 135  			currentLen = 0
 136  		}
 137  		writeQString(buf, s[i:i+runeLen])
 138  		currentLen += encLen
 139  	}
 140  }
 141  
 142  // writeQString encodes s using Q encoding and writes it to buf.
 143  func writeQString(buf *bytes.Buffer, s string) {
 144  	for i := 0; i < len(s); i++ {
 145  		switch b := s[i]; {
 146  		case b == ' ':
 147  			buf.WriteByte('_')
 148  		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
 149  			buf.WriteByte(b)
 150  		default:
 151  			buf.WriteByte('=')
 152  			buf.WriteByte(upperhex[b>>4])
 153  			buf.WriteByte(upperhex[b&0x0f])
 154  		}
 155  	}
 156  }
 157  
 158  // openWord writes the beginning of an encoded-word into buf.
 159  func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
 160  	buf.WriteString("=?")
 161  	buf.WriteString(charset)
 162  	buf.WriteByte('?')
 163  	buf.WriteByte(byte(e))
 164  	buf.WriteByte('?')
 165  }
 166  
 167  // closeWord writes the end of an encoded-word into buf.
 168  func closeWord(buf *bytes.Buffer) {
 169  	buf.WriteString("?=")
 170  }
 171  
 172  // splitWord closes the current encoded-word and opens a new one.
 173  func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
 174  	closeWord(buf)
 175  	buf.WriteByte(' ')
 176  	e.openWord(buf, charset)
 177  }
 178  
 179  func isUTF8(charset string) bool {
 180  	return bytes.EqualFold(charset, "UTF-8")
 181  }
 182  
// upperhex maps a 4-bit value to its uppercase hexadecimal digit. It is
// indexed by writeQString when escaping a byte as "=XX".
const upperhex = "0123456789ABCDEF"
 184  
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is always passed in lower case. The utf-8,
	// iso-8859-1 and us-ascii charsets are handled by default and are
	// never passed to CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
 195  
 196  // Decode decodes an RFC 2047 encoded-word.
 197  func (d *WordDecoder) Decode(word string) (string, error) {
 198  	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
 199  	// Our decoder is permissive, we accept empty encoded-text.
 200  	if len(word) < 8 || !bytes.HasPrefix(word, "=?") || !bytes.HasSuffix(word, "?=") || bytes.Count(word, "?") != 4 {
 201  		return "", errInvalidWord
 202  	}
 203  	word = word[2 : len(word)-2]
 204  
 205  	// split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
 206  	charset, text, _ := bytes.Cut(word, "?")
 207  	if charset == "" {
 208  		return "", errInvalidWord
 209  	}
 210  	encoding, text, _ := bytes.Cut(text, "?")
 211  	if len(encoding) != 1 {
 212  		return "", errInvalidWord
 213  	}
 214  
 215  	content, err := decode(encoding[0], text)
 216  	if err != nil {
 217  		return "", err
 218  	}
 219  
 220  	var buf bytes.Buffer
 221  	if err := d.convert(&buf, charset, content); err != nil {
 222  		return "", err
 223  	}
 224  	return buf.String(), nil
 225  }
 226  
 227  // DecodeHeader decodes all encoded-words of the given string. It returns an
 228  // error if and only if [WordDecoder.CharsetReader] of d returns an error.
 229  func (d *WordDecoder) DecodeHeader(header string) (string, error) {
 230  	// If there is no encoded-word, returns before creating a buffer.
 231  	i := bytes.Index(header, "=?")
 232  	if i == -1 {
 233  		return header, nil
 234  	}
 235  
 236  	var buf bytes.Buffer
 237  
 238  	buf.WriteString(header[:i])
 239  	header = header[i:]
 240  
 241  	betweenWords := false
 242  	for {
 243  		start := bytes.Index(header, "=?")
 244  		if start == -1 {
 245  			break
 246  		}
 247  		cur := start + len("=?")
 248  
 249  		i := bytes.Index(header[cur:], "?")
 250  		if i == -1 {
 251  			break
 252  		}
 253  		charset := header[cur : cur+i]
 254  		cur += i + len("?")
 255  
 256  		if len(header) < cur+len("Q??=") {
 257  			break
 258  		}
 259  		encoding := header[cur]
 260  		cur++
 261  
 262  		if header[cur] != '?' {
 263  			break
 264  		}
 265  		cur++
 266  
 267  		j := bytes.Index(header[cur:], "?=")
 268  		if j == -1 {
 269  			break
 270  		}
 271  		text := header[cur : cur+j]
 272  		end := cur + j + len("?=")
 273  
 274  		content, err := decode(encoding, text)
 275  		if err != nil {
 276  			betweenWords = false
 277  			buf.WriteString(header[:start+2])
 278  			header = header[start+2:]
 279  			continue
 280  		}
 281  
 282  		// Write characters before the encoded-word. White-space and newline
 283  		// characters separating two encoded-words must be deleted.
 284  		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
 285  			buf.WriteString(header[:start])
 286  		}
 287  
 288  		if err := d.convert(&buf, charset, content); err != nil {
 289  			return "", err
 290  		}
 291  
 292  		header = header[end:]
 293  		betweenWords = true
 294  	}
 295  
 296  	if len(header) > 0 {
 297  		buf.WriteString(header)
 298  	}
 299  
 300  	return buf.String(), nil
 301  }
 302  
 303  func decode(encoding byte, text string) ([]byte, error) {
 304  	switch encoding {
 305  	case 'B', 'b':
 306  		return base64.StdEncoding.DecodeString(text)
 307  	case 'Q', 'q':
 308  		return qDecode(text)
 309  	default:
 310  		return nil, errInvalidWord
 311  	}
 312  }
 313  
 314  func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error {
 315  	switch {
 316  	case bytes.EqualFold("utf-8", charset):
 317  		buf.Write(content)
 318  	case bytes.EqualFold("iso-8859-1", charset):
 319  		for _, c := range content {
 320  			buf.WriteRune(rune(c))
 321  		}
 322  	case bytes.EqualFold("us-ascii", charset):
 323  		for _, c := range content {
 324  			if c >= utf8.RuneSelf {
 325  				buf.WriteRune(unicode.ReplacementChar)
 326  			} else {
 327  				buf.WriteByte(c)
 328  			}
 329  		}
 330  	default:
 331  		if d.CharsetReader == nil {
 332  			return fmt.Errorf("mime: unhandled charset %q", charset)
 333  		}
 334  		r, err := d.CharsetReader(bytes.ToLower(charset), bytes.NewReader(content))
 335  		if err != nil {
 336  			return err
 337  		}
 338  		if _, err = io.Copy(buf, r); err != nil {
 339  			return err
 340  		}
 341  	}
 342  	return nil
 343  }
 344  
 345  // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
 346  // one byte of non-whitespace.
 347  func hasNonWhitespace(s string) bool {
 348  	for _, b := range s {
 349  		switch b {
 350  		// Encoded-words can only be separated by linear white spaces which does
 351  		// not include vertical tabs (\v).
 352  		case ' ', '\t', '\n', '\r':
 353  		default:
 354  			return true
 355  		}
 356  	}
 357  	return false
 358  }
 359  
 360  // qDecode decodes a Q encoded string.
 361  func qDecode(s string) ([]byte, error) {
 362  	dec := []byte{:len(s)}
 363  	n := 0
 364  	for i := 0; i < len(s); i++ {
 365  		switch c := s[i]; {
 366  		case c == '_':
 367  			dec[n] = ' '
 368  		case c == '=':
 369  			if i+2 >= len(s) {
 370  				return nil, errInvalidWord
 371  			}
 372  			b, err := readHexByte(s[i+1], s[i+2])
 373  			if err != nil {
 374  				return nil, err
 375  			}
 376  			dec[n] = b
 377  			i += 2
 378  		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
 379  			dec[n] = c
 380  		default:
 381  			return nil, errInvalidWord
 382  		}
 383  		n++
 384  	}
 385  
 386  	return dec[:n], nil
 387  }
 388  
 389  // readHexByte returns the byte from its quoted-printable representation.
 390  func readHexByte(a, b byte) (byte, error) {
 391  	var hb, lb byte
 392  	var err error
 393  	if hb, err = fromHex(a); err != nil {
 394  		return 0, err
 395  	}
 396  	if lb, err = fromHex(b); err != nil {
 397  		return 0, err
 398  	}
 399  	return hb<<4 | lb, nil
 400  }
 401  
 402  func fromHex(b byte) (byte, error) {
 403  	switch {
 404  	case b >= '0' && b <= '9':
 405  		return b - '0', nil
 406  	case b >= 'A' && b <= 'F':
 407  		return b - 'A' + 10, nil
 408  	// Accept badly encoded bytes.
 409  	case b >= 'a' && b <= 'f':
 410  		return b - 'a' + 10, nil
 411  	}
 412  	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
 413  }
 414