encodedword.mx raw
1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package mime
6
import (
	"bytes"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"
)
16
// A WordEncoder is an RFC 2047 encoded-word encoder. Its value is the
// single-letter encoding identifier.
type WordEncoder byte

// Available encoded-word encodings.
const (
	// BEncoding is the Base64 encoding scheme defined by RFC 2045.
	BEncoding WordEncoder = 'b'
	// QEncoding is the Q-encoding scheme defined by RFC 2047.
	QEncoding WordEncoder = 'q'
)
26
// errInvalidWord is returned when input is not a well-formed RFC 2047
// encoded-word.
var errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
30
31 // Encode returns the encoded-word form of s. If s is ASCII without special
32 // characters, it is returned unchanged. The provided charset is the IANA
33 // charset name of s. It is case insensitive.
34 func (e WordEncoder) Encode(charset, s string) string {
35 if !needsEncoding(s) {
36 return s
37 }
38 return e.encodeWord(charset, s)
39 }
40
// needsEncoding reports whether s contains any character other than
// printable ASCII (' ' through '~') or a horizontal tab.
func needsEncoding(s string) bool {
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == '\t' {
			continue
		}
		// Every byte of a non-ASCII character is above '~', so a byte-wise
		// scan flags the same strings as a rune-wise one.
		if c < ' ' || c > '~' {
			return true
		}
	}
	return false
}
49
50 // encodeWord encodes a string into an encoded-word.
51 func (e WordEncoder) encodeWord(charset, s string) string {
52 var buf bytes.Buffer
53 // Could use a hint like len(s)*3, but that's not enough for cases
54 // with word splits and too much for simpler inputs.
55 // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
56 buf.Grow(48)
57
58 e.openWord(&buf, charset)
59 if e == BEncoding {
60 e.bEncode(&buf, charset, s)
61 } else {
62 e.qEncode(&buf, charset, s)
63 }
64 closeWord(&buf)
65
66 return buf.String()
67 }
68
const (
	// maxEncodedWordLen is the maximum length of an encoded-word, 75
	// characters. See RFC 2047, section 2.
	maxEncodedWordLen = 75
	// maxContentLen is the room left for encoded content once the
	// "=?UTF-8?q?" header and the 2-byte "?=" footer are accounted for.
	maxContentLen = maxEncodedWordLen - (len("=?UTF-8?q?") + len("?="))
)

// maxBase64Len is the decoded length corresponding to maxContentLen base64
// characters; bEncode uses it as the per-word limit on raw bytes.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
79
80 // bEncode encodes s using base64 encoding and writes it to buf.
81 func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
82 w := base64.NewEncoder(base64.StdEncoding, buf)
83 // If the charset is not UTF-8 or if the content is short, do not bother
84 // splitting the encoded-word.
85 if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
86 io.WriteString(w, s)
87 w.Close()
88 return
89 }
90
91 var currentLen, last, runeLen int
92 for i := 0; i < len(s); i += runeLen {
93 // Multi-byte characters must not be split across encoded-words.
94 // See RFC 2047, section 5.3.
95 _, runeLen = utf8.DecodeRuneInString(s[i:])
96
97 if currentLen+runeLen <= maxBase64Len {
98 currentLen += runeLen
99 } else {
100 io.WriteString(w, s[last:i])
101 w.Close()
102 e.splitWord(buf, charset)
103 last = i
104 currentLen = runeLen
105 }
106 }
107 io.WriteString(w, s[last:])
108 w.Close()
109 }
110
111 // qEncode encodes s using Q encoding and writes it to buf. It splits the
112 // encoded-words when necessary.
113 func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
114 // We only split encoded-words when the charset is UTF-8.
115 if !isUTF8(charset) {
116 writeQString(buf, s)
117 return
118 }
119
120 var currentLen, runeLen int
121 for i := 0; i < len(s); i += runeLen {
122 b := s[i]
123 // Multi-byte characters must not be split across encoded-words.
124 // See RFC 2047, section 5.3.
125 var encLen int
126 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
127 runeLen, encLen = 1, 1
128 } else {
129 _, runeLen = utf8.DecodeRuneInString(s[i:])
130 encLen = 3 * runeLen
131 }
132
133 if currentLen+encLen > maxContentLen {
134 e.splitWord(buf, charset)
135 currentLen = 0
136 }
137 writeQString(buf, s[i:i+runeLen])
138 currentLen += encLen
139 }
140 }
141
142 // writeQString encodes s using Q encoding and writes it to buf.
143 func writeQString(buf *bytes.Buffer, s string) {
144 for i := 0; i < len(s); i++ {
145 switch b := s[i]; {
146 case b == ' ':
147 buf.WriteByte('_')
148 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
149 buf.WriteByte(b)
150 default:
151 buf.WriteByte('=')
152 buf.WriteByte(upperhex[b>>4])
153 buf.WriteByte(upperhex[b&0x0f])
154 }
155 }
156 }
157
158 // openWord writes the beginning of an encoded-word into buf.
159 func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
160 buf.WriteString("=?")
161 buf.WriteString(charset)
162 buf.WriteByte('?')
163 buf.WriteByte(byte(e))
164 buf.WriteByte('?')
165 }
166
// closeWord appends the encoded-word terminator "?=" to buf.
func closeWord(buf *bytes.Buffer) {
	buf.WriteByte('?')
	buf.WriteByte('=')
}
171
172 // splitWord closes the current encoded-word and opens a new one.
173 func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
174 closeWord(buf)
175 buf.WriteByte(' ')
176 e.openWord(buf, charset)
177 }
178
// isUTF8 reports whether charset names UTF-8, ignoring case.
func isUTF8(charset string) bool {
	// charset is a string, so the comparison must use the strings package;
	// bytes.EqualFold only accepts []byte operands.
	return strings.EqualFold(charset, "UTF-8")
}
182
// upperhex holds the sixteen upper-case hexadecimal digits, indexed by
// nibble value. writeQString uses it to emit "=XX" escapes.
const upperhex = "0123456789ABCDEF"
184
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is lower-cased before being passed to CharsetReader.
	// The utf-8, iso-8859-1 and us-ascii charsets are handled by default
	// and are not passed to CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
195
196 // Decode decodes an RFC 2047 encoded-word.
197 func (d *WordDecoder) Decode(word string) (string, error) {
198 // See https://tools.ietf.org/html/rfc2047#section-2 for details.
199 // Our decoder is permissive, we accept empty encoded-text.
200 if len(word) < 8 || !bytes.HasPrefix(word, "=?") || !bytes.HasSuffix(word, "?=") || bytes.Count(word, "?") != 4 {
201 return "", errInvalidWord
202 }
203 word = word[2 : len(word)-2]
204
205 // split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
206 charset, text, _ := bytes.Cut(word, "?")
207 if charset == "" {
208 return "", errInvalidWord
209 }
210 encoding, text, _ := bytes.Cut(text, "?")
211 if len(encoding) != 1 {
212 return "", errInvalidWord
213 }
214
215 content, err := decode(encoding[0], text)
216 if err != nil {
217 return "", err
218 }
219
220 var buf bytes.Buffer
221 if err := d.convert(&buf, charset, content); err != nil {
222 return "", err
223 }
224 return buf.String(), nil
225 }
226
227 // DecodeHeader decodes all encoded-words of the given string. It returns an
228 // error if and only if [WordDecoder.CharsetReader] of d returns an error.
229 func (d *WordDecoder) DecodeHeader(header string) (string, error) {
230 // If there is no encoded-word, returns before creating a buffer.
231 i := bytes.Index(header, "=?")
232 if i == -1 {
233 return header, nil
234 }
235
236 var buf bytes.Buffer
237
238 buf.WriteString(header[:i])
239 header = header[i:]
240
241 betweenWords := false
242 for {
243 start := bytes.Index(header, "=?")
244 if start == -1 {
245 break
246 }
247 cur := start + len("=?")
248
249 i := bytes.Index(header[cur:], "?")
250 if i == -1 {
251 break
252 }
253 charset := header[cur : cur+i]
254 cur += i + len("?")
255
256 if len(header) < cur+len("Q??=") {
257 break
258 }
259 encoding := header[cur]
260 cur++
261
262 if header[cur] != '?' {
263 break
264 }
265 cur++
266
267 j := bytes.Index(header[cur:], "?=")
268 if j == -1 {
269 break
270 }
271 text := header[cur : cur+j]
272 end := cur + j + len("?=")
273
274 content, err := decode(encoding, text)
275 if err != nil {
276 betweenWords = false
277 buf.WriteString(header[:start+2])
278 header = header[start+2:]
279 continue
280 }
281
282 // Write characters before the encoded-word. White-space and newline
283 // characters separating two encoded-words must be deleted.
284 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
285 buf.WriteString(header[:start])
286 }
287
288 if err := d.convert(&buf, charset, content); err != nil {
289 return "", err
290 }
291
292 header = header[end:]
293 betweenWords = true
294 }
295
296 if len(header) > 0 {
297 buf.WriteString(header)
298 }
299
300 return buf.String(), nil
301 }
302
303 func decode(encoding byte, text string) ([]byte, error) {
304 switch encoding {
305 case 'B', 'b':
306 return base64.StdEncoding.DecodeString(text)
307 case 'Q', 'q':
308 return qDecode(text)
309 default:
310 return nil, errInvalidWord
311 }
312 }
313
314 func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error {
315 switch {
316 case bytes.EqualFold("utf-8", charset):
317 buf.Write(content)
318 case bytes.EqualFold("iso-8859-1", charset):
319 for _, c := range content {
320 buf.WriteRune(rune(c))
321 }
322 case bytes.EqualFold("us-ascii", charset):
323 for _, c := range content {
324 if c >= utf8.RuneSelf {
325 buf.WriteRune(unicode.ReplacementChar)
326 } else {
327 buf.WriteByte(c)
328 }
329 }
330 default:
331 if d.CharsetReader == nil {
332 return fmt.Errorf("mime: unhandled charset %q", charset)
333 }
334 r, err := d.CharsetReader(bytes.ToLower(charset), bytes.NewReader(content))
335 if err != nil {
336 return err
337 }
338 if _, err = io.Copy(buf, r); err != nil {
339 return err
340 }
341 }
342 return nil
343 }
344
// hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
// one byte that is not a space, tab, line feed or carriage return.
func hasNonWhitespace(s string) bool {
	for i := 0; i < len(s); i++ {
		// Encoded-words can only be separated by linear white space, which
		// does not include vertical tabs (\v).
		if c := s[i]; c != ' ' && c != '\t' && c != '\n' && c != '\r' {
			return true
		}
	}
	return false
}
359
360 // qDecode decodes a Q encoded string.
361 func qDecode(s string) ([]byte, error) {
362 dec := []byte{:len(s)}
363 n := 0
364 for i := 0; i < len(s); i++ {
365 switch c := s[i]; {
366 case c == '_':
367 dec[n] = ' '
368 case c == '=':
369 if i+2 >= len(s) {
370 return nil, errInvalidWord
371 }
372 b, err := readHexByte(s[i+1], s[i+2])
373 if err != nil {
374 return nil, err
375 }
376 dec[n] = b
377 i += 2
378 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
379 dec[n] = c
380 default:
381 return nil, errInvalidWord
382 }
383 n++
384 }
385
386 return dec[:n], nil
387 }
388
389 // readHexByte returns the byte from its quoted-printable representation.
390 func readHexByte(a, b byte) (byte, error) {
391 var hb, lb byte
392 var err error
393 if hb, err = fromHex(a); err != nil {
394 return 0, err
395 }
396 if lb, err = fromHex(b); err != nil {
397 return 0, err
398 }
399 return hb<<4 | lb, nil
400 }
401
// fromHex returns the value of the hexadecimal digit b. It returns an error
// if b is not a hex digit.
func fromHex(b byte) (byte, error) {
	if '0' <= b && b <= '9' {
		return b - '0', nil
	}
	if 'A' <= b && b <= 'F' {
		return b - 'A' + 10, nil
	}
	// Accept badly encoded (lower-case) bytes.
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10, nil
	}
	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
}
414