1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 //go:generate go run makeisprint.go -output isprint.go
6 7 package strconv
8 9 import (
10 "unicode/utf8"
11 )
// Hexadecimal digit alphabets. Indexing one of these strings with a value
// in the range [0, 15] yields the matching hex digit character.
const (
	lowerhex = "0123456789abcdef" // lowercase digits; used below for \x, \u and \U escapes
	upperhex = "0123456789ABCDEF" // uppercase digits; not referenced in the visible part of this file
)
17 18 // contains reports whether the string contains the byte c.
19 func contains(s []byte, c byte) bool {
20 return index(s, c) != -1
21 }
22 23 func quoteWith(s []byte, quote byte, ASCIIonly, graphicOnly bool) []byte {
24 return []byte(appendQuotedWith([]byte{:0:3*len(s)/2}, s, quote, ASCIIonly, graphicOnly))
25 }
26 27 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
28 return []byte(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
29 }
30 31 func appendQuotedWith(buf []byte, s []byte, quote byte, ASCIIonly, graphicOnly bool) []byte {
32 // Often called with big strings, so preallocate. If there's quoting,
33 // this is conservative but still helps a lot.
34 if cap(buf)-len(buf) < len(s) {
35 nBuf := []byte{:len(buf):len(buf)+1+len(s)+1}
36 copy(nBuf, buf)
37 buf = nBuf
38 }
39 buf = append(buf, quote)
40 for width := 0; len(s) > 0; s = s[width:] {
41 r := rune(s[0])
42 width = 1
43 if r >= utf8.RuneSelf {
44 r, width = utf8.DecodeRuneInString(s)
45 }
46 if width == 1 && r == utf8.RuneError {
47 buf = append(buf, `\x`...)
48 buf = append(buf, lowerhex[s[0]>>4])
49 buf = append(buf, lowerhex[s[0]&0xF])
50 continue
51 }
52 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
53 }
54 buf = append(buf, quote)
55 return buf
56 }
57 58 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
59 buf = append(buf, quote)
60 if !utf8.ValidRune(r) {
61 r = utf8.RuneError
62 }
63 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
64 buf = append(buf, quote)
65 return buf
66 }
67 68 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
69 if r == rune(quote) || r == '\\' { // always backslashed
70 buf = append(buf, '\\')
71 buf = append(buf, byte(r))
72 return buf
73 }
74 if ASCIIonly {
75 if r < utf8.RuneSelf && IsPrint(r) {
76 buf = append(buf, byte(r))
77 return buf
78 }
79 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
80 return utf8.AppendRune(buf, r)
81 }
82 switch r {
83 case '\a':
84 buf = append(buf, `\a`...)
85 case '\b':
86 buf = append(buf, `\b`...)
87 case '\f':
88 buf = append(buf, `\f`...)
89 case '\n':
90 buf = append(buf, `\n`...)
91 case '\r':
92 buf = append(buf, `\r`...)
93 case '\t':
94 buf = append(buf, `\t`...)
95 case '\v':
96 buf = append(buf, `\v`...)
97 default:
98 switch {
99 case r < ' ' || r == 0x7f:
100 buf = append(buf, `\x`...)
101 buf = append(buf, lowerhex[byte(r)>>4])
102 buf = append(buf, lowerhex[byte(r)&0xF])
103 case !utf8.ValidRune(r):
104 r = 0xFFFD
105 buf = append(buf, `\u`...)
106 for s := 12; s >= 0; s -= 4 {
107 buf = append(buf, lowerhex[r>>uint(s)&0xF])
108 }
109 case r < 0x10000:
110 buf = append(buf, `\u`...)
111 for s := 12; s >= 0; s -= 4 {
112 buf = append(buf, lowerhex[r>>uint(s)&0xF])
113 }
114 default:
115 buf = append(buf, `\U`...)
116 for s := 28; s >= 0; s -= 4 {
117 buf = append(buf, lowerhex[r>>uint(s)&0xF])
118 }
119 }
120 }
121 return buf
122 }
123 124 // Quote returns a double-quoted Go string literal representing s. The
125 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
126 // control characters and non-printable characters as defined by
127 // [IsPrint].
128 func Quote(s []byte) []byte {
129 return quoteWith(s, '"', false, false)
130 }
131 132 // AppendQuote appends a double-quoted Go string literal representing s,
133 // as generated by [Quote], to dst and returns the extended buffer.
134 func AppendQuote(dst []byte, s []byte) []byte {
135 return appendQuotedWith(dst, s, '"', false, false)
136 }
137 138 // QuoteToASCII returns a double-quoted Go string literal representing s.
139 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
140 // non-ASCII characters and non-printable characters as defined by [IsPrint].
141 func QuoteToASCII(s []byte) []byte {
142 return quoteWith(s, '"', true, false)
143 }
144 145 // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
146 // as generated by [QuoteToASCII], to dst and returns the extended buffer.
147 func AppendQuoteToASCII(dst []byte, s []byte) []byte {
148 return appendQuotedWith(dst, s, '"', true, false)
149 }
150 151 // QuoteToGraphic returns a double-quoted Go string literal representing s.
152 // The returned string leaves Unicode graphic characters, as defined by
153 // [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
154 // for non-graphic characters.
155 func QuoteToGraphic(s []byte) []byte {
156 return quoteWith(s, '"', false, true)
157 }
158 159 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
160 // as generated by [QuoteToGraphic], to dst and returns the extended buffer.
161 func AppendQuoteToGraphic(dst []byte, s []byte) []byte {
162 return appendQuotedWith(dst, s, '"', false, true)
163 }
164 165 // QuoteRune returns a single-quoted Go character literal representing the
166 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
167 // for control characters and non-printable characters as defined by [IsPrint].
168 // If r is not a valid Unicode code point, it is interpreted as the Unicode
169 // replacement character U+FFFD.
170 func QuoteRune(r rune) []byte {
171 return quoteRuneWith(r, '\'', false, false)
172 }
173 174 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
175 // as generated by [QuoteRune], to dst and returns the extended buffer.
176 func AppendQuoteRune(dst []byte, r rune) []byte {
177 return appendQuotedRuneWith(dst, r, '\'', false, false)
178 }
179 180 // QuoteRuneToASCII returns a single-quoted Go character literal representing
181 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
182 // \u0100) for non-ASCII characters and non-printable characters as defined
183 // by [IsPrint].
184 // If r is not a valid Unicode code point, it is interpreted as the Unicode
185 // replacement character U+FFFD.
186 func QuoteRuneToASCII(r rune) []byte {
187 return quoteRuneWith(r, '\'', true, false)
188 }
189 190 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
191 // as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
192 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
193 return appendQuotedRuneWith(dst, r, '\'', true, false)
194 }
195 196 // QuoteRuneToGraphic returns a single-quoted Go character literal representing
197 // the rune. If the rune is not a Unicode graphic character,
198 // as defined by [IsGraphic], the returned string will use a Go escape sequence
199 // (\t, \n, \xFF, \u0100).
200 // If r is not a valid Unicode code point, it is interpreted as the Unicode
201 // replacement character U+FFFD.
202 func QuoteRuneToGraphic(r rune) []byte {
203 return quoteRuneWith(r, '\'', false, true)
204 }
205 206 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
207 // as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
208 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
209 return appendQuotedRuneWith(dst, r, '\'', false, true)
210 }
// CanBackquote reports whether the bytes in s can be represented
// unchanged as a single-line backquoted string without control
// characters other than tab.
//
// It returns false if s contains a backquote, DEL, a control character
// other than tab, a byte order mark (U+FEFF), or a byte sequence that is
// not valid UTF-8.
func CanBackquote(s []byte) bool {
	for len(s) > 0 {
		r, wid := utf8.DecodeRune(s)
		s = s[wid:]
		if wid > 1 {
			if r == '\ufeff' {
				return false // BOMs are invisible and should not be quoted.
			}
			continue // All other multibyte runes are correctly encoded and assumed printable.
		}
		// A width-1 RuneError means the byte was not valid UTF-8.
		if r == utf8.RuneError {
			return false
		}
		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
			return false
		}
	}
	return true
}
// unhex converts a single hexadecimal digit character to its numeric value.
// ok is false, and v is zero, when b is not one of 0-9, a-f or A-F.
func unhex(b byte) (v rune, ok bool) {
	switch {
	case b >= '0' && b <= '9':
		v, ok = rune(b-'0'), true
	case b >= 'a' && b <= 'f':
		v, ok = rune(b-'a')+10, true
	case b >= 'A' && b <= 'F':
		v, ok = rune(b-'A')+10, true
	}
	return v, ok
}
247 248 // UnquoteChar decodes the first character or byte in the escaped string
249 // or character literal represented by the string s.
250 // It returns four values:
251 //
252 // 1. value, the decoded Unicode code point or byte value;
253 // 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
254 // 3. tail, the remainder of the string after the character; and
255 // 4. an error that will be nil if the character is syntactically valid.
256 //
257 // The second argument, quote, specifies the type of literal being parsed
258 // and therefore which escaped quote character is permitted.
259 // If set to a single quote, it permits the sequence \' and disallows unescaped '.
260 // If set to a double quote, it permits \" and disallows unescaped ".
261 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
262 func UnquoteChar(s []byte, quote byte) (value rune, multibyte bool, tail []byte, err error) {
263 // easy cases
264 if len(s) == 0 {
265 err = ErrSyntax
266 return
267 }
268 switch c := s[0]; {
269 case c == quote && (quote == '\'' || quote == '"'):
270 err = ErrSyntax
271 return
272 case c >= utf8.RuneSelf:
273 r, size := utf8.DecodeRuneInString(s)
274 return r, true, s[size:], nil
275 case c != '\\':
276 return rune(s[0]), false, s[1:], nil
277 }
278 279 // hard case: c is backslash
280 if len(s) <= 1 {
281 err = ErrSyntax
282 return
283 }
284 c := s[1]
285 s = s[2:]
286 287 switch c {
288 case 'a':
289 value = '\a'
290 case 'b':
291 value = '\b'
292 case 'f':
293 value = '\f'
294 case 'n':
295 value = '\n'
296 case 'r':
297 value = '\r'
298 case 't':
299 value = '\t'
300 case 'v':
301 value = '\v'
302 case 'x', 'u', 'U':
303 n := 0
304 switch c {
305 case 'x':
306 n = 2
307 case 'u':
308 n = 4
309 case 'U':
310 n = 8
311 }
312 var v rune
313 if len(s) < n {
314 err = ErrSyntax
315 return
316 }
317 for j := 0; j < n; j++ {
318 x, ok := unhex(s[j])
319 if !ok {
320 err = ErrSyntax
321 return
322 }
323 v = v<<4 | x
324 }
325 s = s[n:]
326 if c == 'x' {
327 // single-byte string, possibly not UTF-8
328 value = v
329 break
330 }
331 if !utf8.ValidRune(v) {
332 err = ErrSyntax
333 return
334 }
335 value = v
336 multibyte = true
337 case '0', '1', '2', '3', '4', '5', '6', '7':
338 v := rune(c) - '0'
339 if len(s) < 2 {
340 err = ErrSyntax
341 return
342 }
343 for j := 0; j < 2; j++ { // one digit already; two more
344 x := rune(s[j]) - '0'
345 if x < 0 || x > 7 {
346 err = ErrSyntax
347 return
348 }
349 v = (v << 3) | x
350 }
351 s = s[2:]
352 if v > 255 {
353 err = ErrSyntax
354 return
355 }
356 value = v
357 case '\\':
358 value = '\\'
359 case '\'', '"':
360 if c != quote {
361 err = ErrSyntax
362 return
363 }
364 value = rune(c)
365 default:
366 err = ErrSyntax
367 return
368 }
369 tail = s
370 return
371 }
372 373 // QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
374 // If s does not start with a valid quoted string, QuotedPrefix returns an error.
375 func QuotedPrefix(s []byte) ([]byte, error) {
376 out, _, err := unquote(s, false)
377 return out, err
378 }
379 380 // Unquote interprets s as a single-quoted, double-quoted,
381 // or backquoted Go string literal, returning the string value
382 // that s quotes. (If s is single-quoted, it would be a Go
383 // character literal; Unquote returns the corresponding
384 // one-character string. For an empty character literal
385 // Unquote returns the empty string.)
386 func Unquote(s []byte) ([]byte, error) {
387 out, rem, err := unquote(s, true)
388 if len(rem) > 0 {
389 return "", ErrSyntax
390 }
391 return out, err
392 }
393 394 // unquote parses a quoted string at the start of the input,
395 // returning the parsed prefix, the remaining suffix, and any parse errors.
396 // If unescape is true, the parsed prefix is unescaped,
397 // otherwise the input prefix is provided verbatim.
398 func unquote(in []byte, unescape bool) (out, rem []byte, err error) {
399 // Determine the quote form and optimistically find the terminating quote.
400 if len(in) < 2 {
401 return "", in, ErrSyntax
402 }
403 quote := in[0]
404 end := index(in[1:], quote)
405 if end < 0 {
406 return "", in, ErrSyntax
407 }
408 end += 2 // position after terminating quote; may be wrong if escape sequences are present
409 410 switch quote {
411 case '`':
412 switch {
413 case !unescape:
414 out = in[:end] // include quotes
415 case !contains(in[:end], '\r'):
416 out = in[len("`") : end-len("`")] // exclude quotes
417 default:
418 // Carriage return characters ('\r') inside raw string literals
419 // are discarded from the raw string value.
420 buf := []byte{:0:end-len("`")-len("\r")-len("`")}
421 for i := len("`"); i < end-len("`"); i++ {
422 if in[i] != '\r' {
423 buf = append(buf, in[i])
424 }
425 }
426 out = []byte(buf)
427 }
428 // NOTE: Prior implementations did not verify that raw strings consist
429 // of valid UTF-8 characters and we continue to not verify it as such.
430 // The Go specification does not explicitly require valid UTF-8,
431 // but only mention that it is implicitly valid for Go source code
432 // (which must be valid UTF-8).
433 return out, in[end:], nil
434 case '"', '\'':
435 // Handle quoted strings without any escape sequences.
436 if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
437 var valid bool
438 switch quote {
439 case '"':
440 valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
441 case '\'':
442 r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
443 valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
444 }
445 if valid {
446 out = in[:end]
447 if unescape {
448 out = out[1 : end-1] // exclude quotes
449 }
450 return out, in[end:], nil
451 }
452 }
453 454 // Handle quoted strings with escape sequences.
455 var buf []byte
456 in0 := in
457 in = in[1:] // skip starting quote
458 if unescape {
459 buf = []byte{:0:3*end/2} // try to avoid more allocations
460 }
461 for len(in) > 0 && in[0] != quote {
462 // Process the next character,
463 // rejecting any unescaped newline characters which are invalid.
464 r, multibyte, rem, err := UnquoteChar(in, quote)
465 if in[0] == '\n' || err != nil {
466 return "", in0, ErrSyntax
467 }
468 in = rem
469 470 // Append the character if unescaping the input.
471 if unescape {
472 if r < utf8.RuneSelf || !multibyte {
473 buf = append(buf, byte(r))
474 } else {
475 buf = utf8.AppendRune(buf, r)
476 }
477 }
478 479 // Single quoted strings must be a single character.
480 if quote == '\'' {
481 break
482 }
483 }
484 485 // Verify that the string ends with a terminating quote.
486 if !(len(in) > 0 && in[0] == quote) {
487 return "", in0, ErrSyntax
488 }
489 in = in[1:] // skip terminating quote
490 491 if unescape {
492 return []byte(buf), in, nil
493 }
494 return in0[:len(in0)-len(in)], in, nil
495 default:
496 return "", in, ErrSyntax
497 }
498 }
// bsearch is semantically the same as [slices.BinarySearch] (without NaN checks).
// It returns the smallest index at which v could be inserted into the sorted
// slice s while keeping it sorted, and whether s holds v at that index.
// We copied this function because we can not import "slices" here.
func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
	lo, hi := 0, len(s)
	for lo < hi {
		mid := lo + (hi-lo)/2
		if s[mid] >= v {
			hi = mid
			continue
		}
		lo = mid + 1
	}
	if lo == len(s) {
		return lo, false
	}
	return lo, s[lo] == v
}
515 516 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
517 // to give the same answer. It allows this package not to depend on unicode,
518 // and therefore not pull in all the Unicode tables. If the linker were better
519 // at tossing unused tables, we could get rid of this implementation.
520 // That would be nice.
521 522 // IsPrint reports whether the rune is defined as printable by Go, with
523 // the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
524 // symbols and ASCII space.
525 func IsPrint(r rune) bool {
526 // Fast check for Latin-1
527 if r <= 0xFF {
528 if 0x20 <= r && r <= 0x7E {
529 // All the ASCII is printable from space through DEL-1.
530 return true
531 }
532 if 0xA1 <= r && r <= 0xFF {
533 // Similarly for ¡ through ÿ...
534 return r != 0xAD // ...except for the bizarre soft hyphen.
535 }
536 return false
537 }
538 539 // Same algorithm, either on uint16 or uint32 value.
540 // First, find first i such that isPrint[i] >= x.
541 // This is the index of either the start or end of a pair that might span x.
542 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
543 // If we find x in a range, make sure x is not in isNotPrint list.
544 545 if 0 <= r && r < 1<<16 {
546 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
547 i, _ := bsearch(isPrint, rr)
548 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
549 return false
550 }
551 _, found := bsearch(isNotPrint, rr)
552 return !found
553 }
554 555 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
556 i, _ := bsearch(isPrint, rr)
557 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
558 return false
559 }
560 if r >= 0x20000 {
561 return true
562 }
563 r -= 0x10000
564 _, found := bsearch(isNotPrint, uint16(r))
565 return !found
566 }
567 568 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
569 // characters include letters, marks, numbers, punctuation, symbols, and
570 // spaces, from categories L, M, N, P, S, and Zs.
571 func IsGraphic(r rune) bool {
572 if IsPrint(r) {
573 return true
574 }
575 return isInGraphicList(r)
576 }
577 578 // isInGraphicList reports whether the rune is in the isGraphic list. This separation
579 // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
580 // Should be called only if IsPrint fails.
581 func isInGraphicList(r rune) bool {
582 // We know r must fit in 16 bits - see makeisprint.go.
583 if r > 0xFFFF {
584 return false
585 }
586 _, found := bsearch(isGraphic, uint16(r))
587 return found
588 }
589