quote.mx raw

   1  // Copyright 2009 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:generate go run makeisprint.go -output isprint.go
   6  
   7  package strconv
   8  
   9  import (
  10  	"unicode/utf8"
  11  )
  12  
// Hex digit alphabets. In this file lowerhex is indexed by a 4-bit nibble
// value to emit the digits of \x, \u and \U escapes.
const (
	lowerhex = "0123456789abcdef"
	upperhex = "0123456789ABCDEF" // upper-case counterpart; not referenced in the visible part of this file
)
  17  
  18  // contains reports whether the string contains the byte c.
  19  func contains(s []byte, c byte) bool {
  20  	return index(s, c) != -1
  21  }
  22  
// quoteWith returns s as a literal delimited by quote. It is the allocating
// counterpart of appendQuotedWith and forwards ASCIIonly and graphicOnly to
// it unchanged.
func quoteWith(s []byte, quote byte, ASCIIonly, graphicOnly bool) []byte {
	// The {:0:cap} form presumably yields an empty slice with capacity
	// 3*len(s)/2, leaving headroom for escapes - TODO confirm dialect semantics.
	return []byte(appendQuotedWith([]byte{:0:3*len(s)/2}, s, quote, ASCIIonly, graphicOnly))
}
  26  
  27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
  28  	return []byte(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
  29  }
  30  
// appendQuotedWith appends s to buf as a literal delimited by quote and
// returns the extended buffer. Each decoded rune is escaped by
// appendEscapedRune; a byte that does not decode as UTF-8 is written as a
// two-digit \x escape instead.
func appendQuotedWith(buf []byte, s []byte, quote byte, ASCIIonly, graphicOnly bool) []byte {
	// Often called with big strings, so preallocate. If there's quoting,
	// this is conservative but still helps a lot.
	if cap(buf)-len(buf) < len(s) {
		// The {:len:cap} form presumably allocates a slice with the given
		// length and capacity (TODO confirm dialect semantics): room for the
		// existing contents, both quotes and the unescaped input.
		nBuf := []byte{:len(buf):len(buf)+1+len(s)+1}
		copy(nBuf, buf)
		buf = nBuf
	}
	buf = append(buf, quote)
	// width is assigned inside the body; the post statement then drops the
	// bytes consumed by this iteration.
	for width := 0; len(s) > 0; s = s[width:] {
		// Single-byte fast path; only bytes >= RuneSelf need full decoding.
		r := rune(s[0])
		width = 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}
		// A one-byte RuneError is how the decoder reports an invalid
		// encoding: emit the offending byte itself as \xNN.
		if width == 1 && r == utf8.RuneError {
			buf = append(buf, `\x`...)
			buf = append(buf, lowerhex[s[0]>>4])
			buf = append(buf, lowerhex[s[0]&0xF])
			continue
		}
		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
	}
	buf = append(buf, quote)
	return buf
}
  57  
  58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
  59  	buf = append(buf, quote)
  60  	if !utf8.ValidRune(r) {
  61  		r = utf8.RuneError
  62  	}
  63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
  64  	buf = append(buf, quote)
  65  	return buf
  66  }
  67  
  68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
  69  	if r == rune(quote) || r == '\\' { // always backslashed
  70  		buf = append(buf, '\\')
  71  		buf = append(buf, byte(r))
  72  		return buf
  73  	}
  74  	if ASCIIonly {
  75  		if r < utf8.RuneSelf && IsPrint(r) {
  76  			buf = append(buf, byte(r))
  77  			return buf
  78  		}
  79  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
  80  		return utf8.AppendRune(buf, r)
  81  	}
  82  	switch r {
  83  	case '\a':
  84  		buf = append(buf, `\a`...)
  85  	case '\b':
  86  		buf = append(buf, `\b`...)
  87  	case '\f':
  88  		buf = append(buf, `\f`...)
  89  	case '\n':
  90  		buf = append(buf, `\n`...)
  91  	case '\r':
  92  		buf = append(buf, `\r`...)
  93  	case '\t':
  94  		buf = append(buf, `\t`...)
  95  	case '\v':
  96  		buf = append(buf, `\v`...)
  97  	default:
  98  		switch {
  99  		case r < ' ' || r == 0x7f:
 100  			buf = append(buf, `\x`...)
 101  			buf = append(buf, lowerhex[byte(r)>>4])
 102  			buf = append(buf, lowerhex[byte(r)&0xF])
 103  		case !utf8.ValidRune(r):
 104  			r = 0xFFFD
 105  			buf = append(buf, `\u`...)
 106  			for s := 12; s >= 0; s -= 4 {
 107  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
 108  			}
 109  		case r < 0x10000:
 110  			buf = append(buf, `\u`...)
 111  			for s := 12; s >= 0; s -= 4 {
 112  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
 113  			}
 114  		default:
 115  			buf = append(buf, `\U`...)
 116  			for s := 28; s >= 0; s -= 4 {
 117  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
 118  			}
 119  		}
 120  	}
 121  	return buf
 122  }
 123  
 124  // Quote returns a double-quoted Go string literal representing s. The
 125  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
 126  // control characters and non-printable characters as defined by
 127  // [IsPrint].
 128  func Quote(s []byte) []byte {
 129  	return quoteWith(s, '"', false, false)
 130  }
 131  
 132  // AppendQuote appends a double-quoted Go string literal representing s,
 133  // as generated by [Quote], to dst and returns the extended buffer.
 134  func AppendQuote(dst []byte, s []byte) []byte {
 135  	return appendQuotedWith(dst, s, '"', false, false)
 136  }
 137  
 138  // QuoteToASCII returns a double-quoted Go string literal representing s.
 139  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
 140  // non-ASCII characters and non-printable characters as defined by [IsPrint].
 141  func QuoteToASCII(s []byte) []byte {
 142  	return quoteWith(s, '"', true, false)
 143  }
 144  
 145  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
 146  // as generated by [QuoteToASCII], to dst and returns the extended buffer.
 147  func AppendQuoteToASCII(dst []byte, s []byte) []byte {
 148  	return appendQuotedWith(dst, s, '"', true, false)
 149  }
 150  
 151  // QuoteToGraphic returns a double-quoted Go string literal representing s.
 152  // The returned string leaves Unicode graphic characters, as defined by
 153  // [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
 154  // for non-graphic characters.
 155  func QuoteToGraphic(s []byte) []byte {
 156  	return quoteWith(s, '"', false, true)
 157  }
 158  
 159  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
 160  // as generated by [QuoteToGraphic], to dst and returns the extended buffer.
 161  func AppendQuoteToGraphic(dst []byte, s []byte) []byte {
 162  	return appendQuotedWith(dst, s, '"', false, true)
 163  }
 164  
 165  // QuoteRune returns a single-quoted Go character literal representing the
 166  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
 167  // for control characters and non-printable characters as defined by [IsPrint].
 168  // If r is not a valid Unicode code point, it is interpreted as the Unicode
 169  // replacement character U+FFFD.
 170  func QuoteRune(r rune) []byte {
 171  	return quoteRuneWith(r, '\'', false, false)
 172  }
 173  
 174  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
 175  // as generated by [QuoteRune], to dst and returns the extended buffer.
 176  func AppendQuoteRune(dst []byte, r rune) []byte {
 177  	return appendQuotedRuneWith(dst, r, '\'', false, false)
 178  }
 179  
 180  // QuoteRuneToASCII returns a single-quoted Go character literal representing
 181  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
 182  // \u0100) for non-ASCII characters and non-printable characters as defined
 183  // by [IsPrint].
 184  // If r is not a valid Unicode code point, it is interpreted as the Unicode
 185  // replacement character U+FFFD.
 186  func QuoteRuneToASCII(r rune) []byte {
 187  	return quoteRuneWith(r, '\'', true, false)
 188  }
 189  
 190  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
 191  // as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
 192  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
 193  	return appendQuotedRuneWith(dst, r, '\'', true, false)
 194  }
 195  
 196  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
 197  // the rune. If the rune is not a Unicode graphic character,
 198  // as defined by [IsGraphic], the returned string will use a Go escape sequence
 199  // (\t, \n, \xFF, \u0100).
 200  // If r is not a valid Unicode code point, it is interpreted as the Unicode
 201  // replacement character U+FFFD.
 202  func QuoteRuneToGraphic(r rune) []byte {
 203  	return quoteRuneWith(r, '\'', false, true)
 204  }
 205  
 206  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
 207  // as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
 208  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
 209  	return appendQuotedRuneWith(dst, r, '\'', false, true)
 210  }
 211  
 212  // CanBackquote reports whether the string s can be represented
 213  // unchanged as a single-line backquoted string without control
 214  // characters other than tab.
 215  func CanBackquote(s []byte) bool {
 216  	for len(s) > 0 {
 217  		r, wid := utf8.DecodeRuneInString(s)
 218  		s = s[wid:]
 219  		if wid > 1 {
 220  			if r == '\ufeff' {
 221  				return false // BOMs are invisible and should not be quoted.
 222  			}
 223  			continue // All other multibyte runes are correctly encoded and assumed printable.
 224  		}
 225  		if r == utf8.RuneError {
 226  			return false
 227  		}
 228  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
 229  			return false
 230  		}
 231  	}
 232  	return true
 233  }
 234  
 235  func unhex(b byte) (v rune, ok bool) {
 236  	c := rune(b)
 237  	switch {
 238  	case '0' <= c && c <= '9':
 239  		return c - '0', true
 240  	case 'a' <= c && c <= 'f':
 241  		return c - 'a' + 10, true
 242  	case 'A' <= c && c <= 'F':
 243  		return c - 'A' + 10, true
 244  	}
 245  	return
 246  }
 247  
 248  // UnquoteChar decodes the first character or byte in the escaped string
 249  // or character literal represented by the string s.
 250  // It returns four values:
 251  //
 252  //  1. value, the decoded Unicode code point or byte value;
 253  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
 254  //  3. tail, the remainder of the string after the character; and
 255  //  4. an error that will be nil if the character is syntactically valid.
 256  //
 257  // The second argument, quote, specifies the type of literal being parsed
 258  // and therefore which escaped quote character is permitted.
 259  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
 260  // If set to a double quote, it permits \" and disallows unescaped ".
 261  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
 262  func UnquoteChar(s []byte, quote byte) (value rune, multibyte bool, tail []byte, err error) {
 263  	// easy cases
 264  	if len(s) == 0 {
 265  		err = ErrSyntax
 266  		return
 267  	}
 268  	switch c := s[0]; {
 269  	case c == quote && (quote == '\'' || quote == '"'):
 270  		err = ErrSyntax
 271  		return
 272  	case c >= utf8.RuneSelf:
 273  		r, size := utf8.DecodeRuneInString(s)
 274  		return r, true, s[size:], nil
 275  	case c != '\\':
 276  		return rune(s[0]), false, s[1:], nil
 277  	}
 278  
 279  	// hard case: c is backslash
 280  	if len(s) <= 1 {
 281  		err = ErrSyntax
 282  		return
 283  	}
 284  	c := s[1]
 285  	s = s[2:]
 286  
 287  	switch c {
 288  	case 'a':
 289  		value = '\a'
 290  	case 'b':
 291  		value = '\b'
 292  	case 'f':
 293  		value = '\f'
 294  	case 'n':
 295  		value = '\n'
 296  	case 'r':
 297  		value = '\r'
 298  	case 't':
 299  		value = '\t'
 300  	case 'v':
 301  		value = '\v'
 302  	case 'x', 'u', 'U':
 303  		n := 0
 304  		switch c {
 305  		case 'x':
 306  			n = 2
 307  		case 'u':
 308  			n = 4
 309  		case 'U':
 310  			n = 8
 311  		}
 312  		var v rune
 313  		if len(s) < n {
 314  			err = ErrSyntax
 315  			return
 316  		}
 317  		for j := 0; j < n; j++ {
 318  			x, ok := unhex(s[j])
 319  			if !ok {
 320  				err = ErrSyntax
 321  				return
 322  			}
 323  			v = v<<4 | x
 324  		}
 325  		s = s[n:]
 326  		if c == 'x' {
 327  			// single-byte string, possibly not UTF-8
 328  			value = v
 329  			break
 330  		}
 331  		if !utf8.ValidRune(v) {
 332  			err = ErrSyntax
 333  			return
 334  		}
 335  		value = v
 336  		multibyte = true
 337  	case '0', '1', '2', '3', '4', '5', '6', '7':
 338  		v := rune(c) - '0'
 339  		if len(s) < 2 {
 340  			err = ErrSyntax
 341  			return
 342  		}
 343  		for j := 0; j < 2; j++ { // one digit already; two more
 344  			x := rune(s[j]) - '0'
 345  			if x < 0 || x > 7 {
 346  				err = ErrSyntax
 347  				return
 348  			}
 349  			v = (v << 3) | x
 350  		}
 351  		s = s[2:]
 352  		if v > 255 {
 353  			err = ErrSyntax
 354  			return
 355  		}
 356  		value = v
 357  	case '\\':
 358  		value = '\\'
 359  	case '\'', '"':
 360  		if c != quote {
 361  			err = ErrSyntax
 362  			return
 363  		}
 364  		value = rune(c)
 365  	default:
 366  		err = ErrSyntax
 367  		return
 368  	}
 369  	tail = s
 370  	return
 371  }
 372  
 373  // QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
 374  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
 375  func QuotedPrefix(s []byte) ([]byte, error) {
 376  	out, _, err := unquote(s, false)
 377  	return out, err
 378  }
 379  
 380  // Unquote interprets s as a single-quoted, double-quoted,
 381  // or backquoted Go string literal, returning the string value
 382  // that s quotes.  (If s is single-quoted, it would be a Go
 383  // character literal; Unquote returns the corresponding
 384  // one-character string. For an empty character literal
 385  // Unquote returns the empty string.)
 386  func Unquote(s []byte) ([]byte, error) {
 387  	out, rem, err := unquote(s, true)
 388  	if len(rem) > 0 {
 389  		return "", ErrSyntax
 390  	}
 391  	return out, err
 392  }
 393  
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// The first byte of in selects the literal form: a backquote for a raw
// string, or a double or single quote for an interpreted string or
// character literal. On every error path the result is an empty out, the
// original input as rem, and ErrSyntax.
func unquote(in []byte, unescape bool) (out, rem []byte, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so the first closing backquote found
		// above is always the real terminator.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// The capacity expression subtracts both quotes and one '\r',
			// since at least one carriage return is known to be present.
			buf := []byte{:0:end-len("`")-len("\r")-len("`")}
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = []byte(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mention that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash and no newline before the first closing quote,
		// that quote is the terminator and the contents can be returned by
		// slicing, provided they validate below.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// Valid when the decoded rune's width n accounts for every
				// byte between the quotes, excluding a one-byte RuneError.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // kept so errors and the verbatim prefix can refer to the original input
		in = in[1:] // skip starting quote
		if unescape {
			buf = []byte{:0:3*end/2} // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			// Note that rem and err declared here shadow the named results.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			// A value that is not flagged multibyte is stored as one raw byte.
			if unescape {
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					buf = utf8.AppendRune(buf, r)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return []byte(buf), in, nil
		}
		// Verbatim prefix: everything of the original input that was consumed.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
 499  
 500  // bsearch is semantically the same as [slices.BinarySearch] (without NaN checks)
 501  // We copied this function because we can not import "slices" here.
 502  func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
 503  	n := len(s)
 504  	i, j := 0, n
 505  	for i < j {
 506  		h := i + (j-i)>>1
 507  		if s[h] < v {
 508  			i = h + 1
 509  		} else {
 510  			j = h
 511  		}
 512  	}
 513  	return i, i < n && s[i] == v
 514  }
 515  
 516  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
 517  // to give the same answer. It allows this package not to depend on unicode,
 518  // and therefore not pull in all the Unicode tables. If the linker were better
 519  // at tossing unused tables, we could get rid of this implementation.
 520  // That would be nice.
 521  
 522  // IsPrint reports whether the rune is defined as printable by Go, with
 523  // the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
 524  // symbols and ASCII space.
 525  func IsPrint(r rune) bool {
 526  	// Fast check for Latin-1
 527  	if r <= 0xFF {
 528  		if 0x20 <= r && r <= 0x7E {
 529  			// All the ASCII is printable from space through DEL-1.
 530  			return true
 531  		}
 532  		if 0xA1 <= r && r <= 0xFF {
 533  			// Similarly for ¡ through ÿ...
 534  			return r != 0xAD // ...except for the bizarre soft hyphen.
 535  		}
 536  		return false
 537  	}
 538  
 539  	// Same algorithm, either on uint16 or uint32 value.
 540  	// First, find first i such that isPrint[i] >= x.
 541  	// This is the index of either the start or end of a pair that might span x.
 542  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
 543  	// If we find x in a range, make sure x is not in isNotPrint list.
 544  
 545  	if 0 <= r && r < 1<<16 {
 546  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
 547  		i, _ := bsearch(isPrint, rr)
 548  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
 549  			return false
 550  		}
 551  		_, found := bsearch(isNotPrint, rr)
 552  		return !found
 553  	}
 554  
 555  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
 556  	i, _ := bsearch(isPrint, rr)
 557  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
 558  		return false
 559  	}
 560  	if r >= 0x20000 {
 561  		return true
 562  	}
 563  	r -= 0x10000
 564  	_, found := bsearch(isNotPrint, uint16(r))
 565  	return !found
 566  }
 567  
 568  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
 569  // characters include letters, marks, numbers, punctuation, symbols, and
 570  // spaces, from categories L, M, N, P, S, and Zs.
 571  func IsGraphic(r rune) bool {
 572  	if IsPrint(r) {
 573  		return true
 574  	}
 575  	return isInGraphicList(r)
 576  }
 577  
 578  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
 579  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
 580  // Should be called only if IsPrint fails.
 581  func isInGraphicList(r rune) bool {
 582  	// We know r must fit in 16 bits - see makeisprint.go.
 583  	if r > 0xFFFF {
 584  		return false
 585  	}
 586  	_, found := bsearch(isGraphic, uint16(r))
 587  	return found
 588  }
 589