decode_string.go raw

   1  // Copyright 2018 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package text
   6  
   7  import (
   8  	"bytes"
   9  	"strconv"
  10  	"strings"
  11  	"unicode"
  12  	"unicode/utf16"
  13  	"unicode/utf8"
  14  
  15  	"google.golang.org/protobuf/internal/strs"
  16  )
  17  
  18  // parseStringValue parses string field token.
  19  // This differs from parseString since the text format allows
  20  // multiple back-to-back string literals where they are semantically treated
  21  // as a single large string with all values concatenated.
  22  //
  23  // E.g., `"foo" "bar" "baz"` => "foobarbaz"
  24  func (d *Decoder) parseStringValue() (Token, error) {
  25  	// Note that the ending quote is sufficient to unambiguously mark the end
  26  	// of a string. Thus, the text grammar does not require intervening
  27  	// whitespace or control characters in-between strings.
  28  	// Thus, the following is valid:
  29  	//	`"foo"'bar'"baz"` => "foobarbaz"
  30  	in0 := d.in
  31  	var ss []string
  32  	for len(d.in) > 0 && (d.in[0] == '"' || d.in[0] == '\'') {
  33  		s, err := d.parseString()
  34  		if err != nil {
  35  			return Token{}, err
  36  		}
  37  		ss = append(ss, s)
  38  	}
  39  	// d.in already points to the end of the value at this point.
  40  	return Token{
  41  		kind:  Scalar,
  42  		attrs: stringValue,
  43  		pos:   len(d.orig) - len(in0),
  44  		raw:   in0[:len(in0)-len(d.in)],
  45  		str:   strings.Join(ss, ""),
  46  	}, nil
  47  }
  48  
  49  // parseString parses a string value enclosed in " or '.
  50  func (d *Decoder) parseString() (string, error) {
  51  	in := d.in
  52  	if len(in) == 0 {
  53  		return "", ErrUnexpectedEOF
  54  	}
  55  	quote := in[0]
  56  	in = in[1:]
  57  	i := indexNeedEscapeInBytes(in)
  58  	in, out := in[i:], in[:i:i] // set cap to prevent mutations
  59  	for len(in) > 0 {
  60  		switch r, n := utf8.DecodeRune(in); {
  61  		case r == utf8.RuneError && n == 1:
  62  			return "", d.newSyntaxError("invalid UTF-8 detected")
  63  		case r == 0 || r == '\n':
  64  			return "", d.newSyntaxError("invalid character %q in string", r)
  65  		case r == rune(quote):
  66  			in = in[1:]
  67  			d.consume(len(d.in) - len(in))
  68  			return string(out), nil
  69  		case r == '\\':
  70  			if len(in) < 2 {
  71  				return "", ErrUnexpectedEOF
  72  			}
  73  			switch r := in[1]; r {
  74  			case '"', '\'', '\\', '?':
  75  				in, out = in[2:], append(out, r)
  76  			case 'a':
  77  				in, out = in[2:], append(out, '\a')
  78  			case 'b':
  79  				in, out = in[2:], append(out, '\b')
  80  			case 'n':
  81  				in, out = in[2:], append(out, '\n')
  82  			case 'r':
  83  				in, out = in[2:], append(out, '\r')
  84  			case 't':
  85  				in, out = in[2:], append(out, '\t')
  86  			case 'v':
  87  				in, out = in[2:], append(out, '\v')
  88  			case 'f':
  89  				in, out = in[2:], append(out, '\f')
  90  			case '0', '1', '2', '3', '4', '5', '6', '7':
  91  				// One, two, or three octal characters.
  92  				n := len(in[1:]) - len(bytes.TrimLeft(in[1:], "01234567"))
  93  				if n > 3 {
  94  					n = 3
  95  				}
  96  				v, err := strconv.ParseUint(string(in[1:1+n]), 8, 8)
  97  				if err != nil {
  98  					return "", d.newSyntaxError("invalid octal escape code %q in string", in[:1+n])
  99  				}
 100  				in, out = in[1+n:], append(out, byte(v))
 101  			case 'x':
 102  				// One or two hexadecimal characters.
 103  				n := len(in[2:]) - len(bytes.TrimLeft(in[2:], "0123456789abcdefABCDEF"))
 104  				if n > 2 {
 105  					n = 2
 106  				}
 107  				v, err := strconv.ParseUint(string(in[2:2+n]), 16, 8)
 108  				if err != nil {
 109  					return "", d.newSyntaxError("invalid hex escape code %q in string", in[:2+n])
 110  				}
 111  				in, out = in[2+n:], append(out, byte(v))
 112  			case 'u', 'U':
 113  				// Four or eight hexadecimal characters
 114  				n := 6
 115  				if r == 'U' {
 116  					n = 10
 117  				}
 118  				if len(in) < n {
 119  					return "", ErrUnexpectedEOF
 120  				}
 121  				v, err := strconv.ParseUint(string(in[2:n]), 16, 32)
 122  				if utf8.MaxRune < v || err != nil {
 123  					return "", d.newSyntaxError("invalid Unicode escape code %q in string", in[:n])
 124  				}
 125  				in = in[n:]
 126  
 127  				r := rune(v)
 128  				if utf16.IsSurrogate(r) {
 129  					if len(in) < 6 {
 130  						return "", ErrUnexpectedEOF
 131  					}
 132  					v, err := strconv.ParseUint(string(in[2:6]), 16, 16)
 133  					r = utf16.DecodeRune(r, rune(v))
 134  					if in[0] != '\\' || in[1] != 'u' || r == unicode.ReplacementChar || err != nil {
 135  						return "", d.newSyntaxError("invalid Unicode escape code %q in string", in[:6])
 136  					}
 137  					in = in[6:]
 138  				}
 139  				out = append(out, string(r)...)
 140  			default:
 141  				return "", d.newSyntaxError("invalid escape code %q in string", in[:2])
 142  			}
 143  		default:
 144  			i := indexNeedEscapeInBytes(in[n:])
 145  			in, out = in[n+i:], append(out, in[:n+i]...)
 146  		}
 147  	}
 148  	return "", ErrUnexpectedEOF
 149  }
 150  
 151  // indexNeedEscapeInString returns the index of the character that needs
 152  // escaping. If no characters need escaping, this returns the input length.
 153  func indexNeedEscapeInBytes(b []byte) int { return indexNeedEscapeInString(strs.UnsafeString(b)) }
 154  
 155  // UnmarshalString returns an unescaped string given a textproto string value.
 156  // String value needs to contain single or double quotes. This is only used by
 157  // internal/encoding/defval package for unmarshaling bytes.
 158  func UnmarshalString(s string) (string, error) {
 159  	d := NewDecoder([]byte(s))
 160  	return d.parseString()
 161  }
 162