decode.go raw
1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package text
6
7 import (
8 "bytes"
9 "fmt"
10 "io"
11 "strconv"
12 "unicode/utf8"
13
14 "google.golang.org/protobuf/internal/errors"
15 )
16
// Decoder is a token-based textproto decoder.
//
// It tokenizes the byte slice handed to NewDecoder. Tokens are obtained one at
// a time through Read and Peek.
type Decoder struct {
	// lastCall is the last method called, either readCall or peekCall.
	// Initial value is readCall.
	lastCall call

	// lastToken contains the last read token. Its kind is what Read passes
	// to parseNext to decide which tokens may come next.
	lastToken Token

	// lastErr contains the error of the read performed by the most recent
	// Peek; a following Read returns it together with lastToken.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is used in reporting line and column.
	orig []byte
	// in contains the unconsumed input.
	in []byte
}
41
42 // NewDecoder returns a Decoder to read the given []byte.
43 func NewDecoder(b []byte) *Decoder {
44 return &Decoder{orig: b, in: b}
45 }
46
// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
//
// It is created by formatting io.ErrUnexpectedEOF with the "%v" verb through
// the errors.New helper of google.golang.org/protobuf/internal/errors.
// NOTE(review): whether errors.Is(err, io.ErrUnexpectedEOF) holds depends on
// that helper — confirm before relying on it.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
49
// call specifies which Decoder method was invoked.
type call uint8

const (
	// readCall marks that Read was the last method called. Being the zero
	// value, it is also the state of a freshly created Decoder.
	readCall call = iota
	// peekCall marks that Peek was the last method called, i.e. that
	// lastToken and lastErr hold a result not yet handed out by Read.
	peekCall
)
57
58 // Peek looks ahead and returns the next token and error without advancing a read.
59 func (d *Decoder) Peek() (Token, error) {
60 defer func() { d.lastCall = peekCall }()
61 if d.lastCall == readCall {
62 d.lastToken, d.lastErr = d.Read()
63 }
64 return d.lastToken, d.lastErr
65 }
66
67 // Read returns the next token.
68 // It will return an error if there is no valid token.
69 func (d *Decoder) Read() (Token, error) {
70 defer func() { d.lastCall = readCall }()
71 if d.lastCall == peekCall {
72 return d.lastToken, d.lastErr
73 }
74
75 tok, err := d.parseNext(d.lastToken.kind)
76 if err != nil {
77 return Token{}, err
78 }
79
80 switch tok.kind {
81 case comma, semicolon:
82 tok, err = d.parseNext(tok.kind)
83 if err != nil {
84 return Token{}, err
85 }
86 }
87 d.lastToken = tok
88 return tok, nil
89 }
90
// Format strings for syntax errors reported by parseNext. Both take the
// offending input byte as their only argument.
const (
	// mismatchedFmt is used when a message is closed with the closing
	// character of the other bracket style (e.g. '{' closed by '>').
	mismatchedFmt = "mismatched close character %q"
	// unexpectedFmt is used when a character other than ']' or ',' follows a
	// value inside a list.
	unexpectedFmt = "unexpected character %q"
)
95
96 // parseNext parses the next Token based on given last kind.
97 func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
98 // Trim leading spaces.
99 d.consume(0)
100 isEOF := false
101 if len(d.in) == 0 {
102 isEOF = true
103 }
104
105 switch lastKind {
106 case EOF:
107 return d.consumeToken(EOF, 0, 0), nil
108
109 case bof:
110 // Start of top level message. Next token can be EOF or Name.
111 if isEOF {
112 return d.consumeToken(EOF, 0, 0), nil
113 }
114 return d.parseFieldName()
115
116 case Name:
117 // Next token can be MessageOpen, ListOpen or Scalar.
118 if isEOF {
119 return Token{}, ErrUnexpectedEOF
120 }
121 switch ch := d.in[0]; ch {
122 case '{', '<':
123 d.pushOpenStack(ch)
124 return d.consumeToken(MessageOpen, 1, 0), nil
125 case '[':
126 d.pushOpenStack(ch)
127 return d.consumeToken(ListOpen, 1, 0), nil
128 default:
129 return d.parseScalar()
130 }
131
132 case Scalar:
133 openKind, closeCh := d.currentOpenKind()
134 switch openKind {
135 case bof:
136 // Top level message.
137 // Next token can be EOF, comma, semicolon or Name.
138 if isEOF {
139 return d.consumeToken(EOF, 0, 0), nil
140 }
141 switch d.in[0] {
142 case ',':
143 return d.consumeToken(comma, 1, 0), nil
144 case ';':
145 return d.consumeToken(semicolon, 1, 0), nil
146 default:
147 return d.parseFieldName()
148 }
149
150 case MessageOpen:
151 // Next token can be MessageClose, comma, semicolon or Name.
152 if isEOF {
153 return Token{}, ErrUnexpectedEOF
154 }
155 switch ch := d.in[0]; ch {
156 case closeCh:
157 d.popOpenStack()
158 return d.consumeToken(MessageClose, 1, 0), nil
159 case otherCloseChar[closeCh]:
160 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
161 case ',':
162 return d.consumeToken(comma, 1, 0), nil
163 case ';':
164 return d.consumeToken(semicolon, 1, 0), nil
165 default:
166 return d.parseFieldName()
167 }
168
169 case ListOpen:
170 // Next token can be ListClose or comma.
171 if isEOF {
172 return Token{}, ErrUnexpectedEOF
173 }
174 switch ch := d.in[0]; ch {
175 case ']':
176 d.popOpenStack()
177 return d.consumeToken(ListClose, 1, 0), nil
178 case ',':
179 return d.consumeToken(comma, 1, 0), nil
180 default:
181 return Token{}, d.newSyntaxError(unexpectedFmt, ch)
182 }
183 }
184
185 case MessageOpen:
186 // Next token can be MessageClose or Name.
187 if isEOF {
188 return Token{}, ErrUnexpectedEOF
189 }
190 _, closeCh := d.currentOpenKind()
191 switch ch := d.in[0]; ch {
192 case closeCh:
193 d.popOpenStack()
194 return d.consumeToken(MessageClose, 1, 0), nil
195 case otherCloseChar[closeCh]:
196 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
197 default:
198 return d.parseFieldName()
199 }
200
201 case MessageClose:
202 openKind, closeCh := d.currentOpenKind()
203 switch openKind {
204 case bof:
205 // Top level message.
206 // Next token can be EOF, comma, semicolon or Name.
207 if isEOF {
208 return d.consumeToken(EOF, 0, 0), nil
209 }
210 switch ch := d.in[0]; ch {
211 case ',':
212 return d.consumeToken(comma, 1, 0), nil
213 case ';':
214 return d.consumeToken(semicolon, 1, 0), nil
215 default:
216 return d.parseFieldName()
217 }
218
219 case MessageOpen:
220 // Next token can be MessageClose, comma, semicolon or Name.
221 if isEOF {
222 return Token{}, ErrUnexpectedEOF
223 }
224 switch ch := d.in[0]; ch {
225 case closeCh:
226 d.popOpenStack()
227 return d.consumeToken(MessageClose, 1, 0), nil
228 case otherCloseChar[closeCh]:
229 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
230 case ',':
231 return d.consumeToken(comma, 1, 0), nil
232 case ';':
233 return d.consumeToken(semicolon, 1, 0), nil
234 default:
235 return d.parseFieldName()
236 }
237
238 case ListOpen:
239 // Next token can be ListClose or comma
240 if isEOF {
241 return Token{}, ErrUnexpectedEOF
242 }
243 switch ch := d.in[0]; ch {
244 case closeCh:
245 d.popOpenStack()
246 return d.consumeToken(ListClose, 1, 0), nil
247 case ',':
248 return d.consumeToken(comma, 1, 0), nil
249 default:
250 return Token{}, d.newSyntaxError(unexpectedFmt, ch)
251 }
252 }
253
254 case ListOpen:
255 // Next token can be ListClose, MessageStart or Scalar.
256 if isEOF {
257 return Token{}, ErrUnexpectedEOF
258 }
259 switch ch := d.in[0]; ch {
260 case ']':
261 d.popOpenStack()
262 return d.consumeToken(ListClose, 1, 0), nil
263 case '{', '<':
264 d.pushOpenStack(ch)
265 return d.consumeToken(MessageOpen, 1, 0), nil
266 default:
267 return d.parseScalar()
268 }
269
270 case ListClose:
271 openKind, closeCh := d.currentOpenKind()
272 switch openKind {
273 case bof:
274 // Top level message.
275 // Next token can be EOF, comma, semicolon or Name.
276 if isEOF {
277 return d.consumeToken(EOF, 0, 0), nil
278 }
279 switch ch := d.in[0]; ch {
280 case ',':
281 return d.consumeToken(comma, 1, 0), nil
282 case ';':
283 return d.consumeToken(semicolon, 1, 0), nil
284 default:
285 return d.parseFieldName()
286 }
287
288 case MessageOpen:
289 // Next token can be MessageClose, comma, semicolon or Name.
290 if isEOF {
291 return Token{}, ErrUnexpectedEOF
292 }
293 switch ch := d.in[0]; ch {
294 case closeCh:
295 d.popOpenStack()
296 return d.consumeToken(MessageClose, 1, 0), nil
297 case otherCloseChar[closeCh]:
298 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
299 case ',':
300 return d.consumeToken(comma, 1, 0), nil
301 case ';':
302 return d.consumeToken(semicolon, 1, 0), nil
303 default:
304 return d.parseFieldName()
305 }
306
307 default:
308 // It is not possible to have this case. Let it panic below.
309 }
310
311 case comma, semicolon:
312 openKind, closeCh := d.currentOpenKind()
313 switch openKind {
314 case bof:
315 // Top level message. Next token can be EOF or Name.
316 if isEOF {
317 return d.consumeToken(EOF, 0, 0), nil
318 }
319 return d.parseFieldName()
320
321 case MessageOpen:
322 // Next token can be MessageClose or Name.
323 if isEOF {
324 return Token{}, ErrUnexpectedEOF
325 }
326 switch ch := d.in[0]; ch {
327 case closeCh:
328 d.popOpenStack()
329 return d.consumeToken(MessageClose, 1, 0), nil
330 case otherCloseChar[closeCh]:
331 return Token{}, d.newSyntaxError(mismatchedFmt, ch)
332 default:
333 return d.parseFieldName()
334 }
335
336 case ListOpen:
337 if lastKind == semicolon {
338 // It is not be possible to have this case as logic here
339 // should not have produced a semicolon Token when inside a
340 // list. Let it panic below.
341 break
342 }
343 // Next token can be MessageOpen or Scalar.
344 if isEOF {
345 return Token{}, ErrUnexpectedEOF
346 }
347 switch ch := d.in[0]; ch {
348 case '{', '<':
349 d.pushOpenStack(ch)
350 return d.consumeToken(MessageOpen, 1, 0), nil
351 default:
352 return d.parseScalar()
353 }
354 }
355 }
356
357 line, column := d.Position(len(d.orig) - len(d.in))
358 panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
359 }
360
// otherCloseChar maps each message-closing character to the closing character
// of the other bracket style. parseNext looks up the expected closing
// character here to detect a message closed with the wrong one.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}
365
366 // currentOpenKind indicates whether current position is inside a message, list
367 // or top-level message by returning MessageOpen, ListOpen or bof respectively.
368 // If the returned kind is either a MessageOpen or ListOpen, it also returns the
369 // corresponding closing character.
370 func (d *Decoder) currentOpenKind() (Kind, byte) {
371 if len(d.openStack) == 0 {
372 return bof, 0
373 }
374 openCh := d.openStack[len(d.openStack)-1]
375 switch openCh {
376 case '{':
377 return MessageOpen, '}'
378 case '<':
379 return MessageOpen, '>'
380 case '[':
381 return ListOpen, ']'
382 }
383 panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
384 }
385
386 func (d *Decoder) pushOpenStack(ch byte) {
387 d.openStack = append(d.openStack, ch)
388 }
389
390 func (d *Decoder) popOpenStack() {
391 d.openStack = d.openStack[:len(d.openStack)-1]
392 }
393
394 // parseFieldName parses field name and separator.
395 func (d *Decoder) parseFieldName() (tok Token, err error) {
396 defer func() {
397 if err == nil && d.tryConsumeChar(':') {
398 tok.attrs |= hasSeparator
399 }
400 }()
401
402 // Extension or Any type URL.
403 if d.in[0] == '[' {
404 return d.parseTypeName()
405 }
406
407 // Identifier.
408 if size := parseIdent(d.in, false); size > 0 {
409 return d.consumeToken(Name, size, uint8(IdentName)), nil
410 }
411
412 // Field number. Identify if input is a valid number that is not negative
413 // and is decimal integer within 32-bit range.
414 if num := parseNumber(d.in); num.size > 0 {
415 str := num.string(d.in)
416 if !num.neg && num.kind == numDec {
417 if _, err := strconv.ParseInt(str, 10, 32); err == nil {
418 return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
419 }
420 }
421 return Token{}, d.newSyntaxError("invalid field number: %s", str)
422 }
423
424 return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
425 }
426
427 // parseTypeName parses an Any type URL or an extension field name. The name is
428 // enclosed in [ and ] characters. We allow almost arbitrary type URL prefixes,
429 // closely following the text-format spec [1,2]. We implement "ExtensionName |
430 // AnyName" as follows (with some exceptions for backwards compatibility):
431 //
432 // char = [-_a-zA-Z0-9]
433 // url_char = char | [.~!$&'()*+,;=] | "%", hex, hex
434 //
435 // Ident = char, { char }
436 // TypeName = Ident, { ".", Ident } ;
437 // UrlPrefix = url_char, { url_char | "/" } ;
438 // ExtensionName = "[", TypeName, "]" ;
439 // AnyName = "[", UrlPrefix, "/", TypeName, "]" ;
440 //
441 // Additionally, we allow arbitrary whitespace and comments between [ and ].
442 //
443 // [1] https://protobuf.dev/reference/protobuf/textformat-spec/#characters
444 // [2] https://protobuf.dev/reference/protobuf/textformat-spec/#field-names
445 func (d *Decoder) parseTypeName() (Token, error) {
446 // Use alias s to advance first in order to use d.in for error handling.
447 // Caller already checks for [ as first character (d.in[0] == '[').
448 s := consume(d.in[1:], 0)
449 if len(s) == 0 {
450 return Token{}, ErrUnexpectedEOF
451 }
452
453 // Collect everything between [ and ] in name.
454 var name []byte
455 var closed bool
456 for len(s) > 0 && !closed {
457 switch {
458 case s[0] == ']':
459 s = s[1:]
460 closed = true
461
462 case s[0] == '/' || isTypeNameChar(s[0]) || isUrlExtraChar(s[0]):
463 name = append(name, s[0])
464 s = consume(s[1:], 0)
465
466 // URL percent-encoded chars
467 case s[0] == '%':
468 if len(s) < 3 || !isHexChar(s[1]) || !isHexChar(s[2]) {
469 return Token{}, d.parseTypeNameError(s, 3)
470 }
471 name = append(name, s[0], s[1], s[2])
472 s = consume(s[3:], 0)
473
474 default:
475 return Token{}, d.parseTypeNameError(s, 1)
476 }
477 }
478
479 if !closed {
480 return Token{}, ErrUnexpectedEOF
481 }
482
483 // Split collected name on last '/' into urlPrefix and typeName (if '/' is
484 // present).
485 typeName := name
486 if i := bytes.LastIndexByte(name, '/'); i != -1 {
487 urlPrefix := name[:i]
488 typeName = name[i+1:]
489
490 // urlPrefix may be empty (for backwards compatibility).
491 // If non-empty, it must not start with '/'.
492 if len(urlPrefix) > 0 && urlPrefix[0] == '/' {
493 return Token{}, d.parseTypeNameError(s, 0)
494 }
495 }
496
497 // typeName must not be empty (note: "" splits to [""]) and all identifier
498 // parts must not be empty.
499 for _, ident := range bytes.Split(typeName, []byte{'.'}) {
500 if len(ident) == 0 {
501 return Token{}, d.parseTypeNameError(s, 0)
502 }
503 }
504
505 // typeName must not contain any percent-encoded or special URL chars.
506 for _, b := range typeName {
507 if b == '%' || (b != '.' && isUrlExtraChar(b)) {
508 return Token{}, d.parseTypeNameError(s, 0)
509 }
510 }
511
512 startPos := len(d.orig) - len(d.in)
513 endPos := len(d.orig) - len(s)
514 d.in = s
515 d.consume(0)
516
517 return Token{
518 kind: Name,
519 attrs: uint8(TypeName),
520 pos: startPos,
521 raw: d.orig[startPos:endPos],
522 str: string(name),
523 }, nil
524 }
525
526 func (d *Decoder) parseTypeNameError(s []byte, numUnconsumedChars int) error {
527 return d.newSyntaxError(
528 "invalid type URL/extension field name: %s",
529 d.in[:len(d.in)-len(s)+min(numUnconsumedChars, len(s))],
530 )
531 }
532
// isHexChar reports whether b is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHexChar(b byte) bool {
	switch {
	case '0' <= b && b <= '9',
		'a' <= b && b <= 'f',
		'A' <= b && b <= 'F':
		return true
	}
	return false
}
538
// isTypeNameChar reports whether b may appear in an identifier of a bracketed
// type name: '-', '_', an ASCII digit or an ASCII letter.
func isTypeNameChar(b byte) bool {
	switch {
	case b == '-', b == '_':
		return true
	case '0' <= b && b <= '9':
		return true
	case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z':
		return true
	}
	return false
}
545
// isUrlExtraChar complements isTypeNameChar with the extra characters that
// are allowed in URLs but not in type names. '/' is deliberately left out so
// that it can be treated specially.
func isUrlExtraChar(b byte) bool {
	const extra = ".~!$&()*+,;="
	for i := 0; i < len(extra); i++ {
		if extra[i] == b {
			return true
		}
	}
	return false
}
557
558 // parseIdent parses an unquoted proto identifier and returns size.
559 // If allowNeg is true, it allows '-' to be the first character in the
560 // identifier. This is used when parsing literal values like -infinity, etc.
561 // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
562 func parseIdent(input []byte, allowNeg bool) int {
563 var size int
564
565 s := input
566 if len(s) == 0 {
567 return 0
568 }
569
570 if allowNeg && s[0] == '-' {
571 s = s[1:]
572 size++
573 if len(s) == 0 {
574 return 0
575 }
576 }
577
578 switch {
579 case s[0] == '_',
580 'a' <= s[0] && s[0] <= 'z',
581 'A' <= s[0] && s[0] <= 'Z':
582 s = s[1:]
583 size++
584 default:
585 return 0
586 }
587
588 for len(s) > 0 && (s[0] == '_' ||
589 'a' <= s[0] && s[0] <= 'z' ||
590 'A' <= s[0] && s[0] <= 'Z' ||
591 '0' <= s[0] && s[0] <= '9') {
592 s = s[1:]
593 size++
594 }
595
596 if len(s) > 0 && !isDelim(s[0]) {
597 return 0
598 }
599
600 return size
601 }
602
603 // parseScalar parses for a string, literal or number value.
604 func (d *Decoder) parseScalar() (Token, error) {
605 if d.in[0] == '"' || d.in[0] == '\'' {
606 return d.parseStringValue()
607 }
608
609 if tok, ok := d.parseLiteralValue(); ok {
610 return tok, nil
611 }
612
613 if tok, ok := d.parseNumberValue(); ok {
614 return tok, nil
615 }
616
617 return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
618 }
619
620 // parseLiteralValue parses a literal value. A literal value is used for
621 // bools, special floats and enums. This function simply identifies that the
622 // field value is a literal.
623 func (d *Decoder) parseLiteralValue() (Token, bool) {
624 size := parseIdent(d.in, true)
625 if size == 0 {
626 return Token{}, false
627 }
628 return d.consumeToken(Scalar, size, literalValue), true
629 }
630
631 // consumeToken constructs a Token for given Kind from d.in and consumes given
632 // size-length from it.
633 func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
634 // Important to compute raw and pos before consuming.
635 tok := Token{
636 kind: kind,
637 attrs: attrs,
638 pos: len(d.orig) - len(d.in),
639 raw: d.in[:size],
640 }
641 d.consume(size)
642 return tok
643 }
644
645 // newSyntaxError returns a syntax error with line and column information for
646 // current position.
647 func (d *Decoder) newSyntaxError(f string, x ...any) error {
648 e := errors.New(f, x...)
649 line, column := d.Position(len(d.orig) - len(d.in))
650 return errors.New("syntax error (line %d:%d): %v", line, column, e)
651 }
652
653 // Position returns line and column number of given index of the original input.
654 // It will panic if index is out of range.
655 func (d *Decoder) Position(idx int) (line int, column int) {
656 b := d.orig[:idx]
657 line = bytes.Count(b, []byte("\n")) + 1
658 if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
659 b = b[i+1:]
660 }
661 column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
662 return line, column
663 }
664
665 func (d *Decoder) tryConsumeChar(c byte) bool {
666 if len(d.in) > 0 && d.in[0] == c {
667 d.consume(1)
668 return true
669 }
670 return false
671 }
672
673 // consume consumes n bytes of input and any subsequent whitespace or comments.
674 func (d *Decoder) consume(n int) {
675 d.in = consume(d.in, n)
676 return
677 }
678
// consume returns b with its first n bytes and any subsequent whitespace or
// comments removed. Whitespace is ' ', '\n', '\r' and '\t'; a comment runs
// from '#' through the next '\n'. If a comment is not terminated by a newline
// the result is nil. It panics if n exceeds the bounds of b.
func consume(b []byte, n int) []byte {
	rest := b[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			eol := bytes.IndexByte(rest, '\n')
			if eol < 0 {
				// The comment extends to the end of the input.
				return nil
			}
			rest = rest[eol+1:]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	return rest
}
698
699 // errId extracts a byte sequence that looks like an invalid ID
700 // (for the purposes of error reporting).
701 func errId(seq []byte) []byte {
702 const maxLen = 32
703 for i := 0; i < len(seq); {
704 if i > maxLen {
705 return append(seq[:i:i], "…"...)
706 }
707 r, size := utf8.DecodeRune(seq[i:])
708 if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
709 if i == 0 {
710 // Either the first byte is invalid UTF-8 or a
711 // delimiter, or the first rune is non-ASCII.
712 // Return it as-is.
713 i = size
714 }
715 return seq[:i:i]
716 }
717 i += size
718 }
719 // No delimiter found.
720 return seq
721 }
722
// isDelim returns true if given byte is a delimiter character, which is any
// byte other than '-', '+', '.', '_', an ASCII letter or an ASCII digit.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		return false
	case '0' <= c && c <= '9':
		return false
	}
	return true
}
730