decode.go raw

   1  // Copyright 2018 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package json
   6  
   7  import (
   8  	"bytes"
   9  	"fmt"
  10  	"io"
  11  	"regexp"
  12  	"unicode/utf8"
  13  
  14  	"google.golang.org/protobuf/internal/errors"
  15  )
  16  
  17  // call specifies which Decoder method was invoked.
  18  type call uint8
  19  
  20  const (
  21  	readCall call = iota
  22  	peekCall
  23  )
  24  
  25  const unexpectedFmt = "unexpected token %s"
  26  
  27  // ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
  28  var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
  29  
  30  // Decoder is a token-based JSON decoder.
  31  type Decoder struct {
  32  	// lastCall is last method called, either readCall or peekCall.
  33  	// Initial value is readCall.
  34  	lastCall call
  35  
  36  	// lastToken contains the last read token.
  37  	lastToken Token
  38  
  39  	// lastErr contains the last read error.
  40  	lastErr error
  41  
  42  	// openStack is a stack containing ObjectOpen and ArrayOpen values. The
  43  	// top of stack represents the object or the array the current value is
  44  	// directly located in.
  45  	openStack []Kind
  46  
  47  	// orig is used in reporting line and column.
  48  	orig []byte
  49  	// in contains the unconsumed input.
  50  	in []byte
  51  }
  52  
  53  // NewDecoder returns a Decoder to read the given []byte.
  54  func NewDecoder(b []byte) *Decoder {
  55  	return &Decoder{orig: b, in: b}
  56  }
  57  
  58  // Peek looks ahead and returns the next token kind without advancing a read.
  59  func (d *Decoder) Peek() (Token, error) {
  60  	defer func() { d.lastCall = peekCall }()
  61  	if d.lastCall == readCall {
  62  		d.lastToken, d.lastErr = d.Read()
  63  	}
  64  	return d.lastToken, d.lastErr
  65  }
  66  
  67  // Read returns the next JSON token.
  68  // It will return an error if there is no valid token.
  69  func (d *Decoder) Read() (Token, error) {
  70  	const scalar = Null | Bool | Number | String
  71  
  72  	defer func() { d.lastCall = readCall }()
  73  	if d.lastCall == peekCall {
  74  		return d.lastToken, d.lastErr
  75  	}
  76  
  77  	tok, err := d.parseNext()
  78  	if err != nil {
  79  		return Token{}, err
  80  	}
  81  
  82  	switch tok.kind {
  83  	case EOF:
  84  		if len(d.openStack) != 0 ||
  85  			d.lastToken.kind&scalar|ObjectClose|ArrayClose == 0 {
  86  			return Token{}, ErrUnexpectedEOF
  87  		}
  88  
  89  	case Null:
  90  		if !d.isValueNext() {
  91  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
  92  		}
  93  
  94  	case Bool, Number:
  95  		if !d.isValueNext() {
  96  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
  97  		}
  98  
  99  	case String:
 100  		if d.isValueNext() {
 101  			break
 102  		}
 103  		// This string token should only be for a field name.
 104  		if d.lastToken.kind&(ObjectOpen|comma) == 0 {
 105  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
 106  		}
 107  		if len(d.in) == 0 {
 108  			return Token{}, ErrUnexpectedEOF
 109  		}
 110  		if c := d.in[0]; c != ':' {
 111  			return Token{}, d.newSyntaxError(d.currPos(), `unexpected character %s, missing ":" after field name`, string(c))
 112  		}
 113  		tok.kind = Name
 114  		d.consume(1)
 115  
 116  	case ObjectOpen, ArrayOpen:
 117  		if !d.isValueNext() {
 118  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
 119  		}
 120  		d.openStack = append(d.openStack, tok.kind)
 121  
 122  	case ObjectClose:
 123  		if len(d.openStack) == 0 ||
 124  			d.lastToken.kind&(Name|comma) != 0 ||
 125  			d.openStack[len(d.openStack)-1] != ObjectOpen {
 126  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
 127  		}
 128  		d.openStack = d.openStack[:len(d.openStack)-1]
 129  
 130  	case ArrayClose:
 131  		if len(d.openStack) == 0 ||
 132  			d.lastToken.kind == comma ||
 133  			d.openStack[len(d.openStack)-1] != ArrayOpen {
 134  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
 135  		}
 136  		d.openStack = d.openStack[:len(d.openStack)-1]
 137  
 138  	case comma:
 139  		if len(d.openStack) == 0 ||
 140  			d.lastToken.kind&(scalar|ObjectClose|ArrayClose) == 0 {
 141  			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
 142  		}
 143  	}
 144  
 145  	// Update d.lastToken only after validating token to be in the right sequence.
 146  	d.lastToken = tok
 147  
 148  	if d.lastToken.kind == comma {
 149  		return d.Read()
 150  	}
 151  	return tok, nil
 152  }
 153  
 154  // Any sequence that looks like a non-delimiter (for error reporting).
 155  var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`)
 156  
 157  // parseNext parses for the next JSON token. It returns a Token object for
 158  // different types, except for Name. It does not handle whether the next token
 159  // is in a valid sequence or not.
 160  func (d *Decoder) parseNext() (Token, error) {
 161  	// Trim leading spaces.
 162  	d.consume(0)
 163  
 164  	in := d.in
 165  	if len(in) == 0 {
 166  		return d.consumeToken(EOF, 0), nil
 167  	}
 168  
 169  	switch in[0] {
 170  	case 'n':
 171  		if n := matchWithDelim("null", in); n != 0 {
 172  			return d.consumeToken(Null, n), nil
 173  		}
 174  
 175  	case 't':
 176  		if n := matchWithDelim("true", in); n != 0 {
 177  			return d.consumeBoolToken(true, n), nil
 178  		}
 179  
 180  	case 'f':
 181  		if n := matchWithDelim("false", in); n != 0 {
 182  			return d.consumeBoolToken(false, n), nil
 183  		}
 184  
 185  	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 186  		if n, ok := parseNumber(in); ok {
 187  			return d.consumeToken(Number, n), nil
 188  		}
 189  
 190  	case '"':
 191  		s, n, err := d.parseString(in)
 192  		if err != nil {
 193  			return Token{}, err
 194  		}
 195  		return d.consumeStringToken(s, n), nil
 196  
 197  	case '{':
 198  		return d.consumeToken(ObjectOpen, 1), nil
 199  
 200  	case '}':
 201  		return d.consumeToken(ObjectClose, 1), nil
 202  
 203  	case '[':
 204  		return d.consumeToken(ArrayOpen, 1), nil
 205  
 206  	case ']':
 207  		return d.consumeToken(ArrayClose, 1), nil
 208  
 209  	case ',':
 210  		return d.consumeToken(comma, 1), nil
 211  	}
 212  	return Token{}, d.newSyntaxError(d.currPos(), "invalid value %s", errRegexp.Find(in))
 213  }
 214  
 215  // newSyntaxError returns an error with line and column information useful for
 216  // syntax errors.
 217  func (d *Decoder) newSyntaxError(pos int, f string, x ...any) error {
 218  	e := errors.New(f, x...)
 219  	line, column := d.Position(pos)
 220  	return errors.New("syntax error (line %d:%d): %v", line, column, e)
 221  }
 222  
 223  // Position returns line and column number of given index of the original input.
 224  // It will panic if index is out of range.
 225  func (d *Decoder) Position(idx int) (line int, column int) {
 226  	b := d.orig[:idx]
 227  	line = bytes.Count(b, []byte("\n")) + 1
 228  	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
 229  		b = b[i+1:]
 230  	}
 231  	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
 232  	return line, column
 233  }
 234  
 235  // currPos returns the current index position of d.in from d.orig.
 236  func (d *Decoder) currPos() int {
 237  	return len(d.orig) - len(d.in)
 238  }
 239  
 240  // matchWithDelim matches s with the input b and verifies that the match
 241  // terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
 242  // As a special case, EOF is considered a delimiter. It returns the length of s
 243  // if there is a match, else 0.
 244  func matchWithDelim(s string, b []byte) int {
 245  	if !bytes.HasPrefix(b, []byte(s)) {
 246  		return 0
 247  	}
 248  
 249  	n := len(s)
 250  	if n < len(b) && isNotDelim(b[n]) {
 251  		return 0
 252  	}
 253  	return n
 254  }
 255  
 256  // isNotDelim returns true if given byte is a not delimiter character.
 257  func isNotDelim(c byte) bool {
 258  	return (c == '-' || c == '+' || c == '.' || c == '_' ||
 259  		('a' <= c && c <= 'z') ||
 260  		('A' <= c && c <= 'Z') ||
 261  		('0' <= c && c <= '9'))
 262  }
 263  
 264  // consume consumes n bytes of input and any subsequent whitespace.
 265  func (d *Decoder) consume(n int) {
 266  	d.in = d.in[n:]
 267  	for len(d.in) > 0 {
 268  		switch d.in[0] {
 269  		case ' ', '\n', '\r', '\t':
 270  			d.in = d.in[1:]
 271  		default:
 272  			return
 273  		}
 274  	}
 275  }
 276  
 277  // isValueNext returns true if next type should be a JSON value: Null,
 278  // Number, String or Bool.
 279  func (d *Decoder) isValueNext() bool {
 280  	if len(d.openStack) == 0 {
 281  		return d.lastToken.kind == 0
 282  	}
 283  
 284  	start := d.openStack[len(d.openStack)-1]
 285  	switch start {
 286  	case ObjectOpen:
 287  		return d.lastToken.kind&Name != 0
 288  	case ArrayOpen:
 289  		return d.lastToken.kind&(ArrayOpen|comma) != 0
 290  	}
 291  	panic(fmt.Sprintf(
 292  		"unreachable logic in Decoder.isValueNext, lastToken.kind: %v, openStack: %v",
 293  		d.lastToken.kind, start))
 294  }
 295  
 296  // consumeToken constructs a Token for given Kind with raw value derived from
 297  // current d.in and given size, and consumes the given size-length of it.
 298  func (d *Decoder) consumeToken(kind Kind, size int) Token {
 299  	tok := Token{
 300  		kind: kind,
 301  		raw:  d.in[:size],
 302  		pos:  len(d.orig) - len(d.in),
 303  	}
 304  	d.consume(size)
 305  	return tok
 306  }
 307  
 308  // consumeBoolToken constructs a Token for a Bool kind with raw value derived from
 309  // current d.in and given size.
 310  func (d *Decoder) consumeBoolToken(b bool, size int) Token {
 311  	tok := Token{
 312  		kind: Bool,
 313  		raw:  d.in[:size],
 314  		pos:  len(d.orig) - len(d.in),
 315  		boo:  b,
 316  	}
 317  	d.consume(size)
 318  	return tok
 319  }
 320  
 321  // consumeStringToken constructs a Token for a String kind with raw value derived
 322  // from current d.in and given size.
 323  func (d *Decoder) consumeStringToken(s string, size int) Token {
 324  	tok := Token{
 325  		kind: String,
 326  		raw:  d.in[:size],
 327  		pos:  len(d.orig) - len(d.in),
 328  		str:  s,
 329  	}
 330  	d.consume(size)
 331  	return tok
 332  }
 333  
 334  // Clone returns a copy of the Decoder for use in reading ahead the next JSON
 335  // object, array or other values without affecting current Decoder.
 336  func (d *Decoder) Clone() *Decoder {
 337  	ret := *d
 338  	ret.openStack = append([]Kind(nil), ret.openStack...)
 339  	return &ret
 340  }
 341