json_scanner.go raw
1 // Copyright (C) MongoDB, Inc. 2017-present.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may
4 // not use this file except in compliance with the License. You may obtain
5 // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
6
7 package bsonrw
8
9 import (
10 "bytes"
11 "errors"
12 "fmt"
13 "io"
14 "math"
15 "strconv"
16 "unicode"
17 "unicode/utf16"
18 )
19
// jsonTokenType identifies the kind of token produced by the jsonScanner.
type jsonTokenType byte

const (
	// jttBeginObject is emitted for '{'.
	jttBeginObject jsonTokenType = iota
	// jttEndObject is emitted for '}'.
	jttEndObject
	// jttBeginArray is emitted for '['.
	jttBeginArray
	// jttEndArray is emitted for ']'.
	jttEndArray
	// jttColon is emitted for ':'.
	jttColon
	// jttComma is emitted for ','.
	jttComma
	// jttInt32 is emitted for an integer number that fits in an int32.
	jttInt32
	// jttInt64 is emitted for an integer number that fits in an int64 but not
	// in an int32.
	jttInt64
	// jttDouble is emitted for a number that is parsed as a float64.
	jttDouble
	// jttString is emitted for a double-quoted string.
	jttString
	// jttBool is emitted for the literals true and false.
	jttBool
	// jttNull is emitted for the literal null.
	jttNull
	// jttEOF is emitted when the underlying reader reports io.EOF before the
	// start of another token.
	jttEOF
)
37
// jsonToken is a single token produced by the jsonScanner.
type jsonToken struct {
	// t is the kind of token.
	t jsonTokenType
	// v is the token's value: a byte for punctuation tokens, a string for
	// jttString, a bool for jttBool, nil for jttNull, and an int32, int64 or
	// float64 for the number tokens. It is left unset for jttEOF.
	v interface{}
	// p is the scanner's read position of the token's first byte. The position
	// is an index into the scanner's current buffer, which restarts at 0 each
	// time the buffer is refilled. It is left unset for jttEOF.
	p int
}
43
// jsonScanner splits the bytes read from an io.Reader into JSON tokens.
type jsonScanner struct {
	// r is the source of the JSON input.
	r io.Reader
	// buf holds the most recently read chunk of input.
	buf []byte
	// pos is the index in buf of the next unread byte.
	pos int
	// lastReadErr is the error returned by the most recent Read call on r. It
	// is remembered so that it can be reported once buf has been consumed.
	lastReadErr error
}
50
51 // nextToken returns the next JSON token if one exists. A token is a character
52 // of the JSON grammar, a number, a string, or a literal.
53 func (js *jsonScanner) nextToken() (*jsonToken, error) {
54 c, err := js.readNextByte()
55
56 // keep reading until a non-space is encountered (break on read error or EOF)
57 for isWhiteSpace(c) && err == nil {
58 c, err = js.readNextByte()
59 }
60
61 if err == io.EOF {
62 return &jsonToken{t: jttEOF}, nil
63 } else if err != nil {
64 return nil, err
65 }
66
67 // switch on the character
68 switch c {
69 case '{':
70 return &jsonToken{t: jttBeginObject, v: byte('{'), p: js.pos - 1}, nil
71 case '}':
72 return &jsonToken{t: jttEndObject, v: byte('}'), p: js.pos - 1}, nil
73 case '[':
74 return &jsonToken{t: jttBeginArray, v: byte('['), p: js.pos - 1}, nil
75 case ']':
76 return &jsonToken{t: jttEndArray, v: byte(']'), p: js.pos - 1}, nil
77 case ':':
78 return &jsonToken{t: jttColon, v: byte(':'), p: js.pos - 1}, nil
79 case ',':
80 return &jsonToken{t: jttComma, v: byte(','), p: js.pos - 1}, nil
81 case '"': // RFC-8259 only allows for double quotes (") not single (')
82 return js.scanString()
83 default:
84 // check if it's a number
85 if c == '-' || isDigit(c) {
86 return js.scanNumber(c)
87 } else if c == 't' || c == 'f' || c == 'n' {
88 // maybe a literal
89 return js.scanLiteral(c)
90 } else {
91 return nil, fmt.Errorf("invalid JSON input. Position: %d. Character: %c", js.pos-1, c)
92 }
93 }
94 }
95
96 // readNextByte attempts to read the next byte from the buffer. If the buffer
97 // has been exhausted, this function calls readIntoBuf, thus refilling the
98 // buffer and resetting the read position to 0
99 func (js *jsonScanner) readNextByte() (byte, error) {
100 if js.pos >= len(js.buf) {
101 err := js.readIntoBuf()
102
103 if err != nil {
104 return 0, err
105 }
106 }
107
108 b := js.buf[js.pos]
109 js.pos++
110
111 return b, nil
112 }
113
114 // readNNextBytes reads n bytes into dst, starting at offset
115 func (js *jsonScanner) readNNextBytes(dst []byte, n, offset int) error {
116 var err error
117
118 for i := 0; i < n; i++ {
119 dst[i+offset], err = js.readNextByte()
120 if err != nil {
121 return err
122 }
123 }
124
125 return nil
126 }
127
128 // readIntoBuf reads up to 512 bytes from the scanner's io.Reader into the buffer
129 func (js *jsonScanner) readIntoBuf() error {
130 if js.lastReadErr != nil {
131 js.buf = js.buf[:0]
132 js.pos = 0
133 return js.lastReadErr
134 }
135
136 if cap(js.buf) == 0 {
137 js.buf = make([]byte, 0, 512)
138 }
139
140 n, err := js.r.Read(js.buf[:cap(js.buf)])
141 if err != nil {
142 js.lastReadErr = err
143 if n > 0 {
144 err = nil
145 }
146 }
147 js.buf = js.buf[:n]
148 js.pos = 0
149
150 return err
151 }
152
// isWhiteSpace reports whether c is a space, horizontal tab, carriage return
// or line feed.
func isWhiteSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	default:
		return false
	}
}
156
// isDigit reports whether c, taken as a single code point, is a decimal digit
// according to unicode.IsDigit.
func isDigit(c byte) bool {
	r := rune(c)
	return unicode.IsDigit(r)
}
160
161 func isValueTerminator(c byte) bool {
162 return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
163 }
164
// getu4 interprets the first four bytes of s as hexadecimal digits (upper or
// lower case) and returns the value they spell as a rune. It returns -1 when s
// is shorter than four bytes or when any of the four bytes is not a hex digit.
// The leading "\u" of the escape sequence must not be part of s.
// It is copied and lightly modified from the Go JSON decode function at
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
func getu4(s []byte) rune {
	if len(s) < 4 {
		return -1
	}

	var decoded rune
	for i := 0; i < 4; i++ {
		var nibble byte
		switch h := s[i]; {
		case h >= '0' && h <= '9':
			nibble = h - '0'
		case h >= 'a' && h <= 'f':
			nibble = h - 'a' + 10
		case h >= 'A' && h <= 'F':
			nibble = h - 'A' + 10
		default:
			return -1
		}
		// Shift the digits seen so far up by one hex place and add this one.
		decoded = decoded<<4 | rune(nibble)
	}
	return decoded
}
189
190 // scanString reads from an opening '"' to a closing '"' and handles escaped characters
191 func (js *jsonScanner) scanString() (*jsonToken, error) {
192 var b bytes.Buffer
193 var c byte
194 var err error
195
196 p := js.pos - 1
197
198 for {
199 c, err = js.readNextByte()
200 if err != nil {
201 if err == io.EOF {
202 return nil, errors.New("end of input in JSON string")
203 }
204 return nil, err
205 }
206
207 evalNextChar:
208 switch c {
209 case '\\':
210 c, err = js.readNextByte()
211 if err != nil {
212 if err == io.EOF {
213 return nil, errors.New("end of input in JSON string")
214 }
215 return nil, err
216 }
217
218 evalNextEscapeChar:
219 switch c {
220 case '"', '\\', '/':
221 b.WriteByte(c)
222 case 'b':
223 b.WriteByte('\b')
224 case 'f':
225 b.WriteByte('\f')
226 case 'n':
227 b.WriteByte('\n')
228 case 'r':
229 b.WriteByte('\r')
230 case 't':
231 b.WriteByte('\t')
232 case 'u':
233 us := make([]byte, 4)
234 err = js.readNNextBytes(us, 4, 0)
235 if err != nil {
236 return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
237 }
238
239 rn := getu4(us)
240
241 // If the rune we just decoded is the high or low value of a possible surrogate pair,
242 // try to decode the next sequence as the low value of a surrogate pair. We're
243 // expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
244 // but need to handle cases where the input is not a valid surrogate pair.
245 // For more context on unicode surrogate pairs, see:
246 // https://www.christianfscott.com/rust-chars-vs-go-runes/
247 // https://www.unicode.org/glossary/#high_surrogate_code_point
248 if utf16.IsSurrogate(rn) {
249 c, err = js.readNextByte()
250 if err != nil {
251 if err == io.EOF {
252 return nil, errors.New("end of input in JSON string")
253 }
254 return nil, err
255 }
256
257 // If the next value isn't the beginning of a backslash escape sequence, write
258 // the Unicode replacement character for the surrogate value and goto the
259 // beginning of the next char eval block.
260 if c != '\\' {
261 b.WriteRune(unicode.ReplacementChar)
262 goto evalNextChar
263 }
264
265 c, err = js.readNextByte()
266 if err != nil {
267 if err == io.EOF {
268 return nil, errors.New("end of input in JSON string")
269 }
270 return nil, err
271 }
272
273 // If the next value isn't the beginning of a unicode escape sequence, write the
274 // Unicode replacement character for the surrogate value and goto the beginning
275 // of the next escape char eval block.
276 if c != 'u' {
277 b.WriteRune(unicode.ReplacementChar)
278 goto evalNextEscapeChar
279 }
280
281 err = js.readNNextBytes(us, 4, 0)
282 if err != nil {
283 return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
284 }
285
286 rn2 := getu4(us)
287
288 // Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
289 // the Unicode replacement character for the surrogate value and the 2nd decoded rune.
290 if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
291 b.WriteRune(rnPair)
292 } else {
293 b.WriteRune(unicode.ReplacementChar)
294 b.WriteRune(rn2)
295 }
296
297 break
298 }
299
300 b.WriteRune(rn)
301 default:
302 return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
303 }
304 case '"':
305 return &jsonToken{t: jttString, v: b.String(), p: p}, nil
306 default:
307 b.WriteByte(c)
308 }
309 }
310 }
311
312 // scanLiteral reads an unquoted sequence of characters and determines if it is one of
313 // three valid JSON literals (true, false, null); if so, it returns the appropriate
314 // jsonToken; otherwise, it returns an error
315 func (js *jsonScanner) scanLiteral(first byte) (*jsonToken, error) {
316 p := js.pos - 1
317
318 lit := make([]byte, 4)
319 lit[0] = first
320
321 err := js.readNNextBytes(lit, 3, 1)
322 if err != nil {
323 return nil, err
324 }
325
326 c5, err := js.readNextByte()
327
328 if bytes.Equal([]byte("true"), lit) && (isValueTerminator(c5) || err == io.EOF) {
329 js.pos = int(math.Max(0, float64(js.pos-1)))
330 return &jsonToken{t: jttBool, v: true, p: p}, nil
331 } else if bytes.Equal([]byte("null"), lit) && (isValueTerminator(c5) || err == io.EOF) {
332 js.pos = int(math.Max(0, float64(js.pos-1)))
333 return &jsonToken{t: jttNull, v: nil, p: p}, nil
334 } else if bytes.Equal([]byte("fals"), lit) {
335 if c5 == 'e' {
336 c5, err = js.readNextByte()
337
338 if isValueTerminator(c5) || err == io.EOF {
339 js.pos = int(math.Max(0, float64(js.pos-1)))
340 return &jsonToken{t: jttBool, v: false, p: p}, nil
341 }
342 }
343 }
344
345 return nil, fmt.Errorf("invalid JSON literal. Position: %d, literal: %s", p, lit)
346 }
347
// numberScanState is the state of the state machine that scanNumber uses to
// recognize a JSON number.
type numberScanState byte

const (
	// nssSawLeadingMinus: the number started with '-' and no digit has been
	// read yet.
	nssSawLeadingMinus numberScanState = iota
	// nssSawLeadingZero: the integer part is a single '0'.
	nssSawLeadingZero
	// nssSawIntegerDigits: one or more digits of an integer part that does not
	// start with '0' have been read.
	nssSawIntegerDigits
	// nssSawDecimalPoint: a '.' has been read and a fraction digit must follow.
	nssSawDecimalPoint
	// nssSawFractionDigits: one or more digits after the '.' have been read.
	nssSawFractionDigits
	// nssSawExponentLetter: an 'e' or 'E' has been read and a sign or digit
	// must follow.
	nssSawExponentLetter
	// nssSawExponentSign: the '+' or '-' of an exponent has been read and a
	// digit must follow.
	nssSawExponentSign
	// nssSawExponentDigits: one or more exponent digits have been read.
	nssSawExponentDigits
	// nssDone: the number ended at a terminator or at the end of the input.
	nssDone
	// nssInvalid: a byte was read that cannot appear at that point in a number.
	nssInvalid
)
362
363 // scanNumber reads a JSON number (according to RFC-8259)
364 func (js *jsonScanner) scanNumber(first byte) (*jsonToken, error) {
365 var b bytes.Buffer
366 var s numberScanState
367 var c byte
368 var err error
369
370 t := jttInt64 // assume it's an int64 until the type can be determined
371 start := js.pos - 1
372
373 b.WriteByte(first)
374
375 switch first {
376 case '-':
377 s = nssSawLeadingMinus
378 case '0':
379 s = nssSawLeadingZero
380 default:
381 s = nssSawIntegerDigits
382 }
383
384 for {
385 c, err = js.readNextByte()
386
387 if err != nil && err != io.EOF {
388 return nil, err
389 }
390
391 switch s {
392 case nssSawLeadingMinus:
393 switch c {
394 case '0':
395 s = nssSawLeadingZero
396 b.WriteByte(c)
397 default:
398 if isDigit(c) {
399 s = nssSawIntegerDigits
400 b.WriteByte(c)
401 } else {
402 s = nssInvalid
403 }
404 }
405 case nssSawLeadingZero:
406 switch c {
407 case '.':
408 s = nssSawDecimalPoint
409 b.WriteByte(c)
410 case 'e', 'E':
411 s = nssSawExponentLetter
412 b.WriteByte(c)
413 case '}', ']', ',':
414 s = nssDone
415 default:
416 if isWhiteSpace(c) || err == io.EOF {
417 s = nssDone
418 } else {
419 s = nssInvalid
420 }
421 }
422 case nssSawIntegerDigits:
423 switch c {
424 case '.':
425 s = nssSawDecimalPoint
426 b.WriteByte(c)
427 case 'e', 'E':
428 s = nssSawExponentLetter
429 b.WriteByte(c)
430 case '}', ']', ',':
431 s = nssDone
432 default:
433 if isWhiteSpace(c) || err == io.EOF {
434 s = nssDone
435 } else if isDigit(c) {
436 s = nssSawIntegerDigits
437 b.WriteByte(c)
438 } else {
439 s = nssInvalid
440 }
441 }
442 case nssSawDecimalPoint:
443 t = jttDouble
444 if isDigit(c) {
445 s = nssSawFractionDigits
446 b.WriteByte(c)
447 } else {
448 s = nssInvalid
449 }
450 case nssSawFractionDigits:
451 switch c {
452 case 'e', 'E':
453 s = nssSawExponentLetter
454 b.WriteByte(c)
455 case '}', ']', ',':
456 s = nssDone
457 default:
458 if isWhiteSpace(c) || err == io.EOF {
459 s = nssDone
460 } else if isDigit(c) {
461 s = nssSawFractionDigits
462 b.WriteByte(c)
463 } else {
464 s = nssInvalid
465 }
466 }
467 case nssSawExponentLetter:
468 t = jttDouble
469 switch c {
470 case '+', '-':
471 s = nssSawExponentSign
472 b.WriteByte(c)
473 default:
474 if isDigit(c) {
475 s = nssSawExponentDigits
476 b.WriteByte(c)
477 } else {
478 s = nssInvalid
479 }
480 }
481 case nssSawExponentSign:
482 if isDigit(c) {
483 s = nssSawExponentDigits
484 b.WriteByte(c)
485 } else {
486 s = nssInvalid
487 }
488 case nssSawExponentDigits:
489 switch c {
490 case '}', ']', ',':
491 s = nssDone
492 default:
493 if isWhiteSpace(c) || err == io.EOF {
494 s = nssDone
495 } else if isDigit(c) {
496 s = nssSawExponentDigits
497 b.WriteByte(c)
498 } else {
499 s = nssInvalid
500 }
501 }
502 }
503
504 switch s {
505 case nssInvalid:
506 return nil, fmt.Errorf("invalid JSON number. Position: %d", start)
507 case nssDone:
508 js.pos = int(math.Max(0, float64(js.pos-1)))
509 if t != jttDouble {
510 v, err := strconv.ParseInt(b.String(), 10, 64)
511 if err == nil {
512 if v < math.MinInt32 || v > math.MaxInt32 {
513 return &jsonToken{t: jttInt64, v: v, p: start}, nil
514 }
515
516 return &jsonToken{t: jttInt32, v: int32(v), p: start}, nil
517 }
518 }
519
520 v, err := strconv.ParseFloat(b.String(), 64)
521 if err != nil {
522 return nil, err
523 }
524
525 return &jsonToken{t: jttDouble, v: v, p: start}, nil
526 }
527 }
528 }
529