1 package toml
2 3 import (
4 "fmt"
5 "reflect"
6 "runtime"
7 "strings"
8 "unicode"
9 "unicode/utf8"
10 )
// itemType identifies the kind of item (token) the lexer emits.
type itemType int

// The item types. The values are assigned with iota, so their order is
// significant for anything that indexes or compares them.
const (
	// itemError is sent by the lexer's error helpers; the item's err field
	// carries the error.
	itemError itemType = iota
	// itemEOF is emitted by lexTop and lexTopEnd when the input ends.
	itemEOF
	itemText
	itemString
	// itemStringEsc is emitted by lexString instead of itemString when an
	// escape sequence was seen in the string.
	itemStringEsc
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
// eof is the rune value lexer.next returns once the input is exhausted.
const eof = 0

// stateFn is one state of the lexer's state machine. It consumes some input
// from lx and returns the state to run next. The lexer's error helpers return
// a nil stateFn after queueing an error item.
type stateFn func(lx *lexer) stateFn
43 44 func (p Position) String() string {
45 return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
46 }
// lexer holds the state of the TOML lexer.
//
// input is the text being lexed; start and pos are byte offsets into it that
// delimit the pending item (see current). line is the current line number
// (lex starts it at 1). state is the next state function nextItem will run,
// and items is the buffered queue of emitted items that nextItem drains. esc
// is set by lexStringEscape and makes lexString emit itemStringEsc instead of
// itemString.
type lexer struct {
	input string
	start int
	pos   int
	line  int
	state stateFn
	items chan item
	esc   bool

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
// item is a single token produced by the lexer.
//
// typ is the kind of token, val is the lexed text (left empty by the error
// helpers), err is set for itemError items, and pos is where the item was
// found in the input.
type item struct {
	typ itemType
	val string
	err error
	pos Position
}
78 79 func (lx *lexer) nextItem() item {
80 for {
81 select {
82 case item := <-lx.items:
83 return item
84 default:
85 lx.state = lx.state(lx)
86 //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack)
87 }
88 }
89 }
90 91 func lex(input string) *lexer {
92 lx := &lexer{
93 input: input,
94 state: lexTop,
95 items: make(chan item, 10),
96 stack: make([]stateFn, 0, 10),
97 line: 1,
98 }
99 return lx
100 }
// push saves state on top of the state stack so a later pop can resume it.
func (lx *lexer) push(state stateFn) {
	lx.stack = append(lx.stack, state)
}
105 106 func (lx *lexer) pop() stateFn {
107 if len(lx.stack) == 0 {
108 panic("BUG in lexer: no states to pop")
109 }
110 last := lx.stack[len(lx.stack)-1]
111 lx.stack = lx.stack[0 : len(lx.stack)-1]
112 return last
113 }
114 115 func (lx *lexer) current() string {
116 return lx.input[lx.start:lx.pos]
117 }
118 119 func (lx lexer) getPos() Position {
120 p := Position{
121 Line: lx.line,
122 Start: lx.start,
123 Len: lx.pos - lx.start,
124 }
125 if p.Len <= 0 {
126 p.Len = 1
127 }
128 return p
129 }
130 131 func (lx *lexer) emit(typ itemType) {
132 // Needed for multiline strings ending with an incomplete UTF-8 sequence.
133 if lx.start > lx.pos {
134 lx.error(errLexUTF8{lx.input[lx.pos]})
135 return
136 }
137 lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
138 lx.start = lx.pos
139 }
140 141 func (lx *lexer) emitTrim(typ itemType) {
142 lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
143 lx.start = lx.pos
144 }
// next consumes and returns the next rune of the input.
//
// At the end of the input it sets atEOF and returns eof; calling next again
// without an intervening backup panics. On a byte that is not valid UTF-8, on
// a control character, or on a '\r' that is not followed by '\n', it queues an
// error item and returns utf8.RuneError without advancing pos.
func (lx *lexer) next() (r rune) {
	if lx.atEOF {
		panic("BUG in lexer: next called after EOF")
	}
	if lx.pos >= len(lx.input) {
		lx.atEOF = true
		return eof
	}

	// The line counter is bumped when the rune about to be read is a
	// newline; backup undoes this.
	if lx.input[lx.pos] == '\n' {
		lx.line++
	}
	// Shift the width history to make room for this rune's width.
	lx.prevWidths[3] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[0]
	if lx.nprev < 4 {
		lx.nprev++
	}

	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
	// RuneError with width 1 is how the decoder reports an invalid byte.
	if r == utf8.RuneError && w == 1 {
		lx.error(errLexUTF8{lx.input[lx.pos]})
		return utf8.RuneError
	}

	// Note: don't use peek() here, as this calls next().
	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
		lx.errorControlChar(r)
		return utf8.RuneError
	}

	lx.prevWidths[0] = w
	lx.pos += w
	return r
}
// ignore skips over the pending input before this point: it moves start up
// to pos, so the skipped text is not part of the next emitted item.
func (lx *lexer) ignore() {
	lx.start = lx.pos
}
// backup steps back one rune. Can be called 4 times between calls to next.
//
// If the last call to next hit the end of the input, backup only clears the
// atEOF flag and does not move pos. Backing up with no recorded rune widths
// left is a lexer bug and panics.
func (lx *lexer) backup() {
	if lx.atEOF {
		lx.atEOF = false
		return
	}
	if lx.nprev < 1 {
		panic("BUG in lexer: backed up too far")
	}
	// Pop the most recent width off the history.
	w := lx.prevWidths[0]
	lx.prevWidths[0] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[3]
	lx.nprev--

	lx.pos -= w
	// Undo the line increment next made when it read this newline.
	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
		lx.line--
	}
}
207 208 // accept consumes the next rune if it's equal to `valid`.
209 func (lx *lexer) accept(valid rune) bool {
210 if lx.next() == valid {
211 return true
212 }
213 lx.backup()
214 return false
215 }
216 217 // peek returns but does not consume the next rune in the input.
218 func (lx *lexer) peek() rune {
219 r := lx.next()
220 lx.backup()
221 return r
222 }
223 224 // skip ignores all input that matches the given predicate.
225 func (lx *lexer) skip(pred func(rune) bool) {
226 for {
227 r := lx.next()
228 if pred(r) {
229 continue
230 }
231 lx.backup()
232 lx.ignore()
233 return
234 }
235 }
236 237 // error stops all lexing by emitting an error and returning `nil`.
238 //
239 // Note that any value that is a character is escaped if it's a special
240 // character (newlines, tabs, etc.).
241 func (lx *lexer) error(err error) stateFn {
242 if lx.atEOF {
243 return lx.errorPrevLine(err)
244 }
245 lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
246 return nil
247 }
248 249 // errorfPrevline is like error(), but sets the position to the last column of
250 // the previous line.
251 //
252 // This is so that unexpected EOF or NL errors don't show on a new blank line.
253 func (lx *lexer) errorPrevLine(err error) stateFn {
254 pos := lx.getPos()
255 pos.Line--
256 pos.Len = 1
257 pos.Start = lx.pos - 1
258 lx.items <- item{typ: itemError, pos: pos, err: err}
259 return nil
260 }
261 262 // errorPos is like error(), but allows explicitly setting the position.
263 func (lx *lexer) errorPos(start, length int, err error) stateFn {
264 pos := lx.getPos()
265 pos.Start = start
266 pos.Len = length
267 lx.items <- item{typ: itemError, pos: pos, err: err}
268 return nil
269 }
270 271 // errorf is like error, and creates a new error.
272 func (lx *lexer) errorf(format string, values ...any) stateFn {
273 if lx.atEOF {
274 pos := lx.getPos()
275 if lx.pos >= 1 && lx.input[lx.pos-1] == '\n' {
276 pos.Line--
277 }
278 pos.Len = 1
279 pos.Start = lx.pos - 1
280 lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
281 return nil
282 }
283 lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
284 return nil
285 }
286 287 func (lx *lexer) errorControlChar(cc rune) stateFn {
288 return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
289 }
290 291 // lexTop consumes elements at the top level of TOML data.
292 func lexTop(lx *lexer) stateFn {
293 r := lx.next()
294 if isWhitespace(r) || isNL(r) {
295 return lexSkip(lx, lexTop)
296 }
297 switch r {
298 case '#':
299 lx.push(lexTop)
300 return lexCommentStart
301 case '[':
302 return lexTableStart
303 case eof:
304 if lx.pos > lx.start {
305 // TODO: never reached? I think this can only occur on a bug in the
306 // lexer(?)
307 return lx.errorf("unexpected EOF")
308 }
309 lx.emit(itemEOF)
310 return nil
311 }
312 313 // At this point, the only valid item can be a key, so we back up
314 // and let the key lexer do the rest.
315 lx.backup()
316 lx.push(lexTopEnd)
317 return lexKeyStart
318 }
319 320 // lexTopEnd is entered whenever a top-level item has been consumed. (A value
321 // or a table.) It must see only whitespace, and will turn back to lexTop
322 // upon a newline. If it sees EOF, it will quit the lexer successfully.
323 func lexTopEnd(lx *lexer) stateFn {
324 r := lx.next()
325 switch {
326 case r == '#':
327 // a comment will read to a newline for us.
328 lx.push(lexTop)
329 return lexCommentStart
330 case isWhitespace(r):
331 return lexTopEnd
332 case isNL(r):
333 lx.ignore()
334 return lexTop
335 case r == eof:
336 lx.emit(itemEOF)
337 return nil
338 }
339 return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r)
340 }
341 342 // lexTable lexes the beginning of a table. Namely, it makes sure that
343 // it starts with a character other than '.' and ']'.
344 // It assumes that '[' has already been consumed.
345 // It also handles the case that this is an item in an array of tables.
346 // e.g., '[[name]]'.
347 func lexTableStart(lx *lexer) stateFn {
348 if lx.peek() == '[' {
349 lx.next()
350 lx.emit(itemArrayTableStart)
351 lx.push(lexArrayTableEnd)
352 } else {
353 lx.emit(itemTableStart)
354 lx.push(lexTableEnd)
355 }
356 return lexTableNameStart
357 }
// lexTableEnd emits itemTableEnd and moves on to lexTopEnd. lexTableStart
// pushes it for plain '[name]' headers, and lexTableNameEnd pops it once the
// closing ']' has been consumed.
func lexTableEnd(lx *lexer) stateFn {
	lx.emit(itemTableEnd)
	return lexTopEnd
}
363 364 func lexArrayTableEnd(lx *lexer) stateFn {
365 if r := lx.next(); r != ']' {
366 return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
367 }
368 lx.emit(itemArrayTableEnd)
369 return lexTopEnd
370 }
371 372 func lexTableNameStart(lx *lexer) stateFn {
373 lx.skip(isWhitespace)
374 switch r := lx.peek(); {
375 case r == ']' || r == eof:
376 return lx.errorf("unexpected end of table name (table names cannot be empty)")
377 case r == '.':
378 return lx.errorf("unexpected table separator (table names cannot be empty)")
379 case r == '"' || r == '\'':
380 lx.ignore()
381 lx.push(lexTableNameEnd)
382 return lexQuotedName
383 default:
384 lx.push(lexTableNameEnd)
385 return lexBareName
386 }
387 }
388 389 // lexTableNameEnd reads the end of a piece of a table name, optionally
390 // consuming whitespace.
391 func lexTableNameEnd(lx *lexer) stateFn {
392 lx.skip(isWhitespace)
393 switch r := lx.next(); {
394 case r == '.':
395 lx.ignore()
396 return lexTableNameStart
397 case r == ']':
398 return lx.pop()
399 default:
400 return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
401 }
402 }
403 404 // lexBareName lexes one part of a key or table.
405 //
406 // It assumes that at least one valid character for the table has already been
407 // read.
408 //
409 // Lexes only one part, e.g. only 'a' inside 'a.b'.
410 func lexBareName(lx *lexer) stateFn {
411 r := lx.next()
412 if isBareKeyChar(r) {
413 return lexBareName
414 }
415 lx.backup()
416 lx.emit(itemText)
417 return lx.pop()
418 }
419 420 // lexQuotedName lexes one part of a quoted key or table name. It assumes that
421 // it starts lexing at the quote itself (" or ').
422 //
423 // Lexes only one part, e.g. only '"a"' inside '"a".b'.
424 func lexQuotedName(lx *lexer) stateFn {
425 r := lx.next()
426 switch {
427 case r == '"':
428 lx.ignore() // ignore the '"'
429 return lexString
430 case r == '\'':
431 lx.ignore() // ignore the "'"
432 return lexRawString
433 434 // TODO: I don't think any of the below conditions can ever be reached?
435 case isWhitespace(r):
436 return lexSkip(lx, lexValue)
437 case r == eof:
438 return lx.errorf("unexpected EOF; expected value")
439 default:
440 return lx.errorf("expected value but found %q instead", r)
441 }
442 }
443 444 // lexKeyStart consumes all key parts until a '='.
445 func lexKeyStart(lx *lexer) stateFn {
446 lx.skip(isWhitespace)
447 switch r := lx.peek(); {
448 case r == '=' || r == eof:
449 return lx.errorf("unexpected '=': key name appears blank")
450 case r == '.':
451 return lx.errorf("unexpected '.': keys cannot start with a '.'")
452 case r == '"' || r == '\'':
453 lx.ignore()
454 fallthrough
455 default: // Bare key
456 lx.emit(itemKeyStart)
457 return lexKeyNameStart
458 }
459 }
460 461 func lexKeyNameStart(lx *lexer) stateFn {
462 lx.skip(isWhitespace)
463 switch r := lx.peek(); {
464 default:
465 lx.push(lexKeyEnd)
466 return lexBareName
467 case r == '"' || r == '\'':
468 lx.ignore()
469 lx.push(lexKeyEnd)
470 return lexQuotedName
471 472 // TODO: I think these can never be reached?
473 case r == '=' || r == eof:
474 return lx.errorf("unexpected '='")
475 case r == '.':
476 return lx.errorf("unexpected '.'")
477 }
478 }
479 480 // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
481 // separator).
482 func lexKeyEnd(lx *lexer) stateFn {
483 lx.skip(isWhitespace)
484 switch r := lx.next(); {
485 case isWhitespace(r):
486 return lexSkip(lx, lexKeyEnd)
487 case r == eof: // TODO: never reached
488 return lx.errorf("unexpected EOF; expected key separator '='")
489 case r == '.':
490 lx.ignore()
491 return lexKeyNameStart
492 case r == '=':
493 lx.emit(itemKeyEnd)
494 return lexSkip(lx, lexValue)
495 default:
496 if r == '\n' {
497 return lx.errorPrevLine(fmt.Errorf("expected '.' or '=', but got %q instead", r))
498 }
499 return lx.errorf("expected '.' or '=', but got %q instead", r)
500 }
501 }
// lexValue starts the consumption of a value anywhere a value is expected.
// lexValue will ignore whitespace.
// After a value is lexed, the last state on the stack is popped and returned.
//
// It looks only at the first rune(s) to decide which specialised state lexes
// the rest: numbers and datetimes, arrays, inline tables, the four string
// forms, inf/nan, and booleans.
func lexValue(lx *lexer) stateFn {
	// We allow whitespace to precede a value, but NOT newlines.
	// In array syntax, the array states are responsible for ignoring newlines.
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexValue)
	case isDigit(r):
		lx.backup() // avoid an extra state and use the same as above
		return lexNumberOrDateStart
	}
	switch r {
	case '[':
		lx.ignore()
		lx.emit(itemArray)
		return lexArrayValue
	case '{':
		lx.ignore()
		lx.emit(itemInlineTableStart)
		return lexInlineTableValue
	case '"':
		if lx.accept('"') {
			if lx.accept('"') {
				lx.ignore() // Ignore """
				return lexMultilineString
			}
			// Exactly two quotes: an empty string. Put the second quote
			// back so lexString sees it as the closing quote.
			lx.backup()
		}
		lx.ignore() // ignore the '"'
		return lexString
	case '\'':
		if lx.accept('\'') {
			if lx.accept('\'') {
				lx.ignore() // Ignore '''
				return lexMultilineRawString
			}
			// Exactly two quotes: an empty raw string (see above).
			lx.backup()
		}
		lx.ignore() // ignore the "'"
		return lexRawString
	case '.': // special error case, be kind to users
		return lx.errorf("floats must start with a digit, not '.'")
	case 'i', 'n':
		// Unsigned "inf" / "nan". If neither matches we fall out of the
		// switch and the letter handling below takes over.
		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
			lx.emit(itemFloat)
			return lx.pop()
		}
	case '-', '+':
		return lexDecimalNumberStart
	}
	if unicode.IsLetter(r) {
		// Be permissive here; lexBool will give a nice error if the
		// user wrote something like
		//   x = foo
		// (i.e. not 'true' or 'false' but is something else word-like.)
		lx.backup()
		return lexBool
	}
	if r == eof {
		return lx.errorf("unexpected EOF; expected value")
	}
	if r == '\n' {
		return lx.errorPrevLine(fmt.Errorf("expected value but found %q instead", r))
	}
	return lx.errorf("expected value but found %q instead", r)
}
572 573 // lexArrayValue consumes one value in an array. It assumes that '[' or ','
574 // have already been consumed. All whitespace and newlines are ignored.
575 func lexArrayValue(lx *lexer) stateFn {
576 r := lx.next()
577 switch {
578 case isWhitespace(r) || isNL(r):
579 return lexSkip(lx, lexArrayValue)
580 case r == '#':
581 lx.push(lexArrayValue)
582 return lexCommentStart
583 case r == ',':
584 return lx.errorf("unexpected comma")
585 case r == ']':
586 return lexArrayEnd
587 }
588 589 lx.backup()
590 lx.push(lexArrayValueEnd)
591 return lexValue
592 }
593 594 // lexArrayValueEnd consumes everything between the end of an array value and
595 // the next value (or the end of the array): it ignores whitespace and newlines
596 // and expects either a ',' or a ']'.
597 func lexArrayValueEnd(lx *lexer) stateFn {
598 switch r := lx.next(); {
599 case isWhitespace(r) || isNL(r):
600 return lexSkip(lx, lexArrayValueEnd)
601 case r == '#':
602 lx.push(lexArrayValueEnd)
603 return lexCommentStart
604 case r == ',':
605 lx.ignore()
606 return lexArrayValue // move on to the next value
607 case r == ']':
608 return lexArrayEnd
609 default:
610 return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
611 }
612 }
// lexArrayEnd finishes the lexing of an array.
// It assumes that a ']' has just been consumed.
//
// The ']' is discarded, itemArrayEnd is emitted with an empty value, and the
// state that was pushed before the array value is resumed.
func lexArrayEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemArrayEnd)
	return lx.pop()
}
621 622 // lexInlineTableValue consumes one key/value pair in an inline table.
623 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
624 func lexInlineTableValue(lx *lexer) stateFn {
625 r := lx.next()
626 switch {
627 case isWhitespace(r):
628 return lexSkip(lx, lexInlineTableValue)
629 case isNL(r):
630 return lexSkip(lx, lexInlineTableValue)
631 case r == '#':
632 lx.push(lexInlineTableValue)
633 return lexCommentStart
634 case r == ',':
635 return lx.errorf("unexpected comma")
636 case r == '}':
637 return lexInlineTableEnd
638 }
639 lx.backup()
640 lx.push(lexInlineTableValueEnd)
641 return lexKeyStart
642 }
643 644 // lexInlineTableValueEnd consumes everything between the end of an inline table
645 // key/value pair and the next pair (or the end of the table):
646 // it ignores whitespace and expects either a ',' or a '}'.
647 func lexInlineTableValueEnd(lx *lexer) stateFn {
648 switch r := lx.next(); {
649 case isWhitespace(r):
650 return lexSkip(lx, lexInlineTableValueEnd)
651 case isNL(r):
652 return lexSkip(lx, lexInlineTableValueEnd)
653 case r == '#':
654 lx.push(lexInlineTableValueEnd)
655 return lexCommentStart
656 case r == ',':
657 lx.ignore()
658 lx.skip(isWhitespace)
659 if lx.peek() == '}' {
660 return lexInlineTableValueEnd
661 }
662 return lexInlineTableValue
663 case r == '}':
664 return lexInlineTableEnd
665 default:
666 return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
667 }
668 }
669 670 func runeOrEOF(r rune) string {
671 if r == eof {
672 return "end of file"
673 }
674 return "'" + string(r) + "'"
675 }
// lexInlineTableEnd finishes the lexing of an inline table.
// It assumes that a '}' has just been consumed.
//
// The '}' is discarded, itemInlineTableEnd is emitted with an empty value,
// and the state that was pushed before the inline table is resumed.
func lexInlineTableEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemInlineTableEnd)
	return lx.pop()
}
684 685 // lexString consumes the inner contents of a string. It assumes that the
686 // beginning '"' has already been consumed and ignored.
687 func lexString(lx *lexer) stateFn {
688 r := lx.next()
689 switch {
690 case r == eof:
691 return lx.errorf(`unexpected EOF; expected '"'`)
692 case isNL(r):
693 return lx.errorPrevLine(errLexStringNL{})
694 case r == '\\':
695 lx.push(lexString)
696 return lexStringEscape
697 case r == '"':
698 lx.backup()
699 if lx.esc {
700 lx.esc = false
701 lx.emit(itemStringEsc)
702 } else {
703 lx.emit(itemString)
704 }
705 lx.next()
706 lx.ignore()
707 return lx.pop()
708 }
709 return lexString
710 }
// lexMultilineString consumes the inner contents of a string. It assumes that
// the beginning '"""' has already been consumed and ignored.
//
// The contents up to the closing '"""' are emitted as itemMultilineString.
// One or two quotes directly in front of the closing delimiter belong to the
// string.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineString
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\\':
		return lexMultilineStringEscape
	case '"':
		/// Found " → try to read two more "".
		if lx.accept('"') {
			if lx.accept('"') {
				/// Peek ahead: the string can contain " and "", including at the
				/// end: """str"""""
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '"' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					///
					/// Second check is for the edge case:
					///
					///            two quotes allowed.
					///            vv
					///   """lol \""""""
					///          ^^  ^^^---- closing three
					///     escaped
					///
					/// Bit ugly, but it works
					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					/// Keep the first quote as content and retry from the
					/// second one.
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() /// backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				lx.esc = false
				lx.emit(itemMultilineString)
				lx.next() /// Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineString
	}
}
767 768 // lexRawString consumes a raw string. Nothing can be escaped in such a string.
769 // It assumes that the beginning "'" has already been consumed and ignored.
770 func lexRawString(lx *lexer) stateFn {
771 r := lx.next()
772 switch {
773 default:
774 return lexRawString
775 case r == eof:
776 return lx.errorf(`unexpected EOF; expected "'"`)
777 case isNL(r):
778 return lx.errorPrevLine(errLexStringNL{})
779 case r == '\'':
780 lx.backup()
781 lx.emit(itemRawString)
782 lx.next()
783 lx.ignore()
784 return lx.pop()
785 }
786 }
// lexMultilineRawString consumes a raw string. Nothing can be escaped in such a
// string. It assumes that the beginning triple-' has already been consumed and
// ignored.
//
// The contents up to the closing ''' are emitted as itemRawMultilineString.
// One or two quotes directly in front of the closing delimiter belong to the
// string.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineRawString
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\'':
		/// Found ' → try to read two more ''.
		if lx.accept('\'') {
			if lx.accept('\'') {
				/// Peek ahead: the string can contain ' and '', including at the
				/// end: '''str'''''
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '\'' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					/// Keep the first quote as content and retry from the
					/// second one.
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() /// backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() /// Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineRawString
	}
}
831 832 // lexMultilineStringEscape consumes an escaped character. It assumes that the
833 // preceding '\\' has already been consumed.
834 func lexMultilineStringEscape(lx *lexer) stateFn {
835 if isNL(lx.next()) { /// \ escaping newline.
836 return lexMultilineString
837 }
838 lx.backup()
839 lx.push(lexMultilineString)
840 return lexStringEscape(lx)
841 }
842 843 func lexStringEscape(lx *lexer) stateFn {
844 lx.esc = true
845 r := lx.next()
846 switch r {
847 case 'e':
848 fallthrough
849 case 'b':
850 fallthrough
851 case 't':
852 fallthrough
853 case 'n':
854 fallthrough
855 case 'f':
856 fallthrough
857 case 'r':
858 fallthrough
859 case '"':
860 fallthrough
861 case ' ', '\t':
862 // Inside """ .. """ strings you can use \ to escape newlines, and any
863 // amount of whitespace can be between the \ and \n.
864 fallthrough
865 case '\\':
866 return lx.pop()
867 case 'x':
868 return lexHexEscape
869 case 'u':
870 return lexShortUnicodeEscape
871 case 'U':
872 return lexLongUnicodeEscape
873 }
874 return lx.error(errLexEscape{r})
875 }
876 877 func lexHexEscape(lx *lexer) stateFn {
878 var r rune
879 for i := 0; i < 2; i++ {
880 r = lx.next()
881 if !isHex(r) {
882 return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current())
883 }
884 }
885 return lx.pop()
886 }
887 888 func lexShortUnicodeEscape(lx *lexer) stateFn {
889 var r rune
890 for i := 0; i < 4; i++ {
891 r = lx.next()
892 if !isHex(r) {
893 return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current())
894 }
895 }
896 return lx.pop()
897 }
898 899 func lexLongUnicodeEscape(lx *lexer) stateFn {
900 var r rune
901 for i := 0; i < 8; i++ {
902 r = lx.next()
903 if !isHex(r) {
904 return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current())
905 }
906 }
907 return lx.pop()
908 }
909 910 // lexNumberOrDateStart processes the first character of a value which begins
911 // with a digit. It exists to catch values starting with '0', so that
912 // lexBaseNumberOrDate can differentiate base prefixed integers from other
913 // types.
914 func lexNumberOrDateStart(lx *lexer) stateFn {
915 if lx.next() == '0' {
916 return lexBaseNumberOrDate
917 }
918 return lexNumberOrDate
919 }
920 921 // lexNumberOrDate consumes either an integer, float or datetime.
922 func lexNumberOrDate(lx *lexer) stateFn {
923 r := lx.next()
924 if isDigit(r) {
925 return lexNumberOrDate
926 }
927 switch r {
928 case '-', ':':
929 return lexDatetime
930 case '_':
931 return lexDecimalNumber
932 case '.', 'e', 'E':
933 return lexFloat
934 }
935 936 lx.backup()
937 lx.emit(itemInteger)
938 return lx.pop()
939 }
940 941 // lexDatetime consumes a Datetime, to a first approximation.
942 // The parser validates that it matches one of the accepted formats.
943 func lexDatetime(lx *lexer) stateFn {
944 r := lx.next()
945 if isDigit(r) {
946 return lexDatetime
947 }
948 switch r {
949 case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
950 return lexDatetime
951 }
952 953 lx.backup()
954 lx.emitTrim(itemDatetime)
955 return lx.pop()
956 }
957 958 // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
959 func lexHexInteger(lx *lexer) stateFn {
960 r := lx.next()
961 if isHex(r) {
962 return lexHexInteger
963 }
964 switch r {
965 case '_':
966 return lexHexInteger
967 }
968 969 lx.backup()
970 lx.emit(itemInteger)
971 return lx.pop()
972 }
973 974 // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
975 func lexOctalInteger(lx *lexer) stateFn {
976 r := lx.next()
977 if isOctal(r) {
978 return lexOctalInteger
979 }
980 switch r {
981 case '_':
982 return lexOctalInteger
983 }
984 985 lx.backup()
986 lx.emit(itemInteger)
987 return lx.pop()
988 }
989 990 // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
991 func lexBinaryInteger(lx *lexer) stateFn {
992 r := lx.next()
993 if isBinary(r) {
994 return lexBinaryInteger
995 }
996 switch r {
997 case '_':
998 return lexBinaryInteger
999 }
1000 1001 lx.backup()
1002 lx.emit(itemInteger)
1003 return lx.pop()
1004 }
1005 1006 // lexDecimalNumber consumes a decimal float or integer.
1007 func lexDecimalNumber(lx *lexer) stateFn {
1008 r := lx.next()
1009 if isDigit(r) {
1010 return lexDecimalNumber
1011 }
1012 switch r {
1013 case '.', 'e', 'E':
1014 return lexFloat
1015 case '_':
1016 return lexDecimalNumber
1017 }
1018 1019 lx.backup()
1020 lx.emit(itemInteger)
1021 return lx.pop()
1022 }
1023 1024 // lexDecimalNumber consumes the first digit of a number beginning with a sign.
1025 // It assumes the sign has already been consumed. Values which start with a sign
1026 // are only allowed to be decimal integers or floats.
1027 //
1028 // The special "nan" and "inf" values are also recognized.
1029 func lexDecimalNumberStart(lx *lexer) stateFn {
1030 r := lx.next()
1031 1032 // Special error cases to give users better error messages
1033 switch r {
1034 case 'i':
1035 if !lx.accept('n') || !lx.accept('f') {
1036 return lx.errorf("invalid float: '%s'", lx.current())
1037 }
1038 lx.emit(itemFloat)
1039 return lx.pop()
1040 case 'n':
1041 if !lx.accept('a') || !lx.accept('n') {
1042 return lx.errorf("invalid float: '%s'", lx.current())
1043 }
1044 lx.emit(itemFloat)
1045 return lx.pop()
1046 case '0':
1047 p := lx.peek()
1048 switch p {
1049 case 'b', 'o', 'x':
1050 return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
1051 }
1052 case '.':
1053 return lx.errorf("floats must start with a digit, not '.'")
1054 }
1055 1056 if isDigit(r) {
1057 return lexDecimalNumber
1058 }
1059 1060 return lx.errorf("expected a digit but got %q", r)
1061 }
1062 1063 // lexBaseNumberOrDate differentiates between the possible values which
1064 // start with '0'. It assumes that before reaching this state, the initial '0'
1065 // has been consumed.
1066 func lexBaseNumberOrDate(lx *lexer) stateFn {
1067 r := lx.next()
1068 // Note: All datetimes start with at least two digits, so we don't
1069 // handle date characters (':', '-', etc.) here.
1070 if isDigit(r) {
1071 return lexNumberOrDate
1072 }
1073 switch r {
1074 case '_':
1075 // Can only be decimal, because there can't be an underscore
1076 // between the '0' and the base designator, and dates can't
1077 // contain underscores.
1078 return lexDecimalNumber
1079 case '.', 'e', 'E':
1080 return lexFloat
1081 case 'b':
1082 r = lx.peek()
1083 if !isBinary(r) {
1084 lx.errorf("not a binary number: '%s%c'", lx.current(), r)
1085 }
1086 return lexBinaryInteger
1087 case 'o':
1088 r = lx.peek()
1089 if !isOctal(r) {
1090 lx.errorf("not an octal number: '%s%c'", lx.current(), r)
1091 }
1092 return lexOctalInteger
1093 case 'x':
1094 r = lx.peek()
1095 if !isHex(r) {
1096 lx.errorf("not a hexadecimal number: '%s%c'", lx.current(), r)
1097 }
1098 return lexHexInteger
1099 }
1100 1101 lx.backup()
1102 lx.emit(itemInteger)
1103 return lx.pop()
1104 }
1105 1106 // lexFloat consumes the elements of a float. It allows any sequence of
1107 // float-like characters, so floats emitted by the lexer are only a first
1108 // approximation and must be validated by the parser.
1109 func lexFloat(lx *lexer) stateFn {
1110 r := lx.next()
1111 if isDigit(r) {
1112 return lexFloat
1113 }
1114 switch r {
1115 case '_', '.', '-', '+', 'e', 'E':
1116 return lexFloat
1117 }
1118 1119 lx.backup()
1120 lx.emit(itemFloat)
1121 return lx.pop()
1122 }
1123 1124 // lexBool consumes a bool string: 'true' or 'false.
1125 func lexBool(lx *lexer) stateFn {
1126 var rs []rune
1127 for {
1128 r := lx.next()
1129 if !unicode.IsLetter(r) {
1130 lx.backup()
1131 break
1132 }
1133 rs = append(rs, r)
1134 }
1135 s := string(rs)
1136 switch s {
1137 case "true", "false":
1138 lx.emit(itemBool)
1139 return lx.pop()
1140 }
1141 return lx.errorf("expected value but found %q instead", s)
1142 }
// lexCommentStart begins the lexing of a comment. It will emit
// itemCommentStart and consume no characters, passing control to lexComment.
//
// The pending input is discarded first, so the emitted item has an empty
// value.
func lexCommentStart(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemCommentStart)
	return lexComment
}
1151 1152 // lexComment lexes an entire comment. It assumes that '#' has been consumed.
1153 // It will consume *up to* the first newline character, and pass control
1154 // back to the last state on the stack.
1155 func lexComment(lx *lexer) stateFn {
1156 switch r := lx.next(); {
1157 case isNL(r) || r == eof:
1158 lx.backup()
1159 lx.emit(itemText)
1160 return lx.pop()
1161 default:
1162 return lexComment
1163 }
1164 }
// lexSkip ignores all slurped input and moves on to the next state.
//
// It is a helper called from other states, not a state itself: it discards
// the pending input and returns nextState unchanged.
func lexSkip(lx *lexer, nextState stateFn) stateFn {
	lx.ignore()
	return nextState
}
1171 1172 func (s stateFn) String() string {
1173 if s == nil {
1174 return "<nil>"
1175 }
1176 name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
1177 if i := strings.LastIndexByte(name, '.'); i > -1 {
1178 name = name[i+1:]
1179 }
1180 return name + "()"
1181 }
1182 1183 func (itype itemType) String() string {
1184 switch itype {
1185 case itemError:
1186 return "Error"
1187 case itemEOF:
1188 return "EOF"
1189 case itemText:
1190 return "Text"
1191 case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString:
1192 return "String"
1193 case itemBool:
1194 return "Bool"
1195 case itemInteger:
1196 return "Integer"
1197 case itemFloat:
1198 return "Float"
1199 case itemDatetime:
1200 return "DateTime"
1201 case itemArray:
1202 return "Array"
1203 case itemArrayEnd:
1204 return "ArrayEnd"
1205 case itemTableStart:
1206 return "TableStart"
1207 case itemTableEnd:
1208 return "TableEnd"
1209 case itemArrayTableStart:
1210 return "ArrayTableStart"
1211 case itemArrayTableEnd:
1212 return "ArrayTableEnd"
1213 case itemKeyStart:
1214 return "KeyStart"
1215 case itemKeyEnd:
1216 return "KeyEnd"
1217 case itemCommentStart:
1218 return "CommentStart"
1219 case itemInlineTableStart:
1220 return "InlineTableStart"
1221 case itemInlineTableEnd:
1222 return "InlineTableEnd"
1223 }
1224 panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
1225 }
1226 1227 func (item item) String() string {
1228 return fmt.Sprintf("(%s, %s)", item.typ, item.val)
1229 }
// isWhitespace reports whether r is a tab or a space. Newlines do not count.
func isWhitespace(r rune) bool {
	switch r {
	case '\t', ' ':
		return true
	}
	return false
}
// isNL reports whether r is a line feed or a carriage return.
func isNL(r rune) bool {
	switch r {
	case '\n', '\r':
		return true
	}
	return false
}
// isControl reports whether r is a control character (0x00-0x1f or 0x7f),
// with tab, carriage return and line feed excluded.
func isControl(r rune) bool {
	if r == '\t' || r == '\r' || r == '\n' {
		return false
	}
	return r == 0x7f || (0x00 <= r && r <= 0x1f)
}
// isDigit reports whether r is an ASCII decimal digit.
func isDigit(r rune) bool {
	return '0' <= r && r <= '9'
}
// isBinary reports whether r is a binary digit ('0' or '1').
func isBinary(r rune) bool {
	switch r {
	case '0', '1':
		return true
	}
	return false
}
// isOctal reports whether r is an octal digit ('0' through '7').
func isOctal(r rune) bool {
	return '0' <= r && r <= '7'
}
// isHex reports whether r is a hexadecimal digit: 0-9, a-f or A-F.
func isHex(r rune) bool {
	if '0' <= r && r <= '9' {
		return true
	}
	// Setting bit 0x20 folds ASCII upper-case letters onto lower case.
	folded := r | 0x20
	return 'a' <= folded && folded <= 'f'
}
// isBareKeyChar reports whether r may appear in a bare (unquoted) key: an
// ASCII letter, an ASCII digit, '_' or '-'.
func isBareKeyChar(r rune) bool {
	switch {
	case 'A' <= r && r <= 'Z',
		'a' <= r && r <= 'z',
		'0' <= r && r <= '9':
		return true
	}
	return r == '_' || r == '-'
}
1249