lex.go raw

   1  package toml
   2  
   3  import (
   4  	"fmt"
   5  	"reflect"
   6  	"runtime"
   7  	"strings"
   8  	"unicode"
   9  	"unicode/utf8"
  10  )
  11  
// itemType identifies the kind of an item emitted by the lexer.
type itemType int

// The kinds of item the lexer can emit. itemError is the zero value.
const (
	itemError itemType = iota // an error item; see the error helpers on lexer
	itemEOF                   // emitted by lexTop/lexTopEnd at the end of the input
	itemText
	itemString
	itemStringEsc // a basic string in which an escape sequence was seen
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
  39  
// eof is the rune value lexer.next returns once the input is exhausted.
const eof = 0
  41  
// stateFn is one state of the lexer. It is called with the lexer and returns
// the state to run next; the error helpers on lexer return nil.
type stateFn func(lx *lexer) stateFn
  43  
  44  func (p Position) String() string {
  45  	return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
  46  }
  47  
// lexer holds the scanning state: the input, the span of the pending item,
// the current state function and the queue of emitted items.
type lexer struct {
	input string    // full text being lexed
	start int       // byte offset where the pending item starts
	pos   int       // byte offset of the next rune to read
	line  int       // current line number; lex starts it at 1
	state stateFn   // state function that nextItem runs next
	items chan item // buffered queue of emitted items
	esc   bool      // set by lexStringEscape; read and cleared when a string is emitted

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
  71  
// item is one unit of lexer output together with its source position.
type item struct {
	typ itemType // kind of item
	val string   // text of the item; not set by the error helpers
	err error    // set by the error helpers on itemError items
	pos Position // where in the input the item was found
}
  78  
  79  func (lx *lexer) nextItem() item {
  80  	for {
  81  		select {
  82  		case item := <-lx.items:
  83  			return item
  84  		default:
  85  			lx.state = lx.state(lx)
  86  			//fmt.Printf("     STATE %-24s  current: %-10s	stack: %s\n", lx.state, lx.current(), lx.stack)
  87  		}
  88  	}
  89  }
  90  
  91  func lex(input string) *lexer {
  92  	lx := &lexer{
  93  		input: input,
  94  		state: lexTop,
  95  		items: make(chan item, 10),
  96  		stack: make([]stateFn, 0, 10),
  97  		line:  1,
  98  	}
  99  	return lx
 100  }
 101  
// push places state on top of the state stack; pop returns it again.
func (lx *lexer) push(state stateFn) {
	lx.stack = append(lx.stack, state)
}
 105  
 106  func (lx *lexer) pop() stateFn {
 107  	if len(lx.stack) == 0 {
 108  		panic("BUG in lexer: no states to pop")
 109  	}
 110  	last := lx.stack[len(lx.stack)-1]
 111  	lx.stack = lx.stack[0 : len(lx.stack)-1]
 112  	return last
 113  }
 114  
 115  func (lx *lexer) current() string {
 116  	return lx.input[lx.start:lx.pos]
 117  }
 118  
 119  func (lx lexer) getPos() Position {
 120  	p := Position{
 121  		Line:  lx.line,
 122  		Start: lx.start,
 123  		Len:   lx.pos - lx.start,
 124  	}
 125  	if p.Len <= 0 {
 126  		p.Len = 1
 127  	}
 128  	return p
 129  }
 130  
 131  func (lx *lexer) emit(typ itemType) {
 132  	// Needed for multiline strings ending with an incomplete UTF-8 sequence.
 133  	if lx.start > lx.pos {
 134  		lx.error(errLexUTF8{lx.input[lx.pos]})
 135  		return
 136  	}
 137  	lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
 138  	lx.start = lx.pos
 139  }
 140  
 141  func (lx *lexer) emitTrim(typ itemType) {
 142  	lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
 143  	lx.start = lx.pos
 144  }
 145  
// next consumes and returns the next rune of the input.
//
// At the end of the input it sets atEOF and returns eof; calling next again
// without an intervening backup panics. Invalid UTF-8 and disallowed control
// characters (including a '\r' that is not followed by '\n') queue an error
// item and return utf8.RuneError without advancing pos.
func (lx *lexer) next() (r rune) {
	if lx.atEOF {
		panic("BUG in lexer: next called after EOF")
	}
	if lx.pos >= len(lx.input) {
		lx.atEOF = true
		return eof
	}

	// The line counter is bumped when the newline itself is consumed;
	// backup undoes this.
	if lx.input[lx.pos] == '\n' {
		lx.line++
	}
	// Shift the width history by one slot so prevWidths[0] can take the
	// width of the rune read below.
	lx.prevWidths[3] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[0]
	if lx.nprev < 4 {
		lx.nprev++
	}

	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
	// RuneError with width 1 is how DecodeRuneInString reports an invalid
	// encoding.
	if r == utf8.RuneError && w == 1 {
		lx.error(errLexUTF8{lx.input[lx.pos]})
		return utf8.RuneError
	}

	// Note: don't use peek() here, as this calls next().
	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
		lx.errorControlChar(r)
		return utf8.RuneError
	}

	lx.prevWidths[0] = w
	lx.pos += w
	return r
}
 181  
// ignore discards the pending text by moving start up to pos, so that it is
// not part of the next emitted item.
func (lx *lexer) ignore() {
	lx.start = lx.pos
}
 186  
// backup steps back one rune. Can be called 4 times between calls to next.
//
// If the last call to next hit the end of the input, backup only clears atEOF
// (no rune was consumed in that case). Backing up with no recorded rune
// widths left is a lexer bug and panics.
func (lx *lexer) backup() {
	if lx.atEOF {
		lx.atEOF = false
		return
	}
	if lx.nprev < 1 {
		panic("BUG in lexer: backed up too far")
	}
	// Pop the most recent width off the history and shift the rest down.
	w := lx.prevWidths[0]
	lx.prevWidths[0] = lx.prevWidths[1]
	lx.prevWidths[1] = lx.prevWidths[2]
	lx.prevWidths[2] = lx.prevWidths[3]
	lx.nprev--

	lx.pos -= w
	// Undo the line increment next performed when it consumed this newline.
	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
		lx.line--
	}
}
 207  
 208  // accept consumes the next rune if it's equal to `valid`.
 209  func (lx *lexer) accept(valid rune) bool {
 210  	if lx.next() == valid {
 211  		return true
 212  	}
 213  	lx.backup()
 214  	return false
 215  }
 216  
 217  // peek returns but does not consume the next rune in the input.
 218  func (lx *lexer) peek() rune {
 219  	r := lx.next()
 220  	lx.backup()
 221  	return r
 222  }
 223  
 224  // skip ignores all input that matches the given predicate.
 225  func (lx *lexer) skip(pred func(rune) bool) {
 226  	for {
 227  		r := lx.next()
 228  		if pred(r) {
 229  			continue
 230  		}
 231  		lx.backup()
 232  		lx.ignore()
 233  		return
 234  	}
 235  }
 236  
 237  // error stops all lexing by emitting an error and returning `nil`.
 238  //
 239  // Note that any value that is a character is escaped if it's a special
 240  // character (newlines, tabs, etc.).
 241  func (lx *lexer) error(err error) stateFn {
 242  	if lx.atEOF {
 243  		return lx.errorPrevLine(err)
 244  	}
 245  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
 246  	return nil
 247  }
 248  
 249  // errorfPrevline is like error(), but sets the position to the last column of
 250  // the previous line.
 251  //
 252  // This is so that unexpected EOF or NL errors don't show on a new blank line.
 253  func (lx *lexer) errorPrevLine(err error) stateFn {
 254  	pos := lx.getPos()
 255  	pos.Line--
 256  	pos.Len = 1
 257  	pos.Start = lx.pos - 1
 258  	lx.items <- item{typ: itemError, pos: pos, err: err}
 259  	return nil
 260  }
 261  
 262  // errorPos is like error(), but allows explicitly setting the position.
 263  func (lx *lexer) errorPos(start, length int, err error) stateFn {
 264  	pos := lx.getPos()
 265  	pos.Start = start
 266  	pos.Len = length
 267  	lx.items <- item{typ: itemError, pos: pos, err: err}
 268  	return nil
 269  }
 270  
 271  // errorf is like error, and creates a new error.
 272  func (lx *lexer) errorf(format string, values ...any) stateFn {
 273  	if lx.atEOF {
 274  		pos := lx.getPos()
 275  		if lx.pos >= 1 && lx.input[lx.pos-1] == '\n' {
 276  			pos.Line--
 277  		}
 278  		pos.Len = 1
 279  		pos.Start = lx.pos - 1
 280  		lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
 281  		return nil
 282  	}
 283  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
 284  	return nil
 285  }
 286  
 287  func (lx *lexer) errorControlChar(cc rune) stateFn {
 288  	return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
 289  }
 290  
 291  // lexTop consumes elements at the top level of TOML data.
 292  func lexTop(lx *lexer) stateFn {
 293  	r := lx.next()
 294  	if isWhitespace(r) || isNL(r) {
 295  		return lexSkip(lx, lexTop)
 296  	}
 297  	switch r {
 298  	case '#':
 299  		lx.push(lexTop)
 300  		return lexCommentStart
 301  	case '[':
 302  		return lexTableStart
 303  	case eof:
 304  		if lx.pos > lx.start {
 305  			// TODO: never reached? I think this can only occur on a bug in the
 306  			// lexer(?)
 307  			return lx.errorf("unexpected EOF")
 308  		}
 309  		lx.emit(itemEOF)
 310  		return nil
 311  	}
 312  
 313  	// At this point, the only valid item can be a key, so we back up
 314  	// and let the key lexer do the rest.
 315  	lx.backup()
 316  	lx.push(lexTopEnd)
 317  	return lexKeyStart
 318  }
 319  
 320  // lexTopEnd is entered whenever a top-level item has been consumed. (A value
 321  // or a table.) It must see only whitespace, and will turn back to lexTop
 322  // upon a newline. If it sees EOF, it will quit the lexer successfully.
 323  func lexTopEnd(lx *lexer) stateFn {
 324  	r := lx.next()
 325  	switch {
 326  	case r == '#':
 327  		// a comment will read to a newline for us.
 328  		lx.push(lexTop)
 329  		return lexCommentStart
 330  	case isWhitespace(r):
 331  		return lexTopEnd
 332  	case isNL(r):
 333  		lx.ignore()
 334  		return lexTop
 335  	case r == eof:
 336  		lx.emit(itemEOF)
 337  		return nil
 338  	}
 339  	return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r)
 340  }
 341  
 342  // lexTable lexes the beginning of a table. Namely, it makes sure that
 343  // it starts with a character other than '.' and ']'.
 344  // It assumes that '[' has already been consumed.
 345  // It also handles the case that this is an item in an array of tables.
 346  // e.g., '[[name]]'.
 347  func lexTableStart(lx *lexer) stateFn {
 348  	if lx.peek() == '[' {
 349  		lx.next()
 350  		lx.emit(itemArrayTableStart)
 351  		lx.push(lexArrayTableEnd)
 352  	} else {
 353  		lx.emit(itemTableStart)
 354  		lx.push(lexTableEnd)
 355  	}
 356  	return lexTableNameStart
 357  }
 358  
// lexTableEnd emits itemTableEnd and continues with lexTopEnd. It is the
// state lexTableStart pushes for a plain '[name]' header.
func lexTableEnd(lx *lexer) stateFn {
	lx.emit(itemTableEnd)
	return lexTopEnd
}
 363  
 364  func lexArrayTableEnd(lx *lexer) stateFn {
 365  	if r := lx.next(); r != ']' {
 366  		return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
 367  	}
 368  	lx.emit(itemArrayTableEnd)
 369  	return lexTopEnd
 370  }
 371  
 372  func lexTableNameStart(lx *lexer) stateFn {
 373  	lx.skip(isWhitespace)
 374  	switch r := lx.peek(); {
 375  	case r == ']' || r == eof:
 376  		return lx.errorf("unexpected end of table name (table names cannot be empty)")
 377  	case r == '.':
 378  		return lx.errorf("unexpected table separator (table names cannot be empty)")
 379  	case r == '"' || r == '\'':
 380  		lx.ignore()
 381  		lx.push(lexTableNameEnd)
 382  		return lexQuotedName
 383  	default:
 384  		lx.push(lexTableNameEnd)
 385  		return lexBareName
 386  	}
 387  }
 388  
 389  // lexTableNameEnd reads the end of a piece of a table name, optionally
 390  // consuming whitespace.
 391  func lexTableNameEnd(lx *lexer) stateFn {
 392  	lx.skip(isWhitespace)
 393  	switch r := lx.next(); {
 394  	case r == '.':
 395  		lx.ignore()
 396  		return lexTableNameStart
 397  	case r == ']':
 398  		return lx.pop()
 399  	default:
 400  		return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
 401  	}
 402  }
 403  
 404  // lexBareName lexes one part of a key or table.
 405  //
 406  // It assumes that at least one valid character for the table has already been
 407  // read.
 408  //
 409  // Lexes only one part, e.g. only 'a' inside 'a.b'.
 410  func lexBareName(lx *lexer) stateFn {
 411  	r := lx.next()
 412  	if isBareKeyChar(r) {
 413  		return lexBareName
 414  	}
 415  	lx.backup()
 416  	lx.emit(itemText)
 417  	return lx.pop()
 418  }
 419  
 420  // lexQuotedName lexes one part of a quoted key or table name. It assumes that
 421  // it starts lexing at the quote itself (" or ').
 422  //
 423  // Lexes only one part, e.g. only '"a"' inside '"a".b'.
 424  func lexQuotedName(lx *lexer) stateFn {
 425  	r := lx.next()
 426  	switch {
 427  	case r == '"':
 428  		lx.ignore() // ignore the '"'
 429  		return lexString
 430  	case r == '\'':
 431  		lx.ignore() // ignore the "'"
 432  		return lexRawString
 433  
 434  	// TODO: I don't think any of the below conditions can ever be reached?
 435  	case isWhitespace(r):
 436  		return lexSkip(lx, lexValue)
 437  	case r == eof:
 438  		return lx.errorf("unexpected EOF; expected value")
 439  	default:
 440  		return lx.errorf("expected value but found %q instead", r)
 441  	}
 442  }
 443  
 444  // lexKeyStart consumes all key parts until a '='.
 445  func lexKeyStart(lx *lexer) stateFn {
 446  	lx.skip(isWhitespace)
 447  	switch r := lx.peek(); {
 448  	case r == '=' || r == eof:
 449  		return lx.errorf("unexpected '=': key name appears blank")
 450  	case r == '.':
 451  		return lx.errorf("unexpected '.': keys cannot start with a '.'")
 452  	case r == '"' || r == '\'':
 453  		lx.ignore()
 454  		fallthrough
 455  	default: // Bare key
 456  		lx.emit(itemKeyStart)
 457  		return lexKeyNameStart
 458  	}
 459  }
 460  
 461  func lexKeyNameStart(lx *lexer) stateFn {
 462  	lx.skip(isWhitespace)
 463  	switch r := lx.peek(); {
 464  	default:
 465  		lx.push(lexKeyEnd)
 466  		return lexBareName
 467  	case r == '"' || r == '\'':
 468  		lx.ignore()
 469  		lx.push(lexKeyEnd)
 470  		return lexQuotedName
 471  
 472  	// TODO: I think these can never be reached?
 473  	case r == '=' || r == eof:
 474  		return lx.errorf("unexpected '='")
 475  	case r == '.':
 476  		return lx.errorf("unexpected '.'")
 477  	}
 478  }
 479  
 480  // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
 481  // separator).
 482  func lexKeyEnd(lx *lexer) stateFn {
 483  	lx.skip(isWhitespace)
 484  	switch r := lx.next(); {
 485  	case isWhitespace(r):
 486  		return lexSkip(lx, lexKeyEnd)
 487  	case r == eof: // TODO: never reached
 488  		return lx.errorf("unexpected EOF; expected key separator '='")
 489  	case r == '.':
 490  		lx.ignore()
 491  		return lexKeyNameStart
 492  	case r == '=':
 493  		lx.emit(itemKeyEnd)
 494  		return lexSkip(lx, lexValue)
 495  	default:
 496  		if r == '\n' {
 497  			return lx.errorPrevLine(fmt.Errorf("expected '.' or '=', but got %q instead", r))
 498  		}
 499  		return lx.errorf("expected '.' or '=', but got %q instead", r)
 500  	}
 501  }
 502  
 503  // lexValue starts the consumption of a value anywhere a value is expected.
 504  // lexValue will ignore whitespace.
 505  // After a value is lexed, the last state on the next is popped and returned.
 506  func lexValue(lx *lexer) stateFn {
 507  	// We allow whitespace to precede a value, but NOT newlines.
 508  	// In array syntax, the array states are responsible for ignoring newlines.
 509  	r := lx.next()
 510  	switch {
 511  	case isWhitespace(r):
 512  		return lexSkip(lx, lexValue)
 513  	case isDigit(r):
 514  		lx.backup() // avoid an extra state and use the same as above
 515  		return lexNumberOrDateStart
 516  	}
 517  	switch r {
 518  	case '[':
 519  		lx.ignore()
 520  		lx.emit(itemArray)
 521  		return lexArrayValue
 522  	case '{':
 523  		lx.ignore()
 524  		lx.emit(itemInlineTableStart)
 525  		return lexInlineTableValue
 526  	case '"':
 527  		if lx.accept('"') {
 528  			if lx.accept('"') {
 529  				lx.ignore() // Ignore """
 530  				return lexMultilineString
 531  			}
 532  			lx.backup()
 533  		}
 534  		lx.ignore() // ignore the '"'
 535  		return lexString
 536  	case '\'':
 537  		if lx.accept('\'') {
 538  			if lx.accept('\'') {
 539  				lx.ignore() // Ignore """
 540  				return lexMultilineRawString
 541  			}
 542  			lx.backup()
 543  		}
 544  		lx.ignore() // ignore the "'"
 545  		return lexRawString
 546  	case '.': // special error case, be kind to users
 547  		return lx.errorf("floats must start with a digit, not '.'")
 548  	case 'i', 'n':
 549  		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
 550  			lx.emit(itemFloat)
 551  			return lx.pop()
 552  		}
 553  	case '-', '+':
 554  		return lexDecimalNumberStart
 555  	}
 556  	if unicode.IsLetter(r) {
 557  		// Be permissive here; lexBool will give a nice error if the
 558  		// user wrote something like
 559  		//   x = foo
 560  		// (i.e. not 'true' or 'false' but is something else word-like.)
 561  		lx.backup()
 562  		return lexBool
 563  	}
 564  	if r == eof {
 565  		return lx.errorf("unexpected EOF; expected value")
 566  	}
 567  	if r == '\n' {
 568  		return lx.errorPrevLine(fmt.Errorf("expected value but found %q instead", r))
 569  	}
 570  	return lx.errorf("expected value but found %q instead", r)
 571  }
 572  
 573  // lexArrayValue consumes one value in an array. It assumes that '[' or ','
 574  // have already been consumed. All whitespace and newlines are ignored.
 575  func lexArrayValue(lx *lexer) stateFn {
 576  	r := lx.next()
 577  	switch {
 578  	case isWhitespace(r) || isNL(r):
 579  		return lexSkip(lx, lexArrayValue)
 580  	case r == '#':
 581  		lx.push(lexArrayValue)
 582  		return lexCommentStart
 583  	case r == ',':
 584  		return lx.errorf("unexpected comma")
 585  	case r == ']':
 586  		return lexArrayEnd
 587  	}
 588  
 589  	lx.backup()
 590  	lx.push(lexArrayValueEnd)
 591  	return lexValue
 592  }
 593  
 594  // lexArrayValueEnd consumes everything between the end of an array value and
 595  // the next value (or the end of the array): it ignores whitespace and newlines
 596  // and expects either a ',' or a ']'.
 597  func lexArrayValueEnd(lx *lexer) stateFn {
 598  	switch r := lx.next(); {
 599  	case isWhitespace(r) || isNL(r):
 600  		return lexSkip(lx, lexArrayValueEnd)
 601  	case r == '#':
 602  		lx.push(lexArrayValueEnd)
 603  		return lexCommentStart
 604  	case r == ',':
 605  		lx.ignore()
 606  		return lexArrayValue // move on to the next value
 607  	case r == ']':
 608  		return lexArrayEnd
 609  	default:
 610  		return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
 611  	}
 612  }
 613  
// lexArrayEnd finishes the lexing of an array.
// It assumes that a ']' has just been consumed. The pending text is dropped
// first, so itemArrayEnd is emitted with an empty value, and the state pushed
// before the array was lexed is resumed.
func lexArrayEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemArrayEnd)
	return lx.pop()
}
 621  
 622  // lexInlineTableValue consumes one key/value pair in an inline table.
 623  // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
 624  func lexInlineTableValue(lx *lexer) stateFn {
 625  	r := lx.next()
 626  	switch {
 627  	case isWhitespace(r):
 628  		return lexSkip(lx, lexInlineTableValue)
 629  	case isNL(r):
 630  		return lexSkip(lx, lexInlineTableValue)
 631  	case r == '#':
 632  		lx.push(lexInlineTableValue)
 633  		return lexCommentStart
 634  	case r == ',':
 635  		return lx.errorf("unexpected comma")
 636  	case r == '}':
 637  		return lexInlineTableEnd
 638  	}
 639  	lx.backup()
 640  	lx.push(lexInlineTableValueEnd)
 641  	return lexKeyStart
 642  }
 643  
 644  // lexInlineTableValueEnd consumes everything between the end of an inline table
 645  // key/value pair and the next pair (or the end of the table):
 646  // it ignores whitespace and expects either a ',' or a '}'.
 647  func lexInlineTableValueEnd(lx *lexer) stateFn {
 648  	switch r := lx.next(); {
 649  	case isWhitespace(r):
 650  		return lexSkip(lx, lexInlineTableValueEnd)
 651  	case isNL(r):
 652  		return lexSkip(lx, lexInlineTableValueEnd)
 653  	case r == '#':
 654  		lx.push(lexInlineTableValueEnd)
 655  		return lexCommentStart
 656  	case r == ',':
 657  		lx.ignore()
 658  		lx.skip(isWhitespace)
 659  		if lx.peek() == '}' {
 660  			return lexInlineTableValueEnd
 661  		}
 662  		return lexInlineTableValue
 663  	case r == '}':
 664  		return lexInlineTableEnd
 665  	default:
 666  		return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
 667  	}
 668  }
 669  
 670  func runeOrEOF(r rune) string {
 671  	if r == eof {
 672  		return "end of file"
 673  	}
 674  	return "'" + string(r) + "'"
 675  }
 676  
// lexInlineTableEnd finishes the lexing of an inline table.
// It assumes that a '}' has just been consumed. The pending text is dropped
// first, so itemInlineTableEnd is emitted with an empty value, and the state
// pushed before the table was lexed is resumed.
func lexInlineTableEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemInlineTableEnd)
	return lx.pop()
}
 684  
 685  // lexString consumes the inner contents of a string. It assumes that the
 686  // beginning '"' has already been consumed and ignored.
 687  func lexString(lx *lexer) stateFn {
 688  	r := lx.next()
 689  	switch {
 690  	case r == eof:
 691  		return lx.errorf(`unexpected EOF; expected '"'`)
 692  	case isNL(r):
 693  		return lx.errorPrevLine(errLexStringNL{})
 694  	case r == '\\':
 695  		lx.push(lexString)
 696  		return lexStringEscape
 697  	case r == '"':
 698  		lx.backup()
 699  		if lx.esc {
 700  			lx.esc = false
 701  			lx.emit(itemStringEsc)
 702  		} else {
 703  			lx.emit(itemString)
 704  		}
 705  		lx.next()
 706  		lx.ignore()
 707  		return lx.pop()
 708  	}
 709  	return lexString
 710  }
 711  
// lexMultilineString consumes the inner contents of a multiline basic string,
// one step per call. It assumes that the beginning '"""' has already been
// consumed and ignored.
//
// On the closing '"""' the contents are emitted as itemMultilineString, the
// delimiter is discarded and the pushed state is resumed. Up to two extra
// quotes directly before the closing delimiter are kept as string content.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineString
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\\':
		return lexMultilineStringEscape
	case '"':
		// Found " → try to read two more "".
		if lx.accept('"') {
			if lx.accept('"') {
				// Peek ahead: the string can contain " and "", including at the
				// end: """str"""""
				// 6 or more at the end, however, is an error.
				if lx.peek() == '"' {
					// Check if we already lexed 5 "s; if so we have 6 now, and
					// that's just too many man!
					//
					// Second check is for the edge case:
					//
					//            two quotes allowed.
					//            vv
					//   """lol \""""""
					//          ^^  ^^^---- closing three
					//     escaped
					//
					// Bit ugly, but it works
					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					// Keep only the first of the three quotes as content and
					// retry from the second one.
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() // backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				lx.esc = false
				lx.emit(itemMultilineString)
				lx.next() // Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			// Only two quotes: put the second one back; the first stays as
			// content.
			lx.backup()
		}
		return lexMultilineString
	}
}
 767  
 768  // lexRawString consumes a raw string. Nothing can be escaped in such a string.
 769  // It assumes that the beginning "'" has already been consumed and ignored.
 770  func lexRawString(lx *lexer) stateFn {
 771  	r := lx.next()
 772  	switch {
 773  	default:
 774  		return lexRawString
 775  	case r == eof:
 776  		return lx.errorf(`unexpected EOF; expected "'"`)
 777  	case isNL(r):
 778  		return lx.errorPrevLine(errLexStringNL{})
 779  	case r == '\'':
 780  		lx.backup()
 781  		lx.emit(itemRawString)
 782  		lx.next()
 783  		lx.ignore()
 784  		return lx.pop()
 785  	}
 786  }
 787  
// lexMultilineRawString consumes a multiline raw string, one step per call.
// Nothing can be escaped in such a string. It assumes that the beginning
// triple-' has already been consumed and ignored.
//
// On the closing ''' the contents are emitted as itemRawMultilineString, the
// delimiter is discarded and the pushed state is resumed. Up to two extra
// quotes directly before the closing delimiter are kept as string content.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineRawString
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\'':
		// Found ' → try to read two more ''.
		if lx.accept('\'') {
			if lx.accept('\'') {
				// Peek ahead: the string can contain ' and '', including at the
				// end: '''str'''''
				// 6 or more at the end, however, is an error.
				if lx.peek() == '\'' {
					// Check if we already lexed 5 's; if so we have 6 now, and
					// that's just too many man!
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					// Keep only the first of the three quotes as content and
					// retry from the second one.
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() // backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() // Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			// Only two quotes: put the second one back; the first stays as
			// content.
			lx.backup()
		}
		return lexMultilineRawString
	}
}
 831  
 832  // lexMultilineStringEscape consumes an escaped character. It assumes that the
 833  // preceding '\\' has already been consumed.
 834  func lexMultilineStringEscape(lx *lexer) stateFn {
 835  	if isNL(lx.next()) { /// \ escaping newline.
 836  		return lexMultilineString
 837  	}
 838  	lx.backup()
 839  	lx.push(lexMultilineString)
 840  	return lexStringEscape(lx)
 841  }
 842  
 843  func lexStringEscape(lx *lexer) stateFn {
 844  	lx.esc = true
 845  	r := lx.next()
 846  	switch r {
 847  	case 'e':
 848  		fallthrough
 849  	case 'b':
 850  		fallthrough
 851  	case 't':
 852  		fallthrough
 853  	case 'n':
 854  		fallthrough
 855  	case 'f':
 856  		fallthrough
 857  	case 'r':
 858  		fallthrough
 859  	case '"':
 860  		fallthrough
 861  	case ' ', '\t':
 862  		// Inside """ .. """ strings you can use \ to escape newlines, and any
 863  		// amount of whitespace can be between the \ and \n.
 864  		fallthrough
 865  	case '\\':
 866  		return lx.pop()
 867  	case 'x':
 868  		return lexHexEscape
 869  	case 'u':
 870  		return lexShortUnicodeEscape
 871  	case 'U':
 872  		return lexLongUnicodeEscape
 873  	}
 874  	return lx.error(errLexEscape{r})
 875  }
 876  
 877  func lexHexEscape(lx *lexer) stateFn {
 878  	var r rune
 879  	for i := 0; i < 2; i++ {
 880  		r = lx.next()
 881  		if !isHex(r) {
 882  			return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current())
 883  		}
 884  	}
 885  	return lx.pop()
 886  }
 887  
 888  func lexShortUnicodeEscape(lx *lexer) stateFn {
 889  	var r rune
 890  	for i := 0; i < 4; i++ {
 891  		r = lx.next()
 892  		if !isHex(r) {
 893  			return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current())
 894  		}
 895  	}
 896  	return lx.pop()
 897  }
 898  
 899  func lexLongUnicodeEscape(lx *lexer) stateFn {
 900  	var r rune
 901  	for i := 0; i < 8; i++ {
 902  		r = lx.next()
 903  		if !isHex(r) {
 904  			return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current())
 905  		}
 906  	}
 907  	return lx.pop()
 908  }
 909  
 910  // lexNumberOrDateStart processes the first character of a value which begins
 911  // with a digit. It exists to catch values starting with '0', so that
 912  // lexBaseNumberOrDate can differentiate base prefixed integers from other
 913  // types.
 914  func lexNumberOrDateStart(lx *lexer) stateFn {
 915  	if lx.next() == '0' {
 916  		return lexBaseNumberOrDate
 917  	}
 918  	return lexNumberOrDate
 919  }
 920  
 921  // lexNumberOrDate consumes either an integer, float or datetime.
 922  func lexNumberOrDate(lx *lexer) stateFn {
 923  	r := lx.next()
 924  	if isDigit(r) {
 925  		return lexNumberOrDate
 926  	}
 927  	switch r {
 928  	case '-', ':':
 929  		return lexDatetime
 930  	case '_':
 931  		return lexDecimalNumber
 932  	case '.', 'e', 'E':
 933  		return lexFloat
 934  	}
 935  
 936  	lx.backup()
 937  	lx.emit(itemInteger)
 938  	return lx.pop()
 939  }
 940  
 941  // lexDatetime consumes a Datetime, to a first approximation.
 942  // The parser validates that it matches one of the accepted formats.
 943  func lexDatetime(lx *lexer) stateFn {
 944  	r := lx.next()
 945  	if isDigit(r) {
 946  		return lexDatetime
 947  	}
 948  	switch r {
 949  	case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
 950  		return lexDatetime
 951  	}
 952  
 953  	lx.backup()
 954  	lx.emitTrim(itemDatetime)
 955  	return lx.pop()
 956  }
 957  
 958  // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
 959  func lexHexInteger(lx *lexer) stateFn {
 960  	r := lx.next()
 961  	if isHex(r) {
 962  		return lexHexInteger
 963  	}
 964  	switch r {
 965  	case '_':
 966  		return lexHexInteger
 967  	}
 968  
 969  	lx.backup()
 970  	lx.emit(itemInteger)
 971  	return lx.pop()
 972  }
 973  
 974  // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
 975  func lexOctalInteger(lx *lexer) stateFn {
 976  	r := lx.next()
 977  	if isOctal(r) {
 978  		return lexOctalInteger
 979  	}
 980  	switch r {
 981  	case '_':
 982  		return lexOctalInteger
 983  	}
 984  
 985  	lx.backup()
 986  	lx.emit(itemInteger)
 987  	return lx.pop()
 988  }
 989  
 990  // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
 991  func lexBinaryInteger(lx *lexer) stateFn {
 992  	r := lx.next()
 993  	if isBinary(r) {
 994  		return lexBinaryInteger
 995  	}
 996  	switch r {
 997  	case '_':
 998  		return lexBinaryInteger
 999  	}
1000  
1001  	lx.backup()
1002  	lx.emit(itemInteger)
1003  	return lx.pop()
1004  }
1005  
1006  // lexDecimalNumber consumes a decimal float or integer.
1007  func lexDecimalNumber(lx *lexer) stateFn {
1008  	r := lx.next()
1009  	if isDigit(r) {
1010  		return lexDecimalNumber
1011  	}
1012  	switch r {
1013  	case '.', 'e', 'E':
1014  		return lexFloat
1015  	case '_':
1016  		return lexDecimalNumber
1017  	}
1018  
1019  	lx.backup()
1020  	lx.emit(itemInteger)
1021  	return lx.pop()
1022  }
1023  
1024  // lexDecimalNumber consumes the first digit of a number beginning with a sign.
1025  // It assumes the sign has already been consumed. Values which start with a sign
1026  // are only allowed to be decimal integers or floats.
1027  //
1028  // The special "nan" and "inf" values are also recognized.
1029  func lexDecimalNumberStart(lx *lexer) stateFn {
1030  	r := lx.next()
1031  
1032  	// Special error cases to give users better error messages
1033  	switch r {
1034  	case 'i':
1035  		if !lx.accept('n') || !lx.accept('f') {
1036  			return lx.errorf("invalid float: '%s'", lx.current())
1037  		}
1038  		lx.emit(itemFloat)
1039  		return lx.pop()
1040  	case 'n':
1041  		if !lx.accept('a') || !lx.accept('n') {
1042  			return lx.errorf("invalid float: '%s'", lx.current())
1043  		}
1044  		lx.emit(itemFloat)
1045  		return lx.pop()
1046  	case '0':
1047  		p := lx.peek()
1048  		switch p {
1049  		case 'b', 'o', 'x':
1050  			return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
1051  		}
1052  	case '.':
1053  		return lx.errorf("floats must start with a digit, not '.'")
1054  	}
1055  
1056  	if isDigit(r) {
1057  		return lexDecimalNumber
1058  	}
1059  
1060  	return lx.errorf("expected a digit but got %q", r)
1061  }
1062  
1063  // lexBaseNumberOrDate differentiates between the possible values which
1064  // start with '0'. It assumes that before reaching this state, the initial '0'
1065  // has been consumed.
1066  func lexBaseNumberOrDate(lx *lexer) stateFn {
1067  	r := lx.next()
1068  	// Note: All datetimes start with at least two digits, so we don't
1069  	// handle date characters (':', '-', etc.) here.
1070  	if isDigit(r) {
1071  		return lexNumberOrDate
1072  	}
1073  	switch r {
1074  	case '_':
1075  		// Can only be decimal, because there can't be an underscore
1076  		// between the '0' and the base designator, and dates can't
1077  		// contain underscores.
1078  		return lexDecimalNumber
1079  	case '.', 'e', 'E':
1080  		return lexFloat
1081  	case 'b':
1082  		r = lx.peek()
1083  		if !isBinary(r) {
1084  			lx.errorf("not a binary number: '%s%c'", lx.current(), r)
1085  		}
1086  		return lexBinaryInteger
1087  	case 'o':
1088  		r = lx.peek()
1089  		if !isOctal(r) {
1090  			lx.errorf("not an octal number: '%s%c'", lx.current(), r)
1091  		}
1092  		return lexOctalInteger
1093  	case 'x':
1094  		r = lx.peek()
1095  		if !isHex(r) {
1096  			lx.errorf("not a hexadecimal number: '%s%c'", lx.current(), r)
1097  		}
1098  		return lexHexInteger
1099  	}
1100  
1101  	lx.backup()
1102  	lx.emit(itemInteger)
1103  	return lx.pop()
1104  }
1105  
1106  // lexFloat consumes the elements of a float. It allows any sequence of
1107  // float-like characters, so floats emitted by the lexer are only a first
1108  // approximation and must be validated by the parser.
1109  func lexFloat(lx *lexer) stateFn {
1110  	r := lx.next()
1111  	if isDigit(r) {
1112  		return lexFloat
1113  	}
1114  	switch r {
1115  	case '_', '.', '-', '+', 'e', 'E':
1116  		return lexFloat
1117  	}
1118  
1119  	lx.backup()
1120  	lx.emit(itemFloat)
1121  	return lx.pop()
1122  }
1123  
1124  // lexBool consumes a bool string: 'true' or 'false.
1125  func lexBool(lx *lexer) stateFn {
1126  	var rs []rune
1127  	for {
1128  		r := lx.next()
1129  		if !unicode.IsLetter(r) {
1130  			lx.backup()
1131  			break
1132  		}
1133  		rs = append(rs, r)
1134  	}
1135  	s := string(rs)
1136  	switch s {
1137  	case "true", "false":
1138  		lx.emit(itemBool)
1139  		return lx.pop()
1140  	}
1141  	return lx.errorf("expected value but found %q instead", s)
1142  }
1143  
1144  // lexCommentStart begins the lexing of a comment. It will emit
1145  // itemCommentStart and consume no characters, passing control to lexComment.
1146  func lexCommentStart(lx *lexer) stateFn {
1147  	lx.ignore()
1148  	lx.emit(itemCommentStart)
1149  	return lexComment
1150  }
1151  
1152  // lexComment lexes an entire comment. It assumes that '#' has been consumed.
1153  // It will consume *up to* the first newline character, and pass control
1154  // back to the last state on the stack.
1155  func lexComment(lx *lexer) stateFn {
1156  	switch r := lx.next(); {
1157  	case isNL(r) || r == eof:
1158  		lx.backup()
1159  		lx.emit(itemText)
1160  		return lx.pop()
1161  	default:
1162  		return lexComment
1163  	}
1164  }
1165  
1166  // lexSkip ignores all slurped input and moves on to the next state.
1167  func lexSkip(lx *lexer, nextState stateFn) stateFn {
1168  	lx.ignore()
1169  	return nextState
1170  }
1171  
1172  func (s stateFn) String() string {
1173  	if s == nil {
1174  		return "<nil>"
1175  	}
1176  	name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
1177  	if i := strings.LastIndexByte(name, '.'); i > -1 {
1178  		name = name[i+1:]
1179  	}
1180  	return name + "()"
1181  }
1182  
1183  func (itype itemType) String() string {
1184  	switch itype {
1185  	case itemError:
1186  		return "Error"
1187  	case itemEOF:
1188  		return "EOF"
1189  	case itemText:
1190  		return "Text"
1191  	case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString:
1192  		return "String"
1193  	case itemBool:
1194  		return "Bool"
1195  	case itemInteger:
1196  		return "Integer"
1197  	case itemFloat:
1198  		return "Float"
1199  	case itemDatetime:
1200  		return "DateTime"
1201  	case itemArray:
1202  		return "Array"
1203  	case itemArrayEnd:
1204  		return "ArrayEnd"
1205  	case itemTableStart:
1206  		return "TableStart"
1207  	case itemTableEnd:
1208  		return "TableEnd"
1209  	case itemArrayTableStart:
1210  		return "ArrayTableStart"
1211  	case itemArrayTableEnd:
1212  		return "ArrayTableEnd"
1213  	case itemKeyStart:
1214  		return "KeyStart"
1215  	case itemKeyEnd:
1216  		return "KeyEnd"
1217  	case itemCommentStart:
1218  		return "CommentStart"
1219  	case itemInlineTableStart:
1220  		return "InlineTableStart"
1221  	case itemInlineTableEnd:
1222  		return "InlineTableEnd"
1223  	}
1224  	panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
1225  }
1226  
1227  func (item item) String() string {
1228  	return fmt.Sprintf("(%s, %s)", item.typ, item.val)
1229  }
1230  
// isWhitespace reports whether r is a space or a horizontal tab.
func isWhitespace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
// isNL reports whether r is a line feed or a carriage return.
func isNL(r rune) bool {
	switch r {
	case '\n', '\r':
		return true
	}
	return false
}
// isControl reports whether r is a control character (U+0000 through U+001F,
// or U+007F), not counting tab, carriage return and line feed.
func isControl(r rune) bool {
	if r == '\t' || r == '\r' || r == '\n' {
		return false
	}
	return r == 0x7f || (r >= 0x00 && r <= 0x1f)
}
// isDigit reports whether r is an ASCII decimal digit.
func isDigit(r rune) bool {
	return '0' <= r && r <= '9'
}
// isBinary reports whether r is a binary digit ('0' or '1').
func isBinary(r rune) bool {
	switch r {
	case '0', '1':
		return true
	}
	return false
}
// isOctal reports whether r is an octal digit ('0' through '7').
func isOctal(r rune) bool {
	return '0' <= r && r <= '7'
}
// isHex reports whether r is an ASCII hexadecimal digit, in either case.
func isHex(r rune) bool {
	switch {
	case '0' <= r && r <= '9':
		return true
	case 'a' <= r && r <= 'f':
		return true
	case 'A' <= r && r <= 'F':
		return true
	}
	return false
}
// isBareKeyChar reports whether r may appear in a bare (unquoted) key: an
// ASCII letter, an ASCII digit, an underscore or a hyphen.
func isBareKeyChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
		return true
	}
	return r == '_' || r == '-'
}
1249