scanner.mx raw

   1  // Copyright 2009 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package scanner implements a scanner for Go source text.
   6  // It takes a []byte as source which can then be tokenized
   7  // through repeated calls to the Scan method.
   8  package scanner
   9  
  10  import (
  11  	"bytes"
  12  	"fmt"
  13  	"go/token"
  14  	"path/filepath"
  15  	"strconv"
  16  	"unicode"
  17  	"unicode/utf8"
  18  )
  19  
// An ErrorHandler may be provided to [Scanner.Init]. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// The handler is invoked synchronously from the scanner's internal error
// method. The scanner's ErrorCount is incremented for every reported
// error, whether or not a handler is installed.
type ErrorHandler func(pos token.Position, msg string)
  25  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via [Scanner.Init] before use.
//
// Apart from ErrorCount, all fields are unexported and are maintained
// by Init and the scanning methods.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune      // current character; eof (-1) once the source is exhausted
	offset     int       // byte offset of ch within src
	rdOffset   int       // reading offset (position after current character)
	lineOffset int       // offset at which the current line starts
	insertSemi bool      // insert a semicolon before next newline
	nlPos      token.Pos // position of newline in preceding comment

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
  48  
// Special rune values used by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; the value of Scanner.ch once the source is exhausted
)
  53  
  54  // Read the next Unicode char into s.ch.
  55  // s.ch < 0 means end-of-file.
  56  //
  57  // For optimization, there is some overlap between this method and
  58  // s.scanIdentifier.
  59  func (s *Scanner) next() {
  60  	if s.rdOffset < len(s.src) {
  61  		s.offset = s.rdOffset
  62  		if s.ch == '\n' {
  63  			s.lineOffset = s.offset
  64  			s.file.AddLine(s.offset)
  65  		}
  66  		r, w := rune(s.src[s.rdOffset]), 1
  67  		switch {
  68  		case r == 0:
  69  			s.error(s.offset, "illegal character NUL")
  70  		case r >= utf8.RuneSelf:
  71  			// not ASCII
  72  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  73  			if r == utf8.RuneError && w == 1 {
  74  				in := s.src[s.rdOffset:]
  75  				if s.offset == 0 &&
  76  					len(in) >= 2 &&
  77  					(in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
  78  					// U+FEFF BOM at start of file, encoded as big- or little-endian
  79  					// UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
  80  					s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
  81  					s.rdOffset += len(in) // consume all input to avoid error cascade
  82  				} else {
  83  					s.error(s.offset, "illegal UTF-8 encoding")
  84  				}
  85  			} else if r == bom && s.offset > 0 {
  86  				s.error(s.offset, "illegal byte order mark")
  87  			}
  88  		}
  89  		s.rdOffset += w
  90  		s.ch = r
  91  	} else {
  92  		s.offset = len(s.src)
  93  		if s.ch == '\n' {
  94  			s.lineOffset = s.offset
  95  			s.file.AddLine(s.offset)
  96  		}
  97  		s.ch = eof
  98  	}
  99  }
 100  
 101  // peek returns the byte following the most recently read character without
 102  // advancing the scanner. If the scanner is at EOF, peek returns 0.
 103  func (s *Scanner) peek() byte {
 104  	if s.rdOffset < len(s.src) {
 105  		return s.src[s.rdOffset]
 106  	}
 107  	return 0
 108  }
 109  
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

// Flags that may be combined in a [Mode]. Each flag occupies its own bit.
// dontInsertSemis is unexported and therefore only usable from within
// this package.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
 118  
 119  // Init prepares the scanner s to tokenize the text src by setting the
 120  // scanner at the beginning of src. The scanner uses the file set file
 121  // for position information and it adds line information for each line.
 122  // It is ok to re-use the same file when re-scanning the same file as
 123  // line information which is already present is ignored. Init causes a
 124  // panic if the file size does not match the src size.
 125  //
 126  // Calls to [Scanner.Scan] will invoke the error handler err if they encounter a
 127  // syntax error and err is not nil. Also, for each error encountered,
 128  // the [Scanner] field ErrorCount is incremented by one. The mode parameter
 129  // determines how comments are handled.
 130  //
 131  // Note that Init may call err if there is an error in the first character
 132  // of the file.
 133  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
 134  	// Explicitly initialize all fields since a scanner may be reused.
 135  	if file.Size() != len(src) {
 136  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
 137  	}
 138  	s.file = file
 139  	s.dir, _ = filepath.Split(file.Name())
 140  	s.src = src
 141  	s.err = err
 142  	s.mode = mode
 143  
 144  	s.ch = ' '
 145  	s.offset = 0
 146  	s.rdOffset = 0
 147  	s.lineOffset = 0
 148  	s.insertSemi = false
 149  	s.ErrorCount = 0
 150  
 151  	s.next()
 152  	if s.ch == bom {
 153  		s.next() // ignore BOM at file beginning
 154  	}
 155  }
 156  
 157  func (s *Scanner) error(offs int, msg string) {
 158  	if s.err != nil {
 159  		s.err(s.file.Position(s.file.Pos(offs)), msg)
 160  	}
 161  	s.ErrorCount++
 162  }
 163  
 164  func (s *Scanner) errorf(offs int, format string, args ...any) {
 165  	s.error(offs, fmt.Sprintf(format, args...))
 166  }
 167  
 168  // scanComment returns the text of the comment and (if nonzero)
 169  // the offset of the first newline within it, which implies a
 170  // /*...*/ comment.
 171  func (s *Scanner) scanComment() (string, int) {
 172  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
 173  	offs := s.offset - 1 // position of initial '/'
 174  	next := -1           // position immediately following the comment; < 0 means invalid comment
 175  	numCR := 0
 176  	nlOffset := 0 // offset of first newline within /*...*/ comment
 177  
 178  	if s.ch == '/' {
 179  		//-style comment
 180  		// (the final '\n' is not considered part of the comment)
 181  		s.next()
 182  		for s.ch != '\n' && s.ch >= 0 {
 183  			if s.ch == '\r' {
 184  				numCR++
 185  			}
 186  			s.next()
 187  		}
 188  		// if we are at '\n', the position following the comment is afterwards
 189  		next = s.offset
 190  		if s.ch == '\n' {
 191  			next++
 192  		}
 193  		goto exit
 194  	}
 195  
 196  	/*-style comment */
 197  	s.next()
 198  	for s.ch >= 0 {
 199  		ch := s.ch
 200  		if ch == '\r' {
 201  			numCR++
 202  		} else if ch == '\n' && nlOffset == 0 {
 203  			nlOffset = s.offset
 204  		}
 205  		s.next()
 206  		if ch == '*' && s.ch == '/' {
 207  			s.next()
 208  			next = s.offset
 209  			goto exit
 210  		}
 211  	}
 212  
 213  	s.error(offs, "comment not terminated")
 214  
 215  exit:
 216  	lit := s.src[offs:s.offset]
 217  
 218  	// On Windows, a (//-comment) line may end in "\r\n".
 219  	// Remove the final '\r' before analyzing the text for
 220  	// line directives (matching the compiler). Remove any
 221  	// other '\r' afterwards (matching the pre-existing be-
 222  	// havior of the scanner).
 223  	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
 224  		lit = lit[:len(lit)-1]
 225  		numCR--
 226  	}
 227  
 228  	// interpret line directives
 229  	// (//line directives must start at the beginning of the current line)
 230  	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
 231  		s.updateLineInfo(next, offs, lit)
 232  	}
 233  
 234  	if numCR > 0 {
 235  		lit = stripCR(lit, lit[1] == '*')
 236  	}
 237  
 238  	return string(lit), nlOffset
 239  }
 240  
// prefix is the text that must directly follow the two-byte comment
// opener for scanComment to treat a comment as a line directive.
var prefix = []byte("line ")
 242  
 243  // updateLineInfo parses the incoming comment text at offset offs
 244  // as a line directive. If successful, it updates the line info table
 245  // for the position next per the line directive.
 246  func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
 247  	// extract comment text
 248  	if text[1] == '*' {
 249  		text = text[:len(text)-2] // lop off trailing "*/"
 250  	}
 251  	text = text[7:] // lop off leading "//line " or "/*line "
 252  	offs += 7
 253  
 254  	i, n, ok := trailingDigits(text)
 255  	if i == 0 {
 256  		return // ignore (not a line directive)
 257  	}
 258  	// i > 0
 259  
 260  	if !ok {
 261  		// text has a suffix :xxx but xxx is not a number
 262  		s.error(offs+i, "invalid line number: "+string(text[i:]))
 263  		return
 264  	}
 265  
 266  	// Put a cap on the maximum size of line and column numbers.
 267  	// 30 bits allows for some additional space before wrapping an int32.
 268  	// Keep this consistent with cmd/compile/internal/syntax.PosMax.
 269  	const maxLineCol = 1 << 30
 270  	var line, col int
 271  	i2, n2, ok2 := trailingDigits(text[:i-1])
 272  	if ok2 {
 273  		//line filename:line:col
 274  		i, i2 = i2, i
 275  		line, col = n2, n
 276  		if col == 0 || col > maxLineCol {
 277  			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
 278  			return
 279  		}
 280  		text = text[:i2-1] // lop off ":col"
 281  	} else {
 282  		//line filename:line
 283  		line = n
 284  	}
 285  
 286  	if line == 0 || line > maxLineCol {
 287  		s.error(offs+i, "invalid line number: "+string(text[i:]))
 288  		return
 289  	}
 290  
 291  	// If we have a column (//line filename:line:col form),
 292  	// an empty filename means to use the previous filename.
 293  	filename := string(text[:i-1]) // lop off ":line", and trim white space
 294  	if filename == "" && ok2 {
 295  		filename = s.file.Position(s.file.Pos(offs)).Filename
 296  	} else if filename != "" {
 297  		// Put a relative filename in the current directory.
 298  		// This is for compatibility with earlier releases.
 299  		// See issue 26671.
 300  		filename = filepath.Clean(filename)
 301  		if !filepath.IsAbs(filename) {
 302  			filename = filepath.Join(s.dir, filename)
 303  		}
 304  	}
 305  
 306  	s.file.AddLineColumnInfo(next, filename, line, col)
 307  }
 308  
 309  func trailingDigits(text []byte) (int, int, bool) {
 310  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
 311  	if i < 0 {
 312  		return 0, 0, false // no ":"
 313  	}
 314  	// i >= 0
 315  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
 316  	return i + 1, int(n), err == nil
 317  }
 318  
 319  func isLetter(ch rune) bool {
 320  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
 321  }
 322  
 323  func isDigit(ch rune) bool {
 324  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
 325  }
 326  
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// It returns the identifier text, i.e. the bytes of s.src from the offset
// at entry up to the offset of the first character that does not belong
// to the identifier. On return s.ch holds that character, or eof if the
// identifier runs to the end of the source.
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b ends the ASCII run; move the read offset up to it.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is either NUL or the first byte of a non-ASCII sequence.
		//
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The range loop ran to completion: the identifier extends to the end
	// of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
 376  
 377  func digitVal(ch rune) int {
 378  	switch {
 379  	case '0' <= ch && ch <= '9':
 380  		return int(ch - '0')
 381  	case 'a' <= lower(ch) && lower(ch) <= 'f':
 382  		return int(lower(ch) - 'a' + 10)
 383  	}
 384  	return 16 // larger than any legal digit val
 385  }
 386  
 387  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
 388  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
 389  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
 390  
 391  // digits accepts the sequence { digit | '_' }.
 392  // If base <= 10, digits accepts any decimal digit but records
 393  // the offset (relative to the source start) of a digit >= base
 394  // in *invalid, if *invalid < 0.
 395  // digits returns a bitset describing whether the sequence contained
 396  // digits (bit 0 is set), or separators '_' (bit 1 is set).
 397  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
 398  	if base <= 10 {
 399  		max := rune('0' + base)
 400  		for isDecimal(s.ch) || s.ch == '_' {
 401  			ds := 1
 402  			if s.ch == '_' {
 403  				ds = 2
 404  			} else if s.ch >= max && *invalid < 0 {
 405  				*invalid = s.offset // record invalid rune offset
 406  			}
 407  			digsep |= ds
 408  			s.next()
 409  		}
 410  	} else {
 411  		for isHex(s.ch) || s.ch == '_' {
 412  			ds := 1
 413  			if s.ch == '_' {
 414  				ds = 2
 415  			}
 416  			digsep |= ds
 417  			s.next()
 418  		}
 419  	}
 420  	return
 421  }
 422  
 423  func (s *Scanner) scanNumber() (token.Token, string) {
 424  	offs := s.offset
 425  	tok := token.ILLEGAL
 426  
 427  	base := 10        // number base
 428  	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
 429  	digsep := 0       // bit 0: digit present, bit 1: '_' present
 430  	invalid := -1     // index of invalid digit in literal, or < 0
 431  
 432  	// integer part
 433  	if s.ch != '.' {
 434  		tok = token.INT
 435  		if s.ch == '0' {
 436  			s.next()
 437  			switch lower(s.ch) {
 438  			case 'x':
 439  				s.next()
 440  				base, prefix = 16, 'x'
 441  			case 'o':
 442  				s.next()
 443  				base, prefix = 8, 'o'
 444  			case 'b':
 445  				s.next()
 446  				base, prefix = 2, 'b'
 447  			default:
 448  				base, prefix = 8, '0'
 449  				digsep = 1 // leading 0
 450  			}
 451  		}
 452  		digsep |= s.digits(base, &invalid)
 453  	}
 454  
 455  	// fractional part
 456  	if s.ch == '.' {
 457  		tok = token.FLOAT
 458  		if prefix == 'o' || prefix == 'b' {
 459  			s.error(s.offset, "invalid radix point in "+litname(prefix))
 460  		}
 461  		s.next()
 462  		digsep |= s.digits(base, &invalid)
 463  	}
 464  
 465  	if digsep&1 == 0 {
 466  		s.error(s.offset, litname(prefix)+" has no digits")
 467  	}
 468  
 469  	// exponent
 470  	if e := lower(s.ch); e == 'e' || e == 'p' {
 471  		switch {
 472  		case e == 'e' && prefix != 0 && prefix != '0':
 473  			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
 474  		case e == 'p' && prefix != 'x':
 475  			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
 476  		}
 477  		s.next()
 478  		tok = token.FLOAT
 479  		if s.ch == '+' || s.ch == '-' {
 480  			s.next()
 481  		}
 482  		ds := s.digits(10, nil)
 483  		digsep |= ds
 484  		if ds&1 == 0 {
 485  			s.error(s.offset, "exponent has no digits")
 486  		}
 487  	} else if prefix == 'x' && tok == token.FLOAT {
 488  		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
 489  	}
 490  
 491  	// suffix 'i'
 492  	if s.ch == 'i' {
 493  		tok = token.IMAG
 494  		s.next()
 495  	}
 496  
 497  	lit := string(s.src[offs:s.offset])
 498  	if tok == token.INT && invalid >= 0 {
 499  		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
 500  	}
 501  	if digsep&2 != 0 {
 502  		if i := invalidSep(lit); i >= 0 {
 503  			s.error(offs+i, "'_' must separate successive digits")
 504  		}
 505  	}
 506  
 507  	return tok, lit
 508  }
 509  
 510  func litname(prefix rune) string {
 511  	switch prefix {
 512  	case 'x':
 513  		return "hexadecimal literal"
 514  	case 'o', '0':
 515  		return "octal literal"
 516  	case 'b':
 517  		return "binary literal"
 518  	}
 519  	return "decimal literal"
 520  }
 521  
 522  // invalidSep returns the index of the first invalid separator in x, or -1.
 523  func invalidSep(x string) int {
 524  	x1 := ' ' // prefix char, we only care if it's 'x'
 525  	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
 526  	i := 0
 527  
 528  	// a prefix counts as a digit
 529  	if len(x) >= 2 && x[0] == '0' {
 530  		x1 = lower(rune(x[1]))
 531  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
 532  			d = '0'
 533  			i = 2
 534  		}
 535  	}
 536  
 537  	// mantissa and exponent
 538  	for ; i < len(x); i++ {
 539  		p := d // previous digit
 540  		d = rune(x[i])
 541  		switch {
 542  		case d == '_':
 543  			if p != '0' {
 544  				return i
 545  			}
 546  		case isDecimal(d) || x1 == 'x' && isHex(d):
 547  			d = '0'
 548  		default:
 549  			if p == '_' {
 550  				return i - 1
 551  			}
 552  			d = '.'
 553  		}
 554  	}
 555  	if d == '_' {
 556  		return len(x) - 1
 557  	}
 558  
 559  	return -1
 560  }
 561  
 562  // scanEscape parses an escape sequence where rune is the accepted
 563  // escaped quote. In case of a syntax error, it stops at the offending
 564  // character (without consuming it) and returns false. Otherwise
 565  // it returns true.
 566  func (s *Scanner) scanEscape(quote rune) bool {
 567  	offs := s.offset
 568  
 569  	var n int
 570  	var base, max uint32
 571  	switch s.ch {
 572  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
 573  		s.next()
 574  		return true
 575  	case '0', '1', '2', '3', '4', '5', '6', '7':
 576  		n, base, max = 3, 8, 255
 577  	case 'x':
 578  		s.next()
 579  		n, base, max = 2, 16, 255
 580  	case 'u':
 581  		s.next()
 582  		n, base, max = 4, 16, unicode.MaxRune
 583  	case 'U':
 584  		s.next()
 585  		n, base, max = 8, 16, unicode.MaxRune
 586  	default:
 587  		msg := "unknown escape sequence"
 588  		if s.ch < 0 {
 589  			msg = "escape sequence not terminated"
 590  		}
 591  		s.error(offs, msg)
 592  		return false
 593  	}
 594  
 595  	var x uint32
 596  	for n > 0 {
 597  		d := uint32(digitVal(s.ch))
 598  		if d >= base {
 599  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
 600  			if s.ch < 0 {
 601  				msg = "escape sequence not terminated"
 602  			}
 603  			s.error(s.offset, msg)
 604  			return false
 605  		}
 606  		x = x*base + d
 607  		s.next()
 608  		n--
 609  	}
 610  
 611  	if x > max || 0xD800 <= x && x < 0xE000 {
 612  		s.error(offs, "escape sequence is invalid Unicode code point")
 613  		return false
 614  	}
 615  
 616  	return true
 617  }
 618  
 619  func (s *Scanner) scanRune() string {
 620  	// '\'' opening already consumed
 621  	offs := s.offset - 1
 622  
 623  	valid := true
 624  	n := 0
 625  	for {
 626  		ch := s.ch
 627  		if ch == '\n' || ch < 0 {
 628  			// only report error if we don't have one already
 629  			if valid {
 630  				s.error(offs, "rune literal not terminated")
 631  				valid = false
 632  			}
 633  			break
 634  		}
 635  		s.next()
 636  		if ch == '\'' {
 637  			break
 638  		}
 639  		n++
 640  		if ch == '\\' {
 641  			if !s.scanEscape('\'') {
 642  				valid = false
 643  			}
 644  			// continue to read to closing quote
 645  		}
 646  	}
 647  
 648  	if valid && n != 1 {
 649  		s.error(offs, "illegal rune literal")
 650  	}
 651  
 652  	return string(s.src[offs:s.offset])
 653  }
 654  
 655  func (s *Scanner) scanString() string {
 656  	// '"' opening already consumed
 657  	offs := s.offset - 1
 658  
 659  	for {
 660  		ch := s.ch
 661  		if ch == '\n' || ch < 0 {
 662  			s.error(offs, "string literal not terminated")
 663  			break
 664  		}
 665  		s.next()
 666  		if ch == '"' {
 667  			break
 668  		}
 669  		if ch == '\\' {
 670  			s.scanEscape('"')
 671  		}
 672  	}
 673  
 674  	return string(s.src[offs:s.offset])
 675  }
 676  
 677  func stripCR(b []byte, comment bool) []byte {
 678  	c := []byte{:len(b)}
 679  	i := 0
 680  	for j, ch := range b {
 681  		// In a /*-style comment, don't strip \r from *\r/ (incl.
 682  		// sequences of \r from *\r\r...\r/) since the resulting
 683  		// */ would terminate the comment too early unless the \r
 684  		// is immediately following the opening /* in which case
 685  		// it's ok because /*/ is not closed yet (issue #11151).
 686  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
 687  			c[i] = ch
 688  			i++
 689  		}
 690  	}
 691  	return c[:i]
 692  }
 693  
 694  func (s *Scanner) scanRawString() string {
 695  	// '`' opening already consumed
 696  	offs := s.offset - 1
 697  
 698  	hasCR := false
 699  	for {
 700  		ch := s.ch
 701  		if ch < 0 {
 702  			s.error(offs, "raw string literal not terminated")
 703  			break
 704  		}
 705  		s.next()
 706  		if ch == '`' {
 707  			break
 708  		}
 709  		if ch == '\r' {
 710  			hasCR = true
 711  		}
 712  	}
 713  
 714  	lit := s.src[offs:s.offset]
 715  	if hasCR {
 716  		lit = stripCR(lit, false)
 717  	}
 718  
 719  	return string(lit)
 720  }
 721  
 722  func (s *Scanner) skipWhitespace() {
 723  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
 724  		s.next()
 725  	}
 726  }
 727  
 728  // Helper functions for scanning multi-byte tokens such as >> += >>= .
 729  // Different routines recognize different length tok_i based on matches
 730  // of ch_i. If a token ends in '=', the result is tok1 or tok3
 731  // respectively. Otherwise, the result is tok0 if there was no other
 732  // matching character, or tok2 if the matching character was ch2.
 733  
 734  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 735  	if s.ch == '=' {
 736  		s.next()
 737  		return tok1
 738  	}
 739  	return tok0
 740  }
 741  
 742  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
 743  	if s.ch == '=' {
 744  		s.next()
 745  		return tok1
 746  	}
 747  	if s.ch == ch2 {
 748  		s.next()
 749  		return tok2
 750  	}
 751  	return tok0
 752  }
 753  
 754  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
 755  	if s.ch == '=' {
 756  		s.next()
 757  		return tok1
 758  	}
 759  	if s.ch == ch2 {
 760  		s.next()
 761  		if s.ch == '=' {
 762  			s.next()
 763  			return tok3
 764  		}
 765  		return tok2
 766  	}
 767  	return tok0
 768  }
 769  
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// [token.EOF].
//
// If the returned token is a literal ([token.IDENT], [token.INT], [token.FLOAT],
// [token.IMAG], [token.CHAR], [token.STRING]) or [token.COMMENT], the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is [token.SEMICOLON], the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is [token.ILLEGAL], the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	if s.nlPos.IsValid() {
		// Return artificial ';' token after /*...*/ comment
		// containing newline, at position of first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	//
	// The local insertSemi records whether a newline following this token
	// must become a semicolon; it is copied into s.insertSemi at the end.
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		// From here on ch is the character being classified and s.ch
		// is the character that follows it.
		s.next() // always make progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// For /*...*/ containing \n, return
					// COMMENT then artificial SEMICOLON.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // preserve insertSemi info
				}
				if s.mode&ScanComments == 0 {
					// skip comment
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				// Report an informative error for U+201[CD] quotation
				// marks, which are easily introduced via copy and paste.
				if ch == '“' || ch == '”' {
					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
				} else {
					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
				}
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// With dontInsertSemis set, s.insertSemi is never updated here, so
	// newlines keep being skipped as plain whitespace.
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
 975