1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Package scanner implements a scanner for Go source text.
6 // It takes a []byte as source which can then be tokenized
7 // through repeated calls to the Scan method.
8 package scanner
9 10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
// ErrorHandler is the type of the optional callback that may be passed to
// [Scanner.Init]. When a handler is installed and the scanner runs into a
// syntax error, the handler is invoked with the position of the error and
// a message describing it. The position points to the beginning of the
// offending token.
type ErrorHandler func(pos token.Position, msg string)
25 26 // A Scanner holds the scanner's internal state while processing
27 // a given text. It can be allocated as part of another data
28 // structure but must be initialized via [Scanner.Init] before use.
29 type Scanner struct {
30 // immutable state
31 file *token.File // source file handle
32 dir string // directory portion of file.Name()
33 src []byte // source
34 err ErrorHandler // error reporting; or nil
35 mode Mode // scanning mode
36 37 // scanning state
38 ch rune // current character
39 offset int // character offset
40 rdOffset int // reading offset (position after current character)
41 lineOffset int // current line offset
42 insertSemi bool // insert a semicolon before next newline
43 nlPos token.Pos // position of newline in preceding comment
44 45 // public state - ok to modify
46 ErrorCount int // number of errors encountered
47 }
const (
	// bom is the Unicode byte order mark; it is only permitted as the
	// very first character of a source.
	bom = 0xFEFF

	// eof is the value of the current character once the end of the
	// source has been reached.
	eof = -1
)
53 54 // Read the next Unicode char into s.ch.
55 // s.ch < 0 means end-of-file.
56 //
57 // For optimization, there is some overlap between this method and
58 // s.scanIdentifier.
59 func (s *Scanner) next() {
60 if s.rdOffset < len(s.src) {
61 s.offset = s.rdOffset
62 if s.ch == '\n' {
63 s.lineOffset = s.offset
64 s.file.AddLine(s.offset)
65 }
66 r, w := rune(s.src[s.rdOffset]), 1
67 switch {
68 case r == 0:
69 s.error(s.offset, "illegal character NUL")
70 case r >= utf8.RuneSelf:
71 // not ASCII
72 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
73 if r == utf8.RuneError && w == 1 {
74 in := s.src[s.rdOffset:]
75 if s.offset == 0 &&
76 len(in) >= 2 &&
77 (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
78 // U+FEFF BOM at start of file, encoded as big- or little-endian
79 // UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
80 s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
81 s.rdOffset += len(in) // consume all input to avoid error cascade
82 } else {
83 s.error(s.offset, "illegal UTF-8 encoding")
84 }
85 } else if r == bom && s.offset > 0 {
86 s.error(s.offset, "illegal byte order mark")
87 }
88 }
89 s.rdOffset += w
90 s.ch = r
91 } else {
92 s.offset = len(s.src)
93 if s.ch == '\n' {
94 s.lineOffset = s.offset
95 s.file.AddLine(s.offset)
96 }
97 s.ch = eof
98 }
99 }
100 101 // peek returns the byte following the most recently read character without
102 // advancing the scanner. If the scanner is at EOF, peek returns 0.
103 func (s *Scanner) peek() byte {
104 if s.rdOffset < len(s.src) {
105 return s.src[s.rdOffset]
106 }
107 return 0
108 }
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

// Flags that may be combined into a Mode.
const (
	// ScanComments makes the scanner return comments as COMMENT tokens.
	ScanComments Mode = 1 << iota

	// dontInsertSemis turns off automatic semicolon insertion.
	// It is for testing only.
	dontInsertSemis
)
118 119 // Init prepares the scanner s to tokenize the text src by setting the
120 // scanner at the beginning of src. The scanner uses the file set file
121 // for position information and it adds line information for each line.
122 // It is ok to re-use the same file when re-scanning the same file as
123 // line information which is already present is ignored. Init causes a
124 // panic if the file size does not match the src size.
125 //
126 // Calls to [Scanner.Scan] will invoke the error handler err if they encounter a
127 // syntax error and err is not nil. Also, for each error encountered,
128 // the [Scanner] field ErrorCount is incremented by one. The mode parameter
129 // determines how comments are handled.
130 //
131 // Note that Init may call err if there is an error in the first character
132 // of the file.
133 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
134 // Explicitly initialize all fields since a scanner may be reused.
135 if file.Size() != len(src) {
136 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
137 }
138 s.file = file
139 s.dir, _ = filepath.Split(file.Name())
140 s.src = src
141 s.err = err
142 s.mode = mode
143 144 s.ch = ' '
145 s.offset = 0
146 s.rdOffset = 0
147 s.lineOffset = 0
148 s.insertSemi = false
149 s.ErrorCount = 0
150 151 s.next()
152 if s.ch == bom {
153 s.next() // ignore BOM at file beginning
154 }
155 }
156 157 func (s *Scanner) error(offs int, msg string) {
158 if s.err != nil {
159 s.err(s.file.Position(s.file.Pos(offs)), msg)
160 }
161 s.ErrorCount++
162 }
163 164 func (s *Scanner) errorf(offs int, format string, args ...any) {
165 s.error(offs, fmt.Sprintf(format, args...))
166 }
167 168 // scanComment returns the text of the comment and (if nonzero)
169 // the offset of the first newline within it, which implies a
170 // /*...*/ comment.
171 func (s *Scanner) scanComment() (string, int) {
172 // initial '/' already consumed; s.ch == '/' || s.ch == '*'
173 offs := s.offset - 1 // position of initial '/'
174 next := -1 // position immediately following the comment; < 0 means invalid comment
175 numCR := 0
176 nlOffset := 0 // offset of first newline within /*...*/ comment
177 178 if s.ch == '/' {
179 //-style comment
180 // (the final '\n' is not considered part of the comment)
181 s.next()
182 for s.ch != '\n' && s.ch >= 0 {
183 if s.ch == '\r' {
184 numCR++
185 }
186 s.next()
187 }
188 // if we are at '\n', the position following the comment is afterwards
189 next = s.offset
190 if s.ch == '\n' {
191 next++
192 }
193 goto exit
194 }
195 196 /*-style comment */
197 s.next()
198 for s.ch >= 0 {
199 ch := s.ch
200 if ch == '\r' {
201 numCR++
202 } else if ch == '\n' && nlOffset == 0 {
203 nlOffset = s.offset
204 }
205 s.next()
206 if ch == '*' && s.ch == '/' {
207 s.next()
208 next = s.offset
209 goto exit
210 }
211 }
212 213 s.error(offs, "comment not terminated")
214 215 exit:
216 lit := s.src[offs:s.offset]
217 218 // On Windows, a (//-comment) line may end in "\r\n".
219 // Remove the final '\r' before analyzing the text for
220 // line directives (matching the compiler). Remove any
221 // other '\r' afterwards (matching the pre-existing be-
222 // havior of the scanner).
223 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
224 lit = lit[:len(lit)-1]
225 numCR--
226 }
227 228 // interpret line directives
229 // (//line directives must start at the beginning of the current line)
230 if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
231 s.updateLineInfo(next, offs, lit)
232 }
233 234 if numCR > 0 {
235 lit = stripCR(lit, lit[1] == '*')
236 }
237 238 return string(lit), nlOffset
239 }
// prefix is the text that must directly follow the opening "//" or "/*"
// of a comment for it to be considered a line directive.
var prefix = []byte("line ")
242 243 // updateLineInfo parses the incoming comment text at offset offs
244 // as a line directive. If successful, it updates the line info table
245 // for the position next per the line directive.
246 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
247 // extract comment text
248 if text[1] == '*' {
249 text = text[:len(text)-2] // lop off trailing "*/"
250 }
251 text = text[7:] // lop off leading "//line " or "/*line "
252 offs += 7
253 254 i, n, ok := trailingDigits(text)
255 if i == 0 {
256 return // ignore (not a line directive)
257 }
258 // i > 0
259 260 if !ok {
261 // text has a suffix :xxx but xxx is not a number
262 s.error(offs+i, "invalid line number: "+string(text[i:]))
263 return
264 }
265 266 // Put a cap on the maximum size of line and column numbers.
267 // 30 bits allows for some additional space before wrapping an int32.
268 // Keep this consistent with cmd/compile/internal/syntax.PosMax.
269 const maxLineCol = 1 << 30
270 var line, col int
271 i2, n2, ok2 := trailingDigits(text[:i-1])
272 if ok2 {
273 //line filename:line:col
274 i, i2 = i2, i
275 line, col = n2, n
276 if col == 0 || col > maxLineCol {
277 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
278 return
279 }
280 text = text[:i2-1] // lop off ":col"
281 } else {
282 //line filename:line
283 line = n
284 }
285 286 if line == 0 || line > maxLineCol {
287 s.error(offs+i, "invalid line number: "+string(text[i:]))
288 return
289 }
290 291 // If we have a column (//line filename:line:col form),
292 // an empty filename means to use the previous filename.
293 filename := string(text[:i-1]) // lop off ":line", and trim white space
294 if filename == "" && ok2 {
295 filename = s.file.Position(s.file.Pos(offs)).Filename
296 } else if filename != "" {
297 // Put a relative filename in the current directory.
298 // This is for compatibility with earlier releases.
299 // See issue 26671.
300 filename = filepath.Clean(filename)
301 if !filepath.IsAbs(filename) {
302 filename = filepath.Join(s.dir, filename)
303 }
304 }
305 306 s.file.AddLineColumnInfo(next, filename, line, col)
307 }
// trailingDigits looks for the last ':' in text and parses the bytes
// following it as an unsigned decimal number.
//
// It returns the index just past that ':' (or 0 if text contains no ':'),
// the parsed number, and whether the suffix was a valid number.
//
// A number that parses but does not fit into an int is clamped to the
// largest int instead of being allowed to wrap around to a negative
// value, so that the upper-bound checks done by callers reject it.
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0
	const maxInt = int(^uint(0) >> 1)
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if n > uint64(maxInt) {
		n = uint64(maxInt)
	}
	return i + 1, int(n), err == nil
}
// isLetter reports whether ch may start an identifier: an ASCII letter,
// an underscore, or a non-ASCII character that Unicode classifies as a
// letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
		return true
	case ch >= utf8.RuneSelf:
		return unicode.IsLetter(ch)
	}
	return false
}
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// character that Unicode classifies as a digit.
func isDigit(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	return ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}
326 327 // scanIdentifier reads the string of valid identifier characters at s.offset.
328 // It must only be called when s.ch is known to be a valid letter.
329 //
330 // Be careful when making changes to this function: it is optimized and affects
331 // scanning performance significantly.
332 func (s *Scanner) scanIdentifier() string {
333 offs := s.offset
334 335 // Optimize for the common case of an ASCII identifier.
336 //
337 // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
338 // avoids conversions to runes.
339 //
340 // In case we encounter a non-ASCII character, fall back on the slower path
341 // of calling into s.next().
342 for rdOffset, b := range s.src[s.rdOffset:] {
343 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
344 // Avoid assigning a rune for the common case of an ascii character.
345 continue
346 }
347 s.rdOffset += rdOffset
348 if 0 < b && b < utf8.RuneSelf {
349 // Optimization: we've encountered an ASCII character that's not a letter
350 // or number. Avoid the call into s.next() and corresponding set up.
351 //
352 // Note that s.next() does some line accounting if s.ch is '\n', so this
353 // shortcut is only possible because we know that the preceding character
354 // is not '\n'.
355 s.ch = rune(b)
356 s.offset = s.rdOffset
357 s.rdOffset++
358 goto exit
359 }
360 // We know that the preceding character is valid for an identifier because
361 // scanIdentifier is only called when s.ch is a letter, so calling s.next()
362 // at s.rdOffset resets the scanner state.
363 s.next()
364 for isLetter(s.ch) || isDigit(s.ch) {
365 s.next()
366 }
367 goto exit
368 }
369 s.offset = len(s.src)
370 s.rdOffset = len(s.src)
371 s.ch = eof
372 373 exit:
374 return string(s.src[offs:s.offset])
375 }
// digitVal returns the numeric value of the hexadecimal digit ch
// (upper or lower case). For any other character it returns 16, which
// is larger than any legal digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
// lower returns the lower-case form of ch iff ch is an ASCII letter;
// for other characters the result is not meaningful as a case mapping.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
// isHex reports whether ch is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHex(ch rune) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
390 391 // digits accepts the sequence { digit | '_' }.
392 // If base <= 10, digits accepts any decimal digit but records
393 // the offset (relative to the source start) of a digit >= base
394 // in *invalid, if *invalid < 0.
395 // digits returns a bitset describing whether the sequence contained
396 // digits (bit 0 is set), or separators '_' (bit 1 is set).
397 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
398 if base <= 10 {
399 max := rune('0' + base)
400 for isDecimal(s.ch) || s.ch == '_' {
401 ds := 1
402 if s.ch == '_' {
403 ds = 2
404 } else if s.ch >= max && *invalid < 0 {
405 *invalid = s.offset // record invalid rune offset
406 }
407 digsep |= ds
408 s.next()
409 }
410 } else {
411 for isHex(s.ch) || s.ch == '_' {
412 ds := 1
413 if s.ch == '_' {
414 ds = 2
415 }
416 digsep |= ds
417 s.next()
418 }
419 }
420 return
421 }
422 423 func (s *Scanner) scanNumber() (token.Token, string) {
424 offs := s.offset
425 tok := token.ILLEGAL
426 427 base := 10 // number base
428 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
429 digsep := 0 // bit 0: digit present, bit 1: '_' present
430 invalid := -1 // index of invalid digit in literal, or < 0
431 432 // integer part
433 if s.ch != '.' {
434 tok = token.INT
435 if s.ch == '0' {
436 s.next()
437 switch lower(s.ch) {
438 case 'x':
439 s.next()
440 base, prefix = 16, 'x'
441 case 'o':
442 s.next()
443 base, prefix = 8, 'o'
444 case 'b':
445 s.next()
446 base, prefix = 2, 'b'
447 default:
448 base, prefix = 8, '0'
449 digsep = 1 // leading 0
450 }
451 }
452 digsep |= s.digits(base, &invalid)
453 }
454 455 // fractional part
456 if s.ch == '.' {
457 tok = token.FLOAT
458 if prefix == 'o' || prefix == 'b' {
459 s.error(s.offset, "invalid radix point in "+litname(prefix))
460 }
461 s.next()
462 digsep |= s.digits(base, &invalid)
463 }
464 465 if digsep&1 == 0 {
466 s.error(s.offset, litname(prefix)+" has no digits")
467 }
468 469 // exponent
470 if e := lower(s.ch); e == 'e' || e == 'p' {
471 switch {
472 case e == 'e' && prefix != 0 && prefix != '0':
473 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
474 case e == 'p' && prefix != 'x':
475 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
476 }
477 s.next()
478 tok = token.FLOAT
479 if s.ch == '+' || s.ch == '-' {
480 s.next()
481 }
482 ds := s.digits(10, nil)
483 digsep |= ds
484 if ds&1 == 0 {
485 s.error(s.offset, "exponent has no digits")
486 }
487 } else if prefix == 'x' && tok == token.FLOAT {
488 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
489 }
490 491 // suffix 'i'
492 if s.ch == 'i' {
493 tok = token.IMAG
494 s.next()
495 }
496 497 lit := string(s.src[offs:s.offset])
498 if tok == token.INT && invalid >= 0 {
499 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
500 }
501 if digsep&2 != 0 {
502 if i := invalidSep(lit); i >= 0 {
503 s.error(offs+i, "'_' must separate successive digits")
504 }
505 }
506 507 return tok, lit
508 }
// litname returns the name used in error messages for a number literal
// with the given prefix: 'x' (hexadecimal), 'o' or '0' (octal), or
// 'b' (binary). Any other prefix denotes a decimal literal.
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'b':
		name = "binary literal"
	case '0', 'o':
		name = "octal literal"
	case 'x':
		name = "hexadecimal literal"
	}
	return name
}
// invalidSep returns the index of the first invalid separator in x, or -1.
// A separator '_' is valid only directly between two digits, where a base
// prefix (0x, 0o, 0b) counts as a digit.
func invalidSep(x string) int {
	// Classification of the most recently seen character.
	const (
		other = iota // anything that is neither a digit nor a separator
		digit
		sep
	)

	prev := other
	start := 0
	hex := false // only a hexadecimal prefix makes a-f digits

	// a prefix counts as a digit
	if len(x) >= 2 && x[0] == '0' {
		switch x[1] {
		case 'x', 'X':
			hex = true
			prev, start = digit, 2
		case 'o', 'O', 'b', 'B':
			prev, start = digit, 2
		}
	}

	// mantissa and exponent
	for i := start; i < len(x); i++ {
		c := x[i]
		switch {
		case c == '_':
			if prev != digit {
				return i
			}
			prev = sep
		case '0' <= c && c <= '9',
			hex && ('a' <= c && c <= 'f' || 'A' <= c && c <= 'F'):
			prev = digit
		default:
			if prev == sep {
				return i - 1
			}
			prev = other
		}
	}
	if prev == sep {
		return len(x) - 1
	}

	return -1
}
561 562 // scanEscape parses an escape sequence where rune is the accepted
563 // escaped quote. In case of a syntax error, it stops at the offending
564 // character (without consuming it) and returns false. Otherwise
565 // it returns true.
566 func (s *Scanner) scanEscape(quote rune) bool {
567 offs := s.offset
568 569 var n int
570 var base, max uint32
571 switch s.ch {
572 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
573 s.next()
574 return true
575 case '0', '1', '2', '3', '4', '5', '6', '7':
576 n, base, max = 3, 8, 255
577 case 'x':
578 s.next()
579 n, base, max = 2, 16, 255
580 case 'u':
581 s.next()
582 n, base, max = 4, 16, unicode.MaxRune
583 case 'U':
584 s.next()
585 n, base, max = 8, 16, unicode.MaxRune
586 default:
587 msg := "unknown escape sequence"
588 if s.ch < 0 {
589 msg = "escape sequence not terminated"
590 }
591 s.error(offs, msg)
592 return false
593 }
594 595 var x uint32
596 for n > 0 {
597 d := uint32(digitVal(s.ch))
598 if d >= base {
599 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
600 if s.ch < 0 {
601 msg = "escape sequence not terminated"
602 }
603 s.error(s.offset, msg)
604 return false
605 }
606 x = x*base + d
607 s.next()
608 n--
609 }
610 611 if x > max || 0xD800 <= x && x < 0xE000 {
612 s.error(offs, "escape sequence is invalid Unicode code point")
613 return false
614 }
615 616 return true
617 }
618 619 func (s *Scanner) scanRune() string {
620 // '\'' opening already consumed
621 offs := s.offset - 1
622 623 valid := true
624 n := 0
625 for {
626 ch := s.ch
627 if ch == '\n' || ch < 0 {
628 // only report error if we don't have one already
629 if valid {
630 s.error(offs, "rune literal not terminated")
631 valid = false
632 }
633 break
634 }
635 s.next()
636 if ch == '\'' {
637 break
638 }
639 n++
640 if ch == '\\' {
641 if !s.scanEscape('\'') {
642 valid = false
643 }
644 // continue to read to closing quote
645 }
646 }
647 648 if valid && n != 1 {
649 s.error(offs, "illegal rune literal")
650 }
651 652 return string(s.src[offs:s.offset])
653 }
654 655 func (s *Scanner) scanString() string {
656 // '"' opening already consumed
657 offs := s.offset - 1
658 659 for {
660 ch := s.ch
661 if ch == '\n' || ch < 0 {
662 s.error(offs, "string literal not terminated")
663 break
664 }
665 s.next()
666 if ch == '"' {
667 break
668 }
669 if ch == '\\' {
670 s.scanEscape('"')
671 }
672 }
673 674 return string(s.src[offs:s.offset])
675 }
// stripCR returns a copy of b with carriage returns ('\r') removed.
// b itself is not modified.
//
// If comment is set, b is taken to be a /*-style comment, and a '\r'
// that sits between '*' and '/' is kept (see below).
func stripCR(b []byte, comment bool) []byte {
	c := make([]byte, len(b))
	i := 0
	for j, ch := range b {
		// In a /*-style comment, don't strip \r from *\r/ (incl.
		// sequences of \r from *\r\r...\r/) since the resulting
		// */ would terminate the comment too early unless the \r
		// is immediately following the opening /* in which case
		// it's ok because /*/ is not closed yet (issue #11151).
		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}
693 694 func (s *Scanner) scanRawString() string {
695 // '`' opening already consumed
696 offs := s.offset - 1
697 698 hasCR := false
699 for {
700 ch := s.ch
701 if ch < 0 {
702 s.error(offs, "raw string literal not terminated")
703 break
704 }
705 s.next()
706 if ch == '`' {
707 break
708 }
709 if ch == '\r' {
710 hasCR = true
711 }
712 }
713 714 lit := s.src[offs:s.offset]
715 if hasCR {
716 lit = stripCR(lit, false)
717 }
718 719 return string(lit)
720 }
721 722 func (s *Scanner) skipWhitespace() {
723 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
724 s.next()
725 }
726 }
727 728 // Helper functions for scanning multi-byte tokens such as >> += >>= .
729 // Different routines recognize different length tok_i based on matches
730 // of ch_i. If a token ends in '=', the result is tok1 or tok3
731 // respectively. Otherwise, the result is tok0 if there was no other
732 // matching character, or tok2 if the matching character was ch2.
733 734 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
735 if s.ch == '=' {
736 s.next()
737 return tok1
738 }
739 return tok0
740 }
741 742 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
743 if s.ch == '=' {
744 s.next()
745 return tok1
746 }
747 if s.ch == ch2 {
748 s.next()
749 return tok2
750 }
751 return tok0
752 }
753 754 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
755 if s.ch == '=' {
756 s.next()
757 return tok1
758 }
759 if s.ch == ch2 {
760 s.next()
761 if s.ch == '=' {
762 s.next()
763 return tok3
764 }
765 return tok2
766 }
767 return tok0
768 }
769 770 // Scan scans the next token and returns the token position, the token,
771 // and its literal string if applicable. The source end is indicated by
772 // [token.EOF].
773 //
774 // If the returned token is a literal ([token.IDENT], [token.INT], [token.FLOAT],
775 // [token.IMAG], [token.CHAR], [token.STRING]) or [token.COMMENT], the literal string
776 // has the corresponding value.
777 //
778 // If the returned token is a keyword, the literal string is the keyword.
779 //
780 // If the returned token is [token.SEMICOLON], the corresponding
781 // literal string is ";" if the semicolon was present in the source,
782 // and "\n" if the semicolon was inserted because of a newline or
783 // at EOF.
784 //
785 // If the returned token is [token.ILLEGAL], the literal string is the
786 // offending character.
787 //
788 // In all other cases, Scan returns an empty literal string.
789 //
790 // For more tolerant parsing, Scan will return a valid token if
791 // possible even if a syntax error was encountered. Thus, even
792 // if the resulting token sequence contains no illegal tokens,
793 // a client may not assume that no error occurred. Instead it
794 // must check the scanner's ErrorCount or the number of calls
795 // of the error handler, if there was one installed.
796 //
797 // Scan adds line information to the file added to the file
798 // set with Init. Token positions are relative to that file
799 // and thus relative to the file set.
800 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
801 scanAgain:
802 if s.nlPos.IsValid() {
803 // Return artificial ';' token after /*...*/ comment
804 // containing newline, at position of first newline.
805 pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
806 s.nlPos = token.NoPos
807 return
808 }
809 810 s.skipWhitespace()
811 812 // current token start
813 pos = s.file.Pos(s.offset)
814 815 // determine token value
816 insertSemi := false
817 switch ch := s.ch; {
818 case isLetter(ch):
819 lit = s.scanIdentifier()
820 if len(lit) > 1 {
821 // keywords are longer than one letter - avoid lookup otherwise
822 tok = token.Lookup(lit)
823 switch tok {
824 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
825 insertSemi = true
826 }
827 } else {
828 insertSemi = true
829 tok = token.IDENT
830 }
831 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
832 insertSemi = true
833 tok, lit = s.scanNumber()
834 default:
835 s.next() // always make progress
836 switch ch {
837 case eof:
838 if s.insertSemi {
839 s.insertSemi = false // EOF consumed
840 return pos, token.SEMICOLON, "\n"
841 }
842 tok = token.EOF
843 case '\n':
844 // we only reach here if s.insertSemi was
845 // set in the first place and exited early
846 // from s.skipWhitespace()
847 s.insertSemi = false // newline consumed
848 return pos, token.SEMICOLON, "\n"
849 case '"':
850 insertSemi = true
851 tok = token.STRING
852 lit = s.scanString()
853 case '\'':
854 insertSemi = true
855 tok = token.CHAR
856 lit = s.scanRune()
857 case '`':
858 insertSemi = true
859 tok = token.STRING
860 lit = s.scanRawString()
861 case ':':
862 tok = s.switch2(token.COLON, token.DEFINE)
863 case '.':
864 // fractions starting with a '.' are handled by outer switch
865 tok = token.PERIOD
866 if s.ch == '.' && s.peek() == '.' {
867 s.next()
868 s.next() // consume last '.'
869 tok = token.ELLIPSIS
870 }
871 case ',':
872 tok = token.COMMA
873 case ';':
874 tok = token.SEMICOLON
875 lit = ";"
876 case '(':
877 tok = token.LPAREN
878 case ')':
879 insertSemi = true
880 tok = token.RPAREN
881 case '[':
882 tok = token.LBRACK
883 case ']':
884 insertSemi = true
885 tok = token.RBRACK
886 case '{':
887 tok = token.LBRACE
888 case '}':
889 insertSemi = true
890 tok = token.RBRACE
891 case '+':
892 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
893 if tok == token.INC {
894 insertSemi = true
895 }
896 case '-':
897 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
898 if tok == token.DEC {
899 insertSemi = true
900 }
901 case '*':
902 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
903 case '/':
904 if s.ch == '/' || s.ch == '*' {
905 // comment
906 comment, nlOffset := s.scanComment()
907 if s.insertSemi && nlOffset != 0 {
908 // For /*...*/ containing \n, return
909 // COMMENT then artificial SEMICOLON.
910 s.nlPos = s.file.Pos(nlOffset)
911 s.insertSemi = false
912 } else {
913 insertSemi = s.insertSemi // preserve insertSemi info
914 }
915 if s.mode&ScanComments == 0 {
916 // skip comment
917 goto scanAgain
918 }
919 tok = token.COMMENT
920 lit = comment
921 } else {
922 // division
923 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
924 }
925 case '%':
926 tok = s.switch2(token.REM, token.REM_ASSIGN)
927 case '^':
928 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
929 case '<':
930 if s.ch == '-' {
931 s.next()
932 tok = token.ARROW
933 } else {
934 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
935 }
936 case '>':
937 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
938 case '=':
939 tok = s.switch2(token.ASSIGN, token.EQL)
940 case '!':
941 tok = s.switch2(token.NOT, token.NEQ)
942 case '&':
943 if s.ch == '^' {
944 s.next()
945 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
946 } else {
947 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
948 }
949 case '|':
950 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
951 case '~':
952 tok = token.TILDE
953 default:
954 // next reports unexpected BOMs - don't repeat
955 if ch != bom {
956 // Report an informative error for U+201[CD] quotation
957 // marks, which are easily introduced via copy and paste.
958 if ch == '“' || ch == '”' {
959 s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
960 } else {
961 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
962 }
963 }
964 insertSemi = s.insertSemi // preserve insertSemi info
965 tok = token.ILLEGAL
966 lit = string(ch)
967 }
968 }
969 if s.mode&dontInsertSemis == 0 {
970 s.insertSemi = insertSemi
971 }
972 973 return
974 }
975