scanner.go raw
1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This file implements scanner, a lexical tokenizer for
6 // Go source. After initialization, consecutive calls of
7 // next advance the scanner one token at a time.
8 //
9 // This file, source.go, tokens.go, and token_string.go are self-contained
10 // (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
11 // and thus could be made into their own package.
12
13 package syntax
14
15 import (
16 "fmt"
17 "io"
18 "unicode"
19 "unicode/utf8"
20 )
21
22 // The mode flags below control which comments are reported
23 // by calling the error handler. If no flag is set, comments
24 // are ignored.
25 const (
26 comments uint = 1 << iota // call handler for all comments
27 directives // call handler for directives only
28 )
29
30 type Scanner struct {
31 Source
32 mode uint
33 nlsemi bool // if set '\n' and EOF translate to ';'
34
35 // current token, valid after calling next()
36 Line, Col uint32
37 Blank bool // line is blank up to col
38 Tok Token
39 Lit string // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF"); may be malformed if bad is true
40 Bad bool // valid if tok is _Literal, true if a syntax error occurred, lit may be malformed
41 Kind LitKind // valid if tok is _Literal
42 Op Operator // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
43 Prec int32 // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
44 }
45
46 func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
47 s.Source.init(src, errh)
48 s.mode = mode
49 s.nlsemi = false
50 }
51
52 // errorf reports an error at the most recently read character position.
53 func (s *Scanner) Errorf(format string, args ...interface{}) {
54 s.error(fmt.Sprintf(format, args...))
55 }
56
57 // errorAtf reports an error at a byte column offset relative to the current token start.
58 func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
59 s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
60 }
61
62 // setLit sets the scanner state for a recognized _Literal token.
63 func (s *Scanner) SetLit(kind LitKind, ok bool) {
64 s.nlsemi = true
65 s.Tok = Literal
66 s.Lit = string(s.segment())
67 s.Bad = !ok
68 s.Kind = kind
69 }
70
71 // next advances the scanner by reading the next token.
72 //
73 // If a read, source encoding, or lexical error occurs, next calls
74 // the installed error handler with the respective error position
75 // and message. The error message is guaranteed to be non-empty and
76 // never starts with a '/'. The error handler must exist.
77 //
78 // If the scanner mode includes the comments flag and a comment
79 // (including comments containing directives) is encountered, the
80 // error handler is also called with each comment position and text
81 // (including opening /* or // and closing */, but without a newline
82 // at the end of line comments). Comment text always starts with a /
83 // which can be used to distinguish these handler calls from errors.
84 //
85 // If the scanner mode includes the directives (but not the comments)
86 // flag, only comments containing a //line, /*line, or //go: directive
87 // are reported, in the same way as regular comments.
88 func (s *Scanner) Next() {
89 nlsemi := s.nlsemi
90 s.nlsemi = false
91
92 redo:
93 // skip white space
94 s.stop()
95 startLine, startCol := s.pos()
96 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
97 s.nextch()
98 }
99
100 // token start
101 s.Line, s.Col = s.pos()
102 s.Blank = s.line > startLine || startCol == Colbase
103 s.start()
104 if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
105 s.nextch()
106 s.Ident()
107 return
108 }
109
110 switch s.ch {
111 case -1:
112 if nlsemi {
113 s.Lit = "EOF"
114 s.Tok = Semi
115 break
116 }
117 s.Tok =EOF
118
119 case '\n':
120 s.nextch()
121 s.Lit = "newline"
122 s.Tok = Semi
123
124 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
125 s.Number(false)
126
127 case '"':
128 s.stdString()
129
130 case '`':
131 s.rawString()
132
133 case '\'':
134 s.rune()
135
136 case '(':
137 s.nextch()
138 s.Tok = Lparen
139
140 case '[':
141 s.nextch()
142 s.Tok = Lbrack
143
144 case '{':
145 s.nextch()
146 s.Tok = Lbrace
147
148 case ',':
149 s.nextch()
150 s.Tok = Comma
151
152 case ';':
153 s.nextch()
154 s.Lit = "semicolon"
155 s.Tok = Semi
156
157 case ')':
158 s.nextch()
159 s.nlsemi = true
160 s.Tok = Rparen
161
162 case ']':
163 s.nextch()
164 s.nlsemi = true
165 s.Tok = Rbrack
166
167 case '}':
168 s.nextch()
169 s.nlsemi = true
170 s.Tok = Rbrace
171
172 case ':':
173 s.nextch()
174 if s.ch == '=' {
175 s.nextch()
176 s.Tok = Define
177 break
178 }
179 s.Tok = Colon
180
181 case '.':
182 s.nextch()
183 if IsDecimal(s.ch) {
184 s.Number(true)
185 break
186 }
187 if s.ch == '.' {
188 s.nextch()
189 if s.ch == '.' {
190 s.nextch()
191 s.Tok = DotDotDot
192 break
193 }
194 s.rewind() // now s.ch holds 1st '.'
195 s.nextch() // consume 1st '.' again
196 }
197 s.Tok = Dot
198
199 case '+':
200 s.nextch()
201 s.Op, s.Prec = Add, PrecAdd
202 if s.ch != '+' {
203 goto assignop
204 }
205 s.nextch()
206 s.nlsemi = true
207 s.Tok = IncOp
208
209 case '-':
210 s.nextch()
211 s.Op, s.Prec = Sub, PrecAdd
212 if s.ch != '-' {
213 goto assignop
214 }
215 s.nextch()
216 s.nlsemi = true
217 s.Tok = IncOp
218
219 case '*':
220 s.nextch()
221 s.Op, s.Prec = Mul, PrecMul
222 // don't goto assignop - want _Star token
223 if s.ch == '=' {
224 s.nextch()
225 s.Tok = AssignOp
226 break
227 }
228 s.Tok = Star
229
230 case '/':
231 s.nextch()
232 if s.ch == '/' {
233 s.nextch()
234 s.lineComment()
235 goto redo
236 }
237 if s.ch == '*' {
238 s.nextch()
239 s.fullComment()
240 if line, _ := s.pos(); line > s.Line && nlsemi {
241 // A multi-line comment acts like a newline;
242 // it translates to a ';' if nlsemi is set.
243 s.Lit = "newline"
244 s.Tok = Semi
245 break
246 }
247 goto redo
248 }
249 s.Op, s.Prec = Div, PrecMul
250 goto assignop
251
252 case '%':
253 s.nextch()
254 s.Op, s.Prec = Rem, PrecMul
255 goto assignop
256
257 case '&':
258 s.nextch()
259 if s.ch == '&' {
260 s.nextch()
261 s.Op, s.Prec = AndAnd, PrecAndAnd
262 s.Tok = OperatorType
263 break
264 }
265 s.Op, s.Prec = And, PrecMul
266 if s.ch == '^' {
267 s.nextch()
268 s.Op = AndNot
269 }
270 goto assignop
271
272 case '|':
273 s.nextch()
274 if s.ch == '|' {
275 s.nextch()
276 s.Op, s.Prec = OrOr, PrecOrOr
277 s.Tok = OperatorType
278 break
279 }
280 s.Op, s.Prec = Or, PrecAdd
281 goto assignop
282
283 case '^':
284 s.nextch()
285 s.Op, s.Prec = Xor, PrecAdd
286 goto assignop
287
288 case '<':
289 s.nextch()
290 if s.ch == '=' {
291 s.nextch()
292 s.Op, s.Prec = Leq, PrecCmp
293 s.Tok = OperatorType
294 break
295 }
296 if s.ch == '<' {
297 s.nextch()
298 s.Op, s.Prec = Shl, PrecMul
299 goto assignop
300 }
301 if s.ch == '-' {
302 s.nextch()
303 s.Tok = Arrow
304 break
305 }
306 s.Op, s.Prec = Lss, PrecCmp
307 s.Tok = OperatorType
308
309 case '>':
310 s.nextch()
311 if s.ch == '=' {
312 s.nextch()
313 s.Op, s.Prec = Geq, PrecCmp
314 s.Tok =OperatorType
315 break
316 }
317 if s.ch == '>' {
318 s.nextch()
319 s.Op, s.Prec = Shr, PrecMul
320 goto assignop
321 }
322 s.Op, s.Prec = Gtr, PrecCmp
323 s.Tok = OperatorType
324
325 case '=':
326 s.nextch()
327 if s.ch == '=' {
328 s.nextch()
329 s.Op, s.Prec = Eql, PrecCmp
330 s.Tok = OperatorType
331 break
332 }
333 s.Tok = Assign
334
335 case '!':
336 s.nextch()
337 if s.ch == '=' {
338 s.nextch()
339 s.Op, s.Prec = Neq, PrecCmp
340 s.Tok = OperatorType
341 break
342 }
343 s.Op, s.Prec = Not, 0
344 s.Tok = OperatorType
345
346 case '~':
347 s.nextch()
348 s.Op, s.Prec = Tilde, 0
349 s.Tok = OperatorType
350
351 default:
352 s.Errorf("invalid character %#U", s.ch)
353 s.nextch()
354 goto redo
355 }
356
357 return
358
359 assignop:
360 if s.ch == '=' {
361 s.nextch()
362 s.Tok = AssignOp
363 return
364 }
365 s.Tok = OperatorType
366 }
367
368 func (s *Scanner) Ident() {
369 // accelerate common case (7bit ASCII)
370 for IsLetter(s.ch) || IsDecimal(s.ch) {
371 s.nextch()
372 }
373
374 // general case
375 if s.ch >= utf8.RuneSelf {
376 for s.AtIdentChar(false) {
377 s.nextch()
378 }
379 }
380
381 // possibly a keyword
382 lit := s.segment()
383 if len(lit) >= 2 {
384 if tok := keywordMap[Hash(lit)]; tok != 0 && TokStrFast(tok) == string(lit) {
385 s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
386 s.Tok = tok
387 return
388 }
389 }
390
391 s.nlsemi = true
392 s.Lit = string(lit)
393 s.Tok = NameType
394 }
395
396 // tokStrFast is a faster version of token.String, which assumes that tok
397 // is one of the valid tokens - and can thus skip bounds checks.
398 func TokStrFast(tok Token) string {
399 return token_name[token_index[tok-1]:token_index[tok]]
400 }
401
402 func (s *Scanner) AtIdentChar(first bool) bool {
403 switch {
404 case unicode.IsLetter(s.ch) || s.ch == '_':
405 // ok
406 case unicode.IsDigit(s.ch):
407 if first {
408 s.Errorf("identifier cannot begin with digit %#U", s.ch)
409 }
410 case s.ch >= utf8.RuneSelf:
411 s.Errorf("invalid character %#U in identifier", s.ch)
412 default:
413 return false
414 }
415 return true
416 }
417
418 // hash is a perfect hash function for keywords.
419 // It assumes that s has at least length 2.
420 func Hash(s []byte) uint {
421 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
422 }
423
424 var keywordMap [1 << 6]Token // size must be power of two
425
426 func init() { Init() }
427
428 func Init() {
429 // populate keywordMap
430 for tok := Break; tok <= Var; tok++ {
431 h := Hash([]byte(tok.String()))
432 if keywordMap[h] != 0 {
433 panic("imperfect hash")
434 }
435 keywordMap[h] = tok
436 }
437 }
438
439 func Lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
440 func IsLetter(ch rune) bool { return 'a' <= Lower(ch) && Lower(ch) <= 'z' || ch == '_' }
441 func IsDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
442 func IsHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= Lower(ch) && Lower(ch) <= 'f' }
443
444 // Digits accepts the sequence { digit | '_' }.
445 // If base <= 10, Digits accepts any decimal digit but records
446 // the index (relative to the literal start) of a digit >= base
447 // in *invalid, if *invalid < 0.
448 // Digits returns a bitset describing whether the sequence contained
449 // Digits (bit 0 is set), or separators '_' (bit 1 is set).
450 func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
451 if base <= 10 {
452 max := rune('0' + base)
453 for IsDecimal(s.ch) || s.ch == '_' {
454 ds := int32(1)
455 if s.ch == '_' {
456 ds = 2
457 } else if s.ch >= max && *invalid < 0 {
458 _, col := s.pos()
459 *invalid = int32(col - s.col)
460 }
461 digsep |= ds
462 s.nextch()
463 }
464 } else {
465 for IsHex(s.ch) || s.ch == '_' {
466 ds := int32(1)
467 if s.ch == '_' {
468 ds = 2
469 }
470 digsep |= ds
471 s.nextch()
472 }
473 }
474 return
475 }
476
477 func (s *Scanner) Number(seenPoint bool) {
478 ok := true
479 kind := IntLit
480 base := int32(10) // number base
481 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
482 digsep := int32(0) // bit 0: digit present, bit 1: '_' present
483 invalid := int32(-1) // index of invalid digit in literal, or < 0
484
485 // integer part
486 if !seenPoint {
487 if s.ch == '0' {
488 s.nextch()
489 switch Lower(s.ch) {
490 case 'x':
491 s.nextch()
492 base, prefix = 16, 'x'
493 case 'o':
494 s.nextch()
495 base, prefix = 8, 'o'
496 case 'b':
497 s.nextch()
498 base, prefix = 2, 'b'
499 default:
500 base, prefix = 8, '0'
501 digsep = 1 // leading 0
502 }
503 }
504 digsep |= s.Digits(base, &invalid)
505 if s.ch == '.' {
506 if prefix == 'o' || prefix == 'b' {
507 s.Errorf("invalid radix point in %s literal", baseName(base))
508 ok = false
509 }
510 s.nextch()
511 seenPoint = true
512 }
513 }
514
515 // fractional part
516 if seenPoint {
517 kind = FloatLit
518 digsep |= s.Digits(base, &invalid)
519 }
520
521 if digsep&1 == 0 && ok {
522 s.Errorf("%s literal has no digits", baseName(base))
523 ok = false
524 }
525
526 // exponent
527 if e := Lower(s.ch); e == 'e' || e == 'p' {
528 if ok {
529 switch {
530 case e == 'e' && prefix != 0 && prefix != '0':
531 s.Errorf("%q exponent requires decimal mantissa", s.ch)
532 ok = false
533 case e == 'p' && prefix != 'x':
534 s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
535 ok = false
536 }
537 }
538 s.nextch()
539 kind = FloatLit
540 if s.ch == '+' || s.ch == '-' {
541 s.nextch()
542 }
543 digsep = s.Digits(10, nil) | digsep&2 // don't lose sep bit
544 if digsep&1 == 0 && ok {
545 s.Errorf("exponent has no digits")
546 ok = false
547 }
548 } else if prefix == 'x' && kind == FloatLit && ok {
549 s.Errorf("hexadecimal mantissa requires a 'p' exponent")
550 ok = false
551 }
552
553 // suffix 'i'
554 if s.ch == 'i' {
555 kind = ImagLit
556 s.nextch()
557 }
558
559 s.SetLit(kind, ok) // do this now so we can use s.lit below
560
561 if kind == IntLit && invalid >= 0 && ok {
562 s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
563 ok = false
564 }
565
566 if digsep&2 != 0 && ok {
567 if i := invalidSep(s.Lit); i >= 0 {
568 s.ErrorAtf(i, "'_' must separate successive digits")
569 ok = false
570 }
571 }
572
573 s.Bad = !ok // correct s.bad
574 }
575
576 func baseName(base int32) string {
577 switch base {
578 case 2:
579 return "binary"
580 case 8:
581 return "octal"
582 case 10:
583 return "decimal"
584 case 16:
585 return "hexadecimal"
586 }
587 panic("invalid base")
588 }
589
590 // invalidSep returns the index of the first invalid separator in x, or -1.
591 func invalidSep(x string) int32 {
592 x1 := ' ' // prefix char, we only care if it's 'x'
593 d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else)
594 i := int32(0)
595
596 // a prefix counts as a digit
597 if len(x) >= 2 && x[0] == '0' {
598 x1 = Lower(rune(x[1]))
599 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
600 d = '0'
601 i = 2
602 }
603 }
604
605 // mantissa and exponent
606 for ; i < int32(len(x)); i++ {
607 p := d // previous digit
608 d = rune(x[i])
609 switch {
610 case d == '_':
611 if p != '0' {
612 return i
613 }
614 case IsDecimal(d) || x1 == 'x' && IsHex(d):
615 d = '0'
616 default:
617 if p == '_' {
618 return i - 1
619 }
620 d = '.'
621 }
622 }
623 if d == '_' {
624 return int32(len(x)) - 1
625 }
626
627 return -1
628 }
629
630 func (s *Scanner) rune() {
631 ok := true
632 s.nextch()
633
634 n := 0
635 for ; ; n++ {
636 if s.ch == '\'' {
637 if ok {
638 if n == 0 {
639 s.Errorf("empty rune literal or unescaped '")
640 ok = false
641 } else if n != 1 {
642 s.ErrorAtf(0, "more than one character in rune literal")
643 ok = false
644 }
645 }
646 s.nextch()
647 break
648 }
649 if s.ch == '\\' {
650 s.nextch()
651 if !s.escape('\'') {
652 ok = false
653 }
654 continue
655 }
656 if s.ch == '\n' {
657 if ok {
658 s.Errorf("newline in rune literal")
659 ok = false
660 }
661 break
662 }
663 if s.ch < 0 {
664 if ok {
665 s.ErrorAtf(0, "rune literal not terminated")
666 ok = false
667 }
668 break
669 }
670 s.nextch()
671 }
672
673 s.SetLit(RuneLit, ok)
674 }
675
676 func (s *Scanner) stdString() {
677 ok := true
678 s.nextch()
679
680 for {
681 if s.ch == '"' {
682 s.nextch()
683 break
684 }
685 if s.ch == '\\' {
686 s.nextch()
687 if !s.escape('"') {
688 ok = false
689 }
690 continue
691 }
692 if s.ch == '\n' {
693 s.Errorf("newline in string")
694 ok = false
695 break
696 }
697 if s.ch < 0 {
698 s.ErrorAtf(0, "string not terminated")
699 ok = false
700 break
701 }
702 s.nextch()
703 }
704
705 s.SetLit(StringLit, ok)
706 }
707
708 func (s *Scanner) rawString() {
709 ok := true
710 s.nextch()
711
712 for {
713 if s.ch == '`' {
714 s.nextch()
715 break
716 }
717 if s.ch < 0 {
718 s.ErrorAtf(0, "string not terminated")
719 ok = false
720 break
721 }
722 s.nextch()
723 }
724 // We leave CRs in the string since they are part of the
725 // literal (even though they are not part of the literal
726 // value).
727
728 s.SetLit(StringLit, ok)
729 }
730
731 func (s *Scanner) comment(text string) {
732 s.ErrorAtf(0, "%s", text)
733 }
734
735 func (s *Scanner) skipLine() {
736 // don't consume '\n' - needed for nlsemi logic
737 for s.ch >= 0 && s.ch != '\n' {
738 s.nextch()
739 }
740 }
741
742 func (s *Scanner) lineComment() {
743 // opening has already been consumed
744
745 if s.mode&comments != 0 {
746 s.skipLine()
747 s.comment(string(s.segment()))
748 return
749 }
750
751 // are we saving directives? or is this definitely not a directive?
752 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
753 s.stop()
754 s.skipLine()
755 return
756 }
757
758 // recognize go: or line directives
759 prefix := "go:"
760 if s.ch == 'l' {
761 prefix = "line "
762 }
763
764 for _, r := range prefix {
765 if s.ch != r { s.stop(); s.skipLine(); return }
766 s.nextch()
767 }
768 // directive text
769 s.skipLine()
770 s.comment(string(s.segment()))
771 }
772
773 func (s *Scanner) skipComment() bool {
774 for s.ch >= 0 {
775 for s.ch == '*' {
776 s.nextch()
777 if s.ch == '/' {
778 s.nextch()
779 return true
780 }
781 }
782 s.nextch()
783 }
784 s.ErrorAtf(0, "comment not terminated")
785 return false
786 }
787
788 func (s *Scanner) fullComment() {
789 /* opening has already been consumed */
790
791 if s.mode&comments != 0 {
792 if s.skipComment() {
793 s.comment(string(s.segment()))
794 }
795 return
796 }
797
798 if s.mode&directives == 0 || s.ch != 'l' {
799 s.stop()
800 s.skipComment()
801 return
802 }
803
804 // recognize line directive
805 const prefix = "line "
806
807 for _, r := range prefix {
808 if s.ch != r { s.stop(); s.skipComment(); return }
809 s.nextch()
810 }
811 // directive text
812 if s.skipComment() {
813 s.comment(string(s.segment()))
814 }
815 }
816
817 func (s *Scanner) escape(quote rune) bool {
818 var n int32
819 var base, max uint32
820
821 switch s.ch {
822 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
823 s.nextch()
824 return true
825 case '0', '1', '2', '3', '4', '5', '6', '7':
826 n, base, max = 3, 8, 255
827 case 'x':
828 s.nextch()
829 n, base, max = 2, 16, 255
830 case 'u':
831 s.nextch()
832 n, base, max = 4, 16, unicode.MaxRune
833 case 'U':
834 s.nextch()
835 n, base, max = 8, 16, unicode.MaxRune
836 default:
837 if s.ch < 0 {
838 return true // complain in caller about EOF
839 }
840 s.Errorf("unknown escape")
841 return false
842 }
843
844 var x uint32
845 for i := n; i > 0; i-- {
846 if s.ch < 0 {
847 return true // complain in caller about EOF
848 }
849 d := base
850 if IsDecimal(s.ch) {
851 d = uint32(s.ch) - '0'
852 } else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
853 d = uint32(Lower(s.ch)) - 'a' + 10
854 }
855 if d >= base {
856 s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
857 return false
858 }
859 // d < base
860 x = x*base + d
861 s.nextch()
862 }
863
864 if x > max && base == 8 {
865 s.Errorf("octal escape value %d > 255", x)
866 return false
867 }
868
869 if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
870 s.Errorf("escape is invalid Unicode code point %#U", x)
871 return false
872 }
873
874 return true
875 }
876