scanner.mx raw
1 package main
2
3 import (
4 "fmt"
5 "io"
6 "unicode"
7 "unicode/utf8"
8 )
9
// Scanner mode bits. They are OR-ed together into the mode passed to Init.
const (
	// comments: when set, lineComment and fullComment report every comment.
	comments uint = 1 << 0
	// directives: when set (and comments is not), only comments that start
	// with "go:" or "line " are reported.
	directives uint = 1 << 1
)
14
// Scanner turns the characters delivered by the embedded Source into tokens.
// Each call to Next describes one token through the exported fields below.
type Scanner struct {
	Source
	mode   uint // bit set of comments / directives, stored by Init
	nlsemi bool // if set, Next turns the next '\n' or EOF into a Semi token

	// Current token, valid after calling Next.
	Line, Col uint32   // set by Next from s.pos() once white space has been skipped
	Blank     bool     // set by Next; presumably "only blank space precedes the token on its line" - TODO confirm
	Tok       Token    // kind of the current token
	Lit       string   // text for NameType and Literal; "semicolon", "newline", or "EOF" for Semi
	Bad       bool     // for Literal: set when an error was reported for the literal
	Kind      LitKind  // for Literal: the literal kind passed to SetLit
	Op        Operator // set together with Prec for operator tokens
	Prec      int32    // precedence paired with Op

	keywordMap    [1 << 6]Token // keyword lookup table indexed by the hash computed in initKeywords / Ident
	keywordsReady bool          // set once initKeywords has filled keywordMap
}
32
33 func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
34 s.Source.init(src, errh)
35 s.mode = mode
36 s.nlsemi = false
37 s.initKeywords()
38 }
39
40 func (s *Scanner) Errorf(format string, args ...interface{}) {
41 s.error(fmt.Sprintf(format, args...))
42 }
43
44 func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
45 s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
46 }
47
48 func (s *Scanner) SetLit(kind LitKind, ok bool) {
49 s.nlsemi = true
50 s.Tok = Literal
51 s.Lit = string(s.segmentCopy())
52 s.Bad = !ok
53 s.Kind = kind
54 }
55
56 func (s *Scanner) Next() {
57 nlsemi := s.nlsemi
58 s.nlsemi = false
59
60 redo:
61 s.stop()
62 startLine, startCol := s.pos()
63 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
64 s.nextch()
65 }
66
67 s.Line, s.Col = s.pos()
68 s.Blank = s.line > startLine || startCol == Colbase
69 s.start()
70 if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
71 s.nextch()
72 s.Ident()
73 return
74 }
75
76 switch s.ch {
77 case -1:
78 if nlsemi {
79 s.Lit = "EOF"
80 s.Tok = Semi
81 break
82 }
83 s.Tok = EOF
84
85 case '\n':
86 s.nextch()
87 s.Lit = "newline"
88 s.Tok = Semi
89
90 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
91 s.Number(false)
92
93 case '"':
94 s.stdString()
95
96 case '`':
97 s.rawString()
98
99 case '\'':
100 s.rune()
101
102 case '(':
103 s.nextch()
104 s.Tok = Lparen
105
106 case '[':
107 s.nextch()
108 s.Tok = Lbrack
109
110 case '{':
111 s.nextch()
112 s.Tok = Lbrace
113
114 case ',':
115 s.nextch()
116 s.Tok = Comma
117
118 case ';':
119 s.nextch()
120 s.Lit = "semicolon"
121 s.Tok = Semi
122
123 case ')':
124 s.nextch()
125 s.nlsemi = true
126 s.Tok = Rparen
127
128 case ']':
129 s.nextch()
130 s.nlsemi = true
131 s.Tok = Rbrack
132
133 case '}':
134 s.nextch()
135 s.nlsemi = true
136 s.Tok = Rbrace
137
138 case ':':
139 s.nextch()
140 if s.ch == '=' {
141 s.nextch()
142 s.Tok = Define
143 break
144 }
145 s.Tok = Colon
146
147 case '.':
148 s.nextch()
149 if IsDecimal(s.ch) {
150 s.Number(true)
151 break
152 }
153 if s.ch == '.' {
154 s.nextch()
155 if s.ch == '.' {
156 s.nextch()
157 s.Tok = DotDotDot
158 break
159 }
160 s.rewind()
161 s.nextch()
162 }
163 s.Tok = Dot
164
165 case '+':
166 s.nextch()
167 s.Op, s.Prec = Add, PrecAdd
168 if s.ch != '+' {
169 goto assignop
170 }
171 s.nextch()
172 s.nlsemi = true
173 s.Tok = IncOp
174
175 case '-':
176 s.nextch()
177 s.Op, s.Prec = Sub, PrecAdd
178 if s.ch != '-' {
179 goto assignop
180 }
181 s.nextch()
182 s.nlsemi = true
183 s.Tok = IncOp
184
185 case '*':
186 s.nextch()
187 s.Op, s.Prec = Mul, PrecMul
188 if s.ch == '=' {
189 s.nextch()
190 s.Tok = AssignOp
191 break
192 }
193 s.Tok = Star
194
195 case '/':
196 s.nextch()
197 if s.ch == '/' {
198 s.nextch()
199 s.lineComment()
200 goto redo
201 }
202 if s.ch == '*' {
203 s.nextch()
204 s.fullComment()
205 if line, _ := s.pos(); line > s.Line && nlsemi {
206 s.Lit = "newline"
207 s.Tok = Semi
208 break
209 }
210 goto redo
211 }
212 s.Op, s.Prec = Div, PrecMul
213 goto assignop
214
215 case '%':
216 s.nextch()
217 s.Op, s.Prec = Rem, PrecMul
218 goto assignop
219
220 case '&':
221 s.nextch()
222 if s.ch == '&' {
223 s.nextch()
224 s.Op, s.Prec = AndAnd, PrecAndAnd
225 s.Tok = OperatorType
226 break
227 }
228 s.Op, s.Prec = And, PrecMul
229 if s.ch == '^' {
230 s.nextch()
231 s.Op = AndNot
232 }
233 goto assignop
234
235 case '|':
236 s.nextch()
237 if s.ch == '|' {
238 s.nextch()
239 s.Op, s.Prec = OrOr, PrecOrOr
240 s.Tok = OperatorType
241 break
242 }
243 s.Op, s.Prec = Or, PrecAdd
244 goto assignop
245
246 case '^':
247 s.nextch()
248 s.Op, s.Prec = Xor, PrecAdd
249 goto assignop
250
251 case '<':
252 s.nextch()
253 if s.ch == '=' {
254 s.nextch()
255 s.Op, s.Prec = Leq, PrecCmp
256 s.Tok = OperatorType
257 break
258 }
259 if s.ch == '<' {
260 s.nextch()
261 s.Op, s.Prec = Shl, PrecMul
262 goto assignop
263 }
264 if s.ch == '-' {
265 s.nextch()
266 s.Tok = Arrow
267 break
268 }
269 s.Op, s.Prec = Lss, PrecCmp
270 s.Tok = OperatorType
271
272 case '>':
273 s.nextch()
274 if s.ch == '=' {
275 s.nextch()
276 s.Op, s.Prec = Geq, PrecCmp
277 s.Tok = OperatorType
278 break
279 }
280 if s.ch == '>' {
281 s.nextch()
282 s.Op, s.Prec = Shr, PrecMul
283 goto assignop
284 }
285 s.Op, s.Prec = Gtr, PrecCmp
286 s.Tok = OperatorType
287
288 case '=':
289 s.nextch()
290 if s.ch == '=' {
291 s.nextch()
292 s.Op, s.Prec = Eql, PrecCmp
293 s.Tok = OperatorType
294 break
295 }
296 s.Tok = Assign
297
298 case '!':
299 s.nextch()
300 if s.ch == '=' {
301 s.nextch()
302 s.Op, s.Prec = Neq, PrecCmp
303 s.Tok = OperatorType
304 break
305 }
306 s.Op, s.Prec = Not, 0
307 s.Tok = OperatorType
308
309 case '~':
310 s.nextch()
311 s.Op, s.Prec = Tilde, 0
312 s.Tok = OperatorType
313
314 default:
315 s.Errorf("invalid character %#U", s.ch)
316 s.nextch()
317 goto redo
318 }
319
320 return
321
322 assignop:
323 if s.ch == '=' {
324 s.nextch()
325 s.Tok = AssignOp
326 return
327 }
328 s.Tok = OperatorType
329 }
330
331 func (s *Scanner) Ident() {
332 for IsLetter(s.ch) || IsDecimal(s.ch) {
333 s.nextch()
334 }
335
336 if s.ch >= utf8.RuneSelf {
337 for s.AtIdentChar(false) {
338 s.nextch()
339 }
340 }
341
342 lit := s.segment()
343 if len(lit) >= 2 {
344 h := (uint(lit[0])<<4 ^ uint(lit[1]) + uint(len(lit))) & 63
345 if tok := s.keywordMap[h]; tok != 0 && tokStrFast(tok) == string(lit) {
346 s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
347 s.Tok = tok
348 return
349 }
350 }
351
352 s.nlsemi = true
353 c := []byte{:len(lit)}
354 copy(c, lit)
355 s.Lit = string(c)
356 s.Tok = NameType
357 }
358
359 func tokStrFast(tok Token) string {
360 idx := [48]uint8{0, 3, 7, 14, 16, 19, 23, 24, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 42, 47, 51, 55, 60, 68, 75, 80, 84, 95, 98, 102, 104, 108, 110, 116, 125, 128, 135, 140, 146, 152, 158, 164, 168, 171, 171}
361 return token_name[idx[tok-1]:idx[tok]]
362 }
363
364 func (s *Scanner) AtIdentChar(first bool) bool {
365 switch {
366 case unicode.IsLetter(s.ch) || s.ch == '_':
367 case unicode.IsDigit(s.ch):
368 if first {
369 s.Errorf("identifier cannot begin with digit %#U", s.ch)
370 }
371 case s.ch >= utf8.RuneSelf:
372 s.Errorf("invalid character %#U in identifier", s.ch)
373 default:
374 return false
375 }
376 return true
377 }
378
379 func (s *Scanner) initKeywords() {
380 if s.keywordsReady {
381 return
382 }
383 s.keywordsReady = true
384 for tok := Break; tok <= Var; tok++ {
385 b := []byte(tok.String())
386 h := (uint(b[0])<<4 ^ uint(b[1]) + uint(len(b))) & 63
387 if s.keywordMap[h] != 0 {
388 panic("imperfect hash")
389 }
390 s.keywordMap[h] = tok
391 }
392 }
393
// Lower sets the ASCII case bit (0x20) of ch. For ASCII letters this yields
// the lower-case letter; other values are returned with that bit set.
func Lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// IsLetter reports whether ch is an ASCII letter or an underscore.
func IsLetter(ch rune) bool {
	if ch == '_' {
		return true
	}
	lc := Lower(ch)
	return lc >= 'a' && lc <= 'z'
}

// IsDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func IsDecimal(ch rune) bool {
	if ch < '0' || ch > '9' {
		return false
	}
	return true
}

// IsHex reports whether ch is an ASCII hexadecimal digit (0-9, a-f, A-F).
func IsHex(ch rune) bool {
	if ch >= '0' && ch <= '9' {
		return true
	}
	lc := Lower(ch)
	return lc >= 'a' && lc <= 'f'
}
398
399 func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
400 if base <= 10 {
401 max := rune('0' + base)
402 for IsDecimal(s.ch) || s.ch == '_' {
403 ds := int32(1)
404 if s.ch == '_' {
405 ds = 2
406 } else if s.ch >= max && *invalid < 0 {
407 _, col := s.pos()
408 *invalid = int32(col - s.col)
409 }
410 digsep |= ds
411 s.nextch()
412 }
413 } else {
414 for IsHex(s.ch) || s.ch == '_' {
415 ds := int32(1)
416 if s.ch == '_' {
417 ds = 2
418 }
419 digsep |= ds
420 s.nextch()
421 }
422 }
423 return
424 }
425
// Number scans a numeric literal and records it through SetLit as an IntLit,
// FloatLit or ImagLit. seenPoint is true when the caller has already consumed
// a leading '.', in which case scanning starts at the fractional digits.
// Only the first problem found is reported; s.Bad is set if there was one.
func (s *Scanner) Number(seenPoint bool) {
	ok := true
	kind := IntLit
	base := int32(10)      // number base
	prefix := rune(0)      // one of 0 (no prefix), '0' (leading zero), 'x', 'o', or 'b'
	digsep := int32(0)     // bit 0: digit present, bit 1: '_' present
	invalid := int32(-1)   // offset of an invalid digit recorded by Digits, or < 0

	// integer part
	if !seenPoint {
		if s.ch == '0' {
			s.nextch()
			switch Lower(s.ch) {
			case 'x':
				s.nextch()
				base, prefix = 16, 'x'
			case 'o':
				s.nextch()
				base, prefix = 8, 'o'
			case 'b':
				s.nextch()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // the leading 0 counts as a digit
			}
		}
		digsep |= s.Digits(base, &invalid)
		if s.ch == '.' {
			if prefix == 'o' || prefix == 'b' {
				s.Errorf("invalid radix point in %s literal", baseName(base))
				ok = false
			}
			s.nextch()
			seenPoint = true
		}
	}

	// fractional part
	if seenPoint {
		kind = FloatLit
		digsep |= s.Digits(base, &invalid)
	}

	if digsep&1 == 0 && ok {
		s.Errorf("%s literal has no digits", baseName(base))
		ok = false
	}

	// exponent
	if e := Lower(s.ch); e == 'e' || e == 'p' {
		if ok {
			switch {
			case e == 'e' && prefix != 0 && prefix != '0':
				s.Errorf("%q exponent requires decimal mantissa", s.ch)
				ok = false
			case e == 'p' && prefix != 'x':
				s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
				ok = false
			}
		}
		s.nextch()
		kind = FloatLit
		if s.ch == '+' || s.ch == '-' {
			s.nextch()
		}
		// Restart the digit bit for the exponent but keep the separator bit.
		// With base 10 Digits never records an invalid digit, so nil is passed.
		digsep = s.Digits(10, nil) | digsep&2
		if digsep&1 == 0 && ok {
			s.Errorf("exponent has no digits")
			ok = false
		}
	} else if prefix == 'x' && kind == FloatLit && ok {
		s.Errorf("hexadecimal mantissa requires a 'p' exponent")
		ok = false
	}

	// suffix 'i'
	if s.ch == 'i' {
		kind = ImagLit
		s.nextch()
	}

	s.SetLit(kind, ok) // done now because s.Lit is used below

	if kind == IntLit && invalid >= 0 && ok {
		s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
		ok = false
	}

	if digsep&2 != 0 && ok {
		if i := invalidSep(s.Lit); i >= 0 {
			s.ErrorAtf(i, "'_' must separate successive digits")
			ok = false
		}
	}

	s.Bad = !ok // SetLit ran before the last two checks, so correct s.Bad
}
520
// baseName returns the English name of a number base (2, 8, 10 or 16).
// It panics with "invalid base" for any other value.
func baseName(base int32) string {
	if base == 2 {
		return "binary"
	}
	if base == 8 {
		return "octal"
	}
	if base == 10 {
		return "decimal"
	}
	if base == 16 {
		return "hexadecimal"
	}
	panic("invalid base")
}
534
535 func invalidSep(x string) int32 {
536 x1 := ' '
537 d := '.'
538 i := int32(0)
539
540 if len(x) >= 2 && x[0] == '0' {
541 x1 = Lower(rune(x[1]))
542 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
543 d = '0'
544 i = 2
545 }
546 }
547
548 for ; i < int32(len(x)); i++ {
549 p := d
550 d = rune(x[i])
551 switch {
552 case d == '_':
553 if p != '0' {
554 return i
555 }
556 case IsDecimal(d) || x1 == 'x' && IsHex(d):
557 d = '0'
558 default:
559 if p == '_' {
560 return i - 1
561 }
562 d = '.'
563 }
564 }
565 if d == '_' {
566 return int32(len(x)) - 1
567 }
568
569 return -1
570 }
571
572 func (s *Scanner) rune() {
573 ok := true
574 s.nextch()
575
576 n := 0
577 for ; ; n++ {
578 if s.ch == '\'' {
579 if ok {
580 if n == 0 {
581 s.Errorf("empty rune literal or unescaped '")
582 ok = false
583 } else if n != 1 {
584 s.ErrorAtf(0, "more than one character in rune literal")
585 ok = false
586 }
587 }
588 s.nextch()
589 break
590 }
591 if s.ch == '\\' {
592 s.nextch()
593 if !s.escape('\'') {
594 ok = false
595 }
596 continue
597 }
598 if s.ch == '\n' {
599 if ok {
600 s.Errorf("newline in rune literal")
601 ok = false
602 }
603 break
604 }
605 if s.ch < 0 {
606 if ok {
607 s.ErrorAtf(0, "rune literal not terminated")
608 ok = false
609 }
610 break
611 }
612 s.nextch()
613 }
614
615 s.SetLit(RuneLit, ok)
616 }
617
618 func (s *Scanner) stdString() {
619 ok := true
620 s.nextch()
621
622 for {
623 if s.ch == '"' {
624 s.nextch()
625 break
626 }
627 if s.ch == '\\' {
628 s.nextch()
629 if !s.escape('"') {
630 ok = false
631 }
632 continue
633 }
634 if s.ch == '\n' {
635 s.Errorf("newline in string")
636 ok = false
637 break
638 }
639 if s.ch < 0 {
640 s.ErrorAtf(0, "string not terminated")
641 ok = false
642 break
643 }
644 s.nextch()
645 }
646
647 s.SetLit(StringLit, ok)
648 }
649
650 func (s *Scanner) rawString() {
651 ok := true
652 s.nextch()
653
654 for {
655 if s.ch == '`' {
656 s.nextch()
657 break
658 }
659 if s.ch < 0 {
660 s.ErrorAtf(0, "string not terminated")
661 ok = false
662 break
663 }
664 s.nextch()
665 }
666
667 s.SetLit(StringLit, ok)
668 }
669
670 func (s *Scanner) comment(text string) {
671 s.ErrorAtf(0, "%s", text)
672 }
673
674 func (s *Scanner) skipLine() {
675 for s.ch >= 0 && s.ch != '\n' {
676 s.nextch()
677 }
678 }
679
680 func (s *Scanner) lineComment() {
681 if s.mode&comments != 0 {
682 s.skipLine()
683 s.comment(string(s.segment()))
684 return
685 }
686
687 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
688 s.stop()
689 s.skipLine()
690 return
691 }
692
693 prefix := "go:"
694 if s.ch == 'l' {
695 prefix = "line "
696 }
697
698 for _, r := range prefix {
699 if s.ch != rune(r) {
700 s.stop()
701 s.skipLine()
702 return
703 }
704 s.nextch()
705 }
706 s.skipLine()
707 s.comment(string(s.segment()))
708 }
709
710 func (s *Scanner) skipComment() bool {
711 for s.ch >= 0 {
712 for s.ch == '*' {
713 s.nextch()
714 if s.ch == '/' {
715 s.nextch()
716 return true
717 }
718 }
719 s.nextch()
720 }
721 s.ErrorAtf(0, "comment not terminated")
722 return false
723 }
724
725 func (s *Scanner) fullComment() {
726 if s.mode&comments != 0 {
727 if s.skipComment() {
728 s.comment(string(s.segment()))
729 }
730 return
731 }
732
733 if s.mode&directives == 0 || s.ch != 'l' {
734 s.stop()
735 s.skipComment()
736 return
737 }
738
739 const prefix = "line "
740
741 for _, r := range prefix {
742 if s.ch != rune(r) {
743 s.stop()
744 s.skipComment()
745 return
746 }
747 s.nextch()
748 }
749 if s.skipComment() {
750 s.comment(string(s.segment()))
751 }
752 }
753
754 func (s *Scanner) escape(quote rune) bool {
755 var n int32
756 var base, max uint32
757
758 switch s.ch {
759 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
760 s.nextch()
761 return true
762 case '0', '1', '2', '3', '4', '5', '6', '7':
763 n, base, max = 3, 8, 255
764 case 'x':
765 s.nextch()
766 n, base, max = 2, 16, 255
767 case 'u':
768 s.nextch()
769 n, base, max = 4, 16, unicode.MaxRune
770 case 'U':
771 s.nextch()
772 n, base, max = 8, 16, unicode.MaxRune
773 default:
774 if s.ch < 0 {
775 return true
776 }
777 s.Errorf("unknown escape")
778 return false
779 }
780
781 var x uint32
782 for i := n; i > 0; i-- {
783 if s.ch < 0 {
784 return true
785 }
786 d := base
787 if IsDecimal(s.ch) {
788 d = uint32(s.ch) - '0'
789 } else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
790 d = uint32(Lower(s.ch)) - 'a' + 10
791 }
792 if d >= base {
793 s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
794 return false
795 }
796 x = x*base + d
797 s.nextch()
798 }
799
800 if x > max && base == 8 {
801 s.Errorf("octal escape value %d > 255", x)
802 return false
803 }
804
805 if x > max || 0xD800 <= x && x < 0xE000 {
806 s.Errorf("escape is invalid Unicode code point %#U", x)
807 return false
808 }
809
810 return true
811 }
812
813 func String(n Node) string {
814 return fmt.Sprintf("%T", n)
815 }
816
817 func StartPos(n Node) Pos {
818 return n.Pos()
819 }
820