scanner.mx raw
1 package main
2
3 import (
4 "fmt"
5 "io"
6 "unicode"
7 "unicode/utf8"
8 )
9
// Scanner mode flags. They are bits of the mode value handed to Init and are
// tested by lineComment and fullComment.
const (
	// comments: every comment is passed to the comment callback.
	comments uint = 1 << 0
	// directives: consulted only when comments is not set; comments that
	// begin with a recognized directive prefix are passed to the callback.
	directives uint = 1 << 1
)
14
// Scanner splits the characters delivered by the embedded Source into tokens.
// Every call to Next describes one token through the exported fields below.
type Scanner struct {
	Source        // embedded character source (provides ch, nextch, pos, start, stop, segment, ...)
	mode   uint   // bit set of comments and directives, as passed to Init
	nlsemi bool   // if set, Next returns a '\n' or the end of input as a Semi token

	// Description of the current token; written by Next and its helpers.
	Line, Col uint32   // position returned by pos() at the token's first character
	Blank     bool     // set by Next from the positions before and after skipping white space
	Tok       Token    // kind of the current token
	Lit       string   // literal or name text; "semicolon", "newline" or "EOF" for Semi
	Bad       bool     // for literals: set when the literal was reported as malformed
	Kind      LitKind  // for literals: the literal kind passed to SetLit
	Op        Operator // for operator tokens: the operator
	Prec      int32    // for operator tokens: the operator precedence
}
29
30 func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
31 s.Source.init(src, errh)
32 s.mode = mode
33 s.nlsemi = false
34 }
35
36 func (s *Scanner) Errorf(format string, args ...interface{}) {
37 s.error(fmt.Sprintf(format, args...))
38 }
39
40 func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
41 s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
42 }
43
44 func (s *Scanner) SetLit(kind LitKind, ok bool) {
45 s.nlsemi = true
46 s.Tok = Literal
47 s.Lit = string(s.segmentCopy())
48 s.Bad = !ok
49 s.Kind = kind
50 }
51
52 func (s *Scanner) Next() {
53 nlsemi := s.nlsemi
54 s.nlsemi = false
55
56 redo:
57 s.stop()
58 startLine, startCol := s.pos()
59 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
60 s.nextch()
61 }
62
63 s.Line, s.Col = s.pos()
64 s.Blank = s.line > startLine || startCol == Colbase
65 s.start()
66 if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
67 s.nextch()
68 s.Ident()
69 return
70 }
71
72 switch s.ch {
73 case -1:
74 if nlsemi {
75 s.Lit = "EOF"
76 s.Tok = Semi
77 break
78 }
79 s.Tok = EOF
80
81 case '\n':
82 s.nextch()
83 s.Lit = "newline"
84 s.Tok = Semi
85
86 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
87 s.Number(false)
88
89 case '"':
90 s.stdString()
91
92 case '`':
93 s.rawString()
94
95 case '\'':
96 s.rune()
97
98 case '(':
99 s.nextch()
100 s.Tok = Lparen
101
102 case '[':
103 s.nextch()
104 s.Tok = Lbrack
105
106 case '{':
107 s.nextch()
108 s.Tok = Lbrace
109
110 case ',':
111 s.nextch()
112 s.Tok = Comma
113
114 case ';':
115 s.nextch()
116 s.Lit = "semicolon"
117 s.Tok = Semi
118
119 case ')':
120 s.nextch()
121 s.nlsemi = true
122 s.Tok = Rparen
123
124 case ']':
125 s.nextch()
126 s.nlsemi = true
127 s.Tok = Rbrack
128
129 case '}':
130 s.nextch()
131 s.nlsemi = true
132 s.Tok = Rbrace
133
134 case ':':
135 s.nextch()
136 if s.ch == '=' {
137 s.nextch()
138 s.Tok = Define
139 break
140 }
141 s.Tok = Colon
142
143 case '.':
144 s.nextch()
145 if IsDecimal(s.ch) {
146 s.Number(true)
147 break
148 }
149 if s.ch == '.' {
150 s.nextch()
151 if s.ch == '.' {
152 s.nextch()
153 s.Tok = DotDotDot
154 break
155 }
156 s.rewind()
157 s.nextch()
158 }
159 s.Tok = Dot
160
161 case '+':
162 s.nextch()
163 s.Op, s.Prec = Add, PrecAdd
164 if s.ch != '+' {
165 goto assignop
166 }
167 s.nextch()
168 s.nlsemi = true
169 s.Tok = IncOp
170
171 case '-':
172 s.nextch()
173 s.Op, s.Prec = Sub, PrecAdd
174 if s.ch != '-' {
175 goto assignop
176 }
177 s.nextch()
178 s.nlsemi = true
179 s.Tok = IncOp
180
181 case '*':
182 s.nextch()
183 s.Op, s.Prec = Mul, PrecMul
184 if s.ch == '=' {
185 s.nextch()
186 s.Tok = AssignOp
187 break
188 }
189 s.Tok = Star
190
191 case '/':
192 s.nextch()
193 if s.ch == '/' {
194 s.nextch()
195 s.lineComment()
196 goto redo
197 }
198 if s.ch == '*' {
199 s.nextch()
200 s.fullComment()
201 if line, _ := s.pos(); line > s.Line && nlsemi {
202 s.Lit = "newline"
203 s.Tok = Semi
204 break
205 }
206 goto redo
207 }
208 s.Op, s.Prec = Div, PrecMul
209 goto assignop
210
211 case '%':
212 s.nextch()
213 s.Op, s.Prec = Rem, PrecMul
214 goto assignop
215
216 case '&':
217 s.nextch()
218 if s.ch == '&' {
219 s.nextch()
220 s.Op, s.Prec = AndAnd, PrecAndAnd
221 s.Tok = OperatorType
222 break
223 }
224 s.Op, s.Prec = And, PrecMul
225 if s.ch == '^' {
226 s.nextch()
227 s.Op = AndNot
228 }
229 goto assignop
230
231 case '|':
232 s.nextch()
233 if s.ch == '|' {
234 s.nextch()
235 s.Op, s.Prec = OrOr, PrecOrOr
236 s.Tok = OperatorType
237 break
238 }
239 s.Op, s.Prec = Or, PrecAdd
240 goto assignop
241
242 case '^':
243 s.nextch()
244 s.Op, s.Prec = Xor, PrecAdd
245 goto assignop
246
247 case '<':
248 s.nextch()
249 if s.ch == '=' {
250 s.nextch()
251 s.Op, s.Prec = Leq, PrecCmp
252 s.Tok = OperatorType
253 break
254 }
255 if s.ch == '<' {
256 s.nextch()
257 s.Op, s.Prec = Shl, PrecMul
258 goto assignop
259 }
260 if s.ch == '-' {
261 s.nextch()
262 s.Tok = Arrow
263 break
264 }
265 s.Op, s.Prec = Lss, PrecCmp
266 s.Tok = OperatorType
267
268 case '>':
269 s.nextch()
270 if s.ch == '=' {
271 s.nextch()
272 s.Op, s.Prec = Geq, PrecCmp
273 s.Tok = OperatorType
274 break
275 }
276 if s.ch == '>' {
277 s.nextch()
278 s.Op, s.Prec = Shr, PrecMul
279 goto assignop
280 }
281 s.Op, s.Prec = Gtr, PrecCmp
282 s.Tok = OperatorType
283
284 case '=':
285 s.nextch()
286 if s.ch == '=' {
287 s.nextch()
288 s.Op, s.Prec = Eql, PrecCmp
289 s.Tok = OperatorType
290 break
291 }
292 s.Tok = Assign
293
294 case '!':
295 s.nextch()
296 if s.ch == '=' {
297 s.nextch()
298 s.Op, s.Prec = Neq, PrecCmp
299 s.Tok = OperatorType
300 break
301 }
302 s.Op, s.Prec = Not, 0
303 s.Tok = OperatorType
304
305 case '~':
306 s.nextch()
307 s.Op, s.Prec = Tilde, 0
308 s.Tok = OperatorType
309
310 default:
311 s.Errorf("invalid character %#U", s.ch)
312 s.nextch()
313 goto redo
314 }
315
316 return
317
318 assignop:
319 if s.ch == '=' {
320 s.nextch()
321 s.Tok = AssignOp
322 return
323 }
324 s.Tok = OperatorType
325 }
326
// Ident scans the remainder of an identifier whose first character has
// already been consumed and sets Tok to the matching keyword token or to
// NameType (with Lit holding the identifier text).
func (s *Scanner) Ident() {
	// Fast path: ASCII letters, '_' and decimal digits.
	for IsLetter(s.ch) || IsDecimal(s.ch) {
		s.nextch()
	}

	// General case: the identifier continues with non-ASCII characters.
	if s.ch >= utf8.RuneSelf {
		for s.AtIdentChar(false) {
			s.nextch()
		}
	}

	// Possibly a keyword: every keyword has at least two characters, and
	// Hash reads the first two bytes.
	lit := s.segment()
	if len(lit) >= 2 {
		// The hash only selects a candidate; the spelling comparison decides.
		if tok := keywordMap[Hash(lit)]; tok != 0 && TokStrFast(tok) == string(lit) {
			// Only break, continue, fallthrough and return make a following
			// newline act as a semicolon.
			s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
			s.Tok = tok
			return
		}
	}

	s.nlsemi = true
	// Copy the segment bytes before converting them to a string.
	// NOTE(review): []byte{:len(lit)} is not standard Go; presumably it
	// allocates a byte slice of length len(lit) — confirm against the dialect.
	c := []byte{:len(lit)}
	copy(c, lit)
	s.Lit = string(c)
	s.Tok = NameType
}
353
354 func TokStrFast(tok Token) string {
355 return token_name[token_index[tok-1]:token_index[tok]]
356 }
357
358 func (s *Scanner) AtIdentChar(first bool) bool {
359 switch {
360 case unicode.IsLetter(s.ch) || s.ch == '_':
361 case unicode.IsDigit(s.ch):
362 if first {
363 s.Errorf("identifier cannot begin with digit %#U", s.ch)
364 }
365 case s.ch >= utf8.RuneSelf:
366 s.Errorf("invalid character %#U in identifier", s.ch)
367 default:
368 return false
369 }
370 return true
371 }
372
373 func Hash(s []byte) uint {
374 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
375 }
376
377 var keywordMap [1 << 6]Token
378
379 var keywordsInitialized bool
380
381 func InitKeywords() {
382 if keywordsInitialized {
383 return
384 }
385 keywordsInitialized = true
386 for tok := Break; tok <= Var; tok++ {
387 h := Hash([]byte(tok.String()))
388 if keywordMap[h] != 0 {
389 panic("imperfect hash")
390 }
391 keywordMap[h] = tok
392 }
393 }
394
// Lower returns ch with the bit 'a'-'A' set, which maps an ASCII upper-case
// letter to its lower-case form. Other characters may be changed as well, so
// the result is only meaningful in range comparisons like the ones below.
func Lower(ch rune) rune { return ch | ('a' - 'A') }

// IsLetter reports whether ch is an ASCII letter or '_'.
func IsLetter(ch rune) bool {
	if ch == '_' {
		return true
	}
	l := Lower(ch)
	return 'a' <= l && l <= 'z'
}

// IsDecimal reports whether ch is an ASCII decimal digit.
func IsDecimal(ch rune) bool { return ch >= '0' && ch <= '9' }

// IsHex reports whether ch is an ASCII hexadecimal digit of either case.
func IsHex(ch rune) bool {
	if IsDecimal(ch) {
		return true
	}
	l := Lower(ch)
	return 'a' <= l && l <= 'f'
}
399
400 func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
401 if base <= 10 {
402 max := rune('0' + base)
403 for IsDecimal(s.ch) || s.ch == '_' {
404 ds := int32(1)
405 if s.ch == '_' {
406 ds = 2
407 } else if s.ch >= max && *invalid < 0 {
408 _, col := s.pos()
409 *invalid = int32(col - s.col)
410 }
411 digsep |= ds
412 s.nextch()
413 }
414 } else {
415 for IsHex(s.ch) || s.ch == '_' {
416 ds := int32(1)
417 if s.ch == '_' {
418 ds = 2
419 }
420 digsep |= ds
421 s.nextch()
422 }
423 }
424 return
425 }
426
// Number scans an integer, floating-point or imaginary literal and records it
// with SetLit. seenPoint tells that the caller has already consumed a leading
// '.', so scanning starts with the fractional part. Malformed literals are
// reported (at most one error per literal) and marked through Bad.
func (s *Scanner) Number(seenPoint bool) {
	ok := true
	kind := IntLit
	base := int32(10)     // number base
	prefix := rune(0)     // one of 0 (decimal), '0' (leading-zero octal), 'x', 'o' or 'b'
	digsep := int32(0)    // bit 0: digit present, bit 1: '_' present
	invalid := int32(-1)  // index of an invalid digit in the literal, or < 0

	// Integer part.
	if !seenPoint {
		if s.ch == '0' {
			s.nextch()
			switch Lower(s.ch) {
			case 'x':
				s.nextch()
				base, prefix = 16, 'x'
			case 'o':
				s.nextch()
				base, prefix = 8, 'o'
			case 'b':
				s.nextch()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // the leading 0 counts as a digit
			}
		}
		digsep |= s.Digits(base, &invalid)
		if s.ch == '.' {
			if prefix == 'o' || prefix == 'b' {
				s.Errorf("invalid radix point in %s literal", baseName(base))
				ok = false
			}
			s.nextch()
			seenPoint = true
		}
	}

	// Fractional part.
	if seenPoint {
		kind = FloatLit
		digsep |= s.Digits(base, &invalid)
	}

	if digsep&1 == 0 && ok {
		s.Errorf("%s literal has no digits", baseName(base))
		ok = false
	}

	// Exponent.
	if e := Lower(s.ch); e == 'e' || e == 'p' {
		if ok {
			switch {
			case e == 'e' && prefix != 0 && prefix != '0':
				s.Errorf("%q exponent requires decimal mantissa", s.ch)
				ok = false
			case e == 'p' && prefix != 'x':
				s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
				ok = false
			}
		}
		s.nextch()
		kind = FloatLit
		if s.ch == '+' || s.ch == '-' {
			s.nextch()
		}
		// Restart the digit bit for the exponent but keep the separator bit.
		digsep = s.Digits(10, nil) | digsep&2
		if digsep&1 == 0 && ok {
			s.Errorf("exponent has no digits")
			ok = false
		}
	} else if prefix == 'x' && kind == FloatLit && ok {
		s.Errorf("hexadecimal mantissa requires a 'p' exponent")
		ok = false
	}

	// Suffix 'i'.
	if s.ch == 'i' {
		kind = ImagLit
		s.nextch()
	}

	// Set the literal now so that s.Lit can be used by the checks below.
	s.SetLit(kind, ok)

	// An out-of-range digit is only an error in an integer literal.
	if kind == IntLit && invalid >= 0 && ok {
		s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
		ok = false
	}

	if digsep&2 != 0 && ok {
		if i := invalidSep(s.Lit); i >= 0 {
			s.ErrorAtf(i, "'_' must separate successive digits")
			ok = false
		}
	}

	// Correct s.Bad: ok may have changed after SetLit.
	s.Bad = !ok
}
521
// baseName returns the adjective used in error messages for a number base:
// "binary", "octal", "decimal" or "hexadecimal". It panics with
// "invalid base" for any base other than 2, 8, 10 or 16.
func baseName(base int32) string {
	var name string
	switch base {
	case 2:
		name = "binary"
	case 8:
		name = "octal"
	case 10:
		name = "decimal"
	case 16:
		name = "hexadecimal"
	default:
		panic("invalid base")
	}
	return name
}
535
536 func invalidSep(x string) int32 {
537 x1 := ' '
538 d := '.'
539 i := int32(0)
540
541 if len(x) >= 2 && x[0] == '0' {
542 x1 = Lower(rune(x[1]))
543 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
544 d = '0'
545 i = 2
546 }
547 }
548
549 for ; i < int32(len(x)); i++ {
550 p := d
551 d = rune(x[i])
552 switch {
553 case d == '_':
554 if p != '0' {
555 return i
556 }
557 case IsDecimal(d) || x1 == 'x' && IsHex(d):
558 d = '0'
559 default:
560 if p == '_' {
561 return i - 1
562 }
563 d = '.'
564 }
565 }
566 if d == '_' {
567 return int32(len(x)) - 1
568 }
569
570 return -1
571 }
572
573 func (s *Scanner) rune() {
574 ok := true
575 s.nextch()
576
577 n := 0
578 for ; ; n++ {
579 if s.ch == '\'' {
580 if ok {
581 if n == 0 {
582 s.Errorf("empty rune literal or unescaped '")
583 ok = false
584 } else if n != 1 {
585 s.ErrorAtf(0, "more than one character in rune literal")
586 ok = false
587 }
588 }
589 s.nextch()
590 break
591 }
592 if s.ch == '\\' {
593 s.nextch()
594 if !s.escape('\'') {
595 ok = false
596 }
597 continue
598 }
599 if s.ch == '\n' {
600 if ok {
601 s.Errorf("newline in rune literal")
602 ok = false
603 }
604 break
605 }
606 if s.ch < 0 {
607 if ok {
608 s.ErrorAtf(0, "rune literal not terminated")
609 ok = false
610 }
611 break
612 }
613 s.nextch()
614 }
615
616 s.SetLit(RuneLit, ok)
617 }
618
619 func (s *Scanner) stdString() {
620 ok := true
621 s.nextch()
622
623 for {
624 if s.ch == '"' {
625 s.nextch()
626 break
627 }
628 if s.ch == '\\' {
629 s.nextch()
630 if !s.escape('"') {
631 ok = false
632 }
633 continue
634 }
635 if s.ch == '\n' {
636 s.Errorf("newline in string")
637 ok = false
638 break
639 }
640 if s.ch < 0 {
641 s.ErrorAtf(0, "string not terminated")
642 ok = false
643 break
644 }
645 s.nextch()
646 }
647
648 s.SetLit(StringLit, ok)
649 }
650
651 func (s *Scanner) rawString() {
652 ok := true
653 s.nextch()
654
655 for {
656 if s.ch == '`' {
657 s.nextch()
658 break
659 }
660 if s.ch < 0 {
661 s.ErrorAtf(0, "string not terminated")
662 ok = false
663 break
664 }
665 s.nextch()
666 }
667
668 s.SetLit(StringLit, ok)
669 }
670
671 func (s *Scanner) comment(text string) {
672 s.ErrorAtf(0, "%s", text)
673 }
674
675 func (s *Scanner) skipLine() {
676 for s.ch >= 0 && s.ch != '\n' {
677 s.nextch()
678 }
679 }
680
681 func (s *Scanner) lineComment() {
682 if s.mode&comments != 0 {
683 s.skipLine()
684 s.comment(string(s.segment()))
685 return
686 }
687
688 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
689 s.stop()
690 s.skipLine()
691 return
692 }
693
694 prefix := "go:"
695 if s.ch == 'l' {
696 prefix = "line "
697 }
698
699 for _, r := range prefix {
700 if s.ch != rune(r) {
701 s.stop()
702 s.skipLine()
703 return
704 }
705 s.nextch()
706 }
707 s.skipLine()
708 s.comment(string(s.segment()))
709 }
710
711 func (s *Scanner) skipComment() bool {
712 for s.ch >= 0 {
713 for s.ch == '*' {
714 s.nextch()
715 if s.ch == '/' {
716 s.nextch()
717 return true
718 }
719 }
720 s.nextch()
721 }
722 s.ErrorAtf(0, "comment not terminated")
723 return false
724 }
725
726 func (s *Scanner) fullComment() {
727 if s.mode&comments != 0 {
728 if s.skipComment() {
729 s.comment(string(s.segment()))
730 }
731 return
732 }
733
734 if s.mode&directives == 0 || s.ch != 'l' {
735 s.stop()
736 s.skipComment()
737 return
738 }
739
740 const prefix = "line "
741
742 for _, r := range prefix {
743 if s.ch != rune(r) {
744 s.stop()
745 s.skipComment()
746 return
747 }
748 s.nextch()
749 }
750 if s.skipComment() {
751 s.comment(string(s.segment()))
752 }
753 }
754
755 func (s *Scanner) escape(quote rune) bool {
756 var n int32
757 var base, max uint32
758
759 switch s.ch {
760 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
761 s.nextch()
762 return true
763 case '0', '1', '2', '3', '4', '5', '6', '7':
764 n, base, max = 3, 8, 255
765 case 'x':
766 s.nextch()
767 n, base, max = 2, 16, 255
768 case 'u':
769 s.nextch()
770 n, base, max = 4, 16, unicode.MaxRune
771 case 'U':
772 s.nextch()
773 n, base, max = 8, 16, unicode.MaxRune
774 default:
775 if s.ch < 0 {
776 return true
777 }
778 s.Errorf("unknown escape")
779 return false
780 }
781
782 var x uint32
783 for i := n; i > 0; i-- {
784 if s.ch < 0 {
785 return true
786 }
787 d := base
788 if IsDecimal(s.ch) {
789 d = uint32(s.ch) - '0'
790 } else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
791 d = uint32(Lower(s.ch)) - 'a' + 10
792 }
793 if d >= base {
794 s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
795 return false
796 }
797 x = x*base + d
798 s.nextch()
799 }
800
801 if x > max && base == 8 {
802 s.Errorf("octal escape value %d > 255", x)
803 return false
804 }
805
806 if x > max || 0xD800 <= x && x < 0xE000 {
807 s.Errorf("escape is invalid Unicode code point %#U", x)
808 return false
809 }
810
811 return true
812 }
813
814 func String(n Node) string {
815 return fmt.Sprintf("%T", n)
816 }
817
818 func StartPos(n Node) Pos {
819 return n.Pos()
820 }
821