scanner.mx raw
1 package main
2
3 import (
4 "fmt"
5 "io"
6 "unicode"
7 "unicode/utf8"
8 )
9
// Scanner mode bits. A mode value is the bitwise OR of any of these flags.
const (
	// comments: every comment is handed to the comment callback path
	// (see lineComment and fullComment).
	comments uint = 1 << 0
	// directives: only comments starting with "go:" or "line " are reported;
	// consulted only when the comments bit is not set.
	directives uint = 1 << 1
)
14
15 type Scanner struct {
16 Source
17 mode uint
18 nlsemi bool
19
20 Line, Col uint32
21 Blank bool
22 Tok Token
23 Lit string
24 Bad bool
25 Kind LitKind
26 Op Operator
27 Prec int32
28 }
29
30 func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
31 s.Source.init(src, errh)
32 s.mode = mode
33 s.nlsemi = false
34 }
35
36 func (s *Scanner) Errorf(format string, args ...interface{}) {
37 s.error(fmt.Sprintf(format, args...))
38 }
39
40 func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
41 s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
42 }
43
44 func (s *Scanner) SetLit(kind LitKind, ok bool) {
45 s.nlsemi = true
46 s.Tok = Literal
47 s.Lit = string(s.segment())
48 s.Bad = !ok
49 s.Kind = kind
50 }
51
52 func (s *Scanner) Next() {
53 nlsemi := s.nlsemi
54 s.nlsemi = false
55
56 redo:
57 s.stop()
58 startLine, startCol := s.pos()
59 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
60 s.nextch()
61 }
62
63 s.Line, s.Col = s.pos()
64 s.Blank = s.line > startLine || startCol == Colbase
65 s.start()
66 if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
67 s.nextch()
68 s.Ident()
69 return
70 }
71
72 switch s.ch {
73 case -1:
74 if nlsemi {
75 s.Lit = "EOF"
76 s.Tok = Semi
77 break
78 }
79 s.Tok = EOF
80
81 case '\n':
82 s.nextch()
83 s.Lit = "newline"
84 s.Tok = Semi
85
86 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
87 s.Number(false)
88
89 case '"':
90 s.stdString()
91
92 case '`':
93 s.rawString()
94
95 case '\'':
96 s.rune()
97
98 case '(':
99 s.nextch()
100 s.Tok = Lparen
101
102 case '[':
103 s.nextch()
104 s.Tok = Lbrack
105
106 case '{':
107 s.nextch()
108 s.Tok = Lbrace
109
110 case ',':
111 s.nextch()
112 s.Tok = Comma
113
114 case ';':
115 s.nextch()
116 s.Lit = "semicolon"
117 s.Tok = Semi
118
119 case ')':
120 s.nextch()
121 s.nlsemi = true
122 s.Tok = Rparen
123
124 case ']':
125 s.nextch()
126 s.nlsemi = true
127 s.Tok = Rbrack
128
129 case '}':
130 s.nextch()
131 s.nlsemi = true
132 s.Tok = Rbrace
133
134 case ':':
135 s.nextch()
136 if s.ch == '=' {
137 s.nextch()
138 s.Tok = Define
139 break
140 }
141 s.Tok = Colon
142
143 case '.':
144 s.nextch()
145 if IsDecimal(s.ch) {
146 s.Number(true)
147 break
148 }
149 if s.ch == '.' {
150 s.nextch()
151 if s.ch == '.' {
152 s.nextch()
153 s.Tok = DotDotDot
154 break
155 }
156 s.rewind()
157 s.nextch()
158 }
159 s.Tok = Dot
160
161 case '+':
162 s.nextch()
163 s.Op, s.Prec = Add, PrecAdd
164 if s.ch != '+' {
165 goto assignop
166 }
167 s.nextch()
168 s.nlsemi = true
169 s.Tok = IncOp
170
171 case '-':
172 s.nextch()
173 s.Op, s.Prec = Sub, PrecAdd
174 if s.ch != '-' {
175 goto assignop
176 }
177 s.nextch()
178 s.nlsemi = true
179 s.Tok = IncOp
180
181 case '*':
182 s.nextch()
183 s.Op, s.Prec = Mul, PrecMul
184 if s.ch == '=' {
185 s.nextch()
186 s.Tok = AssignOp
187 break
188 }
189 s.Tok = Star
190
191 case '/':
192 s.nextch()
193 if s.ch == '/' {
194 s.nextch()
195 s.lineComment()
196 goto redo
197 }
198 if s.ch == '*' {
199 s.nextch()
200 s.fullComment()
201 if line, _ := s.pos(); line > s.Line && nlsemi {
202 s.Lit = "newline"
203 s.Tok = Semi
204 break
205 }
206 goto redo
207 }
208 s.Op, s.Prec = Div, PrecMul
209 goto assignop
210
211 case '%':
212 s.nextch()
213 s.Op, s.Prec = Rem, PrecMul
214 goto assignop
215
216 case '&':
217 s.nextch()
218 if s.ch == '&' {
219 s.nextch()
220 s.Op, s.Prec = AndAnd, PrecAndAnd
221 s.Tok = OperatorType
222 break
223 }
224 s.Op, s.Prec = And, PrecMul
225 if s.ch == '^' {
226 s.nextch()
227 s.Op = AndNot
228 }
229 goto assignop
230
231 case '|':
232 s.nextch()
233 if s.ch == '|' {
234 s.nextch()
235 s.Op, s.Prec = OrOr, PrecOrOr
236 s.Tok = OperatorType
237 break
238 }
239 s.Op, s.Prec = Or, PrecAdd
240 goto assignop
241
242 case '^':
243 s.nextch()
244 s.Op, s.Prec = Xor, PrecAdd
245 goto assignop
246
247 case '<':
248 s.nextch()
249 if s.ch == '=' {
250 s.nextch()
251 s.Op, s.Prec = Leq, PrecCmp
252 s.Tok = OperatorType
253 break
254 }
255 if s.ch == '<' {
256 s.nextch()
257 s.Op, s.Prec = Shl, PrecMul
258 goto assignop
259 }
260 if s.ch == '-' {
261 s.nextch()
262 s.Tok = Arrow
263 break
264 }
265 s.Op, s.Prec = Lss, PrecCmp
266 s.Tok = OperatorType
267
268 case '>':
269 s.nextch()
270 if s.ch == '=' {
271 s.nextch()
272 s.Op, s.Prec = Geq, PrecCmp
273 s.Tok = OperatorType
274 break
275 }
276 if s.ch == '>' {
277 s.nextch()
278 s.Op, s.Prec = Shr, PrecMul
279 goto assignop
280 }
281 s.Op, s.Prec = Gtr, PrecCmp
282 s.Tok = OperatorType
283
284 case '=':
285 s.nextch()
286 if s.ch == '=' {
287 s.nextch()
288 s.Op, s.Prec = Eql, PrecCmp
289 s.Tok = OperatorType
290 break
291 }
292 s.Tok = Assign
293
294 case '!':
295 s.nextch()
296 if s.ch == '=' {
297 s.nextch()
298 s.Op, s.Prec = Neq, PrecCmp
299 s.Tok = OperatorType
300 break
301 }
302 s.Op, s.Prec = Not, 0
303 s.Tok = OperatorType
304
305 case '~':
306 s.nextch()
307 s.Op, s.Prec = Tilde, 0
308 s.Tok = OperatorType
309
310 default:
311 s.Errorf("invalid character %#U", s.ch)
312 s.nextch()
313 goto redo
314 }
315
316 return
317
318 assignop:
319 if s.ch == '=' {
320 s.nextch()
321 s.Tok = AssignOp
322 return
323 }
324 s.Tok = OperatorType
325 }
326
327 func (s *Scanner) Ident() {
328 for IsLetter(s.ch) || IsDecimal(s.ch) {
329 s.nextch()
330 }
331
332 if s.ch >= utf8.RuneSelf {
333 for s.AtIdentChar(false) {
334 s.nextch()
335 }
336 }
337
338 lit := s.segment()
339 if len(lit) >= 2 {
340 if tok := keywordMap[Hash(lit)]; tok != 0 && TokStrFast(tok) == string(lit) {
341 s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
342 s.Tok = tok
343 return
344 }
345 }
346
347 s.nlsemi = true
348 s.Lit = string(lit)
349 s.Tok = NameType
350 }
351
352 func TokStrFast(tok Token) string {
353 return token_name[token_index[tok-1]:token_index[tok]]
354 }
355
356 func (s *Scanner) AtIdentChar(first bool) bool {
357 switch {
358 case unicode.IsLetter(s.ch) || s.ch == '_':
359 case unicode.IsDigit(s.ch):
360 if first {
361 s.Errorf("identifier cannot begin with digit %#U", s.ch)
362 }
363 case s.ch >= utf8.RuneSelf:
364 s.Errorf("invalid character %#U in identifier", s.ch)
365 default:
366 return false
367 }
368 return true
369 }
370
371 func Hash(s []byte) uint {
372 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
373 }
374
375 var keywordMap [1 << 6]Token
376
377 var keywordsInitialized bool
378
379 func InitKeywords() {
380 if keywordsInitialized {
381 return
382 }
383 keywordsInitialized = true
384 for tok := Break; tok <= Var; tok++ {
385 h := Hash([]byte(tok.String()))
386 if keywordMap[h] != 0 {
387 panic("imperfect hash")
388 }
389 keywordMap[h] = tok
390 }
391 }
392
// Lower returns ch with the ASCII case bit set. For ASCII letters this is the
// lower-case letter; other characters may be changed as well, so the result is
// only meaningful when compared against lower-case letter ranges.
func Lower(ch rune) rune { return ch | ('a' - 'A') }

// IsLetter reports whether ch is an ASCII letter or an underscore.
func IsLetter(ch rune) bool {
	if ch == '_' {
		return true
	}
	lc := Lower(ch)
	return 'a' <= lc && lc <= 'z'
}

// IsDecimal reports whether ch is an ASCII decimal digit.
func IsDecimal(ch rune) bool { return ch >= '0' && ch <= '9' }

// IsHex reports whether ch is an ASCII hexadecimal digit of either case.
func IsHex(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	lc := Lower(ch)
	return 'a' <= lc && lc <= 'f'
}
397
398 func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
399 if base <= 10 {
400 max := rune('0' + base)
401 for IsDecimal(s.ch) || s.ch == '_' {
402 ds := int32(1)
403 if s.ch == '_' {
404 ds = 2
405 } else if s.ch >= max && *invalid < 0 {
406 _, col := s.pos()
407 *invalid = int32(col - s.col)
408 }
409 digsep |= ds
410 s.nextch()
411 }
412 } else {
413 for IsHex(s.ch) || s.ch == '_' {
414 ds := int32(1)
415 if s.ch == '_' {
416 ds = 2
417 }
418 digsep |= ds
419 s.nextch()
420 }
421 }
422 return
423 }
424
425 func (s *Scanner) Number(seenPoint bool) {
426 ok := true
427 kind := IntLit
428 base := int32(10)
429 prefix := rune(0)
430 digsep := int32(0)
431 invalid := int32(-1)
432
433 if !seenPoint {
434 if s.ch == '0' {
435 s.nextch()
436 switch Lower(s.ch) {
437 case 'x':
438 s.nextch()
439 base, prefix = 16, 'x'
440 case 'o':
441 s.nextch()
442 base, prefix = 8, 'o'
443 case 'b':
444 s.nextch()
445 base, prefix = 2, 'b'
446 default:
447 base, prefix = 8, '0'
448 digsep = 1
449 }
450 }
451 digsep |= s.Digits(base, &invalid)
452 if s.ch == '.' {
453 if prefix == 'o' || prefix == 'b' {
454 s.Errorf("invalid radix point in %s literal", baseName(base))
455 ok = false
456 }
457 s.nextch()
458 seenPoint = true
459 }
460 }
461
462 if seenPoint {
463 kind = FloatLit
464 digsep |= s.Digits(base, &invalid)
465 }
466
467 if digsep&1 == 0 && ok {
468 s.Errorf("%s literal has no digits", baseName(base))
469 ok = false
470 }
471
472 if e := Lower(s.ch); e == 'e' || e == 'p' {
473 if ok {
474 switch {
475 case e == 'e' && prefix != 0 && prefix != '0':
476 s.Errorf("%q exponent requires decimal mantissa", s.ch)
477 ok = false
478 case e == 'p' && prefix != 'x':
479 s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
480 ok = false
481 }
482 }
483 s.nextch()
484 kind = FloatLit
485 if s.ch == '+' || s.ch == '-' {
486 s.nextch()
487 }
488 digsep = s.Digits(10, nil) | digsep&2
489 if digsep&1 == 0 && ok {
490 s.Errorf("exponent has no digits")
491 ok = false
492 }
493 } else if prefix == 'x' && kind == FloatLit && ok {
494 s.Errorf("hexadecimal mantissa requires a 'p' exponent")
495 ok = false
496 }
497
498 if s.ch == 'i' {
499 kind = ImagLit
500 s.nextch()
501 }
502
503 s.SetLit(kind, ok)
504
505 if kind == IntLit && invalid >= 0 && ok {
506 s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
507 ok = false
508 }
509
510 if digsep&2 != 0 && ok {
511 if i := invalidSep(s.Lit); i >= 0 {
512 s.ErrorAtf(i, "'_' must separate successive digits")
513 ok = false
514 }
515 }
516
517 s.Bad = !ok
518 }
519
// baseName returns the English name of a number base for use in error
// messages. It panics with "invalid base" for any base other than 2, 8, 10
// or 16.
func baseName(base int32) string {
	var name string
	switch base {
	case 2:
		name = "binary"
	case 8:
		name = "octal"
	case 10:
		name = "decimal"
	case 16:
		name = "hexadecimal"
	default:
		panic("invalid base")
	}
	return name
}
533
534 func invalidSep(x string) int32 {
535 x1 := ' '
536 d := '.'
537 i := int32(0)
538
539 if len(x) >= 2 && x[0] == '0' {
540 x1 = Lower(rune(x[1]))
541 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
542 d = '0'
543 i = 2
544 }
545 }
546
547 for ; i < int32(len(x)); i++ {
548 p := d
549 d = rune(x[i])
550 switch {
551 case d == '_':
552 if p != '0' {
553 return i
554 }
555 case IsDecimal(d) || x1 == 'x' && IsHex(d):
556 d = '0'
557 default:
558 if p == '_' {
559 return i - 1
560 }
561 d = '.'
562 }
563 }
564 if d == '_' {
565 return int32(len(x)) - 1
566 }
567
568 return -1
569 }
570
571 func (s *Scanner) rune() {
572 ok := true
573 s.nextch()
574
575 n := 0
576 for ; ; n++ {
577 if s.ch == '\'' {
578 if ok {
579 if n == 0 {
580 s.Errorf("empty rune literal or unescaped '")
581 ok = false
582 } else if n != 1 {
583 s.ErrorAtf(0, "more than one character in rune literal")
584 ok = false
585 }
586 }
587 s.nextch()
588 break
589 }
590 if s.ch == '\\' {
591 s.nextch()
592 if !s.escape('\'') {
593 ok = false
594 }
595 continue
596 }
597 if s.ch == '\n' {
598 if ok {
599 s.Errorf("newline in rune literal")
600 ok = false
601 }
602 break
603 }
604 if s.ch < 0 {
605 if ok {
606 s.ErrorAtf(0, "rune literal not terminated")
607 ok = false
608 }
609 break
610 }
611 s.nextch()
612 }
613
614 s.SetLit(RuneLit, ok)
615 }
616
617 func (s *Scanner) stdString() {
618 ok := true
619 s.nextch()
620
621 for {
622 if s.ch == '"' {
623 s.nextch()
624 break
625 }
626 if s.ch == '\\' {
627 s.nextch()
628 if !s.escape('"') {
629 ok = false
630 }
631 continue
632 }
633 if s.ch == '\n' {
634 s.Errorf("newline in string")
635 ok = false
636 break
637 }
638 if s.ch < 0 {
639 s.ErrorAtf(0, "string not terminated")
640 ok = false
641 break
642 }
643 s.nextch()
644 }
645
646 s.SetLit(StringLit, ok)
647 }
648
649 func (s *Scanner) rawString() {
650 ok := true
651 s.nextch()
652
653 for {
654 if s.ch == '`' {
655 s.nextch()
656 break
657 }
658 if s.ch < 0 {
659 s.ErrorAtf(0, "string not terminated")
660 ok = false
661 break
662 }
663 s.nextch()
664 }
665
666 s.SetLit(StringLit, ok)
667 }
668
669 func (s *Scanner) comment(text string) {
670 s.ErrorAtf(0, "%s", text)
671 }
672
673 func (s *Scanner) skipLine() {
674 for s.ch >= 0 && s.ch != '\n' {
675 s.nextch()
676 }
677 }
678
679 func (s *Scanner) lineComment() {
680 if s.mode&comments != 0 {
681 s.skipLine()
682 s.comment(string(s.segment()))
683 return
684 }
685
686 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
687 s.stop()
688 s.skipLine()
689 return
690 }
691
692 prefix := "go:"
693 if s.ch == 'l' {
694 prefix = "line "
695 }
696
697 for _, r := range prefix {
698 if s.ch != rune(r) {
699 s.stop()
700 s.skipLine()
701 return
702 }
703 s.nextch()
704 }
705 s.skipLine()
706 s.comment(string(s.segment()))
707 }
708
709 func (s *Scanner) skipComment() bool {
710 for s.ch >= 0 {
711 for s.ch == '*' {
712 s.nextch()
713 if s.ch == '/' {
714 s.nextch()
715 return true
716 }
717 }
718 s.nextch()
719 }
720 s.ErrorAtf(0, "comment not terminated")
721 return false
722 }
723
724 func (s *Scanner) fullComment() {
725 if s.mode&comments != 0 {
726 if s.skipComment() {
727 s.comment(string(s.segment()))
728 }
729 return
730 }
731
732 if s.mode&directives == 0 || s.ch != 'l' {
733 s.stop()
734 s.skipComment()
735 return
736 }
737
738 const prefix = "line "
739
740 for _, r := range prefix {
741 if s.ch != rune(r) {
742 s.stop()
743 s.skipComment()
744 return
745 }
746 s.nextch()
747 }
748 if s.skipComment() {
749 s.comment(string(s.segment()))
750 }
751 }
752
753 func (s *Scanner) escape(quote rune) bool {
754 var n int32
755 var base, max uint32
756
757 switch s.ch {
758 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
759 s.nextch()
760 return true
761 case '0', '1', '2', '3', '4', '5', '6', '7':
762 n, base, max = 3, 8, 255
763 case 'x':
764 s.nextch()
765 n, base, max = 2, 16, 255
766 case 'u':
767 s.nextch()
768 n, base, max = 4, 16, unicode.MaxRune
769 case 'U':
770 s.nextch()
771 n, base, max = 8, 16, unicode.MaxRune
772 default:
773 if s.ch < 0 {
774 return true
775 }
776 s.Errorf("unknown escape")
777 return false
778 }
779
780 var x uint32
781 for i := n; i > 0; i-- {
782 if s.ch < 0 {
783 return true
784 }
785 d := base
786 if IsDecimal(s.ch) {
787 d = uint32(s.ch) - '0'
788 } else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
789 d = uint32(Lower(s.ch)) - 'a' + 10
790 }
791 if d >= base {
792 s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
793 return false
794 }
795 x = x*base + d
796 s.nextch()
797 }
798
799 if x > max && base == 8 {
800 s.Errorf("octal escape value %d > 255", x)
801 return false
802 }
803
804 if x > max || 0xD800 <= x && x < 0xE000 {
805 s.Errorf("escape is invalid Unicode code point %#U", x)
806 return false
807 }
808
809 return true
810 }
811
812 func String(n Node) string {
813 return fmt.Sprintf("%T", n)
814 }
815
816 func StartPos(n Node) Pos {
817 return n.Pos()
818 }
819