scanner.mx raw

   1  package main
   2  
   3  import (
   4  	"fmt"
   5  	"io"
   6  	"unicode"
   7  	"unicode/utf8"
   8  )
   9  
  10  const (
  11  	comments   uint = 1 << iota
  12  	directives
  13  )
  14  
  15  type Scanner struct {
  16  	Source
  17  	mode   uint
  18  	nlsemi bool
  19  
  20  	Line, Col uint32
  21  	Blank     bool
  22  	Tok       Token
  23  	Lit       string
  24  	Bad       bool
  25  	Kind      LitKind
  26  	Op        Operator
  27  	Prec      int32
  28  
  29  	keywordMap   [1 << 6]Token
  30  	keywordsReady bool
  31  }
  32  
  33  func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
  34  	s.Source.init(src, errh)
  35  	s.mode = mode
  36  	s.nlsemi = false
  37  	s.initKeywords()
  38  }
  39  
  40  func (s *Scanner) Errorf(format string, args ...interface{}) {
  41  	s.error(fmt.Sprintf(format, args...))
  42  }
  43  
  44  func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
  45  	s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
  46  }
  47  
  48  func (s *Scanner) SetLit(kind LitKind, ok bool) {
  49  	s.nlsemi = true
  50  	s.Tok = Literal
  51  	s.Lit = string(s.segmentCopy())
  52  	s.Bad = !ok
  53  	s.Kind = kind
  54  }
  55  
  56  func (s *Scanner) Next() {
  57  	nlsemi := s.nlsemi
  58  	s.nlsemi = false
  59  
  60  redo:
  61  	s.stop()
  62  	startLine, startCol := s.pos()
  63  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
  64  		s.nextch()
  65  	}
  66  
  67  	s.Line, s.Col = s.pos()
  68  	s.Blank = s.line > startLine || startCol == Colbase
  69  	s.start()
  70  	if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
  71  		s.nextch()
  72  		s.Ident()
  73  		return
  74  	}
  75  
  76  	switch s.ch {
  77  	case -1:
  78  		if nlsemi {
  79  			s.Lit = "EOF"
  80  			s.Tok = Semi
  81  			break
  82  		}
  83  		s.Tok = EOF
  84  
  85  	case '\n':
  86  		s.nextch()
  87  		s.Lit = "newline"
  88  		s.Tok = Semi
  89  
  90  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  91  		s.Number(false)
  92  
  93  	case '"':
  94  		s.stdString()
  95  
  96  	case '`':
  97  		s.rawString()
  98  
  99  	case '\'':
 100  		s.rune()
 101  
 102  	case '(':
 103  		s.nextch()
 104  		s.Tok = Lparen
 105  
 106  	case '[':
 107  		s.nextch()
 108  		s.Tok = Lbrack
 109  
 110  	case '{':
 111  		s.nextch()
 112  		s.Tok = Lbrace
 113  
 114  	case ',':
 115  		s.nextch()
 116  		s.Tok = Comma
 117  
 118  	case ';':
 119  		s.nextch()
 120  		s.Lit = "semicolon"
 121  		s.Tok = Semi
 122  
 123  	case ')':
 124  		s.nextch()
 125  		s.nlsemi = true
 126  		s.Tok = Rparen
 127  
 128  	case ']':
 129  		s.nextch()
 130  		s.nlsemi = true
 131  		s.Tok = Rbrack
 132  
 133  	case '}':
 134  		s.nextch()
 135  		s.nlsemi = true
 136  		s.Tok = Rbrace
 137  
 138  	case ':':
 139  		s.nextch()
 140  		if s.ch == '=' {
 141  			s.nextch()
 142  			s.Tok = Define
 143  			break
 144  		}
 145  		s.Tok = Colon
 146  
 147  	case '.':
 148  		s.nextch()
 149  		if IsDecimal(s.ch) {
 150  			s.Number(true)
 151  			break
 152  		}
 153  		if s.ch == '.' {
 154  			s.nextch()
 155  			if s.ch == '.' {
 156  				s.nextch()
 157  				s.Tok = DotDotDot
 158  				break
 159  			}
 160  			s.rewind()
 161  			s.nextch()
 162  		}
 163  		s.Tok = Dot
 164  
 165  	case '+':
 166  		s.nextch()
 167  		s.Op, s.Prec = Add, PrecAdd
 168  		if s.ch != '+' {
 169  			goto assignop
 170  		}
 171  		s.nextch()
 172  		s.nlsemi = true
 173  		s.Tok = IncOp
 174  
 175  	case '-':
 176  		s.nextch()
 177  		s.Op, s.Prec = Sub, PrecAdd
 178  		if s.ch != '-' {
 179  			goto assignop
 180  		}
 181  		s.nextch()
 182  		s.nlsemi = true
 183  		s.Tok = IncOp
 184  
 185  	case '*':
 186  		s.nextch()
 187  		s.Op, s.Prec = Mul, PrecMul
 188  		if s.ch == '=' {
 189  			s.nextch()
 190  			s.Tok = AssignOp
 191  			break
 192  		}
 193  		s.Tok = Star
 194  
 195  	case '/':
 196  		s.nextch()
 197  		if s.ch == '/' {
 198  			s.nextch()
 199  			s.lineComment()
 200  			goto redo
 201  		}
 202  		if s.ch == '*' {
 203  			s.nextch()
 204  			s.fullComment()
 205  			if line, _ := s.pos(); line > s.Line && nlsemi {
 206  				s.Lit = "newline"
 207  				s.Tok = Semi
 208  				break
 209  			}
 210  			goto redo
 211  		}
 212  		s.Op, s.Prec = Div, PrecMul
 213  		goto assignop
 214  
 215  	case '%':
 216  		s.nextch()
 217  		s.Op, s.Prec = Rem, PrecMul
 218  		goto assignop
 219  
 220  	case '&':
 221  		s.nextch()
 222  		if s.ch == '&' {
 223  			s.nextch()
 224  			s.Op, s.Prec = AndAnd, PrecAndAnd
 225  			s.Tok = OperatorType
 226  			break
 227  		}
 228  		s.Op, s.Prec = And, PrecMul
 229  		if s.ch == '^' {
 230  			s.nextch()
 231  			s.Op = AndNot
 232  		}
 233  		goto assignop
 234  
 235  	case '|':
 236  		s.nextch()
 237  		if s.ch == '|' {
 238  			s.nextch()
 239  			s.Op, s.Prec = OrOr, PrecOrOr
 240  			s.Tok = OperatorType
 241  			break
 242  		}
 243  		s.Op, s.Prec = Or, PrecAdd
 244  		goto assignop
 245  
 246  	case '^':
 247  		s.nextch()
 248  		s.Op, s.Prec = Xor, PrecAdd
 249  		goto assignop
 250  
 251  	case '<':
 252  		s.nextch()
 253  		if s.ch == '=' {
 254  			s.nextch()
 255  			s.Op, s.Prec = Leq, PrecCmp
 256  			s.Tok = OperatorType
 257  			break
 258  		}
 259  		if s.ch == '<' {
 260  			s.nextch()
 261  			s.Op, s.Prec = Shl, PrecMul
 262  			goto assignop
 263  		}
 264  		if s.ch == '-' {
 265  			s.nextch()
 266  			s.Tok = Arrow
 267  			break
 268  		}
 269  		s.Op, s.Prec = Lss, PrecCmp
 270  		s.Tok = OperatorType
 271  
 272  	case '>':
 273  		s.nextch()
 274  		if s.ch == '=' {
 275  			s.nextch()
 276  			s.Op, s.Prec = Geq, PrecCmp
 277  			s.Tok = OperatorType
 278  			break
 279  		}
 280  		if s.ch == '>' {
 281  			s.nextch()
 282  			s.Op, s.Prec = Shr, PrecMul
 283  			goto assignop
 284  		}
 285  		s.Op, s.Prec = Gtr, PrecCmp
 286  		s.Tok = OperatorType
 287  
 288  	case '=':
 289  		s.nextch()
 290  		if s.ch == '=' {
 291  			s.nextch()
 292  			s.Op, s.Prec = Eql, PrecCmp
 293  			s.Tok = OperatorType
 294  			break
 295  		}
 296  		s.Tok = Assign
 297  
 298  	case '!':
 299  		s.nextch()
 300  		if s.ch == '=' {
 301  			s.nextch()
 302  			s.Op, s.Prec = Neq, PrecCmp
 303  			s.Tok = OperatorType
 304  			break
 305  		}
 306  		s.Op, s.Prec = Not, 0
 307  		s.Tok = OperatorType
 308  
 309  	case '~':
 310  		s.nextch()
 311  		s.Op, s.Prec = Tilde, 0
 312  		s.Tok = OperatorType
 313  
 314  	default:
 315  		s.Errorf("invalid character %#U", s.ch)
 316  		s.nextch()
 317  		goto redo
 318  	}
 319  
 320  	return
 321  
 322  assignop:
 323  	if s.ch == '=' {
 324  		s.nextch()
 325  		s.Tok = AssignOp
 326  		return
 327  	}
 328  	s.Tok = OperatorType
 329  }
 330  
 331  func (s *Scanner) Ident() {
 332  	for IsLetter(s.ch) || IsDecimal(s.ch) {
 333  		s.nextch()
 334  	}
 335  
 336  	if s.ch >= utf8.RuneSelf {
 337  		for s.AtIdentChar(false) {
 338  			s.nextch()
 339  		}
 340  	}
 341  
 342  	lit := s.segment()
 343  	if len(lit) >= 2 {
 344  		h := (uint(lit[0])<<4 ^ uint(lit[1]) + uint(len(lit))) & 63
 345  		if tok := s.keywordMap[h]; tok != 0 && tokStrFast(tok) == string(lit) {
 346  			s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
 347  			s.Tok = tok
 348  			return
 349  		}
 350  	}
 351  
 352  	s.nlsemi = true
 353  	c := []byte{:len(lit)}
 354  	copy(c, lit)
 355  	s.Lit = string(c)
 356  	s.Tok = NameType
 357  }
 358  
 359  func tokStrFast(tok Token) string {
 360  	idx := [48]uint8{0, 3, 7, 14, 16, 19, 23, 24, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 42, 47, 51, 55, 60, 68, 75, 80, 84, 95, 98, 102, 104, 108, 110, 116, 125, 128, 135, 140, 146, 152, 158, 164, 168, 171, 171}
 361  	return token_name[idx[tok-1]:idx[tok]]
 362  }
 363  
 364  func (s *Scanner) AtIdentChar(first bool) bool {
 365  	switch {
 366  	case unicode.IsLetter(s.ch) || s.ch == '_':
 367  	case unicode.IsDigit(s.ch):
 368  		if first {
 369  			s.Errorf("identifier cannot begin with digit %#U", s.ch)
 370  		}
 371  	case s.ch >= utf8.RuneSelf:
 372  		s.Errorf("invalid character %#U in identifier", s.ch)
 373  	default:
 374  		return false
 375  	}
 376  	return true
 377  }
 378  
 379  func (s *Scanner) initKeywords() {
 380  	if s.keywordsReady {
 381  		return
 382  	}
 383  	s.keywordsReady = true
 384  	for tok := Break; tok <= Var; tok++ {
 385  		b := []byte(tok.String())
 386  		h := (uint(b[0])<<4 ^ uint(b[1]) + uint(len(b))) & 63
 387  		if s.keywordMap[h] != 0 {
 388  			panic("imperfect hash")
 389  		}
 390  		s.keywordMap[h] = tok
 391  	}
 392  }
 393  
 394  func Lower(ch rune) rune     { return ('a' - 'A') | ch }
 395  func IsLetter(ch rune) bool  { return 'a' <= Lower(ch) && Lower(ch) <= 'z' || ch == '_' }
 396  func IsDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
 397  func IsHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= Lower(ch) && Lower(ch) <= 'f' }
 398  
 399  func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
 400  	if base <= 10 {
 401  		max := rune('0' + base)
 402  		for IsDecimal(s.ch) || s.ch == '_' {
 403  			ds := int32(1)
 404  			if s.ch == '_' {
 405  				ds = 2
 406  			} else if s.ch >= max && *invalid < 0 {
 407  				_, col := s.pos()
 408  				*invalid = int32(col - s.col)
 409  			}
 410  			digsep |= ds
 411  			s.nextch()
 412  		}
 413  	} else {
 414  		for IsHex(s.ch) || s.ch == '_' {
 415  			ds := int32(1)
 416  			if s.ch == '_' {
 417  				ds = 2
 418  			}
 419  			digsep |= ds
 420  			s.nextch()
 421  		}
 422  	}
 423  	return
 424  }
 425  
 426  func (s *Scanner) Number(seenPoint bool) {
 427  	ok := true
 428  	kind := IntLit
 429  	base := int32(10)
 430  	prefix := rune(0)
 431  	digsep := int32(0)
 432  	invalid := int32(-1)
 433  
 434  	if !seenPoint {
 435  		if s.ch == '0' {
 436  			s.nextch()
 437  			switch Lower(s.ch) {
 438  			case 'x':
 439  				s.nextch()
 440  				base, prefix = 16, 'x'
 441  			case 'o':
 442  				s.nextch()
 443  				base, prefix = 8, 'o'
 444  			case 'b':
 445  				s.nextch()
 446  				base, prefix = 2, 'b'
 447  			default:
 448  				base, prefix = 8, '0'
 449  				digsep = 1
 450  			}
 451  		}
 452  		digsep |= s.Digits(base, &invalid)
 453  		if s.ch == '.' {
 454  			if prefix == 'o' || prefix == 'b' {
 455  				s.Errorf("invalid radix point in %s literal", baseName(base))
 456  				ok = false
 457  			}
 458  			s.nextch()
 459  			seenPoint = true
 460  		}
 461  	}
 462  
 463  	if seenPoint {
 464  		kind = FloatLit
 465  		digsep |= s.Digits(base, &invalid)
 466  	}
 467  
 468  	if digsep&1 == 0 && ok {
 469  		s.Errorf("%s literal has no digits", baseName(base))
 470  		ok = false
 471  	}
 472  
 473  	if e := Lower(s.ch); e == 'e' || e == 'p' {
 474  		if ok {
 475  			switch {
 476  			case e == 'e' && prefix != 0 && prefix != '0':
 477  				s.Errorf("%q exponent requires decimal mantissa", s.ch)
 478  				ok = false
 479  			case e == 'p' && prefix != 'x':
 480  				s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
 481  				ok = false
 482  			}
 483  		}
 484  		s.nextch()
 485  		kind = FloatLit
 486  		if s.ch == '+' || s.ch == '-' {
 487  			s.nextch()
 488  		}
 489  		digsep = s.Digits(10, nil) | digsep&2
 490  		if digsep&1 == 0 && ok {
 491  			s.Errorf("exponent has no digits")
 492  			ok = false
 493  		}
 494  	} else if prefix == 'x' && kind == FloatLit && ok {
 495  		s.Errorf("hexadecimal mantissa requires a 'p' exponent")
 496  		ok = false
 497  	}
 498  
 499  	if s.ch == 'i' {
 500  		kind = ImagLit
 501  		s.nextch()
 502  	}
 503  
 504  	s.SetLit(kind, ok)
 505  
 506  	if kind == IntLit && invalid >= 0 && ok {
 507  		s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
 508  		ok = false
 509  	}
 510  
 511  	if digsep&2 != 0 && ok {
 512  		if i := invalidSep(s.Lit); i >= 0 {
 513  			s.ErrorAtf(i, "'_' must separate successive digits")
 514  			ok = false
 515  		}
 516  	}
 517  
 518  	s.Bad = !ok
 519  }
 520  
 521  func baseName(base int32) string {
 522  	switch base {
 523  	case 2:
 524  		return "binary"
 525  	case 8:
 526  		return "octal"
 527  	case 10:
 528  		return "decimal"
 529  	case 16:
 530  		return "hexadecimal"
 531  	}
 532  	panic("invalid base")
 533  }
 534  
 535  func invalidSep(x string) int32 {
 536  	x1 := ' '
 537  	d := '.'
 538  	i := int32(0)
 539  
 540  	if len(x) >= 2 && x[0] == '0' {
 541  		x1 = Lower(rune(x[1]))
 542  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
 543  			d = '0'
 544  			i = 2
 545  		}
 546  	}
 547  
 548  	for ; i < int32(len(x)); i++ {
 549  		p := d
 550  		d = rune(x[i])
 551  		switch {
 552  		case d == '_':
 553  			if p != '0' {
 554  				return i
 555  			}
 556  		case IsDecimal(d) || x1 == 'x' && IsHex(d):
 557  			d = '0'
 558  		default:
 559  			if p == '_' {
 560  				return i - 1
 561  			}
 562  			d = '.'
 563  		}
 564  	}
 565  	if d == '_' {
 566  		return int32(len(x)) - 1
 567  	}
 568  
 569  	return -1
 570  }
 571  
 572  func (s *Scanner) rune() {
 573  	ok := true
 574  	s.nextch()
 575  
 576  	n := 0
 577  	for ; ; n++ {
 578  		if s.ch == '\'' {
 579  			if ok {
 580  				if n == 0 {
 581  					s.Errorf("empty rune literal or unescaped '")
 582  					ok = false
 583  				} else if n != 1 {
 584  					s.ErrorAtf(0, "more than one character in rune literal")
 585  					ok = false
 586  				}
 587  			}
 588  			s.nextch()
 589  			break
 590  		}
 591  		if s.ch == '\\' {
 592  			s.nextch()
 593  			if !s.escape('\'') {
 594  				ok = false
 595  			}
 596  			continue
 597  		}
 598  		if s.ch == '\n' {
 599  			if ok {
 600  				s.Errorf("newline in rune literal")
 601  				ok = false
 602  			}
 603  			break
 604  		}
 605  		if s.ch < 0 {
 606  			if ok {
 607  				s.ErrorAtf(0, "rune literal not terminated")
 608  				ok = false
 609  			}
 610  			break
 611  		}
 612  		s.nextch()
 613  	}
 614  
 615  	s.SetLit(RuneLit, ok)
 616  }
 617  
 618  func (s *Scanner) stdString() {
 619  	ok := true
 620  	s.nextch()
 621  
 622  	for {
 623  		if s.ch == '"' {
 624  			s.nextch()
 625  			break
 626  		}
 627  		if s.ch == '\\' {
 628  			s.nextch()
 629  			if !s.escape('"') {
 630  				ok = false
 631  			}
 632  			continue
 633  		}
 634  		if s.ch == '\n' {
 635  			s.Errorf("newline in string")
 636  			ok = false
 637  			break
 638  		}
 639  		if s.ch < 0 {
 640  			s.ErrorAtf(0, "string not terminated")
 641  			ok = false
 642  			break
 643  		}
 644  		s.nextch()
 645  	}
 646  
 647  	s.SetLit(StringLit, ok)
 648  }
 649  
 650  func (s *Scanner) rawString() {
 651  	ok := true
 652  	s.nextch()
 653  
 654  	for {
 655  		if s.ch == '`' {
 656  			s.nextch()
 657  			break
 658  		}
 659  		if s.ch < 0 {
 660  			s.ErrorAtf(0, "string not terminated")
 661  			ok = false
 662  			break
 663  		}
 664  		s.nextch()
 665  	}
 666  
 667  	s.SetLit(StringLit, ok)
 668  }
 669  
 670  func (s *Scanner) comment(text string) {
 671  	s.ErrorAtf(0, "%s", text)
 672  }
 673  
 674  func (s *Scanner) skipLine() {
 675  	for s.ch >= 0 && s.ch != '\n' {
 676  		s.nextch()
 677  	}
 678  }
 679  
 680  func (s *Scanner) lineComment() {
 681  	if s.mode&comments != 0 {
 682  		s.skipLine()
 683  		s.comment(string(s.segment()))
 684  		return
 685  	}
 686  
 687  	if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
 688  		s.stop()
 689  		s.skipLine()
 690  		return
 691  	}
 692  
 693  	prefix := "go:"
 694  	if s.ch == 'l' {
 695  		prefix = "line "
 696  	}
 697  
 698  	for _, r := range prefix {
 699  		if s.ch != rune(r) {
 700  			s.stop()
 701  			s.skipLine()
 702  			return
 703  		}
 704  		s.nextch()
 705  	}
 706  	s.skipLine()
 707  	s.comment(string(s.segment()))
 708  }
 709  
 710  func (s *Scanner) skipComment() bool {
 711  	for s.ch >= 0 {
 712  		for s.ch == '*' {
 713  			s.nextch()
 714  			if s.ch == '/' {
 715  				s.nextch()
 716  				return true
 717  			}
 718  		}
 719  		s.nextch()
 720  	}
 721  	s.ErrorAtf(0, "comment not terminated")
 722  	return false
 723  }
 724  
 725  func (s *Scanner) fullComment() {
 726  	if s.mode&comments != 0 {
 727  		if s.skipComment() {
 728  			s.comment(string(s.segment()))
 729  		}
 730  		return
 731  	}
 732  
 733  	if s.mode&directives == 0 || s.ch != 'l' {
 734  		s.stop()
 735  		s.skipComment()
 736  		return
 737  	}
 738  
 739  	const prefix = "line "
 740  
 741  	for _, r := range prefix {
 742  		if s.ch != rune(r) {
 743  			s.stop()
 744  			s.skipComment()
 745  			return
 746  		}
 747  		s.nextch()
 748  	}
 749  	if s.skipComment() {
 750  		s.comment(string(s.segment()))
 751  	}
 752  }
 753  
 754  func (s *Scanner) escape(quote rune) bool {
 755  	var n int32
 756  	var base, max uint32
 757  
 758  	switch s.ch {
 759  	case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
 760  		s.nextch()
 761  		return true
 762  	case '0', '1', '2', '3', '4', '5', '6', '7':
 763  		n, base, max = 3, 8, 255
 764  	case 'x':
 765  		s.nextch()
 766  		n, base, max = 2, 16, 255
 767  	case 'u':
 768  		s.nextch()
 769  		n, base, max = 4, 16, unicode.MaxRune
 770  	case 'U':
 771  		s.nextch()
 772  		n, base, max = 8, 16, unicode.MaxRune
 773  	default:
 774  		if s.ch < 0 {
 775  			return true
 776  		}
 777  		s.Errorf("unknown escape")
 778  		return false
 779  	}
 780  
 781  	var x uint32
 782  	for i := n; i > 0; i-- {
 783  		if s.ch < 0 {
 784  			return true
 785  		}
 786  		d := base
 787  		if IsDecimal(s.ch) {
 788  			d = uint32(s.ch) - '0'
 789  		} else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
 790  			d = uint32(Lower(s.ch)) - 'a' + 10
 791  		}
 792  		if d >= base {
 793  			s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
 794  			return false
 795  		}
 796  		x = x*base + d
 797  		s.nextch()
 798  	}
 799  
 800  	if x > max && base == 8 {
 801  		s.Errorf("octal escape value %d > 255", x)
 802  		return false
 803  	}
 804  
 805  	if x > max || 0xD800 <= x && x < 0xE000 {
 806  		s.Errorf("escape is invalid Unicode code point %#U", x)
 807  		return false
 808  	}
 809  
 810  	return true
 811  }
 812  
 813  func String(n Node) string {
 814  	return fmt.Sprintf("%T", n)
 815  }
 816  
 817  func StartPos(n Node) Pos {
 818  	return n.Pos()
 819  }
 820