scanner.mx raw

   1  package main
   2  
   3  import (
   4  	"fmt"
   5  	"io"
   6  	"unicode"
   7  	"unicode/utf8"
   8  )
   9  
  10  const (
  11  	comments   uint = 1 << iota
  12  	directives
  13  )
  14  
// Scanner tokenizes the input read through the embedded Source. Each call to
// Next scans one token and describes it in the exported fields below.
type Scanner struct {
	Source
	// mode holds the comments/directives flag bits passed to Init.
	mode   uint
	// nlsemi, when set by the previous token, makes the following call to
	// Next turn a newline or EOF into a Semi token instead of skipping it.
	nlsemi bool

	// Line and Col are assigned by Next from pos() once leading white space
	// has been skipped, i.e. at the first character of the token.
	Line, Col uint32
	// Blank is computed by Next while skipping white space.
	// NOTE(review): presumably "only white space precedes the token on its
	// line" - confirm against callers.
	Blank     bool
	// Tok is the kind of the current token.
	Tok       Token
	// Lit is the text of a Literal or NameType token; for Semi tokens it is
	// one of "semicolon", "newline" or "EOF".
	Lit       string
	// Bad is set for Literal tokens that were scanned with an error.
	Bad       bool
	// Kind is the literal kind passed to SetLit for Literal tokens.
	Kind      LitKind
	// Op and Prec are set by Next for operator and assignment-operator tokens.
	Op        Operator
	Prec      int32
}
  29  
  30  func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
  31  	s.Source.init(src, errh)
  32  	s.mode = mode
  33  	s.nlsemi = false
  34  }
  35  
  36  func (s *Scanner) Errorf(format string, args ...interface{}) {
  37  	s.error(fmt.Sprintf(format, args...))
  38  }
  39  
  40  func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
  41  	s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
  42  }
  43  
  44  func (s *Scanner) SetLit(kind LitKind, ok bool) {
  45  	s.nlsemi = true
  46  	s.Tok = Literal
  47  	s.Lit = string(s.segmentCopy())
  48  	s.Bad = !ok
  49  	s.Kind = kind
  50  }
  51  
  52  func (s *Scanner) Next() {
  53  	nlsemi := s.nlsemi
  54  	s.nlsemi = false
  55  
  56  redo:
  57  	s.stop()
  58  	startLine, startCol := s.pos()
  59  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
  60  		s.nextch()
  61  	}
  62  
  63  	s.Line, s.Col = s.pos()
  64  	s.Blank = s.line > startLine || startCol == Colbase
  65  	s.start()
  66  	if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
  67  		s.nextch()
  68  		s.Ident()
  69  		return
  70  	}
  71  
  72  	switch s.ch {
  73  	case -1:
  74  		if nlsemi {
  75  			s.Lit = "EOF"
  76  			s.Tok = Semi
  77  			break
  78  		}
  79  		s.Tok = EOF
  80  
  81  	case '\n':
  82  		s.nextch()
  83  		s.Lit = "newline"
  84  		s.Tok = Semi
  85  
  86  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  87  		s.Number(false)
  88  
  89  	case '"':
  90  		s.stdString()
  91  
  92  	case '`':
  93  		s.rawString()
  94  
  95  	case '\'':
  96  		s.rune()
  97  
  98  	case '(':
  99  		s.nextch()
 100  		s.Tok = Lparen
 101  
 102  	case '[':
 103  		s.nextch()
 104  		s.Tok = Lbrack
 105  
 106  	case '{':
 107  		s.nextch()
 108  		s.Tok = Lbrace
 109  
 110  	case ',':
 111  		s.nextch()
 112  		s.Tok = Comma
 113  
 114  	case ';':
 115  		s.nextch()
 116  		s.Lit = "semicolon"
 117  		s.Tok = Semi
 118  
 119  	case ')':
 120  		s.nextch()
 121  		s.nlsemi = true
 122  		s.Tok = Rparen
 123  
 124  	case ']':
 125  		s.nextch()
 126  		s.nlsemi = true
 127  		s.Tok = Rbrack
 128  
 129  	case '}':
 130  		s.nextch()
 131  		s.nlsemi = true
 132  		s.Tok = Rbrace
 133  
 134  	case ':':
 135  		s.nextch()
 136  		if s.ch == '=' {
 137  			s.nextch()
 138  			s.Tok = Define
 139  			break
 140  		}
 141  		s.Tok = Colon
 142  
 143  	case '.':
 144  		s.nextch()
 145  		if IsDecimal(s.ch) {
 146  			s.Number(true)
 147  			break
 148  		}
 149  		if s.ch == '.' {
 150  			s.nextch()
 151  			if s.ch == '.' {
 152  				s.nextch()
 153  				s.Tok = DotDotDot
 154  				break
 155  			}
 156  			s.rewind()
 157  			s.nextch()
 158  		}
 159  		s.Tok = Dot
 160  
 161  	case '+':
 162  		s.nextch()
 163  		s.Op, s.Prec = Add, PrecAdd
 164  		if s.ch != '+' {
 165  			goto assignop
 166  		}
 167  		s.nextch()
 168  		s.nlsemi = true
 169  		s.Tok = IncOp
 170  
 171  	case '-':
 172  		s.nextch()
 173  		s.Op, s.Prec = Sub, PrecAdd
 174  		if s.ch != '-' {
 175  			goto assignop
 176  		}
 177  		s.nextch()
 178  		s.nlsemi = true
 179  		s.Tok = IncOp
 180  
 181  	case '*':
 182  		s.nextch()
 183  		s.Op, s.Prec = Mul, PrecMul
 184  		if s.ch == '=' {
 185  			s.nextch()
 186  			s.Tok = AssignOp
 187  			break
 188  		}
 189  		s.Tok = Star
 190  
 191  	case '/':
 192  		s.nextch()
 193  		if s.ch == '/' {
 194  			s.nextch()
 195  			s.lineComment()
 196  			goto redo
 197  		}
 198  		if s.ch == '*' {
 199  			s.nextch()
 200  			s.fullComment()
 201  			if line, _ := s.pos(); line > s.Line && nlsemi {
 202  				s.Lit = "newline"
 203  				s.Tok = Semi
 204  				break
 205  			}
 206  			goto redo
 207  		}
 208  		s.Op, s.Prec = Div, PrecMul
 209  		goto assignop
 210  
 211  	case '%':
 212  		s.nextch()
 213  		s.Op, s.Prec = Rem, PrecMul
 214  		goto assignop
 215  
 216  	case '&':
 217  		s.nextch()
 218  		if s.ch == '&' {
 219  			s.nextch()
 220  			s.Op, s.Prec = AndAnd, PrecAndAnd
 221  			s.Tok = OperatorType
 222  			break
 223  		}
 224  		s.Op, s.Prec = And, PrecMul
 225  		if s.ch == '^' {
 226  			s.nextch()
 227  			s.Op = AndNot
 228  		}
 229  		goto assignop
 230  
 231  	case '|':
 232  		s.nextch()
 233  		if s.ch == '|' {
 234  			s.nextch()
 235  			s.Op, s.Prec = OrOr, PrecOrOr
 236  			s.Tok = OperatorType
 237  			break
 238  		}
 239  		s.Op, s.Prec = Or, PrecAdd
 240  		goto assignop
 241  
 242  	case '^':
 243  		s.nextch()
 244  		s.Op, s.Prec = Xor, PrecAdd
 245  		goto assignop
 246  
 247  	case '<':
 248  		s.nextch()
 249  		if s.ch == '=' {
 250  			s.nextch()
 251  			s.Op, s.Prec = Leq, PrecCmp
 252  			s.Tok = OperatorType
 253  			break
 254  		}
 255  		if s.ch == '<' {
 256  			s.nextch()
 257  			s.Op, s.Prec = Shl, PrecMul
 258  			goto assignop
 259  		}
 260  		if s.ch == '-' {
 261  			s.nextch()
 262  			s.Tok = Arrow
 263  			break
 264  		}
 265  		s.Op, s.Prec = Lss, PrecCmp
 266  		s.Tok = OperatorType
 267  
 268  	case '>':
 269  		s.nextch()
 270  		if s.ch == '=' {
 271  			s.nextch()
 272  			s.Op, s.Prec = Geq, PrecCmp
 273  			s.Tok = OperatorType
 274  			break
 275  		}
 276  		if s.ch == '>' {
 277  			s.nextch()
 278  			s.Op, s.Prec = Shr, PrecMul
 279  			goto assignop
 280  		}
 281  		s.Op, s.Prec = Gtr, PrecCmp
 282  		s.Tok = OperatorType
 283  
 284  	case '=':
 285  		s.nextch()
 286  		if s.ch == '=' {
 287  			s.nextch()
 288  			s.Op, s.Prec = Eql, PrecCmp
 289  			s.Tok = OperatorType
 290  			break
 291  		}
 292  		s.Tok = Assign
 293  
 294  	case '!':
 295  		s.nextch()
 296  		if s.ch == '=' {
 297  			s.nextch()
 298  			s.Op, s.Prec = Neq, PrecCmp
 299  			s.Tok = OperatorType
 300  			break
 301  		}
 302  		s.Op, s.Prec = Not, 0
 303  		s.Tok = OperatorType
 304  
 305  	case '~':
 306  		s.nextch()
 307  		s.Op, s.Prec = Tilde, 0
 308  		s.Tok = OperatorType
 309  
 310  	default:
 311  		s.Errorf("invalid character %#U", s.ch)
 312  		s.nextch()
 313  		goto redo
 314  	}
 315  
 316  	return
 317  
 318  assignop:
 319  	if s.ch == '=' {
 320  		s.nextch()
 321  		s.Tok = AssignOp
 322  		return
 323  	}
 324  	s.Tok = OperatorType
 325  }
 326  
 327  func (s *Scanner) Ident() {
 328  	for IsLetter(s.ch) || IsDecimal(s.ch) {
 329  		s.nextch()
 330  	}
 331  
 332  	if s.ch >= utf8.RuneSelf {
 333  		for s.AtIdentChar(false) {
 334  			s.nextch()
 335  		}
 336  	}
 337  
 338  	lit := s.segment()
 339  	if len(lit) >= 2 {
 340  		if tok := keywordMap[Hash(lit)]; tok != 0 && TokStrFast(tok) == string(lit) {
 341  			s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
 342  			s.Tok = tok
 343  			return
 344  		}
 345  	}
 346  
 347  	s.nlsemi = true
 348  	c := []byte{:len(lit)}
 349  	copy(c, lit)
 350  	s.Lit = string(c)
 351  	s.Tok = NameType
 352  }
 353  
 354  func TokStrFast(tok Token) string {
 355  	return token_name[token_index[tok-1]:token_index[tok]]
 356  }
 357  
 358  func (s *Scanner) AtIdentChar(first bool) bool {
 359  	switch {
 360  	case unicode.IsLetter(s.ch) || s.ch == '_':
 361  	case unicode.IsDigit(s.ch):
 362  		if first {
 363  			s.Errorf("identifier cannot begin with digit %#U", s.ch)
 364  		}
 365  	case s.ch >= utf8.RuneSelf:
 366  		s.Errorf("invalid character %#U in identifier", s.ch)
 367  	default:
 368  		return false
 369  	}
 370  	return true
 371  }
 372  
 373  func Hash(s []byte) uint {
 374  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
 375  }
 376  
 377  var keywordMap [1 << 6]Token
 378  
 379  var keywordsInitialized bool
 380  
 381  func InitKeywords() {
 382  	if keywordsInitialized {
 383  		return
 384  	}
 385  	keywordsInitialized = true
 386  	for tok := Break; tok <= Var; tok++ {
 387  		h := Hash([]byte(tok.String()))
 388  		if keywordMap[h] != 0 {
 389  			panic("imperfect hash")
 390  		}
 391  		keywordMap[h] = tok
 392  	}
 393  }
 394  
 395  func Lower(ch rune) rune     { return ('a' - 'A') | ch }
 396  func IsLetter(ch rune) bool  { return 'a' <= Lower(ch) && Lower(ch) <= 'z' || ch == '_' }
 397  func IsDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
 398  func IsHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= Lower(ch) && Lower(ch) <= 'f' }
 399  
 400  func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
 401  	if base <= 10 {
 402  		max := rune('0' + base)
 403  		for IsDecimal(s.ch) || s.ch == '_' {
 404  			ds := int32(1)
 405  			if s.ch == '_' {
 406  				ds = 2
 407  			} else if s.ch >= max && *invalid < 0 {
 408  				_, col := s.pos()
 409  				*invalid = int32(col - s.col)
 410  			}
 411  			digsep |= ds
 412  			s.nextch()
 413  		}
 414  	} else {
 415  		for IsHex(s.ch) || s.ch == '_' {
 416  			ds := int32(1)
 417  			if s.ch == '_' {
 418  				ds = 2
 419  			}
 420  			digsep |= ds
 421  			s.nextch()
 422  		}
 423  	}
 424  	return
 425  }
 426  
// Number scans an integer, floating-point or imaginary literal and records it
// via SetLit. seenPoint is true when the caller has already consumed a
// leading '.', in which case scanning starts with the fractional part.
// At most one error is reported per literal; Bad is set if any was reported.
func (s *Scanner) Number(seenPoint bool) {
	ok := true
	kind := IntLit
	// base is the number base of the mantissa.
	base := int32(10)
	// prefix is 0 (no prefix), '0' (leading zero), 'x', 'o' or 'b'.
	prefix := rune(0)
	// digsep accumulates Digits results: bit 0 = digit seen, bit 1 = '_' seen.
	digsep := int32(0)
	// invalid is the offset of the first invalid digit, or negative if none.
	invalid := int32(-1)

	// Integer part.
	if !seenPoint {
		if s.ch == '0' {
			s.nextch()
			switch Lower(s.ch) {
			case 'x':
				s.nextch()
				base, prefix = 16, 'x'
			case 'o':
				s.nextch()
				base, prefix = 8, 'o'
			case 'b':
				s.nextch()
				base, prefix = 2, 'b'
			default:
				// A plain leading zero counts as a digit of an octal literal.
				base, prefix = 8, '0'
				digsep = 1
			}
		}
		digsep |= s.Digits(base, &invalid)
		if s.ch == '.' {
			if prefix == 'o' || prefix == 'b' {
				s.Errorf("invalid radix point in %s literal", baseName(base))
				ok = false
			}
			s.nextch()
			seenPoint = true
		}
	}

	// Fractional part.
	if seenPoint {
		kind = FloatLit
		digsep |= s.Digits(base, &invalid)
	}

	if digsep&1 == 0 && ok {
		s.Errorf("%s literal has no digits", baseName(base))
		ok = false
	}

	// Exponent.
	if e := Lower(s.ch); e == 'e' || e == 'p' {
		if ok {
			switch {
			case e == 'e' && prefix != 0 && prefix != '0':
				s.Errorf("%q exponent requires decimal mantissa", s.ch)
				ok = false
			case e == 'p' && prefix != 'x':
				s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
				ok = false
			}
		}
		s.nextch()
		kind = FloatLit
		if s.ch == '+' || s.ch == '-' {
			s.nextch()
		}
		// Restart the digit bit for the exponent but keep the separator bit.
		// invalid is nil here: base 10 never produces an invalid decimal digit.
		digsep = s.Digits(10, nil) | digsep&2
		if digsep&1 == 0 && ok {
			s.Errorf("exponent has no digits")
			ok = false
		}
	} else if prefix == 'x' && kind == FloatLit && ok {
		s.Errorf("hexadecimal mantissa requires a 'p' exponent")
		ok = false
	}

	// Imaginary suffix.
	if s.ch == 'i' {
		kind = ImagLit
		s.nextch()
	}

	// Record the literal now so that s.Lit can be inspected below.
	s.SetLit(kind, ok)

	if kind == IntLit && invalid >= 0 && ok {
		s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
		ok = false
	}

	if digsep&2 != 0 && ok {
		if i := invalidSep(s.Lit); i >= 0 {
			s.ErrorAtf(i, "'_' must separate successive digits")
			ok = false
		}
	}

	// SetLit stored the earlier value of ok; update it with later errors.
	s.Bad = !ok
}
 521  
 522  func baseName(base int32) string {
 523  	switch base {
 524  	case 2:
 525  		return "binary"
 526  	case 8:
 527  		return "octal"
 528  	case 10:
 529  		return "decimal"
 530  	case 16:
 531  		return "hexadecimal"
 532  	}
 533  	panic("invalid base")
 534  }
 535  
 536  func invalidSep(x string) int32 {
 537  	x1 := ' '
 538  	d := '.'
 539  	i := int32(0)
 540  
 541  	if len(x) >= 2 && x[0] == '0' {
 542  		x1 = Lower(rune(x[1]))
 543  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
 544  			d = '0'
 545  			i = 2
 546  		}
 547  	}
 548  
 549  	for ; i < int32(len(x)); i++ {
 550  		p := d
 551  		d = rune(x[i])
 552  		switch {
 553  		case d == '_':
 554  			if p != '0' {
 555  				return i
 556  			}
 557  		case IsDecimal(d) || x1 == 'x' && IsHex(d):
 558  			d = '0'
 559  		default:
 560  			if p == '_' {
 561  				return i - 1
 562  			}
 563  			d = '.'
 564  		}
 565  	}
 566  	if d == '_' {
 567  		return int32(len(x)) - 1
 568  	}
 569  
 570  	return -1
 571  }
 572  
 573  func (s *Scanner) rune() {
 574  	ok := true
 575  	s.nextch()
 576  
 577  	n := 0
 578  	for ; ; n++ {
 579  		if s.ch == '\'' {
 580  			if ok {
 581  				if n == 0 {
 582  					s.Errorf("empty rune literal or unescaped '")
 583  					ok = false
 584  				} else if n != 1 {
 585  					s.ErrorAtf(0, "more than one character in rune literal")
 586  					ok = false
 587  				}
 588  			}
 589  			s.nextch()
 590  			break
 591  		}
 592  		if s.ch == '\\' {
 593  			s.nextch()
 594  			if !s.escape('\'') {
 595  				ok = false
 596  			}
 597  			continue
 598  		}
 599  		if s.ch == '\n' {
 600  			if ok {
 601  				s.Errorf("newline in rune literal")
 602  				ok = false
 603  			}
 604  			break
 605  		}
 606  		if s.ch < 0 {
 607  			if ok {
 608  				s.ErrorAtf(0, "rune literal not terminated")
 609  				ok = false
 610  			}
 611  			break
 612  		}
 613  		s.nextch()
 614  	}
 615  
 616  	s.SetLit(RuneLit, ok)
 617  }
 618  
 619  func (s *Scanner) stdString() {
 620  	ok := true
 621  	s.nextch()
 622  
 623  	for {
 624  		if s.ch == '"' {
 625  			s.nextch()
 626  			break
 627  		}
 628  		if s.ch == '\\' {
 629  			s.nextch()
 630  			if !s.escape('"') {
 631  				ok = false
 632  			}
 633  			continue
 634  		}
 635  		if s.ch == '\n' {
 636  			s.Errorf("newline in string")
 637  			ok = false
 638  			break
 639  		}
 640  		if s.ch < 0 {
 641  			s.ErrorAtf(0, "string not terminated")
 642  			ok = false
 643  			break
 644  		}
 645  		s.nextch()
 646  	}
 647  
 648  	s.SetLit(StringLit, ok)
 649  }
 650  
 651  func (s *Scanner) rawString() {
 652  	ok := true
 653  	s.nextch()
 654  
 655  	for {
 656  		if s.ch == '`' {
 657  			s.nextch()
 658  			break
 659  		}
 660  		if s.ch < 0 {
 661  			s.ErrorAtf(0, "string not terminated")
 662  			ok = false
 663  			break
 664  		}
 665  		s.nextch()
 666  	}
 667  
 668  	s.SetLit(StringLit, ok)
 669  }
 670  
 671  func (s *Scanner) comment(text string) {
 672  	s.ErrorAtf(0, "%s", text)
 673  }
 674  
 675  func (s *Scanner) skipLine() {
 676  	for s.ch >= 0 && s.ch != '\n' {
 677  		s.nextch()
 678  	}
 679  }
 680  
 681  func (s *Scanner) lineComment() {
 682  	if s.mode&comments != 0 {
 683  		s.skipLine()
 684  		s.comment(string(s.segment()))
 685  		return
 686  	}
 687  
 688  	if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
 689  		s.stop()
 690  		s.skipLine()
 691  		return
 692  	}
 693  
 694  	prefix := "go:"
 695  	if s.ch == 'l' {
 696  		prefix = "line "
 697  	}
 698  
 699  	for _, r := range prefix {
 700  		if s.ch != rune(r) {
 701  			s.stop()
 702  			s.skipLine()
 703  			return
 704  		}
 705  		s.nextch()
 706  	}
 707  	s.skipLine()
 708  	s.comment(string(s.segment()))
 709  }
 710  
// skipComment advances past the end of a block comment whose opening "/*"
// has already been consumed. It returns true once the closing "*/" has been
// consumed. If end of input is reached first, it reports
// "comment not terminated" via ErrorAtf and returns false.
func (s *Scanner) skipComment() bool {
	for s.ch >= 0 {
		// The inner loop handles runs of '*' so that "**/" is recognized.
		for s.ch == '*' {
			s.nextch()
			if s.ch == '/' {
				s.nextch()
				return true
			}
		}
		s.nextch()
	}
	s.ErrorAtf(0, "comment not terminated")
	return false
}
 725  
 726  func (s *Scanner) fullComment() {
 727  	if s.mode&comments != 0 {
 728  		if s.skipComment() {
 729  			s.comment(string(s.segment()))
 730  		}
 731  		return
 732  	}
 733  
 734  	if s.mode&directives == 0 || s.ch != 'l' {
 735  		s.stop()
 736  		s.skipComment()
 737  		return
 738  	}
 739  
 740  	const prefix = "line "
 741  
 742  	for _, r := range prefix {
 743  		if s.ch != rune(r) {
 744  			s.stop()
 745  			s.skipComment()
 746  			return
 747  		}
 748  		s.nextch()
 749  	}
 750  	if s.skipComment() {
 751  		s.comment(string(s.segment()))
 752  	}
 753  }
 754  
// escape scans an escape sequence; the leading backslash has already been
// consumed. quote is the quote character of the enclosing literal and is
// accepted as an escaped character. It returns false if an error was
// reported. Reaching end of input returns true without reporting anything,
// leaving the unterminated literal to be reported by the caller's loop.
func (s *Scanner) escape(quote rune) bool {
	// n is the number of digits to read, base their base, and max the
	// largest value the escape may denote.
	var n int32
	var base, max uint32

	switch s.ch {
	case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
		s.nextch()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// Octal escape: the current digit is the first of three, so it is
		// not consumed here.
		n, base, max = 3, 8, 255
	case 'x':
		s.nextch()
		n, base, max = 2, 16, 255
	case 'u':
		s.nextch()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.nextch()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		if s.ch < 0 {
			return true
		}
		s.Errorf("unknown escape")
		return false
	}

	// Accumulate exactly n digits of the given base into x.
	var x uint32
	for i := n; i > 0; i-- {
		if s.ch < 0 {
			return true
		}
		// d starts out as base, i.e. "not a digit", unless recognized below.
		d := base
		if IsDecimal(s.ch) {
			d = uint32(s.ch) - '0'
		} else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
			d = uint32(Lower(s.ch)) - 'a' + 10
		}
		if d >= base {
			s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
			return false
		}
		x = x*base + d
		s.nextch()
	}

	if x > max && base == 8 {
		s.Errorf("octal escape value %d > 255", x)
		return false
	}

	// Reject values beyond max and the surrogate range 0xD800-0xDFFF.
	if x > max || 0xD800 <= x && x < 0xE000 {
		s.Errorf("escape is invalid Unicode code point %#U", x)
		return false
	}

	return true
}
 813  
 814  func String(n Node) string {
 815  	return fmt.Sprintf("%T", n)
 816  }
 817  
 818  func StartPos(n Node) Pos {
 819  	return n.Pos()
 820  }
 821