scanner.mx raw

   1  package main
   2  
   3  import (
   4  	"fmt"
   5  	"io"
   6  	"unicode"
   7  	"unicode/utf8"
   8  )
   9  
  10  const (
  11  	comments   uint = 1 << iota
  12  	directives
  13  )
  14  
  15  type Scanner struct {
  16  	Source
  17  	mode   uint
  18  	nlsemi bool
  19  
  20  	Line, Col uint32
  21  	Blank     bool
  22  	Tok       Token
  23  	Lit       string
  24  	Bad       bool
  25  	Kind      LitKind
  26  	Op        Operator
  27  	Prec      int32
  28  }
  29  
  30  func (s *Scanner) Init(src io.Reader, errh func(line, col uint32, msg string), mode uint) {
  31  	s.Source.init(src, errh)
  32  	s.mode = mode
  33  	s.nlsemi = false
  34  }
  35  
  36  func (s *Scanner) Errorf(format string, args ...interface{}) {
  37  	s.error(fmt.Sprintf(format, args...))
  38  }
  39  
  40  func (s *Scanner) ErrorAtf(offset int32, format string, args ...interface{}) {
  41  	s.errh(s.line, s.col+uint32(offset), fmt.Sprintf(format, args...))
  42  }
  43  
  44  func (s *Scanner) SetLit(kind LitKind, ok bool) {
  45  	s.nlsemi = true
  46  	s.Tok = Literal
  47  	s.Lit = string(s.segment())
  48  	s.Bad = !ok
  49  	s.Kind = kind
  50  }
  51  
  52  func (s *Scanner) Next() {
  53  	nlsemi := s.nlsemi
  54  	s.nlsemi = false
  55  
  56  redo:
  57  	s.stop()
  58  	startLine, startCol := s.pos()
  59  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
  60  		s.nextch()
  61  	}
  62  
  63  	s.Line, s.Col = s.pos()
  64  	s.Blank = s.line > startLine || startCol == Colbase
  65  	s.start()
  66  	if IsLetter(s.ch) || s.ch >= utf8.RuneSelf && s.AtIdentChar(true) {
  67  		s.nextch()
  68  		s.Ident()
  69  		return
  70  	}
  71  
  72  	switch s.ch {
  73  	case -1:
  74  		if nlsemi {
  75  			s.Lit = "EOF"
  76  			s.Tok = Semi
  77  			break
  78  		}
  79  		s.Tok = EOF
  80  
  81  	case '\n':
  82  		s.nextch()
  83  		s.Lit = "newline"
  84  		s.Tok = Semi
  85  
  86  	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  87  		s.Number(false)
  88  
  89  	case '"':
  90  		s.stdString()
  91  
  92  	case '`':
  93  		s.rawString()
  94  
  95  	case '\'':
  96  		s.rune()
  97  
  98  	case '(':
  99  		s.nextch()
 100  		s.Tok = Lparen
 101  
 102  	case '[':
 103  		s.nextch()
 104  		s.Tok = Lbrack
 105  
 106  	case '{':
 107  		s.nextch()
 108  		s.Tok = Lbrace
 109  
 110  	case ',':
 111  		s.nextch()
 112  		s.Tok = Comma
 113  
 114  	case ';':
 115  		s.nextch()
 116  		s.Lit = "semicolon"
 117  		s.Tok = Semi
 118  
 119  	case ')':
 120  		s.nextch()
 121  		s.nlsemi = true
 122  		s.Tok = Rparen
 123  
 124  	case ']':
 125  		s.nextch()
 126  		s.nlsemi = true
 127  		s.Tok = Rbrack
 128  
 129  	case '}':
 130  		s.nextch()
 131  		s.nlsemi = true
 132  		s.Tok = Rbrace
 133  
 134  	case ':':
 135  		s.nextch()
 136  		if s.ch == '=' {
 137  			s.nextch()
 138  			s.Tok = Define
 139  			break
 140  		}
 141  		s.Tok = Colon
 142  
 143  	case '.':
 144  		s.nextch()
 145  		if IsDecimal(s.ch) {
 146  			s.Number(true)
 147  			break
 148  		}
 149  		if s.ch == '.' {
 150  			s.nextch()
 151  			if s.ch == '.' {
 152  				s.nextch()
 153  				s.Tok = DotDotDot
 154  				break
 155  			}
 156  			s.rewind()
 157  			s.nextch()
 158  		}
 159  		s.Tok = Dot
 160  
 161  	case '+':
 162  		s.nextch()
 163  		s.Op, s.Prec = Add, PrecAdd
 164  		if s.ch != '+' {
 165  			goto assignop
 166  		}
 167  		s.nextch()
 168  		s.nlsemi = true
 169  		s.Tok = IncOp
 170  
 171  	case '-':
 172  		s.nextch()
 173  		s.Op, s.Prec = Sub, PrecAdd
 174  		if s.ch != '-' {
 175  			goto assignop
 176  		}
 177  		s.nextch()
 178  		s.nlsemi = true
 179  		s.Tok = IncOp
 180  
 181  	case '*':
 182  		s.nextch()
 183  		s.Op, s.Prec = Mul, PrecMul
 184  		if s.ch == '=' {
 185  			s.nextch()
 186  			s.Tok = AssignOp
 187  			break
 188  		}
 189  		s.Tok = Star
 190  
 191  	case '/':
 192  		s.nextch()
 193  		if s.ch == '/' {
 194  			s.nextch()
 195  			s.lineComment()
 196  			goto redo
 197  		}
 198  		if s.ch == '*' {
 199  			s.nextch()
 200  			s.fullComment()
 201  			if line, _ := s.pos(); line > s.Line && nlsemi {
 202  				s.Lit = "newline"
 203  				s.Tok = Semi
 204  				break
 205  			}
 206  			goto redo
 207  		}
 208  		s.Op, s.Prec = Div, PrecMul
 209  		goto assignop
 210  
 211  	case '%':
 212  		s.nextch()
 213  		s.Op, s.Prec = Rem, PrecMul
 214  		goto assignop
 215  
 216  	case '&':
 217  		s.nextch()
 218  		if s.ch == '&' {
 219  			s.nextch()
 220  			s.Op, s.Prec = AndAnd, PrecAndAnd
 221  			s.Tok = OperatorType
 222  			break
 223  		}
 224  		s.Op, s.Prec = And, PrecMul
 225  		if s.ch == '^' {
 226  			s.nextch()
 227  			s.Op = AndNot
 228  		}
 229  		goto assignop
 230  
 231  	case '|':
 232  		s.nextch()
 233  		if s.ch == '|' {
 234  			s.nextch()
 235  			s.Op, s.Prec = OrOr, PrecOrOr
 236  			s.Tok = OperatorType
 237  			break
 238  		}
 239  		s.Op, s.Prec = Or, PrecAdd
 240  		goto assignop
 241  
 242  	case '^':
 243  		s.nextch()
 244  		s.Op, s.Prec = Xor, PrecAdd
 245  		goto assignop
 246  
 247  	case '<':
 248  		s.nextch()
 249  		if s.ch == '=' {
 250  			s.nextch()
 251  			s.Op, s.Prec = Leq, PrecCmp
 252  			s.Tok = OperatorType
 253  			break
 254  		}
 255  		if s.ch == '<' {
 256  			s.nextch()
 257  			s.Op, s.Prec = Shl, PrecMul
 258  			goto assignop
 259  		}
 260  		if s.ch == '-' {
 261  			s.nextch()
 262  			s.Tok = Arrow
 263  			break
 264  		}
 265  		s.Op, s.Prec = Lss, PrecCmp
 266  		s.Tok = OperatorType
 267  
 268  	case '>':
 269  		s.nextch()
 270  		if s.ch == '=' {
 271  			s.nextch()
 272  			s.Op, s.Prec = Geq, PrecCmp
 273  			s.Tok = OperatorType
 274  			break
 275  		}
 276  		if s.ch == '>' {
 277  			s.nextch()
 278  			s.Op, s.Prec = Shr, PrecMul
 279  			goto assignop
 280  		}
 281  		s.Op, s.Prec = Gtr, PrecCmp
 282  		s.Tok = OperatorType
 283  
 284  	case '=':
 285  		s.nextch()
 286  		if s.ch == '=' {
 287  			s.nextch()
 288  			s.Op, s.Prec = Eql, PrecCmp
 289  			s.Tok = OperatorType
 290  			break
 291  		}
 292  		s.Tok = Assign
 293  
 294  	case '!':
 295  		s.nextch()
 296  		if s.ch == '=' {
 297  			s.nextch()
 298  			s.Op, s.Prec = Neq, PrecCmp
 299  			s.Tok = OperatorType
 300  			break
 301  		}
 302  		s.Op, s.Prec = Not, 0
 303  		s.Tok = OperatorType
 304  
 305  	case '~':
 306  		s.nextch()
 307  		s.Op, s.Prec = Tilde, 0
 308  		s.Tok = OperatorType
 309  
 310  	default:
 311  		s.Errorf("invalid character %#U", s.ch)
 312  		s.nextch()
 313  		goto redo
 314  	}
 315  
 316  	return
 317  
 318  assignop:
 319  	if s.ch == '=' {
 320  		s.nextch()
 321  		s.Tok = AssignOp
 322  		return
 323  	}
 324  	s.Tok = OperatorType
 325  }
 326  
 327  func (s *Scanner) Ident() {
 328  	for IsLetter(s.ch) || IsDecimal(s.ch) {
 329  		s.nextch()
 330  	}
 331  
 332  	if s.ch >= utf8.RuneSelf {
 333  		for s.AtIdentChar(false) {
 334  			s.nextch()
 335  		}
 336  	}
 337  
 338  	lit := s.segment()
 339  	if len(lit) >= 2 {
 340  		if tok := keywordMap[Hash(lit)]; tok != 0 && TokStrFast(tok) == string(lit) {
 341  			s.nlsemi = contains(1<<Break|1<<Continue|1<<Fallthrough|1<<Return, tok)
 342  			s.Tok = tok
 343  			return
 344  		}
 345  	}
 346  
 347  	s.nlsemi = true
 348  	s.Lit = string(lit)
 349  	s.Tok = NameType
 350  }
 351  
 352  func TokStrFast(tok Token) string {
 353  	return token_name[token_index[tok-1]:token_index[tok]]
 354  }
 355  
 356  func (s *Scanner) AtIdentChar(first bool) bool {
 357  	switch {
 358  	case unicode.IsLetter(s.ch) || s.ch == '_':
 359  	case unicode.IsDigit(s.ch):
 360  		if first {
 361  			s.Errorf("identifier cannot begin with digit %#U", s.ch)
 362  		}
 363  	case s.ch >= utf8.RuneSelf:
 364  		s.Errorf("invalid character %#U in identifier", s.ch)
 365  	default:
 366  		return false
 367  	}
 368  	return true
 369  }
 370  
 371  func Hash(s []byte) uint {
 372  	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
 373  }
 374  
 375  var keywordMap [1 << 6]Token
 376  
 377  var keywordsInitialized bool
 378  
 379  func InitKeywords() {
 380  	if keywordsInitialized {
 381  		return
 382  	}
 383  	keywordsInitialized = true
 384  	for tok := Break; tok <= Var; tok++ {
 385  		h := Hash([]byte(tok.String()))
 386  		if keywordMap[h] != 0 {
 387  			panic("imperfect hash")
 388  		}
 389  		keywordMap[h] = tok
 390  	}
 391  }
 392  
 393  func Lower(ch rune) rune     { return ('a' - 'A') | ch }
 394  func IsLetter(ch rune) bool  { return 'a' <= Lower(ch) && Lower(ch) <= 'z' || ch == '_' }
 395  func IsDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
 396  func IsHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= Lower(ch) && Lower(ch) <= 'f' }
 397  
 398  func (s *Scanner) Digits(base int32, invalid *int32) (digsep int32) {
 399  	if base <= 10 {
 400  		max := rune('0' + base)
 401  		for IsDecimal(s.ch) || s.ch == '_' {
 402  			ds := int32(1)
 403  			if s.ch == '_' {
 404  				ds = 2
 405  			} else if s.ch >= max && *invalid < 0 {
 406  				_, col := s.pos()
 407  				*invalid = int32(col - s.col)
 408  			}
 409  			digsep |= ds
 410  			s.nextch()
 411  		}
 412  	} else {
 413  		for IsHex(s.ch) || s.ch == '_' {
 414  			ds := int32(1)
 415  			if s.ch == '_' {
 416  				ds = 2
 417  			}
 418  			digsep |= ds
 419  			s.nextch()
 420  		}
 421  	}
 422  	return
 423  }
 424  
 425  func (s *Scanner) Number(seenPoint bool) {
 426  	ok := true
 427  	kind := IntLit
 428  	base := int32(10)
 429  	prefix := rune(0)
 430  	digsep := int32(0)
 431  	invalid := int32(-1)
 432  
 433  	if !seenPoint {
 434  		if s.ch == '0' {
 435  			s.nextch()
 436  			switch Lower(s.ch) {
 437  			case 'x':
 438  				s.nextch()
 439  				base, prefix = 16, 'x'
 440  			case 'o':
 441  				s.nextch()
 442  				base, prefix = 8, 'o'
 443  			case 'b':
 444  				s.nextch()
 445  				base, prefix = 2, 'b'
 446  			default:
 447  				base, prefix = 8, '0'
 448  				digsep = 1
 449  			}
 450  		}
 451  		digsep |= s.Digits(base, &invalid)
 452  		if s.ch == '.' {
 453  			if prefix == 'o' || prefix == 'b' {
 454  				s.Errorf("invalid radix point in %s literal", baseName(base))
 455  				ok = false
 456  			}
 457  			s.nextch()
 458  			seenPoint = true
 459  		}
 460  	}
 461  
 462  	if seenPoint {
 463  		kind = FloatLit
 464  		digsep |= s.Digits(base, &invalid)
 465  	}
 466  
 467  	if digsep&1 == 0 && ok {
 468  		s.Errorf("%s literal has no digits", baseName(base))
 469  		ok = false
 470  	}
 471  
 472  	if e := Lower(s.ch); e == 'e' || e == 'p' {
 473  		if ok {
 474  			switch {
 475  			case e == 'e' && prefix != 0 && prefix != '0':
 476  				s.Errorf("%q exponent requires decimal mantissa", s.ch)
 477  				ok = false
 478  			case e == 'p' && prefix != 'x':
 479  				s.Errorf("%q exponent requires hexadecimal mantissa", s.ch)
 480  				ok = false
 481  			}
 482  		}
 483  		s.nextch()
 484  		kind = FloatLit
 485  		if s.ch == '+' || s.ch == '-' {
 486  			s.nextch()
 487  		}
 488  		digsep = s.Digits(10, nil) | digsep&2
 489  		if digsep&1 == 0 && ok {
 490  			s.Errorf("exponent has no digits")
 491  			ok = false
 492  		}
 493  	} else if prefix == 'x' && kind == FloatLit && ok {
 494  		s.Errorf("hexadecimal mantissa requires a 'p' exponent")
 495  		ok = false
 496  	}
 497  
 498  	if s.ch == 'i' {
 499  		kind = ImagLit
 500  		s.nextch()
 501  	}
 502  
 503  	s.SetLit(kind, ok)
 504  
 505  	if kind == IntLit && invalid >= 0 && ok {
 506  		s.ErrorAtf(invalid, "invalid digit %q in %s literal", s.Lit[invalid], baseName(base))
 507  		ok = false
 508  	}
 509  
 510  	if digsep&2 != 0 && ok {
 511  		if i := invalidSep(s.Lit); i >= 0 {
 512  			s.ErrorAtf(i, "'_' must separate successive digits")
 513  			ok = false
 514  		}
 515  	}
 516  
 517  	s.Bad = !ok
 518  }
 519  
 520  func baseName(base int32) string {
 521  	switch base {
 522  	case 2:
 523  		return "binary"
 524  	case 8:
 525  		return "octal"
 526  	case 10:
 527  		return "decimal"
 528  	case 16:
 529  		return "hexadecimal"
 530  	}
 531  	panic("invalid base")
 532  }
 533  
 534  func invalidSep(x string) int32 {
 535  	x1 := ' '
 536  	d := '.'
 537  	i := int32(0)
 538  
 539  	if len(x) >= 2 && x[0] == '0' {
 540  		x1 = Lower(rune(x[1]))
 541  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
 542  			d = '0'
 543  			i = 2
 544  		}
 545  	}
 546  
 547  	for ; i < int32(len(x)); i++ {
 548  		p := d
 549  		d = rune(x[i])
 550  		switch {
 551  		case d == '_':
 552  			if p != '0' {
 553  				return i
 554  			}
 555  		case IsDecimal(d) || x1 == 'x' && IsHex(d):
 556  			d = '0'
 557  		default:
 558  			if p == '_' {
 559  				return i - 1
 560  			}
 561  			d = '.'
 562  		}
 563  	}
 564  	if d == '_' {
 565  		return int32(len(x)) - 1
 566  	}
 567  
 568  	return -1
 569  }
 570  
 571  func (s *Scanner) rune() {
 572  	ok := true
 573  	s.nextch()
 574  
 575  	n := 0
 576  	for ; ; n++ {
 577  		if s.ch == '\'' {
 578  			if ok {
 579  				if n == 0 {
 580  					s.Errorf("empty rune literal or unescaped '")
 581  					ok = false
 582  				} else if n != 1 {
 583  					s.ErrorAtf(0, "more than one character in rune literal")
 584  					ok = false
 585  				}
 586  			}
 587  			s.nextch()
 588  			break
 589  		}
 590  		if s.ch == '\\' {
 591  			s.nextch()
 592  			if !s.escape('\'') {
 593  				ok = false
 594  			}
 595  			continue
 596  		}
 597  		if s.ch == '\n' {
 598  			if ok {
 599  				s.Errorf("newline in rune literal")
 600  				ok = false
 601  			}
 602  			break
 603  		}
 604  		if s.ch < 0 {
 605  			if ok {
 606  				s.ErrorAtf(0, "rune literal not terminated")
 607  				ok = false
 608  			}
 609  			break
 610  		}
 611  		s.nextch()
 612  	}
 613  
 614  	s.SetLit(RuneLit, ok)
 615  }
 616  
 617  func (s *Scanner) stdString() {
 618  	ok := true
 619  	s.nextch()
 620  
 621  	for {
 622  		if s.ch == '"' {
 623  			s.nextch()
 624  			break
 625  		}
 626  		if s.ch == '\\' {
 627  			s.nextch()
 628  			if !s.escape('"') {
 629  				ok = false
 630  			}
 631  			continue
 632  		}
 633  		if s.ch == '\n' {
 634  			s.Errorf("newline in string")
 635  			ok = false
 636  			break
 637  		}
 638  		if s.ch < 0 {
 639  			s.ErrorAtf(0, "string not terminated")
 640  			ok = false
 641  			break
 642  		}
 643  		s.nextch()
 644  	}
 645  
 646  	s.SetLit(StringLit, ok)
 647  }
 648  
 649  func (s *Scanner) rawString() {
 650  	ok := true
 651  	s.nextch()
 652  
 653  	for {
 654  		if s.ch == '`' {
 655  			s.nextch()
 656  			break
 657  		}
 658  		if s.ch < 0 {
 659  			s.ErrorAtf(0, "string not terminated")
 660  			ok = false
 661  			break
 662  		}
 663  		s.nextch()
 664  	}
 665  
 666  	s.SetLit(StringLit, ok)
 667  }
 668  
 669  func (s *Scanner) comment(text string) {
 670  	s.ErrorAtf(0, "%s", text)
 671  }
 672  
 673  func (s *Scanner) skipLine() {
 674  	for s.ch >= 0 && s.ch != '\n' {
 675  		s.nextch()
 676  	}
 677  }
 678  
 679  func (s *Scanner) lineComment() {
 680  	if s.mode&comments != 0 {
 681  		s.skipLine()
 682  		s.comment(string(s.segment()))
 683  		return
 684  	}
 685  
 686  	if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
 687  		s.stop()
 688  		s.skipLine()
 689  		return
 690  	}
 691  
 692  	prefix := "go:"
 693  	if s.ch == 'l' {
 694  		prefix = "line "
 695  	}
 696  
 697  	for _, r := range prefix {
 698  		if s.ch != rune(r) {
 699  			s.stop()
 700  			s.skipLine()
 701  			return
 702  		}
 703  		s.nextch()
 704  	}
 705  	s.skipLine()
 706  	s.comment(string(s.segment()))
 707  }
 708  
 709  func (s *Scanner) skipComment() bool {
 710  	for s.ch >= 0 {
 711  		for s.ch == '*' {
 712  			s.nextch()
 713  			if s.ch == '/' {
 714  				s.nextch()
 715  				return true
 716  			}
 717  		}
 718  		s.nextch()
 719  	}
 720  	s.ErrorAtf(0, "comment not terminated")
 721  	return false
 722  }
 723  
 724  func (s *Scanner) fullComment() {
 725  	if s.mode&comments != 0 {
 726  		if s.skipComment() {
 727  			s.comment(string(s.segment()))
 728  		}
 729  		return
 730  	}
 731  
 732  	if s.mode&directives == 0 || s.ch != 'l' {
 733  		s.stop()
 734  		s.skipComment()
 735  		return
 736  	}
 737  
 738  	const prefix = "line "
 739  
 740  	for _, r := range prefix {
 741  		if s.ch != rune(r) {
 742  			s.stop()
 743  			s.skipComment()
 744  			return
 745  		}
 746  		s.nextch()
 747  	}
 748  	if s.skipComment() {
 749  		s.comment(string(s.segment()))
 750  	}
 751  }
 752  
 753  func (s *Scanner) escape(quote rune) bool {
 754  	var n int32
 755  	var base, max uint32
 756  
 757  	switch s.ch {
 758  	case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
 759  		s.nextch()
 760  		return true
 761  	case '0', '1', '2', '3', '4', '5', '6', '7':
 762  		n, base, max = 3, 8, 255
 763  	case 'x':
 764  		s.nextch()
 765  		n, base, max = 2, 16, 255
 766  	case 'u':
 767  		s.nextch()
 768  		n, base, max = 4, 16, unicode.MaxRune
 769  	case 'U':
 770  		s.nextch()
 771  		n, base, max = 8, 16, unicode.MaxRune
 772  	default:
 773  		if s.ch < 0 {
 774  			return true
 775  		}
 776  		s.Errorf("unknown escape")
 777  		return false
 778  	}
 779  
 780  	var x uint32
 781  	for i := n; i > 0; i-- {
 782  		if s.ch < 0 {
 783  			return true
 784  		}
 785  		d := base
 786  		if IsDecimal(s.ch) {
 787  			d = uint32(s.ch) - '0'
 788  		} else if 'a' <= Lower(s.ch) && Lower(s.ch) <= 'f' {
 789  			d = uint32(Lower(s.ch)) - 'a' + 10
 790  		}
 791  		if d >= base {
 792  			s.Errorf("invalid character %q in %s escape", s.ch, baseName(int32(base)))
 793  			return false
 794  		}
 795  		x = x*base + d
 796  		s.nextch()
 797  	}
 798  
 799  	if x > max && base == 8 {
 800  		s.Errorf("octal escape value %d > 255", x)
 801  		return false
 802  	}
 803  
 804  	if x > max || 0xD800 <= x && x < 0xE000 {
 805  		s.Errorf("escape is invalid Unicode code point %#U", x)
 806  		return false
 807  	}
 808  
 809  	return true
 810  }
 811  
 812  func String(n Node) string {
 813  	return fmt.Sprintf("%T", n)
 814  }
 815  
 816  func StartPos(n Node) Pos {
 817  	return n.Pos()
 818  }
 819