scanner.go raw

   1  package scanner
   2  
   3  import (
   4  	"bytes"
   5  	"fmt"
   6  	"os"
   7  	"unicode"
   8  	"unicode/utf8"
   9  
  10  	"github.com/hashicorp/hcl/json/token"
  11  )
  12  
  // eof is the sentinel rune handed back by the scanner when the underlying
  // buffer cannot supply another rune.
  const eof rune = 0
  15  
  16  // Scanner defines a lexical scanner
  17  type Scanner struct {
  18  	buf *bytes.Buffer // Source buffer for advancing and scanning
  19  	src []byte        // Source buffer for immutable access
  20  
  21  	// Source Position
  22  	srcPos  token.Pos // current position
  23  	prevPos token.Pos // previous position, used for peek() method
  24  
  25  	lastCharLen int // length of last character in bytes
  26  	lastLineLen int // length of last line in characters (for correct column reporting)
  27  
  28  	tokStart int // token text start position
  29  	tokEnd   int // token text end  position
  30  
  31  	// Error is called for each error encountered. If no Error
  32  	// function is set, the error is reported to os.Stderr.
  33  	Error func(pos token.Pos, msg string)
  34  
  35  	// ErrorCount is incremented by one for each error encountered.
  36  	ErrorCount int
  37  
  38  	// tokPos is the start position of most recently scanned token; set by
  39  	// Scan. The Filename field is always left untouched by the Scanner.  If
  40  	// an error is reported (via Error) and Position is invalid, the scanner is
  41  	// not inside a token.
  42  	tokPos token.Pos
  43  }
  44  
  45  // New creates and initializes a new instance of Scanner using src as
  46  // its source content.
  47  func New(src []byte) *Scanner {
  48  	// even though we accept a src, we read from a io.Reader compatible type
  49  	// (*bytes.Buffer). So in the future we might easily change it to streaming
  50  	// read.
  51  	b := bytes.NewBuffer(src)
  52  	s := &Scanner{
  53  		buf: b,
  54  		src: src,
  55  	}
  56  
  57  	// srcPosition always starts with 1
  58  	s.srcPos.Line = 1
  59  	return s
  60  }
  61  
  62  // next reads the next rune from the bufferred reader. Returns the rune(0) if
  63  // an error occurs (or io.EOF is returned).
  64  func (s *Scanner) next() rune {
  65  	ch, size, err := s.buf.ReadRune()
  66  	if err != nil {
  67  		// advance for error reporting
  68  		s.srcPos.Column++
  69  		s.srcPos.Offset += size
  70  		s.lastCharLen = size
  71  		return eof
  72  	}
  73  
  74  	if ch == utf8.RuneError && size == 1 {
  75  		s.srcPos.Column++
  76  		s.srcPos.Offset += size
  77  		s.lastCharLen = size
  78  		s.err("illegal UTF-8 encoding")
  79  		return ch
  80  	}
  81  
  82  	// remember last position
  83  	s.prevPos = s.srcPos
  84  
  85  	s.srcPos.Column++
  86  	s.lastCharLen = size
  87  	s.srcPos.Offset += size
  88  
  89  	if ch == '\n' {
  90  		s.srcPos.Line++
  91  		s.lastLineLen = s.srcPos.Column
  92  		s.srcPos.Column = 0
  93  	}
  94  
  95  	// debug
  96  	// fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
  97  	return ch
  98  }
  99  
 100  // unread unreads the previous read Rune and updates the source position
 101  func (s *Scanner) unread() {
 102  	if err := s.buf.UnreadRune(); err != nil {
 103  		panic(err) // this is user fault, we should catch it
 104  	}
 105  	s.srcPos = s.prevPos // put back last position
 106  }
 107  
 108  // peek returns the next rune without advancing the reader.
 109  func (s *Scanner) peek() rune {
 110  	peek, _, err := s.buf.ReadRune()
 111  	if err != nil {
 112  		return eof
 113  	}
 114  
 115  	s.buf.UnreadRune()
 116  	return peek
 117  }
 118  
 119  // Scan scans the next token and returns the token.
 120  func (s *Scanner) Scan() token.Token {
 121  	ch := s.next()
 122  
 123  	// skip white space
 124  	for isWhitespace(ch) {
 125  		ch = s.next()
 126  	}
 127  
 128  	var tok token.Type
 129  
 130  	// token text markings
 131  	s.tokStart = s.srcPos.Offset - s.lastCharLen
 132  
 133  	// token position, initial next() is moving the offset by one(size of rune
 134  	// actually), though we are interested with the starting point
 135  	s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
 136  	if s.srcPos.Column > 0 {
 137  		// common case: last character was not a '\n'
 138  		s.tokPos.Line = s.srcPos.Line
 139  		s.tokPos.Column = s.srcPos.Column
 140  	} else {
 141  		// last character was a '\n'
 142  		// (we cannot be at the beginning of the source
 143  		// since we have called next() at least once)
 144  		s.tokPos.Line = s.srcPos.Line - 1
 145  		s.tokPos.Column = s.lastLineLen
 146  	}
 147  
 148  	switch {
 149  	case isLetter(ch):
 150  		lit := s.scanIdentifier()
 151  		if lit == "true" || lit == "false" {
 152  			tok = token.BOOL
 153  		} else if lit == "null" {
 154  			tok = token.NULL
 155  		} else {
 156  			s.err("illegal char")
 157  		}
 158  	case isDecimal(ch):
 159  		tok = s.scanNumber(ch)
 160  	default:
 161  		switch ch {
 162  		case eof:
 163  			tok = token.EOF
 164  		case '"':
 165  			tok = token.STRING
 166  			s.scanString()
 167  		case '.':
 168  			tok = token.PERIOD
 169  			ch = s.peek()
 170  			if isDecimal(ch) {
 171  				tok = token.FLOAT
 172  				ch = s.scanMantissa(ch)
 173  				ch = s.scanExponent(ch)
 174  			}
 175  		case '[':
 176  			tok = token.LBRACK
 177  		case ']':
 178  			tok = token.RBRACK
 179  		case '{':
 180  			tok = token.LBRACE
 181  		case '}':
 182  			tok = token.RBRACE
 183  		case ',':
 184  			tok = token.COMMA
 185  		case ':':
 186  			tok = token.COLON
 187  		case '-':
 188  			if isDecimal(s.peek()) {
 189  				ch := s.next()
 190  				tok = s.scanNumber(ch)
 191  			} else {
 192  				s.err("illegal char")
 193  			}
 194  		default:
 195  			s.err("illegal char: " + string(ch))
 196  		}
 197  	}
 198  
 199  	// finish token ending
 200  	s.tokEnd = s.srcPos.Offset
 201  
 202  	// create token literal
 203  	var tokenText string
 204  	if s.tokStart >= 0 {
 205  		tokenText = string(s.src[s.tokStart:s.tokEnd])
 206  	}
 207  	s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
 208  
 209  	return token.Token{
 210  		Type: tok,
 211  		Pos:  s.tokPos,
 212  		Text: tokenText,
 213  	}
 214  }
 215  
 216  // scanNumber scans a HCL number definition starting with the given rune
 217  func (s *Scanner) scanNumber(ch rune) token.Type {
 218  	zero := ch == '0'
 219  	pos := s.srcPos
 220  
 221  	s.scanMantissa(ch)
 222  	ch = s.next() // seek forward
 223  	if ch == 'e' || ch == 'E' {
 224  		ch = s.scanExponent(ch)
 225  		return token.FLOAT
 226  	}
 227  
 228  	if ch == '.' {
 229  		ch = s.scanFraction(ch)
 230  		if ch == 'e' || ch == 'E' {
 231  			ch = s.next()
 232  			ch = s.scanExponent(ch)
 233  		}
 234  		return token.FLOAT
 235  	}
 236  
 237  	if ch != eof {
 238  		s.unread()
 239  	}
 240  
 241  	// If we have a larger number and this is zero, error
 242  	if zero && pos != s.srcPos {
 243  		s.err("numbers cannot start with 0")
 244  	}
 245  
 246  	return token.NUMBER
 247  }
 248  
 249  // scanMantissa scans the mantissa beginning from the rune. It returns the next
 250  // non decimal rune. It's used to determine wheter it's a fraction or exponent.
 251  func (s *Scanner) scanMantissa(ch rune) rune {
 252  	scanned := false
 253  	for isDecimal(ch) {
 254  		ch = s.next()
 255  		scanned = true
 256  	}
 257  
 258  	if scanned && ch != eof {
 259  		s.unread()
 260  	}
 261  	return ch
 262  }
 263  
 264  // scanFraction scans the fraction after the '.' rune
 265  func (s *Scanner) scanFraction(ch rune) rune {
 266  	if ch == '.' {
 267  		ch = s.peek() // we peek just to see if we can move forward
 268  		ch = s.scanMantissa(ch)
 269  	}
 270  	return ch
 271  }
 272  
 273  // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
 274  // rune.
 275  func (s *Scanner) scanExponent(ch rune) rune {
 276  	if ch == 'e' || ch == 'E' {
 277  		ch = s.next()
 278  		if ch == '-' || ch == '+' {
 279  			ch = s.next()
 280  		}
 281  		ch = s.scanMantissa(ch)
 282  	}
 283  	return ch
 284  }
 285  
 286  // scanString scans a quoted string
 287  func (s *Scanner) scanString() {
 288  	braces := 0
 289  	for {
 290  		// '"' opening already consumed
 291  		// read character after quote
 292  		ch := s.next()
 293  
 294  		if ch == '\n' || ch < 0 || ch == eof {
 295  			s.err("literal not terminated")
 296  			return
 297  		}
 298  
 299  		if ch == '"' {
 300  			break
 301  		}
 302  
 303  		// If we're going into a ${} then we can ignore quotes for awhile
 304  		if braces == 0 && ch == '$' && s.peek() == '{' {
 305  			braces++
 306  			s.next()
 307  		} else if braces > 0 && ch == '{' {
 308  			braces++
 309  		}
 310  		if braces > 0 && ch == '}' {
 311  			braces--
 312  		}
 313  
 314  		if ch == '\\' {
 315  			s.scanEscape()
 316  		}
 317  	}
 318  
 319  	return
 320  }
 321  
 322  // scanEscape scans an escape sequence
 323  func (s *Scanner) scanEscape() rune {
 324  	// http://en.cppreference.com/w/cpp/language/escape
 325  	ch := s.next() // read character after '/'
 326  	switch ch {
 327  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
 328  		// nothing to do
 329  	case '0', '1', '2', '3', '4', '5', '6', '7':
 330  		// octal notation
 331  		ch = s.scanDigits(ch, 8, 3)
 332  	case 'x':
 333  		// hexademical notation
 334  		ch = s.scanDigits(s.next(), 16, 2)
 335  	case 'u':
 336  		// universal character name
 337  		ch = s.scanDigits(s.next(), 16, 4)
 338  	case 'U':
 339  		// universal character name
 340  		ch = s.scanDigits(s.next(), 16, 8)
 341  	default:
 342  		s.err("illegal char escape")
 343  	}
 344  	return ch
 345  }
 346  
 347  // scanDigits scans a rune with the given base for n times. For example an
 348  // octal notation \184 would yield in scanDigits(ch, 8, 3)
 349  func (s *Scanner) scanDigits(ch rune, base, n int) rune {
 350  	for n > 0 && digitVal(ch) < base {
 351  		ch = s.next()
 352  		n--
 353  	}
 354  	if n > 0 {
 355  		s.err("illegal char escape")
 356  	}
 357  
 358  	// we scanned all digits, put the last non digit char back
 359  	s.unread()
 360  	return ch
 361  }
 362  
 363  // scanIdentifier scans an identifier and returns the literal string
 364  func (s *Scanner) scanIdentifier() string {
 365  	offs := s.srcPos.Offset - s.lastCharLen
 366  	ch := s.next()
 367  	for isLetter(ch) || isDigit(ch) || ch == '-' {
 368  		ch = s.next()
 369  	}
 370  
 371  	if ch != eof {
 372  		s.unread() // we got identifier, put back latest char
 373  	}
 374  
 375  	return string(s.src[offs:s.srcPos.Offset])
 376  }
 377  
 378  // recentPosition returns the position of the character immediately after the
 379  // character or token returned by the last call to Scan.
 380  func (s *Scanner) recentPosition() (pos token.Pos) {
 381  	pos.Offset = s.srcPos.Offset - s.lastCharLen
 382  	switch {
 383  	case s.srcPos.Column > 0:
 384  		// common case: last character was not a '\n'
 385  		pos.Line = s.srcPos.Line
 386  		pos.Column = s.srcPos.Column
 387  	case s.lastLineLen > 0:
 388  		// last character was a '\n'
 389  		// (we cannot be at the beginning of the source
 390  		// since we have called next() at least once)
 391  		pos.Line = s.srcPos.Line - 1
 392  		pos.Column = s.lastLineLen
 393  	default:
 394  		// at the beginning of the source
 395  		pos.Line = 1
 396  		pos.Column = 1
 397  	}
 398  	return
 399  }
 400  
 401  // err prints the error of any scanning to s.Error function. If the function is
 402  // not defined, by default it prints them to os.Stderr
 403  func (s *Scanner) err(msg string) {
 404  	s.ErrorCount++
 405  	pos := s.recentPosition()
 406  
 407  	if s.Error != nil {
 408  		s.Error(pos, msg)
 409  		return
 410  	}
 411  
 412  	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 413  }
 414  
 // isLetter returns true if the given rune is an ASCII letter, an underscore,
 // or a non-ASCII rune that Unicode classifies as a letter.
 func isLetter(ch rune) bool {
 	switch {
 	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
 		return true
 	case ch >= 0x80:
 		return unicode.IsLetter(ch)
 	}
 	return false
 }
 419  
 // isDigit returns true if the given rune is an ASCII decimal digit or a
 // non-ASCII rune that Unicode classifies as a digit.
 func isDigit(ch rune) bool {
 	if ch < 0x80 {
 		return '0' <= ch && ch <= '9'
 	}
 	return unicode.IsDigit(ch)
 }
 424  
 // isDecimal returns true if the given rune is one of the ASCII digits '0'
 // through '9'.
 func isDecimal(ch rune) bool {
 	return !(ch < '0' || ch > '9')
 }
 429  
 // isHexadecimal returns true if the given rune is a hexadecimal digit:
 // '0'-'9', 'a'-'f' or 'A'-'F'.
 func isHexadecimal(ch rune) bool {
 	switch {
 	case '0' <= ch && ch <= '9', 'a' <= ch && ch <= 'f', 'A' <= ch && ch <= 'F':
 		return true
 	}
 	return false
 }
 434  
 // isWhitespace returns true if the rune is a space, tab, newline or carriage
 // return.
 func isWhitespace(ch rune) bool {
 	switch ch {
 	case ' ', '\t', '\n', '\r':
 		return true
 	}
 	return false
 }
 439  
 // digitVal returns the integer value of an octal, decimal or hexadecimal
 // digit rune. Any other rune yields 16, which is larger than every legal
 // digit value.
 func digitVal(ch rune) int {
 	if '0' <= ch && ch <= '9' {
 		return int(ch - '0')
 	}
 	if 'a' <= ch && ch <= 'f' {
 		return int(ch-'a') + 10
 	}
 	if 'A' <= ch && ch <= 'F' {
 		return int(ch-'A') + 10
 	}
 	return 16
 }
 452