source.go raw

   1  package syntax
   2  
   3  import (
   4  	"io"
   5  	"unicode/utf8"
   6  )
   7  
   8  
   9  // The source buffer is accessed using three indices b (begin),
  10  // r (read), and e (end):
  11  //
  12  // - If b >= 0, it points to the beginning of a segment of most
  13  //   recently read characters (typically a Go literal).
  14  //
  15  // - r points to the byte immediately following the most recently
  16  //   read character ch, which starts at r-chw.
  17  //
  18  // - e points to the byte immediately following the last byte that
  19  //   was read into the buffer.
  20  //
  21  // The buffer content is terminated at buf[e] with the sentinel
  22  // character utf8.RuneSelf. This makes it possible to test for
  23  // the common case of ASCII characters with a single 'if' (see
  24  // nextch method).
  25  //
  26  //                +------ content in use -------+
  27  //                v                             v
  28  // buf [...read...|...segment...|ch|...unread...|s|...free...]
  29  //                ^             ^  ^            ^
  30  //                |             |  |            |
  31  //                b         r-chw  r            e
  32  //
  33  // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
  34  
  35  type Source struct {
  36  	in   io.Reader
  37  	errh func(line, col uint32, msg string)
  38  
  39  	buf       []byte // source buffer
  40  	ioerr     error  // pending I/O error, or nil
  41  	b, r, e   int32    // buffer indices (see comment above)
  42  	line, col uint32   // source position of ch (0-based)
  43  	ch        rune   // most recently read character
  44  	chw       int32    // width of ch
  45  }
  46  
  47  const sentinel = utf8.RuneSelf
  48  
  49  func (s *Source) init(in io.Reader, errh func(line, col uint32, msg string)) {
  50  	s.in = in
  51  	s.errh = errh
  52  
  53  	if s.buf == nil {
  54  		s.buf = make([]byte, nextSize(0))
  55  	}
  56  	s.buf[0] = sentinel
  57  	s.ioerr = nil
  58  	s.b, s.r, s.e = -1, 0, 0
  59  	s.line, s.col = 0, 0
  60  	s.ch = ' '
  61  	s.chw = 0
  62  }
  63  
  64  // starting points for line and column numbers
  65  const Linebase = 1
  66  const Colbase = 1
  67  
  68  // pos returns the (line, col) source position of s.ch.
  69  func (s *Source) pos() (line, col uint32) {
  70  	return Linebase + s.line, Colbase + s.col
  71  }
  72  
  73  // Debugpos returns the (line, col) source position of s.ch.
  74  func (s *Source) Debugpos() (line, col uint32) {
  75  	return Linebase + s.line, Colbase + s.col
  76  }
  77  
  78  // error reports the error msg at source position s.pos().
  79  func (s *Source) error(msg string) {
  80  	line, col := s.pos()
  81  	s.errh(line, col, msg)
  82  }
  83  
  84  // start starts a new active source segment (including s.ch).
  85  // As long as stop has not been called, the active segment's
  86  // bytes (excluding s.ch) may be retrieved by calling segment.
  87  func (s *Source) start()          { s.b = s.r - s.chw }
  88  func (s *Source) stop()           { s.b = -1 }
  89  func (s *Source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
  90  
  91  // rewind rewinds the scanner's read position and character s.ch
  92  // to the start of the currently active segment, which must not
  93  // contain any newlines (otherwise position information will be
  94  // incorrect). Currently, rewind is only needed for handling the
  95  // source sequence ".."; it must not be called outside an active
  96  // segment.
  97  func (s *Source) rewind() {
  98  	// ok to verify precondition - rewind is rarely called
  99  	if s.b < 0 {
 100  		panic("no active segment")
 101  	}
 102  	s.col -= uint32(s.r - s.b)
 103  	s.r = s.b
 104  	s.nextch()
 105  }
 106  
 107  func (s *Source) nextch() {
 108  redo:
 109  	s.col += uint32(s.chw)
 110  	if s.ch == '\n' {
 111  		s.line++
 112  		s.col = 0
 113  	}
 114  
 115  	// fast common case: at least one ASCII character
 116  	if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
 117  		s.r++
 118  		s.chw = 1
 119  		if s.ch == 0 {
 120  			s.error("invalid NUL character")
 121  			goto redo
 122  		}
 123  		return
 124  	}
 125  
 126  	// slower general case: add more bytes to buffer if we don't have a full rune
 127  	for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
 128  		s.fill()
 129  	}
 130  
 131  	// EOF
 132  	if s.r == s.e {
 133  		if s.ioerr != io.EOF {
 134  			// ensure we never start with a '/' (e.g., rooted path) in the error message
 135  			s.error("I/O error: " + s.ioerr.Error())
 136  			s.ioerr = nil
 137  		}
 138  		s.ch = -1
 139  		s.chw = 0
 140  		return
 141  	}
 142  
 143  	var w int
 144  	s.ch, w = utf8.DecodeRune(s.buf[s.r:s.e])
 145  	s.chw = int32(w)
 146  	s.r += s.chw
 147  
 148  	if s.ch == utf8.RuneError && s.chw == 1 {
 149  		s.error("invalid UTF-8 encoding")
 150  		goto redo
 151  	}
 152  
 153  	// BOM's are only allowed as the first character in a file
 154  	const BOM = 0xfeff
 155  	if s.ch == BOM {
 156  		if s.line > 0 || s.col > 0 {
 157  			s.error("invalid BOM in the middle of the file")
 158  		}
 159  		goto redo
 160  	}
 161  }
 162  
 163  // fill reads more source bytes into s.buf.
 164  // It returns with at least one more byte in the buffer, or with s.ioerr != nil.
 165  func (s *Source) fill() {
 166  	// determine content to preserve
 167  	b := s.r
 168  	if s.b >= 0 {
 169  		b = s.b
 170  		s.b = 0 // after buffer has grown or content has been moved down
 171  	}
 172  	content := s.buf[b:s.e]
 173  
 174  	// grow buffer or move content down
 175  	if len(content)*2 > len(s.buf) {
 176  		s.buf = make([]byte, nextSize(int32(len(s.buf))))
 177  		copy(s.buf, content)
 178  	} else if b > 0 {
 179  		copy(s.buf, content)
 180  	}
 181  	s.r -= b
 182  	s.e -= b
 183  
 184  	// read more data: try a limited number of times
 185  	for i := 0; i < 10; i++ {
 186  		var n int32
 187  		var nn int
 188  		nn, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1])
 189  		n = int32(nn) // -1 to leave space for sentinel
 190  		if n < 0 {
 191  			panic("negative read") // incorrect underlying io.Reader implementation
 192  		}
 193  		if n > 0 || s.ioerr != nil {
 194  			s.e += n
 195  			s.buf[s.e] = sentinel
 196  			return
 197  		}
 198  		// n == 0
 199  	}
 200  
 201  	s.buf[s.e] = sentinel
 202  	s.ioerr = io.ErrNoProgress
 203  }
 204  
 205  // nextSize returns the next bigger size for a buffer of a given size.
 206  func nextSize(size int32) int32 {
 207  	const min = 4 << 10 // 4K: minimum buffer size
 208  	const max = 1 << 20 // 1M: maximum buffer size which is still doubled
 209  	if size < min {
 210  		return min
 211  	}
 212  	if size <= max {
 213  		return size << 1
 214  	}
 215  	return size + max
 216  }
 217