1 package syntax
2 3 import (
4 "io"
5 "unicode/utf8"
6 )
7 8 9 // The source buffer is accessed using three indices b (begin),
10 // r (read), and e (end):
11 //
12 // - If b >= 0, it points to the beginning of a segment of most
13 // recently read characters (typically a Go literal).
14 //
15 // - r points to the byte immediately following the most recently
16 // read character ch, which starts at r-chw.
17 //
18 // - e points to the byte immediately following the last byte that
19 // was read into the buffer.
20 //
21 // The buffer content is terminated at buf[e] with the sentinel
22 // character utf8.RuneSelf. This makes it possible to test for
23 // the common case of ASCII characters with a single 'if' (see
24 // nextch method).
25 //
//	               +------ content in use -------+
//	               v                             v
//	buf [...read...|...segment...|ch|...unread...|s|...free...]
//	               ^             ^  ^            ^
//	               |             |  |            |
//	               b         r-chw  r            e
32 //
33 // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
// Source reads characters one at a time from an io.Reader through an
// internal buffer, tracking the position of the current character and
// reporting problems through an error handler.
type Source struct {
	in   io.Reader                          // input the buffer is filled from
	errh func(line, col uint32, msg string) // handler invoked by the error method

	buf   []byte // source buffer
	ioerr error  // pending I/O error, or nil

	// buffer indices
	b int32 // start of the active segment, or < 0 if there is none
	r int32 // read position: the byte immediately following ch
	e int32 // end of buffered content; buf[e] holds the sentinel

	line uint32 // line of ch (0-based)
	col  uint32 // column of ch (0-based, counted in bytes)
	ch   rune   // most recently read character
	chw  int32  // width of ch in bytes
}
// sentinel is the byte stored at buf[e] to terminate the buffered
// content. It equals utf8.RuneSelf, so any buffer byte below it can be
// treated as a complete single-byte character by nextch.
const sentinel = utf8.RuneSelf
48 49 func (s *Source) init(in io.Reader, errh func(line, col uint32, msg string)) {
50 s.in = in
51 s.errh = errh
52 53 if s.buf == nil {
54 s.buf = make([]byte, nextSize(0))
55 }
56 s.buf[0] = sentinel
57 s.ioerr = nil
58 s.b, s.r, s.e = -1, 0, 0
59 s.line, s.col = 0, 0
60 s.ch = ' '
61 s.chw = 0
62 }
// Linebase and Colbase are the starting points for line and column
// numbers: they are added to the 0-based internal line and column
// counters when a position is reported.
const (
	Linebase = 1
	Colbase  = 1
)
67 68 // pos returns the (line, col) source position of s.ch.
69 func (s *Source) pos() (line, col uint32) {
70 return Linebase + s.line, Colbase + s.col
71 }
72 73 // Debugpos returns the (line, col) source position of s.ch.
74 func (s *Source) Debugpos() (line, col uint32) {
75 return Linebase + s.line, Colbase + s.col
76 }
77 78 // error reports the error msg at source position s.pos().
79 func (s *Source) error(msg string) {
80 line, col := s.pos()
81 s.errh(line, col, msg)
82 }
83 84 // start starts a new active source segment (including s.ch).
85 // As long as stop has not been called, the active segment's
86 // bytes (excluding s.ch) may be retrieved by calling segment.
87 func (s *Source) start() { s.b = s.r - s.chw }
88 func (s *Source) stop() { s.b = -1 }
89 func (s *Source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
90 91 // rewind rewinds the scanner's read position and character s.ch
92 // to the start of the currently active segment, which must not
93 // contain any newlines (otherwise position information will be
94 // incorrect). Currently, rewind is only needed for handling the
95 // source sequence ".."; it must not be called outside an active
96 // segment.
97 func (s *Source) rewind() {
98 // ok to verify precondition - rewind is rarely called
99 if s.b < 0 {
100 panic("no active segment")
101 }
102 s.col -= uint32(s.r - s.b)
103 s.r = s.b
104 s.nextch()
105 }
106 107 func (s *Source) nextch() {
108 redo:
109 s.col += uint32(s.chw)
110 if s.ch == '\n' {
111 s.line++
112 s.col = 0
113 }
114 115 // fast common case: at least one ASCII character
116 if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
117 s.r++
118 s.chw = 1
119 if s.ch == 0 {
120 s.error("invalid NUL character")
121 goto redo
122 }
123 return
124 }
125 126 // slower general case: add more bytes to buffer if we don't have a full rune
127 for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
128 s.fill()
129 }
130 131 // EOF
132 if s.r == s.e {
133 if s.ioerr != io.EOF {
134 // ensure we never start with a '/' (e.g., rooted path) in the error message
135 s.error("I/O error: " + s.ioerr.Error())
136 s.ioerr = nil
137 }
138 s.ch = -1
139 s.chw = 0
140 return
141 }
142 143 var w int
144 s.ch, w = utf8.DecodeRune(s.buf[s.r:s.e])
145 s.chw = int32(w)
146 s.r += s.chw
147 148 if s.ch == utf8.RuneError && s.chw == 1 {
149 s.error("invalid UTF-8 encoding")
150 goto redo
151 }
152 153 // BOM's are only allowed as the first character in a file
154 const BOM = 0xfeff
155 if s.ch == BOM {
156 if s.line > 0 || s.col > 0 {
157 s.error("invalid BOM in the middle of the file")
158 }
159 goto redo
160 }
161 }
162 163 // fill reads more source bytes into s.buf.
164 // It returns with at least one more byte in the buffer, or with s.ioerr != nil.
165 func (s *Source) fill() {
166 // determine content to preserve
167 b := s.r
168 if s.b >= 0 {
169 b = s.b
170 s.b = 0 // after buffer has grown or content has been moved down
171 }
172 content := s.buf[b:s.e]
173 174 // grow buffer or move content down
175 if len(content)*2 > len(s.buf) {
176 s.buf = make([]byte, nextSize(int32(len(s.buf))))
177 copy(s.buf, content)
178 } else if b > 0 {
179 copy(s.buf, content)
180 }
181 s.r -= b
182 s.e -= b
183 184 // read more data: try a limited number of times
185 for i := 0; i < 10; i++ {
186 var n int32
187 var nn int
188 nn, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1])
189 n = int32(nn) // -1 to leave space for sentinel
190 if n < 0 {
191 panic("negative read") // incorrect underlying io.Reader implementation
192 }
193 if n > 0 || s.ioerr != nil {
194 s.e += n
195 s.buf[s.e] = sentinel
196 return
197 }
198 // n == 0
199 }
200 201 s.buf[s.e] = sentinel
202 s.ioerr = io.ErrNoProgress
203 }
// nextSize returns the next bigger size for a buffer of a given size:
// the minimum size for anything smaller than that, double the size up
// to and including the doubling limit, and a fixed increment beyond it.
func nextSize(size int32) int32 {
	const (
		minSize     = 4 << 10 // 4K: minimum buffer size
		doubleLimit = 1 << 20 // 1M: maximum buffer size which is still doubled
	)
	switch {
	case size < minSize:
		return minSize
	case size <= doubleLimit:
		return size * 2
	default:
		return size + doubleLimit
	}
}
217