scanner.go raw
1 package unstable
2
3 import "github.com/pelletier/go-toml/v2/internal/characters"
4
// scanFollows reports whether b begins with the bytes of pattern.
//
// It returns false when b is shorter than pattern. An empty pattern matches
// any input, including a nil slice.
func scanFollows(b []byte, pattern string) bool {
	if len(b) < len(pattern) {
		return false
	}

	prefix := b[:len(pattern)]

	return string(prefix) == pattern
}
10
11 func scanFollowsMultilineBasicStringDelimiter(b []byte) bool {
12 return scanFollows(b, `"""`)
13 }
14
15 func scanFollowsMultilineLiteralStringDelimiter(b []byte) bool {
16 return scanFollows(b, `'''`)
17 }
18
19 func scanFollowsTrue(b []byte) bool {
20 return scanFollows(b, `true`)
21 }
22
23 func scanFollowsFalse(b []byte) bool {
24 return scanFollows(b, `false`)
25 }
26
27 func scanFollowsInf(b []byte) bool {
28 return scanFollows(b, `inf`)
29 }
30
31 func scanFollowsNan(b []byte) bool {
32 return scanFollows(b, `nan`)
33 }
34
35 func scanUnquotedKey(b []byte) ([]byte, []byte) {
36 // unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _
37 for i := 0; i < len(b); i++ {
38 if !isUnquotedKeyChar(b[i]) {
39 return b[:i], b[i:]
40 }
41 }
42
43 return b, b[len(b):]
44 }
45
// isUnquotedKeyChar reports whether r is an ASCII letter, an ASCII digit, a
// hyphen, or an underscore.
func isUnquotedKeyChar(r byte) bool {
	switch {
	case 'A' <= r && r <= 'Z':
		return true
	case 'a' <= r && r <= 'z':
		return true
	case '0' <= r && r <= '9':
		return true
	case r == '-', r == '_':
		return true
	default:
		return false
	}
}
49
50 func scanLiteralString(b []byte) ([]byte, []byte, error) {
51 // literal-string = apostrophe *literal-char apostrophe
52 // apostrophe = %x27 ; ' apostrophe
53 // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
54 for i := 1; i < len(b); {
55 switch b[i] {
56 case '\'':
57 return b[:i+1], b[i+1:], nil
58 case '\n', '\r':
59 return nil, nil, NewParserError(b[i:i+1], "literal strings cannot have new lines")
60 }
61 size := characters.Utf8ValidNext(b[i:])
62 if size == 0 {
63 return nil, nil, NewParserError(b[i:i+1], "invalid character")
64 }
65 i += size
66 }
67
68 return nil, nil, NewParserError(b[len(b):], "unterminated literal string")
69 }
70
71 func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
72 // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
73 // ml-literal-string-delim
74 // ml-literal-string-delim = 3apostrophe
75 // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
76 //
77 // mll-content = mll-char / newline
78 // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
79 // mll-quotes = 1*2apostrophe
80 for i := 3; i < len(b); {
81 switch b[i] {
82 case '\'':
83 if scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
84 i += 3
85
86 // At that point we found 3 apostrophe, and i is the
87 // index of the byte after the third one. The scanner
88 // needs to be eager, because there can be an extra 2
89 // apostrophe that can be accepted at the end of the
90 // string.
91
92 if i >= len(b) || b[i] != '\'' {
93 return b[:i], b[i:], nil
94 }
95 i++
96
97 if i >= len(b) || b[i] != '\'' {
98 return b[:i], b[i:], nil
99 }
100 i++
101
102 if i < len(b) && b[i] == '\'' {
103 return nil, nil, NewParserError(b[i-3:i+1], "''' not allowed in multiline literal string")
104 }
105
106 return b[:i], b[i:], nil
107 }
108 case '\r':
109 if len(b) < i+2 {
110 return nil, nil, NewParserError(b[len(b):], `need a \n after \r`)
111 }
112 if b[i+1] != '\n' {
113 return nil, nil, NewParserError(b[i:i+2], `need a \n after \r`)
114 }
115 i += 2 // skip the \n
116 continue
117 }
118 size := characters.Utf8ValidNext(b[i:])
119 if size == 0 {
120 return nil, nil, NewParserError(b[i:i+1], "invalid character")
121 }
122 i += size
123 }
124
125 return nil, nil, NewParserError(b[len(b):], `multiline literal string not terminated by '''`)
126 }
127
128 func scanWindowsNewline(b []byte) ([]byte, []byte, error) {
129 const lenCRLF = 2
130 if len(b) < lenCRLF {
131 return nil, nil, NewParserError(b, "windows new line expected")
132 }
133
134 if b[1] != '\n' {
135 return nil, nil, NewParserError(b, `windows new line should be \r\n`)
136 }
137
138 return b[:lenCRLF], b[lenCRLF:], nil
139 }
140
// scanWhitespace splits b after its leading run of spaces and tabs.
//
// The first result is that run (possibly empty) and the second is the rest
// of the input. Both results are sub-slices of b; nothing is copied.
func scanWhitespace(b []byte) ([]byte, []byte) {
	n := 0
	for n < len(b) && (b[n] == ' ' || b[n] == '\t') {
		n++
	}

	return b[:n], b[n:]
}
153
154 func scanComment(b []byte) ([]byte, []byte, error) {
155 // comment-start-symbol = %x23 ; #
156 // non-ascii = %x80-D7FF / %xE000-10FFFF
157 // non-eol = %x09 / %x20-7F / non-ascii
158 //
159 // comment = comment-start-symbol *non-eol
160
161 for i := 1; i < len(b); {
162 if b[i] == '\n' {
163 return b[:i], b[i:], nil
164 }
165 if b[i] == '\r' {
166 if i+1 < len(b) && b[i+1] == '\n' {
167 return b[:i+1], b[i+1:], nil
168 }
169 return nil, nil, NewParserError(b[i:i+1], "invalid character in comment")
170 }
171 size := characters.Utf8ValidNext(b[i:])
172 if size == 0 {
173 return nil, nil, NewParserError(b[i:i+1], "invalid character in comment")
174 }
175
176 i += size
177 }
178
179 return b, b[len(b):], nil
180 }
181
182 func scanBasicString(b []byte) ([]byte, bool, []byte, error) {
183 // basic-string = quotation-mark *basic-char quotation-mark
184 // quotation-mark = %x22 ; "
185 // basic-char = basic-unescaped / escaped
186 // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
187 // escaped = escape escape-seq-char
188 escaped := false
189 i := 1
190
191 for ; i < len(b); i++ {
192 switch b[i] {
193 case '"':
194 return b[:i+1], escaped, b[i+1:], nil
195 case '\n', '\r':
196 return nil, escaped, nil, NewParserError(b[i:i+1], "basic strings cannot have new lines")
197 case '\\':
198 if len(b) < i+2 {
199 return nil, escaped, nil, NewParserError(b[i:i+1], "need a character after \\")
200 }
201 escaped = true
202 i++ // skip the next character
203 }
204 }
205
206 return nil, escaped, nil, NewParserError(b[len(b):], `basic string not terminated by "`)
207 }
208
209 func scanMultilineBasicString(b []byte) ([]byte, bool, []byte, error) {
210 // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
211 // ml-basic-string-delim
212 // ml-basic-string-delim = 3quotation-mark
213 // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
214 //
215 // mlb-content = mlb-char / newline / mlb-escaped-nl
216 // mlb-char = mlb-unescaped / escaped
217 // mlb-quotes = 1*2quotation-mark
218 // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
219 // mlb-escaped-nl = escape ws newline *( wschar / newline )
220
221 escaped := false
222 i := 3
223
224 for ; i < len(b); i++ {
225 switch b[i] {
226 case '"':
227 if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
228 i += 3
229
230 // At that point we found 3 apostrophe, and i is the
231 // index of the byte after the third one. The scanner
232 // needs to be eager, because there can be an extra 2
233 // apostrophe that can be accepted at the end of the
234 // string.
235
236 if i >= len(b) || b[i] != '"' {
237 return b[:i], escaped, b[i:], nil
238 }
239 i++
240
241 if i >= len(b) || b[i] != '"' {
242 return b[:i], escaped, b[i:], nil
243 }
244 i++
245
246 if i < len(b) && b[i] == '"' {
247 return nil, escaped, nil, NewParserError(b[i-3:i+1], `""" not allowed in multiline basic string`)
248 }
249
250 return b[:i], escaped, b[i:], nil
251 }
252 case '\\':
253 if len(b) < i+2 {
254 return nil, escaped, nil, NewParserError(b[len(b):], "need a character after \\")
255 }
256 escaped = true
257 i++ // skip the next character
258 case '\r':
259 if len(b) < i+2 {
260 return nil, escaped, nil, NewParserError(b[len(b):], `need a \n after \r`)
261 }
262 if b[i+1] != '\n' {
263 return nil, escaped, nil, NewParserError(b[i:i+2], `need a \n after \r`)
264 }
265 i++ // skip the \n
266 }
267 }
268
269 return nil, escaped, nil, NewParserError(b[len(b):], `multiline basic string not terminated by """`)
270 }
271