scanner.go raw
1 package scanner
2
3 import (
4 "io"
5 "strings"
6
7 "github.com/goccy/go-yaml/token"
8 "golang.org/x/xerrors"
9 )
10
// IndentState describes how the indentation of the line being scanned
// relates to the indentation remembered from the previous line.
type IndentState int

// The possible IndentState values. The numeric values are 0 through 3 in
// declaration order.
const (
	// IndentStateEqual means the indent equals the previous indent.
	IndentStateEqual IndentState = 0
	// IndentStateUp means the indent is deeper than the previous indent.
	IndentStateUp IndentState = 1
	// IndentStateDown means the indent is shallower than the previous indent.
	IndentStateDown IndentState = 2
	// IndentStateKeep means the current character is not an indent token
	// (it is not the first character of its line), so no comparison is made.
	IndentStateKeep IndentState = 3
)
24
25 // Scanner holds the scanner's internal state while processing a given text.
26 // It can be allocated as part of another data structure but must be initialized via Init before use.
27 type Scanner struct {
28 source []rune
29 sourcePos int
30 sourceSize int
31 line int
32 column int
33 offset int
34 prevIndentLevel int
35 prevIndentNum int
36 prevIndentColumn int
37 docStartColumn int
38 indentLevel int
39 indentNum int
40 isFirstCharAtLine bool
41 isAnchor bool
42 startedFlowSequenceNum int
43 startedFlowMapNum int
44 indentState IndentState
45 savedPos *token.Position
46 }
47
48 func (s *Scanner) pos() *token.Position {
49 return &token.Position{
50 Line: s.line,
51 Column: s.column,
52 Offset: s.offset,
53 IndentNum: s.indentNum,
54 IndentLevel: s.indentLevel,
55 }
56 }
57
58 func (s *Scanner) bufferedToken(ctx *Context) *token.Token {
59 if s.savedPos != nil {
60 tk := ctx.bufferedToken(s.savedPos)
61 s.savedPos = nil
62 return tk
63 }
64 line := s.line
65 column := s.column - len(ctx.buf)
66 level := s.indentLevel
67 if ctx.isSaveIndentMode() {
68 line -= s.newLineCount(ctx.buf)
69 column = strings.Index(string(ctx.obuf), string(ctx.buf)) + 1
70 // Since we are in a literal, folded or raw folded
71 // we can use the indent level from the last token.
72 last := ctx.lastToken()
73 if last != nil { // The last token should never be nil here.
74 level = last.Position.IndentLevel + 1
75 }
76 }
77 return ctx.bufferedToken(&token.Position{
78 Line: line,
79 Column: column,
80 Offset: s.offset - len(ctx.buf),
81 IndentNum: s.indentNum,
82 IndentLevel: level,
83 })
84 }
85
86 func (s *Scanner) progressColumn(ctx *Context, num int) {
87 s.column += num
88 s.offset += num
89 ctx.progress(num)
90 }
91
92 func (s *Scanner) progressLine(ctx *Context) {
93 s.column = 1
94 s.line++
95 s.offset++
96 s.indentNum = 0
97 s.isFirstCharAtLine = true
98 s.isAnchor = false
99 ctx.progress(1)
100 }
101
102 func (s *Scanner) isNeededKeepPreviousIndentNum(ctx *Context, c rune) bool {
103 if !s.isChangedToIndentStateUp() {
104 return false
105 }
106 if ctx.isDocument() {
107 return true
108 }
109 if c == '-' && ctx.existsBuffer() {
110 return true
111 }
112 return false
113 }
114
115 func (s *Scanner) isNewLineChar(c rune) bool {
116 if c == '\n' {
117 return true
118 }
119 if c == '\r' {
120 return true
121 }
122 return false
123 }
124
125 func (s *Scanner) newLineCount(src []rune) int {
126 size := len(src)
127 cnt := 0
128 for i := 0; i < size; i++ {
129 c := src[i]
130 switch c {
131 case '\r':
132 if i+1 < size && src[i+1] == '\n' {
133 i++
134 }
135 cnt++
136 case '\n':
137 cnt++
138 }
139 }
140 return cnt
141 }
142
143 func (s *Scanner) updateIndentState(ctx *Context) {
144 indentNumBasedIndentState := s.indentState
145 if s.prevIndentNum < s.indentNum {
146 s.indentLevel = s.prevIndentLevel + 1
147 indentNumBasedIndentState = IndentStateUp
148 } else if s.prevIndentNum == s.indentNum {
149 s.indentLevel = s.prevIndentLevel
150 indentNumBasedIndentState = IndentStateEqual
151 } else {
152 indentNumBasedIndentState = IndentStateDown
153 if s.prevIndentLevel > 0 {
154 s.indentLevel = s.prevIndentLevel - 1
155 }
156 }
157
158 if s.prevIndentColumn > 0 {
159 if s.prevIndentColumn < s.column {
160 s.indentState = IndentStateUp
161 } else if s.prevIndentColumn != s.column || indentNumBasedIndentState != IndentStateEqual {
162 // The following case ( current position is 'd' ), some variables becomes like here
163 // - prevIndentColumn: 1 of 'a'
164 // - indentNumBasedIndentState: IndentStateDown because d's indentNum(1) is less than c's indentNum(3).
165 // Therefore, s.prevIndentColumn(1) == s.column(1) is true, but we want to treat this as IndentStateDown.
166 // So, we look also current indentState value by the above prevIndentNum based logic, and determins finally indentState.
167 // ---
168 // a:
169 // b
170 // c
171 // d: e
172 // ^
173 s.indentState = IndentStateDown
174 } else {
175 s.indentState = IndentStateEqual
176 }
177 } else {
178 s.indentState = indentNumBasedIndentState
179 }
180 }
181
182 func (s *Scanner) updateIndent(ctx *Context, c rune) {
183 if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() {
184 return
185 }
186 if s.isFirstCharAtLine && c == ' ' {
187 s.indentNum++
188 return
189 }
190 if !s.isFirstCharAtLine {
191 s.indentState = IndentStateKeep
192 return
193 }
194 s.updateIndentState(ctx)
195 s.isFirstCharAtLine = false
196 if s.isNeededKeepPreviousIndentNum(ctx, c) {
197 return
198 }
199 if s.indentState != IndentStateUp {
200 s.prevIndentColumn = 0
201 }
202 s.prevIndentNum = s.indentNum
203 s.prevIndentLevel = s.indentLevel
204 }
205
206 func (s *Scanner) isChangedToIndentStateDown() bool {
207 return s.indentState == IndentStateDown
208 }
209
210 func (s *Scanner) isChangedToIndentStateUp() bool {
211 return s.indentState == IndentStateUp
212 }
213
214 func (s *Scanner) isChangedToIndentStateEqual() bool {
215 return s.indentState == IndentStateEqual
216 }
217
218 func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
219 ctx.addToken(s.bufferedToken(ctx))
220 }
221
222 func (s *Scanner) breakLiteral(ctx *Context) {
223 s.docStartColumn = 0
224 ctx.breakLiteral()
225 }
226
227 func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) {
228 ctx.addOriginBuf('\'')
229 srcpos := s.pos()
230 startIndex := ctx.idx + 1
231 src := ctx.src
232 size := len(src)
233 value := []rune{}
234 isFirstLineChar := false
235 isNewLine := false
236 for idx := startIndex; idx < size; idx++ {
237 if !isNewLine {
238 s.progressColumn(ctx, 1)
239 } else {
240 isNewLine = false
241 }
242 c := src[idx]
243 pos = idx + 1
244 ctx.addOriginBuf(c)
245 if s.isNewLineChar(c) {
246 value = append(value, ' ')
247 isFirstLineChar = true
248 isNewLine = true
249 s.progressLine(ctx)
250 continue
251 } else if c == ' ' && isFirstLineChar {
252 continue
253 } else if c != '\'' {
254 value = append(value, c)
255 isFirstLineChar = false
256 continue
257 }
258 if idx+1 < len(ctx.src) && ctx.src[idx+1] == '\'' {
259 // '' handle as ' character
260 value = append(value, c)
261 ctx.addOriginBuf(c)
262 idx++
263 continue
264 }
265 s.progressColumn(ctx, 1)
266 tk = token.SingleQuote(string(value), string(ctx.obuf), srcpos)
267 pos = idx - startIndex + 1
268 return
269 }
270 return
271 }
272
// hexToInt converts one hexadecimal digit to its numeric value. Upper- and
// lower-case letters A-F are accepted. The input is not validated: any rune
// outside A-F / a-f is treated as a decimal digit and yields its distance
// from '0'.
func hexToInt(b rune) int {
	switch {
	case 'A' <= b && b <= 'F':
		return 10 + int(b) - 'A'
	case 'a' <= b && b <= 'f':
		return 10 + int(b) - 'a'
	default:
		return int(b) - '0'
	}
}
282
283 func hexRunesToInt(b []rune) int {
284 sum := 0
285 for i := 0; i < len(b); i++ {
286 sum += hexToInt(b[i]) << (uint(len(b)-i-1) * 4)
287 }
288 return sum
289 }
290
291 func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) {
292 ctx.addOriginBuf('"')
293 srcpos := s.pos()
294 startIndex := ctx.idx + 1
295 src := ctx.src
296 size := len(src)
297 value := []rune{}
298 isFirstLineChar := false
299 isNewLine := false
300 for idx := startIndex; idx < size; idx++ {
301 if !isNewLine {
302 s.progressColumn(ctx, 1)
303 } else {
304 isNewLine = false
305 }
306 c := src[idx]
307 pos = idx + 1
308 ctx.addOriginBuf(c)
309 if s.isNewLineChar(c) {
310 value = append(value, ' ')
311 isFirstLineChar = true
312 isNewLine = true
313 s.progressLine(ctx)
314 continue
315 } else if c == ' ' && isFirstLineChar {
316 continue
317 } else if c == '\\' {
318 isFirstLineChar = false
319 if idx+1 < size {
320 nextChar := src[idx+1]
321 switch nextChar {
322 case 'b':
323 ctx.addOriginBuf(nextChar)
324 value = append(value, '\b')
325 idx++
326 continue
327 case 'e':
328 ctx.addOriginBuf(nextChar)
329 value = append(value, '\x1B')
330 idx++
331 continue
332 case 'f':
333 ctx.addOriginBuf(nextChar)
334 value = append(value, '\f')
335 idx++
336 continue
337 case 'n':
338 ctx.addOriginBuf(nextChar)
339 value = append(value, '\n')
340 idx++
341 continue
342 case 'v':
343 ctx.addOriginBuf(nextChar)
344 value = append(value, '\v')
345 idx++
346 continue
347 case 'L': // LS (#x2028)
348 ctx.addOriginBuf(nextChar)
349 value = append(value, []rune{'\xE2', '\x80', '\xA8'}...)
350 idx++
351 continue
352 case 'N': // NEL (#x85)
353 ctx.addOriginBuf(nextChar)
354 value = append(value, []rune{'\xC2', '\x85'}...)
355 idx++
356 continue
357 case 'P': // PS (#x2029)
358 ctx.addOriginBuf(nextChar)
359 value = append(value, []rune{'\xE2', '\x80', '\xA9'}...)
360 idx++
361 continue
362 case '_': // #xA0
363 ctx.addOriginBuf(nextChar)
364 value = append(value, []rune{'\xC2', '\xA0'}...)
365 idx++
366 continue
367 case '"':
368 ctx.addOriginBuf(nextChar)
369 value = append(value, nextChar)
370 idx++
371 continue
372 case 'x':
373 if idx+3 >= size {
374 // TODO: need to return error
375 //err = xerrors.New("invalid escape character \\x")
376 return
377 }
378 codeNum := hexRunesToInt(src[idx+2 : idx+4])
379 value = append(value, rune(codeNum))
380 idx += 3
381 continue
382 case 'u':
383 if idx+5 >= size {
384 // TODO: need to return error
385 //err = xerrors.New("invalid escape character \\u")
386 return
387 }
388 codeNum := hexRunesToInt(src[idx+2 : idx+6])
389 value = append(value, rune(codeNum))
390 idx += 5
391 continue
392 case 'U':
393 if idx+9 >= size {
394 // TODO: need to return error
395 //err = xerrors.New("invalid escape character \\U")
396 return
397 }
398 codeNum := hexRunesToInt(src[idx+2 : idx+10])
399 value = append(value, rune(codeNum))
400 idx += 9
401 continue
402 case '\\':
403 ctx.addOriginBuf(nextChar)
404 idx++
405 }
406 }
407 value = append(value, c)
408 continue
409 } else if c != '"' {
410 value = append(value, c)
411 isFirstLineChar = false
412 continue
413 }
414 s.progressColumn(ctx, 1)
415 tk = token.DoubleQuote(string(value), string(ctx.obuf), srcpos)
416 pos = idx - startIndex + 1
417 return
418 }
419 return
420 }
421
422 func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) {
423 if ch == '\'' {
424 return s.scanSingleQuote(ctx)
425 }
426 return s.scanDoubleQuote(ctx)
427 }
428
429 func (s *Scanner) isMergeKey(ctx *Context) bool {
430 if ctx.repeatNum('<') != 2 {
431 return false
432 }
433 src := ctx.src
434 size := len(src)
435 for idx := ctx.idx + 2; idx < size; idx++ {
436 c := src[idx]
437 if c == ' ' {
438 continue
439 }
440 if c != ':' {
441 return false
442 }
443 if idx+1 < size {
444 nc := src[idx+1]
445 if nc == ' ' || s.isNewLineChar(nc) {
446 return true
447 }
448 }
449 }
450 return false
451 }
452
453 func (s *Scanner) scanTag(ctx *Context) (tk *token.Token, pos int) {
454 ctx.addOriginBuf('!')
455 ctx.progress(1) // skip '!' character
456 for idx, c := range ctx.src[ctx.idx:] {
457 pos = idx + 1
458 ctx.addOriginBuf(c)
459 switch c {
460 case ' ', '\n', '\r':
461 value := ctx.source(ctx.idx-1, ctx.idx+idx)
462 tk = token.Tag(value, string(ctx.obuf), s.pos())
463 pos = len([]rune(value))
464 return
465 }
466 }
467 return
468 }
469
470 func (s *Scanner) scanComment(ctx *Context) (tk *token.Token, pos int) {
471 ctx.addOriginBuf('#')
472 ctx.progress(1) // skip '#' character
473 for idx, c := range ctx.src[ctx.idx:] {
474 pos = idx + 1
475 ctx.addOriginBuf(c)
476 switch c {
477 case '\n', '\r':
478 if ctx.previousChar() == '\\' {
479 continue
480 }
481 value := ctx.source(ctx.idx, ctx.idx+idx)
482 tk = token.Comment(value, string(ctx.obuf), s.pos())
483 pos = len([]rune(value)) + 1
484 return
485 }
486 }
487 return
488 }
489
490 func trimCommentFromLiteralOpt(text string) (string, error) {
491 idx := strings.Index(text, "#")
492 if idx < 0 {
493 return text, nil
494 }
495 if idx == 0 {
496 return "", xerrors.New("invalid literal header")
497 }
498 return text[:idx-1], nil
499 }
500
501 func (s *Scanner) scanLiteral(ctx *Context, c rune) {
502 ctx.addOriginBuf(c)
503 if ctx.isEOS() {
504 if ctx.isLiteral {
505 ctx.addBuf(c)
506 }
507 value := ctx.bufferedSrc()
508 ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
509 ctx.resetBuffer()
510 s.progressColumn(ctx, 1)
511 } else if s.isNewLineChar(c) {
512 if ctx.isLiteral {
513 ctx.addBuf(c)
514 } else {
515 ctx.addBuf(' ')
516 }
517 s.progressLine(ctx)
518 } else if s.isFirstCharAtLine && c == ' ' {
519 if 0 < s.docStartColumn && s.docStartColumn <= s.column {
520 ctx.addBuf(c)
521 }
522 s.progressColumn(ctx, 1)
523 } else {
524 if s.docStartColumn == 0 {
525 s.docStartColumn = s.column
526 }
527 ctx.addBuf(c)
528 s.progressColumn(ctx, 1)
529 }
530 }
531
532 func (s *Scanner) scanLiteralHeader(ctx *Context) (pos int, err error) {
533 header := ctx.currentChar()
534 ctx.addOriginBuf(header)
535 ctx.progress(1) // skip '|' or '>' character
536 for idx, c := range ctx.src[ctx.idx:] {
537 pos = idx
538 ctx.addOriginBuf(c)
539 switch c {
540 case '\n', '\r':
541 value := ctx.source(ctx.idx, ctx.idx+idx)
542 opt := strings.TrimRight(value, " ")
543 orgOptLen := len(opt)
544 opt, err = trimCommentFromLiteralOpt(opt)
545 if err != nil {
546 return
547 }
548 switch opt {
549 case "", "+", "-",
550 "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
551 hasComment := len(opt) < orgOptLen
552 if header == '|' {
553 if hasComment {
554 commentLen := orgOptLen - len(opt)
555 headerPos := strings.Index(string(ctx.obuf), "|")
556 litBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos]
557 commentBuf := ctx.obuf[len(litBuf):]
558 ctx.addToken(token.Literal("|"+opt, string(litBuf), s.pos()))
559 s.column += len(litBuf)
560 s.offset += len(litBuf)
561 commentHeader := strings.Index(value, "#")
562 ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos()))
563 } else {
564 ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos()))
565 }
566 ctx.isLiteral = true
567 } else if header == '>' {
568 if hasComment {
569 commentLen := orgOptLen - len(opt)
570 headerPos := strings.Index(string(ctx.obuf), ">")
571 foldedBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos]
572 commentBuf := ctx.obuf[len(foldedBuf):]
573 ctx.addToken(token.Folded(">"+opt, string(foldedBuf), s.pos()))
574 s.column += len(foldedBuf)
575 s.offset += len(foldedBuf)
576 commentHeader := strings.Index(value, "#")
577 ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos()))
578 } else {
579 ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos()))
580 }
581 ctx.isFolded = true
582 }
583 s.indentState = IndentStateKeep
584 ctx.resetBuffer()
585 ctx.literalOpt = opt
586 return
587 }
588 break
589 }
590 }
591 err = xerrors.New("invalid literal header")
592 return
593 }
594
595 func (s *Scanner) scanNewLine(ctx *Context, c rune) {
596 if len(ctx.buf) > 0 && s.savedPos == nil {
597 s.savedPos = s.pos()
598 s.savedPos.Column -= len(ctx.bufferedSrc())
599 }
600
601 // if the following case, origin buffer has unnecessary two spaces.
602 // So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too.
603 // ---
604 // a:[space][space]
605 // b: c
606 removedNum := ctx.removeRightSpaceFromBuf()
607 if removedNum > 0 {
608 s.column -= removedNum
609 s.offset -= removedNum
610 if s.savedPos != nil {
611 s.savedPos.Column -= removedNum
612 }
613 }
614
615 if ctx.isEOS() {
616 s.addBufferedTokenIfExists(ctx)
617 } else if s.isAnchor {
618 s.addBufferedTokenIfExists(ctx)
619 }
620 ctx.addBuf(' ')
621 ctx.addOriginBuf(c)
622 ctx.isSingleLine = false
623 s.progressLine(ctx)
624 }
625
626 func (s *Scanner) scan(ctx *Context) (pos int) {
627 for ctx.next() {
628 pos = ctx.nextPos()
629 c := ctx.currentChar()
630 s.updateIndent(ctx, c)
631 if ctx.isDocument() {
632 if s.isChangedToIndentStateEqual() ||
633 s.isChangedToIndentStateDown() {
634 s.addBufferedTokenIfExists(ctx)
635 s.breakLiteral(ctx)
636 } else {
637 s.scanLiteral(ctx, c)
638 continue
639 }
640 } else if s.isChangedToIndentStateDown() {
641 s.addBufferedTokenIfExists(ctx)
642 } else if s.isChangedToIndentStateEqual() {
643 // if first character is new line character, buffer expect to raw folded literal
644 if len(ctx.obuf) > 0 && s.newLineCount(ctx.obuf) <= 1 {
645 // doesn't raw folded literal
646 s.addBufferedTokenIfExists(ctx)
647 }
648 }
649 switch c {
650 case '{':
651 if !ctx.existsBuffer() {
652 ctx.addOriginBuf(c)
653 ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
654 s.startedFlowMapNum++
655 s.progressColumn(ctx, 1)
656 return
657 }
658 case '}':
659 if !ctx.existsBuffer() || s.startedFlowMapNum > 0 {
660 ctx.addToken(s.bufferedToken(ctx))
661 ctx.addOriginBuf(c)
662 ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos()))
663 s.startedFlowMapNum--
664 s.progressColumn(ctx, 1)
665 return
666 }
667 case '.':
668 if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('.') == 3 {
669 ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos()))
670 s.progressColumn(ctx, 3)
671 pos += 2
672 return
673 }
674 case '<':
675 if s.isMergeKey(ctx) {
676 s.prevIndentColumn = s.column
677 ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos()))
678 s.progressColumn(ctx, 1)
679 pos++
680 return
681 }
682 case '-':
683 if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('-') == 3 {
684 s.addBufferedTokenIfExists(ctx)
685 ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos()))
686 s.progressColumn(ctx, 3)
687 pos += 2
688 return
689 }
690 if ctx.existsBuffer() && s.isChangedToIndentStateUp() {
691 // raw folded
692 ctx.isRawFolded = true
693 ctx.addBuf(c)
694 ctx.addOriginBuf(c)
695 s.progressColumn(ctx, 1)
696 continue
697 }
698 if ctx.existsBuffer() {
699 // '-' is literal
700 ctx.addBuf(c)
701 ctx.addOriginBuf(c)
702 s.progressColumn(ctx, 1)
703 continue
704 }
705 nc := ctx.nextChar()
706 if nc == ' ' || s.isNewLineChar(nc) {
707 s.addBufferedTokenIfExists(ctx)
708 ctx.addOriginBuf(c)
709 tk := token.SequenceEntry(string(ctx.obuf), s.pos())
710 s.prevIndentColumn = tk.Position.Column
711 ctx.addToken(tk)
712 s.progressColumn(ctx, 1)
713 return
714 }
715 case '[':
716 if !ctx.existsBuffer() {
717 ctx.addOriginBuf(c)
718 ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
719 s.startedFlowSequenceNum++
720 s.progressColumn(ctx, 1)
721 return
722 }
723 case ']':
724 if !ctx.existsBuffer() || s.startedFlowSequenceNum > 0 {
725 s.addBufferedTokenIfExists(ctx)
726 ctx.addOriginBuf(c)
727 ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos()))
728 s.startedFlowSequenceNum--
729 s.progressColumn(ctx, 1)
730 return
731 }
732 case ',':
733 if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
734 s.addBufferedTokenIfExists(ctx)
735 ctx.addOriginBuf(c)
736 ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos()))
737 s.progressColumn(ctx, 1)
738 return
739 }
740 case ':':
741 nc := ctx.nextChar()
742 if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() {
743 // mapping value
744 tk := s.bufferedToken(ctx)
745 if tk != nil {
746 s.prevIndentColumn = tk.Position.Column
747 ctx.addToken(tk)
748 } else if tk := ctx.lastToken(); tk != nil {
749 // If the map key is quote, the buffer does not exist because it has already been cut into tokens.
750 // Therefore, we need to check the last token.
751 if tk.Indicator == token.QuotedScalarIndicator {
752 s.prevIndentColumn = tk.Position.Column
753 }
754 }
755 ctx.addToken(token.MappingValue(s.pos()))
756 s.progressColumn(ctx, 1)
757 return
758 }
759 case '|', '>':
760 if !ctx.existsBuffer() {
761 progress, err := s.scanLiteralHeader(ctx)
762 if err != nil {
763 // TODO: returns syntax error object
764 return
765 }
766 s.progressColumn(ctx, progress)
767 s.progressLine(ctx)
768 continue
769 }
770 case '!':
771 if !ctx.existsBuffer() {
772 token, progress := s.scanTag(ctx)
773 ctx.addToken(token)
774 s.progressColumn(ctx, progress)
775 if c := ctx.previousChar(); s.isNewLineChar(c) {
776 s.progressLine(ctx)
777 }
778 pos += progress
779 return
780 }
781 case '%':
782 if !ctx.existsBuffer() && s.indentNum == 0 {
783 ctx.addToken(token.Directive(string(ctx.obuf)+"%", s.pos()))
784 s.progressColumn(ctx, 1)
785 return
786 }
787 case '?':
788 nc := ctx.nextChar()
789 if !ctx.existsBuffer() && nc == ' ' {
790 ctx.addToken(token.MappingKey(s.pos()))
791 s.progressColumn(ctx, 1)
792 return
793 }
794 case '&':
795 if !ctx.existsBuffer() {
796 s.addBufferedTokenIfExists(ctx)
797 ctx.addOriginBuf(c)
798 ctx.addToken(token.Anchor(string(ctx.obuf), s.pos()))
799 s.progressColumn(ctx, 1)
800 s.isAnchor = true
801 return
802 }
803 case '*':
804 if !ctx.existsBuffer() {
805 s.addBufferedTokenIfExists(ctx)
806 ctx.addOriginBuf(c)
807 ctx.addToken(token.Alias(string(ctx.obuf), s.pos()))
808 s.progressColumn(ctx, 1)
809 return
810 }
811 case '#':
812 if !ctx.existsBuffer() || ctx.previousChar() == ' ' {
813 s.addBufferedTokenIfExists(ctx)
814 token, progress := s.scanComment(ctx)
815 ctx.addToken(token)
816 s.progressColumn(ctx, progress)
817 s.progressLine(ctx)
818 pos += progress
819 return
820 }
821 case '\'', '"':
822 if !ctx.existsBuffer() {
823 token, progress := s.scanQuote(ctx, c)
824 ctx.addToken(token)
825 pos += progress
826 // If the non-whitespace character immediately following the quote is ':', the quote should be treated as a map key.
827 // Therefore, do not return and continue processing as a normal map key.
828 if ctx.currentCharWithSkipWhitespace() == ':' {
829 continue
830 }
831 return
832 }
833 case '\r', '\n':
834 // There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
835 // > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
836 // > Outside scalar content, YAML allows any line break to be used to terminate lines.
837 // > -- https://yaml.org/spec/1.2/spec.html
838 if c == '\r' && ctx.nextChar() == '\n' {
839 ctx.addOriginBuf('\r')
840 ctx.progress(1)
841 c = '\n'
842 }
843 s.scanNewLine(ctx, c)
844 continue
845 case ' ':
846 if ctx.isSaveIndentMode() || (!s.isAnchor && !s.isFirstCharAtLine) {
847 ctx.addBuf(c)
848 ctx.addOriginBuf(c)
849 s.progressColumn(ctx, 1)
850 continue
851 }
852 if s.isFirstCharAtLine {
853 s.progressColumn(ctx, 1)
854 ctx.addOriginBuf(c)
855 continue
856 }
857 s.addBufferedTokenIfExists(ctx)
858 pos-- // to rescan white space at next scanning for adding white space to next buffer.
859 s.isAnchor = false
860 return
861 }
862 ctx.addBuf(c)
863 ctx.addOriginBuf(c)
864 s.progressColumn(ctx, 1)
865 }
866 s.addBufferedTokenIfExists(ctx)
867 return
868 }
869
870 // Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
871 func (s *Scanner) Init(text string) {
872 src := []rune(text)
873 s.source = src
874 s.sourcePos = 0
875 s.sourceSize = len(src)
876 s.line = 1
877 s.column = 1
878 s.offset = 1
879 s.prevIndentLevel = 0
880 s.prevIndentNum = 0
881 s.prevIndentColumn = 0
882 s.indentLevel = 0
883 s.indentNum = 0
884 s.isFirstCharAtLine = true
885 }
886
887 // Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
888 func (s *Scanner) Scan() (token.Tokens, error) {
889 if s.sourcePos >= s.sourceSize {
890 return nil, io.EOF
891 }
892 ctx := newContext(s.source[s.sourcePos:])
893 defer ctx.release()
894 progress := s.scan(ctx)
895 s.sourcePos += progress
896 var tokens token.Tokens
897 tokens = append(tokens, ctx.tokens...)
898 return tokens, nil
899 }
900