parse.go raw
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8 "errors"
9 "fmt"
10 "io"
11 "strings"
12
13 a "golang.org/x/net/html/atom"
14 )
15
16 // A parser implements the HTML5 parsing algorithm:
17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
18 type parser struct {
19 // tokenizer provides the tokens for the parser.
20 tokenizer *Tokenizer
21 // tok is the most recently read token.
22 tok Token
23 // Self-closing tags like <hr/> are treated as start tags, except that
24 // hasSelfClosingToken is set while they are being processed.
25 hasSelfClosingToken bool
26 // doc is the document root element.
27 doc *Node
28 // The stack of open elements (section 12.2.4.2) and active formatting
29 // elements (section 12.2.4.3).
30 oe, afe nodeStack
31 // Element pointers (section 12.2.4.4).
32 head, form *Node
33 // Other parsing state flags (section 12.2.4.5).
34 scripting, framesetOK bool
35 // The stack of template insertion modes
36 templateStack insertionModeStack
37 // im is the current insertion mode.
38 im insertionMode
39 // originalIM is the insertion mode to go back to after completing a text
40 // or inTableText insertion mode.
41 originalIM insertionMode
42 // fosterParenting is whether new elements should be inserted according to
43 // the foster parenting rules (section 12.2.6.1).
44 fosterParenting bool
45 // quirks is whether the parser is operating in "quirks mode."
46 quirks bool
47 // fragment is whether the parser is parsing an HTML fragment.
48 fragment bool
49 // context is the context element when parsing an HTML fragment
50 // (section 12.4).
51 context *Node
52 }
53
54 func (p *parser) top() *Node {
55 if n := p.oe.top(); n != nil {
56 return n
57 }
58 return p.doc
59 }
60
61 // Stop tags for use in popUntil. These come from section 12.2.4.2.
62 var (
63 defaultScopeStopTags = map[string][]a.Atom{
64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
66 "svg": {a.Desc, a.ForeignObject, a.Title},
67 }
68 )
69
// scope selects which set of stop tags bounds a search of, or a truncation
// of, the stack of open elements.
type scope int

const (
	// defaultScope stops only at the tags in defaultScopeStopTags.
	defaultScope scope = iota
	// listItemScope additionally stops at ol and ul.
	listItemScope
	// buttonScope additionally stops at button.
	buttonScope
	// tableScope stops at html, table and template.
	tableScope
	// tableRowScope is accepted by clearStackToContext only.
	tableRowScope
	// tableBodyScope is accepted by clearStackToContext only.
	tableBodyScope
	// selectScope stops at every tag other than optgroup and option.
	selectScope
)
81
82 // popUntil pops the stack of open elements at the highest element whose tag
83 // is in matchTags, provided there is no higher element in the scope's stop
84 // tags (as defined in section 12.2.4.2). It returns whether or not there was
85 // such an element. If there was not, popUntil leaves the stack unchanged.
86 //
87 // For example, the set of stop tags for table scope is: "html", "table". If
88 // the stack was:
89 // ["html", "body", "font", "table", "b", "i", "u"]
90 // then popUntil(tableScope, "font") would return false, but
91 // popUntil(tableScope, "i") would return true and the stack would become:
92 // ["html", "body", "font", "table", "b"]
93 //
94 // If an element's tag is in both the stop tags and matchTags, then the stack
95 // will be popped and the function returns true (provided, of course, there was
96 // no higher element in the stack that was also in the stop tags). For example,
97 // popUntil(tableScope, "table") returns true and leaves:
98 // ["html", "body", "font"]
99 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
101 p.oe = p.oe[:i]
102 return true
103 }
104 return false
105 }
106
107 // indexOfElementInScope returns the index in p.oe of the highest element whose
108 // tag is in matchTags that is in scope. If no matching element is in scope, it
109 // returns -1.
110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111 for i := len(p.oe) - 1; i >= 0; i-- {
112 tagAtom := p.oe[i].DataAtom
113 if p.oe[i].Namespace == "" {
114 for _, t := range matchTags {
115 if t == tagAtom {
116 return i
117 }
118 }
119 switch s {
120 case defaultScope:
121 // No-op.
122 case listItemScope:
123 if tagAtom == a.Ol || tagAtom == a.Ul {
124 return -1
125 }
126 case buttonScope:
127 if tagAtom == a.Button {
128 return -1
129 }
130 case tableScope:
131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
132 return -1
133 }
134 case selectScope:
135 if tagAtom != a.Optgroup && tagAtom != a.Option {
136 return -1
137 }
138 default:
139 panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s))
140 }
141 }
142 switch s {
143 case defaultScope, listItemScope, buttonScope:
144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
145 if t == tagAtom {
146 return -1
147 }
148 }
149 }
150 }
151 return -1
152 }
153
154 // elementInScope is like popUntil, except that it doesn't modify the stack of
155 // open elements.
156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157 return p.indexOfElementInScope(s, matchTags...) != -1
158 }
159
160 // clearStackToContext pops elements off the stack of open elements until a
161 // scope-defined element is found.
162 func (p *parser) clearStackToContext(s scope) {
163 for i := len(p.oe) - 1; i >= 0; i-- {
164 tagAtom := p.oe[i].DataAtom
165 switch s {
166 case tableScope:
167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
168 p.oe = p.oe[:i+1]
169 return
170 }
171 case tableRowScope:
172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
173 p.oe = p.oe[:i+1]
174 return
175 }
176 case tableBodyScope:
177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
178 p.oe = p.oe[:i+1]
179 return
180 }
181 default:
182 panic(fmt.Sprintf("html: internal error: clearStackToContext unknown scope: %d", s))
183 }
184 }
185 }
186
187 // parseGenericRawTextElement implements the generic raw text element parsing
188 // algorithm defined in 12.2.6.2.
189 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
190 // TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
191 // officially, need to make tokenizer consider both states.
192 func (p *parser) parseGenericRawTextElement() {
193 p.addElement()
194 p.originalIM = p.im
195 p.im = textIM
196 }
197
198 // generateImpliedEndTags pops nodes off the stack of open elements as long as
199 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
200 // If exceptions are specified, nodes with that name will not be popped off.
201 func (p *parser) generateImpliedEndTags(exceptions ...string) {
202 var i int
203 loop:
204 for i = len(p.oe) - 1; i >= 0; i-- {
205 n := p.oe[i]
206 if n.Type != ElementNode {
207 break
208 }
209 switch n.DataAtom {
210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
211 for _, except := range exceptions {
212 if n.Data == except {
213 break loop
214 }
215 }
216 continue
217 }
218 break
219 }
220
221 p.oe = p.oe[:i+1]
222 }
223
224 // addChild adds a child node n to the top element, and pushes n onto the stack
225 // of open elements if it is an element node.
226 func (p *parser) addChild(n *Node) {
227 if p.shouldFosterParent() {
228 p.fosterParent(n)
229 } else {
230 p.top().AppendChild(n)
231 }
232
233 if n.Type == ElementNode {
234 p.insertOpenElement(n)
235 }
236 }
237
238 func (p *parser) insertOpenElement(n *Node) {
239 p.oe = append(p.oe, n)
240 if len(p.oe) > 512 {
241 panic("html: open stack of elements exceeds 512 nodes")
242 }
243 }
244
245 // shouldFosterParent returns whether the next node to be added should be
246 // foster parented.
247 func (p *parser) shouldFosterParent() bool {
248 if p.fosterParenting {
249 switch p.top().DataAtom {
250 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
251 return true
252 }
253 }
254 return false
255 }
256
// fosterParent adds a child node according to the foster parenting rules.
// Section 12.2.6.1, "foster parenting".
//
// It looks up the last table and the last template on the stack of open
// elements. When a template exists and there is either no table or the
// template sits above the table on the stack, n is appended to the template.
// Otherwise n is inserted into the chosen parent with the table as the
// reference node passed to InsertBefore. A text node is merged into the
// preceding text node instead of being inserted when one is adjacent.
func (p *parser) fosterParent(n *Node) {
	var table, parent, prev, template *Node
	var i int
	// Find the last table on the stack; i ends as its index, or -1 if absent.
	for i = len(p.oe) - 1; i >= 0; i-- {
		if p.oe[i].DataAtom == a.Table {
			table = p.oe[i]
			break
		}
	}

	var j int
	// Find the last template on the stack; j ends as its index, or -1.
	for j = len(p.oe) - 1; j >= 0; j-- {
		if p.oe[j].DataAtom == a.Template {
			template = p.oe[j]
			break
		}
	}

	// A template above the table (or without any table) is the foster parent.
	if template != nil && (table == nil || j > i) {
		template.AppendChild(n)
		return
	}

	if table == nil {
		// The foster parent is the html element.
		parent = p.oe[0]
	} else {
		parent = table.Parent
	}
	// The table has no parent node: fall back to the element just below the
	// table on the stack of open elements.
	if parent == nil {
		parent = p.oe[i-1]
	}

	// prev is the node that would directly precede n after insertion.
	if table != nil {
		prev = table.PrevSibling
	} else {
		prev = parent.LastChild
	}
	// Coalesce adjacent text instead of creating a second text node.
	if prev != nil && prev.Type == TextNode && n.Type == TextNode {
		prev.Data += n.Data
		return
	}

	// table is nil here when no table was found on the stack.
	parent.InsertBefore(n, table)
}
304
305 // addText adds text to the preceding node if it is a text node, or else it
306 // calls addChild with a new text node.
307 func (p *parser) addText(text string) {
308 if text == "" {
309 return
310 }
311
312 if p.shouldFosterParent() {
313 p.fosterParent(&Node{
314 Type: TextNode,
315 Data: text,
316 })
317 return
318 }
319
320 t := p.top()
321 if n := t.LastChild; n != nil && n.Type == TextNode {
322 n.Data += text
323 return
324 }
325 p.addChild(&Node{
326 Type: TextNode,
327 Data: text,
328 })
329 }
330
331 // addElement adds a child element based on the current token.
332 func (p *parser) addElement() {
333 p.addChild(&Node{
334 Type: ElementNode,
335 DataAtom: p.tok.DataAtom,
336 Data: p.tok.Data,
337 Attr: p.tok.Attr,
338 })
339 }
340
// addFormattingElement inserts an element for the current token and pushes it
// onto the list of active formatting elements. Section 12.2.4.3.
//
// Before pushing, it scans the list back to the last scope marker and counts
// entries that are identical to the new element (same tag, no namespace, and
// the same set of attributes). The third and every later identical entry
// encountered is removed from the list.
func (p *parser) addFormattingElement() {
	// Capture the token's tag and attributes for the comparisons below.
	tagAtom, attr := p.tok.DataAtom, p.tok.Attr
	p.addElement()

	// Implement the Noah's Ark clause, but with three per family instead of two.
	identicalElements := 0
findIdenticalElements:
	for i := len(p.afe) - 1; i >= 0; i-- {
		n := p.afe[i]
		// Only entries after the last scope marker are considered.
		if n.Type == scopeMarkerNode {
			break
		}
		if n.Type != ElementNode {
			continue
		}
		if n.Namespace != "" {
			continue
		}
		if n.DataAtom != tagAtom {
			continue
		}
		if len(n.Attr) != len(attr) {
			continue
		}
	compareAttributes:
		for _, t0 := range n.Attr {
			for _, t1 := range attr {
				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
					// Found a match for this attribute, continue with the next attribute.
					continue compareAttributes
				}
			}
			// If we get here, there is no attribute that matches t0.
			// Therefore the element is not identical to the new one.
			continue findIdenticalElements
		}

		identicalElements++
		if identicalElements >= 3 {
			p.afe.remove(n)
		}
	}

	// p.top() is the element that addElement inserted above.
	p.afe = append(p.afe, p.top())
}
387
388 // Section 12.2.4.3.
389 func (p *parser) clearActiveFormattingElements() {
390 for {
391 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
392 return
393 }
394 }
395 }
396
// reconstructActiveFormattingElements re-opens active formatting elements
// that are no longer on the stack of open elements. Section 12.2.4.3.
//
// Starting from the last entry of p.afe it rewinds to the nearest entry that
// is a scope marker or is still on the stack of open elements (or to the
// start of the list). Every entry after that point is then cloned, the clone
// is added through addChild, and the clone replaces the entry in p.afe.
func (p *parser) reconstructActiveFormattingElements() {
	n := p.afe.top()
	// Nothing to do for an empty list.
	if n == nil {
		return
	}
	// Nothing to do when the last entry is a marker or is already open.
	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
		return
	}
	// Rewind: step i backwards while the entry is neither a marker nor open.
	i := len(p.afe) - 1
	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
		if i == 0 {
			// Reached the start of the list; the i++ below brings it back to 0.
			i = -1
			break
		}
		i--
		n = p.afe[i]
	}
	// Advance: recreate every entry after the rewind point, oldest first.
	for {
		i++
		clone := p.afe[i].clone()
		p.addChild(clone)
		p.afe[i] = clone
		if i == len(p.afe)-1 {
			break
		}
	}
}
425
// acknowledgeSelfClosingTag clears hasSelfClosingToken, the flag that is set
// while a self-closing tag is being processed. Section 12.2.5.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}
430
// An insertion mode (section 12.2.4.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed; a false result means the token
// is left in parser.tok unconsumed.
type insertionMode func(*parser) bool
436
437 // setOriginalIM sets the insertion mode to return to after completing a text or
438 // inTableText insertion mode.
439 // Section 12.2.4.1, "using the rules for".
440 func (p *parser) setOriginalIM() {
441 if p.originalIM != nil {
442 panic("html: bad parser state: originalIM was set twice")
443 }
444 p.originalIM = p.im
445 }
446
// resetInsertionMode picks the insertion mode that matches the current stack
// of open elements. Section 12.2.4.1, "reset the insertion mode".
//
// It walks the stack from the top down and sets p.im according to the first
// element whose tag determines a mode. At the bottom of the stack the
// fragment context element, if any, is examined in place of the stack entry.
func (p *parser) resetInsertionMode() {
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		// last marks the bottom-most entry of the stack.
		last := i == 0
		if last && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			if !last {
				// Walk down the stack below the select: a template ends the
				// search with inSelectIM, a table selects inSelectInTableIM.
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			// A template element in a foreign namespace is skipped.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			// The mode depends on whether a head element has been seen yet.
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			// No mode-determining element anywhere on the stack.
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		return
	}
}
518
// whitespace is the cutset (space, tab, carriage return, line feed and form
// feed) that the insertion modes pass to strings.TrimLeft to strip leading
// whitespace from text tokens.
const whitespace = " \t\r\n\f"
520
521 // Section 12.2.6.4.1.
522 func initialIM(p *parser) bool {
523 switch p.tok.Type {
524 case TextToken:
525 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
526 if len(p.tok.Data) == 0 {
527 // It was all whitespace, so ignore it.
528 return true
529 }
530 case CommentToken:
531 p.doc.AppendChild(&Node{
532 Type: CommentNode,
533 Data: p.tok.Data,
534 })
535 return true
536 case DoctypeToken:
537 n, quirks := parseDoctype(p.tok.Data)
538 p.doc.AppendChild(n)
539 p.quirks = quirks
540 p.im = beforeHTMLIM
541 return true
542 }
543 p.quirks = true
544 p.im = beforeHTMLIM
545 return false
546 }
547
548 // Section 12.2.6.4.2.
549 func beforeHTMLIM(p *parser) bool {
550 switch p.tok.Type {
551 case DoctypeToken:
552 // Ignore the token.
553 return true
554 case TextToken:
555 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
556 if len(p.tok.Data) == 0 {
557 // It was all whitespace, so ignore it.
558 return true
559 }
560 case StartTagToken:
561 if p.tok.DataAtom == a.Html {
562 p.addElement()
563 p.im = beforeHeadIM
564 return true
565 }
566 case EndTagToken:
567 switch p.tok.DataAtom {
568 case a.Head, a.Body, a.Html, a.Br:
569 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
570 return false
571 default:
572 // Ignore the token.
573 return true
574 }
575 case CommentToken:
576 p.doc.AppendChild(&Node{
577 Type: CommentNode,
578 Data: p.tok.Data,
579 })
580 return true
581 }
582 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
583 return false
584 }
585
586 // Section 12.2.6.4.3.
587 func beforeHeadIM(p *parser) bool {
588 switch p.tok.Type {
589 case TextToken:
590 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
591 if len(p.tok.Data) == 0 {
592 // It was all whitespace, so ignore it.
593 return true
594 }
595 case StartTagToken:
596 switch p.tok.DataAtom {
597 case a.Head:
598 p.addElement()
599 p.head = p.top()
600 p.im = inHeadIM
601 return true
602 case a.Html:
603 return inBodyIM(p)
604 }
605 case EndTagToken:
606 switch p.tok.DataAtom {
607 case a.Head, a.Body, a.Html, a.Br:
608 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
609 return false
610 default:
611 // Ignore the token.
612 return true
613 }
614 case CommentToken:
615 p.addChild(&Node{
616 Type: CommentNode,
617 Data: p.tok.Data,
618 })
619 return true
620 case DoctypeToken:
621 // Ignore the token.
622 return true
623 }
624
625 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
626 return false
627 }
628
// inHeadIM is the "in head" insertion mode. Section 12.2.6.4.4.
//
// It handles the elements that may appear inside <head>. Tokens it has no
// rule for have a </head> end tag implied for them and are then left
// unconsumed.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		// Split off leading whitespace: it is inserted here, while any
		// remaining text stays in the token and falls through to the implied
		// </head> at the bottom.
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			// These elements are inserted and immediately popped again.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// We don't handle all of the corner cases when mixing foreign
			// content (i.e. <math> or <svg>) with <template>. Without this
			// early return, we can get into an infinite loop, possibly because
			// of the "TODO... further divergence" a little below.
			//
			// As a workaround, if we are mixing foreign content and templates,
			// just ignore the rest of the HTML. Foreign content is rare and a
			// relatively old HTML feature. Templates are also rare and a
			// relatively new HTML feature. Their combination is very rare.
			for _, e := range p.oe {
				if e.Namespace != "" {
					p.im = ignoreTheRemainingTokens
					return true
				}
			}

			// Open the template: insert it, push a scope marker onto the
			// active formatting elements, and push inTemplateIM onto the
			// template insertion mode stack.
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			// A stray </template> without an open template is ignored.
			if !p.oe.contains(a.Template) {
				return true
			}
			// TODO: remove this further divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			// Pop up to and including the last HTML-namespace template.
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: act as if </head> had been seen, then let the caller
	// handle the token again.
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}
743
744 // Section 12.2.6.4.5.
745 func inHeadNoscriptIM(p *parser) bool {
746 switch p.tok.Type {
747 case DoctypeToken:
748 // Ignore the token.
749 return true
750 case StartTagToken:
751 switch p.tok.DataAtom {
752 case a.Html:
753 return inBodyIM(p)
754 case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
755 return inHeadIM(p)
756 case a.Head:
757 // Ignore the token.
758 return true
759 case a.Noscript:
760 // Don't let the tokenizer go into raw text mode even when a <noscript>
761 // tag is in "in head noscript" insertion mode.
762 p.tokenizer.NextIsNotRawText()
763 // Ignore the token.
764 return true
765 }
766 case EndTagToken:
767 switch p.tok.DataAtom {
768 case a.Noscript, a.Br:
769 default:
770 // Ignore the token.
771 return true
772 }
773 case TextToken:
774 s := strings.TrimLeft(p.tok.Data, whitespace)
775 if len(s) == 0 {
776 // It was all whitespace.
777 return inHeadIM(p)
778 }
779 case CommentToken:
780 return inHeadIM(p)
781 }
782 p.oe.pop()
783 if p.top().DataAtom != a.Head {
784 panic("html: the new current node will be a head element.")
785 }
786 p.im = inHeadIM
787 if p.tok.DataAtom == a.Noscript {
788 return true
789 }
790 return false
791 }
792
// afterHeadIM is the "after head" insertion mode. Section 12.2.6.4.6.
//
// It handles <body> and <frameset>, routes head-only elements back through
// inHeadIM, and for any other token implies a <body> start tag and leaves the
// token unconsumed (unless the token is ErrorToken, which stops parsing).
func afterHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		// Leading whitespace is inserted here; any remaining text stays in
		// the token and falls through to the implied <body> below.
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Body:
			p.addElement()
			p.framesetOK = false
			p.im = inBodyIM
			return true
		case a.Frameset:
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			// Push the head element back onto the stack while inHeadIM
			// processes the token; the deferred call removes it again when
			// this function returns.
			p.insertOpenElement(p.head)
			defer p.oe.remove(p.head)
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body, a.Html, a.Br:
			// Drop down to creating an implied <body> tag.
		case a.Template:
			return inHeadIM(p)
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
	// NOTE(review): presumably this undoes the framesetOK = false that the
	// a.Body start-tag case above sets while handling the implied token;
	// confirm against parseImpliedToken.
	p.framesetOK = true
	if p.tok.Type == ErrorToken {
		// Stop parsing.
		return true
	}
	return false
}
856
857 // copyAttributes copies attributes of src not found on dst to dst.
858 func copyAttributes(dst *Node, src Token) {
859 if len(src.Attr) == 0 {
860 return
861 }
862 attr := map[string]string{}
863 for _, t := range dst.Attr {
864 attr[t.Key] = t.Val
865 }
866 for _, t := range src.Attr {
867 if _, ok := attr[t.Key]; !ok {
868 dst.Attr = append(dst.Attr, t)
869 attr[t.Key] = t.Val
870 }
871 }
872 }
873
// inBodyIM is the "in body" insertion mode. Section 12.2.6.4.7.
//
// It dispatches on the token type and, for tags, on the tag's atom. It
// returns true when the token was consumed. A false result leaves the token
// (which some cases rewrite first) unconsumed.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL characters are dropped from the text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes onto the bottom-most open element.
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes onto the existing body element, which is
			// expected to be the second entry of the stack.
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Replace the body element with the frameset.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
			// Close an open <p> in button scope first.
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// A heading directly inside another heading closes the outer one.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The form element pointer is only set outside of templates.
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Walk down the stack: close an open <li>, skip address, div, p
			// and non-special elements, and stop at any other special element.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same walk as for <li>, but closing an open <dd> or <dt>.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			// An already open button in scope is closed first.
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// An <a> that is still active since the last scope marker is
			// closed via the adoption agency algorithm and then removed from
			// both lists.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			// These elements open a new formatting scope.
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			// In quirks mode an open <p> is not closed by <table>.
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			// These elements are inserted and immediately popped again.
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.EqualFold(t.Val, "hidden") {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite <image> as <img> and leave the token unconsumed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Iframe:
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Noembed:
			p.parseGenericRawTextElement()
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			// An open <option> on top of the stack is closed first.
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			// Unlike for rb/rtc, an open rtc element is not closed here.
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The new element's namespace is its own tag name.
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			// Imply </body> first, leaving this token unconsumed.
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				// Inside a template the form element pointer is not used.
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				// Outside a template the form element pointer is cleared, and
				// the form is removed only if it is the one in scope.
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			// A </p> without an open <p> in button scope implies a <p> start
			// tag first.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any heading end tag closes whichever heading is in scope.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// Rewrite </br> as a start tag and leave the token unconsumed.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		}
		// NOTE(review): this loop returns true for any element outside the
		// listed set, and the function returns true after the loop as well,
		// so the loop does not change the result.
		for _, e := range p.oe {
			switch e.DataAtom {
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
				a.Thead, a.Tr, a.Body, a.Html:
			default:
				return true
			}
		}
	}

	return true
}
1220
// inBodyEndTagFormatting handles an end tag for a formatting element (such as
// </a>, </b> or </i>) seen in the in-body insertion mode. tagAtom and tagName
// are the atom and the name of the end tag being processed.
//
// It may pop or remove entries from the stack of open elements (p.oe) and the
// list of active formatting elements (p.afe), and it may clone and reparent
// nodes of the tree. When no matching formatting element is found, it falls
// back to inBodyEndTagOther.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-2. Fast path: the current node has the same tag name and is not
	// in the list of active formatting elements, so it is simply popped.
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
		p.oe.pop()
		return
	}

	// Steps 3-5. The outer loop, which runs at most eight times.
	for i := 0; i < 8; i++ {
		// Step 6. Find the formatting element: the last entry in the list of
		// active formatting elements with the given tag, searching backwards
		// and stopping at the first scope marker.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			// No such formatting element: treat it as any other end tag.
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		// Step 8. Ignore the tag if formatting element is not in the scope.
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Step 9. This step is omitted because it's just a parse error but no need to return.

		// Steps 10-11. Find the furthest block: the first special element at or
		// after the formatting element in the stack of open elements.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// There is no furthest block: pop everything up to and including
			// the formatting element, then drop it from the active list.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 12-13. Find the common ancestor and bookmark node.
		// NOTE(review): this indexing assumes feIndex > 0, i.e. that the
		// formatting element is never the bottom entry of the stack — confirm.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 14. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		// x walks down the stack of open elements from the furthest block
		// towards the formatting element.
		x := p.oe.index(node)
		// Step 14.1.
		j := 0
		for {
			// Step 14.2.
			j++
			// Step. 14.3.
			x--
			node = p.oe[x]
			// Step 14.4. Go to the next step if node is formatting element.
			if node == formattingElement {
				break
			}
			// Step 14.5. Remove node from the list of active formatting elements if
			// inner loop counter is greater than three and node is in the list of
			// active formatting elements.
			if ni := p.afe.index(node); j > 3 && ni > -1 {
				p.afe.remove(node)
				// If any element of the list of active formatting elements is removed,
				// we need to take care whether bookmark should be decremented or not.
				// This is because the value of bookmark may exceed the size of the
				// list by removing elements from the list.
				if ni <= bookmark {
					bookmark--
				}
				continue
			}
			// Step 14.6. Continue the next inner loop if node is not in the list of
			// active formatting elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 14.7. Replace node with a clone of itself in both the list
			// of active formatting elements and the stack of open elements.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 14.8. On the first reparenting, move the bookmark to just
			// after the new node in the list of active formatting elements.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 14.9. Detach lastNode from its parent, if any, and make it
			// a child of node.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 14.10.
			lastNode = node
		}

		// Step 15. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 16-18. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 19. Fix up the list of active formatting elements: the clone
		// takes the bookmarked position and the original entry is removed.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 20. Fix up the stack of open elements: the clone goes directly
		// after the furthest block and the original entry is removed.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}
1373
1374 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1375 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1376 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1377 func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
1378 for i := len(p.oe) - 1; i >= 0; i-- {
1379 // Two element nodes have the same tag if they have the same Data (a
1380 // string-typed field). As an optimization, for common HTML tags, each
1381 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed
1382 // field), since integer comparison is faster than string comparison.
1383 // Uncommon (custom) tags get a zero DataAtom.
1384 //
1385 // The if condition here is equivalent to (p.oe[i].Data == tagName).
1386 if (p.oe[i].DataAtom == tagAtom) &&
1387 ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
1388 p.oe = p.oe[:i]
1389 break
1390 }
1391 if isSpecialElement(p.oe[i]) {
1392 break
1393 }
1394 }
1395 }
1396
1397 // Section 12.2.6.4.8.
1398 func textIM(p *parser) bool {
1399 switch p.tok.Type {
1400 case ErrorToken:
1401 p.oe.pop()
1402 case TextToken:
1403 d := p.tok.Data
1404 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1405 // Ignore a newline at the start of a <textarea> block.
1406 if d != "" && d[0] == '\r' {
1407 d = d[1:]
1408 }
1409 if d != "" && d[0] == '\n' {
1410 d = d[1:]
1411 }
1412 }
1413 if d == "" {
1414 return true
1415 }
1416 p.addText(d)
1417 return true
1418 case EndTagToken:
1419 p.oe.pop()
1420 }
1421 p.im = p.originalIM
1422 p.originalIM = nil
1423 return p.tok.Type == EndTagToken
1424 }
1425
1426 // Section 12.2.6.4.9.
1427 func inTableIM(p *parser) bool {
1428 switch p.tok.Type {
1429 case TextToken:
1430 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1431 switch p.oe.top().DataAtom {
1432 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1433 if strings.Trim(p.tok.Data, whitespace) == "" {
1434 p.addText(p.tok.Data)
1435 return true
1436 }
1437 }
1438 case StartTagToken:
1439 switch p.tok.DataAtom {
1440 case a.Caption:
1441 p.clearStackToContext(tableScope)
1442 p.afe = append(p.afe, &scopeMarker)
1443 p.addElement()
1444 p.im = inCaptionIM
1445 return true
1446 case a.Colgroup:
1447 p.clearStackToContext(tableScope)
1448 p.addElement()
1449 p.im = inColumnGroupIM
1450 return true
1451 case a.Col:
1452 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1453 return false
1454 case a.Tbody, a.Tfoot, a.Thead:
1455 p.clearStackToContext(tableScope)
1456 p.addElement()
1457 p.im = inTableBodyIM
1458 return true
1459 case a.Td, a.Th, a.Tr:
1460 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1461 return false
1462 case a.Table:
1463 if p.popUntil(tableScope, a.Table) {
1464 p.resetInsertionMode()
1465 return false
1466 }
1467 // Ignore the token.
1468 return true
1469 case a.Style, a.Script, a.Template:
1470 return inHeadIM(p)
1471 case a.Input:
1472 for _, t := range p.tok.Attr {
1473 if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {
1474 p.addElement()
1475 p.oe.pop()
1476 return true
1477 }
1478 }
1479 // Otherwise drop down to the default action.
1480 case a.Form:
1481 if p.oe.contains(a.Template) || p.form != nil {
1482 // Ignore the token.
1483 return true
1484 }
1485 p.addElement()
1486 p.form = p.oe.pop()
1487 case a.Select:
1488 p.reconstructActiveFormattingElements()
1489 switch p.top().DataAtom {
1490 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1491 p.fosterParenting = true
1492 }
1493 p.addElement()
1494 p.fosterParenting = false
1495 p.framesetOK = false
1496 p.im = inSelectInTableIM
1497 return true
1498 }
1499 case EndTagToken:
1500 switch p.tok.DataAtom {
1501 case a.Table:
1502 if p.popUntil(tableScope, a.Table) {
1503 p.resetInsertionMode()
1504 return true
1505 }
1506 // Ignore the token.
1507 return true
1508 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1509 // Ignore the token.
1510 return true
1511 case a.Template:
1512 return inHeadIM(p)
1513 }
1514 case CommentToken:
1515 p.addChild(&Node{
1516 Type: CommentNode,
1517 Data: p.tok.Data,
1518 })
1519 return true
1520 case DoctypeToken:
1521 // Ignore the token.
1522 return true
1523 case ErrorToken:
1524 return inBodyIM(p)
1525 }
1526
1527 p.fosterParenting = true
1528 defer func() { p.fosterParenting = false }()
1529
1530 return inBodyIM(p)
1531 }
1532
1533 // Section 12.2.6.4.11.
1534 func inCaptionIM(p *parser) bool {
1535 switch p.tok.Type {
1536 case StartTagToken:
1537 switch p.tok.DataAtom {
1538 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1539 if !p.popUntil(tableScope, a.Caption) {
1540 // Ignore the token.
1541 return true
1542 }
1543 p.clearActiveFormattingElements()
1544 p.im = inTableIM
1545 return false
1546 case a.Select:
1547 p.reconstructActiveFormattingElements()
1548 p.addElement()
1549 p.framesetOK = false
1550 p.im = inSelectInTableIM
1551 return true
1552 }
1553 case EndTagToken:
1554 switch p.tok.DataAtom {
1555 case a.Caption:
1556 if p.popUntil(tableScope, a.Caption) {
1557 p.clearActiveFormattingElements()
1558 p.im = inTableIM
1559 }
1560 return true
1561 case a.Table:
1562 if !p.popUntil(tableScope, a.Caption) {
1563 // Ignore the token.
1564 return true
1565 }
1566 p.clearActiveFormattingElements()
1567 p.im = inTableIM
1568 return false
1569 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1570 // Ignore the token.
1571 return true
1572 }
1573 }
1574 return inBodyIM(p)
1575 }
1576
1577 // Section 12.2.6.4.12.
1578 func inColumnGroupIM(p *parser) bool {
1579 switch p.tok.Type {
1580 case TextToken:
1581 s := strings.TrimLeft(p.tok.Data, whitespace)
1582 if len(s) < len(p.tok.Data) {
1583 // Add the initial whitespace to the current node.
1584 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1585 if s == "" {
1586 return true
1587 }
1588 p.tok.Data = s
1589 }
1590 case CommentToken:
1591 p.addChild(&Node{
1592 Type: CommentNode,
1593 Data: p.tok.Data,
1594 })
1595 return true
1596 case DoctypeToken:
1597 // Ignore the token.
1598 return true
1599 case StartTagToken:
1600 switch p.tok.DataAtom {
1601 case a.Html:
1602 return inBodyIM(p)
1603 case a.Col:
1604 p.addElement()
1605 p.oe.pop()
1606 p.acknowledgeSelfClosingTag()
1607 return true
1608 case a.Template:
1609 return inHeadIM(p)
1610 }
1611 case EndTagToken:
1612 switch p.tok.DataAtom {
1613 case a.Colgroup:
1614 if p.oe.top().DataAtom == a.Colgroup {
1615 p.oe.pop()
1616 p.im = inTableIM
1617 }
1618 return true
1619 case a.Col:
1620 // Ignore the token.
1621 return true
1622 case a.Template:
1623 return inHeadIM(p)
1624 }
1625 case ErrorToken:
1626 return inBodyIM(p)
1627 }
1628 if p.oe.top().DataAtom != a.Colgroup {
1629 return true
1630 }
1631 p.oe.pop()
1632 p.im = inTableIM
1633 return false
1634 }
1635
1636 // Section 12.2.6.4.13.
1637 func inTableBodyIM(p *parser) bool {
1638 switch p.tok.Type {
1639 case StartTagToken:
1640 switch p.tok.DataAtom {
1641 case a.Tr:
1642 p.clearStackToContext(tableBodyScope)
1643 p.addElement()
1644 p.im = inRowIM
1645 return true
1646 case a.Td, a.Th:
1647 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1648 return false
1649 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1650 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1651 p.im = inTableIM
1652 return false
1653 }
1654 // Ignore the token.
1655 return true
1656 }
1657 case EndTagToken:
1658 switch p.tok.DataAtom {
1659 case a.Tbody, a.Tfoot, a.Thead:
1660 if p.elementInScope(tableScope, p.tok.DataAtom) {
1661 p.clearStackToContext(tableBodyScope)
1662 p.oe.pop()
1663 p.im = inTableIM
1664 }
1665 return true
1666 case a.Table:
1667 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1668 p.im = inTableIM
1669 return false
1670 }
1671 // Ignore the token.
1672 return true
1673 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1674 // Ignore the token.
1675 return true
1676 }
1677 case CommentToken:
1678 p.addChild(&Node{
1679 Type: CommentNode,
1680 Data: p.tok.Data,
1681 })
1682 return true
1683 }
1684
1685 return inTableIM(p)
1686 }
1687
1688 // Section 13.2.6.4.14.
1689 func inRowIM(p *parser) bool {
1690 switch p.tok.Type {
1691 case StartTagToken:
1692 switch p.tok.DataAtom {
1693 case a.Td, a.Th:
1694 p.clearStackToContext(tableRowScope)
1695 p.addElement()
1696 p.afe = append(p.afe, &scopeMarker)
1697 p.im = inCellIM
1698 return true
1699 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1700 if p.elementInScope(tableScope, a.Tr) {
1701 p.clearStackToContext(tableRowScope)
1702 p.oe.pop()
1703 p.im = inTableBodyIM
1704 return false
1705 }
1706 // Ignore the token.
1707 return true
1708 }
1709 case EndTagToken:
1710 switch p.tok.DataAtom {
1711 case a.Tr:
1712 if p.elementInScope(tableScope, a.Tr) {
1713 p.clearStackToContext(tableRowScope)
1714 p.oe.pop()
1715 p.im = inTableBodyIM
1716 return true
1717 }
1718 // Ignore the token.
1719 return true
1720 case a.Table:
1721 if p.elementInScope(tableScope, a.Tr) {
1722 p.clearStackToContext(tableRowScope)
1723 p.oe.pop()
1724 p.im = inTableBodyIM
1725 return false
1726 }
1727 // Ignore the token.
1728 return true
1729 case a.Tbody, a.Tfoot, a.Thead:
1730 if p.elementInScope(tableScope, p.tok.DataAtom) && p.elementInScope(tableScope, a.Tr) {
1731 p.clearStackToContext(tableRowScope)
1732 p.oe.pop()
1733 p.im = inTableBodyIM
1734 return false
1735 }
1736 // Ignore the token.
1737 return true
1738 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1739 // Ignore the token.
1740 return true
1741 }
1742 }
1743
1744 return inTableIM(p)
1745 }
1746
1747 // Section 12.2.6.4.15.
1748 func inCellIM(p *parser) bool {
1749 switch p.tok.Type {
1750 case StartTagToken:
1751 switch p.tok.DataAtom {
1752 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1753 if p.popUntil(tableScope, a.Td, a.Th) {
1754 // Close the cell and reprocess.
1755 p.clearActiveFormattingElements()
1756 p.im = inRowIM
1757 return false
1758 }
1759 // Ignore the token.
1760 return true
1761 case a.Select:
1762 p.reconstructActiveFormattingElements()
1763 p.addElement()
1764 p.framesetOK = false
1765 p.im = inSelectInTableIM
1766 return true
1767 }
1768 case EndTagToken:
1769 switch p.tok.DataAtom {
1770 case a.Td, a.Th:
1771 if !p.popUntil(tableScope, p.tok.DataAtom) {
1772 // Ignore the token.
1773 return true
1774 }
1775 p.clearActiveFormattingElements()
1776 p.im = inRowIM
1777 return true
1778 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1779 // Ignore the token.
1780 return true
1781 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1782 if !p.elementInScope(tableScope, p.tok.DataAtom) {
1783 // Ignore the token.
1784 return true
1785 }
1786 // Close the cell and reprocess.
1787 if p.popUntil(tableScope, a.Td, a.Th) {
1788 p.clearActiveFormattingElements()
1789 }
1790 p.im = inRowIM
1791 return false
1792 }
1793 }
1794 return inBodyIM(p)
1795 }
1796
1797 // Section 12.2.6.4.16.
1798 func inSelectIM(p *parser) bool {
1799 switch p.tok.Type {
1800 case TextToken:
1801 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1802 case StartTagToken:
1803 switch p.tok.DataAtom {
1804 case a.Html:
1805 return inBodyIM(p)
1806 case a.Option:
1807 if p.top().DataAtom == a.Option {
1808 p.oe.pop()
1809 }
1810 p.addElement()
1811 case a.Optgroup:
1812 if p.top().DataAtom == a.Option {
1813 p.oe.pop()
1814 }
1815 if p.top().DataAtom == a.Optgroup {
1816 p.oe.pop()
1817 }
1818 p.addElement()
1819 case a.Select:
1820 if !p.popUntil(selectScope, a.Select) {
1821 // Ignore the token.
1822 return true
1823 }
1824 p.resetInsertionMode()
1825 case a.Input, a.Keygen, a.Textarea:
1826 if p.elementInScope(selectScope, a.Select) {
1827 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1828 return false
1829 }
1830 // In order to properly ignore <textarea>, we need to change the tokenizer mode.
1831 p.tokenizer.NextIsNotRawText()
1832 // Ignore the token.
1833 return true
1834 case a.Script, a.Template:
1835 return inHeadIM(p)
1836 case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
1837 // Don't let the tokenizer go into raw text mode when there are raw tags
1838 // to be ignored. These tags should be ignored from the tokenizer
1839 // properly.
1840 p.tokenizer.NextIsNotRawText()
1841 // Ignore the token.
1842 return true
1843 }
1844 case EndTagToken:
1845 switch p.tok.DataAtom {
1846 case a.Option:
1847 if p.top().DataAtom == a.Option {
1848 p.oe.pop()
1849 }
1850 case a.Optgroup:
1851 i := len(p.oe) - 1
1852 if p.oe[i].DataAtom == a.Option {
1853 i--
1854 }
1855 if p.oe[i].DataAtom == a.Optgroup {
1856 p.oe = p.oe[:i]
1857 }
1858 case a.Select:
1859 if !p.popUntil(selectScope, a.Select) {
1860 // Ignore the token.
1861 return true
1862 }
1863 p.resetInsertionMode()
1864 case a.Template:
1865 return inHeadIM(p)
1866 }
1867 case CommentToken:
1868 p.addChild(&Node{
1869 Type: CommentNode,
1870 Data: p.tok.Data,
1871 })
1872 case DoctypeToken:
1873 // Ignore the token.
1874 return true
1875 case ErrorToken:
1876 return inBodyIM(p)
1877 }
1878
1879 return true
1880 }
1881
1882 // Section 12.2.6.4.17.
1883 func inSelectInTableIM(p *parser) bool {
1884 switch p.tok.Type {
1885 case StartTagToken, EndTagToken:
1886 switch p.tok.DataAtom {
1887 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1888 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
1889 // Ignore the token.
1890 return true
1891 }
1892 // This is like p.popUntil(selectScope, a.Select), but it also
1893 // matches <math select>, not just <select>. Matching the MathML
1894 // tag is arguably incorrect (conceptually), but it mimics what
1895 // Chromium does.
1896 for i := len(p.oe) - 1; i >= 0; i-- {
1897 if n := p.oe[i]; n.DataAtom == a.Select {
1898 p.oe = p.oe[:i]
1899 break
1900 }
1901 }
1902 p.resetInsertionMode()
1903 return false
1904 }
1905 }
1906 return inSelectIM(p)
1907 }
1908
1909 // Section 12.2.6.4.18.
1910 func inTemplateIM(p *parser) bool {
1911 switch p.tok.Type {
1912 case TextToken, CommentToken, DoctypeToken:
1913 return inBodyIM(p)
1914 case StartTagToken:
1915 switch p.tok.DataAtom {
1916 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1917 return inHeadIM(p)
1918 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1919 p.templateStack.pop()
1920 p.templateStack = append(p.templateStack, inTableIM)
1921 p.im = inTableIM
1922 return false
1923 case a.Col:
1924 p.templateStack.pop()
1925 p.templateStack = append(p.templateStack, inColumnGroupIM)
1926 p.im = inColumnGroupIM
1927 return false
1928 case a.Tr:
1929 p.templateStack.pop()
1930 p.templateStack = append(p.templateStack, inTableBodyIM)
1931 p.im = inTableBodyIM
1932 return false
1933 case a.Td, a.Th:
1934 p.templateStack.pop()
1935 p.templateStack = append(p.templateStack, inRowIM)
1936 p.im = inRowIM
1937 return false
1938 default:
1939 p.templateStack.pop()
1940 p.templateStack = append(p.templateStack, inBodyIM)
1941 p.im = inBodyIM
1942 return false
1943 }
1944 case EndTagToken:
1945 switch p.tok.DataAtom {
1946 case a.Template:
1947 return inHeadIM(p)
1948 default:
1949 // Ignore the token.
1950 return true
1951 }
1952 case ErrorToken:
1953 if !p.oe.contains(a.Template) {
1954 // Ignore the token.
1955 return true
1956 }
1957 // TODO: remove this divergence from the HTML5 spec.
1958 //
1959 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1960 p.generateImpliedEndTags()
1961 for i := len(p.oe) - 1; i >= 0; i-- {
1962 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1963 p.oe = p.oe[:i]
1964 break
1965 }
1966 }
1967 p.clearActiveFormattingElements()
1968 p.templateStack.pop()
1969 p.resetInsertionMode()
1970 return false
1971 }
1972 return false
1973 }
1974
1975 // Section 12.2.6.4.19.
1976 func afterBodyIM(p *parser) bool {
1977 switch p.tok.Type {
1978 case ErrorToken:
1979 // Stop parsing.
1980 return true
1981 case TextToken:
1982 s := strings.TrimLeft(p.tok.Data, whitespace)
1983 if len(s) == 0 {
1984 // It was all whitespace.
1985 return inBodyIM(p)
1986 }
1987 case StartTagToken:
1988 if p.tok.DataAtom == a.Html {
1989 return inBodyIM(p)
1990 }
1991 case EndTagToken:
1992 if p.tok.DataAtom == a.Html {
1993 if !p.fragment {
1994 p.im = afterAfterBodyIM
1995 }
1996 return true
1997 }
1998 case CommentToken:
1999 // The comment is attached to the <html> element.
2000 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
2001 panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
2002 }
2003 p.oe[0].AppendChild(&Node{
2004 Type: CommentNode,
2005 Data: p.tok.Data,
2006 })
2007 return true
2008 }
2009 p.im = inBodyIM
2010 return false
2011 }
2012
2013 // Section 12.2.6.4.20.
2014 func inFramesetIM(p *parser) bool {
2015 switch p.tok.Type {
2016 case CommentToken:
2017 p.addChild(&Node{
2018 Type: CommentNode,
2019 Data: p.tok.Data,
2020 })
2021 case TextToken:
2022 // Ignore all text but whitespace.
2023 s := strings.Map(func(c rune) rune {
2024 switch c {
2025 case ' ', '\t', '\n', '\f', '\r':
2026 return c
2027 }
2028 return -1
2029 }, p.tok.Data)
2030 if s != "" {
2031 p.addText(s)
2032 }
2033 case StartTagToken:
2034 switch p.tok.DataAtom {
2035 case a.Html:
2036 return inBodyIM(p)
2037 case a.Frameset:
2038 p.addElement()
2039 case a.Frame:
2040 p.addElement()
2041 p.oe.pop()
2042 p.acknowledgeSelfClosingTag()
2043 case a.Noframes:
2044 return inHeadIM(p)
2045 }
2046 case EndTagToken:
2047 switch p.tok.DataAtom {
2048 case a.Frameset:
2049 if p.oe.top().DataAtom != a.Html {
2050 p.oe.pop()
2051 if p.oe.top().DataAtom != a.Frameset {
2052 p.im = afterFramesetIM
2053 return true
2054 }
2055 }
2056 }
2057 default:
2058 // Ignore the token.
2059 }
2060 return true
2061 }
2062
2063 // Section 12.2.6.4.21.
2064 func afterFramesetIM(p *parser) bool {
2065 switch p.tok.Type {
2066 case CommentToken:
2067 p.addChild(&Node{
2068 Type: CommentNode,
2069 Data: p.tok.Data,
2070 })
2071 case TextToken:
2072 // Ignore all text but whitespace.
2073 s := strings.Map(func(c rune) rune {
2074 switch c {
2075 case ' ', '\t', '\n', '\f', '\r':
2076 return c
2077 }
2078 return -1
2079 }, p.tok.Data)
2080 if s != "" {
2081 p.addText(s)
2082 }
2083 case StartTagToken:
2084 switch p.tok.DataAtom {
2085 case a.Html:
2086 return inBodyIM(p)
2087 case a.Noframes:
2088 return inHeadIM(p)
2089 }
2090 case EndTagToken:
2091 switch p.tok.DataAtom {
2092 case a.Html:
2093 p.im = afterAfterFramesetIM
2094 return true
2095 }
2096 default:
2097 // Ignore the token.
2098 }
2099 return true
2100 }
2101
2102 // Section 12.2.6.4.22.
2103 func afterAfterBodyIM(p *parser) bool {
2104 switch p.tok.Type {
2105 case ErrorToken:
2106 // Stop parsing.
2107 return true
2108 case TextToken:
2109 s := strings.TrimLeft(p.tok.Data, whitespace)
2110 if len(s) == 0 {
2111 // It was all whitespace.
2112 return inBodyIM(p)
2113 }
2114 case StartTagToken:
2115 if p.tok.DataAtom == a.Html {
2116 return inBodyIM(p)
2117 }
2118 case CommentToken:
2119 p.doc.AppendChild(&Node{
2120 Type: CommentNode,
2121 Data: p.tok.Data,
2122 })
2123 return true
2124 case DoctypeToken:
2125 return inBodyIM(p)
2126 }
2127 p.im = inBodyIM
2128 return false
2129 }
2130
2131 // Section 12.2.6.4.23.
2132 func afterAfterFramesetIM(p *parser) bool {
2133 switch p.tok.Type {
2134 case CommentToken:
2135 p.doc.AppendChild(&Node{
2136 Type: CommentNode,
2137 Data: p.tok.Data,
2138 })
2139 case TextToken:
2140 // Ignore all text but whitespace.
2141 s := strings.Map(func(c rune) rune {
2142 switch c {
2143 case ' ', '\t', '\n', '\f', '\r':
2144 return c
2145 }
2146 return -1
2147 }, p.tok.Data)
2148 if s != "" {
2149 p.tok.Data = s
2150 return inBodyIM(p)
2151 }
2152 case StartTagToken:
2153 switch p.tok.DataAtom {
2154 case a.Html:
2155 return inBodyIM(p)
2156 case a.Noframes:
2157 return inHeadIM(p)
2158 }
2159 case DoctypeToken:
2160 return inBodyIM(p)
2161 default:
2162 // Ignore the token.
2163 }
2164 return true
2165 }
2166
// ignoreTheRemainingTokens has the signature of an insertion mode function.
// It does nothing with the current token and always reports it as consumed.
func ignoreTheRemainingTokens(p *parser) bool {
	return true
}
2170
// whitespaceOrNUL is the whitespace character set extended with the NUL
// character. It is the cutset used when deciding whether a text token in
// foreign content leaves the frameset-ok flag set.
const whitespaceOrNUL = whitespace + "\x00"
2172
2173 // Section 12.2.6.5
2174 func parseForeignContent(p *parser) bool {
2175 switch p.tok.Type {
2176 case TextToken:
2177 if p.framesetOK {
2178 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
2179 }
2180 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2181 p.addText(p.tok.Data)
2182 case CommentToken:
2183 p.addChild(&Node{
2184 Type: CommentNode,
2185 Data: p.tok.Data,
2186 })
2187 case StartTagToken:
2188 if !p.fragment {
2189 b := breakout[p.tok.Data]
2190 if p.tok.DataAtom == a.Font {
2191 loop:
2192 for _, attr := range p.tok.Attr {
2193 switch attr.Key {
2194 case "color", "face", "size":
2195 b = true
2196 break loop
2197 }
2198 }
2199 }
2200 if b {
2201 for i := len(p.oe) - 1; i >= 0; i-- {
2202 n := p.oe[i]
2203 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2204 p.oe = p.oe[:i+1]
2205 break
2206 }
2207 }
2208 return false
2209 }
2210 }
2211 current := p.adjustedCurrentNode()
2212 switch current.Namespace {
2213 case "math":
2214 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2215 case "svg":
2216 // Adjust SVG tag names. The tokenizer lower-cases tag names, but
2217 // SVG wants e.g. "foreignObject" with a capital second "O".
2218 if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2219 p.tok.DataAtom = a.Lookup([]byte(x))
2220 p.tok.Data = x
2221 }
2222 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2223 default:
2224 panic("html: bad parser state: unexpected namespace")
2225 }
2226 adjustForeignAttributes(p.tok.Attr)
2227 namespace := current.Namespace
2228 p.addElement()
2229 p.top().Namespace = namespace
2230 if namespace != "" {
2231 // Don't let the tokenizer go into raw text mode in foreign content
2232 // (e.g. in an SVG <title> tag).
2233 p.tokenizer.NextIsNotRawText()
2234 }
2235 if p.hasSelfClosingToken {
2236 p.oe.pop()
2237 p.acknowledgeSelfClosingTag()
2238 }
2239 case EndTagToken:
2240 if strings.EqualFold(p.oe[len(p.oe)-1].Data, p.tok.Data) {
2241 p.oe = p.oe[:len(p.oe)-1]
2242 return true
2243 }
2244 for i := len(p.oe) - 1; i >= 0; i-- {
2245 if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2246 p.oe = p.oe[:i]
2247 return true
2248 }
2249 if i > 0 && p.oe[i-1].Namespace == "" {
2250 break
2251 }
2252 }
2253 return p.im(p)
2254 default:
2255 // Ignore the token.
2256 }
2257 return true
2258 }
2259
2260 // Section 12.2.4.2.
2261 func (p *parser) adjustedCurrentNode() *Node {
2262 if len(p.oe) == 1 && p.fragment && p.context != nil {
2263 return p.context
2264 }
2265 return p.oe.top()
2266 }
2267
2268 // Section 12.2.6.
2269 func (p *parser) inForeignContent() bool {
2270 if len(p.oe) == 0 {
2271 return false
2272 }
2273 n := p.adjustedCurrentNode()
2274 if n.Namespace == "" {
2275 return false
2276 }
2277 if mathMLTextIntegrationPoint(n) {
2278 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
2279 return false
2280 }
2281 if p.tok.Type == TextToken {
2282 return false
2283 }
2284 }
2285 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
2286 return false
2287 }
2288 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
2289 return false
2290 }
2291 if p.tok.Type == ErrorToken {
2292 return false
2293 }
2294 return true
2295 }
2296
2297 // parseImpliedToken parses a token as though it had appeared in the parser's
2298 // input.
2299 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2300 realToken, selfClosing := p.tok, p.hasSelfClosingToken
2301 p.tok = Token{
2302 Type: t,
2303 DataAtom: dataAtom,
2304 Data: data,
2305 }
2306 p.hasSelfClosingToken = false
2307 p.parseCurrentToken()
2308 p.tok, p.hasSelfClosingToken = realToken, selfClosing
2309 }
2310
2311 // parseCurrentToken runs the current token through the parsing routines
2312 // until it is consumed.
2313 func (p *parser) parseCurrentToken() {
2314 if p.tok.Type == SelfClosingTagToken {
2315 p.hasSelfClosingToken = true
2316 p.tok.Type = StartTagToken
2317 }
2318
2319 consumed := false
2320 for !consumed {
2321 if p.inForeignContent() {
2322 consumed = parseForeignContent(p)
2323 } else {
2324 consumed = p.im(p)
2325 }
2326 }
2327
2328 if p.hasSelfClosingToken {
2329 // This is a parse error, but ignore it.
2330 p.hasSelfClosingToken = false
2331 }
2332 }
2333
2334 func (p *parser) parse() (err error) {
2335 defer func() {
2336 if panicErr := recover(); panicErr != nil {
2337 err = fmt.Errorf("%s", panicErr)
2338 }
2339 }()
2340 // Iterate until EOF. Any other error will cause an early return.
2341 for err != io.EOF {
2342 // CDATA sections are allowed only in foreign content.
2343 n := p.oe.top()
2344 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
2345 // Read and parse the next token.
2346 p.tokenizer.Next()
2347 p.tok = p.tokenizer.Token()
2348 if p.tok.Type == ErrorToken {
2349 err = p.tokenizer.Err()
2350 if err != nil && err != io.EOF {
2351 return err
2352 }
2353 }
2354 p.parseCurrentToken()
2355 }
2356 return nil
2357 }
2358
// Parse returns the parse tree for the HTML from the given Reader.
//
// It implements the HTML5 parsing algorithm
// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
// which is very complicated. The resultant tree can contain implicitly created
// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
// differ from the nesting implied by a naive processing of start and end
// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
// with no corresponding node in the resulting tree.
//
// Parse will reject HTML that is nested deeper than 512 elements.
//
// The input is assumed to be UTF-8 encoded.
//
// Parse is equivalent to calling ParseWithOptions with no options.
func Parse(r io.Reader) (*Node, error) {
	return ParseWithOptions(r)
}
2375
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
//
// It has the same intricacies as Parse.
//
// ParseFragment is equivalent to calling ParseFragmentWithOptions with no
// options.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
	return ParseFragmentWithOptions(r, context)
}
2384
2385 // ParseOption configures a parser.
2386 type ParseOption func(p *parser)
2387
2388 // ParseOptionEnableScripting configures the scripting flag.
2389 // https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
2390 //
2391 // By default, scripting is enabled.
2392 func ParseOptionEnableScripting(enable bool) ParseOption {
2393 return func(p *parser) {
2394 p.scripting = enable
2395 }
2396 }
2397
2398 // ParseWithOptions is like Parse, with options.
2399 func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
2400 p := &parser{
2401 tokenizer: NewTokenizer(r),
2402 doc: &Node{
2403 Type: DocumentNode,
2404 },
2405 scripting: true,
2406 framesetOK: true,
2407 im: initialIM,
2408 }
2409
2410 for _, f := range opts {
2411 f(p)
2412 }
2413
2414 if err := p.parse(); err != nil {
2415 return nil, err
2416 }
2417 return p.doc, nil
2418 }
2419
2420 // ParseFragmentWithOptions is like ParseFragment, with options.
2421 func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
2422 contextTag := ""
2423 if context != nil {
2424 if context.Type != ElementNode {
2425 return nil, errors.New("html: ParseFragment of non-element Node")
2426 }
2427 // The next check isn't just context.DataAtom.String() == context.Data because
2428 // it is valid to pass an element whose tag isn't a known atom. For example,
2429 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2430 if context.DataAtom != a.Lookup([]byte(context.Data)) {
2431 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2432 }
2433 contextTag = context.DataAtom.String()
2434 }
2435 p := &parser{
2436 doc: &Node{
2437 Type: DocumentNode,
2438 },
2439 scripting: true,
2440 fragment: true,
2441 context: context,
2442 }
2443 if context != nil && context.Namespace != "" {
2444 p.tokenizer = NewTokenizer(r)
2445 } else {
2446 p.tokenizer = NewTokenizerFragment(r, contextTag)
2447 }
2448
2449 for _, f := range opts {
2450 f(p)
2451 }
2452
2453 root := &Node{
2454 Type: ElementNode,
2455 DataAtom: a.Html,
2456 Data: a.Html.String(),
2457 }
2458 p.doc.AppendChild(root)
2459 p.oe = nodeStack{root}
2460 if context != nil && context.DataAtom == a.Template {
2461 p.templateStack = append(p.templateStack, inTemplateIM)
2462 }
2463 p.resetInsertionMode()
2464
2465 for n := context; n != nil; n = n.Parent {
2466 if n.Type == ElementNode && n.DataAtom == a.Form {
2467 p.form = n
2468 break
2469 }
2470 }
2471
2472 if err := p.parse(); err != nil {
2473 return nil, err
2474 }
2475
2476 parent := p.doc
2477 if context != nil {
2478 parent = root
2479 }
2480
2481 var result []*Node
2482 for c := parent.FirstChild; c != nil; {
2483 next := c.NextSibling
2484 parent.RemoveChild(c)
2485 result = append(result, c)
2486 c = next
2487 }
2488 return result, nil
2489 }
2490