HtmlTreeBuilderState.java
package org.jsoup.parser;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.Range;
import org.jspecify.annotations.Nullable;
import java.util.ArrayList;
import static org.jsoup.internal.StringUtil.inSorted;
import static org.jsoup.parser.HtmlTreeBuilder.isSpecial;
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.*;
/**
* The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states.
*/
enum HtmlTreeBuilderState {
Initial {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
return true; // ignore whitespace until we get the first content
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype()) {
// todo: parse error check on expected doctypes
Token.Doctype d = t.asDoctype();
DocumentType doctype = new DocumentType(
tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier());
doctype.setPubSysKey(d.getPubSysKey());
tb.getDocument().appendChild(doctype);
tb.onNodeInserted(doctype);
// todo: quirk state check on more doctype ids, if deemed useful (most are ancient legacy and presumably irrelevant)
if (d.isForceQuirks() || !doctype.name().equals("html") || doctype.publicId().equalsIgnoreCase("HTML"))
tb.getDocument().quirksMode(Document.QuirksMode.quirks);
tb.transition(BeforeHtml);
} else {
// todo: check not iframe srcdoc
tb.getDocument().quirksMode(Document.QuirksMode.quirks); // missing doctype
tb.transition(BeforeHtml);
return tb.process(t); // re-process token
}
return true;
}
},
BeforeHtml {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter()); // out of spec - include whitespace
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
tb.insertElementFor(t.asStartTag());
tb.transition(BeforeHead);
} else if (t.isEndTag() && (inSorted(t.asEndTag().normalName(), BeforeHtmlToHead))) {
return anythingElse(t, tb);
} else if (t.isEndTag()) {
tb.error(this);
return false;
} else {
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
tb.processStartTag("html");
tb.transition(BeforeHead);
return tb.process(t);
}
},
BeforeHead {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter()); // out of spec - include whitespace
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
return InBody.process(t, tb); // does not transition
} else if (t.isStartTag() && t.asStartTag().normalName().equals("head")) {
Element head = tb.insertElementFor(t.asStartTag());
tb.setHeadElement(head);
tb.transition(InHead);
} else if (t.isEndTag() && (inSorted(t.asEndTag().normalName(), BeforeHtmlToHead))) {
tb.processStartTag("head");
return tb.process(t);
} else if (t.isEndTag()) {
tb.error(this);
return false;
} else {
tb.processStartTag("head");
return tb.process(t);
}
return true;
}
},
InHead {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter()); // out of spec - include whitespace
return true;
}
final String name;
switch (t.type) {
case Comment:
tb.insertCommentNode(t.asComment());
break;
case Doctype:
tb.error(this);
return false;
case StartTag:
Token.StartTag start = t.asStartTag();
name = start.normalName();
if (name.equals("html")) {
return InBody.process(t, tb);
} else if (inSorted(name, InHeadEmpty)) {
Element el = tb.insertEmptyElementFor(start);
// jsoup special: update base the first time it is seen
if (name.equals("base") && el.hasAttr("href"))
tb.maybeSetBaseUri(el);
} else if (name.equals("meta")) {
tb.insertEmptyElementFor(start);
} else if (name.equals("title")) {
HandleTextState(start, tb, tb.tagFor(start).textState());
} else if (inSorted(name, InHeadRaw)) {
HandleTextState(start, tb, tb.tagFor(start).textState());
} else if (name.equals("noscript")) {
// else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript)
tb.insertElementFor(start);
tb.transition(InHeadNoscript);
} else if (name.equals("script")) {
// skips some script rules as won't execute them
tb.tokeniser.transition(TokeniserState.ScriptData);
tb.markInsertionMode();
tb.transition(Text);
tb.insertElementFor(start);
} else if (name.equals("head")) {
tb.error(this);
return false;
} else if (name.equals("template")) {
tb.insertElementFor(start);
tb.insertMarkerToFormattingElements();
tb.framesetOk(false);
tb.transition(InTemplate);
tb.pushTemplateMode(InTemplate);
} else {
return anythingElse(t, tb);
}
break;
case EndTag:
Token.EndTag end = t.asEndTag();
name = end.normalName();
if (name.equals("head")) {
tb.pop();
tb.transition(AfterHead);
} else if (inSorted(name, Constants.InHeadEnd)) {
return anythingElse(t, tb);
} else if (name.equals("template")) {
if (!tb.onStack(name)) {
tb.error(this);
} else {
tb.generateImpliedEndTags(true);
if (!tb.currentElementIs(name)) tb.error(this);
tb.popStackToClose(name);
tb.clearFormattingElementsToLastMarker();
tb.popTemplateMode();
tb.resetInsertionMode();
}
}
else {
tb.error(this);
return false;
}
break;
default:
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, TreeBuilder tb) {
tb.processEndTag("head");
return tb.process(t);
}
},
InHeadNoscript {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isDoctype()) {
tb.error(this);
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
return tb.process(t, InBody);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("noscript")) {
tb.pop();
tb.transition(InHead);
} else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && inSorted(t.asStartTag().normalName(),
InHeadNoScriptHead))) {
return tb.process(t, InHead);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("br")) {
return anythingElse(t, tb);
} else if ((t.isStartTag() && inSorted(t.asStartTag().normalName(), InHeadNoscriptIgnore)) || t.isEndTag()) {
tb.error(this);
return false;
} else {
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
// note that this deviates from spec, which is to pop out of noscript and reprocess in head:
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
// allows content to be inserted as data
tb.error(this);
tb.insertCharacterNode(new Token.Character().data(t.toString()));
return true;
}
},
AfterHead {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter());
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype()) {
tb.error(this);
} else if (t.isStartTag()) {
Token.StartTag startTag = t.asStartTag();
String name = startTag.normalName();
if (name.equals("html")) {
return tb.process(t, InBody);
} else if (name.equals("body")) {
tb.insertElementFor(startTag);
tb.framesetOk(false);
tb.transition(InBody);
} else if (name.equals("frameset")) {
tb.insertElementFor(startTag);
tb.transition(InFrameset);
} else if (inSorted(name, InBodyStartToHead)) {
tb.error(this);
Element head = tb.getHeadElement();
tb.push(head);
tb.process(t, InHead);
tb.removeFromStack(head);
} else if (name.equals("head")) {
tb.error(this);
return false;
} else {
anythingElse(t, tb);
}
} else if (t.isEndTag()) {
String name = t.asEndTag().normalName();
if (inSorted(name, AfterHeadBody)) {
anythingElse(t, tb);
} else if (name.equals("template")) {
tb.process(t, InHead);
}
else {
tb.error(this);
return false;
}
} else {
anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
tb.processStartTag("body");
tb.framesetOk(true);
return tb.process(t);
}
},
InBody {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
switch (t.type) {
case Character: {
Token.Character c = t.asCharacter();
if (c.getData().equals(nullString)) {
tb.error(this);
return false;
} else if (tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed
tb.reconstructFormattingElements();
tb.insertCharacterNode(c);
} else {
tb.reconstructFormattingElements();
tb.insertCharacterNode(c);
tb.framesetOk(false);
}
break;
}
case Comment: {
tb.insertCommentNode(t.asComment());
break;
}
case Doctype: {
tb.error(this);
return false;
}
case StartTag:
return inBodyStartTag(t, tb);
case EndTag:
return inBodyEndTag(t, tb);
case EOF:
if (tb.templateModeSize() > 0)
return tb.process(t, InTemplate);
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
// stop parsing
break;
default:
Validate.wtf("Unexpected state: " + t.type); // XmlDecl only in XmlTreeBuilder
}
return true;
}
private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
final Token.StartTag startTag = t.asStartTag();
final String name = startTag.normalName();
final ArrayList<Element> stack;
Element el;
switch (name) {
case "a":
if (tb.getActiveFormattingElement("a") != null) {
tb.error(this);
tb.processEndTag("a");
// still on stack?
Element remainingA = tb.getFromStack("a");
if (remainingA != null) {
tb.removeFromActiveFormattingElements(remainingA);
tb.removeFromStack(remainingA);
}
}
tb.reconstructFormattingElements();
el = tb.insertElementFor(startTag);
tb.pushActiveFormattingElements(el);
break;
case "span":
// same as final else, but short circuits lots of checks
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
break;
case "li":
tb.framesetOk(false);
stack = tb.getStack();
for (int i = stack.size() - 1; i > 0; i--) {
el = stack.get(i);
if (el.nameIs("li")) {
tb.processEndTag("li");
break;
}
if (isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
break;
}
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertElementFor(startTag);
break;
case "html":
tb.error(this);
if (tb.onStack("template")) return false; // ignore
// otherwise, merge attributes onto real html (if present)
stack = tb.getStack();
if (stack.size() > 0) {
Element html = tb.getStack().get(0);
mergeAttributes(startTag, html);
}
break;
case "body":
tb.error(this);
stack = tb.getStack();
if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nameIs("body")) || tb.onStack("template")) {
// only in fragment case
return false; // ignore
} else {
tb.framesetOk(false);
// will be on stack if this is a nested body. won't be if closed (which is a variance from spec, which leaves it on)
Element body = tb.getFromStack("body");
if (body != null) mergeAttributes(startTag, body);
}
break;
case "frameset":
tb.error(this);
stack = tb.getStack();
if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nameIs("body"))) {
// only in fragment case
return false; // ignore
} else if (!tb.framesetOk()) {
return false; // ignore frameset
} else {
Element second = stack.get(1);
if (second.parent() != null)
second.remove();
// pop up to html element
while (stack.size() > 1)
stack.remove(stack.size() - 1);
tb.insertElementFor(startTag);
tb.transition(InFrameset);
}
break;
case "form":
if (tb.getFormElement() != null && !tb.onStack("template")) {
tb.error(this);
return false;
}
if (tb.inButtonScope("p")) {
tb.closeElement("p");
}
tb.insertFormElement(startTag, true, true); // won't associate to any template
break;
case "plaintext":
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertElementFor(startTag);
tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out
break;
case "button":
if (tb.inButtonScope("button")) {
// close and reprocess
tb.error(this);
tb.processEndTag("button");
tb.process(startTag);
} else {
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
tb.framesetOk(false);
}
break;
case "nobr":
tb.reconstructFormattingElements();
if (tb.inScope("nobr")) {
tb.error(this);
tb.processEndTag("nobr");
tb.reconstructFormattingElements();
}
el = tb.insertElementFor(startTag);
tb.pushActiveFormattingElements(el);
break;
case "table":
if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertElementFor(startTag);
tb.framesetOk(false);
tb.transition(InTable);
break;
case "input":
tb.reconstructFormattingElements();
el = tb.insertEmptyElementFor(startTag);
if (!el.attr("type").equalsIgnoreCase("hidden"))
tb.framesetOk(false);
break;
case "hr":
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertEmptyElementFor(startTag);
tb.framesetOk(false);
break;
case "image":
if (tb.getFromStack("svg") == null)
return tb.process(startTag.name("img")); // change <image> to <img>, unless in svg
else
tb.insertElementFor(startTag);
break;
case "textarea":
tb.framesetOk(false);
HandleTextState(startTag, tb, tb.tagFor(startTag).textState());
break;
case "xmp":
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.reconstructFormattingElements();
tb.framesetOk(false);
HandleTextState(startTag, tb, tb.tagFor(startTag).textState());
break;
case "iframe":
tb.framesetOk(false);
HandleTextState(startTag, tb, tb.tagFor(startTag).textState());
break;
case "noembed":
// also handle noscript if script enabled
HandleTextState(startTag, tb, tb.tagFor(startTag).textState());
break;
case "select":
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
tb.framesetOk(false);
if (startTag.selfClosing) break; // don't change states if not added to the stack
HtmlTreeBuilderState state = tb.state();
if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell))
tb.transition(InSelectInTable);
else
tb.transition(InSelect);
break;
case "math":
tb.reconstructFormattingElements();
tb.insertForeignElementFor(startTag, Parser.NamespaceMathml);
break;
case "svg":
tb.reconstructFormattingElements();
tb.insertForeignElementFor(startTag, Parser.NamespaceSvg);
break;
// static final String[] Headings = new String[]{"h1", "h2", "h3", "h4", "h5", "h6"};
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
if (inSorted(tb.currentElement().normalName(), Constants.Headings)) {
tb.error(this);
tb.pop();
}
tb.insertElementFor(startTag);
break;
// static final String[] InBodyStartPreListing = new String[]{"listing", "pre"};
case "pre":
case "listing":
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertElementFor(startTag);
tb.reader.matchConsume("\n"); // ignore LF if next token
tb.framesetOk(false);
break;
// static final String[] DdDt = new String[]{"dd", "dt"};
case "dd":
case "dt":
tb.framesetOk(false);
stack = tb.getStack();
final int bottom = stack.size() - 1;
final int upper = bottom >= MaxStackScan ? bottom - MaxStackScan : 0;
for (int i = bottom; i >= upper; i--) {
el = stack.get(i);
if (inSorted(el.normalName(), Constants.DdDt)) {
tb.processEndTag(el.normalName());
break;
}
if (isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
break;
}
if (tb.inButtonScope("p")) {
tb.processEndTag("p");
}
tb.insertElementFor(startTag);
break;
case "optgroup":
case "option":
if (tb.currentElementIs("option"))
tb.processEndTag("option");
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
break;
case "rb":
case "rtc":
if (tb.inScope("ruby")) {
tb.generateImpliedEndTags();
if (!tb.currentElementIs("ruby"))
tb.error(this);
}
tb.insertElementFor(startTag);
break;
case "rp":
case "rt":
if (tb.inScope("ruby")) {
tb.generateImpliedEndTags("rtc");
if (!tb.currentElementIs("rtc") && !tb.currentElementIs("ruby"))
tb.error(this);
}
tb.insertElementFor(startTag);
break;
// InBodyStartEmptyFormatters:
case "area":
case "br":
case "embed":
case "img":
case "keygen":
case "wbr":
tb.reconstructFormattingElements();
tb.insertEmptyElementFor(startTag);
tb.framesetOk(false);
break;
// Formatters:
case "b":
case "big":
case "code":
case "em":
case "font":
case "i":
case "s":
case "small":
case "strike":
case "strong":
case "tt":
case "u":
tb.reconstructFormattingElements();
el = tb.insertElementFor(startTag);
tb.pushActiveFormattingElements(el);
break;
default:
Tag tag = tb.tagFor(startTag);
TokeniserState textState = tag.textState();
if (textState != null) { // custom rcdata or rawtext (if we were in head, will have auto-transitioned here)
HandleTextState(startTag, tb, textState);
} else if (!tag.isKnownTag()) { // no other special rules for custom tags
tb.insertElementFor(startTag);
} else if (inSorted(name, Constants.InBodyStartPClosers)) {
if (tb.inButtonScope("p")) tb.processEndTag("p");
tb.insertElementFor(startTag);
} else if (inSorted(name, Constants.InBodyStartToHead)) {
return tb.process(t, InHead);
} else if (inSorted(name, Constants.InBodyStartApplets)) {
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
tb.insertMarkerToFormattingElements();
tb.framesetOk(false);
} else if (inSorted(name, Constants.InBodyStartMedia)) {
tb.insertEmptyElementFor(startTag);
} else if (inSorted(name, Constants.InBodyStartDrop)) {
tb.error(this);
return false;
} else {
tb.reconstructFormattingElements();
tb.insertElementFor(startTag);
}
}
return true;
}
private static final int MaxStackScan = 24; // used for DD / DT scan, prevents runaway
private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
final Token.EndTag endTag = t.asEndTag();
final String name = endTag.normalName();
switch (name) {
case "template":
tb.process(t, InHead);
break;
case "sarcasm": // *sigh*
case "span":
// same as final fall through, but saves short circuit
return anyOtherEndTag(t, tb);
case "li":
if (!tb.inListItemScope(name)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
}
break;
case "body":
if (!tb.inScope("body")) {
tb.error(this);
return false;
} else {
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
tb.trackNodePosition(tb.getFromStack("body"), false); // track source position of close; body is left on stack, in case of trailers
tb.transition(AfterBody);
}
break;
case "html":
if (!tb.onStack("body")) {
tb.error(this);
return false; // ignore
} else {
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
tb.transition(AfterBody);
return tb.process(t); // re-process
}
case "form":
if (!tb.onStack("template")) {
Element currentForm = tb.getFormElement();
tb.setFormElement(null);
if (currentForm == null || !tb.inScope(name)) {
tb.error(this);
return false;
}
tb.generateImpliedEndTags();
if (!tb.currentElementIs(name))
tb.error(this);
// remove currentForm from stack. will shift anything under up.
tb.removeFromStack(currentForm);
} else { // template on stack
if (!tb.inScope(name)) {
tb.error(this);
return false;
}
tb.generateImpliedEndTags();
if (!tb.currentElementIs(name)) tb.error(this);
tb.popStackToClose(name);
}
break;
case "p":
if (!tb.inButtonScope(name)) {
tb.error(this);
tb.processStartTag(name); // if no p to close, creates an empty <p></p>
return tb.process(endTag);
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
}
break;
case "dd":
case "dt":
if (!tb.inScope(name)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
}
break;
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
if (!tb.inScope(Constants.Headings)) {
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags(name);
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(Constants.Headings);
}
break;
case "br":
tb.error(this);
tb.processStartTag("br");
return false;
default:
// todo - move rest to switch if desired
if (inSorted(name, Constants.InBodyEndAdoptionFormatters)) {
return inBodyEndTagAdoption(t, tb);
} else if (inSorted(name, Constants.InBodyEndClosers)) {
if (!tb.inScope(name)) {
// nothing to close
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags();
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
}
} else if (inSorted(name, Constants.InBodyStartApplets)) {
if (!tb.inScope("name")) {
if (!tb.inScope(name)) {
tb.error(this);
return false;
}
tb.generateImpliedEndTags();
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
tb.clearFormattingElementsToLastMarker();
}
} else {
return anyOtherEndTag(t, tb);
}
}
return true;
}
boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
final String name = t.asEndTag().normalName; // case insensitive search - goal is to preserve output case, not for the parse to be case sensitive
final ArrayList<Element> stack = tb.getStack();
// deviate from spec slightly to speed when super deeply nested
Element elFromStack = tb.getFromStack(name);
if (elFromStack == null) {
tb.error(this);
return false;
}
for (int pos = stack.size() - 1; pos >= 0; pos--) {
Element node = stack.get(pos);
if (node.nameIs(name)) {
tb.generateImpliedEndTags(name);
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
break;
} else {
if (isSpecial(node)) {
tb.error(this);
return false;
}
}
}
return true;
}
private boolean inBodyEndTagAdoption(Token t, HtmlTreeBuilder tb) {
// https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm
// JH: Including the spec notes here to simplify tracking / correcting. It's a bit gnarly and there may still be some nuances I haven't caught. But test cases and comparisons to browsers check out.
// The adoption agency algorithm, which takes as its only argument a token token for which the algorithm is being run, consists of the following steps:
final Token.EndTag endTag = t.asEndTag();
final String subject = endTag.normalName; // 1. Let subject be token's tag name.
// 2. If the [current node] is an [HTML element] whose tag name is subject, and the [current node] is not in the [list of active formatting elements], then pop the [current node] off the [stack of open elements] and return.
if (tb.currentElement().normalName().equals(subject) && !tb.isInActiveFormattingElements(tb.currentElement())) {
tb.pop();
return true;
}
int outer = 0; // 3. Let outerLoopCounter be 0.
while (true) { // 4. While true:
if (outer >= 8) { // 1. If outerLoopCounter is greater than or equal to 8, then return.
return true;
}
outer++; // 2. Increment outerLoopCounter by 1.
// 3. Let formattingElement be the last element in the [list of active formatting elements] that:
// - is between the end of the list and the last [marker] in the list, if any, or the start of the list otherwise, and
// - has the tag name subject.
// If there is no such element, then return and instead act as described in the "any other end tag" entry above.
Element formatEl = null;
for (int i = tb.formattingElements.size() - 1; i >= 0; i--) {
Element next = tb.formattingElements.get(i);
if (next == null) // marker
break;
if (next.normalName().equals(subject)) {
formatEl = next;
break;
}
}
if (formatEl == null) {
return anyOtherEndTag(t, tb);
}
// 4. If formattingElement is not in the [stack of open elements], then this is a [parse error]; remove the element from the list, and return.
if (!tb.onStack(formatEl)) {
tb.error(this);
tb.removeFromActiveFormattingElements(formatEl);
return true;
}
// 5. If formattingElement is in the [stack of open elements], but the element is not [in scope], then this is a [parse error]; return.
if (!tb.inScope(formatEl.normalName())) {
tb.error(this);
return false;
} else if (tb.currentElement() != formatEl) { // 6. If formattingElement is not the [current node], this is a [parse error].
tb.error(this);
}
// 7. Let furthestBlock be the topmost node in the [stack of open elements] that is lower in the stack than formattingElement, and is an element in the [special]category. There might not be one.
Element furthestBlock = null;
ArrayList<Element> stack = tb.getStack();
int fei = stack.lastIndexOf(formatEl);
if (fei != -1) { // look down the stack
for (int i = fei + 1; i < stack.size(); i++) {
Element el = stack.get(i);
if (isSpecial(el)) {
furthestBlock = el;
break;
}
}
}
// 8. If there is no furthestBlock, then the UA must first pop all the nodes from the bottom of the [stack of open elements], from the [current node] up to and including formattingElement, then remove formattingElement from the [list of active formatting elements], and finally return.
if (furthestBlock == null) {
while (tb.currentElement() != formatEl) {
tb.pop();
}
tb.pop();
tb.removeFromActiveFormattingElements(formatEl);
return true;
}
Element commonAncestor = tb.aboveOnStack(formatEl); // 9. Let commonAncestor be the element immediately above formattingElement in the [stack of open elements].
if (commonAncestor == null) { tb.error(this); return true; } // Would be a WTF
// 10. Let a bookmark note the position of formattingElement in the [list of active formatting elements] relative to the elements on either side of it in the list.
// JH - I think this means its index? Or do we need a linked list?
int bookmark = tb.positionOfElement(formatEl);
Element el = furthestBlock; // 11. Let node and lastNode be furthestBlock.
Element lastEl = furthestBlock;
int inner = 0; // 12. Let innerLoopCounter be 0.
while (true) { // 13. While true:
inner++; // 1. Increment innerLoopCounter by 1.
// 2. Let node be the element immediately above node in the [stack of open elements], or if node is no longer in the [stack of open elements] , the element that was immediately above node in the [stack of open elements] before node was removed.
if (!tb.onStack(el)) {
// if node was removed from stack, use the element that was above it
el = el.parent(); // JH - is there a situation where it's not the parent?
} else {
el = tb.aboveOnStack(el);
}
if (el == null || el.nameIs("body")) {
tb.error(this); // shouldn't be able to hit
break;
}
// 3. If node is formattingElement, then [break].
if (el == formatEl) {
break;
}
// 4. If innerLoopCounter is greater than 3 and node is in the [list of active formatting elements], then remove node from the [list of active formatting elements].
if (inner > 3 && tb.isInActiveFormattingElements(el)) {
tb.removeFromActiveFormattingElements(el);
break;
}
// 5. If node is not in the [list of active formatting elements], then remove node from the [stack of open elements] and [continue].
if (!tb.isInActiveFormattingElements(el)) {
tb.removeFromStack(el);
continue;
}
// 6. [Create an element for the token] for which the element node was created, in the [HTML namespace], with commonAncestor as the intended parent; replace the entry for node in the [list of active formatting elements] with an entry for the new element, replace the entry for node in the [stack of open elements] with an entry for the new element, and let node be the new element.
if (!tb.onStack(el)) { // stale formatting element; cannot adopt/replace
tb.error(this);
tb.removeFromActiveFormattingElements(el);
break; // exit inner loop; proceed with step 14 using current lastEl
}
Element replacement = new Element(tb.tagFor(el.nodeName(), el.normalName(), tb.defaultNamespace(), ParseSettings.preserveCase), tb.getBaseUri());
tb.replaceActiveFormattingElement(el, replacement);
tb.replaceOnStack(el, replacement);
el = replacement;
// 7. If lastNode is furthestBlock, then move the aforementioned bookmark to be immediately after the new node in the [list of active formatting elements].
if (lastEl == furthestBlock) {
bookmark = tb.positionOfElement(el) + 1;
}
el.appendChild(lastEl); // 8. [Append] lastNode to node.
lastEl = el; // 9. Set lastNode to node.
} // end inner loop # 13
// 14. Insert whatever lastNode ended up being in the previous step at the [appropriate place for inserting a node], but using commonAncestor as the _override target_.
// todo - impl https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node fostering
// just use commonAncestor as target:
commonAncestor.appendChild(lastEl);
// 15. [Create an element for the token] for which formattingElement was created, in the [HTML namespace], with furthestBlock as the intended parent.
Element adoptor = new Element(formatEl.tag(), tb.getBaseUri());
adoptor.attributes().addAll(formatEl.attributes()); // also attributes
// 16. Take all of the child nodes of furthestBlock and append them to the element created in the last step.
for (Node child : furthestBlock.childNodes()) {
adoptor.appendChild(child);
}
furthestBlock.appendChild(adoptor); // 17. Append that new element to furthestBlock.
// 18. Remove formattingElement from the [list of active formatting elements], and insert the new element into the [list of active formatting elements] at the position of the aforementioned bookmark.
tb.removeFromActiveFormattingElements(formatEl);
tb.pushWithBookmark(adoptor, bookmark);
// 19. Remove formattingElement from the [stack of open elements], and insert the new element into the [stack of open elements] immediately below the position of furthestBlock in that stack.
tb.removeFromStack(formatEl);
tb.insertOnStackAfter(furthestBlock, adoptor);
} // end of outer loop # 4
}
},
Text {
// in script, style etc. normally treated as data tags
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isCharacter()) {
tb.insertCharacterNode(t.asCharacter());
} else if (t.isEOF()) {
tb.error(this);
// if current node is script: already started
tb.pop();
tb.transition(tb.originalState());
if (tb.state() == Text) // stack is such that we couldn't transition out; just close
tb.transition(InBody);
return tb.process(t);
} else if (t.isEndTag()) {
// if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts
tb.pop();
tb.transition(tb.originalState());
}
return true;
}
},
InTable {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isCharacter() && inSorted(tb.currentElement().normalName(), InTableFoster)) {
tb.resetPendingTableCharacters();
tb.markInsertionMode();
tb.transition(InTableText);
return tb.process(t);
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
return true;
} else if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isStartTag()) {
Token.StartTag startTag = t.asStartTag();
String name = startTag.normalName();
if (name.equals("caption")) {
tb.clearStackToTableContext();
tb.insertMarkerToFormattingElements();
tb.insertElementFor(startTag);
tb.transition(InCaption);
} else if (name.equals("colgroup")) {
tb.clearStackToTableContext();
tb.insertElementFor(startTag);
tb.transition(InColumnGroup);
} else if (name.equals("col")) {
tb.clearStackToTableContext();
tb.processStartTag("colgroup");
return tb.process(t);
} else if (inSorted(name, InTableToBody)) {
tb.clearStackToTableContext();
tb.insertElementFor(startTag);
tb.transition(InTableBody);
} else if (inSorted(name, InTableAddBody)) {
tb.clearStackToTableContext();
tb.processStartTag("tbody");
return tb.process(t);
} else if (name.equals("table")) {
tb.error(this);
if (!tb.inTableScope(name)) { // ignore it
return false;
} else {
tb.popStackToClose(name);
if (!tb.resetInsertionMode()) {
// not per spec - but haven't transitioned out of table. so try something else
tb.insertElementFor(startTag);
return true;
}
return tb.process(t);
}
} else if (inSorted(name, InTableToHead)) {
return tb.process(t, InHead);
} else if (name.equals("input")) {
if (!(startTag.hasAttributes() && startTag.attributes.get("type").equalsIgnoreCase("hidden"))) {
return anythingElse(t, tb);
} else {
tb.insertEmptyElementFor(startTag);
}
} else if (name.equals("form")) {
tb.error(this);
if (tb.getFormElement() != null || tb.onStack("template"))
return false;
else {
tb.insertFormElement(startTag, false, false); // not added to stack. can associate to template
}
} else {
return anythingElse(t, tb);
}
return true; // todo: check if should return processed http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intable
} else if (t.isEndTag()) {
Token.EndTag endTag = t.asEndTag();
String name = endTag.normalName();
if (name.equals("table")) {
if (!tb.inTableScope(name)) {
tb.error(this);
return false;
} else {
tb.popStackToClose("table");
tb.resetInsertionMode();
}
} else if (inSorted(name, InTableEndErr)) {
tb.error(this);
return false;
} else if (name.equals("template")) {
tb.process(t, InHead);
} else {
return anythingElse(t, tb);
}
return true; // todo: as above todo
} else if (t.isEOF()) {
if (tb.currentElementIs("html"))
tb.error(this);
return true; // stops parsing
}
return anythingElse(t, tb);
}
boolean anythingElse(Token t, HtmlTreeBuilder tb) {
tb.error(this);
tb.setFosterInserts(true);
tb.process(t, InBody);
tb.setFosterInserts(false);
return true;
}
},
InTableText {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.type == Token.TokenType.Character) {
Token.Character c = t.asCharacter();
if (c.getData().equals(nullString)) {
tb.error(this);
return false;
} else {
tb.addPendingTableCharacters(c);
}
} else {
// insert gathered table text into the correct element:
if (tb.getPendingTableCharacters().size() > 0) {
final Token og = tb.currentToken; // update current token, so we can track cursor pos correctly
for (Token.Character c : tb.getPendingTableCharacters()) {
tb.currentToken = c;
if (!isWhitespace(c)) {
// InTable anything else section:
tb.error(this);
if (inSorted(tb.currentElement().normalName(), InTableFoster)) {
tb.setFosterInserts(true);
tb.process(c, InBody);
tb.setFosterInserts(false);
} else {
tb.process(c, InBody);
}
} else
tb.insertCharacterNode(c);
}
tb.currentToken = og;
tb.resetPendingTableCharacters();
}
tb.transition(tb.originalState());
return tb.process(t);
}
return true;
}
},
InCaption {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isEndTag() && t.asEndTag().normalName().equals("caption")) {
if (!tb.inTableScope("caption")) { // fragment case
tb.error(this);
return false;
} else {
tb.generateImpliedEndTags();
if (!tb.currentElementIs("caption")) tb.error(this);
tb.popStackToClose("caption");
tb.clearFormattingElementsToLastMarker();
tb.transition(InTable);
}
} else if ((
t.isStartTag() && inSorted(t.asStartTag().normalName(), InCellCol) ||
t.isEndTag() && t.asEndTag().normalName().equals("table"))
) {
// same as above but processes after transition
if (!tb.inTableScope("caption")) { // fragment case
tb.error(this);
return false;
}
tb.generateImpliedEndTags(false);
if (!tb.currentElementIs("caption")) tb.error(this);
tb.popStackToClose("caption");
tb.clearFormattingElementsToLastMarker();
tb.transition(InTable);
InTable.process(t, tb); // doesn't check foreign context
} else if (t.isEndTag() && inSorted(t.asEndTag().normalName(), InCaptionIgnore)) {
tb.error(this);
return false;
} else {
return tb.process(t, InBody);
}
return true;
}
},
InColumnGroup {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter());
return true;
}
switch (t.type) {
case Comment:
tb.insertCommentNode(t.asComment());
break;
case Doctype:
tb.error(this);
break;
case StartTag:
Token.StartTag startTag = t.asStartTag();
switch (startTag.normalName()) {
case "html":
return tb.process(t, InBody);
case "col":
tb.insertEmptyElementFor(startTag);
break;
case "template":
tb.process(t, InHead);
break;
default:
return anythingElse(t, tb);
}
break;
case EndTag:
Token.EndTag endTag = t.asEndTag();
String name = endTag.normalName();
switch (name) {
case "colgroup":
if (!tb.currentElementIs(name)) {
tb.error(this);
return false;
} else {
tb.pop();
tb.transition(InTable);
}
break;
case "template":
tb.process(t, InHead);
break;
default:
return anythingElse(t, tb);
}
break;
case EOF:
if (tb.currentElementIs("html"))
return true; // stop parsing; frag case
else
return anythingElse(t, tb);
default:
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
if (!tb.currentElementIs("colgroup")) {
tb.error(this);
return false;
}
tb.pop();
tb.transition(InTable);
tb.process(t);
return true;
}
},
InTableBody {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
final String name;
switch (t.type) {
case StartTag:
Token.StartTag startTag = t.asStartTag();
name = startTag.normalName();
if (name.equals("tr")) {
tb.clearStackToTableBodyContext();
tb.insertElementFor(startTag);
tb.transition(InRow);
} else if (inSorted(name, InCellNames)) {
tb.error(this);
tb.processStartTag("tr");
return tb.process(startTag);
} else if (inSorted(name, InTableBodyExit)) {
return exitTableBody(t, tb);
} else
return anythingElse(t, tb);
break;
case EndTag:
Token.EndTag endTag = t.asEndTag();
name = endTag.normalName();
if (inSorted(name, InTableEndIgnore)) {
if (!tb.inTableScope(name)) {
tb.error(this);
return false;
} else {
tb.clearStackToTableBodyContext();
tb.pop();
tb.transition(InTable);
}
} else if (name.equals("table")) {
return exitTableBody(t, tb);
} else if (inSorted(name, InTableBodyEndIgnore)) {
tb.error(this);
return false;
} else
return anythingElse(t, tb);
break;
default:
return anythingElse(t, tb);
}
return true;
}
private boolean exitTableBody(Token t, HtmlTreeBuilder tb) {
if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) {
// frag case
tb.error(this);
return false;
}
tb.clearStackToTableBodyContext();
tb.processEndTag(tb.currentElement().normalName()); // tbody, tfoot, thead
return tb.process(t);
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
return tb.process(t, InTable);
}
},
InRow {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isStartTag()) {
Token.StartTag startTag = t.asStartTag();
String name = startTag.normalName();
if (inSorted(name, InCellNames)) { // td, th
tb.clearStackToTableRowContext();
tb.insertElementFor(startTag);
tb.transition(InCell);
tb.insertMarkerToFormattingElements();
} else if (inSorted(name, InRowMissing)) { // "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"
if (!tb.inTableScope("tr")) {
tb.error(this);
return false;
}
tb.clearStackToTableRowContext();
tb.pop(); // tr
tb.transition(InTableBody);
return tb.process(t);
} else {
return anythingElse(t, tb);
}
} else if (t.isEndTag()) {
Token.EndTag endTag = t.asEndTag();
String name = endTag.normalName();
if (name.equals("tr")) {
if (!tb.inTableScope(name)) {
tb.error(this); // frag
return false;
}
tb.clearStackToTableRowContext();
tb.pop(); // tr
tb.transition(InTableBody);
} else if (name.equals("table")) {
if (!tb.inTableScope("tr")) {
tb.error(this);
return false;
}
tb.clearStackToTableRowContext();
tb.pop(); // tr
tb.transition(InTableBody);
return tb.process(t);
} else if (inSorted(name, InTableToBody)) { // "tbody", "tfoot", "thead"
if (!tb.inTableScope(name)) {
tb.error(this);
return false;
}
if (!tb.inTableScope("tr")) {
// not an error per spec?
return false;
}
tb.clearStackToTableRowContext();
tb.pop(); // tr
tb.transition(InTableBody);
return tb.process(t);
} else if (inSorted(name, InRowIgnore)) {
tb.error(this);
return false;
} else {
return anythingElse(t, tb);
}
} else {
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
return tb.process(t, InTable);
}
},
InCell {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isEndTag()) {
Token.EndTag endTag = t.asEndTag();
String name = endTag.normalName();
if (inSorted(name, Constants.InCellNames)) { // td, th
if (!tb.inTableScope(name)) {
tb.error(this);
tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag
return false;
}
tb.generateImpliedEndTags();
if (!tb.currentElementIs(name))
tb.error(this);
tb.popStackToClose(name);
tb.clearFormattingElementsToLastMarker();
tb.transition(InRow);
} else if (inSorted(name, Constants.InCellBody)) {
tb.error(this);
return false;
} else if (inSorted(name, Constants.InCellTable)) {
if (!tb.inTableScope(name)) {
tb.error(this);
return false;
}
closeCell(tb);
return tb.process(t);
} else {
return anythingElse(t, tb);
}
} else if (t.isStartTag() &&
inSorted(t.asStartTag().normalName(), Constants.InCellCol)) {
if (!(tb.inTableScope("td") || tb.inTableScope("th"))) {
tb.error(this);
return false;
}
closeCell(tb);
return tb.process(t);
} else {
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
return tb.process(t, InBody);
}
private void closeCell(HtmlTreeBuilder tb) {
if (tb.inTableScope("td"))
tb.processEndTag("td");
else
tb.processEndTag("th"); // only here if th or td in scope
}
},
InSelect {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
final String name;
switch (t.type) {
case Character:
Token.Character c = t.asCharacter();
if (c.getData().equals(nullString)) {
tb.error(this);
return false;
} else {
tb.insertCharacterNode(c);
}
break;
case Comment:
tb.insertCommentNode(t.asComment());
break;
case Doctype:
tb.error(this);
return false;
case StartTag:
Token.StartTag start = t.asStartTag();
name = start.normalName();
if (name.equals("html"))
return tb.process(start, InBody);
else if (name.equals("option")) {
if (tb.currentElementIs("option"))
tb.processEndTag("option");
tb.insertElementFor(start);
} else if (name.equals("optgroup")) {
if (tb.currentElementIs("option"))
tb.processEndTag("option"); // pop option and flow to pop optgroup
if (tb.currentElementIs("optgroup"))
tb.processEndTag("optgroup");
tb.insertElementFor(start);
} else if (name.equals("select")) {
tb.error(this);
return tb.processEndTag("select");
} else if (inSorted(name, InSelectEnd)) {
tb.error(this);
if (!tb.inSelectScope("select"))
return false; // frag
// spec says close select then reprocess; leads to recursion. iter directly:
do {
tb.popStackToClose("select");
tb.resetInsertionMode();
} while (tb.inSelectScope("select")); // collapse invalid nested selects
return tb.process(start);
} else if (name.equals("script") || name.equals("template")) {
return tb.process(t, InHead);
} else {
return anythingElse(t, tb);
}
break;
case EndTag:
Token.EndTag end = t.asEndTag();
name = end.normalName();
switch (name) {
case "optgroup":
if (tb.currentElementIs("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nameIs("optgroup"))
tb.processEndTag("option");
if (tb.currentElementIs("optgroup"))
tb.pop();
else
tb.error(this);
break;
case "option":
if (tb.currentElementIs("option"))
tb.pop();
else
tb.error(this);
break;
case "select":
if (!tb.inSelectScope(name)) {
tb.error(this);
return false;
} else {
tb.popStackToClose(name);
tb.resetInsertionMode();
}
break;
case "template":
return tb.process(t, InHead);
default:
return anythingElse(t, tb);
}
break;
case EOF:
if (!tb.currentElementIs("html"))
tb.error(this);
break;
default:
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
tb.error(this);
return false;
}
},
InSelectInTable {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isStartTag() && inSorted(t.asStartTag().normalName(), InSelectTableEnd)) {
tb.error(this);
tb.popStackToClose("select");
tb.resetInsertionMode();
return tb.process(t);
} else if (t.isEndTag() && inSorted(t.asEndTag().normalName(), InSelectTableEnd)) {
tb.error(this);
if (tb.inTableScope(t.asEndTag().normalName())) {
tb.popStackToClose("select");
tb.resetInsertionMode();
return (tb.process(t));
} else
return false;
} else {
return tb.process(t, InSelect);
}
}
},
InTemplate {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
final String name;
switch (t.type) {
case Character:
case Comment:
case Doctype:
tb.process(t, InBody);
break;
case StartTag:
name = t.asStartTag().normalName();
if (inSorted(name, InTemplateToHead))
tb.process(t, InHead);
else if (inSorted(name, InTemplateToTable)) {
tb.popTemplateMode();
tb.pushTemplateMode(InTable);
tb.transition(InTable);
return tb.process(t);
}
else if (name.equals("col")) {
tb.popTemplateMode();
tb.pushTemplateMode(InColumnGroup);
tb.transition(InColumnGroup);
return tb.process(t);
} else if (name.equals("tr")) {
tb.popTemplateMode();
tb.pushTemplateMode(InTableBody);
tb.transition(InTableBody);
return tb.process(t);
} else if (name.equals("td") || name.equals("th")) {
tb.popTemplateMode();
tb.pushTemplateMode(InRow);
tb.transition(InRow);
return tb.process(t);
} else {
tb.popTemplateMode();
tb.pushTemplateMode(InBody);
tb.transition(InBody);
return tb.process(t);
}
break;
case EndTag:
name = t.asEndTag().normalName();
if (name.equals("template"))
tb.process(t, InHead);
else {
tb.error(this);
return false;
}
break;
case EOF:
if (!tb.onStack("template")) {// stop parsing
return true;
}
tb.error(this);
tb.popStackToClose("template");
tb.clearFormattingElementsToLastMarker();
tb.popTemplateMode();
tb.resetInsertionMode();
// spec deviation - if we did not break out of Template, stop processing, and don't worry about cleaning up ultra-deep template stacks
// limited depth because this can recurse and will blow stack if too deep
if (tb.state() != InTemplate && tb.templateModeSize() < 12)
return tb.process(t);
else return true;
default:
Validate.wtf("Unexpected state: " + t.type); // XmlDecl only in XmlTreeBuilder
}
return true;
}
},
AfterBody {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
Element html = tb.getFromStack("html");
if (isWhitespace(t)) {
// spec deviation - currently body is still on stack, but we want this to go to the html node
if (html != null)
tb.insertCharacterToElement(t.asCharacter(), html);
else
tb.process(t, InBody); // will get into body
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment()); // into html node
} else if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
return tb.process(t, InBody);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("html")) {
if (tb.isFragmentParsing()) {
tb.error(this);
return false;
} else {
if (html != null) tb.trackNodePosition(html, false); // track source position of close; html is left on stack, in case of trailers
tb.transition(AfterAfterBody);
}
} else if (t.isEOF()) {
// chillax! we're done
} else {
tb.error(this);
tb.resetBody();
return tb.process(t);
}
return true;
}
},
InFrameset {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter());
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isStartTag()) {
Token.StartTag start = t.asStartTag();
switch (start.normalName()) {
case "html":
return tb.process(start, InBody);
case "frameset":
tb.insertElementFor(start);
break;
case "frame":
tb.insertEmptyElementFor(start);
break;
case "noframes":
return tb.process(start, InHead);
default:
tb.error(this);
return false;
}
} else if (t.isEndTag() && t.asEndTag().normalName().equals("frameset")) {
if (tb.currentElementIs("html")) { // frag
tb.error(this);
return false;
} else {
tb.pop();
if (!tb.isFragmentParsing() && !tb.currentElementIs("frameset")) {
tb.transition(AfterFrameset);
}
}
} else if (t.isEOF()) {
if (!tb.currentElementIs("html")) {
tb.error(this);
return true;
}
} else {
tb.error(this);
return false;
}
return true;
}
},
AfterFrameset {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insertCharacterNode(t.asCharacter());
} else if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype()) {
tb.error(this);
return false;
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
return tb.process(t, InBody);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("html")) {
tb.transition(AfterAfterFrameset);
} else if (t.isStartTag() && t.asStartTag().normalName().equals("noframes")) {
return tb.process(t, InHead);
} else if (t.isEOF()) {
// cool your heels, we're complete
} else {
tb.error(this);
return false;
}
return true;
}
},
AfterAfterBody {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
return tb.process(t, InBody);
} else if (isWhitespace(t)) {
// spec deviation - body and html still on stack, but want this space to go after </html>
Element doc = tb.getDocument();
tb.insertCharacterToElement(t.asCharacter(), doc);
}else if (t.isEOF()) {
// nice work chuck
} else {
tb.error(this);
tb.resetBody();
return tb.process(t);
}
return true;
}
},
AfterAfterFrameset {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isComment()) {
tb.insertCommentNode(t.asComment());
} else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
return tb.process(t, InBody);
} else if (t.isEOF()) {
// nice work chuck
} else if (t.isStartTag() && t.asStartTag().normalName().equals("noframes")) {
return tb.process(t, InHead);
} else {
tb.error(this);
return false;
}
return true;
}
},
ForeignContent {
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
@Override boolean process(Token t, HtmlTreeBuilder tb) {
switch (t.type) {
case Character:
Token.Character c = t.asCharacter();
if (c.getData().equals(nullString))
tb.error(this);
else if (HtmlTreeBuilderState.isWhitespace(c))
tb.insertCharacterNode(c);
else {
tb.insertCharacterNode(c);
tb.framesetOk(false);
}
break;
case Comment:
tb.insertCommentNode(t.asComment());
break;
case Doctype:
tb.error(this);
break;
case StartTag:
Token.StartTag start = t.asStartTag();
if (StringUtil.in(start.normalName, InForeignToHtml))
return processAsHtml(t, tb);
if (start.normalName.equals("font") && (
start.hasAttributeIgnoreCase("color")
|| start.hasAttributeIgnoreCase("face")
|| start.hasAttributeIgnoreCase("size")))
return processAsHtml(t, tb);
// Any other start:
// (whatwg says to fix up tag name and attribute case per a table - we will preserve original case instead)
String namespace = tb.currentElement().tag().namespace();
tb.insertForeignElementFor(start, namespace);
// (self-closing handled in insert)
// if self-closing svg script -- level and execution elided
// seemingly not in spec, but as browser behavior, get into ScriptData state for svg script; and allow custom data tags
TokeniserState textState = tb.tagFor(start.tagName.value(), start.normalName, namespace, tb.settings).textState();
if (textState != null) {
if (start.normalName.equals("script"))
tb.tokeniser.transition(TokeniserState.ScriptData);
else
tb.tokeniser.transition(textState);
}
break;
case EndTag:
Token.EndTag end = t.asEndTag();
if (end.normalName.equals("br") || end.normalName.equals("p"))
return processAsHtml(t, tb);
if (end.normalName.equals("script") && tb.currentElementIs("script", Parser.NamespaceSvg)) {
// script level and execution elided.
tb.pop();
return true;
}
// Any other end tag
ArrayList<Element> stack = tb.getStack();
if (stack.isEmpty())
Validate.wtf("Stack unexpectedly empty");
int i = stack.size() - 1;
Element el = stack.get(i);
if (!el.nameIs(end.normalName))
tb.error(this);
while (i != 0) {
if (el.nameIs(end.normalName)) {
tb.popStackToCloseAnyNamespace(el.normalName());
return true;
}
i--;
el = stack.get(i);
if (el.tag().namespace().equals(Parser.NamespaceHtml)) {
return processAsHtml(t, tb);
}
}
break;
case EOF:
// won't come through here, but for completion:
break;
default:
Validate.wtf("Unexpected state: " + t.type); // XmlDecl only in XmlTreeBuilder
}
return true;
}
boolean processAsHtml(Token t, HtmlTreeBuilder tb) {
return tb.state().process(t, tb);
}
};
private static void mergeAttributes(Token.StartTag source, Element dest) {
if (!source.hasAttributes()) return;
for (Attribute attr : source.attributes) { // only iterates public attributes
Attributes destAttrs = dest.attributes();
if (!destAttrs.hasKey(attr.getKey())) {
Range.AttributeRange range = attr.sourceRange(); // need to grab range before its parent changes
destAttrs.put(attr);
if (source.trackSource) { // copy the attribute range
destAttrs.sourceRange(attr.getKey(), range);
}
}
}
}
private static final String nullString = String.valueOf('\u0000');
abstract boolean process(Token t, HtmlTreeBuilder tb);
private static boolean isWhitespace(Token t) {
if (t.isCharacter()) {
String data = t.asCharacter().getData();
return StringUtil.isBlank(data);
}
return false;
}
private static void HandleTextState(Token.StartTag startTag, HtmlTreeBuilder tb, @Nullable TokeniserState state) {
if (state != null)
tb.tokeniser.transition(state);
tb.markInsertionMode();
tb.transition(Text);
tb.insertElementFor(startTag);
}
// lists of tags to search through
static final class Constants {
static final String[] InHeadEmpty = new String[]{"base", "basefont", "bgsound", "command", "link"};
static final String[] InHeadRaw = new String[]{"noframes", "style"};
static final String[] InHeadEnd = new String[]{"body", "br", "html"};
static final String[] AfterHeadBody = new String[]{"body", "br", "html"};
static final String[] BeforeHtmlToHead = new String[]{"body", "br", "head", "html", };
static final String[] InHeadNoScriptHead = new String[]{"basefont", "bgsound", "link", "meta", "noframes", "style"};
static final String[] InBodyStartToHead = new String[]{"base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "template", "title"};
static final String[] InBodyStartPClosers = new String[]{"address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl",
"fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol",
"p", "section", "summary", "ul"};
static final String[] Headings = new String[]{"h1", "h2", "h3", "h4", "h5", "h6"};
static final String[] InBodyStartLiBreakers = new String[]{"address", "div", "p"};
static final String[] DdDt = new String[]{"dd", "dt"};
static final String[] InBodyStartApplets = new String[]{"applet", "marquee", "object"};
static final String[] InBodyStartMedia = new String[]{"param", "source", "track"};
static final String[] InBodyStartInputAttribs = new String[]{"action", "name", "prompt"};
static final String[] InBodyStartDrop = new String[]{"caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
"dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
"nav", "ol", "pre", "section", "summary", "ul"};
static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"};
static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"};
static final String[] InTableAddBody = new String[]{"td", "th", "tr"};
static final String[] InTableToHead = new String[]{"script", "style", "template"};
static final String[] InCellNames = new String[]{"td", "th"};
static final String[] InCellBody = new String[]{"body", "caption", "col", "colgroup", "html"};
static final String[] InCellTable = new String[]{ "table", "tbody", "tfoot", "thead", "tr"};
static final String[] InCellCol = new String[]{"caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InTableEndErr = new String[]{"body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InTableFoster = new String[]{"table", "tbody", "tfoot", "thead", "tr"};
static final String[] InTableBodyExit = new String[]{"caption", "col", "colgroup", "tbody", "tfoot", "thead"};
static final String[] InTableBodyEndIgnore = new String[]{"body", "caption", "col", "colgroup", "html", "td", "th", "tr"};
static final String[] InRowMissing = new String[]{"caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"};
static final String[] InRowIgnore = new String[]{"body", "caption", "col", "colgroup", "html", "td", "th"};
static final String[] InSelectEnd = new String[]{"input", "keygen", "textarea"};
static final String[] InSelectTableEnd = new String[]{"caption", "table", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InTableEndIgnore = new String[]{"tbody", "tfoot", "thead"};
static final String[] InHeadNoscriptIgnore = new String[]{"head", "noscript"};
static final String[] InCaptionIgnore = new String[]{"body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InTemplateToHead = new String[] {"base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title"};
static final String[] InTemplateToTable = new String[] {"caption", "colgroup", "tbody", "tfoot", "thead"};
static final String[] InForeignToHtml = new String[] {"b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strike", "strong", "sub", "sup", "table", "tt", "u", "ul", "var"};
}
}