// XmlTreeBuilder.java

package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.select.Elements;
import org.jspecify.annotations.Nullable;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.jsoup.parser.Parser.NamespaceXml;

/**
 * A tree builder that parses its input as XML, without any of the HTML DOM rules being applied to the document.
 * <p>Start tags become elements whose namespace is resolved from the {@code xmlns} / {@code xmlns:prefix}
 * declarations in scope; end tags close the matching open element; comments, character data, doctypes and XML
 * declarations are appended to the current element as leaf nodes.</p>
 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
 *
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {
    static final String XmlnsKey = "xmlns";
    static final String XmlnsPrefix = "xmlns:";

    // how far up the stack an end tag is searched for; an arbitrary tension point between real XML and crafted pain
    private static final int maxQueueDepth = 256;

    // one (prefix => urn) map per nesting level; the head of the deque is the scope currently in effect
    private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>();

    @Override
    ParseSettings defaultSettings() {
        return ParseSettings.preserveCase;
    }

    /**
     * Prepares a fresh parse: switches the document's output settings to XML syntax with xhtml escaping and no
     * pretty-printing, and resets the namespace stack to a single root scope.
     */
    @Override
    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
        super.initialiseParse(input, baseUri, parser);

        // as XML, we don't understand what whitespace is significant or not, so pretty-printing is off
        doc.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml)
            .prettyPrint(false);

        // root scope: both the "xml" prefix and the default (empty) prefix map to the XML namespace
        HashMap<String, String> rootScope = new HashMap<>();
        rootScope.put("xml", NamespaceXml);
        rootScope.put("", NamespaceXml);
        namespacesStack.clear();
        namespacesStack.push(rootScope);
    }

    /**
     * Prepares a fragment parse. When a context element is supplied, the tokeniser is moved into the context tag's
     * text state (if it has one), and a namespace scope is pushed for every element from the outermost parent down
     * to the context itself, so that declarations made above the fragment are visible inside it.
     *
     * @param context the element the fragment is parsed within; may be {@code null}
     */
    @Override
    void initialiseParseFragment(@Nullable Element context) {
        super.initialiseParseFragment(context);
        if (context == null) return;

        TokeniserState contextState = context.tag().textState();
        if (contextState != null) tokeniser.transition(contextState);

        // the chain runs innermost-first (context, then its parents), so walk it backwards to go top down
        Elements chain = context.parents();
        chain.add(0, context);
        int i = chain.size();
        while (i-- > 0) {
            pushScopeFor(chain.get(i));
        }
    }

    Document parse(Reader input, String baseUri) {
        Parser parser = new Parser(this);
        return parse(input, baseUri, parser);
    }

    Document parse(String input, String baseUri) {
        Reader reader = new StringReader(input);
        return parse(reader, baseUri, new Parser(this));
    }

    @Override
    List<Node> completeParseFragment() {
        return doc.childNodes();
    }

    @Override
    XmlTreeBuilder newInstance() {
        return new XmlTreeBuilder();
    }

    @Override
    public String defaultNamespace() {
        return NamespaceXml;
    }

    @Override
    TagSet defaultTagSet() {
        return new TagSet(); // an empty tagset
    }

    /**
     * Records the token as the current token and dispatches it to the matching insert / close handler.
     *
     * @param token the token to process
     * @return always {@code true}
     */
    @Override
    protected boolean process(Token token) {
        currentToken = token;

        switch (token.type) {
            case StartTag:
                insertElementFor(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Comment:
                insertCommentFor(token.asComment());
                break;
            case Character:
                insertCharacterFor(token.asCharacter());
                break;
            case Doctype:
                insertDoctypeFor(token.asDoctype());
                break;
            case XmlDecl:
                insertXmlDeclarationFor(token.asXmlDecl());
                break;
            case EOF:
                // nothing to do at end of input; could put some normalisation here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    /**
     * Creates an element for the start tag, appends it to the current element and pushes it onto the stack. A
     * self-closing tag, or a tag that reports itself as empty, is popped again straight away; otherwise the
     * tokeniser is moved into the tag's text state when it has one.
     *
     * @param startTag the start tag token
     */
    void insertElementFor(Token.StartTag startTag) {
        // every element gets its own scope, which its xmlns declarations (if any) then update
        final HashMap<String, String> scope = pushChildScope();

        final Attributes attributes = startTag.attributes;
        if (attributes != null) {
            settings.normalizeAttributes(attributes);
            attributes.deduplicate(settings);
            processNamespaces(attributes, scope);
            applyNamespacesToAttributes(attributes, scope);
        }

        final String tagName = startTag.tagName.value();
        final String namespace = resolveNamespace(tagName, scope);
        final Tag tag = tagFor(tagName, startTag.normalName, namespace, settings);
        final Element element = new Element(tag, null, attributes);
        currentElement().appendChild(element);
        push(element);

        final boolean selfClosing = startTag.isSelfClosing();
        if (selfClosing) tag.setSeenSelfClose();
        if (selfClosing || tag.isEmpty()) {
            // self-closed, or a custom defined void tag: push & pop ensures onNodeInserted & onNodeClosed
            pop();
            return;
        }

        TokeniserState textState = tag.textState();
        if (textState != null) tokeniser.transition(textState);
    }

    /** Copies the scope currently in effect, pushes the copy as the new current scope, and returns it. */
    private HashMap<String, String> pushChildScope() {
        HashMap<String, String> scope = new HashMap<>(namespacesStack.peek());
        namespacesStack.push(scope);
        return scope;
    }

    /** Pushes a new scope for an existing element and applies the element's own namespace declarations to it. */
    private void pushScopeFor(Element el) {
        HashMap<String, String> scope = pushChildScope();
        if (el.attributesSize() > 0) {
            processNamespaces(el.attributes(), scope);
        }
    }

    /**
     * Reads namespace declarations out of the attributes: {@code xmlns} sets the default (empty prefix) namespace
     * and {@code xmlns:prefix} binds {@code prefix}; each is written into the supplied map.
     */
    private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) {
        for (Attribute attr : attributes) {
            final String key = attr.getKey();
            if (key.equals(XmlnsKey)) {
                namespaces.put("", attr.getValue()); // new default for this level
                continue;
            }
            if (key.startsWith(XmlnsPrefix)) {
                namespaces.put(key.substring(XmlnsPrefix.length()), attr.getValue());
            }
        }
    }

    /**
     * Second pass over the attributes: for each prefixed attribute (other than the {@code xmlns} declarations
     * themselves) whose prefix is bound in the supplied map, stores the namespace as user data on the attributes.
     */
    private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) {
        // collect first, then add: userData is itself an attribute, so it can't be set while iterating
        Map<String, String> resolved = new HashMap<>();
        for (Attribute attr : attributes) {
            final String prefix = attr.prefix();
            if (prefix.isEmpty() || prefix.equals(XmlnsKey)) continue;

            final String namespace = namespaces.get(prefix);
            if (namespace != null) {
                resolved.put(SharedConstants.XmlnsAttr + prefix, namespace);
            }
        }

        for (Map.Entry<String, String> item : resolved.entrySet()) {
            attributes.userData(item.getKey(), item.getValue());
        }
    }

    /**
     * Finds the namespace for a tag name: the namespace bound to its prefix when the name has one and the prefix is
     * present in the map, otherwise the default (empty prefix) namespace.
     */
    private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) {
        final int colon = tagName.indexOf(':');
        if (colon > 0) {
            final String prefix = tagName.substring(0, colon);
            if (namespaces.containsKey(prefix)) {
                return namespaces.get(prefix);
            }
        }
        return namespaces.get("");
    }

    /** Appends the leaf node to the current element and then signals that a node was inserted. */
    void insertLeafNode(LeafNode node) {
        currentElement().appendChild(node);
        onNodeInserted(node);
    }

    void insertCommentFor(Token.Comment commentToken) {
        insertLeafNode(new Comment(commentToken.getData()));
    }

    /**
     * Inserts the token's character data as a {@link CDataNode} when the token is CDATA, as a {@link DataNode} when
     * the current element's tag is a data tag, and as a {@link TextNode} otherwise.
     */
    void insertCharacterFor(Token.Character token) {
        final String data = token.getData();
        final LeafNode node;
        if (token.isCData()) {
            node = new CDataNode(data);
        } else if (currentElement().tag().is(Tag.Data)) {
            node = new DataNode(data);
        } else {
            node = new TextNode(data);
        }
        insertLeafNode(node);
    }

    void insertDoctypeFor(Token.Doctype token) {
        final String name = settings.normalizeTag(token.getName());
        DocumentType doctype = new DocumentType(name, token.getPublicIdentifier(), token.getSystemIdentifier());
        doctype.setPubSysKey(token.getPubSysKey());
        insertLeafNode(doctype);
    }

    void insertXmlDeclarationFor(Token.XmlDecl token) {
        XmlDeclaration declaration = new XmlDeclaration(token.name(), token.isDeclaration);
        if (token.attributes != null) {
            declaration.attributes().addAll(token.attributes);
        }
        insertLeafNode(declaration);
    }

    /** Pops the current element, discarding the namespace scope pushed for it first. */
    @Override
    Element pop() {
        namespacesStack.pop();
        return super.pop();
    }

    /**
     * Closes the element named by the end tag. The stack is searched from the most recently opened element upwards
     * (over a bounded depth); if an element with the tag's name is found, elements are popped up to and including
     * it. If none is found, the end tag is ignored.
     *
     * @param endTag tag to close
     */
    protected void popStackToClose(Token.EndTag endTag) {
        final String name = settings.normalizeTag(endTag.name());

        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
        final int bottom = stack.size() - 1;
        final int upper = Math.max(0, bottom - maxQueueDepth);

        Element match = null;
        for (int pos = bottom; pos >= upper && match == null; pos--) {
            Element candidate = stack.get(pos);
            if (candidate.nodeName().equals(name)) {
                match = candidate;
            }
        }
        if (match == null) return; // not found, skip

        // pop until the matched element itself has come off; never more pops than there are stack entries
        for (int remaining = stack.size(); remaining > 0; remaining--) {
            if (pop() == match) break;
        }
    }
}