QueryParser.java

package org.jsoup.select;

import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;

import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;

/**
 * Parses a CSS selector into an Evaluator tree.
 */
public class QueryParser implements AutoCloseable {
    private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
    private final static char[] SequenceEnders = {',', ')'};

    private final TokenQueue tq;
    private final String query;
    private boolean inNodeContext; // ::comment:contains should act on node value, vs element text

    /**
     * Create a new QueryParser.
     * @param query CSS query
     */
    private QueryParser(String query) {
        Validate.notEmpty(query);
        query = query.trim();
        this.query = query;
        this.tq = new TokenQueue(query);
    }

    /**
     Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
     parse it once and reuse the Evaluator.

     @param query CSS query
     @return Evaluator
     @see Selector selector query syntax
     @throws Selector.SelectorParseException if the CSS query is invalid
     */
    public static Evaluator parse(String query) {
        try (QueryParser p = new QueryParser(query)) {
            return p.parse();
        } catch (IllegalArgumentException e) {
            throw new Selector.SelectorParseException(e.getMessage());
        }
    }

    /**
     Parse the query. We use this simplified expression of the grammar:
     <pre>
     SelectorGroup   ::= Selector (',' Selector)*
     Selector        ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
     SimpleSequence  ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
     Pseudo           ::= ':' Name [ '(' SelectorGroup ')' ]
     Combinator      ::= S+         // descendant (whitespace)
     | '>'       // child
     | '+'       // adjacent sibling
     | '~'       // general sibling
     </pre>

     See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing
     */
    Evaluator parse() {
        Evaluator eval = parseSelectorGroup();
        tq.consumeWhitespace();
        if (!tq.isEmpty())
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return eval;
    }

    Evaluator parseSelectorGroup() {
        // SelectorGroup. Into an Or if > 1 Selector
        Evaluator left = parseSelector();
        while (tq.matchChomp(',')) {
            Evaluator right = parseSelector();
            left = or(left, right);
        }
        return left;
    }

    Evaluator parseSelector() {
        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
        tq.consumeWhitespace();

        Evaluator left;
        if (tq.matchesAny(Combinators)) {
            // e.g. query is "> div"; left side is root element
            left = new StructuralEvaluator.Root();
        } else {
            left = parseSimpleSequence();
        }

        while (true) {
            char combinator = 0;
            if (tq.consumeWhitespace())
                combinator = ' ';            // maybe descendant?
            if (tq.matchesAny(Combinators)) // no, explicit
                combinator = tq.consume();
            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
                break;

            if (combinator != 0) {
                Evaluator right = parseSimpleSequence();
                left = combinator(left, combinator, right);
            } else {
                break;
            }
        }
        return left;
    }

    Evaluator parseSimpleSequence() {
        // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )*
        Evaluator left = null;
        tq.consumeWhitespace();

        // one optional type selector
        if (tq.matchesWord() || tq.matches("*|"))
            left = byTag();
        else if (tq.matchChomp('*'))
            left = new Evaluator.AllElements();

        // zero or more subclasses (#, ., [)
        while(true) {
            Evaluator right = parseSubclass();
            if (right != null) {
                left = and(left, right);
            }
            else break; // no more simple tokens
        }

        if (left == null)
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return left;
    }

    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
        switch (combinator) {
            case '>':
                ImmediateParentRun run = left instanceof ImmediateParentRun ?
                    (ImmediateParentRun) left : new ImmediateParentRun(left);
                run.add(right);
                return run;
            case ' ':
                return and(new StructuralEvaluator.Ancestor(left), right);
            case '+':
                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
            case '~':
                return and(new StructuralEvaluator.PreviousSibling(left), right);
            default:
                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
        }
    }

    @Nullable Evaluator parseSubclass() {
        //  Subclass: ID | Class | Attribute | Pseudo
        if      (tq.matchChomp('#'))    return byId();
        else if (tq.matchChomp('.'))    return byClass();
        else if (tq.matches('['))       return byAttribute();
        else if (tq.matchChomp("::"))   return parseNodeSelector(); // ::comment etc
        else if (tq.matchChomp(':'))    return parsePseudoSelector();
        else                            return null;
    }

    /** Merge two evals into an Or. */
    static Evaluator or(Evaluator left, Evaluator right) {
        if (left instanceof CombiningEvaluator.Or) {
            ((CombiningEvaluator.Or) left).add(right);
            return left;
        }
        return new CombiningEvaluator.Or(left, right);
    }

    /** Merge two evals into an And. */
    static Evaluator and(@Nullable Evaluator left, Evaluator right) {
        if (left == null) return right;
        if (left instanceof CombiningEvaluator.And) {
            ((CombiningEvaluator.And) left).add(right);
            return left;
        }
        return new CombiningEvaluator.And(left, right);
    }

    private Evaluator parsePseudoSelector() {
        final String pseudo = tq.consumeCssIdentifier();
        switch (pseudo) {
            case "lt":
                return new Evaluator.IndexLessThan(consumeIndex());
            case "gt":
                return new Evaluator.IndexGreaterThan(consumeIndex());
            case "eq":
                return new Evaluator.IndexEquals(consumeIndex());
            case "has":
                return has();
            case "is":
                return is();
            case "contains":
                return contains(false);
            case "containsOwn":
                return contains(true);
            case "containsWholeText":
                return containsWholeText(false);
            case "containsWholeOwnText":
                return containsWholeText(true);
            case "containsData":
                return containsData();
            case "matches":
                return matches(false);
            case "matchesOwn":
                return matches(true);
            case "matchesWholeText":
                return matchesWholeText(false);
            case "matchesWholeOwnText":
                return matchesWholeText(true);
            case "not":
                return not();
            case "nth-child":
                return cssNthChild(false, false);
            case "nth-last-child":
                return cssNthChild(true, false);
            case "nth-of-type":
                return cssNthChild(false, true);
            case "nth-last-of-type":
                return cssNthChild(true, true);
            case "first-child":
                return new Evaluator.IsFirstChild();
            case "last-child":
                return new Evaluator.IsLastChild();
            case "first-of-type":
                return new Evaluator.IsFirstOfType();
            case "last-of-type":
                return new Evaluator.IsLastOfType();
            case "only-child":
                return new Evaluator.IsOnlyChild();
            case "only-of-type":
                return new Evaluator.IsOnlyOfType();
            case "empty":
                return new Evaluator.IsEmpty();
            case "blank":
                return new NodeEvaluator.BlankValue();
            case "root":
                return new Evaluator.IsRoot();
            case "matchText":
                return new Evaluator.MatchText();
            default:
                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        }
    }

    // ::comment etc
    private Evaluator parseNodeSelector() {
        final String pseudo = tq.consumeCssIdentifier();
        inNodeContext = true;  // Enter node context

        Evaluator left;
        switch (pseudo) {
            case "node":
                left = new NodeEvaluator.InstanceType(Node.class, pseudo);
                break;
            case "leafnode":
                left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo);
                break;
            case "text":
                left = new NodeEvaluator.InstanceType(TextNode.class, pseudo);
                break;
            case "comment":
                left = new NodeEvaluator.InstanceType(Comment.class, pseudo);
                break;
            case "data":
                left = new NodeEvaluator.InstanceType(DataNode.class, pseudo);
                break;
            case "cdata":
                left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo);
                break;
            default:
                throw new Selector.SelectorParseException(
                    "Could not parse query '%s': unknown node type '::%s'", query, pseudo);
        }

        // Handle following subclasses in node context (like ::comment:contains())
        Evaluator right;
        while ((right = parseSubclass()) != null) {
            left = and(left, right);
        }

        inNodeContext = false;
        return left;
    }

    private Evaluator byId() {
        String id = tq.consumeCssIdentifier();
        Validate.notEmpty(id);
        return new Evaluator.Id(id);
    }

    private Evaluator byClass() {
        String className = tq.consumeCssIdentifier();
        Validate.notEmpty(className);
        return new Evaluator.Class(className.trim());
    }

    private Evaluator byTag() {
        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
        // consistency - both the selector and the element tag
        String tagName = normalize(tq.consumeElementSelector());
        Validate.notEmpty(tagName);

        // namespaces:
        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
            String plainTag = tagName.substring(2); // strip *|
            return new CombiningEvaluator.Or(
                new Evaluator.Tag(plainTag),
                new Evaluator.TagEndsWith(":" + plainTag)
            );
        } else if (tagName.endsWith("|*")) { // ns|*
            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
            return new Evaluator.TagStartsWith(ns);
        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
            tagName = tagName.replace("|", ":");
        }

        return new Evaluator.Tag(tagName);
    }

    private Evaluator byAttribute() {
        try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) {
            return evaluatorForAttribute(cq);
        }
    }

    private Evaluator evaluatorForAttribute(TokenQueue cq) {
        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
        Validate.notEmpty(key);
        cq.consumeWhitespace();
        final Evaluator eval;

        if (cq.isEmpty()) {
            if (key.startsWith("^"))
                eval = new Evaluator.AttributeStarting(key.substring(1));
            else if (key.equals("*")) // any attribute
                eval = new Evaluator.AttributeStarting("");
            else
                eval = new Evaluator.Attribute(key);
        } else {
            if (cq.matchChomp('='))
                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
            else if (cq.matchChomp("!="))
                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
            else if (cq.matchChomp("^="))
                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
            else if (cq.matchChomp("$="))
                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
            else if (cq.matchChomp("*="))
                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
            else if (cq.matchChomp("~="))
                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
            else
                throw new Selector.SelectorParseException(
                    "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
        }
        return eval;
    }

    //pseudo selectors :first-child, :last-child, :nth-child, ...
    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");

    private Evaluator cssNthChild(boolean last, boolean ofType) {
        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
        final int step, offset;
        if ("odd".equals(arg)) {
            step = 2;
            offset = 1;
        } else if ("even".equals(arg)) {
            step = 2;
            offset = 0;
        } else {
            Matcher stepOffsetM, stepM;
            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
                    step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it���s -1;
                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
                offset =
                    stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0;
            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
                step = 0;
                offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
            } else {
                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
            }
        }

        return ofType
            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
    }

    private String consumeParens() {
        return tq.chompBalanced('(', ')');
    }

    private int consumeIndex() {
        String index = consumeParens().trim();
        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
        return Integer.parseInt(index);
    }

    // pseudo selector :has(el)
    private Evaluator has() {
        return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector");
    }

    // pseudo selector :is()
    private Evaluator is() {
        return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector");
    }

    private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
        Validate.isTrue(tq.matchChomp('('), err);
        Evaluator eval = parseSelectorGroup();
        Validate.isTrue(tq.matchChomp(')'), err);
        return func.apply(eval);
    }

    // pseudo selector :contains(text), containsOwn(text)
    private Evaluator contains(boolean own) {
        String query = own ? ":containsOwn" : ":contains";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, query + "(text) query must not be empty");

        if (inNodeContext)
            return new NodeEvaluator.ContainsValue(searchText);

        return own
            ? new Evaluator.ContainsOwnText(searchText)
            : new Evaluator.ContainsText(searchText);
    }

    private Evaluator containsWholeText(boolean own) {
        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, query + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsWholeOwnText(searchText)
            : new Evaluator.ContainsWholeText(searchText);
    }

    // pseudo selector :containsData(data)
    private Evaluator containsData() {
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
        return new Evaluator.ContainsData(searchText);
    }

    // :matches(regex), matchesOwn(regex)
    private Evaluator matches(boolean own) {
        String query = own ? ":matchesOwn" : ":matches";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, query + "(regex) query must not be empty");
        Pattern pattern = Pattern.compile(regex);

        if (inNodeContext)
            return new NodeEvaluator.MatchesValue(pattern);

        return own
            ? new Evaluator.MatchesOwn(pattern)
            : new Evaluator.Matches(pattern);
    }

    // :matches(regex), matchesOwn(regex)
    private Evaluator matchesWholeText(boolean own) {
        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, query + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
    }

    // :not(selector)
    private Evaluator not() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");

        return new StructuralEvaluator.Not(parse(subQuery));
    }

    @Override
    public String toString() {
        return query;
    }

    @Override
    public void close() {
        tq.close();
    }
}