RegexAsStringFunctionOptimizer.java

/*******************************************************************************
 * Copyright (c) 2022 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

package org.eclipse.rdf4j.query.algebra.evaluation.optimizer;

import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.vocabulary.FN;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.Dataset;
import org.eclipse.rdf4j.query.algebra.Compare;
import org.eclipse.rdf4j.query.algebra.FunctionCall;
import org.eclipse.rdf4j.query.algebra.Regex;
import org.eclipse.rdf4j.query.algebra.TupleExpr;
import org.eclipse.rdf4j.query.algebra.ValueConstant;
import org.eclipse.rdf4j.query.algebra.ValueExpr;
import org.eclipse.rdf4j.query.algebra.evaluation.QueryOptimizer;
import org.eclipse.rdf4j.query.algebra.helpers.AbstractSimpleQueryModelVisitor;

/**
 * A query optimizer that replaces REGEX with {@link FunctionCall}s that are equivalent operators
 *
 * @author Jerven Bolleman
 */
public class RegexAsStringFunctionOptimizer implements QueryOptimizer {

	private final ValueFactory vf;

	public RegexAsStringFunctionOptimizer(ValueFactory vf) {
		this.vf = vf;
	}

	/**
	 * Applies generally applicable optimizations to the supplied query: variable assignments are inlined.
	 */
	@Override
	public void optimize(TupleExpr tupleExpr, Dataset dataset, BindingSet bindings) {
		tupleExpr.visit(new RegexAsStringFunctionVisitor(vf));
	}

	private static class RegexAsStringFunctionVisitor extends AbstractSimpleQueryModelVisitor<RuntimeException> {
		private final ValueFactory vf;

		protected RegexAsStringFunctionVisitor(ValueFactory vf) {
			super(false);
			this.vf = vf;
		}

		@Override
		public void meet(Regex node) {
			final ValueExpr flagsArg = node.getFlagsArg();
			if (flagsArg == null || flagsArg.toString().isEmpty()) {
				// if we have no flags then we can not be in case insensitive mode
				if (node.getPatternArg() instanceof ValueConstant) {
					ValueConstant vc = (ValueConstant) node.getPatternArg();
					String regex = vc.getValue().stringValue();
					final boolean anchoredAtStart = regex.startsWith("^");
					final boolean anchoredAtEnd = regex.endsWith("$");

					if (anchoredAtStart && anchoredAtEnd) {
						equalsCandidate(node, regex);
					} else if (anchoredAtStart) {
						strstartsCandidate(node, regex);
					} else if (anchoredAtEnd) {
						strendsCandidate(node, regex);
					} else {
						containsCandidate(node, regex);
					}
				}
			}
			super.meet(node);
		}

		private void containsCandidate(Regex node, String potential) {
			if (plain(potential)) {
				node.replaceWith(new FunctionCall(FN.CONTAINS.stringValue(), node.getArg().clone(),
						node.getPatternArg().clone()));
			}
		}

		// If we have one of these chars it is likely to be an real regex.
		// If we are willing to peek in a Pattern object we could be sure
		// this seems a valid start with no regexes marked as simple strings
		private boolean plain(String potential) {
			for (int not : new char[] { '?', '*', '+', '{', '|', '\\', '.', '[', ']', '&', '(', ')' }) {
				if (potential.indexOf(not) != -1) {
					return false;
				}
			}
			return true;
		}

		private void strendsCandidate(Regex node, String regex) {
			final String potential = regex.substring(0, regex.length() - 1);
			if (plain(potential)) {
				ValueConstant vc = new ValueConstant(vf.createLiteral(potential));
				node.replaceWith(new FunctionCall(FN.ENDS_WITH.stringValue(), node.getArg().clone(), vc));
			}
		}

		private void strstartsCandidate(Regex node, String regex) {
			final String potential = regex.substring(1);
			if (plain(potential)) {
				ValueConstant vc = new ValueConstant(vf.createLiteral(potential));
				node.replaceWith(new FunctionCall(FN.STARTS_WITH.stringValue(), node.getArg().clone(), vc));
			}
		}

		private void equalsCandidate(Regex node, String regex) {
			final String potential = regex.substring(1, regex.length() - 1);
			if (plain(potential)) {
				ValueConstant vc = new ValueConstant(vf.createLiteral(potential));
				node.replaceWith(new Compare(node.getArg().clone(), vc, Compare.CompareOp.EQ));
			}
		}
	}
}