QueryPrologLexer.java

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
package org.eclipse.rdf4j.query.parser;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple lexer that tokenizes a syntactically legal input SPARQL query string on prolog items (prefixes, base
 * declarations, IRIs, comments, and syntactical tokens such as keywords, opening and closing brackets, and hashes).
 *
 * @author Jeen Broekstra
 */
public class QueryPrologLexer {

	/**
	 * The kinds of token the lexer emits.
	 */
	public enum TokenType {
		PREFIX_KEYWORD,
		PREFIX,
		BASE_KEYWORD,
		LBRACKET,
		RBRACKET,
		IRI,
		HASH,
		COMMENT,
		REST_OF_QUERY
	}

	// Tokens with a fixed string value are shared instead of being re-created for every occurrence.
	private static final Token HASH_TOKEN = new Token(TokenType.HASH, "#");

	private static final Token PREFIX_KEYWORD_TOKEN = new Token(TokenType.PREFIX_KEYWORD, "PREFIX");

	private static final Token BASE_KEYWORD_TOKEN = new Token(TokenType.BASE_KEYWORD, "BASE");

	private static final Token LBRACKET_TOKEN = new Token(TokenType.LBRACKET, "<");

	private static final Token RBRACKET_TOKEN = new Token(TokenType.RBRACKET, ">");

	/** Matches an opening '<' and captures everything up to (not including) the next '>' in group 1. */
	private static final Pattern IRI_PATTERN = Pattern.compile("^<([^>]*)>*");

	/**
	 * Matches the (case-insensitive) keyword "prefix" and captures everything between the keyword and the first ':'
	 * in group 1, including any whitespace that surrounds the prefix name.
	 */
	private static final Pattern PREFIX_PATTERN = Pattern.compile("^prefix([^:]+):", Pattern.CASE_INSENSITIVE);

	/** Matches a run of directly consecutive '#' comment lines, including their line terminators. */
	private static final Pattern COMMENT_PATTERN = Pattern.compile("^(#.*((\r)?\n|(\r)?\n*))*");

	/**
	 * A single lexer token: a {@link TokenType} paired with the string value it carries.
	 */
	public static class Token {

		/** The type of this token; also available through {@link #getType()}. */
		public final TokenType t;

		/** The string value of this token; also available through {@link #getStringValue()}. */
		public final String s;

		public Token(TokenType t, String s) {
			this.t = t;
			this.s = s;
		}

		public TokenType getType() {
			return t;
		}

		/**
		 * Get the corresponding string value for this token. For example in the case of an {@link TokenType#IRI} token,
		 * this will return the string representation of that IRI.
		 */
		public String getStringValue() {
			return s;
		}

		@Override
		public String toString() {
			return "[" + t.toString() + "] '" + s + "'";
		}
	}

	/**
	 * Tokenizes a syntactically legal input SPARQL query on prolog elements. The last token in the returned list is of
	 * type {@link TokenType#REST_OF_QUERY} and contains the SPARQL query string minus the prolog.
	 *
	 * @param input a syntactically legal SPARQL query string
	 * @return a list with tokens for each prolog element. If the input string is syntactically legal SPARQL, the final
	 *         returned token is guaranteed to be of type {@link TokenType#REST_OF_QUERY} and to contain the SPARQL
	 *         query string minus the prolog. If the input string is not syntactically legal SPARQL, the method will
	 *         still return normally but no guarantees about the returned list are made.
	 * @throws NullPointerException if {@code input} is {@code null}
	 */
	public static List<Token> lex(String input) {
		final List<Token> result = new ArrayList<>();
		scan(input, result);
		return result;
	}

	/**
	 * Tokenizes the input string on prolog elements and returns the final Token. If the input string is a syntactically
	 * legal SPARQL query, this Token will be of type {@link TokenType#REST_OF_QUERY} and contain the query string minus
	 * prolog.
	 *
	 * @param input a syntactically legal SPARQL string
	 * @return if the input is syntactically legal SPARQL, a Token containing the query string without prolog. If the
	 *         input is not syntactically legal, the method will still exit normally, but no guarantees are made about
	 *         the returned object (it may be {@code null} when no non-prolog content is found).
	 * @throws NullPointerException if {@code input} is {@code null}
	 */
	public static Token getRestOfQueryToken(String input) {
		return scan(input, null);
	}

	/**
	 * Walks over the prolog of the input once. This is the single implementation behind both {@link #lex(String)} and
	 * {@link #getRestOfQueryToken(String)}, so both always agree on where the prolog ends and both tolerate malformed
	 * input in the same way.
	 * <p>
	 * The scanner dispatches on the first character of each element only; it does not verify that a 'p' or 'b'
	 * actually starts a PREFIX or BASE keyword. When a prefix declaration or IRI cannot be read, an empty value is
	 * assumed and only the fixed-size part of the element is skipped.
	 *
	 * @param input the query string to scan
	 * @param sink  receives every token in input order, or {@code null} if only the rest-of-query token is wanted
	 * @return the {@link TokenType#REST_OF_QUERY} token, or {@code null} if the input ends before one is found
	 */
	private static Token scan(String input, List<Token> sink) {
		Token restOfQuery = null;
		final int length = input.length();
		int i = 0;
		while (i < length) {
			final char c = input.charAt(i);
			switch (c) {
			case '#': {
				final String comment = orEmpty(readComment(input, i));
				if (sink != null) {
					sink.add(HASH_TOKEN);
					sink.add(new Token(TokenType.COMMENT, comment));
				}
				i += comment.length() + 1; // 1 for hash
				break;
			}
			case 'p':
			case 'P': {
				// readPrefix yields null when this is not a well-formed prefix declaration
				final String prefix = orEmpty(readPrefix(input, i));
				if (sink != null) {
					sink.add(PREFIX_KEYWORD_TOKEN);
					sink.add(new Token(TokenType.PREFIX, prefix.trim()));
				}
				i += prefix.length() + 7; // 6 for prefix keyword, 1 for ':'
				break;
			}
			case 'b':
			case 'B':
				if (sink != null) {
					sink.add(BASE_KEYWORD_TOKEN);
				}
				i += 4; // 4 for base keyword
				break;
			case '<': {
				final String iri = orEmpty(readIRI(input, i));
				if (sink != null) {
					sink.add(LBRACKET_TOKEN);
					sink.add(new Token(TokenType.IRI, iri));
					sink.add(RBRACKET_TOKEN);
				}
				i += iri.length() + 2; // 2 for opening and closing brackets
				break;
			}
			default:
				if (Character.isWhitespace(c)) {
					i++;
				} else {
					// first character that cannot start a prolog element: everything from here on is the query
					restOfQuery = new Token(TokenType.REST_OF_QUERY, input.substring(i));
					if (sink != null) {
						sink.add(restOfQuery);
					}
					i = length;
				}
				break;
			}
		}
		return restOfQuery;
	}

	/**
	 * Reads the comment that starts at the given index. Directly consecutive comment lines (a line break immediately
	 * followed by another '#') are read as one comment.
	 *
	 * @param input the query string
	 * @param index the position of the '#' that starts the comment
	 * @return the comment text including line break characters but without the leading "#", or {@code null} if
	 *         nothing could be read
	 */
	private static String readComment(String input, int index) {
		final String matched = matchAt(COMMENT_PATTERN, input, index, 0);
		if (matched == null || matched.isEmpty()) {
			return null;
		}
		// the match includes the leading # => just remove it
		return matched.substring(1);
	}

	/**
	 * Reads the prefix declaration that starts at the given index.
	 *
	 * @return the untrimmed text between the PREFIX keyword and the following ':', or {@code null} if the input at
	 *         the given index is not a prefix declaration
	 */
	private static String readPrefix(String input, int index) {
		return matchAt(PREFIX_PATTERN, input, index, 1);
	}

	/**
	 * Reads the IRI that starts at the given index.
	 *
	 * @return the text between the opening '<' and the next '>' (or the end of the input), or {@code null} if the
	 *         input at the given index does not start with '<'
	 */
	private static String readIRI(String input, int index) {
		return matchAt(IRI_PATTERN, input, index, 1);
	}

	/**
	 * Applies the pattern to the input starting exactly at the given index, without copying the remainder of the
	 * input into a new string.
	 *
	 * @return the requested capture group, or {@code null} if the pattern does not match at that position
	 */
	private static String matchAt(Pattern pattern, String input, int index, int group) {
		final Matcher matcher = pattern.matcher(input);
		// anchoring bounds are on by default, so '^' matches at the start of the region
		matcher.region(index, input.length());
		return matcher.lookingAt() ? matcher.group(group) : null;
	}

	/** Maps {@code null} to the empty string so that malformed input never causes a NullPointerException. */
	private static String orEmpty(String value) {
		return value == null ? "" : value;
	}
}