SparqlFormatter.java
/*******************************************************************************
* Copyright (c) 2025 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
package org.eclipse.rdf4j.queryrender;
public final class SparqlFormatter {
private SparqlFormatter() {
}
public static String format(String query) {
return format(query, 2);
}
public static String format(String query, int indentWidth) {
if (query == null) {
return null;
}
final String s = query;
final int n = s.length();
StringBuilder out = new StringBuilder(n + 64);
int braceIndent = 0; // spaces due to { }
boolean atLineStart = true;
int lineStart = 0; // start index in 'out' of the current line
int pendingPredicateCol = -1; // set after ';', used exactly once on the next non-ws token
State st = new State();
for (int i = 0; i < n; i++) {
char ch = s.charAt(i);
// COMMENT MODE
if (st.inComment) {
out.append(ch);
if (ch == '\n') {
atLineStart = true;
lineStart = out.length();
st.inComment = false;
pendingPredicateCol = -1; // new line cancels alignment
}
continue;
}
// STRING MODES
if (st.inString) {
out.append(ch);
if (st.esc) {
st.esc = false;
continue;
}
if (ch == '\\') {
st.esc = true;
continue;
}
if (ch == st.quote) {
if (st.longString) {
if (i + 2 < n && s.charAt(i + 1) == st.quote && s.charAt(i + 2) == st.quote) {
out.append(st.quote).append(st.quote);
i += 2;
st.resetString();
}
} else {
st.resetString();
}
}
continue;
}
// IRI MODE
if (st.inIRI) {
out.append(ch);
if (ch == '>') {
st.inIRI = false;
}
continue;
}
// TOP-LEVEL: decide behavior
if (ch == '#') {
// Start a comment at current line; honor pending alignment if at line start.
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
out.append('#');
st.inComment = true;
continue;
}
if (ch == '<') { // IRI start
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
out.append('<');
st.inIRI = true;
continue;
}
if (ch == '"' || ch == '\'') { // string start
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
boolean isLong = (i + 2 < n && s.charAt(i + 1) == ch && s.charAt(i + 2) == ch);
out.append(ch);
if (isLong) {
out.append(ch).append(ch);
i += 2;
}
st.startString(ch, isLong);
continue;
}
if (ch == '{') {
if (atLineStart) {
appendIndent(out, braceIndent);
} else if (needsSpaceBefore(out)) {
out.append(' ');
}
out.append('{').append('\n');
atLineStart = true;
lineStart = out.length();
braceIndent += indentWidth;
pendingPredicateCol = -1; // after an opening brace, no predicate alignment pending
i = skipWs(s, i + 1) - 1; // normalize whitespace after '{'
continue;
}
if (ch == '}') {
// finish any partial line
if (!atLineStart) {
rstripLine(out, lineStart);
out.append('\n');
}
braceIndent = Math.max(0, braceIndent - indentWidth);
appendIndent(out, braceIndent);
out.append('}').append('\n');
atLineStart = true;
lineStart = out.length();
pendingPredicateCol = -1;
// handle "} UNION {"
int j = skipWs(s, i + 1);
if (matchesWordIgnoreCase(s, j, "UNION")) {
appendIndent(out, braceIndent + 2);
out.append("UNION").append('\n');
atLineStart = true;
lineStart = out.length();
j = skipWs(s, j + 5);
if (j < n && s.charAt(j) == '{') {
appendIndent(out, braceIndent);
out.append('{').append('\n');
atLineStart = true;
lineStart = out.length();
braceIndent += indentWidth;
j = skipWs(s, j + 1);
}
i = j - 1;
} else {
i = j - 1;
}
continue;
}
if (ch == '[') {
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
int after = formatSquareBlock(s, i, out, lineStart); // writes either [] or a multi-line block
i = after - 1;
// if helper ended with newline, reflect that
if (out.length() > 0 && out.charAt(out.length() - 1) == '\n') {
atLineStart = true;
lineStart = out.length();
}
continue;
}
if (ch == '(') {
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
int after = formatParenCollapsed(s, i, out);
i = after - 1;
continue;
}
if (ch == ';') {
// End of predicate-object pair (outside []), start next predicate under the same column.
out.append(';');
pendingPredicateCol = computePredicateColumnFromCurrentLine(out, lineStart);
out.append('\n');
atLineStart = true;
lineStart = out.length();
// CRITICAL: skip all whitespace in INPUT following ';' so we don't double-indent.
i = skipWs(s, i + 1) - 1;
continue;
}
if (ch == '\r' || ch == '\n') {
if (!atLineStart) {
rstripLine(out, lineStart);
out.append('\n');
atLineStart = true;
lineStart = out.length();
}
i = skipNewlines(s, i + 1) - 1;
pendingPredicateCol = -1; // a raw newline resets alignment
continue;
}
if (ch == ' ' || ch == '\t') {
// Drop leading indentation from the input; otherwise copy spaces.
if (!atLineStart) {
out.append(ch);
}
while (atLineStart && i + 1 < n && (s.charAt(i + 1) == ' ' || s.charAt(i + 1) == '\t')) {
i++;
}
continue;
}
// Default: normal token character
if (atLineStart) {
appendLineIndent(out, braceIndent, pendingPredicateCol);
atLineStart = false;
pendingPredicateCol = -1;
}
out.append(ch);
}
// Trim trailing whitespace/newlines.
int end = out.length();
while (end > 0 && Character.isWhitespace(out.charAt(end - 1))) {
end--;
}
return out.substring(0, end);
}
/* ================= helpers ================= */
private static void appendLineIndent(StringBuilder out, int braceIndent, int pendingPredicateCol) {
appendIndent(out, pendingPredicateCol >= 0 ? pendingPredicateCol : braceIndent);
}
private static void appendIndent(StringBuilder sb, int spaces) {
for (int i = 0; i < spaces; i++) {
sb.append(' ');
}
}
private static void rstripLine(StringBuilder sb, int lineStart) {
int i = sb.length();
while (i > lineStart) {
char c = sb.charAt(i - 1);
if (c == ' ' || c == '\t') {
i--;
} else {
break;
}
}
if (i < sb.length()) {
sb.setLength(i);
}
}
private static boolean needsSpaceBefore(StringBuilder out) {
int len = out.length();
return len > 0 && !Character.isWhitespace(out.charAt(len - 1));
}
private static int skipWs(String s, int pos) {
int i = pos;
while (i < s.length()) {
char c = s.charAt(i);
if (c != ' ' && c != '\t' && c != '\r' && c != '\n') {
break;
}
i++;
}
return i;
}
private static int skipNewlines(String s, int pos) {
int i = pos;
while (i < s.length()) {
char c = s.charAt(i);
if (c != '\r' && c != '\n') {
break;
}
i++;
}
return i;
}
private static boolean matchesWordIgnoreCase(String s, int pos, String word) {
int end = pos + word.length();
if (pos < 0 || end > s.length()) {
return false;
}
if (!s.regionMatches(true, pos, word, 0, word.length())) {
return false;
}
if (end < s.length() && isWordChar(s.charAt(end))) {
return false;
}
return pos == 0 || !isWordChar(s.charAt(pos - 1));
}
private static boolean isWordChar(char c) {
return Character.isLetterOrDigit(c) || c == '_';
}
/** Decide the predicate start column by reading the ALREADY EMITTED current line. */
private static int computePredicateColumnFromCurrentLine(StringBuilder out, int lineStart) {
int i = lineStart, n = out.length();
while (i < n && (out.charAt(i) == ' ' || out.charAt(i) == '\t')) {
i++; // leading spaces
}
i = skipSubjectToken(out, i, n); // subject token
while (i < n && (out.charAt(i) == ' ' || out.charAt(i) == '\t')) {
i++; // spaces before predicate
}
return i - lineStart;
}
private static int skipSubjectToken(CharSequence s, int i, int n) {
if (i >= n) {
return i;
}
char c = s.charAt(i);
if (c == '[') { // blank node subject
int depth = 0;
boolean inIRI = false, inStr = false, esc = false;
char q = 0;
for (int j = i + 1; j < n; j++) {
char d = s.charAt(j);
if (inIRI) {
if (d == '>') {
inIRI = false;
}
continue;
}
if (inStr) {
if (esc) {
esc = false;
continue;
}
if (d == '\\') {
esc = true;
continue;
}
if (d == q) {
inStr = false;
}
continue;
}
if (d == '<') {
inIRI = true;
continue;
}
if (d == '"' || d == '\'') {
inStr = true;
q = d;
continue;
}
if (d == '[') {
depth++;
continue;
}
if (d == ']') {
if (depth == 0) {
return j + 1;
}
depth--;
}
}
return n;
}
if (c == '(') { // collection subject
int depth = 0;
boolean inIRI = false, inStr = false, esc = false;
char q = 0;
for (int j = i + 1; j < n; j++) {
char d = s.charAt(j);
if (inIRI) {
if (d == '>') {
inIRI = false;
}
continue;
}
if (inStr) {
if (esc) {
esc = false;
continue;
}
if (d == '\\') {
esc = true;
continue;
}
if (d == q) {
inStr = false;
}
continue;
}
if (d == '<') {
inIRI = true;
continue;
}
if (d == '"' || d == '\'') {
inStr = true;
q = d;
continue;
}
if (d == '(') {
depth++;
continue;
}
if (d == ')') {
if (depth == 0) {
return j + 1;
}
depth--;
}
}
return n;
}
if (c == '<') { // IRI subject
int j = i + 1;
while (j < n && s.charAt(j) != '>') {
j++;
}
return Math.min(n, j + 1);
}
if (c == '?' || c == '$') { // variable subject
int j = i + 1;
while (j < n && isNameChar(s.charAt(j))) {
j++;
}
return j;
}
// QName or 'a'
int j = i;
while (j < n) {
char d = s.charAt(j);
if (Character.isWhitespace(d)) {
break;
}
if ("{}[]().,;".indexOf(d) >= 0) {
break;
}
j++;
}
return j;
}
private static boolean isNameChar(char c) {
return Character.isLetterOrDigit(c) || c == '_' || c == '-';
}
/* -------- square brackets -------- */
/**
* Format a '[' ... ']' block. - If no top-level ';' inside: single line with collapsed inner whitespace: `[ ... ]`
* - Else: multi-line with content indented 2 spaces past '[' and ']' aligned under '['. Returns index AFTER the
* matching ']' in the INPUT.
*/
private static int formatSquareBlock(String s, int i, StringBuilder out, int lineStartOut) {
final int n = s.length();
int j = i + 1;
ScanState scan = new ScanState();
int innerDepth = 0;
boolean hasTopLevelSemicolon = false;
for (; j < n; j++) {
char c = s.charAt(j);
if (scan.inComment) {
if (c == '\n') {
scan.inComment = false;
}
continue;
}
if (scan.inIRI) {
if (c == '>') {
scan.inIRI = false;
}
continue;
}
if (scan.inString) {
if (scan.esc) {
scan.esc = false;
continue;
}
if (c == '\\') {
scan.esc = true;
continue;
}
if (c == scan.quote) {
if (scan.longString) {
if (j + 2 < n && s.charAt(j + 1) == scan.quote && s.charAt(j + 2) == scan.quote) {
j += 2;
scan.resetString();
}
} else {
scan.resetString();
}
}
continue;
}
if (c == '#') {
scan.inComment = true;
continue;
}
if (c == '<') {
scan.inIRI = true;
continue;
}
if (c == '"' || c == '\'') {
boolean isLong = (j + 2 < n && s.charAt(j + 1) == c && s.charAt(j + 2) == c);
scan.startString(c, isLong);
continue;
}
if (c == '[') {
innerDepth++;
continue;
}
if (c == ']') {
if (innerDepth == 0) {
break;
}
innerDepth--;
continue;
}
if (c == ';' && innerDepth == 0) {
hasTopLevelSemicolon = true;
}
}
int end = j; // position of the matching ']'
if (end >= n || s.charAt(end) != ']') {
out.append('['); // unmatched; emit literal '[' and move on
return i + 1;
}
if (!hasTopLevelSemicolon) {
// Single-line blank node: normalize inner ws to single spaces.
String inner = collapseWsExceptInStringsAndIRIs(s.substring(i + 1, end));
if (inner.isEmpty()) {
out.append("[]");
} else {
out.append('[').append(' ').append(inner).append(' ').append(']');
}
return end + 1;
}
// Multi-line blank node
int bracketCol = out.length() - lineStartOut; // column where '[' appears
out.append('[').append('\n');
int contentIndent = bracketCol + 2;
int k = i + 1;
boolean atLineStart = true;
while (k < end) {
char c = s.charAt(k);
// comments
if (scan.inComment) {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append(c);
if (c == '\n') {
atLineStart = true;
scan.inComment = false;
}
k++;
continue;
}
// IRIs
if (scan.inIRI) {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append(c);
if (c == '>') {
scan.inIRI = false;
}
k++;
continue;
}
// strings
if (scan.inString) {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append(c);
if (scan.esc) {
scan.esc = false;
k++;
continue;
}
if (c == '\\') {
scan.esc = true;
k++;
continue;
}
if (c == scan.quote) {
if (scan.longString) {
if (k + 2 < end && s.charAt(k + 1) == scan.quote && s.charAt(k + 2) == scan.quote) {
out.append(scan.quote).append(scan.quote);
k += 3;
scan.resetString();
continue;
}
} else {
scan.resetString();
}
}
k++;
continue;
}
// structural
if (c == '#') {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append('#');
scan.inComment = true;
k++;
continue;
}
if (c == '<') {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append('<');
scan.inIRI = true;
k++;
continue;
}
if (c == '"' || c == '\'') {
boolean isLong = (k + 2 < end && s.charAt(k + 1) == c && s.charAt(k + 2) == c);
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append(c);
if (isLong) {
out.append(c).append(c);
k += 3;
} else {
k++;
}
scan.startString(c, isLong);
continue;
}
if (c == '[') {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
int after = formatSquareBlock(s, k, out,
out.length() - (out.length() - (out.length() - contentIndent))); // effectively line start
k = after;
continue;
}
if (c == '(') {
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
int after = formatParenCollapsed(s, k, out);
k = after;
continue;
}
if (c == ';') {
out.append(';').append('\n');
atLineStart = true;
k = skipWs(s, k + 1);
continue;
}
if (c == '\r' || c == '\n') {
if (!atLineStart) {
out.append(' ');
}
k = skipNewlines(s, k + 1);
continue;
}
if (c == ' ' || c == '\t') {
int w = k + 1;
while (w < end && (s.charAt(w) == ' ' || s.charAt(w) == '\t')) {
w++;
}
if (!atLineStart) {
out.append(' ');
}
k = w;
continue;
}
if (atLineStart) {
appendIndent(out, contentIndent);
atLineStart = false;
}
out.append(c);
k++;
}
// Close and align ']'
if (out.length() == 0 || out.charAt(out.length() - 1) != '\n') {
out.append('\n');
}
appendIndent(out, bracketCol);
out.append(']');
return end + 1;
}
/** Format a '(' ... ')' block by collapsing inner whitespace to single spaces. */
private static int formatParenCollapsed(String s, int i, StringBuilder out) {
final int n = s.length();
int j = i + 1;
ScanState scan = new ScanState();
int parenDepth = 0;
StringBuilder inner = new StringBuilder();
for (; j < n; j++) {
char c = s.charAt(j);
if (scan.inComment) {
if (c == '\n') {
scan.inComment = false;
}
continue;
}
if (scan.inIRI) {
inner.append(c);
if (c == '>') {
scan.inIRI = false;
}
continue;
}
if (scan.inString) {
inner.append(c);
if (scan.esc) {
scan.esc = false;
continue;
}
if (c == '\\') {
scan.esc = true;
continue;
}
if (c == scan.quote) {
if (scan.longString) {
if (j + 2 < n && s.charAt(j + 1) == scan.quote && s.charAt(j + 2) == scan.quote) {
inner.append(scan.quote).append(scan.quote);
j += 2;
scan.resetString();
}
} else {
scan.resetString();
}
}
continue;
}
if (c == '#') {
scan.inComment = true;
continue;
}
if (c == '<') {
inner.append('<');
scan.inIRI = true;
continue;
}
if (c == '"' || c == '\'') {
boolean isLong = (j + 2 < n && s.charAt(j + 1) == c && s.charAt(j + 2) == c);
inner.append(c);
if (isLong) {
inner.append(c).append(c);
j += 2;
}
scan.startString(c, isLong);
continue;
}
if (c == '(') {
parenDepth++;
inner.append(c);
continue;
}
if (c == ')') {
if (parenDepth == 0) {
break;
}
parenDepth--;
inner.append(c);
continue;
}
inner.append(c);
}
int end = j;
String collapsed = collapseSimple(inner);
out.append('(');
if (!collapsed.isEmpty()) {
out.append(' ').append(collapsed).append(' ');
}
out.append(')');
return end + 1;
}
private static String collapseSimple(CharSequence inner) {
StringBuilder dst = new StringBuilder(inner.length());
boolean lastSpace = false;
for (int i = 0; i < inner.length(); i++) {
char c = inner.charAt(i);
if (Character.isWhitespace(c)) {
if (!lastSpace) {
dst.append(' ');
lastSpace = true;
}
} else {
dst.append(c);
lastSpace = false;
}
}
int a = 0, b = dst.length();
if (a < b && dst.charAt(a) == ' ') {
a++;
}
if (a < b && dst.charAt(b - 1) == ' ') {
b--;
}
return dst.substring(a, b);
}
private static String collapseWsExceptInStringsAndIRIs(String src) {
StringBuilder dst = new StringBuilder(src.length());
boolean inIRI = false, inStr = false, esc = false, longStr = false;
char quote = 0;
boolean wroteSpace = false;
for (int i = 0; i < src.length(); i++) {
char c = src.charAt(i);
if (inIRI) {
dst.append(c);
if (c == '>') {
inIRI = false;
}
continue;
}
if (inStr) {
dst.append(c);
if (esc) {
esc = false;
continue;
}
if (c == '\\') {
esc = true;
continue;
}
if (c == quote) {
if (longStr) {
if (i + 2 < src.length() && src.charAt(i + 1) == quote && src.charAt(i + 2) == quote) {
dst.append(quote).append(quote);
i += 2;
inStr = false;
}
} else {
inStr = false;
}
}
continue;
}
if (c == '<') {
dst.append(c);
inIRI = true;
wroteSpace = false;
continue;
}
if (c == '"' || c == '\'') {
boolean isLong = (i + 2 < src.length() && src.charAt(i + 1) == c && src.charAt(i + 2) == c);
dst.append(c);
if (isLong) {
dst.append(c).append(c);
i += 2;
}
inStr = true;
quote = c;
longStr = isLong;
wroteSpace = false;
continue;
}
if (Character.isWhitespace(c)) {
if (!wroteSpace) {
dst.append(' ');
wroteSpace = true;
}
continue;
}
dst.append(c);
wroteSpace = false;
}
int a = 0, b = dst.length();
if (a < b && dst.charAt(a) == ' ') {
a++;
}
if (a < b && dst.charAt(b - 1) == ' ') {
b--;
}
return dst.substring(a, b);
}
/* ===== small state carriers ===== */
private static final class State {
boolean inIRI = false, inComment = false, inString = false, longString = false, esc = false;
char quote = 0;
void startString(char q, boolean isLong) {
inString = true;
quote = q;
longString = isLong;
esc = false;
}
void resetString() {
inString = false;
longString = false;
quote = 0;
esc = false;
}
}
private static final class ScanState {
boolean inIRI = false, inComment = false, inString = false, longString = false, esc = false;
char quote = 0;
void startString(char q, boolean isLong) {
inString = true;
quote = q;
longString = isLong;
esc = false;
}
void resetString() {
inString = false;
longString = false;
quote = 0;
esc = false;
}
}
public static void main(String[] args) {
String test = "SELECT ?s ?o WHERE {\n" +
" {\n" +
" ?s ex:pC ?u2 .\n" +
" FILTER EXISTS {\n" +
" {\n" +
" ?s ex:pC ?u0 .\n" +
" FILTER EXISTS { { \n" +
" ?s !(ex:pB|foaf:name) ?o .\n" +
" } }\n" +
" }\n" +
" UNION\n" +
" {\n" +
" ?u1 ex:pD ?v1 .\n" +
" }\n" +
" }\n" +
" }\n" +
"}";
// System.out.println("Original:\n" + test);
// System.out.println("Formatted:");
System.out.println(format(test));
}
}