TokenQueue.java
package org.jsoup.parser;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
/**
A character reader with helpers focusing on parsing CSS selectors. Used internally by jsoup. API subject to changes.
*/
public class TokenQueue implements AutoCloseable {
private static final char Esc = '\\'; // escape char for chomp balanced.
private static final char Hyphen_Minus = '-';
private static final char Unicode_Null = '\u0000';
private static final char Replacement = '\uFFFD';
private final CharacterReader reader;
/**
Create a new TokenQueue.
@param data string of data to back queue.
*/
public TokenQueue(String data) {
reader = new CharacterReader(data);
}
/**
Is the queue empty?
@return true if no data left in queue.
*/
public boolean isEmpty() {
return reader.isEmpty();
}
/**
Consume one character off queue.
@return first character on queue.
*/
public char consume() {
return reader.consume();
}
/**
Drops the next character off the queue.
*/
public void advance() {
if (!isEmpty()) reader.advance();
}
char current() {
return reader.current();
}
/**
Tests if the next characters on the queue match the sequence, case-insensitively.
@param seq String to check queue for.
@return true if the next characters match.
*/
public boolean matches(String seq) {
return reader.matchesIgnoreCase(seq);
}
/** Tests if the next character on the queue matches the character, case-sensitively. */
public boolean matches(char c) {
return reader.matches(c);
}
/**
Tests if the next characters match any of the sequences, case-<b>sensitively</b>.
@param seq list of chars to case-sensitively check for
@return true of any matched, false if none did
*/
public boolean matchesAny(char... seq) {
return reader.matchesAny(seq);
}
/**
If the queue case-insensitively matches the supplied string, consume it off the queue.
@param seq String to search for, and if found, remove from queue.
@return true if found and removed, false if not found.
*/
public boolean matchChomp(String seq) {
return reader.matchConsumeIgnoreCase(seq);
}
/** If the queue matches the supplied (case-sensitive) character, consume it off the queue. */
public boolean matchChomp(char c) {
if (reader.matches(c)) {
consume();
return true;
}
return false;
}
/**
Tests if queue starts with a whitespace character.
@return if starts with whitespace
*/
public boolean matchesWhitespace() {
return StringUtil.isWhitespace(reader.current());
}
/**
Test if the queue matches a tag word character (letter or digit).
@return if matches a word character
*/
public boolean matchesWord() {
return Character.isLetterOrDigit(reader.current());
}
/**
Consumes the supplied sequence of the queue, case-insensitively. If the queue does not start with the supplied
sequence, will throw an illegal state exception -- but you should be running match() against that condition.
@param seq sequence to remove from head of queue.
*/
public void consume(String seq) {
boolean found = reader.matchConsumeIgnoreCase(seq);
if (!found) throw new IllegalStateException("Queue did not match expected sequence");
}
/**
Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
@param seq String to end on (and not include in return, but leave on queue). <b>Case-sensitive.</b>
@return The matched data consumed from queue.
*/
public String consumeTo(String seq) {
return reader.consumeTo(seq);
}
/**
Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
@param seq any number of terminators to consume to. <b>Case-insensitive.</b>
@return consumed string
*/
public String consumeToAny(String... seq) {
StringBuilder sb = StringUtil.borrowBuilder();
OUT: while (!isEmpty()) {
for (String s : seq) {
if (reader.matchesIgnoreCase(s)) break OUT;
}
sb.append(consume());
}
return StringUtil.releaseBuilder(sb);
}
/**
Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \).
Those escapes will be left in the returned string, which is suitable for regexes (where we need to preserve the
escape), but unsuitable for contains text strings; use unescape for that.
@param open opener
@param close closer
@return data matched from the queue
*/
public String chompBalanced(char open, char close) {
StringBuilder accum = StringUtil.borrowBuilder();
int depth = 0;
char prev = 0;
boolean inSingle = false;
boolean inDouble = false;
boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote()
reader.mark(); // mark the initial position to restore if needed
do {
if (isEmpty()) break;
char c = consume();
if (prev == Esc) {
if (c == 'Q') inRegexQE = true;
else if (c == 'E') inRegexQE = false;
accum.append(c);
} else {
if (c == '\'' && c != open && !inDouble) inSingle = !inSingle;
else if (c == '"' && c != open && !inSingle) inDouble = !inDouble;
if (inSingle || inDouble || inRegexQE) {
accum.append(c);
} else if (c == open) {
depth++;
if (depth > 1) accum.append(c); // don't include the outer match pair in the return
} else if (c == close) {
depth--;
if (depth > 0) accum.append(c);
} else {
accum.append(c);
}
}
prev = c;
} while (depth > 0);
String out = StringUtil.releaseBuilder(accum);
if (depth > 0) {// ran out of queue before seeing enough )
reader.rewindToMark(); // restore position if we don't have a balanced string
Validate.fail("Did not find balanced marker at '" + out + "'");
}
return out;
}
/**
* Unescape a \ escaped string.
* @param in backslash escaped string
* @return unescaped string
*/
public static String unescape(String in) {
if (in.indexOf(Esc) == -1) return in;
StringBuilder out = StringUtil.borrowBuilder();
char last = 0;
for (char c : in.toCharArray()) {
if (c == Esc) {
if (last == Esc) {
out.append(c);
c = 0;
}
}
else
out.append(c);
last = c;
}
return StringUtil.releaseBuilder(out);
}
/**
Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
valid in a selector.
@see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a>
*/
public static String escapeCssIdentifier(String in) {
if (in.isEmpty()) return in;
StringBuilder out = StringUtil.borrowBuilder();
TokenQueue q = new TokenQueue(in);
char firstChar = q.current();
if (firstChar == Hyphen_Minus) {
q.advance();
if (q.isEmpty()) {
// If the character is the first character and is a "-" (U+002D), and there is no second character, then
// the escaped character.
appendEscaped(out, Hyphen_Minus);
} else {
out.append(Hyphen_Minus);
char secondChar = q.current();
if (StringUtil.isDigit(secondChar)) {
// If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the
// first character is a "-" (U+002D), then the character escaped as code point.
appendEscapedCodepoint(out, q.consume());
}
}
} else if (StringUtil.isDigit(firstChar)) {
// If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character
// escaped as code point.
appendEscapedCodepoint(out, q.consume());
}
while (!q.isEmpty()) {
// Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs
// are kept intact.
char c = q.consume();
if (c == Unicode_Null) {
// If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
out.append(Replacement);
} else if (c <= '\u001F' || c == '\u007F') {
// If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character
// escaped as code point.
appendEscapedCodepoint(out, c);
} else if (isIdent(c)) {
// If the character is not handled by one of the above rules and is greater than or equal to U+0080,
// is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039),
// [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself.
out.append(c);
} else {
// Otherwise, the escaped character.
appendEscaped(out, c);
}
}
q.close();
return StringUtil.releaseBuilder(out);
}
private static void appendEscaped(StringBuilder out, char c) {
out.append(Esc).append(c);
}
private static void appendEscapedCodepoint(StringBuilder out, char c) {
out.append(Esc).append(Integer.toHexString(c)).append(' ');
}
/**
* Pulls the next run of whitespace characters of the queue.
* @return Whether consuming whitespace or not
*/
public boolean consumeWhitespace() {
boolean seen = false;
while (matchesWhitespace()) {
advance();
seen = true;
}
return seen;
}
/**
* Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
*
* @return tag name
*/
public String consumeElementSelector() {
return consumeEscapedCssIdentifier(ElementSelectorChars);
}
private static final char[] ElementSelectorChars = {'*', '|', '_', '-'};
/**
Consume a CSS identifier (ID or class) off the queue.
<p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
of {@code \31}.</p>
@return The unescaped identifier.
@throws IllegalArgumentException if an invalid escape sequence was found. Afterward, the state of the TokenQueue
is undefined.
@see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a>
@see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a>
*/
public String consumeCssIdentifier() {
if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found");
// Fast path for CSS identifiers that don't contain escape sequences.
String identifier = reader.consumeMatching(TokenQueue::isIdent);
char c = current();
if (c != Esc && c != Unicode_Null) {
// If we didn't end on an Esc or a Null, we consumed the whole identifier
return identifier;
}
// An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier.
StringBuilder out = StringUtil.borrowBuilder();
if (!identifier.isEmpty()) {
// Copy the CSS identifier up to the first escape sequence.
out.append(identifier);
}
while (!isEmpty()) {
c = current();
if (isIdent(c)) {
out.append(consume());
} else if (c == Unicode_Null) {
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
advance();
out.append(Replacement);
} else if (c == Esc) {
advance();
if (!isEmpty() && isNewline(current())) {
// Not a valid escape sequence. This is treated as the end of the CSS identifier.
reader.unconsume();
break;
} else {
consumeCssEscapeSequenceInto(out);
}
} else {
break;
}
}
return StringUtil.releaseBuilder(out);
}
private void consumeCssEscapeSequenceInto(StringBuilder out) {
if (isEmpty()) {
out.append(Replacement);
return;
}
char firstEscaped = consume();
if (!StringUtil.isHexDigit(firstEscaped)) {
out.append(firstEscaped);
} else {
reader.unconsume(); // put back the first hex digit
String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits
int codePoint;
try {
codePoint = Integer.parseInt(hexString, 16);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e);
}
if (isValidCodePoint(codePoint)) {
out.appendCodePoint(codePoint);
} else {
out.append(Replacement);
}
if (!isEmpty()) {
char c = current();
if (c == '\r') {
// Since there's currently no input preprocessing, check for CRLF here.
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
advance();
if (!isEmpty() && current() == '\n') advance();
} else if (c == ' ' || c == '\t' || isNewline(c)) {
advance();
}
}
}
}
// statics below specifically for CSS identifiers:
// https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
private static boolean isNonAscii(char c) {
return c >= '\u0080';
}
// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
private static boolean isIdentStart(char c) {
return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c);
}
// https://www.w3.org/TR/css-syntax-3/#ident-code-point
private static boolean isIdent(char c) {
return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c);
}
// https://www.w3.org/TR/css-syntax-3/#newline
// Note: currently there's no preprocessing happening.
private static boolean isNewline(char c) {
return c == '\n' || c == '\r' || c == '\f';
}
// https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
private static boolean isValidCodePoint(int codePoint) {
return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint);
}
private static final char[] CssIdentifierChars = {'-', '_'};
private String consumeEscapedCssIdentifier(char... matches) {
StringBuilder sb = StringUtil.borrowBuilder();
while (!isEmpty()) {
char c = current();
if (c == Esc) {
advance();
if (!isEmpty()) sb.append(consume());
else break;
} else if (matchesCssIdentifier(matches)) {
sb.append(c);
advance();
} else {
break;
}
}
return StringUtil.releaseBuilder(sb);
}
private boolean matchesCssIdentifier(char... matches) {
return matchesWord() || reader.matchesAny(matches);
}
/**
Consume and return whatever is left on the queue.
@return remainder of queue.
*/
public String remainder() {
return reader.consumeToEnd();
}
@Override
public String toString() {
return reader.toString();
}
@Override
public void close() {
reader.close(); // releases buffer back to pool
}
}