WstxInputData.java
/* Woodstox XML processor
*
* Copyright (c) 2004 Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in file LICENSE, included with
* the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.io;
import com.ctc.wstx.util.XmlChars;
import java.util.stream.IntStream;
/**
 * Base class used by readers (specifically, by
 * {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
 * to encapsulate input buffer portion of the class. Philosophically
 * this should probably be done via containment (composition), not
 * sub-classing but for performance reason, this "core" class is generally
 * extended from instead.
 *<p>
 * Main reason for the input data portion to be factored out of main
 * class is that this way it can also be passed to nested input handling
 * Objects, which can then manipulate input buffers of the caller,
 * efficiently.
 *<p>
 * In addition to the buffer state, the class offers character
 * classification helpers for XML names: instance variants that use the
 * XML version configured on the instance (and never accept a colon), and
 * static variants that take namespace-awareness and XML version as
 * explicit arguments.
 */
public class WstxInputData
{
    // // // Some well-known chars:

    /**
     * Null-character is used as return value from some method(s), since
     * it is not a legal character in an XML document.
     */
    public final static char CHAR_NULL = '\u0000';

    /*
     * NOTE: despite the INT_ prefix, INT_NULL and INT_SPACE are declared
     * as char. The declared type is intentionally left as-is so that
     * existing callers keep compiling and behaving the same way.
     */
    public final static char INT_NULL = 0;

    public final static char CHAR_SPACE = (char) 0x0020;

    public final static char INT_SPACE = 0x0020;

    /**
     * This constant defines the highest Unicode character allowed
     * in XML content.
     */
    public final static int MAX_UNICODE_CHAR = 0x10FFFF;

    /**
     * Lookup table for the 7-bit ASCII range: entry is true for characters
     * accepted as the first character of a name by
     * {@link #isNameStartChar(char)} (letters and underscore; colon is
     * deliberately excluded).
     *
     * @since 7.1.1
     */
    private static final boolean[] ASCII_NAME_START_CHARS = new boolean[128];
    static {
        IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
        IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
        ASCII_NAME_START_CHARS['_'] = true;
    }

    /**
     * Lookup table for the 7-bit ASCII range: entry is true for characters
     * accepted as a non-first character of a name by
     * {@link #isNameChar(char)} (letters, digits, period, hyphen and
     * underscore; colon is deliberately excluded).
     *
     * @since 7.1.1
     */
    private static final boolean[] ASCII_NAME_CHARS = new boolean[128];
    static {
        IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_CHARS[i] = true);
        IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_CHARS[i] = true);
        IntStream.rangeClosed('0', '9').forEach(i -> ASCII_NAME_CHARS[i] = true);
        ASCII_NAME_CHARS['.'] = true;
        ASCII_NAME_CHARS['-'] = true;
        ASCII_NAME_CHARS['_'] = true;
    }

    /*
    ////////////////////////////////////////////////////
    // Configuration
    ////////////////////////////////////////////////////
     */

    /**
     * Flag that indicates whether XML content is to be treated as per
     * XML 1.1 specification or not (if not, it'll use xml 1.0).
     */
    protected boolean mXml11 = false;

    /*
    ////////////////////////////////////////////////////
    // Current input data
    ////////////////////////////////////////////////////
     */

    /**
     * Current buffer from which data is read; generally data is read into
     * buffer from input source, but not always (especially when using nested
     * input contexts when expanding parsed entity references etc).
     */
    protected char[] mInputBuffer;

    /**
     * Pointer to next available character in buffer
     */
    protected int mInputPtr = 0;

    /**
     * Index of character after last available one in the buffer.
     */
    protected int mInputEnd = 0;

    /*
    ////////////////////////////////////////////////////
    // Current input location information
    ////////////////////////////////////////////////////
     */

    /**
     * Number of characters that were contained in previous blocks
     * (blocks that were already processed prior to the current buffer).
     */
    protected long mCurrInputProcessed = 0L;

    /**
     * Current row location of current point in input buffer, starting
     * from 1
     */
    protected int mCurrInputRow = 1;

    /**
     * Current index of the first character of the current row in input
     * buffer. Needed to calculate column position, if necessary; benefit
     * of not having column itself is that this only has to be updated
     * once per line.
     */
    protected int mCurrInputRowStart = 0;

    /*
    ////////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////////
     */

    protected WstxInputData() {
    }

    /**
     * Copies the buffer reference, the read pointers and the location
     * counters from the given instance into this one. The buffer itself
     * is shared, not cloned, and the XML version flag is not copied.
     *<p>
     * Note: Only public due to sub-classes needing to call this on
     * base class instance from different package (confusing?)
     *
     * @param src Instance to copy the buffer state from
     */
    public void copyBufferStateFrom(WstxInputData src)
    {
        mInputBuffer = src.mInputBuffer;
        mInputPtr = src.mInputPtr;
        mInputEnd = src.mInputEnd;
        mCurrInputProcessed = src.mCurrInputProcessed;
        mCurrInputRow = src.mCurrInputRow;
        mCurrInputRowStart = src.mCurrInputRowStart;
    }

    /*
    ////////////////////////////////////////////////////
    // Public/package API, character classes
    ////////////////////////////////////////////////////
     */

    /**
     * Method that can be used to check whether specified character
     * is a valid first character of an XML 1.0/1.1 name; except that
     * colon (:) is not recognized as a start char here: caller has
     * to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     *
     * @param c Character to check
     * @return True if the character can start a name (colon excluded)
     */
    protected final boolean isNameStartChar(char c)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c < 128) {
            // this is performance critical, so we use a lookup table instead of if-branches
            return ASCII_NAME_START_CHARS[c];
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Method that can be used to check whether specified character
     * is a valid character of an XML 1.0/1.1 name as any other char than
     * the first one; except that colon (:) is not recognized as valid here:
     * caller has to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     *
     * @param c Character to check
     * @return True if the character can appear in a name after the first
     *   character (colon excluded)
     */
    protected final boolean isNameChar(char c)
    {
        // First, let's handle 7-bit ascii range
        if (c < 128) {
            // this is performance critical, so we use a lookup table instead of if-branches
            return ASCII_NAME_CHARS[c];
        }
        return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Static variant of the name start character check, with
     * namespace-awareness and XML version passed explicitly.
     *
     * @param c Character to check
     * @param nsAware If true, colon is rejected; if false, colon is
     *   accepted as a start character
     * @param xml11 If true, XML 1.1 rules are used for characters above
     *   'z'; otherwise XML 1.0 rules
     * @return True if the character can start a name
     */
    public final static boolean isNameStartChar(char c, boolean nsAware, boolean xml11)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // before 'A' just white space (and colon)
                return (c == ':') && !nsAware;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Static variant of the name character check, with
     * namespace-awareness and XML version passed explicitly.
     *
     * @param c Character to check
     * @param nsAware If true, colon is rejected; if false, colon is
     *   accepted as a name character
     * @param xml11 If true, XML 1.1 rules are used for characters above
     *   'z'; otherwise XML 1.0 rules
     * @return True if the character can appear in a name
     */
    public final static boolean isNameChar(char c, boolean nsAware, boolean xml11)
    {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-')
                    || (c == ':' && !nsAware);
            }
            return (c == 0x5F); // '_' is ok too
        }
        return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal XML names.
     *<p>
     * The first character is checked with
     * {@link #isNameStartChar(char, boolean, boolean)}, all others with
     * {@link #isNameChar(char, boolean, boolean)}. An empty String is
     * reported as having no illegal characters.
     *
     * @param name String to check
     * @param nsAware If true, colon is considered illegal
     * @param xml11 If true, XML 1.1 rules are used; otherwise XML 1.0
     *
     * @return Index of the first illegal xml name characters, if any;
     *   -1 if the name is completely legal
     */
    public final static int findIllegalNameChar(String name, boolean nsAware, boolean xml11)
    {
        if (name.length() < 1) {
            return -1;
        }
        // First char has its own, stricter rule set
        if (!isNameStartChar(name.charAt(0), nsAware, xml11)) {
            return 0;
        }
        return findIllegalNameCharFrom(name, 1, nsAware, xml11);
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal in an XML name token (NMTOKEN).
     *<p>
     * Unlike with names, there is no special rule for the first
     * character: every character, including the one at index 0, has to
     * pass {@link #isNameChar(char, boolean, boolean)}. An empty String is
     * reported as having no illegal characters.
     *
     * @param nmtoken String to check
     * @param nsAware If true, colon is considered illegal
     * @param xml11 If true, XML 1.1 rules are used; otherwise XML 1.0
     *
     * @return Index of the first illegal character, if any; -1 if all
     *   characters are legal
     */
    public final static int findIllegalNmtokenChar(String nmtoken, boolean nsAware, boolean xml11)
    {
        // No special handling for the first char: scan has to start from index 0
        return findIllegalNameCharFrom(nmtoken, 0, nsAware, xml11);
    }

    /**
     * Helper that scans given String from the specified index onwards,
     * looking for the first character that is not a legal name character.
     *
     * @return Index of the first offending character at or after
     *   <code>start</code>; -1 if there is none
     */
    private static int findIllegalNameCharFrom(String str, int start, boolean nsAware, boolean xml11)
    {
        for (int i = start, len = str.length(); i < len; ++i) {
            if (!isNameChar(str.charAt(i), nsAware, xml11)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Checks whether given character is to be treated as white space;
     * this is simply a check for the character code being at or below
     * that of the space character.
     *
     * @param c Character to check
     * @return True if the character code is at most 0x20
     */
    public final static boolean isSpaceChar(char c)
    {
        return (c <= CHAR_SPACE);
    }

    /**
     * Builds a human-readable description of the given character.
     * Characters that {@link Character#isISOControl(char)} reports as
     * control characters are described by their code only; for others
     * the character itself is included, along with its decimal code and,
     * for codes above 255, the hexadecimal code as well.
     *
     * @param c Character to describe
     * @return Description of the character
     */
    public static String getCharDesc(char c)
    {
        // Widening conversion; no explicit cast needed
        final int code = c;
        if (Character.isISOControl(c)) {
            return "(CTRL-CHAR, code "+code+")";
        }
        if (code > 255) {
            return "'"+c+"' (code "+code+" / 0x"+Integer.toHexString(code)+")";
        }
        return "'"+c+"' (code "+code+")";
    }
}