// WstxInputData.java

/* Woodstox XML processor
 *
 * Copyright (c) 2004 Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in file LICENSE, included with
 * the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ctc.wstx.io;

import com.ctc.wstx.util.XmlChars;

import java.util.stream.IntStream;

/**
 * Base class used by readers (specifically, by
 * {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
 * to encapsulate the input buffer portion of the class. Philosophically
 * this should probably be done via containment (composition), not
 * sub-classing, but for performance reasons this "core" class is generally
 * extended from instead.
 *<p>
 * Main reason for the input data portion to be factored out of the main
 * class is that this way it can also be passed to nested input handling
 * Objects, which can then manipulate input buffers of the caller,
 * efficiently.
 */
public class WstxInputData
{
    // // // Some well-known chars:

    /**
     * Null-character is used as return value from some method(s), since
     * it is not a legal character in an XML document.
     */
    public final static char CHAR_NULL = '\u0000';

    /**
     * Same code point as {@link #CHAR_NULL}. Note: despite the name, this
     * constant is declared as <code>char</code>; the type is left as is
     * so that existing callers are not affected.
     */
    public final static char INT_NULL = 0;

    /**
     * Space character (0x20); also the upper bound used by
     * {@link #isSpaceChar}.
     */
    public final static char CHAR_SPACE = (char) 0x0020;

    /**
     * Same code point as {@link #CHAR_SPACE}. Note: despite the name, this
     * constant is declared as <code>char</code>; the type is left as is
     * so that existing callers are not affected.
     */
    public final static char INT_SPACE = 0x0020;

    /**
     * This constant defines the highest Unicode character allowed
     * in XML content.
     */
    public final static int MAX_UNICODE_CHAR = 0x10FFFF;

    /**
     * Lookup table, indexed by 7-bit ASCII code, of characters accepted by
     * {@link #isNameStartChar(char)}: letters and underscore (colon is
     * intentionally not included).
     *
     * @since 7.1.1
     */
    private static final boolean[] ASCII_NAME_START_CHARS = new boolean[128];
    static {
        IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
        IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
        ASCII_NAME_START_CHARS['_'] = true;
    }

    /**
     * Lookup table, indexed by 7-bit ASCII code, of characters accepted by
     * {@link #isNameChar(char)}: letters, digits, period, hyphen and
     * underscore (colon is intentionally not included).
     *
     * @since 7.1.1
     */
    private static final boolean[] ASCII_NAME_CHARS = new boolean[128];
    static {
        IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_CHARS[i] = true);
        IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_CHARS[i] = true);
        IntStream.rangeClosed('0', '9').forEach(i -> ASCII_NAME_CHARS[i] = true);
        ASCII_NAME_CHARS['.'] = true;
        ASCII_NAME_CHARS['-'] = true;
        ASCII_NAME_CHARS['_'] = true;
    }

    /*
    ////////////////////////////////////////////////////
    // Configuration
    ////////////////////////////////////////////////////
     */

    /**
     * Flag that indicates whether XML content is to be treated as per
     * XML 1.1 specification or not (if not, it'll use xml 1.0).
     */
    protected boolean mXml11 = false;

    /*
    ////////////////////////////////////////////////////
    // Current input data
    ////////////////////////////////////////////////////
     */

    /**
     * Current buffer from which data is read; generally data is read into
     * buffer from input source, but not always (especially when using nested
     * input contexts when expanding parsed entity references etc).
     */
    protected char[] mInputBuffer;

    /**
     * Pointer to next available character in buffer
     */
    protected int mInputPtr = 0;

    /**
     * Index of character after last available one in the buffer.
     */
    protected int mInputEnd = 0;

    /*
    ////////////////////////////////////////////////////
    // Current input location information
    ////////////////////////////////////////////////////
     */

    /**
     * Number of characters that were contained in previous blocks
     * (blocks that were already processed prior to the current buffer).
     */
    protected long mCurrInputProcessed = 0L;

    /**
     * Current row location of current point in input buffer, starting
     * from 1
     */
    protected int mCurrInputRow = 1;

    /**
     * Current index of the first character of the current row in input
     * buffer. Needed to calculate column position, if necessary; benefit
     * of not having column itself is that this only has to be updated
     * once per line.
     */
    protected int mCurrInputRowStart = 0;

    /*
    ////////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////////
     */

    protected WstxInputData() {
    }

    /**
     * Copies buffer reference, buffer pointers and location information
     * from the given instance into this one. The buffer itself is shared
     * (not cloned), and the {@link #mXml11} flag is not copied.
     *<p>
     * Note: Only public due to sub-classes needing to call this on
     * base class instance from different package (confusing?)
     *
     * @param src Instance to copy the state from
     */
    public void copyBufferStateFrom(WstxInputData src)
    {
        mInputBuffer = src.mInputBuffer;
        mInputPtr = src.mInputPtr;
        mInputEnd = src.mInputEnd;

        mCurrInputProcessed = src.mCurrInputProcessed;
        mCurrInputRow = src.mCurrInputRow;
        mCurrInputRowStart = src.mCurrInputRowStart;
    }

    /*
    ////////////////////////////////////////////////////
    // Public/package API, character classes
    ////////////////////////////////////////////////////
     */

    /**
     * Method that can be used to check whether specified character
     * is a valid first character of an XML 1.0/1.1 name; except that
     * colon (:) is not recognized as a start char here: caller has
     * to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     */
    protected final boolean isNameStartChar(char c)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c < 128) {
            // this is performance critical, so we use a lookup table instead of if-branches
            return ASCII_NAME_START_CHARS[c];
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Method that can be used to check whether specified character
     * is a valid character of an XML 1.0/1.1 name as any other char than
     * the first one; except that colon (:) is not recognized as valid here:
     * caller has to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     */
    protected final boolean isNameChar(char c)
    {
        // First, let's handle 7-bit ascii range
        if (c < 128) {
            // this is performance critical, so we use a lookup table instead of if-branches
            return ASCII_NAME_CHARS[c];
        }
        return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Stateless variant of name start character check.
     *
     * @param c Character to check
     * @param nsAware Whether namespace-aware rules are in effect; if so,
     *   colon (:) is NOT accepted as a name start character, otherwise
     *   it is
     * @param xml11 Whether to use XML 1.1 rules (true) or XML 1.0 rules
     *   (false) for characters above 'z'
     *
     * @return True if the character can start an XML name
     */
    public final static boolean isNameStartChar(char c, boolean nsAware, boolean xml11)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // before 'A' just white space (and colon)
                if (c == ':' && !nsAware) {
                    return true;
                }
                return false;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Stateless variant of name character check (for characters other
     * than the first one of a name).
     *
     * @param c Character to check
     * @param nsAware Whether namespace-aware rules are in effect; if so,
     *   colon (:) is NOT accepted as a name character, otherwise it is
     * @param xml11 Whether to use XML 1.1 rules (true) or XML 1.0 rules
     *   (false) for characters above 'z'
     *
     * @return True if the character can be part of an XML name
     */
    public final static boolean isNameChar(char c, boolean nsAware, boolean xml11)
    {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-')
                    || (c == ':' && !nsAware);
            }
            return (c == 0x5F); // '_' is ok too
        }
        return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal XML names.
     *<p>
     * Note: an empty String results in -1 being returned (there is no
     * character to point to); callers that need to reject empty names
     * have to check for that separately.
     *
     * @param name String to check
     * @param nsAware Whether namespace-aware rules are in effect; if so,
     *   colon (:) is considered illegal
     * @param xml11 Whether to use XML 1.1 rules (true) or XML 1.0 rules (false)
     *
     * @return Index of the first illegal xml name characters, if any;
     *   -1 if the name is completely legal
     */
    public final static int findIllegalNameChar(String name, boolean nsAware, boolean xml11)
    {
        int len = name.length();
        if (len < 1) {
            return -1;
        }

        // First char has stricter rules than the rest
        if (!isNameStartChar(name.charAt(0), nsAware, xml11)) {
            return 0;
        }

        for (int i = 1; i < len; ++i) {
            if (!isNameChar(name.charAt(i), nsAware, xml11)) {
                return i;
            }
        }

        return -1;
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal in an XML name token (Nmtoken).
     * Unlike with names, the first character has no stricter rules:
     * every character, including the first one, has to be a valid name
     * character.
     *
     * @param nmtoken String to check
     * @param nsAware Whether namespace-aware rules are in effect; if so,
     *   colon (:) is considered illegal
     * @param xml11 Whether to use XML 1.1 rules (true) or XML 1.0 rules (false)
     *
     * @return Index of the first illegal name token character, if any;
     *   -1 if no illegal character was found
     */
    public final static int findIllegalNmtokenChar(String nmtoken, boolean nsAware, boolean xml11)
    {
        int len = nmtoken.length();
        /* No special handling for the first char, just the loop; but the
         * loop has to start from the very first character (index 0), so
         * that it gets validated as well.
         */
        for (int i = 0; i < len; ++i) {
            if (!isNameChar(nmtoken.charAt(i), nsAware, xml11)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Quick check for "white space or lower": returns true for any
     * character with code at or below that of space (0x20), which
     * includes null and other control characters in that range, not
     * just XML white space.
     */
    public final static boolean isSpaceChar(char c)
    {
        return (c <= CHAR_SPACE);
    }

    /**
     * Method for constructing a human-readable description of given
     * character, for use in (error) messages.
     *
     * @param c Character to describe
     *
     * @return Description that contains the decimal character code; and
     *   additionally the character itself unless it is an ISO control
     *   character, as well as hexadecimal code for characters above 255
     */
    public static String getCharDesc(char c)
    {
        int i = c;
        if (Character.isISOControl(c)) {
            return "(CTRL-CHAR, code "+i+")";
        }
        if (i > 255) {
            return "'"+c+"' (code "+i+" / 0x"+Integer.toHexString(i)+")";
        }
        return "'"+c+"' (code "+i+")";
    }

}