XMLTokener.java

package org.json;

/*
Public Domain.
*/

import java.io.Reader;

/**
 * The XMLTokener extends the JSONTokener to provide additional methods
 * for the parsing of XML texts.
 * @author JSON.org
 * @version 2015-12-09
 */
public class XMLTokener extends JSONTokener {


   /** The table of entity values. It initially contains Character values for
    * amp, apos, gt, lt, quot.
    */
   public static final java.util.HashMap<String, Character> entity;

   private XMLParserConfiguration configuration = XMLParserConfiguration.ORIGINAL;

   static {
       entity = new java.util.HashMap<String, Character>(8);
       entity.put("amp",  XML.AMP);
       entity.put("apos", XML.APOS);
       entity.put("gt",   XML.GT);
       entity.put("lt",   XML.LT);
       entity.put("quot", XML.QUOT);
   }

    /**
     * Construct an XMLTokener from a Reader.
     * @param r A source reader.
     */
    public XMLTokener(Reader r) {
        super(r);
    }

    /**
     * Construct an XMLTokener from a string.
     * @param s A source string.
     */
    public XMLTokener(String s) {
        super(s);
    }

    /**
     * Construct an XMLTokener from a Reader and an XMLParserConfiguration.
     * @param r A source reader.
     * @param configuration the configuration that can be used to set certain flags
     */
    public XMLTokener(Reader r, XMLParserConfiguration configuration) {
        super(r);
        this.configuration = configuration;
    }

    /**
     * Get the text in the CDATA block.
     * @return The string up to the <code>]]&gt;</code>.
     * @throws JSONException If the <code>]]&gt;</code> is not found.
     */
    public String nextCDATA() throws JSONException {
        char         c;
        int          i;
        StringBuilder sb = new StringBuilder();
        while (more()) {
            c = next();
            sb.append(c);
            i = sb.length() - 3;
            if (i >= 0 && sb.charAt(i) == ']' &&
                          sb.charAt(i + 1) == ']' && sb.charAt(i + 2) == '>') {
                sb.setLength(i);
                return sb.toString();
            }
        }
        throw syntaxError("Unclosed CDATA");
    }


    /**
     * Get the next XML outer token, trimming whitespace. There are two kinds
     * of tokens: the <pre>{@code '<' }</pre> character which begins a markup
     * tag, and the content
     * text between markup tags.
     *
     * @return  A string, or a <pre>{@code '<' }</pre> Character, or null if
     * there is no more source text.
     * @throws JSONException if a called function has an error
     */
    public Object nextContent() throws JSONException {
        char         c;
        StringBuilder sb;
        do {
            c = next();
        } while (Character.isWhitespace(c) && configuration.shouldTrimWhiteSpace());
        if (c == 0) {
            return null;
        }
        if (c == '<') {
            return XML.LT;
        }
        sb = new StringBuilder();
        for (;;) {
            if (c == 0) {
                return sb.toString().trim();
            }
            if (c == '<') {
                back();
                if (configuration.shouldTrimWhiteSpace()) {
                    return sb.toString().trim();
                } else return sb.toString();
            }
            if (c == '&') {
                sb.append(nextEntity(c));
            } else {
                sb.append(c);
            }
            c = next();
        }
    }


    /**
     * <pre>{@code
     * Return the next entity. These entities are translated to Characters:
     *     &amp;  &apos;  &gt;  &lt;  &quot;.
     * }</pre>
     * @param ampersand An ampersand character.
     * @return  A Character or an entity String if the entity is not recognized.
     * @throws JSONException If missing ';' in XML entity.
     */
    public Object nextEntity(@SuppressWarnings("unused") char ampersand) throws JSONException {
        StringBuilder sb = new StringBuilder();
        for (;;) {
            char c = next();
            if (Character.isLetterOrDigit(c) || c == '#') {
                sb.append(Character.toLowerCase(c));
            } else if (c == ';') {
                break;
            } else {
                throw syntaxError("Missing ';' in XML entity: &" + sb);
            }
        }
        String string = sb.toString();
        return unescapeEntity(string);
    }
    
    /**
     * Unescape an XML entity encoding;
     * @param e entity (only the actual entity value, not the preceding & or ending ;
     * @return the unescaped entity string
     * @throws JSONException if the entity is malformed
     */
    static String unescapeEntity(String e) throws JSONException {
        // validate
        if (e == null || e.isEmpty()) {
            return "";
        }
        // if our entity is an encoded unicode point, parse it.
        if (e.charAt(0) == '#') {
            if (e.length() < 2) {
                throw new JSONException("Invalid numeric character reference: &#;");
            }
            int cp = (e.charAt(1) == 'x' || e.charAt(1) == 'X')
                ? parseHexEntity(e)
                : parseDecimalEntity(e);
            return new String(new int[] {cp}, 0, 1);
        }
        Character knownEntity = entity.get(e);
        if (knownEntity == null) {
            // we don't know the entity so keep it encoded
            return '&' + e + ';';
        }
        return knownEntity.toString();
    }

    /**
     * Parse a hexadecimal numeric character reference (e.g., "&#xABC;").
     * @param e entity string starting with '#' (e.g., "#x1F4A9")
     * @return the Unicode code point
     * @throws JSONException if the format is invalid
     */
    private static int parseHexEntity(String e) throws JSONException {
        // hex encoded unicode - need at least one hex digit after #x
        if (e.length() < 3) {
            throw new JSONException("Invalid hex character reference: missing hex digits in &#" + e.substring(1) + ";");
        }
        String hex = e.substring(2);
        if (!isValidHex(hex)) {
            throw new JSONException("Invalid hex character reference: &#" + e.substring(1) + ";");
        }
        try {
            return Integer.parseInt(hex, 16);
        } catch (NumberFormatException nfe) {
            throw new JSONException("Invalid hex character reference: &#" + e.substring(1) + ";", nfe);
        }
    }

    /**
     * Parse a decimal numeric character reference (e.g., "&#123;").
     * @param e entity string starting with '#' (e.g., "#123")
     * @return the Unicode code point
     * @throws JSONException if the format is invalid
     */
    private static int parseDecimalEntity(String e) throws JSONException {
        String decimal = e.substring(1);
        if (!isValidDecimal(decimal)) {
            throw new JSONException("Invalid decimal character reference: &#" + decimal + ";");
        }
        try {
            return Integer.parseInt(decimal);
        } catch (NumberFormatException nfe) {
            throw new JSONException("Invalid decimal character reference: &#" + decimal + ";", nfe);
        }
    }

    /**
     * Check if a string contains only valid hexadecimal digits.
     * @param s the string to check
     * @return true if s is non-empty and contains only hex digits (0-9, a-f, A-F)
     */
    private static boolean isValidHex(String s) {
        if (s == null || s.isEmpty()) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if a string contains only valid decimal digits.
     * @param s the string to check
     * @return true if s is non-empty and contains only digits (0-9)
     */
    private static boolean isValidDecimal(String s) {
        if (s == null || s.isEmpty()) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }


    /**
     * <pre>{@code 
     * Returns the next XML meta token. This is used for skipping over <!...>
     * and <?...?> structures.
     *  }</pre>
     * @return <pre>{@code Syntax characters (< > / = ! ?) are returned as
     *  Character, and strings and names are returned as Boolean. We don't care
     *  what the values actually are.
     *  }</pre>
     * @throws JSONException If a string is not properly closed or if the XML
     *  is badly structured.
     */
    public Object nextMeta() throws JSONException {
        char c;
        char q;
        do {
            c = next();
        } while (Character.isWhitespace(c));
        switch (c) {
        case 0:
            throw syntaxError("Misshaped meta tag");
        case '<':
            return XML.LT;
        case '>':
            return XML.GT;
        case '/':
            return XML.SLASH;
        case '=':
            return XML.EQ;
        case '!':
            return XML.BANG;
        case '?':
            return XML.QUEST;
        case '"':
        case '\'':
            q = c;
            for (;;) {
                c = next();
                if (c == 0) {
                    throw syntaxError("Unterminated string");
                }
                if (c == q) {
                    return Boolean.TRUE;
                }
            }
        default:
            for (;;) {
                c = next();
                if (Character.isWhitespace(c)) {
                    return Boolean.TRUE;
                }
                switch (c) {
                case 0:
                    throw syntaxError("Unterminated string");
                case '<':
                case '>':
                case '/':
                case '=':
                case '!':
                case '?':
                case '"':
                case '\'':
                    back();
                    return Boolean.TRUE;
                }
            }
        }
    }


    /**
     * <pre>{@code
     * Get the next XML Token. These tokens are found inside of angle
     * brackets. It may be one of these characters: / > = ! ? or it
     * may be a string wrapped in single quotes or double quotes, or it may be a
     * name.
     * }</pre>
     * @return a String or a Character.
     * @throws JSONException If the XML is not well formed.
     */
    public Object nextToken() throws JSONException {
        char c;
        char q;
        StringBuilder sb;
        do {
            c = next();
        } while (Character.isWhitespace(c));
        switch (c) {
        case 0:
            throw syntaxError("Misshaped element");
        case '<':
            throw syntaxError("Misplaced '<'");
        case '>':
            return XML.GT;
        case '/':
            return XML.SLASH;
        case '=':
            return XML.EQ;
        case '!':
            return XML.BANG;
        case '?':
            return XML.QUEST;

// Quoted string

        case '"':
        case '\'':
            q = c;
            sb = new StringBuilder();
            for (;;) {
                c = next();
                if (c == 0) {
                    throw syntaxError("Unterminated string");
                }
                if (c == q) {
                    return sb.toString();
                }
                if (c == '&') {
                    sb.append(nextEntity(c));
                } else {
                    sb.append(c);
                }
            }
        default:

// Name

            sb = new StringBuilder();
            for (;;) {
                sb.append(c);
                c = next();
                if (Character.isWhitespace(c)) {
                    return sb.toString();
                }
                switch (c) {
                case 0:
                    return sb.toString();
                case '>':
                case '/':
                case '=':
                case '!':
                case '?':
                case '[':
                case ']':
                    back();
                    return sb.toString();
                case '<':
                case '"':
                case '\'':
                    throw syntaxError("Bad character in a name");
                }
            }
        }
    }


    /**
     * Skip characters until past the requested string.
     * If it is not found, we are left at the end of the source with a result of false.
     * @param to A string to skip past.
     */
    // The Android implementation of JSONTokener has a public method of public void skipPast(String to)
    // even though ours does not have that method, to have API compatibility, our method in the subclass
    // should match.
    public void skipPast(String to) {
        boolean b;
        char c;
        int i;
        int j;
        int offset = 0;
        int length = to.length();
        char[] circle = new char[length];

        /*
         * First fill the circle buffer with as many characters as are in the
         * to string. If we reach an early end, bail.
         */

        for (i = 0; i < length; i += 1) {
            c = next();
            if (c == 0) {
                return;
            }
            circle[i] = c;
        }

        /* We will loop, possibly for all of the remaining characters. */

        for (;;) {
            j = offset;
            b = true;

            /* Compare the circle buffer with the to string. */

            for (i = 0; i < length; i += 1) {
                if (circle[j] != to.charAt(i)) {
                    b = false;
                    break;
                }
                j += 1;
                if (j >= length) {
                    j -= length;
                }
            }

            /* If we exit the loop with b intact, then victory is ours. */

            if (b) {
                return;
            }

            /* Get the next character. If there isn't one, then defeat is ours. */

            c = next();
            if (c == 0) {
                return;
            }
            /*
             * Shove the character in the circle buffer and advance the
             * circle offset. The offset is mod n.
             */
            circle[offset] = c;
            offset += 1;
            if (offset >= length) {
                offset -= length;
            }
        }
    }
}