// TestCDataRead.java

package org.codehaus.stax.test.stream;

import java.io.*;

import javax.xml.stream.*;

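/**
 * Unit test suite that tests how the stream reader handles CDATA sections:
 * that it does in fact coalesce adjacent text/CDATA segments when told to
 * do so, that the same content can be read back segment by segment when it
 * is not, that malformed, unterminated or nested CDATA sections are
 * reported as errors, and that long CDATA sections are not truncated.
 */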
public class TestCDataRead
    extends BaseStreamTest
{
    /** Content of the first CDATA section of {@link #VALID_XML}. */
    static final String CDATA1;

    /** Content of the second CDATA section of {@link #VALID_XML}. */
    static final String CDATA2;

    static {
        final StringBuilder first = new StringBuilder(8000).append("...");
        final StringBuilder second = new StringBuilder(8000).append("\n \n\n ");

        /* Let's add enough stuff to probably cause segmentation...
         * Both payloads get the same 200 generated chunks; only the
         * separator differs (trailing spaces vs. leading linefeed).
         */
        int round = 0;
        while (round < 200) {
            final String chunk = "Round #"+round+"; & that's fun: &x"+round+"; <> %xx; &ent  <<< %%% <![CDATA ]]  > ]> ";
            first.append(chunk).append("  ");
            second.append("\n").append(chunk);
            ++round;
        }

        CDATA1 = first.toString();
        CDATA2 = second.toString();
    }

    /** Content of the last CDATA section of {@link #VALID_XML}. */
    static final String CDATA3 = " ]] ";

    /**
     * Text expected from reading all CDATA sections of {@link #VALID_XML}
     * back to back, in document order.
     */
    static final String EXP_CDATA = CDATA1 + CDATA2 + CDATA3;

    /**
     * Document with four adjacent CDATA sections inside the root element;
     * the third one is empty.
     */
    static final String VALID_XML = "<root>"
        + "<![CDATA[" + CDATA1 + "]]>"
        + "<![CDATA[" + CDATA2 + "]]>"
        + "<![CDATA[]]>"
        + "<![CDATA[" + CDATA3 + "]]>"
        + "</root>";

    /**
     * Checks that markup characters inside a CDATA section need no
     * quoting (and that none can be applied): they are expected to come
     * back verbatim as the text of the section.
     */
    public void testCDataSimple()
        throws XMLStreamException
    {
        final XMLStreamReader reader = getReader("<doc><![CDATA[<&]>]]]></doc>", true);

        assertTokenType(START_ELEMENT, reader.next());
        // In coalescing mode, all CDATA are reported as CHARACTERS
        assertTokenType(CHARACTERS, reader.next());
        // The section is closed by the last "]]>", so one ']' stays in the text
        final String expected = "<&]>]";
        assertEquals(expected, getAndVerifyText(reader));
        assertTokenType(END_ELEMENT, reader.next());
    }

    /**
     * Reads {@code VALID_XML} with a coalescing reader and expects a single
     * CHARACTERS event between the root start and end tags, carrying
     * exactly {@code EXP_CDATA}.
     */
    public void testCDataCoalescing()
        throws XMLStreamException
    {
        final XMLStreamReader reader = getReader(VALID_XML, true);

        final int rootStart = reader.next();
        assertTokenType(START_ELEMENT, rootStart);

        // In coalescing mode, all CDATA are reported as CHARACTERS
        final int textEvent = reader.next();
        assertTokenType(CHARACTERS, textEvent);
        assertEquals(EXP_CDATA, getAndVerifyText(reader));

        final int rootEnd = reader.next();
        assertTokenType(END_ELEMENT, rootEnd);
    }

    /**
     * Reads {@code VALID_XML} with a non-coalescing reader and checks that
     * the text of the consecutive CDATA/CHARACTERS events, concatenated in
     * order, equals {@code EXP_CDATA}, and that the event ending the run
     * is END_ELEMENT.
     */
    public void testCDataNonCoalescing()
        throws XMLStreamException
    {
        final XMLStreamReader reader = getReader(VALID_XML, false);
        assertTokenType(START_ELEMENT, reader.next());

        int event = reader.next();
        /* 07-Dec-2004, TSa: StAX specs actually allow returning
         *   CHARACTERS too... so CDATA is only insisted on when the
         *   first event is something else.
         */
        if (event != CHARACTERS) {
            final String msg = "Unexpected token type ("
                +tokenTypeDesc(event)
                +") returned; expected CDATA or CHARACTERS";
            assertEquals(msg, CDATA, event);
        }

        // Gather text until the first event that is neither CDATA nor CHARACTERS
        final StringBuilder collected = new StringBuilder(16000);
        for (;;) {
            collected.append(getAndVerifyText(reader));
            event = reader.next();
            if (event != CDATA && event != CHARACTERS) {
                break;
            }
        }

        assertEquals(EXP_CDATA, collected.toString());
        assertTokenType(END_ELEMENT, reader.getEventType());
    }

    /**
     * Runs a set of broken CDATA declarations through
     * {@code streamThroughFailing}, first with a non-coalescing and then
     * with a coalescing reader. Each entry pairs the document with the
     * description handed to that helper.
     */
    public void testInvalidCData()
        throws XMLStreamException
    {
        final String unfinished = "unfinished CDATA section";
        final String malformed = "malformed CDATA section";

        // { document, description }
        final String[][] cases = {
            { "<root><![CDATA[   </root>", unfinished },
            { "<root><![CDATA  [text]]>   </root>", malformed },
            { "<root><!  [ CDATA  [text]]>   </root>", malformed },
            { "<root><![CDATA[text   ]] >   </root>", malformed },
        };

        for (int i = 0; i < cases.length; ++i) {
            final String xml = cases[i][0];
            final String msg = cases[i][1];
            streamThroughFailing(getReader(xml, false), msg);
            streamThroughFailing(getReader(xml, true), msg);
        }
    }

    /**
     * This unit test verifies that nested CData sections cause
     * an error. It is related to another test, which just checks
     * that ]]> (with no quoting) is illegal, but parsers may deal
     * with them differently.
     *<p>
     * The document is read once without and once with coalescing. Either
     * an {@link XMLStreamException}, or a {@link RuntimeException} that
     * has one somewhere in its cause chain, counts as the expected error.
     *<p>
     * Note: this is directly based on XMLTest/SAXTest #735.
     */
    public void testInvalidNestedCData()
        throws XMLStreamException
    {
        final String XML = "<doc>\n<![CDATA[\n"
            +"<![CDATA[XML doesn't allow CDATA sections to nest]]>\n"
            +"\n]]>\n</doc>";
        final boolean[] modes = { false, true };

        for (int i = 0; i < modes.length; ++i) {
            final boolean coal = modes[i];
            final XMLStreamReader reader = getReader(XML, coal);
            assertTokenType(START_ELEMENT, reader.next());

            // Ok, now should get an exception...
            final StringBuilder text = new StringBuilder();
            int event = -1;
            boolean rejected = false;
            try {
                event = reader.next();
                while (event == CDATA || event == CHARACTERS) {
                    text.append(getAndVerifyText(reader));
                    event = reader.next();
                }
            } catch (XMLStreamException expected) {
                // good
                rejected = true;
            } catch (RuntimeException rex) {
                /* Hmmh. Some implementations may throw a runtime exception,
                 * if things are lazily parsed (for example, Woodstox).
                 * But let's allow this only if a nested exception is
                 * of proper type
                 */
                Throwable cause = rex;
                while (cause != null && !(cause instanceof XMLStreamException)) {
                    cause = cause.getCause();
                }
                if (cause == null) {
                    fail("Expected an XMLStreamException for nested CDATA section (coalescing: "+coal+"); instead got exception ("+rex.getClass()+"): "+rex.getMessage());
                }
                rejected = true;
            }

            if (!rejected) {
                fail("Expected an exception for nested CDATA section (coalescing: "+coal+"); instead got text \""+text.toString()+"\" (next event "+tokenTypeDesc(event)+")");
            }
        }
    }

    /**
     * Regression test for [WSTX-294]: Incorrect coalescing in some cases.
     *<p>
     * Reads the classpath resource {@code issue294.xml} (looked up relative
     * to this class) through a coalescing reader and walks the
     * Envelope/Body/helloResponse/return structure, checking that the text
     * of the 'return' element starts with "abcde".
     *<p>
     * The input stream is closed even when an assertion fails, and a
     * missing resource is reported as a test failure rather than as a
     * NullPointerException.
     *
     * @throws Exception if the resource cannot be read or parsed
     */
    public void testIssue294() throws Exception
    {
        XMLInputFactory f = getInputFactory();
        setCoalescing(f, true);

        InputStream in = getClass().getResourceAsStream("issue294.xml");
        if (in == null) {
            fail("Test resource 'issue294.xml' not found relative to class "+getClass().getName());
        }

        try {
            // Important: only occurs when we construct a Reader -- not with InputStream
            // (different offsets, perhaps?)
            XMLStreamReader sr = f.createXMLStreamReader(new InputStreamReader(in, "UTF-8"));

            assertTokenType(START_ELEMENT, sr.next());
            assertEquals("Envelope", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next()); // white space
            assertTokenType(START_ELEMENT, sr.next());
            assertEquals("Body", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next()); // white space
            assertTokenType(START_ELEMENT, sr.next());
            assertEquals("helloResponse", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next()); // white space
            assertTokenType(START_ELEMENT, sr.next());
            assertEquals("return", sr.getLocalName());

            assertTokenType(CHARACTERS, sr.next());

            String text = getAndVerifyText(sr);

            // Should start with "abcde"
            if (!text.startsWith("abcde")) {
                // Only show the beginning of the text in the failure message
                if (text.length() > 5) {
                    text = text.substring(0, 5);
                }
                fail("Expected cdata in 'return' element to start with 'abcde': instead got: '"+text+"'");
            }

            assertTokenType(END_ELEMENT, sr.next());
            assertEquals("return", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next());
            assertTokenType(END_ELEMENT, sr.next());
            assertEquals("helloResponse", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next());
            assertTokenType(END_ELEMENT, sr.next());
            assertEquals("Body", sr.getLocalName());
            assertTokenType(CHARACTERS, sr.next());
            assertTokenType(END_ELEMENT, sr.next());
            assertEquals("Envelope", sr.getLocalName());

            sr.close();
        } finally {
            // XMLStreamReader.close() does not close the underlying source,
            // so the stream has to be released here on every path.
            in.close();
        }
    }

    /**
     * Regression test for [woodstox-core#21]: CDATA contents truncated to
     * buffer size (500 initially).
     *<p>
     * Reads one long CDATA section with a non-coalescing reader and expects
     * its whole text, with "\r\n" replaced by "\n", from the first CDATA
     * event.
     */
    public void testLongerCData2() throws Exception
    {
        // "1234567890" repeated, cut off after the final '8'
        final String digits =
              "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "123456789012345678901234567890123456789012345678";
        final String source =
            "\r\n" + digits + "\r\n"
            + "<embededElement>Woodstox 4.0.5 does not like this embedded element.  However, if you take\r\n"
            + "out one or more characters from the really long line (so that less than 500 characters come between\r\n"
            + "'CDATA[' and the opening of the embeddedElement tag (including LF), then Woodstox will instead\r\n"
            + "complain that the CDATA section wasn't ended.";
        final String expected = source.replace("\r\n", "\n");
        final String doc = "<?xml version='1.0' encoding='utf-8'?>\r\n"
            + "<test><![CDATA[" + source + "]]></test>";

        final XMLInputFactory factory = getInputFactory();
        // important: don't force coalescing, that'll convert CDATA to CHARACTERS
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.FALSE);

        final XMLStreamReader reader = factory.createXMLStreamReader(new StringReader(doc));
        assertTokenType(START_DOCUMENT, reader.getEventType());
        assertTokenType(START_ELEMENT, reader.next());
        assertEquals("test", reader.getLocalName());
        assertTokenType(CDATA, reader.next());

        // This should still work, although with linefeed replacements
        final String actual = reader.getText();
        final int expectedLength = expected.length();
        if (actual.length() != expectedLength) {
            fail("Length expected as "+expectedLength+", was "+actual.length());
        } else if (!actual.equals(expected)) {
            fail("Length as expected ("+expectedLength+"), contents differ:\n"+actual);
        }
        assertTokenType(END_ELEMENT, reader.next());
        reader.close();
    }

    /**
     * Regression test for [woodstox-core#22]: and some CDATA contents
     * truncation via different codepath.
     *<p>
     * Reads one long CDATA section with a non-coalescing reader. The
     * section may be reported as several consecutive CDATA events; their
     * text is concatenated and must equal the source text with "\r\n"
     * replaced by "\n". The event ending the run of CDATA events must be
     * END_ELEMENT.
     *
     * @throws Exception if the document cannot be parsed
     */
    public void testLongerCData3() throws Exception {
        // "1234567890" repeated, cut off after the final '8'
        final String DIGITS =
              "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "12345678901234567890123456789012345678901234567890"
            + "123456789012345678901234567890123456789012345678";
        String SRC_TEXT =
            DIGITS + "\r\n"
                + "<embededElement>Woodstox 4.0.5 does not like this embedded element.  However, if you take\r\n"
                + "out one or more characters from the really long line (so that less than 500 characters come between\r\n"
                + "'CDATA[' and the opening of the embeddedElement tag (including LF), then Woodstox will instead\r\n"
                + "complain that the CDATA section wasn't ended.";
        String DST_TEXT = SRC_TEXT.replace("\r\n", "\n");
        String XML = "<?xml version='1.0' encoding='utf-8'?>\r\n"
            + "<test><![CDATA[" + SRC_TEXT + "]]></test>";
        XMLInputFactory f = getInputFactory();
        // important: don't force coalescing, that'll convert CDATA to CHARACTERS
        f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.FALSE);

        XMLStreamReader sr = f.createXMLStreamReader(new StringReader(XML));
        assertTokenType(START_DOCUMENT, sr.getEventType());
        assertTokenType(START_ELEMENT, sr.next());
        assertEquals("test", sr.getLocalName());
        assertTokenType(CDATA, sr.next());

        // Gather every consecutive CDATA segment; 'type' ends up holding the
        // first event that is not CDATA, so it can be checked afterwards
        // without advancing the reader a second time.
        StringBuilder sb = new StringBuilder(DST_TEXT.length());
        int type;
        do {
            sb.append(sr.getText());
            type = sr.next();
        } while (type == CDATA);

        // This should still work, although with linefeed replacements
        final String text = sb.toString();
        if (text.length() != DST_TEXT.length()) {
            fail("Length expected as " + DST_TEXT.length() + ", was " + text.length());
        }
        if (!text.equals(DST_TEXT)) {
            fail("Length as expected (" + DST_TEXT.length() + "), contents differ:\n" + text);
        }
        assertTokenType(END_ELEMENT, type);
        sr.close();
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // Private methods, other
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * Creates a stream reader over the given document text, using a
     * factory obtained from {@code getInputFactory()} and configured
     * through the inherited setters: coalescing as requested, entity
     * replacement on, validation off.
     *
     * @param contents XML document to read
     * @param coalescing value passed to {@code setCoalescing}
     * @return reader built by {@code constructStreamReader}
     * @throws XMLStreamException if the reader cannot be constructed
     */
    private XMLStreamReader getReader(String contents, boolean coalescing)
        throws XMLStreamException
    {
        final XMLInputFactory factory = getInputFactory();

        setCoalescing(factory, coalescing);
        setReplaceEntities(factory, true);
        setValidating(factory, false);

        final XMLStreamReader reader = constructStreamReader(factory, contents);
        return reader;
    }

}