// TestUTF8Reader.java

package wstxtest.io;

import java.io.*;
import java.util.Arrays;

import org.junit.jupiter.api.Test;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.io.UTF8Reader;

/**
 * Unit test created to verify fix to
 * <a href="http://jira.codehaus.org/browse/WSTX-143">WSTX-143</a>
 * and
 * <a href="https://github.com/FasterXML/woodstox/pull/291/">woodstox#291</a>.
 *<p>
 * Covers two behaviors of {@link UTF8Reader}: reading input where a DEL
 * character (0x7F) is the last byte of the reader's byte buffer, and
 * rejection of overlong UTF-8 encodings while the shortest forms still decode.
 *
 * @author Matt Gormley
 */
public class TestUTF8Reader extends wstxtest.BaseJUnit4Test
{
    /**
     * Reads input in which a DEL byte sits at the last index of the byte
     * buffer handed to the reader. The test passes if the read completes
     * without throwing.
     */
    @SuppressWarnings("resource")
    @Test
    public void testDelAtBufferBoundary() throws Exception
    {
        final int BYTE_BUFFER_SIZE = 4;
        final int CHAR_BUFFER_SIZE = 1 + BYTE_BUFFER_SIZE;
        final int INPUT_SIZE = 4 * BYTE_BUFFER_SIZE; // could be of arbitrary size
        final byte CHAR_FILLER = 32; // doesn't even matter, just need an ascii char
        final byte CHAR_DEL = 127;

        // Create input that will cause the array index out of bounds exception:
        // ASCII filler everywhere, except DEL at the final slot of the first
        // byte-buffer-sized chunk
        byte[] inputBytes = new byte[INPUT_SIZE];
        Arrays.fill(inputBytes, CHAR_FILLER);
        inputBytes[BYTE_BUFFER_SIZE - 1] = CHAR_DEL;
        InputStream in = new ByteArrayInputStream(inputBytes);

        // Create the UTF8Reader
        ReaderConfig cfg = ReaderConfig.createFullDefaults();
        byte[] byteBuffer = new byte[BYTE_BUFFER_SIZE];
        UTF8Reader reader = new UTF8Reader(cfg, in, byteBuffer, 0, 0, false);

        // Run the reader on the input; the char buffer is one longer than the
        // byte buffer
        char[] charBuffer = new char[CHAR_BUFFER_SIZE];
        reader.read(charBuffer, 0, charBuffer.length);
    }

    /**
     * Verifies that overlong 2-, 3- and 4-byte sequences are rejected, and
     * that valid shortest-form sequences of 1 to 4 bytes decode to the
     * expected characters.
     */
    @Test
    public void testOverlongEncodingsRejected() throws Exception
    {
        // Overlong forms decode to a codepoint below the minimum for their
        // byte length; these must be rejected as malformed (RFC 3629)
        assertRejected(new byte[]{(byte)0xC0,(byte)0xBC}); // overlong '<'
        assertRejected(new byte[]{(byte)0xC0,(byte)0x80}); // overlong NUL
        assertRejected(new byte[]{(byte)0xE0,(byte)0x80,(byte)0xAF}); // overlong '/'
        assertRejected(new byte[]{(byte)0xF0,(byte)0x80,(byte)0x81,(byte)0x81}); // overlong 4-byte

        // Shortest (valid) forms must still decode. Expected non-ASCII values
        // are written as escapes so they do not depend on the encoding used
        // to store or compile this source file.
        assertEquals("<", decode(new byte[]{(byte)0x3C}));
        // C3 A9 is the 2-byte form of U+00E9 (e with acute accent)
        assertEquals("\u00E9", decode(new byte[]{(byte)0xC3,(byte)0xA9}));
        // E2 82 AC is the 3-byte form of U+20AC (euro sign)
        assertEquals("\u20AC", decode(new byte[]{(byte)0xE2,(byte)0x82,(byte)0xAC}));
        // F0 9F 98 80 is the 4-byte form of U+1F600 (a surrogate pair in UTF-16)
        assertEquals(new String(Character.toChars(0x1F600)),
                decode(new byte[]{(byte)0xF0,(byte)0x9F,(byte)0x98,(byte)0x80}));
    }

    /**
     * Asserts that decoding {@code input} fails with a
     * {@link CharConversionException}.
     *
     * @param input bytes expected to be refused by the reader
     * @throws Exception any other failure raised while decoding
     */
    private static void assertRejected(byte[] input) throws Exception
    {
        try {
            decode(input);
            fail("Expected CharConversionException for overlong UTF-8 sequence");
        } catch (CharConversionException e) {
            // expected
        }
    }

    /**
     * Decodes {@code input} through a {@link UTF8Reader} built with default
     * configuration, reading until the reader stops producing characters.
     *
     * @param input bytes to decode
     * @return all characters produced; empty if the reader yields none
     * @throws Exception whatever the reader throws, for example on malformed input
     */
    @SuppressWarnings("resource")
    private static String decode(byte[] input) throws Exception
    {
        ReaderConfig cfg = ReaderConfig.createFullDefaults();
        UTF8Reader reader = new UTF8Reader(cfg, new ByteArrayInputStream(input),
                new byte[16], 0, 0, false);
        StringBuilder decoded = new StringBuilder();
        char[] cbuf = new char[16];
        int count;
        // Stop on end-of-input (-1); also stop on 0 so a reader that makes no
        // progress cannot spin this loop forever
        while ((count = reader.read(cbuf, 0, cbuf.length)) > 0) {
            decoded.append(cbuf, 0, count);
        }
        return decoded.toString();
    }
}