UTF8SurrogateValidation363Test.java

package tools.jackson.core.unittest.read;

import org.junit.jupiter.api.Test;

import tools.jackson.core.*;
import tools.jackson.core.exc.StreamReadException;
import tools.jackson.core.json.JsonFactory;
import tools.jackson.core.unittest.JacksonCoreTestBase;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

/**
 * Tests for [jackson-core#363]: UTF-8 parser should reject 3-byte UTF-8 sequences
 * that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8.
 */
class UTF8SurrogateValidation363Test
    extends JacksonCoreTestBase
{
    private final JsonFactory FACTORY = newStreamFactory();

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range).
     * In UTF-8, U+D800 would be encoded as: ED A0 80
     */
    @Test
    void rejectSurrogateD800InString() throws Exception
    {
        // JSON: {"value":"X"}
        // where X is the invalid 3-byte sequence ED A0 80 (U+D800)
        byte[] doc = new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
            '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            assertEquals("value", p.currentName());

            // This should fail when trying to read the string value
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            p.getString(); // Actual parsing happens here  (lazy parsing)
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, "Invalid UTF-8");
        }
    }

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range).
     * In UTF-8, U+DFFF would be encoded as: ED BF BF
     */
    @Test
    void rejectSurrogateDFFFInString() throws Exception
    {
        // JSON: {"value":"X"}
        // where X is the invalid 3-byte sequence ED BF BF (U+DFFF)
        byte[] doc = new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) 0xED, (byte) 0xBF, (byte) 0xBF, // Invalid: U+DFFF surrogate
            '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            assertEquals("value", p.currentName());

            // This should fail when trying to read the string value
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            p.getString(); // Actual parsing happens here  (lazy parsing)
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, "Invalid UTF-8");
        }
    }

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range).
     * In UTF-8, U+DABC would be encoded as: ED AA BC
     */
    @Test
    void rejectSurrogateMiddleInString() throws Exception
    {
        // JSON: {"value":"X"}
        // where X is the invalid 3-byte sequence ED AA BC (U+DABC)
        byte[] doc = new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) 0xED, (byte) 0xAA, (byte) 0xBC, // Invalid: U+DABC surrogate
            '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            assertEquals("value", p.currentName());

            // This should fail when trying to read the string value
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            p.getString(); // Actual parsing happens here  (lazy parsing)
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, "Invalid UTF-8");
        }
    }

    /**
     * Test that parser rejects surrogate in field name as well.
     */
    @Test
    void rejectSurrogateInFieldName() throws Exception
    {
        // JSON: {"X":"value"}
        // where X is the invalid 3-byte sequence ED A0 80 (U+D800)
        byte[] doc = new byte[] {
            '{', '"',
            (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
            '"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());

            // This should fail when trying to read the field name
            // (no lazy parsing for names)
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, "Invalid UTF-8");
        }
    }

    /**
     * Sanity check: valid 3-byte UTF-8 sequences just before surrogate range should work.
     * U+D7FF is the last valid code point before the surrogate range.
     * In UTF-8: ED 9F BF
     */
    @Test
    void acceptValidBeforeSurrogateRange() throws Exception
    {
        // JSON: {"value":"X"}
        // where X is the valid 3-byte sequence ED 9F BF (U+D7FF)
        byte[] doc = new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) 0xED, (byte) 0x9F, (byte) 0xBF, // Valid: U+D7FF (just before surrogates)
            '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            assertEquals("value", p.currentName());
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            assertEquals("\uD7FF", p.getString());
            assertToken(JsonToken.END_OBJECT, p.nextToken());
        }
    }

    /**
     * Sanity check: valid 3-byte UTF-8 sequences just after surrogate range should work.
     * U+E000 is the first valid code point after the surrogate range.
     * In UTF-8: EE 80 80
     */
    @Test
    void acceptValidAfterSurrogateRange() throws Exception
    {
        // JSON: {"value":"X"}
        // where X is the valid 3-byte sequence EE 80 80 (U+E000)
        byte[] doc = new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) 0xEE, (byte) 0x80, (byte) 0x80, // Valid: U+E000 (just after surrogates)
            '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
            assertEquals("value", p.currentName());
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            assertEquals("\uE000", p.getString());
            assertToken(JsonToken.END_OBJECT, p.nextToken());
        }
    }
}