// TestEncodingRead.java
package org.codehaus.stax.test.stream;
import java.io.*;
import javax.xml.stream.*;
/**
 * Unit test suite that tests handling of text encoding, as specified
 * by XML declaration and/or specific byte-order markers.
 */
public class TestEncodingRead
    extends BaseStreamTest
{
    /** Character that needs a single byte in UTF-8: 'A'. */
    final String UTF_1 = String.valueOf((char) 0x41);

    /** Character that needs two bytes in UTF-8: non-breaking space. */
    final String UTF_2 = String.valueOf((char) 0xA0);

    /** Some character that needs three bytes in UTF-8. */
    final String UTF_3 = String.valueOf((char) 0xB61);

    /**
     * Text content used by the round-trip tests; mixes the 1-, 2- and
     * 3-byte characters in various orders so that every transition
     * between encoded lengths is exercised.
     */
    final String UTF_CONTENT = ""
        +UTF_1 + UTF_2 + UTF_3
        +UTF_1 + UTF_1 + UTF_2 + UTF_2 + UTF_3 + UTF_3
        +UTF_3 + UTF_3 + UTF_2 + UTF_2 + UTF_1 + UTF_1
        +UTF_1 + UTF_3 + UTF_2
        +UTF_2 + UTF_1 + UTF_3
        +UTF_2 + UTF_3 + UTF_1
        +UTF_3 + UTF_1 + UTF_2
        +UTF_3 + UTF_2 + UTF_1
        ;

    /** Byte-order marker for big-endian UTF-16. */
    final static byte[] BE_BOM = new byte[] { (byte) 0xFE, (byte) 0xFF };

    /** Byte-order marker for little-endian UTF-16. */
    final static byte[] LE_BOM = new byte[] { (byte) 0xFF, (byte) 0xFE };

    /** Byte-order marker for UTF-8. */
    final static byte[] UTF8_BOM = new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };

    /**
     * Test to check that UTF-8 stream with no leading BOM is successfully
     * handled by parser.
     */
    public void testUTF8()
        throws Exception
    {
        doTestEncoding("UTF-8", "UTF-8", null);
        doTestEncoding("UTF-8", null, null);
    }

    /**
     * Test to check that UTF-8 stream with leading BOM is successfully
     * handled by parser.
     */
    public void testUTF8WithBOM()
        throws Exception
    {
        doTestEncoding("UTF-8", "UTF-8", UTF8_BOM);
        doTestEncoding("UTF-8", null, UTF8_BOM);
    }

    /**
     * Test to check that a 4-byte UTF-8 sequence (a character outside
     * the Basic Multilingual Plane) is decoded into a surrogate pair.
     */
    public void testUTF8Surrogates()
        throws XMLStreamException, IOException
    {
        String XML = "<?xml version='1.0' encoding='UTF-8'?><root>XXXX</root>";
        // Document is pure ASCII at this point, so char index == byte index
        int ix = XML.indexOf('X');
        byte[] src = XML.getBytes("UTF-8");

        // A somewhat random high-order Unicode char (U+50208), written
        // over the four 'X' placeholders:
        src[ix] = (byte)0xF1;
        src[ix+1] = (byte)0x90;
        src[ix+2] = (byte)0x88;
        src[ix+3] = (byte)0x88;

        InputStream in = new ByteArrayInputStream(src);
        XMLInputFactory f = getInputFactory();
        XMLStreamReader sr = f.createXMLStreamReader(in);

        assertTokenType(START_ELEMENT, sr.next());
        assertEquals("root", sr.getLocalName());
        assertTokenType(CHARACTERS, sr.next());
        String str = getAndVerifyText(sr);
        // Should result in a surrogate pair...
        assertEquals(2, str.length());
        assertEquals((char) 0xd900, str.charAt(0));
        assertEquals((char) 0xde08, str.charAt(1));
        assertTokenType(END_ELEMENT, sr.next());
        // Release reader resources, as the other shared test code does
        sr.close();
    }

    /**
     * Test to check that big-endian UTF-16 content with leading BOM
     * is successfully handled by parser.
     */
    public void testUTF16BEWithBOM()
        throws XMLStreamException,
               UnsupportedEncodingException
    {
        doTestEncoding("UTF-16BE", "UTF-16", BE_BOM);
        doTestEncoding("UTF-16BE", null, BE_BOM);

        doTestEncoding2(true);
    }

    /**
     * Test to check that little-endian UTF-16 content with leading BOM
     * is successfully handled by parser.
     */
    public void testUTF16LEWithBOM()
        throws XMLStreamException,
               UnsupportedEncodingException
    {
        doTestEncoding("UTF-16LE", "UTF-16", LE_BOM);
        doTestEncoding("UTF-16LE", null, LE_BOM);

        doTestEncoding2(false);
    }

    /*
    ////////////////////////////////////////
    // Private methods, shared test code
    ////////////////////////////////////////
     */

    /**
     * Encodes a small document containing {@link #UTF_CONTENT} with the
     * given encoding, optionally prepends a BOM, and verifies that the
     * parser returns the same text content.
     *
     * @param javaEnc Name of encoding as understood by JDK; used to
     *   instantiate JDK encoder/decoder to use for test
     * @param xmlEnc Name of encoding as included in xml declaration;
     *   null to indicate nothing should be added
     * @param bom Pre-defined bom bytes to prepend to input, if any.
     *
     * @throws XMLStreamException if the reader fails on the document
     * @throws UnsupportedEncodingException if the JDK does not know
     *   <code>javaEnc</code>
     */
    public void doTestEncoding(String javaEnc, String xmlEnc,
                               byte[] bom)
        throws XMLStreamException,
               UnsupportedEncodingException
    {
        String XML = "<?xml version='1.0'";
        if (xmlEnc != null) {
            XML += " encoding='"+xmlEnc+"'";
        }
        XML += "?><root>"+UTF_CONTENT+"</root>";

        byte[] b = XML.getBytes(javaEnc);
        if (bom != null) {
            // Prepend the BOM to the encoded document
            byte[] orig = b;
            b = new byte[b.length + bom.length];
            System.arraycopy(bom, 0, b, 0, bom.length);
            System.arraycopy(orig, 0, b, bom.length, orig.length);
        }

        XMLStreamReader sr = getReader(b);

        /* Encoding can only be verified when it was declared. Otherwise...
         * should we get some info? Preferably yes;
         * (getEncoding() should return auto-detected encoding)
         * but this is not strictly mandated by the specs?
         */
        if (xmlEnc != null) {
            assertEquals(xmlEnc, sr.getCharacterEncodingScheme());
        }

        assertTokenType(START_ELEMENT, sr.next());
        assertTokenType(CHARACTERS, sr.next());

        assertEquals(UTF_CONTENT, getAllText(sr));
        // getAllText() is expected to leave the reader at the closing tag
        assertTokenType(END_ELEMENT, sr.getEventType());
        assertTokenType(END_DOCUMENT, sr.next());
        sr.close();
    }

    /**
     * Verifies that a hand-encoded UTF-16 document whose xml declaration
     * only says "UTF-16" is parsed correctly, with endianness determined
     * by the leading BOM.
     *
     * @param bigEndian true to produce big-endian content, false for
     *   little-endian
     *
     * @throws XMLStreamException if the reader fails on the document
     */
    private void doTestEncoding2(boolean bigEndian)
        throws XMLStreamException
    {
        /* 20-Jan-2006, TSa: Ok, let's try another variation that may
         *   cause problems; UTF-16 is vague, and if using JDK provided
         *   readers, parser has to indicate endianness.
         */
        final String XML = "<?xml version='1.0' encoding='UTF-16'?>\n"
            +"<!--comment--><root>text</root>";
        int len = XML.length();

        byte[] bom = bigEndian ? BE_BOM : LE_BOM;
        // Two bytes per character, after the BOM
        byte[] b = new byte[bom.length + len + len];
        System.arraycopy(bom, 0, b, 0, bom.length);

        /* Document only has ASCII characters, so one byte of each pair
         * stays zero; the character itself goes into the second byte
         * of the pair for big-endian, the first one for little-endian.
         */
        int offset = bigEndian ? (bom.length + 1) : bom.length;
        for (int i = 0; i < len; ++i) {
            b[offset + i + i] = (byte) XML.charAt(i);
        }

        XMLStreamReader sr = getReader(b);
        // may get white space...
        int type = sr.next();
        if (type == SPACE) {
            type = sr.next();
        }
        assertTokenType(COMMENT, type);
        assertTokenType(START_ELEMENT, sr.next());
        assertTokenType(CHARACTERS, sr.next());
        assertEquals("text", getAndVerifyText(sr));
        assertTokenType(END_ELEMENT, sr.next());
        sr.close();
    }

    /*
    ////////////////////////////////////////
    // Private methods, other
    ////////////////////////////////////////
     */

    /**
     * Constructs a non-validating stream reader for given document.
     *
     * @param contents Encoded document to read
     *
     * @return Reader positioned at the start of the document
     *
     * @throws XMLStreamException if the reader can not be constructed
     */
    private XMLStreamReader getReader(byte[] contents)
        throws XMLStreamException
    {
        XMLInputFactory f = getInputFactory();
        setValidating(f, false);
        return constructStreamReader(f, contents);
    }
}