TestEncodingDetection.java
package wstxtest.stream;
import javax.xml.stream.*;
import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.io.Stax2ByteArraySource;
/**
* This set on unit tests checks that woodstox-specific invariants
* regarding automatic input encoding detection are maintained. Some
* of these might be required by stax specification too, but it is not
* quite certain, thus tests are included in woodstox-specific packages.
*/
public class TestEncodingDetection
extends BaseStreamTest
{
final static String ENC_EBCDIC_IN_PREFIX = "cp";
final static String ENC_EBCDIC_OUT_PREFIX = "IBM";
public void testUtf8() throws Exception
{
/* Default is, in absence of any other indications, UTF-8...
* let's check the shortest legal doc:
*/
String XML = "<a/>";
byte[] b = XML.getBytes("UTF-8");
XMLStreamReader sr = getReader(b);
assertTokenType(START_DOCUMENT, sr.getEventType());
assertNull(sr.getCharacterEncodingScheme());
assertEquals("UTF-8", sr.getEncoding());
// let's iterate just for fun though
assertTokenType(START_ELEMENT, sr.next());
sr.close();
}
// for [woodstox-core#117]
public void testWindows1252() throws Exception
{
final String doc = "<?xml version='1.0' encoding='WINDOWS-1252'?><x/>";
// it's just ASCII so getBytes() can use whatever
final byte[] b = doc.getBytes("UTF-8");
XMLStreamReader sr = getReader(b);
assertTokenType(START_DOCUMENT, sr.getEventType());
assertEquals("WINDOWS-1252", sr.getCharacterEncodingScheme());
assertEquals("WINDOWS-1252", sr.getEncoding());
// let's iterate just for fun though
assertTokenType(START_ELEMENT, sr.next());
assertTokenType(END_ELEMENT, sr.next());
sr.close();
}
public void testUtf16() throws Exception
{
// Should be able to figure out encoding...
String XML = ".<?xml version='1.0'?><root/>";
/* Let's first check a somewhat common case; figuring out UTF-16
* encoded doc (which has to have BOM, thus); first, big-endian
*/
StringBuilder sb = new StringBuilder(XML);
sb.setCharAt(0, (char) 0xFEFF);
byte[] b = getUtf16Bytes(sb.toString(), true);
XMLStreamReader sr = getReader(b);
assertTokenType(START_DOCUMENT, sr.getEventType());
assertNull(sr.getCharacterEncodingScheme());
assertEquals("UTF-16BE", sr.getEncoding());
// let's iterate just for fun though
assertTokenType(START_ELEMENT, sr.next());
sr.close();
// and then little-endian
b = getUtf16Bytes(sb.toString(), false);
sr = getReader(b);
assertTokenType(START_DOCUMENT, sr.getEventType());
assertNull(sr.getCharacterEncodingScheme());
assertEquals("UTF-16LE", sr.getEncoding());
assertTokenType(START_ELEMENT, sr.next());
sr.close();
}
/**
* Testing for EBCDIC is tricky, mostly due to possible
* complexity of the things to support, as well as lack
* of sample documents and difficulty in reading ones
* that exist (it not being 7-bit ascii compatible).
* But let's try a straight-forward (naive?) test
* to verify that what is supposed to work does.
*/
public void testEBCDIC() throws Exception
{
final String[] subtypes = new String[] {
"037", "277", "278", "280", "284", "285", "297",
"420", "424", "500", "870", "871", "918",
};
for (String subtype : subtypes) {
String actEnc = ENC_EBCDIC_IN_PREFIX + subtype;
String xml = "<?xml version='1.0' encoding='"+actEnc+"' ?>"
+"<root attr='123'>rock & roll!<!-- comment --></root>";
byte[] bytes = xml.getBytes(actEnc);
XMLStreamReader sr = getReader(bytes);
assertTokenType(START_DOCUMENT, sr.getEventType());
// Declared encoding should match 100%
assertEquals(actEnc, sr.getCharacterEncodingScheme());
// Found encoding, though, can be changed
String expEnc = ENC_EBCDIC_OUT_PREFIX + subtype;
assertEquals(expEnc, sr.getEncoding());
assertTokenType(START_ELEMENT, sr.next());
assertEquals("root", sr.getLocalName());
assertTokenType(CHARACTERS, sr.next());
assertEquals("rock & roll!", getAndVerifyText(sr));
assertTokenType(COMMENT, sr.next());
assertEquals(" comment ", getAndVerifyText(sr));
assertTokenType(END_ELEMENT, sr.next());
assertEquals("root", sr.getLocalName());
sr.close();
}
}
/*
/////////////////////////////////////////
// Non-test methods
/////////////////////////////////////////
*/
private byte[] getUtf16Bytes(String input, boolean bigEndian)
{
int len = input.length();
byte[] b = new byte[len+len];
int offset = bigEndian ? 1 : 0; // offset for LSB
for (int i = 0; i < len; ++i) {
int c = input.charAt(i);
// BOM is 2-byte, others 1 byte...
b[i+i+offset] = (byte) (c & 0xFF);
b[i+i+(1 - offset)] = (byte) (c >> 8);
}
return b;
}
private XMLStreamReader getReader(byte[] b) throws Exception
{
XMLInputFactory2 f = getInputFactory();
Stax2ByteArraySource src = new Stax2ByteArraySource(b, 0, b.length);
return f.createXMLStreamReader(src);
}
}