CharsetAliases.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html.charsetdetector;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset;
/**
 * Static lookup table that associates standard charset names (labels) with Java
 * {@link Charset} implementations.
 * https://encoding.spec.whatwg.org/#ref-for-iso-8859-8-i
 * <p>
 * The table is filled once, while this class is being initialized, and is only read
 * afterwards. This class cannot be instantiated.
 */
final class CharsetAliases {

    /**
     * Maps a lower-case charset label to the charset used to decode it. Populated by the
     * static initializer below and never modified afterwards, so lookups need no locking.
     */
    private static final Map<String, Charset> charsetsByLabel = new HashMap<>();

    static {
        // Class initialization runs exactly once and publishes the fully populated map to
        // every thread. It is still lazy: it happens on the first use of this class.
        addAll();
    }

    private CharsetAliases() {
    }

    /**
     * Looks up a charset by label. Surrounding whitespace is trimmed and case is ignored.
     *
     * @param label a charset name, may be null
     * @return the corresponding java charset, if there is one. Otherwise (including when
     * {@code label} is null), null
     */
    static Charset getCharsetByLabel(String label) {
        if (label == null) {
            return null;
        }
        // Labels are registered in lower case; normalize the query the same way
        String normalized = label.trim().toLowerCase(Locale.US);
        return charsetsByLabel.get(normalized);
    }

    /**
     * Registers every known label. Where {@link #charset(String...)} is given several names,
     * the later ones are fallbacks used when the earlier ones are missing from the JVM.
     */
    private static void addAll() {
        addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");
        addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
        addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
                "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949");
        addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
                "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
        addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
        addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
        addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157",
                "iso8859-10", "iso885910", "l6", "latin6");
        addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
        addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914");
        addCharset(charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
                "iso_8859-15", "l9");
        addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16");
        addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
                "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
        addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
                "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
        addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
                "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
        addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5",
                "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
        addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
                "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i",
                "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
        addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek",
                "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7",
                "iso_8859-7:1987", "sun_eu_greek");
        // ISO-8859-8 actually should have an influence on the layout direction
        // (text should be decoded in the visual order). However, this is not implemented in tika.
        addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
                "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8",
                "iso_8859-8:1988", "visual");
        addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
        addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
        addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u");
        addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis",
                "shift_jis", "sjis", "windows-31j", "x-sjis");
        addCharset(charset("UTF-16BE"), "utf-16be");
        addCharset(charset("UTF-16LE"), "utf-16", "utf-16le");
        addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
        addCharset(charset("gb18030"), "gb18030");
        addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
        addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
        addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819",
                "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
                "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252",
                "x-cp1252");
        addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
        addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
                "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5",
                "windows-1254", "x-cp1254");
        addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
        addCharset(charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
        addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
        addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
        addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
                "tis-620", "windows-874");
        addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
        addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
        // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset
        // attacks
        addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn",
                "iso-2022-cn-ext", "iso-2022-kr", "replacement");
        // The x-user-defined charset is not present in java
        addCharset(new XUserDefinedCharset(), "x-user-defined");
    }

    /**
     * @param names jvm charset names, tried in the given order
     * @return the first of the given charsets that exists in the current JVM,
     * or ISO_8859_1 if none exists
     */
    private static Charset charset(String... names) {
        for (String name : names) {
            try {
                return Charset.forName(name);
            } catch (IllegalCharsetNameException | UnsupportedCharsetException ignored) {
                // This name is not usable in the current JVM: try the next candidate
            }
        }
        // Last resort: ISO-8859-1 is a single-byte charset that must be present on every
        // Java platform
        return StandardCharsets.ISO_8859_1;
    }

    /**
     * Registers the given charset under each of the given labels. A label that is already
     * registered is overwritten.
     *
     * @param charset the JVM charset to associate with the labels
     * @param names   standard W3C charset names (expected in lower case, since lookups
     *                lower-case their argument)
     */
    private static void addCharset(Charset charset, String... names) {
        for (String name : names) {
            charsetsByLabel.put(name, charset);
        }
    }
}