// RTFCharsetMaps.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf.jflex;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.tika.utils.CharsetUtils;
/**
 * Shared charset maps for RTF parsing. Maps RTF {@code \fcharsetN} and
 * {@code \ansicpgN} values to Java {@link Charset} instances.
 *
 * <p>Extracted from the original {@code TextExtractor} so both the JFlex-based
 * parser and decapsulator can reuse them.</p>
 *
 * <p>Both maps are populated once, during class initialization, and are
 * exposed as unmodifiable views. A charset name that cannot be resolved at
 * that time is stored as US-ASCII (see {@link #getCharset(String)}), so a
 * mapped key does not by itself mean the named charset is available.</p>
 */
public final class RTFCharsetMaps {

    /** The {@code windows-1252} charset; used for {@code \fcharset0} and as the last-resort code page. */
    public static final Charset WINDOWS_1252 = Charset.forName("windows-1252");

    /**
     * Charset-name prefixes tried, in this order, by {@link #resolveCodePage(int)}
     * for code pages that have no entry in {@link #ANSICPG_MAP}.
     */
    private static final String[] CODE_PAGE_PREFIXES = {"windows-", "cp"};

    /**
     * Maps {@code \fcharsetN} values to Java charsets.
     * The RTF font table uses these to declare per-font character encodings.
     * The map is unmodifiable.
     */
    public static final Map<Integer, Charset> FCHARSET_MAP = buildFcharsetMap();

    /**
     * Maps {@code \ansicpgN} values to Java charsets.
     * This is the global ANSI code page declared in the RTF header.
     * The map is unmodifiable.
     */
    public static final Map<Integer, Charset> ANSICPG_MAP = buildAnsicpgMap();

    /** Not instantiable: this class only holds static lookup tables and helpers. */
    private RTFCharsetMaps() {
    }

    /**
     * Builds the {@code \fcharsetN} lookup table.
     *
     * @return an unmodifiable map from fcharset number to charset
     */
    private static Map<Integer, Charset> buildFcharsetMap() {
        Map<Integer, Charset> byFcharset = new HashMap<>();

        byFcharset.put(0, WINDOWS_1252); // ANSI
        // charset 1 = Default, charset 2 = Symbol: intentionally unmapped

        // Macintosh font charsets
        byFcharset.put(77, getCharset("MacRoman")); // Mac Roman
        byFcharset.put(78, getCharset("Shift_JIS")); // Mac Shift Jis
        byFcharset.put(79, getCharset("ms949")); // Mac Hangul
        byFcharset.put(80, getCharset("GB2312")); // Mac GB2312
        byFcharset.put(81, getCharset("Big5")); // Mac Big5
        byFcharset.put(82, getCharset("johab")); // Mac Johab (old)
        byFcharset.put(83, getCharset("MacHebrew")); // Mac Hebrew
        byFcharset.put(84, getCharset("MacArabic")); // Mac Arabic
        byFcharset.put(85, getCharset("MacGreek")); // Mac Greek
        byFcharset.put(86, getCharset("MacTurkish")); // Mac Turkish
        byFcharset.put(87, getCharset("MacThai")); // Mac Thai
        byFcharset.put(88, getCharset("cp1250")); // Mac East Europe
        byFcharset.put(89, getCharset("cp1251")); // Mac Russian

        // Windows / OEM font charsets
        byFcharset.put(128, getCharset("MS932")); // Shift JIS
        byFcharset.put(129, getCharset("ms949")); // Hangul
        byFcharset.put(130, getCharset("ms1361")); // Johab
        byFcharset.put(134, getCharset("ms936")); // GB2312
        byFcharset.put(136, getCharset("ms950")); // Big5
        byFcharset.put(161, getCharset("cp1253")); // Greek
        byFcharset.put(162, getCharset("cp1254")); // Turkish
        byFcharset.put(163, getCharset("cp1258")); // Vietnamese
        byFcharset.put(177, getCharset("cp1255")); // Hebrew
        byFcharset.put(178, getCharset("cp1256")); // Arabic
        byFcharset.put(186, getCharset("cp1257")); // Baltic
        byFcharset.put(204, getCharset("cp1251")); // Russian
        byFcharset.put(222, getCharset("ms874")); // Thai
        byFcharset.put(238, getCharset("cp1250")); // Eastern European
        byFcharset.put(254, getCharset("cp437")); // PC 437
        byFcharset.put(255, getCharset("cp850")); // OEM

        return Collections.unmodifiableMap(byFcharset);
    }

    /**
     * Builds the {@code \ansicpgN} lookup table.
     *
     * @return an unmodifiable map from ANSI code page number to charset
     */
    private static Map<Integer, Charset> buildAnsicpgMap() {
        Map<Integer, Charset> byCodePage = new HashMap<>();

        // DOS / OEM and Arabic code pages
        byCodePage.put(437, getCharset("CP437")); // US IBM
        byCodePage.put(708, getCharset("ISO-8859-6")); // Arabic (ASMO 708)
        byCodePage.put(709, getCharset("windows-709")); // Arabic (ASMO 449+)
        byCodePage.put(710, getCharset("windows-710")); // Arabic (transparent)
        byCodePage.put(711, getCharset("windows-711")); // Arabic (Nafitha)
        byCodePage.put(720, getCharset("windows-720")); // Arabic (transparent ASMO)
        byCodePage.put(819, getCharset("CP819")); // Windows 3.1 (US/Western)
        byCodePage.put(850, getCharset("CP850")); // IBM Multilingual
        byCodePage.put(852, getCharset("CP852")); // Eastern European
        byCodePage.put(860, getCharset("CP860")); // Portuguese
        byCodePage.put(862, getCharset("CP862")); // Hebrew
        byCodePage.put(863, getCharset("CP863")); // French Canadian
        byCodePage.put(864, getCharset("CP864")); // Arabic
        byCodePage.put(865, getCharset("CP865")); // Norwegian
        byCodePage.put(866, getCharset("CP866")); // Soviet Union

        // East Asian and Thai code pages
        byCodePage.put(874, getCharset("MS874")); // Thai
        byCodePage.put(932, getCharset("MS932")); // Japanese
        byCodePage.put(936, getCharset("MS936")); // Simplified Chinese
        byCodePage.put(949, getCharset("CP949")); // Korean
        byCodePage.put(950, getCharset("CP950")); // Traditional Chinese

        // Windows code pages
        byCodePage.put(1250, getCharset("CP1250")); // Eastern European
        byCodePage.put(1251, getCharset("CP1251")); // Cyrillic
        byCodePage.put(1252, getCharset("CP1252")); // Western European
        byCodePage.put(1253, getCharset("CP1253")); // Greek
        byCodePage.put(1254, getCharset("CP1254")); // Turkish
        byCodePage.put(1255, getCharset("CP1255")); // Hebrew
        byCodePage.put(1256, getCharset("CP1256")); // Arabic
        byCodePage.put(1257, getCharset("CP1257")); // Baltic
        byCodePage.put(1258, getCharset("CP1258")); // Vietnamese
        byCodePage.put(1361, getCharset("x-Johab")); // Johab

        // Macintosh code pages
        byCodePage.put(10000, getCharset("MacRoman")); // Mac Roman
        byCodePage.put(10001, getCharset("Shift_JIS")); // Mac Japan
        byCodePage.put(10004, getCharset("MacArabic")); // Mac Arabic
        byCodePage.put(10005, getCharset("MacHebrew")); // Mac Hebrew
        byCodePage.put(10006, getCharset("MacGreek")); // Mac Greek
        byCodePage.put(10007, getCharset("MacCyrillic")); // Mac Cyrillic
        byCodePage.put(10029, getCharset("x-MacCentralEurope")); // Mac Latin2
        byCodePage.put(10081, getCharset("MacTurkish")); // Mac Turkish

        // Indic code pages
        byCodePage.put(57002, getCharset("x-ISCII91")); // Devanagari
        byCodePage.put(57003, getCharset("windows-57003")); // Bengali
        byCodePage.put(57004, getCharset("windows-57004")); // Tamil
        byCodePage.put(57005, getCharset("windows-57005")); // Telugu
        byCodePage.put(57006, getCharset("windows-57006")); // Assamese
        byCodePage.put(57007, getCharset("windows-57007")); // Oriya
        byCodePage.put(57008, getCharset("windows-57008")); // Kannada
        byCodePage.put(57009, getCharset("windows-57009")); // Malayalam
        byCodePage.put(57010, getCharset("windows-57010")); // Gujarati
        byCodePage.put(57011, getCharset("windows-57011")); // Punjabi

        return Collections.unmodifiableMap(byCodePage);
    }

    /**
     * Resolve a charset by name, falling back to US-ASCII if unavailable.
     *
     * @param name the charset name handed to {@code CharsetUtils.forName}
     * @return the charset returned by {@code CharsetUtils.forName}, or
     *         {@link StandardCharsets#US_ASCII} if that call throws
     *         {@link IllegalArgumentException}
     */
    static Charset getCharset(String name) {
        try {
            return CharsetUtils.forName(name);
        } catch (IllegalArgumentException e) {
            // Deliberate best-effort: an unknown name must not break class initialization.
            return StandardCharsets.US_ASCII;
        }
    }

    /**
     * Resolve an ANSI code page number to a Java Charset.
     *
     * <p>Lookup order:</p>
     * <ol>
     *   <li>the entry in {@link #ANSICPG_MAP}, returned exactly as stored. Note
     *       that this is US-ASCII when the mapped charset name could not be
     *       resolved while the map was being built;</li>
     *   <li>the JVM charset named {@code windows-N};</li>
     *   <li>the JVM charset named {@code cpN};</li>
     *   <li>{@link #WINDOWS_1252} if none of the above is available.</li>
     * </ol>
     *
     * @param cpNumber the numeric argument of {@code \ansicpgN}
     * @return the resolved charset; this method does not return {@code null}
     */
    public static Charset resolveCodePage(int cpNumber) {
        Charset mapped = ANSICPG_MAP.get(cpNumber);
        if (mapped != null) {
            return mapped;
        }
        for (String prefix : CODE_PAGE_PREFIXES) {
            try {
                return Charset.forName(prefix + cpNumber);
            } catch (IllegalArgumentException ignored) {
                // IllegalCharsetNameException and UnsupportedCharsetException are both
                // IllegalArgumentExceptions: this name is unusable, try the next prefix.
            }
        }
        return WINDOWS_1252;
    }
}