/src/xerces-c/src/xercesc/framework/XMLRecognizer.cpp
Line | Count | Source |
1 | | /* |
2 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
3 | | * contributor license agreements. See the NOTICE file distributed with |
4 | | * this work for additional information regarding copyright ownership. |
5 | | * The ASF licenses this file to You under the Apache License, Version 2.0 |
6 | | * (the "License"); you may not use this file except in compliance with |
7 | | * the License. You may obtain a copy of the License at |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | */ |
17 | | |
18 | | /** |
19 | | * $Id: XMLRecognizer.cpp 555320 2007-07-11 16:05:13Z amassari $ |
20 | | */ |
21 | | |
22 | | |
23 | | // --------------------------------------------------------------------------- |
24 | | // Includes |
25 | | // --------------------------------------------------------------------------- |
26 | | #include <xercesc/framework/XMLRecognizer.hpp> |
27 | | #include <xercesc/util/RuntimeException.hpp> |
28 | | #include <xercesc/util/XMLString.hpp> |
29 | | |
30 | | XERCES_CPP_NAMESPACE_BEGIN |
31 | | |
32 | | // --------------------------------------------------------------------------- |
33 | | // Local data |
34 | | // |
35 | | // gEncodingNameMap |
36 | | // This array maps the Encodings enum values to their canonical names. |
37 | | // Be sure to keep this in sync with that enum! |
38 | | // --------------------------------------------------------------------------- |
39 | | static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] = |
40 | | { |
41 | | XMLUni::fgEBCDICEncodingString |
42 | | , XMLUni::fgUCS4BEncodingString |
43 | | , XMLUni::fgUCS4LEncodingString |
44 | | , XMLUni::fgUSASCIIEncodingString |
45 | | , XMLUni::fgUTF8EncodingString |
46 | | , XMLUni::fgUTF16BEncodingString |
47 | | , XMLUni::fgUTF16LEncodingString |
48 | | , XMLUni::fgXMLChEncodingString |
49 | | }; |
50 | | |
51 | | |
52 | | |
53 | | // --------------------------------------------------------------------------- |
54 | | // XMLRecognizer: Public, const static data |
55 | | // |
56 | | // gXXXPre |
57 | | // gXXXPreLen |
58 | | // The byte sequence prefixes for all of the encodings that we can |
59 | | // auto sense. Also included is the length of each sequence. |
60 | | // --------------------------------------------------------------------------- |
61 | | const char XMLRecognizer::fgASCIIPre[] = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 }; |
62 | | const XMLSize_t XMLRecognizer::fgASCIIPreLen = 6; |
63 | | const XMLByte XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 }; |
64 | | const XMLSize_t XMLRecognizer::fgEBCDICPreLen = 6; |
65 | | const XMLByte XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 }; |
66 | | const XMLByte XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 }; |
67 | | const XMLSize_t XMLRecognizer::fgUTF16PreLen = 12; |
68 | | const XMLByte XMLRecognizer::fgUCS4BPre[] = |
69 | | { |
70 | | 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F |
71 | | , 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D |
72 | | , 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20 |
73 | | }; |
74 | | const XMLByte XMLRecognizer::fgUCS4LPre[] = |
75 | | { |
76 | | 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00 |
77 | | , 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00 |
78 | | , 0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00 |
79 | | }; |
80 | | const XMLSize_t XMLRecognizer::fgUCS4PreLen = 24; |
81 | | |
82 | | const char XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF}; |
83 | | const XMLSize_t XMLRecognizer::fgUTF8BOMLen = 3; |
84 | | |
85 | | // --------------------------------------------------------------------------- |
86 | | // XMLRecognizer: Encoding recognition methods |
87 | | // --------------------------------------------------------------------------- |
88 | | XMLRecognizer::Encodings |
89 | | XMLRecognizer::basicEncodingProbe( const XMLByte* const rawBuffer |
90 | | , const XMLSize_t rawByteCount) |
91 | 17.6k | { |
92 | | // |
93 | | // As an optimization to check the 90% case, check first for the ASCII |
94 | | // sequence '<?xml', which means its either US-ASCII, UTF-8, or some |
95 | | // other encoding that we don't do manually but which happens to share |
96 | | // the US-ASCII code points for these characters. So just return UTF-8 |
97 | | // to get us through the first line. |
98 | | // |
99 | 17.6k | if (rawByteCount >= fgASCIIPreLen) |
100 | 17.6k | { |
101 | 17.6k | if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen)) |
102 | 7.81k | return UTF_8; |
103 | 17.6k | } |
104 | | |
105 | | // |
106 | | // If the count of raw bytes is less than 2, it cannot be anything |
107 | | // we understand, so return UTF-8 as a fallback. |
108 | | // |
109 | 9.85k | if (rawByteCount < 2) |
110 | 0 | return UTF_8; |
111 | | |
112 | | // |
113 | | // We have two to four bytes, so lets check for a UTF-16 BOM. That |
114 | | // is quick to check and enough to identify two major encodings. |
115 | | // |
116 | | |
117 | 9.85k | if (rawByteCount < 4) |
118 | 0 | { |
119 | 0 | if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
120 | 0 | return UTF_16B; |
121 | 0 | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
122 | 0 | return UTF_16L; |
123 | 0 | else |
124 | 0 | return UTF_8; |
125 | 0 | } |
126 | | |
127 | | /*** |
128 | | * F.1 Detection Without External Encoding Information |
129 | | * |
130 | | * Because each XML entity not accompanied by external encoding information and |
131 | | * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, |
132 | | * in which the first characters must be '<?xml', any conforming processor can detect, |
133 | | * after two to four octets of input, which of the following cases apply. |
134 | | * |
135 | | * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and |
136 | | * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is |
137 | | * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive |
138 | | * ##s cannot be both 00. |
139 | | * |
140 | | * With a Byte Order Mark: |
141 | | * |
142 | | * 00 00 FE FF UCS-4, big-endian machine (1234 order) |
143 | | * FF FE 00 00 UCS-4, little-endian machine (4321 order) |
144 | | * 00 00 FF FE UCS-4, unusual octet order (2143) |
145 | | * FE FF 00 00 UCS-4, unusual octet order (3412) |
146 | | * FE FF ## ## UTF-16, big-endian |
147 | | * FF FE ## ## UTF-16, little-endian |
148 | | * EF BB BF UTF-8 |
149 | | * |
150 | | ***/ |
151 | | |
152 | | // |
153 | | // We have at least four bytes, so we can check all BOM |
154 | | // for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well. |
155 | | // |
156 | 9.85k | if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF)) |
157 | 4 | return UCS_4B; |
158 | 9.85k | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00)) |
159 | 13 | return UCS_4L; |
160 | 9.83k | else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
161 | 46 | return UTF_16B; |
162 | 9.79k | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
163 | 83 | return UTF_16L; |
164 | | |
165 | | // |
166 | | // We have at least 4 bytes. So lets check the 4 byte sequences that |
167 | | // indicate other UTF-16 and UCS encodings. |
168 | | // |
169 | 9.70k | if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C)) |
170 | 2.90k | { |
171 | 2.90k | if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen)) |
172 | 22 | return UCS_4B; |
173 | 2.87k | else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen)) |
174 | 3 | return UCS_4L; |
175 | 2.87k | else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen)) |
176 | 37 | return UTF_16B; |
177 | 2.83k | else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen)) |
178 | 21 | return UTF_16L; |
179 | 2.90k | } |
180 | | |
181 | | // |
182 | | // See if we have enough bytes to possibly match the EBCDIC prefix. |
183 | | // If so, try it. |
184 | | // |
185 | 9.62k | if (rawByteCount > fgEBCDICPreLen) |
186 | 9.62k | { |
187 | 9.62k | if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen)) |
188 | 18 | return EBCDIC; |
189 | 9.62k | } |
190 | | |
191 | | // |
192 | | // Does not seem to be anything we know, so go with UTF-8 to get at |
193 | | // least through the first line and see what it really is. |
194 | | // |
195 | 9.60k | return UTF_8; |
196 | 9.62k | } |
197 | | |
198 | | |
199 | | XMLRecognizer::Encodings |
200 | | XMLRecognizer::encodingForName(const XMLCh* const encName) |
201 | 7.10k | { |
202 | | // |
203 | | // Compare the passed string, assume input string is already uppercased, |
204 | | // to the variations that we recognize. |
205 | | // |
206 | | // !!NOTE: Note that we don't handle EBCDIC here because we don't handle |
207 | | // that one ourselves. It is allowed to fall into 'other'. |
208 | | // |
209 | 7.10k | if (encName == XMLUni::fgXMLChEncodingString || |
210 | 7.10k | !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString)) |
211 | 0 | { |
212 | 0 | return XMLRecognizer::XERCES_XMLCH; |
213 | 0 | } |
214 | 7.10k | else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) |
215 | 7.01k | || !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2)) |
216 | 84 | { |
217 | 84 | return XMLRecognizer::UTF_8; |
218 | 84 | } |
219 | 7.01k | else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) |
220 | 6.94k | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) |
221 | 6.94k | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) |
222 | 6.87k | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4)) |
223 | 138 | { |
224 | 138 | return XMLRecognizer::US_ASCII; |
225 | 138 | } |
226 | 6.87k | else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) |
227 | 6.87k | || !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2)) |
228 | 8 | { |
229 | 8 | return XMLRecognizer::UTF_16L; |
230 | 8 | } |
231 | 6.87k | else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) |
232 | 6.87k | || !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2)) |
233 | 24 | { |
234 | 24 | return XMLRecognizer::UTF_16B; |
235 | 24 | } |
236 | 6.84k | else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString)) |
237 | 0 | { |
238 | 0 | return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L; |
239 | 0 | } |
240 | 6.84k | else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) |
241 | 6.84k | || !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2)) |
242 | 0 | { |
243 | 0 | return XMLRecognizer::UCS_4L; |
244 | 0 | } |
245 | 6.84k | else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) |
246 | 6.84k | || !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2)) |
247 | 0 | { |
248 | 0 | return XMLRecognizer::UCS_4B; |
249 | 0 | } |
250 | 6.84k | else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString)) |
251 | 0 | { |
252 | 0 | return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L; |
253 | 0 | } |
254 | | |
255 | | // Return 'other' since we don't recognizer it |
256 | 6.84k | return XMLRecognizer::OtherEncoding; |
257 | 7.10k | } |
258 | | |
259 | | |
260 | | const XMLCh* |
261 | | XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding |
262 | | , MemoryManager* const manager) |
263 | 426k | { |
264 | 426k | if (theEncoding >= Encodings_Count) |
265 | 0 | ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager); |
266 | | |
267 | 426k | return gEncodingNameMap[theEncoding]; |
268 | 426k | } |
269 | | |
270 | | XERCES_CPP_NAMESPACE_END |