Coverage Report

Created: 2026-04-01 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/xerces-c/src/xercesc/framework/XMLRecognizer.cpp
Line
Count
Source
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one or more
3
 * contributor license agreements.  See the NOTICE file distributed with
4
 * this work for additional information regarding copyright ownership.
5
 * The ASF licenses this file to You under the Apache License, Version 2.0
6
 * (the "License"); you may not use this file except in compliance with
7
 * the License.  You may obtain a copy of the License at
8
 * 
9
 *      http://www.apache.org/licenses/LICENSE-2.0
10
 * 
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 */
17
18
/**
19
 *  $Id: XMLRecognizer.cpp 555320 2007-07-11 16:05:13Z amassari $
20
 */
21
22
23
// ---------------------------------------------------------------------------
24
//  Includes
25
// ---------------------------------------------------------------------------
26
#include <xercesc/framework/XMLRecognizer.hpp>
27
#include <xercesc/util/RuntimeException.hpp>
28
#include <xercesc/util/XMLString.hpp>
29
30
XERCES_CPP_NAMESPACE_BEGIN
31
32
// ---------------------------------------------------------------------------
33
//  Local data
34
//
35
//  gEncodingNameMap
36
//      This array maps the Encodings enum values to their canonical names.
37
//      Be sure to keep this in sync with that enum!
38
// ---------------------------------------------------------------------------
39
static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
40
{
41
    XMLUni::fgEBCDICEncodingString
42
    , XMLUni::fgUCS4BEncodingString
43
    , XMLUni::fgUCS4LEncodingString
44
    , XMLUni::fgUSASCIIEncodingString
45
    , XMLUni::fgUTF8EncodingString
46
    , XMLUni::fgUTF16BEncodingString
47
    , XMLUni::fgUTF16LEncodingString
48
    , XMLUni::fgXMLChEncodingString
49
};
50
51
52
53
// ---------------------------------------------------------------------------
54
//  XMLRecognizer: Public, const static data
55
//
56
//  gXXXPre
57
//  gXXXPreLen
58
//      The byte sequence prefixes for all of the encodings that we can
59
//      auto sense. Also included is the length of each sequence.
60
// ---------------------------------------------------------------------------
61
const char           XMLRecognizer::fgASCIIPre[]  = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 };
62
const XMLSize_t      XMLRecognizer::fgASCIIPreLen = 6;
63
const XMLByte        XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 };
64
const XMLSize_t      XMLRecognizer::fgEBCDICPreLen = 6;
65
const XMLByte        XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 };
66
const XMLByte        XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 };
67
const XMLSize_t      XMLRecognizer::fgUTF16PreLen = 12;
68
const XMLByte        XMLRecognizer::fgUCS4BPre[]  =
69
{
70
        0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F
71
    ,   0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D
72
    ,   0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20
73
};
74
const XMLByte        XMLRecognizer::fgUCS4LPre[]  =
75
{
76
        0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00
77
    ,   0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00
78
    ,   0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00
79
};
80
const XMLSize_t      XMLRecognizer::fgUCS4PreLen = 24;
81
82
const char           XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF};
83
const XMLSize_t      XMLRecognizer::fgUTF8BOMLen = 3;
84
85
// ---------------------------------------------------------------------------
86
//  XMLRecognizer: Encoding recognition methods
87
// ---------------------------------------------------------------------------
88
XMLRecognizer::Encodings
89
XMLRecognizer::basicEncodingProbe(  const   XMLByte* const  rawBuffer
90
                                    , const XMLSize_t       rawByteCount)
91
17.6k
{
92
    //
93
    //  As an optimization to check the 90% case, check first for the ASCII
94
    //  sequence '<?xml', which means its either US-ASCII, UTF-8, or some
95
    //  other encoding that we don't do manually but which happens to share
96
    //  the US-ASCII code points for these characters. So just return UTF-8
97
    //  to get us through the first line.
98
    //
99
17.6k
    if (rawByteCount >= fgASCIIPreLen)
100
17.6k
    {
101
17.6k
        if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
102
7.81k
            return UTF_8;
103
17.6k
    }
104
105
    //
106
    //  If the count of raw bytes is less than 2, it cannot be anything
107
    //  we understand, so return UTF-8 as a fallback.
108
    //
109
9.85k
    if (rawByteCount < 2)
110
0
        return UTF_8;
111
         
112
    //  
113
    //  We have two to four bytes, so lets check for a UTF-16 BOM. That
114
    //  is quick to check and enough to identify two major encodings.   
115
    // 
116
117
9.85k
    if (rawByteCount < 4)
118
0
    {
119
0
        if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
120
0
            return UTF_16B;
121
0
        else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
122
0
            return UTF_16L;
123
0
        else 
124
0
            return UTF_8;
125
0
    }
126
127
    /***
128
     *    F.1 Detection Without External Encoding Information
129
     *
130
     *    Because each XML entity not accompanied by external encoding information and 
131
     *    not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, 
132
     *    in which the first characters must be '<?xml', any conforming processor can detect, 
133
     *    after two to four octets of input, which of the following cases apply. 
134
     *
135
     *    In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and 
136
     *    '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is 
137
     *    "#xFEFF". The notation ## is used to denote any byte value except that two consecutive 
138
     *    ##s cannot be both 00.
139
     *
140
     *    With a Byte Order Mark:
141
     *
142
     *    00 00 FE FF           UCS-4,    big-endian machine    (1234 order) 
143
     *    FF FE 00 00           UCS-4,    little-endian machine (4321 order) 
144
     *    00 00 FF FE           UCS-4,    unusual octet order   (2143) 
145
     *    FE FF 00 00           UCS-4,    unusual octet order   (3412) 
146
     *    FE FF ## ##           UTF-16,   big-endian 
147
     *    FF FE ## ##           UTF-16,   little-endian 
148
     *    EF BB BF              UTF-8 
149
     *
150
     ***/
151
152
    //
153
    //  We have at least four bytes, so we can check all BOM
154
    //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
155
    //
156
9.85k
    if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
157
4
        return UCS_4B;
158
9.85k
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
159
13
        return UCS_4L;
160
9.83k
    else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
161
46
        return UTF_16B;
162
9.79k
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
163
83
        return UTF_16L;
164
165
    //
166
    //  We have at least 4 bytes. So lets check the 4 byte sequences that
167
    //  indicate other UTF-16 and UCS encodings.
168
    //
169
9.70k
    if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
170
2.90k
    {
171
2.90k
        if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
172
22
            return UCS_4B;
173
2.87k
        else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
174
3
            return UCS_4L;
175
2.87k
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
176
37
            return UTF_16B;
177
2.83k
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
178
21
            return UTF_16L;
179
2.90k
    }
180
181
    //
182
    //  See if we have enough bytes to possibly match the EBCDIC prefix.
183
    //  If so, try it.
184
    //
185
9.62k
    if (rawByteCount > fgEBCDICPreLen)
186
9.62k
    {
187
9.62k
        if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
188
18
            return EBCDIC;
189
9.62k
    }
190
191
    //
192
    //  Does not seem to be anything we know, so go with UTF-8 to get at
193
    //  least through the first line and see what it really is.
194
    //
195
9.60k
    return UTF_8;
196
9.62k
}
197
198
199
XMLRecognizer::Encodings
200
XMLRecognizer::encodingForName(const XMLCh* const encName)
201
7.10k
{
202
    //
203
    //  Compare the passed string, assume input string is already uppercased,
204
    //  to the variations that we recognize.
205
    //
206
    //  !!NOTE: Note that we don't handle EBCDIC here because we don't handle
207
    //  that one ourselves. It is allowed to fall into 'other'.
208
    //
209
7.10k
    if (encName == XMLUni::fgXMLChEncodingString ||
210
7.10k
        !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString))
211
0
    {
212
0
        return XMLRecognizer::XERCES_XMLCH;
213
0
    }
214
7.10k
    else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString)
215
7.01k
         ||  !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2))
216
84
    {
217
84
        return XMLRecognizer::UTF_8;
218
84
    }
219
7.01k
    else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString)
220
6.94k
         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2)
221
6.94k
         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3)
222
6.87k
         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4))
223
138
    {
224
138
        return XMLRecognizer::US_ASCII;
225
138
    }
226
6.87k
    else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString)
227
6.87k
         ||  !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2))
228
8
    {
229
8
        return XMLRecognizer::UTF_16L;
230
8
    }
231
6.87k
    else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString)
232
6.87k
         ||  !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2))
233
24
    {
234
24
        return XMLRecognizer::UTF_16B;
235
24
    }
236
6.84k
    else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString))
237
0
    {
238
0
        return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L;
239
0
    }
240
6.84k
    else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString)
241
6.84k
         ||  !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2))
242
0
    {
243
0
        return XMLRecognizer::UCS_4L;
244
0
    }
245
6.84k
    else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString)
246
6.84k
         ||  !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2))
247
0
    {
248
0
        return XMLRecognizer::UCS_4B;
249
0
    }
250
6.84k
    else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString))
251
0
    {
252
0
        return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L;
253
0
    }
254
255
    // Return 'other' since we don't recognizer it
256
6.84k
    return XMLRecognizer::OtherEncoding;
257
7.10k
}
258
259
260
const XMLCh*
261
XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding
262
                               , MemoryManager* const manager)
263
426k
{
264
426k
    if (theEncoding >= Encodings_Count)
265
0
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager);
266
267
426k
    return gEncodingNameMap[theEncoding];
268
426k
}
269
270
XERCES_CPP_NAMESPACE_END