Coverage Report

Created: 2025-07-12 08:01

/src/tidy-html5/src/utf8.c
Line
Count
Source (jump to first uncovered line)
1
/* utf8.c -- convert characters to/from UTF-8
2
3
  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
  Uses public interfaces to abstract input source and output
7
  sink, which may be user supplied or either FILE* or memory
8
  based Tidy implementations.  Encoding support is uniform
9
  regardless of I/O mechanism.
10
11
  Note, UTF-8 encoding, by itself, does not affect the actual
12
  "codepoints" of the underlying character encoding.  In the
13
  cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 
14
  refer to ISO-10646 "codepoints".  For anything else, they
15
  refer to some other "codepoint" set.
16
17
  Put another way, UTF-8 is a variable length method to 
18
  represent any non-negative integer value.  The glyph 
19
  that an integer value represents is unchanged and defined
20
  externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
21
  Latin2-9, and so on).
22
23
  Put still another way, UTF-8 is more of a _transfer_ encoding
24
  than a _character_ encoding, per se.
25
*/
26
27
#include "tidy.h"
28
#include "forward.h"
29
#include "utf8.h"
30
31
/* 
32
UTF-8 encoding/decoding functions
33
Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
34
35
Also see below for UTF-16 encoding/decoding functions
36
37
References :
38
39
1) UCS Transformation Format 8 (UTF-8):
40
ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
41
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
42
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
43
44
Table 4 - Mapping from UCS-4 to UTF-8
45
46
2) Unicode standards:
47
<https://www.unicode.org/standard/standard.html>
48
49
3) Legal UTF-8 byte sequences:
50
<https://www.unicode.org/versions/corrigendum1.html>
51
52
Code point          1st byte    2nd byte    3rd byte    4th byte
53
----------          --------    --------    --------    --------
54
U+0000..U+007F      00..7F
55
U+0080..U+07FF      C2..DF      80..BF
56
U+0800..U+0FFF      E0          A0..BF      80..BF
57
U+1000..U+FFFF      E1..EF      80..BF      80..BF
58
U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
59
U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
60
U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF
61
62
The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
63
allows for the use of five- and six-byte sequences to encode
64
characters that are outside the range of the Unicode character
65
set; those five- and six-byte sequences are illegal for the use
66
of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
67
does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
68
(but it does allow other noncharacters).
69
70
4) RFC 2279: UTF-8, a transformation format of ISO 10646:
71
<http://www.ietf.org/rfc/rfc2279.txt>
72
73
5) UTF-8 and Unicode FAQ:
74
<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
75
76
6) Markus Kuhn's UTF-8 decoder stress test file:
77
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
78
79
7) UTF-8 Demo:
80
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
81
82
8) UTF-8 Sampler:
83
<http://www.columbia.edu/kermit/utf8.html>
84
85
9) Transformation Format for 16 Planes of Group 00 (UTF-16):
86
ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
87
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
88
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
89
90
10) RFC 2781: UTF-16, an encoding of ISO 10646:
91
<http://www.ietf.org/rfc/rfc2781.txt>
92
93
11) UTF-16 invalid surrogate pairs:
94
<https://www.unicode.org/faq/utf_bom.html#16>
95
96
UTF-16       UTF-8          UCS-4
97
D83F DFF*    F0 9F BF B*    0001FFF*
98
D87F DFF*    F0 AF BF B*    0002FFF*
99
D8BF DFF*    F0 BF BF B*    0003FFF*
100
D8FF DFF*    F1 8F BF B*    0004FFF*
101
D93F DFF*    F1 9F BF B*    0005FFF*
102
D97F DFF*    F1 AF BF B*    0006FFF*
103
                ...
104
DBBF DFF*    F3 BF BF B*    000FFFF*
105
DBFF DFF*    F4 8F BF B*    0010FFF*
106
107
* = E or F
108
                                   
109
1010  A
110
1011  B
111
1100  C
112
1101  D
113
1110  E
114
1111  F
115
116
*/
117
118
/* Dimensions of the validUTF8 table: the longest legal sequence,
** in bytes, and the number of rows.
*/
#define kMaxUTF8Bytes            4
#define kNumUTF8Sequences        7

/* Largest value accepted by the UTF-8 and UTF-16 code below. */
#define kMaxUTF8FromUCS4         0x10FFFF
#define kMaxUTF16FromUCS4        0x10FFFF

/* Values that the UTF-8 encoder and decoder refuse as characters. */
#define kUTF8NotAChar            0xFFFF
#define kUTF8ByteSwapNotAChar    0xFFFE

/* First value that is treated as needing a UTF-16 surrogate pair. */
#define kUTF16SurrogatesBegin    0x10000

/* UTF-16 surrogate pair areas.
**
** Naming note: in this file "low" means the D800..DBFF half and
** "high" means the DC00..DFFF half.  The Unicode standard uses the
** words the other way round (D800..DBFF are its high/leading
** surrogates), so read these names as local to this file.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
134
135
136
/* offsets into validUTF8 table below */
137
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
138
{
139
    0, /* 1 byte */
140
    1, /* 2 bytes */
141
    2, /* 3 bytes */
142
    4, /* 4 bytes */
143
    kNumUTF8Sequences /* must be last */
144
};
145
146
static const struct validUTF8Sequence
147
{
148
     uint lowChar;
149
     uint highChar;
150
     int  numBytes;
151
     byte validBytes[8];
152
} validUTF8[kNumUTF8Sequences] =
153
{
154
/*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
155
    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
156
    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
157
    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
158
    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
159
    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
160
    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
161
    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 
162
};
163
164
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
165
                                TidyInputSource* inp, int* count )
166
24.0M
{
167
24.0M
    byte tempbuf[10];
168
24.0M
    byte *buf = &tempbuf[0];
169
24.0M
    uint ch = 0, n = 0;
170
24.0M
    int i, bytes = 0;
171
24.0M
    Bool hasError = no;
172
    
173
24.0M
    if ( successorBytes )
174
31.4k
        buf = (byte*) successorBytes;
175
        
176
    /* special check if we have been passed an EOF char */
177
24.0M
    if ( firstByte == EndOfStream )
178
0
    {
179
        /* at present */
180
0
        *c = firstByte;
181
0
        *count = 1;
182
0
        return 0;
183
0
    }
184
185
24.0M
    ch = firstByte; /* first byte is passed in separately */
186
    
187
24.0M
    if (ch <= 0x7F) /* 0XXX XXXX one byte */
188
23.7M
    {
189
23.7M
        n = ch;
190
23.7M
        bytes = 1;
191
23.7M
    }
192
305k
    else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
193
15.1k
    {
194
15.1k
        n = ch & 31;
195
15.1k
        bytes = 2;
196
15.1k
    }
197
290k
    else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
198
54.9k
    {
199
54.9k
        n = ch & 15;
200
54.9k
        bytes = 3;
201
54.9k
    }
202
235k
    else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
203
8.79k
    {
204
8.79k
        n = ch & 7;
205
8.79k
        bytes = 4;
206
8.79k
    }
207
226k
    else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
208
1.20k
    {
209
1.20k
        n = ch & 3;
210
1.20k
        bytes = 5;
211
1.20k
        hasError = yes;
212
1.20k
    }
213
225k
    else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
214
198
    {
215
198
        n = ch & 1;
216
198
        bytes = 6;
217
198
        hasError = yes;
218
198
    }
219
225k
    else
220
225k
    {
221
        /* not a valid first byte of a UTF-8 sequence */
222
225k
        n = ch;
223
225k
        bytes = 1;
224
225k
        hasError = yes;
225
225k
    }
226
227
    /* successor bytes should have the form 10XX XXXX */
228
229
    /* If caller supplied buffer, use it.  Else see if caller
230
    ** supplied an input source, use that.
231
    */
232
24.0M
    if ( successorBytes )
233
31.4k
    {
234
81.9k
        for ( i=0; i < bytes-1; ++i )
235
53.3k
        {
236
53.3k
            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
237
2.80k
            {
238
2.80k
                hasError = yes;
239
2.80k
                bytes = i+1;
240
2.80k
                break;
241
2.80k
            }
242
50.5k
            n = (n << 6) | (buf[i] & 0x3F);
243
50.5k
        }
244
31.4k
    }
245
24.0M
    else if ( inp )
246
24.0M
    {
247
24.1M
        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
248
70.8k
        {
249
70.8k
            int b = inp->getByte( inp->sourceData );
250
70.8k
            buf[i] = (tmbchar) b;
251
252
            /* End of data or illegal successor byte value */
253
70.8k
            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
254
37.1k
            {
255
37.1k
                hasError = yes;
256
37.1k
                bytes = i+1;
257
37.1k
                if ( b != EOF )
258
37.1k
                    inp->ungetByte( inp->sourceData, buf[i] );
259
37.1k
                break;
260
37.1k
            }
261
33.7k
            n = (n << 6) | (buf[i] & 0x3F);
262
33.7k
        }
263
24.0M
    }
264
0
    else if ( bytes > 1 )
265
0
    {
266
0
        hasError = yes;
267
0
        bytes = 1;
268
0
    }
269
    
270
24.0M
    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
271
4
        hasError = yes;
272
        
273
24.0M
    if (!hasError && (n > kMaxUTF8FromUCS4))
274
14
        hasError = yes;
275
276
24.0M
    if (!hasError)
277
23.8M
    {
278
23.8M
        int lo, hi;
279
        
280
23.8M
        lo = offsetUTF8Sequences[bytes - 1];
281
23.8M
        hi = offsetUTF8Sequences[bytes] - 1;
282
        
283
        /* check for overlong sequences */
284
23.8M
        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
285
648
            hasError = yes;
286
23.8M
        else
287
23.8M
        {
288
23.8M
            hasError = yes; /* assume error until proven otherwise */
289
        
290
47.7M
            for (i = lo; i <= hi; i++)
291
23.8M
            {
292
23.8M
                int tempCount;
293
23.8M
                byte theByte;
294
                
295
47.8M
                for (tempCount = 0; tempCount < bytes; tempCount++)
296
23.9M
                {
297
23.9M
                    if (!tempCount)
298
23.8M
                        theByte = (tmbchar) firstByte;
299
94.6k
                    else
300
94.6k
                        theByte = buf[tempCount - 1];
301
                        
302
23.9M
                    if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
303
23.9M
                         theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
304
23.9M
                        hasError = no;
305
23.9M
                    if (hasError)
306
31.1k
                        break;
307
23.9M
                }
308
23.8M
            }
309
23.8M
        }
310
23.8M
    }
311
312
#if 1 && defined(_DEBUG)
313
    if ( hasError )
314
    {
315
       /* debug */
316
       fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
317
       fprintf( stderr, "0x%02x ", firstByte );
318
       for (i = 1; i < bytes; i++)
319
           fprintf( stderr, "0x%02x ", buf[i - 1] );
320
       fprintf( stderr, " = U+%04X\n", n );
321
    }
322
#endif
323
324
24.0M
    *count = bytes;
325
24.0M
    *c = n;
326
24.0M
    if ( hasError )
327
265k
        return -1;
328
23.8M
    return 0;
329
24.0M
}
330
331
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
332
                                TidyOutputSink* outp, int* count )
333
229M
{
334
229M
    byte tempbuf[10] = {0};
335
229M
    byte* buf = &tempbuf[0];
336
229M
    int bytes = 0;
337
229M
    Bool hasError = no;
338
    
339
229M
    if ( encodebuf )
340
125M
        buf = (byte*) encodebuf;
341
        
342
229M
    if (c <= 0x7F)  /* 0XXX XXXX one byte */
343
227M
    {
344
227M
        buf[0] = (tmbchar) c;
345
227M
        bytes = 1;
346
227M
    }
347
1.22M
    else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
348
2.12k
    {
349
2.12k
        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
350
2.12k
        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
351
2.12k
        bytes = 2;
352
2.12k
    }
353
1.22M
    else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
354
714k
    {
355
714k
        buf[0] = (tmbchar) (0xE0 | (c >> 12));
356
714k
        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
357
714k
        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
358
714k
        bytes = 3;
359
714k
        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
360
966
            hasError = yes;
361
714k
    }
362
511k
    else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
363
1.08k
    {
364
1.08k
        buf[0] = (tmbchar) (0xF0 | (c >> 18));
365
1.08k
        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
366
1.08k
        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
367
1.08k
        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
368
1.08k
        bytes = 4;
369
1.08k
        if (c > kMaxUTF8FromUCS4)
370
0
            hasError = yes;
371
1.08k
    }
372
510k
    else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
373
0
    {
374
0
        buf[0] = (tmbchar) (0xF8 | (c >> 24));
375
0
        buf[1] = (tmbchar) (0x80 | (c >> 18));
376
0
        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
377
0
        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
378
0
        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
379
0
        bytes = 5;
380
0
        hasError = yes;
381
0
    }
382
510k
    else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
383
0
    {
384
0
        buf[0] = (tmbchar) (0xFC | (c >> 30));
385
0
        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
386
0
        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
387
0
        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
388
0
        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
389
0
        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
390
0
        bytes = 6;
391
0
        hasError = yes;
392
0
    }
393
510k
    else
394
510k
        hasError = yes;
395
        
396
    /* don't output invalid UTF-8 byte sequence to a stream */
397
229M
    if ( !hasError && outp != NULL )
398
103M
    {
399
103M
        int ix;
400
207M
        for ( ix=0; ix < bytes; ++ix )
401
103M
          outp->putByte( outp->sinkData, buf[ix] );
402
103M
    }
403
404
#if 1 && defined(_DEBUG)
405
    if ( hasError )
406
    {
407
        int i;
408
        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
409
        for (i = 0; i < bytes; i++)
410
            fprintf( stderr, "0x%02x ", buf[i] );
411
        fprintf( stderr, "\n" );
412
    }
413
#endif
414
    
415
229M
    *count = bytes;
416
229M
    if (hasError)
417
511k
        return -1;
418
228M
    return 0;
419
229M
}
420
421
422
/* return one less than the number of bytes used by the UTF-8 byte sequence */
423
/* str points to the UTF-8 byte sequence */
424
/* the Unicode char is returned in *ch */
425
uint TY_(GetUTF8)( ctmbstr str, uint *ch )
426
31.4k
{
427
31.4k
    uint n;
428
31.4k
    int bytes;
429
430
31.4k
    int err;
431
    
432
31.4k
    bytes = 0;
433
    
434
    /* first byte "str[0]" is passed in separately from the */
435
    /* rest of the UTF-8 byte sequence starting at "str[1]" */
436
31.4k
    err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
437
31.4k
    if (err)
438
5.09k
    {
439
#if 1 && defined(_DEBUG)
440
        fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
441
#endif
442
5.09k
        n = 0xFFFD; /* replacement char */
443
5.09k
    }
444
445
31.4k
    *ch = n;
446
31.4k
    return bytes - 1;
447
31.4k
}
448
449
/* store char c as UTF-8 encoded byte stream */
450
tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
451
0
{
452
0
    int err, count = 0;
453
        
454
0
    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
455
0
    if (err)
456
0
    {
457
#if 1 && defined(_DEBUG)
458
        fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
459
#endif
460
        /* replacement char 0xFFFD encoded as UTF-8 */
461
0
        buf[0] = (byte) 0xEF;
462
0
        buf[1] = (byte) 0xBF;
463
0
        buf[2] = (byte) 0xBD;
464
0
        count = 3;
465
0
    }
466
    
467
0
    buf += count;
468
0
    return buf;
469
0
}
470
471
/* True when ucs4 does not exceed kMaxUTF16FromUCS4. */
Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
{
    const Bool withinLimit = ( ucs4 <= kMaxUTF16FromUCS4 );

    return withinLimit;
}
475
476
/* True when ch lies in kUTF16HighSurrogateBegin..kUTF16HighSurrogateEnd,
** both ends included.
*/
Bool    TY_(IsHighSurrogate)( tchar ch )
{
    /* "not outside the range" is the same as "inside the range" */
    return !( ch < kUTF16HighSurrogateBegin || ch > kUTF16HighSurrogateEnd );
}
480
/* True when ch lies in kUTF16LowSurrogateBegin..kUTF16LowSurrogateEnd,
** both ends included.
*/
Bool    TY_(IsLowSurrogate)( tchar ch )
{
    /* "not outside the range" is the same as "inside the range" */
    return !( ch < kUTF16LowSurrogateBegin || ch > kUTF16LowSurrogateEnd );
}
484
485
/* Merge the two halves of a surrogate pair into a single value.
**
** The "low" half supplies the upper ten bits and the "high" half the
** lower ten bits; 0x10000 is then added.  Both halves are checked
** with assert() only.
*/
tchar   TY_(CombineSurrogatePair)( tchar high, tchar low )
{
    tchar upperPart;
    tchar lowerPart;

    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

    upperPart = (low - kUTF16LowSurrogateBegin) * 0x400;
    lowerPart = high - kUTF16HighSurrogateBegin;

    return upperPart + lowerPart + 0x10000;
}
491
492
/* Split utf16 into the two halves of a surrogate pair.
**
** Nothing is stored, and a false value is returned, unless
** IsValidCombinedChar accepts utf16 and both low and high are
** non-NULL.  Otherwise *low and *high receive the two halves and
** a true value is returned.
*/
Bool   TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
{
    const Bool ok = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
    tchar offset;

    if ( !ok )
        return ok;

    /* distance above the first value that needs a pair */
    offset = utf16 - kUTF16SurrogatesBegin;

    *low  = offset / 0x400 + kUTF16LowSurrogateBegin;
    *high = offset % 0x400 + kUTF16HighSurrogateBegin;

    return ok;
}
502
503
/* True when ch can be represented as a UTF-16 surrogate pair:
** it lies in kUTF16SurrogatesBegin..kMaxUTF16FromUCS4 and its low
** sixteen bits are neither 0xFFFE nor 0xFFFF.
**
** The upper bound matters: ten bits per half cannot hold a larger
** value, so splitting one would yield halves outside the surrogate
** ranges.
*/
Bool    TY_(IsValidCombinedChar)( tchar ch )
{
    /* Masking with 0xFFFE ignores the lowest bit, so this one test
    ** rejects both the ...FFFE and the ...FFFF endings.
    */
    return ( ch >= kUTF16SurrogatesBegin &&
             ch <= kMaxUTF16FromUCS4 &&
             (ch & 0x0000FFFE) != 0x0000FFFE );
}
509
510
/* True when ch is at or above kUTF16SurrogatesBegin.  No upper
** limit is applied here.
*/
Bool    TY_(IsCombinedChar)( tchar ch )
{
    return !( ch < kUTF16SurrogatesBegin );
}
514
515
/*
516
 * local variables:
517
 * mode: c
518
 * indent-tabs-mode: nil
519
 * c-basic-offset: 4
520
 * eval: (c-set-offset 'substatement-open 0)
521
 * end:
522
 */