/src/tidy-html5/src/utf8.c

Source (jump to first uncovered line)
/* utf8.c -- convert characters to/from UTF-8

  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  Uses public interfaces to abstract input source and output
  sink, which may be user supplied or either FILE* or memory
  based Tidy implementations.  Encoding support is uniform
  regardless of I/O mechanism.

  Note, UTF-8 encoding, by itself, does not affect the actual
  "codepoints" of the underlying character encoding.  In the
  cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 
  refer to ISO-10646 "codepoints".  For anything else, they
  refer to some other "codepoint" set.

  Put another way, UTF-8 is a variable length method to 
  represent any non-negative integer value.  The glyph 
  that an integer value represents is unchanged and defined
  externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
  Latin2-9, and so on).

  Put still another way, UTF-8 is more of a _transfer_ encoding
  than a _character_ encoding, per se.
*/

#include "tidy.h"
#include "forward.h"
#include "utf8.h"

/* 
UTF-8 encoding/decoding functions
Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence

Also see below for UTF-16 encoding/decoding functions

References :

1) UCS Transformation Format 8 (UTF-8):
ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>

Table 4 - Mapping from UCS-4 to UTF-8

2) Unicode standards:
<https://www.unicode.org/standard/standard.html>

3) Legal UTF-8 byte sequences:
<https://www.unicode.org/versions/corrigendum1.html>

Code point          1st byte    2nd byte    3rd byte    4th byte
----------          --------    --------    --------    --------
U+0000..U+007F      00..7F
U+0080..U+07FF      C2..DF      80..BF
U+0800..U+0FFF      E0          A0..BF      80..BF
U+1000..U+FFFF      E1..EF      80..BF      80..BF
U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF

The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
allows for the use of five- and six-byte sequences to encode
characters that are outside the range of the Unicode character
set; those five- and six-byte sequences are illegal for the use
of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
(but it does allow other noncharacters).

4) RFC 2279: UTF-8, a transformation format of ISO 10646:
<http://www.ietf.org/rfc/rfc2279.txt>

5) UTF-8 and Unicode FAQ:
<http://www.cl.cam.ac.uk/~mgk25/unicode.html>

6) Markus Kuhn's UTF-8 decoder stress test file:
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>

7) UTF-8 Demo:
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>

8) UTF-8 Sampler:
<http://www.columbia.edu/kermit/utf8.html>

9) Transformation Format for 16 Planes of Group 00 (UTF-16):
ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>

10) RFC 2781: UTF-16, an encoding of ISO 10646:
<http://www.ietf.org/rfc/rfc2781.txt>

11) UTF-16 invalid surrogate pairs:
<https://www.unicode.org/faq/utf_bom.html#16>

UTF-16       UTF-8          UCS-4
D83F DFF*    F0 9F BF B*    0001FFF*
D87F DFF*    F0 AF BF B*    0002FFF*
D8BF DFF*    F0 BF BF B*    0003FFF*
D8FF DFF*    F1 8F BF B*    0004FFF*
D93F DFF*    F1 9F BF B*    0005FFF*
D97F DFF*    F1 AF BF B*    0006FFF*
                ...
DBBF DFF*    F3 BF BF B*    000FFFF*
DBFF DFF*    F4 8F BF B*    0010FFF*

* = E or F
                                   
1010  A
1011  B
1100  C
1101  D
1110  E
1111  F

*/

/* Number of rows in the validUTF8 table and the longest sequence
** length (in bytes) that the table describes.
*/
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values that the decoder and encoder in this file flag as errors */
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value the decoder/encoder accept without flagging an error */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values at or above kUTF16SurrogatesBegin are the ones split into a
** surrogate pair; kMaxUTF16FromUCS4 is the ceiling used by
** IsValidUTF16FromUCS4.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas */
/* In this file "low" names the range 0xD800..0xDBFF (the half that
** carries the upper ten bits, see CombineSurrogatePair) and "high"
** names 0xDC00..0xDFFF.
** NOTE(review): the Unicode standard labels these ranges the other
** way round (0xD800..0xDBFF are its "high" surrogates) - keep that
** in mind when comparing against external references.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF


/* Offsets into the validUTF8 table below.
**
** For a sequence of k bytes (1 <= k <= kMaxUTF8Bytes) the decoder
** reads offsetUTF8Sequences[k - 1] as the index of the first matching
** table row and offsetUTF8Sequences[k] - 1 as the index of the last
** one, which is why a terminating entry equal to the row count is
** required.
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
    0, /* 1 byte */
    1, /* 2 bytes */
    2, /* 3 bytes */
    4, /* 4 bytes */
    kNumUTF8Sequences /* must be last */
};

/* Table of legal UTF-8 byte sequences, one row per code point range.
**
** lowChar / highChar - inclusive range of decoded values for the row
** numBytes           - sequence length in bytes (not read by the code
**                      in this file; rows are located through
**                      offsetUTF8Sequences instead)
** validBytes         - inclusive (min, max) pairs, one pair per byte
**                      position: validBytes[2*k] is the minimum and
**                      validBytes[2*k + 1] the maximum for byte k;
**                      unused positions are zero
*/
static const struct validUTF8Sequence
{
     uint lowChar;
     uint highChar;
     int  numBytes;
     byte validBytes[8];
} validUTF8[kNumUTF8Sequences] =
{
/*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 
};

/* Decode one UTF-8 byte sequence into a single value.
**
** c              - receives the decoded value; on error it receives
**                  whatever had been accumulated when the problem was
**                  detected
** firstByte      - lead byte of the sequence; EndOfStream is passed
**                  straight through as the result with a count of 1
** successorBytes - optional buffer holding the bytes that follow the
**                  lead byte; a NUL byte ends the sequence early
** inp            - optional input source, consulted only when
**                  successorBytes is NULL
** count          - receives the lead byte plus the number of successor
**                  bytes accepted; when a sequence is cut short this is
**                  the number of bytes actually accepted, not the
**                  length announced by the lead byte
**
** Returns 0 on success and -1 for an illegal sequence: bad lead byte,
** five- and six-byte forms, missing or malformed successor bytes,
** overlong encodings, U+FFFE, U+FFFF and values above kMaxUTF8FromUCS4.
** Values in the surrogate range 0xD800..0xDFFF are not rejected here.
*/
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
                                TidyInputSource* inp, int* count )
{
    byte tempbuf[10] = {0};
    byte *buf = &tempbuf[0];
    uint ch = 0, n = 0;
    int i, bytes = 0;
    Bool hasError = no;
    
    if ( successorBytes )
        buf = (byte*) successorBytes;
        
    /* special check if we have been passed an EOF char */
    if ( firstByte == EndOfStream )
    {
        /* at present */
        *c = firstByte;
        *count = 1;
        return 0;
    }

    ch = firstByte; /* first byte is passed in separately */
    
    /* The lead byte announces the sequence length and contributes
    ** the most significant payload bits.
    */
    if (ch <= 0x7F) /* 0XXX XXXX one byte */
    {
        n = ch;
        bytes = 1;
    }
    else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
    {
        n = ch & 31;
        bytes = 2;
    }
    else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
    {
        n = ch & 15;
        bytes = 3;
    }
    else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
    {
        n = ch & 7;
        bytes = 4;
    }
    else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
    {
        /* still consumed below, but never accepted */
        n = ch & 3;
        bytes = 5;
        hasError = yes;
    }
    else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
    {
        /* still consumed below, but never accepted */
        n = ch & 1;
        bytes = 6;
        hasError = yes;
    }
    else
    {
        /* not a valid first byte of a UTF-8 sequence */
        n = ch;
        bytes = 1;
        hasError = yes;
    }

    /* successor bytes should have the form 10XX XXXX */

    /* If caller supplied buffer, use it.  Else see if caller
    ** supplied an input source, use that.
    */
    if ( successorBytes )
    {
        for ( i=0; i < bytes-1; ++i )
        {
            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                bytes = i+1;
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }
    }
    else if ( inp )
    {
        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
        {
            int b = inp->getByte( inp->sourceData );
            buf[i] = (tmbchar) b;

            /* End of data or illegal successor byte value */
            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                bytes = i+1;
                /* leave the offending byte for the next read */
                if ( b != EOF )
                    inp->ungetByte( inp->sourceData, buf[i] );
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }

        /* The source reported end of input before the sequence was
        ** complete.  Flag it here and report only the bytes actually
        ** consumed.  (After a break above, bytes-1 == i, so this
        ** test is false.)
        */
        if ( i < bytes-1 )
        {
            hasError = yes;
            bytes = i+1;
        }
    }
    else if ( bytes > 1 )
    {
        /* multi-byte lead byte but nowhere to fetch successors from */
        hasError = yes;
        bytes = 1;
    }
    
    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
        hasError = yes;
        
    if (!hasError && (n > kMaxUTF8FromUCS4))
        hasError = yes;

    if (!hasError)
    {
        int lo, hi;
        
        /* rows of validUTF8 that describe sequences of this length */
        lo = offsetUTF8Sequences[bytes - 1];
        hi = offsetUTF8Sequences[bytes] - 1;
        
        /* check for overlong sequences */
        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
            hasError = yes;
        else
        {
            hasError = yes; /* assume error until proven otherwise */
        
            /* The sequence is legal when at least one candidate row
            ** accepts every one of its bytes.
            */
            for (i = lo; hasError && i <= hi; i++)
            {
                int tempCount;
                Bool rowMatches = yes;
                
                for (tempCount = 0; tempCount < bytes; tempCount++)
                {
                    byte theByte;

                    if (!tempCount)
                        theByte = (byte) firstByte;
                    else
                        theByte = buf[tempCount - 1];
                        
                    if ( theByte < validUTF8[i].validBytes[(tempCount * 2)] ||
                         theByte > validUTF8[i].validBytes[(tempCount * 2) + 1] )
                    {
                        rowMatches = no;
                        break;
                    }
                }

                if (rowMatches)
                    hasError = no;
            }
        }
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
       /* debug */
       fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
       fprintf( stderr, "0x%02x ", firstByte );
       for (i = 1; i < bytes; i++)
           fprintf( stderr, "0x%02x ", buf[i - 1] );
       fprintf( stderr, " = U+%04X\n", n );
    }
#endif

    *count = bytes;
    *c = n;
    if ( hasError )
        return -1;
    return 0;
}

/* Encode one value as a UTF-8 byte sequence.
**
** c         - value to encode
** encodebuf - optional destination buffer; when NULL a local scratch
**             buffer is used.  Bytes are stored even when the result
**             is flagged as an error, and the six-byte form stores
**             six bytes, so the buffer must have room for that many.
** outp      - optional output sink; it receives the bytes only when
**             no error was detected
** count     - receives the number of bytes produced (0 when c is
**             above 0x7FFFFFFF)
**
** Returns 0 on success and -1 when c is U+FFFE, U+FFFF or greater
** than kMaxUTF8FromUCS4.  Values in the surrogate range
** 0xD800..0xDFFF are not rejected here.
*/
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
                                TidyOutputSink* outp, int* count )
{
    byte tempbuf[10] = {0};
    byte* buf = &tempbuf[0];
    int bytes = 0;
    Bool hasError = no;
    
    if ( encodebuf )
        buf = (byte*) encodebuf;
        
    if (c <= 0x7F)  /* 0XXX XXXX one byte */
    {
        buf[0] = (tmbchar) c;
        bytes = 1;
    }
    else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
    {
        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
        bytes = 2;
    }
    else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
    {
        buf[0] = (tmbchar) (0xE0 | (c >> 12));
        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 3;
        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
            hasError = yes;
    }
    else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
    {
        buf[0] = (tmbchar) (0xF0 | (c >> 18));
        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 4;
        if (c > kMaxUTF8FromUCS4)
            hasError = yes;
    }
    else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
    {
        buf[0] = (tmbchar) (0xF8 | (c >> 24));
        /* mask to six payload bits, like every other successor byte;
        ** without it bits 24..25 of c leak into this byte
        */
        buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 5;
        hasError = yes;
    }
    else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
    {
        buf[0] = (tmbchar) (0xFC | (c >> 30));
        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 6;
        hasError = yes;
    }
    else
        hasError = yes; /* no encoding at all: bytes stays 0 */
        
    /* don't output invalid UTF-8 byte sequence to a stream */
    if ( !hasError && outp != NULL )
    {
        int ix;
        for ( ix=0; ix < bytes; ++ix )
          outp->putByte( outp->sinkData, buf[ix] );
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
        int i;
        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
        for (i = 0; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i] );
        fprintf( stderr, "\n" );
    }
#endif
    
    *count = bytes;
    if (hasError)
        return -1;
    return 0;
}


/* Decode the UTF-8 byte sequence that starts at str.
**
** str - points to the first byte of the sequence
** ch  - receives the decoded value, or the replacement character
**       0xFFFD when the sequence is illegal
**
** Returns one less than the number of bytes used by the sequence.
*/
uint TY_(GetUTF8)( ctmbstr str, uint *ch )
{
    uint n;
    int bytes = 0;
    int err;
    
    /* first byte "str[0]" is passed in separately from the */
    /* rest of the UTF-8 byte sequence starting at "str[1]" */
    /* The cast keeps the lead byte in the range 0..255.  Where plain */
    /* char is signed, an uncast byte 0xFF widens to an all-ones */
    /* value that the decoder could mistake for its EndOfStream */
    /* sentinel and hand back as a successfully decoded character. */
    err = TY_(DecodeUTF8BytesToChar)( &n, (unsigned char) str[0], str+1,
                                      NULL, &bytes );
    if (err)
    {
#if 1 && defined(_DEBUG)
        fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
#endif
        n = 0xFFFD; /* replacement char */
    }

    *ch = n;
    return bytes - 1;
}

/* Write c into buf as UTF-8 and return the position just past the
** bytes written.  When c cannot be encoded, the three bytes of the
** replacement character 0xFFFD are stored instead.
*/
tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
{
    int written = 0;

    if ( TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &written ) != 0 )
    {
#if 1 && defined(_DEBUG)
        fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
#endif
        /* EF BF BD is the UTF-8 form of 0xFFFD */
        buf[0] = (byte) 0xEF;
        buf[1] = (byte) 0xBF;
        buf[2] = (byte) 0xBD;
        written = 3;
    }

    return buf + written;
}

/* True when ucs4 does not exceed kMaxUTF16FromUCS4. */
Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
{
    /* anything above the ceiling is rejected */
    return !( ucs4 > kMaxUTF16FromUCS4 );
}

/* True when ch lies in the inclusive range
** kUTF16HighSurrogateBegin..kUTF16HighSurrogateEnd.
*/
Bool    TY_(IsHighSurrogate)( tchar ch )
{
    /* reject anything on either side of the range */
    return !( ch < kUTF16HighSurrogateBegin || ch > kUTF16HighSurrogateEnd );
}
/* True when ch lies in the inclusive range
** kUTF16LowSurrogateBegin..kUTF16LowSurrogateEnd.
*/
Bool    TY_(IsLowSurrogate)( tchar ch )
{
    /* reject anything on either side of the range */
    return !( ch < kUTF16LowSurrogateBegin || ch > kUTF16LowSurrogateEnd );
}

/* Merge a surrogate pair into a single value.  "low" supplies the
** upper ten bits and "high" the lower ten bits; 0x10000 is then added.
** Both arguments are asserted to lie in their respective ranges.
*/
tchar   TY_(CombineSurrogatePair)( tchar high, tchar low )
{
    tchar upperBits, lowerBits;

    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

    upperBits = (low - kUTF16LowSurrogateBegin) * 0x400;
    lowerBits = high - kUTF16HighSurrogateBegin;

    return ( upperBits + lowerBits + 0x10000 );
}

/* Split a value into its surrogate pair.
**
** utf16 - value to split; it must satisfy IsValidCombinedChar and
**         must not exceed kMaxUTF16FromUCS4
** low   - receives the half carrying the upper ten bits
**         (kUTF16LowSurrogateBegin based)
** high  - receives the half carrying the lower ten bits
**         (kUTF16HighSurrogateBegin based)
**
** Returns a true value and fills *low / *high on success.  Returns a
** false value and leaves both untouched when utf16 is out of range or
** either pointer is NULL.
*/
Bool   TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
{
    /* The upper bound matters: a value above kMaxUTF16FromUCS4 would
    ** push *low past kUTF16LowSurrogateEnd, yielding a pair that is
    ** not made of surrogates at all.
    */
    Bool status = ( TY_(IsValidCombinedChar)( utf16 ) &&
                    utf16 <= kMaxUTF16FromUCS4 &&
                    high && low );
    if ( status )
    {
        *low  = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
        *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
    }
    return status;
}

/* True when ch is a value that a surrogate pair can legitimately
** stand for: inside kUTF16SurrogatesBegin..kMaxUTF16FromUCS4 and
** with its low sixteen bits not equal to 0xFFFE or 0xFFFF.
*/
Bool    TY_(IsValidCombinedChar)( tchar ch )
{
    /* Clearing bit 0 before comparing covers both 0xFFFE and 0xFFFF
    ** in the low sixteen bits, so one test handles both endings.
    */
    return ( ch >= kUTF16SurrogatesBegin &&
             ch <= kMaxUTF16FromUCS4 &&
             (ch & 0x0000FFFE) != 0x0000FFFE );
}

/* True when ch is at or beyond kUTF16SurrogatesBegin, i.e. it is one
** of the values this file represents with a surrogate pair.
*/
Bool    TY_(IsCombinedChar)( tchar ch )
{
    /* everything below the threshold is not a combined char */
    return !( ch < kUTF16SurrogatesBegin );
}

/*
 * local variables:
 * mode: c
 * indent-tabs-mode: nil
 * c-basic-offset: 4
 * eval: (c-set-offset 'substatement-open 0)
 * end:
 */

Coverage Report

Created: 2025-07-12 08:01

Line	Count	Source (jump to first uncovered line)
1		/* utf8.c -- convert characters to/from UTF-8
2
3		(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4		See tidy.h for the copyright notice.
5
6		Uses public interfaces to abstract input source and output
7		sink, which may be user supplied or either FILE* or memory
8		based Tidy implementations. Encoding support is uniform
9		regardless of I/O mechanism.
10
11		Note, UTF-8 encoding, by itself, does not affect the actual
12		"codepoints" of the underlying character encoding. In the
13		cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
14		refer to ISO-10646 "codepoints". For anything else, they
15		refer to some other "codepoint" set.
16
17		Put another way, UTF-8 is a variable length method to
18		represent any non-negative integer value. The glyph
19		that an integer value represents is unchanged and defined
20		externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
21		Latin2-9, and so on).
22
23		Put still another way, UTF-8 is more of a _transfer_ encoding
24		than a _character_ encoding, per se.
25		*/
26
27		#include "tidy.h"
28		#include "forward.h"
29		#include "utf8.h"
30
31		/*
32		UTF-8 encoding/decoding functions
33		Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
34
35		Also see below for UTF-16 encoding/decoding functions
36
37		References :
38
39		1) UCS Transformation Format 8 (UTF-8):
40		ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
41		<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
42		<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
43
44		Table 4 - Mapping from UCS-4 to UTF-8
45
46		2) Unicode standards:
47		<https://www.unicode.org/standard/standard.html>
48
49		3) Legal UTF-8 byte sequences:
50		<https://www.unicode.org/versions/corrigendum1.html>
51
52		Code point 1st byte 2nd byte 3rd byte 4th byte
53		---------- -------- -------- -------- --------
54		U+0000..U+007F 00..7F
55		U+0080..U+07FF C2..DF 80..BF
56		U+0800..U+0FFF E0 A0..BF 80..BF
57		U+1000..U+FFFF E1..EF 80..BF 80..BF
58		U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
59		U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
60		U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
61
62		The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
63		allows for the use of five- and six-byte sequences to encode
64		characters that are outside the range of the Unicode character
65		set; those five- and six-byte sequences are illegal for the use
66		of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
67		does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
68		(but it does allow other noncharacters).
69
70		4) RFC 2279: UTF-8, a transformation format of ISO 10646:
71		<http://www.ietf.org/rfc/rfc2279.txt>
72
73		5) UTF-8 and Unicode FAQ:
74		<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
75
76		6) Markus Kuhn's UTF-8 decoder stress test file:
77		<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
78
79		7) UTF-8 Demo:
80		<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
81
82		8) UTF-8 Sampler:
83		<http://www.columbia.edu/kermit/utf8.html>
84
85		9) Transformation Format for 16 Planes of Group 00 (UTF-16):
86		ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
87		<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
88		<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
89
90		10) RFC 2781: UTF-16, an encoding of ISO 10646:
91		<http://www.ietf.org/rfc/rfc2781.txt>
92
93		11) UTF-16 invalid surrogate pairs:
94		<https://www.unicode.org/faq/utf_bom.html#16>
95
96		UTF-16 UTF-8 UCS-4
97		D83F DFF* F0 9F BF B* 0001FFF*
98		D87F DFF* F0 AF BF B* 0002FFF*
99		D8BF DFF* F0 BF BF B* 0003FFF*
100		D8FF DFF* F1 8F BF B* 0004FFF*
101		D93F DFF* F1 9F BF B* 0005FFF*
102		D97F DFF* F1 AF BF B* 0006FFF*
103		...
104		DBBF DFF* F3 BF BF B* 000FFFF*
105		DBFF DFF* F4 8F BF B* 0010FFF*
106
107		* = E or F
108
109		1010 A
110		1011 B
111		1100 C
112		1101 D
113		1110 E
114		1111 F
115
116		*/
117
118		#define kNumUTF8Sequences 7
119		#define kMaxUTF8Bytes 4
120
121	25.2M	#define kUTF8ByteSwapNotAChar 0xFFFE
122	24.5M	#define kUTF8NotAChar 0xFFFF
123
124	23.8M	#define kMaxUTF8FromUCS4 0x10FFFF
125
126	18	#define kUTF16SurrogatesBegin 0x10000
127	413k	#define kMaxUTF16FromUCS4 0x10FFFF
128
129		/* UTF-16 surrogate pair areas */
130	828k	#define kUTF16LowSurrogateBegin 0xD800
131	1.89k	#define kUTF16LowSurrogateEnd 0xDBFF
132	687	#define kUTF16HighSurrogateBegin 0xDC00
133	24	#define kUTF16HighSurrogateEnd 0xDFFF
134
135
136		/* offsets into validUTF8 table below */
137		static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
138		{
139		0, /* 1 byte */
140		1, /* 2 bytes */
141		2, /* 3 bytes */
142		4, /* 4 bytes */
143		kNumUTF8Sequences /* must be last */
144		};
145
146		static const struct validUTF8Sequence
147		{
148		uint lowChar;
149		uint highChar;
150		int numBytes;
151		byte validBytes[8];
152		} validUTF8[kNumUTF8Sequences] =
153		{
154		/* low high #bytes byte 1 byte 2 byte 3 byte 4 */
155		{0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
156		{0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
157		{0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
158		{0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
159		{0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
160		{0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
161		{0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
162		};
163
164		int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
165		TidyInputSource* inp, int* count )
166	24.0M	{
167	24.0M	byte tempbuf[10];
168	24.0M	byte *buf = &tempbuf[0];
169	24.0M	uint ch = 0, n = 0;
170	24.0M	int i, bytes = 0;
171	24.0M	Bool hasError = no;
172
173	24.0M	if ( successorBytes )
174	31.4k	buf = (byte*) successorBytes;
175
176		/* special check if we have been passed an EOF char */
177	24.0M	if ( firstByte == EndOfStream )
178	0	{
179		/* at present */
180	0	*c = firstByte;
181	0	*count = 1;
182	0	return 0;
183	0	}
184
185	24.0M	ch = firstByte; /* first byte is passed in separately */
186
187	24.0M	if (ch <= 0x7F) /* 0XXX XXXX one byte */
188	23.7M	{
189	23.7M	n = ch;
190	23.7M	bytes = 1;
191	23.7M	}
192	305k	else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
193	15.1k	{
194	15.1k	n = ch & 31;
195	15.1k	bytes = 2;
196	15.1k	}
197	290k	else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
198	54.9k	{
199	54.9k	n = ch & 15;
200	54.9k	bytes = 3;
201	54.9k	}
202	235k	else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
203	8.79k	{
204	8.79k	n = ch & 7;
205	8.79k	bytes = 4;
206	8.79k	}
207	226k	else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
208	1.20k	{
209	1.20k	n = ch & 3;
210	1.20k	bytes = 5;
211	1.20k	hasError = yes;
212	1.20k	}
213	225k	else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
214	198	{
215	198	n = ch & 1;
216	198	bytes = 6;
217	198	hasError = yes;
218	198	}
219	225k	else
220	225k	{
221		/* not a valid first byte of a UTF-8 sequence */
222	225k	n = ch;
223	225k	bytes = 1;
224	225k	hasError = yes;
225	225k	}
226
227		/* successor bytes should have the form 10XX XXXX */
228
229		/* If caller supplied buffer, use it. Else see if caller
230		** supplied an input source, use that.
231		*/
232	24.0M	if ( successorBytes )
233	31.4k	{
234	81.9k	for ( i=0; i < bytes-1; ++i )
235	53.3k	{
236	53.3k	if ( !buf[i] \|\| (buf[i] & 0xC0) != 0x80 )
237	2.80k	{
238	2.80k	hasError = yes;
239	2.80k	bytes = i+1;
240	2.80k	break;
241	2.80k	}
242	50.5k	n = (n << 6) \| (buf[i] & 0x3F);
243	50.5k	}
244	31.4k	}
245	24.0M	else if ( inp )
246	24.0M	{
247	24.1M	for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
248	70.8k	{
249	70.8k	int b = inp->getByte( inp->sourceData );
250	70.8k	buf[i] = (tmbchar) b;
251
252		/* End of data or illegal successor byte value */
253	70.8k	if ( b == EOF \|\| (buf[i] & 0xC0) != 0x80 )
254	37.1k	{
255	37.1k	hasError = yes;
256	37.1k	bytes = i+1;
257	37.1k	if ( b != EOF )
258	37.1k	inp->ungetByte( inp->sourceData, buf[i] );
259	37.1k	break;
260	37.1k	}
261	33.7k	n = (n << 6) \| (buf[i] & 0x3F);
262	33.7k	}
263	24.0M	}
264	0	else if ( bytes > 1 )
265	0	{
266	0	hasError = yes;
267	0	bytes = 1;
268	0	}
269
270	24.0M	if (!hasError && ((n == kUTF8ByteSwapNotAChar) \|\| (n == kUTF8NotAChar)))
271	4	hasError = yes;
272
273	24.0M	if (!hasError && (n > kMaxUTF8FromUCS4))
274	14	hasError = yes;
275
276	24.0M	if (!hasError)
277	23.8M	{
278	23.8M	int lo, hi;
279
280	23.8M	lo = offsetUTF8Sequences[bytes - 1];
281	23.8M	hi = offsetUTF8Sequences[bytes] - 1;
282
283		/* check for overlong sequences */
284	23.8M	if ((n < validUTF8[lo].lowChar) \|\| (n > validUTF8[hi].highChar))
285	648	hasError = yes;
286	23.8M	else
287	23.8M	{
288	23.8M	hasError = yes; /* assume error until proven otherwise */
289
290	47.7M	for (i = lo; i <= hi; i++)
291	23.8M	{
292	23.8M	int tempCount;
293	23.8M	byte theByte;
294
295	47.8M	for (tempCount = 0; tempCount < bytes; tempCount++)
296	23.9M	{
297	23.9M	if (!tempCount)
298	23.8M	theByte = (tmbchar) firstByte;
299	94.6k	else
300	94.6k	theByte = buf[tempCount - 1];
301
302	23.9M	if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
303	23.9M	theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
304	23.9M	hasError = no;
305	23.9M	if (hasError)
306	31.1k	break;
307	23.9M	}
308	23.8M	}
309	23.8M	}
310	23.8M	}
311
312		#if 1 && defined(_DEBUG)
313		if ( hasError )
314		{
315		/* debug */
316		fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
317		fprintf( stderr, "0x%02x ", firstByte );
318		for (i = 1; i < bytes; i++)
319		fprintf( stderr, "0x%02x ", buf[i - 1] );
320		fprintf( stderr, " = U+%04X\n", n );
321		}
322		#endif
323
324	24.0M	*count = bytes;
325	24.0M	*c = n;
326	24.0M	if ( hasError )
327	265k	return -1;
328	23.8M	return 0;
329	24.0M	}
330
331		int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
332		TidyOutputSink* outp, int* count )
333	229M	{
334	229M	byte tempbuf[10] = {0};
335	229M	byte* buf = &tempbuf[0];
336	229M	int bytes = 0;
337	229M	Bool hasError = no;
338
339	229M	if ( encodebuf )
340	125M	buf = (byte*) encodebuf;
341
342	229M	if (c <= 0x7F) /* 0XXX XXXX one byte */
343	227M	{
344	227M	buf[0] = (tmbchar) c;
345	227M	bytes = 1;
346	227M	}
347	1.22M	else if (c <= 0x7FF) /* 110X XXXX two bytes */
348	2.12k	{
349	2.12k	buf[0] = (tmbchar) ( 0xC0 \| (c >> 6) );
350	2.12k	buf[1] = (tmbchar) ( 0x80 \| (c & 0x3F) );
351	2.12k	bytes = 2;
352	2.12k	}
353	1.22M	else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
354	714k	{
355	714k	buf[0] = (tmbchar) (0xE0 \| (c >> 12));
356	714k	buf[1] = (tmbchar) (0x80 \| ((c >> 6) & 0x3F));
357	714k	buf[2] = (tmbchar) (0x80 \| (c & 0x3F));
358	714k	bytes = 3;
359	714k	if ( c == kUTF8ByteSwapNotAChar \|\| c == kUTF8NotAChar )
360	966	hasError = yes;
361	714k	}
362	511k	else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
363	1.08k	{
364	1.08k	buf[0] = (tmbchar) (0xF0 \| (c >> 18));
365	1.08k	buf[1] = (tmbchar) (0x80 \| ((c >> 12) & 0x3F));
366	1.08k	buf[2] = (tmbchar) (0x80 \| ((c >> 6) & 0x3F));
367	1.08k	buf[3] = (tmbchar) (0x80 \| (c & 0x3F));
368	1.08k	bytes = 4;
369	1.08k	if (c > kMaxUTF8FromUCS4)
370	0	hasError = yes;
371	1.08k	}
372	510k	else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
373	0	{
374	0	buf[0] = (tmbchar) (0xF8 \| (c >> 24));
375	0	buf[1] = (tmbchar) (0x80 \| (c >> 18));
376	0	buf[2] = (tmbchar) (0x80 \| ((c >> 12) & 0x3F));
377	0	buf[3] = (tmbchar) (0x80 \| ((c >> 6) & 0x3F));
378	0	buf[4] = (tmbchar) (0x80 \| (c & 0x3F));
379	0	bytes = 5;
380	0	hasError = yes;
381	0	}
382	510k	else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
383	0	{
384	0	buf[0] = (tmbchar) (0xFC \| (c >> 30));
385	0	buf[1] = (tmbchar) (0x80 \| ((c >> 24) & 0x3F));
386	0	buf[2] = (tmbchar) (0x80 \| ((c >> 18) & 0x3F));
387	0	buf[3] = (tmbchar) (0x80 \| ((c >> 12) & 0x3F));
388	0	buf[4] = (tmbchar) (0x80 \| ((c >> 6) & 0x3F));
389	0	buf[5] = (tmbchar) (0x80 \| (c & 0x3F));
390	0	bytes = 6;
391	0	hasError = yes;
392	0	}
393	510k	else
394	510k	hasError = yes;
395
396		/* don't output invalid UTF-8 byte sequence to a stream */
397	229M	if ( !hasError && outp != NULL )
398	103M	{
399	103M	int ix;
400	207M	for ( ix=0; ix < bytes; ++ix )
401	103M	outp->putByte( outp->sinkData, buf[ix] );
402	103M	}
403
404		#if 1 && defined(_DEBUG)
405		if ( hasError )
406		{
407		int i;
408		fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
409		for (i = 0; i < bytes; i++)
410		fprintf( stderr, "0x%02x ", buf[i] );
411		fprintf( stderr, "\n" );
412		}
413		#endif
414
415	229M	*count = bytes;
416	229M	if (hasError)
417	511k	return -1;
418	228M	return 0;
419	229M	}
420
421
422		/* return one less than the number of bytes used by the UTF-8 byte sequence */
423		/* str points to the UTF-8 byte sequence */
424		/* the Unicode char is returned in *ch */
425		uint TY_(GetUTF8)( ctmbstr str, uint *ch )
426	31.4k	{
427	31.4k	uint n;
428	31.4k	int bytes;
429
430	31.4k	int err;
431
432	31.4k	bytes = 0;
433
434		/* first byte "str[0]" is passed in separately from the */
435		/* rest of the UTF-8 byte sequence starting at "str[1]" */
436	31.4k	err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
437	31.4k	if (err)
438	5.09k	{
439		#if 1 && defined(_DEBUG)
440		fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
441		#endif
442	5.09k	n = 0xFFFD; /* replacement char */
443	5.09k	}
444
445	31.4k	*ch = n;
446	31.4k	return bytes - 1;
447	31.4k	}
448
449		/* store char c as UTF-8 encoded byte stream */
450		tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
451	0	{
452	0	int err, count = 0;
453
454	0	err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
455	0	if (err)
456	0	{
457		#if 1 && defined(_DEBUG)
458		fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
459		#endif
460		/* replacement char 0xFFFD encoded as UTF-8 */
461	0	buf[0] = (byte) 0xEF;
462	0	buf[1] = (byte) 0xBF;
463	0	buf[2] = (byte) 0xBD;
464	0	count = 3;
465	0	}
466
467	0	buf += count;
468	0	return buf;
469	0	}
470
471		Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
472	413k	{
473	413k	return ( ucs4 <= kMaxUTF16FromUCS4 );
474	413k	}
475
476		Bool TY_(IsHighSurrogate)( tchar ch )
477	339	{
478	339	return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
479	339	}
480		Bool TY_(IsLowSurrogate)( tchar ch )
481	414k	{
482	414k	return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
483	414k	}
484
485		tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
486	9	{
487	9	assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );
488	9	return ( ((low - kUTF16LowSurrogateBegin) * 0x400) +
489	9	high - kUTF16HighSurrogateBegin + 0x10000 );
490	9	}
491
492		Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
493	0	{
494	0	Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
495	0	if ( status )
496	0	{
497	0	*low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
498	0	*high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
499	0	}
500	0	return status;
501	0	}
502
503		Bool TY_(IsValidCombinedChar)( tchar ch )
504	9	{
505	9	return ( ch >= kUTF16SurrogatesBegin &&
506	9	(ch & 0x0000FFFE) != 0x0000FFFE &&
507	9	(ch & 0x0000FFFF) != 0x0000FFFF );
508	9	}
509
510		Bool TY_(IsCombinedChar)( tchar ch )
511	0	{
512	0	return ( ch >= kUTF16SurrogatesBegin );
513	0	}
514
515		/*
516		* local variables:
517		* mode: c
518		* indent-tabs-mode: nil
519		* c-basic-offset: 4
520		* eval: (c-set-offset 'substatement-open 0)
521		* end:
522		*/