/src/ghostpdl/base/gp_utf8.c

Source (jump to first uncovered line)
/* Copyright (C) 2001-2024 Artifex Software, Inc.
   All Rights Reserved.

   This software is provided AS-IS with no warranty, either express or
   implied.

   This software is distributed under license and may not be copied,
   modified or distributed except as expressly authorized under the terms
   of the license contained in the file LICENSE in this distribution.

   Refer to licensing information at http://www.artifex.com or contact
   Artifex Software, Inc.,  39 Mesa Street, Suite 108A, San Francisco,
   CA 94129, USA, for further information.
*/


#include "gp_utf8.h"

/* Decode a single UTF-8 sequence whose first byte has already been read.
 *
 * inp          In/out. On entry *inp points at the byte following the
 *              leading byte. On exit it points past the last continuation
 *              byte that was consumed.
 * leading_byte The first byte of the sequence (0..255).
 *
 * Returns the decoded codepoint, or 0xfffd (the unicode replacement
 * char) if the sequence is malformed: a stray continuation byte, a 5+
 * byte form, a missing continuation byte, an overlong encoding, an
 * encoded UTF-16 surrogate, or a value above 0x10FFFF.
 *
 * A byte that fails the continuation test is never consumed, so a
 * terminating 0 in the middle of a sequence is left for the caller to
 * see.
 */
static int
decode_utf8(const char **inp, unsigned int leading_byte)
{
    const char *in = *inp;
    unsigned int codepoint;
    unsigned int min_codepoint;
    int trailing;

    /* Plain ASCII: nothing more to read. */
    if (leading_byte < 0x80)
        return (int)leading_byte;

    /* Work out how many continuation bytes follow, and the smallest
     * value that genuinely needs a sequence of this length. */
    if ((leading_byte & 0xE0) == 0xC0) {
        codepoint = leading_byte & 0x1F;
        trailing = 1;
        min_codepoint = 0x80;
    } else if ((leading_byte & 0xF0) == 0xE0) {
        codepoint = leading_byte & 0xF;
        trailing = 2;
        min_codepoint = 0x800;
    } else if ((leading_byte & 0xF8) == 0xF0) {
        codepoint = leading_byte & 0x7;
        trailing = 3;
        min_codepoint = 0x10000;
    } else {
        /* Either a continuation byte with no leader, or a longer UTF-8
         * form. Longer forms carry more than 21 bits of data, which
         * cannot be represented in UTF-16, so all of these fail. */
        return 0xfffd;
    }

    while (trailing-- > 0) {
        unsigned char c = (unsigned char)*in;

        if ((c & 0xC0) != 0x80) {
            /* Not a continuation byte. Leave it unread so the caller
             * can process it (it may be the terminator). */
            *inp = in;
            return 0xfffd;
        }
        in++;
        codepoint = (codepoint<<6) | (c & 0x3f);
    }
    *inp = in;

    /* Overlong encodings are illegal, for good reason; they are
     * typically an attempt to sneak stuff past security checks, like
     * "../" in paths. The test has to be made on the decoded value:
     * leading bytes 0xE0 and 0xF0 carry no data bits of their own, but
     * are still required for U+0800..U+0FFF and U+10000..U+3FFFF. */
    if (codepoint < min_codepoint)
        return 0xfffd;
    /* UTF-16 surrogates are invalid in UTF-8. */
    if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
        return 0xfffd;
    /* Codepoints up to 0x10FFFF can be coded for in UTF-16 using
     * surrogate pairs. Anything higher than that is forbidden. */
    if (codepoint > 0x10FFFF)
        return 0xfffd;

    return (int)codepoint;
}

/* Convert a 0-terminated UTF-8 string to 0-terminated UTF-16.
 *
 * out  Destination buffer, or NULL to just measure the required size.
 * in   0-terminated UTF-8 input.
 *
 * Returns the number of 16 bit units required for the result, including
 * the terminator, or -1 if a codepoint cannot be represented.
 */
int gp_utf8_to_uint16(unsigned short *out, const char *in)
{
    unsigned int cp;
    unsigned int units = 1; /* Room for the terminator */

    for (;;) {
        cp = *(const unsigned char *)in++;
        if (cp == 0)
            break;

        /* Decode UTF-8; this steps 'in' over any continuation bytes. */
        cp = decode_utf8(&in, cp);

        /* Beyond the reach of surrogate pairs. */
        if (cp > 0x10ffff)
            return -1;

        if (cp < 0x10000) {
            /* Fits in a single unit. */
            if (out)
                *out++ = (unsigned short)cp;
            units++;
        } else {
            /* Needs a surrogate pair. */
            if (out) {
                cp -= 0x10000;
                *out++ = 0xd800 + (cp>>10);
                *out++ = 0xdc00 + (cp & 0x3ff);
            }
            units += 2;
        }
    }

    if (out)
        *out = 0;

    return units;
}

/* Convert a 0-terminated UTF-16 string to 0-terminated UTF-8.
 *
 * out  Destination buffer, or NULL to just measure the required size.
 * in   0-terminated UTF-16 input.
 *
 * Returns the number of bytes required for the result, including the
 * terminator. Unpaired surrogates are converted to the unicode
 * replacement char (0xfffd).
 */
int gp_uint16_to_utf8(char *out, const unsigned short *in)
{
    unsigned int i;
    unsigned int len = 1; /* Room for the terminator */

    while ((i = (unsigned int)*in++) != 0) {
        /* Decode surrogates */
        if (i >= 0xD800 && i <= 0xDBFF)
        {
            /* High surrogate. Must be followed by a low surrogate, or
             * this is a failure. Peek rather than read, so that whatever
             * follows a lone high surrogate (possibly the terminator) is
             * processed normally on the next pass. */
            unsigned int lo = (unsigned int)*in;

            /* The low surrogate range is 0xDC00..0xDFFF inclusive. */
            if (lo >= 0xDC00 && lo <= 0xDFFF) {
                in++;
                i = 0x10000 + ((i & 0x3ff)<<10) + (lo & 0x3ff);
            } else {
                /* Failure! Unicode replacement char! */
                i = 0xfffd;
            }
        } else if (i >= 0xDC00 && i <= 0xDFFF)
        {
            /* Lone low surrogate. Failure. Unicode replacement char. */
            i = 0xfffd;
        }

        /* Encode output (or just count it if we have no buffer). */
        if (i < 0x80) {
            if (out)
                *out++ = (char)i;
            len++;
        } else if (i < 0x800) {
            if (out) {
                *out++ = (char)(0xC0 | ( i>> 6        ));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 2;
        } else if (i < 0x10000) {
            if (out) {
                *out++ = (char)(0xE0 | ( i>>12        ));
                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 3;
        } else {
            if (out) {
                *out++ = (char)(0xF0 | ( i>>18        ));
                *out++ = (char)(0x80 | ((i>>12) & 0x3F));
                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 4;
        }
    }

    if (out)
        *out = 0;

    return len;
}

Coverage Report

Created: 2025-06-24 07:01

Line	Count	Source (jump to first uncovered line)
1		/* Copyright (C) 2001-2024 Artifex Software, Inc.
2		All Rights Reserved.
3
4		This software is provided AS-IS with no warranty, either express or
5		implied.
6
7		This software is distributed under license and may not be copied,
8		modified or distributed except as expressly authorized under the terms
9		of the license contained in the file LICENSE in this distribution.
10
11		Refer to licensing information at http://www.artifex.com or contact
12		Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
13		CA 94129, USA, for further information.
14		*/
15
16
17		#include "gp_utf8.h"
18
19		static int
20		decode_utf8(const char **inp, unsigned int leading_byte)
21	0	{
22	0	const char *in = *inp;
23	0	unsigned char c;
24	0	unsigned int codepoint;
25
26	0	if (leading_byte < 0x80) {
27	0	codepoint = leading_byte;
28	0	} else if ((leading_byte & 0xE0) == 0xC0) {
29	0	codepoint = leading_byte & 0x1F;
30		/* Any encoded value that fails to use bit 1 upwards of this
31		* byte would have been better encoded in a short form. */
32	0	if (codepoint < 2)
33	0	goto fail_overlong;
34	0	c = (unsigned char)*in++;
35	0	if ((c & 0xC0) != 0x80)
36	0	goto fail;
37	0	codepoint = (codepoint<<6) \| (c & 0x3f);
38	0	} else if ((leading_byte & 0xF0) == 0xE0) {
39	0	codepoint = leading_byte & 0xF;
40		/* Any encoding that does not use any of the data bits in this
41		* byte would have been better encoded in a shorter form. */
42	0	if (codepoint == 0)
43	0	goto fail_overlong;
44	0	c = (unsigned char)*in++;
45	0	if ((c & 0xC0) != 0x80)
46	0	goto fail;
47	0	codepoint = (codepoint<<6) \| (c & 0x3f);
48	0	c = (unsigned char)*in++;
49	0	if ((c & 0xC0) != 0x80)
50	0	goto fail;
51	0	codepoint = (codepoint<<6) \| (c & 0x3f);
52	0	} else if ((leading_byte & 0xF8) == 0xF0) {
53	0	codepoint = leading_byte & 0x7;
54		/* Any encoding that does not use any of the data bits in this
55		* byte would have been better encoded in a shorter form. */
56	0	if (codepoint == 0)
57	0	goto fail_overlong;
58	0	c = (unsigned char)*in++;
59	0	if ((c & 0xC0) != 0x80)
60	0	goto fail;
61	0	codepoint = (codepoint<<6) \| (c & 0x3f);
62	0	c = (unsigned char)*in++;
63	0	if ((c & 0xC0) != 0x80)
64	0	goto fail;
65	0	codepoint = (codepoint<<6) \| (c & 0x3f);
66	0	c = (unsigned char)*in++;
67	0	if ((c & 0xC0) != 0x80)
68	0	goto fail;
69	0	codepoint = (codepoint<<6) \| (c & 0x3f);
70		/* Check for UTF-16 surrogates which are invalid in UTF-8 */
71	0	if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
72	0	goto fail_overlong;
73		/* Codepoints 0 to 0xFFFF (other than the surrogate pair
74		* ranges) can be coded for trivially. We can code for
75		* codepoints up to 0x10FFFF using surrogate pairs.
76		* Anything higher than that is forbidden. */
77	0	if (codepoint > 0x10FFFF)
78	0	goto fail_overlong;
79	0	}
80	0	else
81	0	{
82		/* Longer UTF-8 encodings give more than 21 bits of data.
83		* The longest we can encode in utf-16 (even using surrogate
84		* pairs is 20 bits, so all these should fail. */
85	0	if (0)
86	0	{
87		/* If we fail, unread the last one, and return the unicode replacement char. */
88	0	fail:
89	0	in--;
90	0	}
91	0	fail_overlong:
92		/* If we jump to here it's because we've detected an 'overlong' encoding.
93		* While this seems harmless, it's actually illegal, for good reason;
94		* this is typically an attempt to sneak stuff past security checks, like
95		* "../" in paths. Fail this. */
96	0	codepoint = 0xfffd;
97	0	}
98	0	*inp = in;
99
100	0	return codepoint;
101	0	}
102
103		int gp_utf8_to_uint16(unsigned short *out, const char *in)
104	0	{
105	0	unsigned int i;
106	0	unsigned int len = 1;
107
108	0	if (out) {
109	0	while ((i = *(unsigned char *)in++) != 0) {
110		/* Decode UTF-8 */
111	0	i = decode_utf8(&in, i);
112
113		/* Encode, allowing for surrogates. */
114	0	if (i >= 0x10000 && i <= 0x10ffff)
115	0	{
116	0	i -= 0x10000;
117	0	*out++ = 0xd800 + (i>>10);
118	0	*out++ = 0xdc00 + (i & 0x3ff);
119	0	len++;
120	0	}
121	0	else if (i > 0x10000)
122	0	{
123	0	return -1;
124	0	}
125	0	else
126	0	*out++ = (unsigned short)i;
127	0	len++;
128	0	}
129	0	*out = 0;
130	0	} else {
131	0	while ((i = *(unsigned char *)in++) != 0) {
132		/* Decode UTF-8 */
133	0	i = decode_utf8(&in, i);
134
135		/* Encode, allowing for surrogates. */
136	0	if (i >= 0x10000 && i <= 0x10ffff)
137	0	len++;
138	0	else if (i > 0x10000)
139	0	return -1;
140	0	len++;
141	0	}
142	0	}
143	0	return len;
144	0	}
145
146		int gp_uint16_to_utf8(char *out, const unsigned short *in)
147	0	{
148	0	unsigned int i;
149	0	unsigned int len = 1;
150
151	0	if (out) {
152	0	while ((i = (unsigned int)*in++) != 0) {
153		/* Decode surrogates */
154	0	if (i >= 0xD800 && i <= 0xDBFF)
155	0	{
156		/* High surrogate. Must be followed by a low surrogate, or this is a failure. */
157	0	int hi = i & 0x3ff;
158	0	int j = (unsigned int)*in++;
159	0	if (j == 0 \|\| (j <= 0xDC00 \|\| j >= 0xDFFF))
160	0	{
161		/* Failure! Unicode replacement char! */
162	0	in--;
163	0	i = 0xfffd;
164	0	} else {
165		/* Decode surrogates */
166	0	i = 0x10000 + (hi<<10) + (j & 0x3ff);
167	0	}
168	0	} else if (i >= 0xDC00 && i <= 0xDFFF)
169	0	{
170		/* Lone low surrogate. Failure. Unicode replacement char. */
171	0	i = 0xfffd;
172	0	}
173
174		/* Encode output */
175	0	if (i < 0x80) {
176	0	*out++ = (char)i;
177	0	len++;
178	0	} else if (i < 0x800) {
179	0	*out++ = 0xC0 \| ( i>> 6 );
180	0	*out++ = 0x80 \| ( i & 0x3F);
181	0	len+=2;
182	0	} else if (i < 0x10000) {
183	0	*out++ = 0xE0 \| ( i>>12 );
184	0	*out++ = 0x80 \| ((i>> 6) & 0x3F);
185	0	*out++ = 0x80 \| ( i & 0x3F);
186	0	len+=3;
187	0	} else {
188	0	*out++ = 0xF0 \| ( i>>18 );
189	0	*out++ = 0x80 \| ((i>>12) & 0x3F);
190	0	*out++ = 0x80 \| ((i>> 6) & 0x3F);
191	0	*out++ = 0x80 \| ( i & 0x3F);
192	0	len+=4;
193	0	}
194	0	}
195	0	*out = 0;
196	0	} else {
197	0	while ((i = (unsigned int)*in++) != 0) {
198		/* Decode surrogates */
199	0	if (i >= 0xD800 && i <= 0xDBFF)
200	0	{
201		/* High surrogate. Must be followed by a low surrogate, or this is a failure. */
202	0	int hi = i & 0x3ff;
203	0	int j = (unsigned int)*in++;
204	0	if (j == 0 \|\| (j <= 0xDC00 \|\| j >= 0xDFFF))
205	0	{
206		/* Failure! Unicode replacement char! */
207	0	in--;
208	0	i = 0xfffd;
209	0	} else {
210		/* Decode surrogates */
211	0	i = 0x10000 + (hi<<10) + (j & 0x3ff);
212	0	}
213	0	} else if (i >= 0xDC00 && i <= 0xDFFF)
214	0	{
215		/* Lone low surrogate. Failure. Unicode replacement char. */
216	0	i = 0xfffd;
217	0	}
218
219	0	if (i < 0x80) {
220	0	len++;
221	0	} else if (i < 0x800) {
222	0	len += 2;
223	0	} else if (i < 0x10000) {
224	0	len += 3;
225	0	} else {
226	0	len += 4;
227	0	}
228	0	}
229	0	}
230	0	return len;
231	0	}