Coverage Report

Created: 2025-06-24 07:01

/src/ghostpdl/base/gp_utf8.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (C) 2001-2024 Artifex Software, Inc.
2
   All Rights Reserved.
3
4
   This software is provided AS-IS with no warranty, either express or
5
   implied.
6
7
   This software is distributed under license and may not be copied,
8
   modified or distributed except as expressly authorized under the terms
9
   of the license contained in the file LICENSE in this distribution.
10
11
   Refer to licensing information at http://www.artifex.com or contact
12
   Artifex Software, Inc.,  39 Mesa Street, Suite 108A, San Francisco,
13
   CA 94129, USA, for further information.
14
*/
15
16
17
#include "gp_utf8.h"
18
19
/*
 * Decode one codepoint from a UTF-8 byte stream.
 *
 * leading_byte is the first byte of the sequence, already fetched by the
 * caller; on entry *inp points at the byte that follows it. On return
 * *inp points just past the last byte consumed.
 *
 * Returns the decoded codepoint, or the Unicode replacement character
 * (0xFFFD) for anything that is not well-formed UTF-8:
 *   - a stray continuation byte or an impossible lead byte
 *     (0x80..0xC1, 0xF5..0xFF),
 *   - a truncated sequence,
 *   - an 'overlong' encoding (a value that fits in a shorter form),
 *   - an encoded UTF-16 surrogate (0xD800..0xDFFF),
 *   - a value above 0x10FFFF.
 *
 * Overlong encodings look harmless but are illegal for good reason: they
 * are typically an attempt to sneak things such as "../" past security
 * checks.
 *
 * On failure the byte that broke the sequence is NOT consumed, so the
 * caller sees it again as the start of the next sequence. In particular
 * a NUL terminator is never swallowed.
 */
static int
decode_utf8(const char **inp, unsigned int leading_byte)
{
    const char *in = *inp;
    unsigned int codepoint;
    /* Permitted range for the next continuation byte. Only the first
     * continuation byte is ever restricted further than 0x80..0xBF. */
    unsigned int lower = 0x80;
    unsigned int upper = 0xBF;
    int needed;

    if (leading_byte < 0x80) {
        /* Plain ASCII; nothing further to read. */
        return (int)leading_byte;
    } else if (leading_byte >= 0xC2 && leading_byte <= 0xDF) {
        /* 2 byte form. 0xC0 and 0xC1 are excluded as they could only
         * ever produce overlong encodings of 0..0x7F. */
        needed = 1;
        codepoint = leading_byte & 0x1F;
    } else if (leading_byte >= 0xE0 && leading_byte <= 0xEF) {
        /* 3 byte form. */
        needed = 2;
        codepoint = leading_byte & 0xF;
        if (leading_byte == 0xE0)
            lower = 0xA0;   /* Below this is overlong (< 0x800). */
        else if (leading_byte == 0xED)
            upper = 0x9F;   /* Above this is a surrogate (0xD800..0xDFFF). */
    } else if (leading_byte >= 0xF0 && leading_byte <= 0xF4) {
        /* 4 byte form. */
        needed = 3;
        codepoint = leading_byte & 0x7;
        if (leading_byte == 0xF0)
            lower = 0x90;   /* Below this is overlong (< 0x10000). */
        else if (leading_byte == 0xF4)
            upper = 0x8F;   /* Above this exceeds 0x10FFFF. */
    } else {
        /* Stray continuation bytes, the overlong-only leads 0xC0/0xC1,
         * and leads that can only encode values above 0x10FFFF (which
         * cannot be represented in UTF-16 even with surrogate pairs). */
        return 0xfffd;
    }

    while (needed-- > 0) {
        unsigned int c = (unsigned char)*in;

        if (c < lower || c > upper) {
            /* Leave the offending byte unread and give up. */
            *inp = in;
            return 0xfffd;
        }
        in++;
        codepoint = (codepoint << 6) | (c & 0x3f);
        /* Second and later continuation bytes may use the full range. */
        lower = 0x80;
        upper = 0xBF;
    }

    *inp = in;
    return (int)codepoint;
}
102
103
/*
 * Convert a NUL terminated UTF-8 string to a 0 terminated string of
 * 16 bit units, using surrogate pairs for codepoints 0x10000..0x10FFFF.
 *
 * out: destination, or NULL to just measure. No bounds checking is done
 *      here; the buffer has to hold at least as many units as this
 *      function returns.
 * in:  the NUL terminated input bytes.
 *
 * Returns the number of 16 bit units required, including the terminator,
 * or -1 if the decoder hands back a value above 0x10FFFF.
 */
int gp_utf8_to_uint16(unsigned short *out, const char *in)
{
    unsigned int units = 1; /* Start at 1 to account for the terminator. */

    for (;;) {
        unsigned int cp = *(const unsigned char *)in++;

        if (cp == 0)
            break;

        /* Decode UTF-8; this moves 'in' past any continuation bytes. */
        cp = decode_utf8(&in, cp);

        if (cp >= 0x10000 && cp <= 0x10ffff) {
            /* Needs a surrogate pair: two output units. */
            if (out) {
                cp -= 0x10000;
                *out++ = 0xd800 + (cp >> 10);
                *out++ = 0xdc00 + (cp & 0x3ff);
            }
            units += 2;
        } else if (cp > 0x10000) {
            /* Not representable in UTF-16. */
            return -1;
        } else {
            /* Fits in a single unit. */
            if (out)
                *out++ = (unsigned short)cp;
            units++;
        }
    }

    if (out)
        *out = 0;

    return units;
}
145
146
/*
 * Convert a 0 terminated string of UTF-16 units to NUL terminated UTF-8.
 *
 * out: destination, or NULL to just measure. No bounds checking is done
 *      here; the buffer has to hold at least as many bytes as this
 *      function returns.
 * in:  the 0 terminated input units.
 *
 * A high surrogate (0xD800..0xDBFF) followed by a low surrogate
 * (0xDC00..0xDFFF) is combined into a single codepoint. A high surrogate
 * that is not followed by a low one, or a low surrogate on its own, is
 * converted to the Unicode replacement character (0xFFFD).
 *
 * Returns the number of bytes required, including the terminator.
 */
int gp_uint16_to_utf8(char *out, const unsigned short *in)
{
    unsigned int i;
    unsigned int len = 1; /* Start at 1 to account for the terminator. */

    while ((i = (unsigned int)*in++) != 0) {
        /* Decode surrogates */
        if (i >= 0xD800 && i <= 0xDBFF) {
            /* High surrogate. Must be followed by a low surrogate, or
             * this is a failure. Peek rather than read, so that whatever
             * follows (including the terminator) is processed normally
             * if it turns out not to be a low surrogate. */
            unsigned int lo = (unsigned int)*in;

            if (lo >= 0xDC00 && lo <= 0xDFFF) {
                in++;
                i = 0x10000 + ((i & 0x3ff) << 10) + (lo & 0x3ff);
            } else {
                /* Failure! Unicode replacement char! */
                i = 0xfffd;
            }
        } else if (i >= 0xDC00 && i <= 0xDFFF) {
            /* Lone low surrogate. Failure. Unicode replacement char. */
            i = 0xfffd;
        }

        /* Encode output */
        if (i < 0x80) {
            if (out)
                *out++ = (char)i;
            len += 1;
        } else if (i < 0x800) {
            if (out) {
                *out++ = (char)(0xC0 | ( i>> 6        ));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 2;
        } else if (i < 0x10000) {
            if (out) {
                *out++ = (char)(0xE0 | ( i>>12        ));
                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 3;
        } else {
            if (out) {
                *out++ = (char)(0xF0 | ( i>>18        ));
                *out++ = (char)(0x80 | ((i>>12) & 0x3F));
                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
                *out++ = (char)(0x80 | ( i      & 0x3F));
            }
            len += 4;
        }
    }

    if (out)
        *out = 0;

    return (int)len;
}