/src/ghostpdl/base/gp_utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (C) 2001-2024 Artifex Software, Inc. |
2 | | All Rights Reserved. |
3 | | |
4 | | This software is provided AS-IS with no warranty, either express or |
5 | | implied. |
6 | | |
7 | | This software is distributed under license and may not be copied, |
8 | | modified or distributed except as expressly authorized under the terms |
9 | | of the license contained in the file LICENSE in this distribution. |
10 | | |
11 | | Refer to licensing information at http://www.artifex.com or contact |
12 | | Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
13 | | CA 94129, USA, for further information. |
14 | | */ |
15 | | |
16 | | |
17 | | #include "gp_utf8.h" |
18 | | |
/*
 * Decode one UTF-8 encoded character.
 *
 * leading_byte is the first byte of the sequence, which the caller has
 * already read; *inp points at the byte that follows it. On return *inp
 * has been advanced past every continuation byte that was consumed.
 *
 * Returns the decoded codepoint, or the Unicode replacement character
 * (0xfffd) if the sequence is not well-formed UTF-8:
 *  - a byte that cannot start a sequence (a stray continuation byte, or
 *    the lead byte of a 5 or 6 byte form),
 *  - a sequence cut short by a byte that is not a continuation byte; that
 *    byte (which may be the terminating 0) is left unread so that the
 *    caller sees it next,
 *  - an 'overlong' encoding, i.e. a value that would have fitted in a
 *    shorter form. While this seems harmless, it is illegal for good
 *    reason; it is typically an attempt to sneak stuff past security
 *    checks, like "../" in paths,
 *  - a UTF-16 surrogate (0xD800-0xDFFF), or a value above 0x10FFFF.
 */
static int
decode_utf8(const char **inp, unsigned int leading_byte)
{
    const char *in = *inp;
    unsigned int codepoint;
    unsigned int min_value; /* Smallest value that needs this many bytes. */
    int trailing;           /* Continuation bytes still expected. */

    if (leading_byte < 0x80) {
        /* Plain ASCII; nothing further to read. */
        return (int)leading_byte;
    }

    if ((leading_byte & 0xE0) == 0xC0) {
        codepoint = leading_byte & 0x1F;
        trailing = 1;
        min_value = 0x80;
    } else if ((leading_byte & 0xF0) == 0xE0) {
        codepoint = leading_byte & 0xF;
        trailing = 2;
        min_value = 0x800;
    } else if ((leading_byte & 0xF8) == 0xF0) {
        codepoint = leading_byte & 0x7;
        trailing = 3;
        min_value = 0x10000;
    } else {
        /* Either a continuation byte with no lead byte, or the start of
         * a longer encoding. Longer UTF-8 encodings give more than 21
         * bits of data, and the most we can encode in UTF-16 (even using
         * surrogate pairs) is 0x10FFFF, so all of these fail. */
        return 0xfffd;
    }

    while (trailing-- > 0) {
        unsigned char c = (unsigned char)*in;

        if ((c & 0xC0) != 0x80) {
            /* Not a continuation byte. Leave it unread; this also stops
             * us running past the 0 that terminates the string. */
            *inp = in;
            return 0xfffd;
        }
        in++;
        codepoint = (codepoint << 6) | (c & 0x3f);
    }
    *inp = in;

    /* Overlong encodings are judged on the complete value, not on the
     * lead byte alone: 0xE0 and 0xF0 carry no data bits themselves yet
     * legitimately start the encodings of 0x800-0xFFF and
     * 0x10000-0x3FFFF. */
    if (codepoint < min_value)
        return 0xfffd;
    /* UTF-16 surrogates are invalid in UTF-8. */
    if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
        return 0xfffd;
    /* Anything above 0x10FFFF cannot be represented in UTF-16. */
    if (codepoint > 0x10FFFF)
        return 0xfffd;

    return (int)codepoint;
}
102 | | |
/*
 * Convert a 0 terminated UTF-8 string into 0 terminated UTF-16.
 *
 * If out is NULL nothing is stored and only the required size is worked
 * out, so a caller can measure first and convert afterwards.
 *
 * Each character is decoded with decode_utf8(); values from 0x10000 to
 * 0x10ffff are written as a surrogate pair, smaller values as a single
 * 16 bit unit.
 *
 * Returns the number of 16 bit units needed, including the terminator,
 * or -1 if a decoded value lies above 0x10ffff.
 */
int gp_utf8_to_uint16(unsigned short *out, const char *in)
{
    unsigned int units = 1; /* Start at 1 to allow for the terminator. */

    for (;;) {
        unsigned int lead = *(const unsigned char *)in++;
        unsigned int cp;

        if (lead == 0)
            break;

        /* Decode UTF-8 */
        cp = (unsigned int)decode_utf8(&in, lead);

        /* Too large to express, even with surrogates. */
        if (cp > 0x10ffff)
            return -1;

        if (cp >= 0x10000) {
            /* Needs a surrogate pair. */
            if (out) {
                cp -= 0x10000;
                *out++ = 0xd800 + (cp >> 10);
                *out++ = 0xdc00 + (cp & 0x3ff);
            }
            units += 2;
        } else {
            if (out)
                *out++ = (unsigned short)cp;
            units += 1;
        }
    }

    if (out)
        *out = 0;

    return (int)units;
}
145 | | |
/*
 * Convert a 0 terminated UTF-16 string into 0 terminated UTF-8.
 *
 * If out is NULL nothing is stored and only the required size is worked
 * out, so a caller can measure first and convert afterwards.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low
 * surrogate (0xDC00-0xDFFF, both ends inclusive) is combined into a
 * single codepoint. A high surrogate followed by anything else, or a low
 * surrogate on its own, is replaced by the Unicode replacement character
 * (0xfffd); the unit following a failed high surrogate is not consumed
 * and is processed normally on the next iteration.
 *
 * Returns the number of bytes needed, including the terminator.
 */
int gp_uint16_to_utf8(char *out, const unsigned short *in)
{
    unsigned int unit;
    unsigned int len = 1; /* Start at 1 to allow for the terminator. */

    while ((unit = (unsigned int)*in++) != 0) {
        unsigned int cp = unit;

        /* Decode surrogates */
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            /* High surrogate. Must be followed by a low surrogate, or
             * this is a failure. Peek rather than read, so that a
             * terminating 0 (or any other unit) is left for the loop. */
            unsigned int low = (unsigned int)*in;

            if (low >= 0xDC00 && low <= 0xDFFF) {
                in++;
                cp = 0x10000 + ((unit & 0x3ff) << 10) + (low & 0x3ff);
            } else {
                /* Failure! Unicode replacement char! */
                cp = 0xfffd;
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            /* Lone low surrogate. Failure. Unicode replacement char. */
            cp = 0xfffd;
        }

        /* Encode output */
        if (cp < 0x80) {
            if (out)
                *out++ = (char)cp;
            len += 1;
        } else if (cp < 0x800) {
            if (out) {
                *out++ = (char)(0xC0 | ( cp >> 6));
                *out++ = (char)(0x80 | ( cp        & 0x3F));
            }
            len += 2;
        } else if (cp < 0x10000) {
            if (out) {
                *out++ = (char)(0xE0 | ( cp >> 12));
                *out++ = (char)(0x80 | ((cp >>  6) & 0x3F));
                *out++ = (char)(0x80 | ( cp        & 0x3F));
            }
            len += 3;
        } else {
            if (out) {
                *out++ = (char)(0xF0 | ( cp >> 18));
                *out++ = (char)(0x80 | ((cp >> 12) & 0x3F));
                *out++ = (char)(0x80 | ((cp >>  6) & 0x3F));
                *out++ = (char)(0x80 | ( cp        & 0x3F));
            }
            len += 4;
        }
    }

    if (out)
        *out = 0;

    return (int)len;
}