/src/libzip/lib/zip_utf-8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | zip_utf-8.c -- UTF-8 support functions for libzip |
3 | | Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner |
4 | | |
5 | | This file is part of libzip, a library to manipulate ZIP archives. |
6 | | The authors can be contacted at <info@libzip.org> |
7 | | |
8 | | Redistribution and use in source and binary forms, with or without |
9 | | modification, are permitted provided that the following conditions |
10 | | are met: |
11 | | 1. Redistributions of source code must retain the above copyright |
12 | | notice, this list of conditions and the following disclaimer. |
13 | | 2. Redistributions in binary form must reproduce the above copyright |
14 | | notice, this list of conditions and the following disclaimer in |
15 | | the documentation and/or other materials provided with the |
16 | | distribution. |
17 | | 3. The names of the authors may not be used to endorse or promote |
18 | | products derived from this software without specific prior |
19 | | written permission. |
20 | | |
21 | | THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS |
22 | | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
23 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
24 | | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY |
25 | | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
26 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
27 | | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
28 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
29 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
30 | | OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
31 | | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | |
35 | | #include "zipint.h" |
36 | | |
37 | | #include <stdlib.h> |
38 | | |
39 | | |
40 | | static const zip_uint16_t _cp437_to_unicode[256] = { |
41 | | /* 0x00 - 0x0F */ |
42 | | 0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C, |
43 | | |
44 | | /* 0x10 - 0x1F */ |
45 | | 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC, |
46 | | |
47 | | /* 0x20 - 0x2F */ |
48 | | 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, |
49 | | |
50 | | /* 0x30 - 0x3F */ |
51 | | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, |
52 | | |
53 | | /* 0x40 - 0x4F */ |
54 | | 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, |
55 | | |
56 | | /* 0x50 - 0x5F */ |
57 | | 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, |
58 | | |
59 | | /* 0x60 - 0x6F */ |
60 | | 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, |
61 | | |
62 | | /* 0x70 - 0x7F */ |
63 | | 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302, |
64 | | |
65 | | /* 0x80 - 0x8F */ |
66 | | 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, |
67 | | |
68 | | /* 0x90 - 0x9F */ |
69 | | 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, |
70 | | |
71 | | /* 0xA0 - 0xAF */ |
72 | | 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, |
73 | | |
74 | | /* 0xB0 - 0xBF */ |
75 | | 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, |
76 | | |
77 | | /* 0xC0 - 0xCF */ |
78 | | 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, |
79 | | |
80 | | /* 0xD0 - 0xDF */ |
81 | | 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, |
82 | | |
83 | | /* 0xE0 - 0xEF */ |
84 | | 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229, |
85 | | |
86 | | /* 0xF0 - 0xFF */ |
87 | | 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0}; |
88 | | |
/*
 * Bit patterns for classifying UTF-8 bytes.  A byte b belongs to a class
 * when (b & <CLASS>_MASK) == <CLASS>_MATCH.
 */

/* continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MASK 0xC0
#define UTF_8_CONTINUE_MATCH 0x80

/* lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MASK 0xE0
#define UTF_8_LEN_2_MATCH 0xC0

/* lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MASK 0xF0
#define UTF_8_LEN_3_MATCH 0xE0

/* lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MASK 0xF8
#define UTF_8_LEN_4_MATCH 0xF0
97 | | |
98 | | |
99 | | zip_encoding_type_t |
100 | 0 | _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) { |
101 | 0 | zip_encoding_type_t enc; |
102 | 0 | const zip_uint8_t *name; |
103 | 0 | zip_uint32_t i, j, ulen; |
104 | 0 | bool can_be_ascii = true; |
105 | 0 | bool can_be_utf8 = true; |
106 | 0 | bool has_control_characters = false; |
107 | |
|
108 | 0 | if (str == NULL) { |
109 | 0 | return ZIP_ENCODING_ASCII; |
110 | 0 | } |
111 | | |
112 | 0 | name = str->raw; |
113 | |
|
114 | 0 | if (str->encoding != ZIP_ENCODING_UNKNOWN) { |
115 | 0 | return str->encoding; |
116 | 0 | } |
117 | | |
118 | 0 | for (i = 0; i < str->length; i++) { |
119 | 0 | if (name[i] < 128) { |
120 | 0 | if (name[i] < 32 && name[i] != '\r' && name[i] != '\n' && name[i] != '\t') { |
121 | 0 | has_control_characters = true; |
122 | 0 | } |
123 | 0 | continue; |
124 | 0 | } |
125 | | |
126 | 0 | can_be_ascii = false; |
127 | 0 | if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) { |
128 | 0 | ulen = 1; |
129 | 0 | } |
130 | 0 | else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) { |
131 | 0 | ulen = 2; |
132 | 0 | } |
133 | 0 | else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) { |
134 | 0 | ulen = 3; |
135 | 0 | } |
136 | 0 | else { |
137 | 0 | can_be_utf8 = false; |
138 | 0 | break; |
139 | 0 | } |
140 | | |
141 | 0 | if (i + ulen >= str->length) { |
142 | 0 | can_be_utf8 = false; |
143 | 0 | break; |
144 | 0 | } |
145 | | |
146 | 0 | for (j = 1; j <= ulen; j++) { |
147 | 0 | if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) { |
148 | 0 | can_be_utf8 = false; |
149 | 0 | goto done; |
150 | 0 | } |
151 | 0 | } |
152 | 0 | i += ulen; |
153 | 0 | } |
154 | | |
155 | 0 | done: |
156 | 0 | enc = ZIP_ENCODING_CP437; |
157 | |
|
158 | 0 | switch (expected_encoding) { |
159 | 0 | case ZIP_ENCODING_UTF8_KNOWN: |
160 | 0 | case ZIP_ENCODING_UTF8_GUESSED: |
161 | 0 | if (can_be_utf8) { |
162 | 0 | enc = ZIP_ENCODING_UTF8_KNOWN; |
163 | 0 | } |
164 | 0 | else { |
165 | 0 | enc = ZIP_ENCODING_ERROR; |
166 | 0 | } |
167 | 0 | break; |
168 | | |
169 | 0 | case ZIP_ENCODING_ASCII: |
170 | 0 | if (can_be_ascii && !has_control_characters) { |
171 | 0 | enc = ZIP_ENCODING_ASCII; |
172 | 0 | } |
173 | 0 | else { |
174 | 0 | enc = ZIP_ENCODING_ERROR; |
175 | 0 | } |
176 | 0 | break; |
177 | | |
178 | 0 | case ZIP_ENCODING_CP437: |
179 | 0 | enc = ZIP_ENCODING_CP437; |
180 | 0 | break; |
181 | | |
182 | 0 | case ZIP_ENCODING_UNKNOWN: |
183 | 0 | if (can_be_ascii && !has_control_characters) { |
184 | | /* only bytes from 0x20-0x7F */ |
185 | 0 | enc = ZIP_ENCODING_ASCII; |
186 | 0 | } |
187 | 0 | else if (can_be_ascii && has_control_characters) { |
188 | | /* only bytes from 0x00-0x7F */ |
189 | 0 | enc = ZIP_ENCODING_CP437; |
190 | 0 | } |
191 | 0 | else if (can_be_utf8) { |
192 | | /* contains bytes from 0x80-0xFF and is valid UTF-8 */ |
193 | 0 | enc = ZIP_ENCODING_UTF8_GUESSED; |
194 | 0 | } |
195 | 0 | else { |
196 | | /* fallback */ |
197 | 0 | enc = ZIP_ENCODING_CP437; |
198 | 0 | } |
199 | 0 | break; |
200 | 0 | case ZIP_ENCODING_ERROR: |
201 | | /* invalid, shouldn't happen */ |
202 | 0 | enc = ZIP_ENCODING_ERROR; |
203 | 0 | break; |
204 | 0 | } |
205 | | |
206 | 0 | str->encoding = enc; |
207 | 0 | return enc; |
208 | 0 | } |
209 | | |
210 | | |
211 | | static zip_uint32_t |
212 | 0 | _zip_unicode_to_utf8_len(zip_uint32_t codepoint) { |
213 | 0 | if (codepoint < 0x0080) { |
214 | 0 | return 1; |
215 | 0 | } |
216 | 0 | if (codepoint < 0x0800) { |
217 | 0 | return 2; |
218 | 0 | } |
219 | 0 | if (codepoint < 0x10000) { |
220 | 0 | return 3; |
221 | 0 | } |
222 | 0 | return 4; |
223 | 0 | } |
224 | | |
225 | | |
226 | | static zip_uint32_t |
227 | 0 | _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf) { |
228 | 0 | if (codepoint < 0x0080) { |
229 | 0 | buf[0] = codepoint & 0xff; |
230 | 0 | return 1; |
231 | 0 | } |
232 | 0 | if (codepoint < 0x0800) { |
233 | 0 | buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f)); |
234 | 0 | buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f)); |
235 | 0 | return 2; |
236 | 0 | } |
237 | 0 | if (codepoint < 0x10000) { |
238 | 0 | buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f)); |
239 | 0 | buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f)); |
240 | 0 | buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f)); |
241 | 0 | return 3; |
242 | 0 | } |
243 | 0 | buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07)); |
244 | 0 | buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f)); |
245 | 0 | buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f)); |
246 | 0 | buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f)); |
247 | 0 | return 4; |
248 | 0 | } |
249 | | |
250 | | |
251 | | zip_uint8_t * |
252 | 0 | _zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error) { |
253 | 0 | zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf; |
254 | 0 | zip_uint8_t *utf8buf; |
255 | 0 | zip_uint32_t buflen, i, offset; |
256 | |
|
257 | 0 | if (len == 0) { |
258 | 0 | if (utf8_lenp) { |
259 | 0 | *utf8_lenp = 0; |
260 | 0 | } |
261 | 0 | return NULL; |
262 | 0 | } |
263 | | |
264 | 0 | buflen = 1; |
265 | 0 | for (i = 0; i < len; i++) { |
266 | 0 | buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]); |
267 | 0 | } |
268 | |
|
269 | 0 | if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) { |
270 | 0 | zip_error_set(error, ZIP_ER_MEMORY, 0); |
271 | 0 | return NULL; |
272 | 0 | } |
273 | | |
274 | 0 | offset = 0; |
275 | 0 | for (i = 0; i < len; i++) { |
276 | 0 | offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset); |
277 | 0 | } |
278 | |
|
279 | 0 | utf8buf[buflen - 1] = 0; |
280 | 0 | if (utf8_lenp) { |
281 | 0 | *utf8_lenp = buflen - 1; |
282 | 0 | } |
283 | 0 | return utf8buf; |
284 | 0 | } |