Coverage Report

Created: 2025-07-23 08:18

/src/libzip/lib/zip_utf-8.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
  zip_utf-8.c -- UTF-8 support functions for libzip
3
  Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner
4
5
  This file is part of libzip, a library to manipulate ZIP archives.
6
  The authors can be contacted at <info@libzip.org>
7
8
  Redistribution and use in source and binary forms, with or without
9
  modification, are permitted provided that the following conditions
10
  are met:
11
  1. Redistributions of source code must retain the above copyright
12
     notice, this list of conditions and the following disclaimer.
13
  2. Redistributions in binary form must reproduce the above copyright
14
     notice, this list of conditions and the following disclaimer in
15
     the documentation and/or other materials provided with the
16
     distribution.
17
  3. The names of the authors may not be used to endorse or promote
18
     products derived from this software without specific prior
19
     written permission.
20
21
  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
*/
33
34
35
#include "zipint.h"
36
37
#include <stdlib.h>
38
39
40
/*
  Lookup table mapping each CP437 byte value (used as the index) to a
  16-bit Unicode code point.  Entries 0x20-0x7E map to themselves; all
  other entries map to the graphical/accented characters listed below.
*/
static const zip_uint16_t _cp437_to_unicode[256] = {
    /* 0x00 - 0x07 */ 0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    /* 0x08 - 0x0F */ 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,

    /* 0x10 - 0x17 */ 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
    /* 0x18 - 0x1F */ 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,

    /* 0x20 - 0x27 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    /* 0x28 - 0x2F */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,

    /* 0x30 - 0x37 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0x38 - 0x3F */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    /* 0x40 - 0x47 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0x48 - 0x4F */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,

    /* 0x50 - 0x57 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    /* 0x58 - 0x5F */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,

    /* 0x60 - 0x67 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x68 - 0x6F */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,

    /* 0x70 - 0x77 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    /* 0x78 - 0x7F */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,

    /* 0x80 - 0x87 */ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    /* 0x88 - 0x8F */ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,

    /* 0x90 - 0x97 */ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    /* 0x98 - 0x9F */ 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,

    /* 0xA0 - 0xA7 */ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    /* 0xA8 - 0xAF */ 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,

    /* 0xB0 - 0xB7 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    /* 0xB8 - 0xBF */ 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,

    /* 0xC0 - 0xC7 */ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
    /* 0xC8 - 0xCF */ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,

    /* 0xD0 - 0xD7 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
    /* 0xD8 - 0xDF */ 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,

    /* 0xE0 - 0xE7 */ 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
    /* 0xE8 - 0xEF */ 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,

    /* 0xF0 - 0xF7 */ 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
    /* 0xF8 - 0xFF */ 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
};
88
89
0
/*
  Bit patterns used to classify and build UTF-8 bytes.  A byte B belongs
  to a class when (B & <class>_MASK) == <class>_MATCH.
*/
enum {
    UTF_8_LEN_2_MASK = 0xe0, /* 111x xxxx */
    UTF_8_LEN_2_MATCH = 0xc0, /* 110x xxxx: lead byte of a 2-byte sequence */

    UTF_8_LEN_3_MASK = 0xf0, /* 1111 xxxx */
    UTF_8_LEN_3_MATCH = 0xe0, /* 1110 xxxx: lead byte of a 3-byte sequence */

    UTF_8_LEN_4_MASK = 0xf8, /* 1111 1xxx */
    UTF_8_LEN_4_MATCH = 0xf0, /* 1111 0xxx: lead byte of a 4-byte sequence */

    UTF_8_CONTINUE_MASK = 0xc0, /* 11xx xxxx */
    UTF_8_CONTINUE_MATCH = 0x80 /* 10xx xxxx: continuation byte */
};
97
98
99
zip_encoding_type_t
100
0
_zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) {
101
0
    zip_encoding_type_t enc;
102
0
    const zip_uint8_t *name;
103
0
    zip_uint32_t i, j, ulen;
104
0
    bool can_be_ascii = true;
105
0
    bool can_be_utf8 = true;
106
0
    bool has_control_characters = false;
107
108
0
    if (str == NULL) {
109
0
        return ZIP_ENCODING_ASCII;
110
0
    }
111
112
0
    name = str->raw;
113
114
0
    if (str->encoding != ZIP_ENCODING_UNKNOWN) {
115
0
        return str->encoding;
116
0
    }
117
118
0
    for (i = 0; i < str->length; i++) {
119
0
        if (name[i] < 128) {
120
0
            if (name[i] < 32 && name[i] != '\r' && name[i] != '\n' && name[i] != '\t') {
121
0
                has_control_characters = true;
122
0
            }
123
0
            continue;
124
0
        }
125
126
0
        can_be_ascii = false;
127
0
        if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) {
128
0
            ulen = 1;
129
0
        }
130
0
        else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) {
131
0
            ulen = 2;
132
0
        }
133
0
        else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) {
134
0
            ulen = 3;
135
0
        }
136
0
        else {
137
0
            can_be_utf8 = false;
138
0
            break;
139
0
        }
140
141
0
        if (i + ulen >= str->length) {
142
0
            can_be_utf8 = false;
143
0
            break;
144
0
        }
145
146
0
        for (j = 1; j <= ulen; j++) {
147
0
            if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
148
0
                can_be_utf8 = false;
149
0
                goto done;
150
0
            }
151
0
        }
152
0
        i += ulen;
153
0
    }
154
155
0
 done:
156
0
    enc = ZIP_ENCODING_CP437;
157
158
0
    switch (expected_encoding) {
159
0
    case ZIP_ENCODING_UTF8_KNOWN:
160
0
    case ZIP_ENCODING_UTF8_GUESSED:
161
0
        if (can_be_utf8) {
162
0
            enc = ZIP_ENCODING_UTF8_KNOWN;
163
0
        }
164
0
        else {
165
0
            enc = ZIP_ENCODING_ERROR;
166
0
        }
167
0
        break;
168
169
0
    case ZIP_ENCODING_ASCII:
170
0
        if (can_be_ascii && !has_control_characters) {
171
0
            enc = ZIP_ENCODING_ASCII;
172
0
        }
173
0
        else {
174
0
            enc = ZIP_ENCODING_ERROR;
175
0
        }
176
0
        break;
177
178
0
    case ZIP_ENCODING_CP437:
179
0
        enc = ZIP_ENCODING_CP437;
180
0
        break;
181
182
0
    case ZIP_ENCODING_UNKNOWN:
183
0
        if (can_be_ascii && !has_control_characters) {
184
            /* only bytes from 0x20-0x7F */
185
0
            enc = ZIP_ENCODING_ASCII;
186
0
        }
187
0
        else if (can_be_ascii && has_control_characters) {
188
            /* only bytes from 0x00-0x7F */
189
0
            enc = ZIP_ENCODING_CP437;
190
0
        }
191
0
        else if (can_be_utf8) {
192
            /* contains bytes from 0x80-0xFF and is valid UTF-8 */
193
0
            enc =  ZIP_ENCODING_UTF8_GUESSED;
194
0
        }
195
0
        else {
196
            /* fallback */
197
0
            enc = ZIP_ENCODING_CP437;
198
0
        }
199
0
        break;
200
0
    case ZIP_ENCODING_ERROR:
201
        /* invalid, shouldn't happen */
202
0
        enc = ZIP_ENCODING_ERROR;
203
0
        break;
204
0
    }
205
206
0
    str->encoding = enc;
207
0
    return enc;
208
0
}
209
210
211
static zip_uint32_t
212
0
_zip_unicode_to_utf8_len(zip_uint32_t codepoint) {
213
0
    if (codepoint < 0x0080) {
214
0
        return 1;
215
0
    }
216
0
    if (codepoint < 0x0800) {
217
0
        return 2;
218
0
    }
219
0
    if (codepoint < 0x10000) {
220
0
        return 3;
221
0
    }
222
0
    return 4;
223
0
}
224
225
226
static zip_uint32_t
227
0
_zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf) {
228
0
    if (codepoint < 0x0080) {
229
0
        buf[0] = codepoint & 0xff;
230
0
        return 1;
231
0
    }
232
0
    if (codepoint < 0x0800) {
233
0
        buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
234
0
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
235
0
        return 2;
236
0
    }
237
0
    if (codepoint < 0x10000) {
238
0
        buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
239
0
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
240
0
        buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
241
0
        return 3;
242
0
    }
243
0
    buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
244
0
    buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
245
0
    buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
246
0
    buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
247
0
    return 4;
248
0
}
249
250
251
zip_uint8_t *
252
0
_zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error) {
253
0
    zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
254
0
    zip_uint8_t *utf8buf;
255
0
    zip_uint32_t buflen, i, offset;
256
257
0
    if (len == 0) {
258
0
        if (utf8_lenp) {
259
0
            *utf8_lenp = 0;
260
0
        }
261
0
        return NULL;
262
0
    }
263
264
0
    buflen = 1;
265
0
    for (i = 0; i < len; i++) {
266
0
        buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
267
0
    }
268
269
0
    if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
270
0
        zip_error_set(error, ZIP_ER_MEMORY, 0);
271
0
        return NULL;
272
0
    }
273
274
0
    offset = 0;
275
0
    for (i = 0; i < len; i++) {
276
0
        offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset);
277
0
    }
278
279
0
    utf8buf[buflen - 1] = 0;
280
0
    if (utf8_lenp) {
281
0
        *utf8_lenp = buflen - 1;
282
0
    }
283
0
    return utf8buf;
284
0
}