Coverage Report

Created: 2025-12-14 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libzip/lib/zip_utf-8.c
Line
Count
Source
1
/*
2
  zip_utf-8.c -- UTF-8 support functions for libzip
3
  Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner
4
5
  This file is part of libzip, a library to manipulate ZIP archives.
6
  The authors can be contacted at <info@libzip.org>
7
8
  Redistribution and use in source and binary forms, with or without
9
  modification, are permitted provided that the following conditions
10
  are met:
11
  1. Redistributions of source code must retain the above copyright
12
     notice, this list of conditions and the following disclaimer.
13
  2. Redistributions in binary form must reproduce the above copyright
14
     notice, this list of conditions and the following disclaimer in
15
     the documentation and/or other materials provided with the
16
     distribution.
17
  3. The names of the authors may not be used to endorse or promote
18
     products derived from this software without specific prior
19
     written permission.
20
21
  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
*/
33
34
35
#include "zipint.h"
36
37
#include <stdlib.h>
38
39
40
/*
 * Lookup table indexed by a CP437 byte value (0x00-0xFF) giving the
 * corresponding Unicode code point.  Entries 0x20-0x7E are identical to
 * ASCII.  Every entry is a 16-bit value, i.e. below 0x10000.
 */
static const zip_uint16_t _cp437_to_unicode[256] = {
41
    /* 0x00 - 0x0F */
42
    0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
43
44
    /* 0x10 - 0x1F */
45
    0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
46
47
    /* 0x20 - 0x2F */
48
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
49
50
    /* 0x30 - 0x3F */
51
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
52
53
    /* 0x40 - 0x4F */
54
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
55
56
    /* 0x50 - 0x5F */
57
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
58
59
    /* 0x60 - 0x6F */
60
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
61
62
    /* 0x70 - 0x7F */
63
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
64
65
    /* 0x80 - 0x8F */
66
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
67
68
    /* 0x90 - 0x9F */
69
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
70
71
    /* 0xA0 - 0xAF */
72
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
73
74
    /* 0xB0 - 0xBF */
75
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
76
77
    /* 0xC0 - 0xCF */
78
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
79
80
    /* 0xD0 - 0xDF */
81
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
82
83
    /* 0xE0 - 0xEF */
84
    0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
85
86
    /* 0xF0 - 0xFF */
87
    0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0};
88
89
3.82k
/*
 * Bit patterns used to classify UTF-8 bytes.  A byte b is the lead byte of
 * an N-byte sequence when (b & UTF_8_LEN_N_MASK) == UTF_8_LEN_N_MATCH, and
 * a continuation byte when (b & UTF_8_CONTINUE_MASK) == UTF_8_CONTINUE_MATCH.
 * The *_MATCH values also serve as the fixed high bits when encoding.
 */
#define UTF_8_LEN_2_MASK 0xe0
90
15.0k
#define UTF_8_LEN_2_MATCH 0xc0
91
3.13k
#define UTF_8_LEN_3_MASK 0xf0
92
35.3k
#define UTF_8_LEN_3_MATCH 0xe0
93
2.56k
#define UTF_8_LEN_4_MASK 0xf8
94
2.56k
#define UTF_8_LEN_4_MATCH 0xf0
95
2.47k
#define UTF_8_CONTINUE_MASK 0xc0
96
78.1k
#define UTF_8_CONTINUE_MATCH 0x80
97
98
99
zip_encoding_type_t
100
15.9k
_zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) {
101
15.9k
    zip_encoding_type_t enc;
102
15.9k
    const zip_uint8_t *name;
103
15.9k
    zip_uint32_t i, j, ulen;
104
15.9k
    bool can_be_ascii = true;
105
15.9k
    bool can_be_utf8 = true;
106
15.9k
    bool has_control_characters = false;
107
108
15.9k
    if (str == NULL) {
109
0
        return ZIP_ENCODING_ASCII;
110
0
    }
111
112
15.9k
    name = str->raw;
113
114
15.9k
    if (str->encoding != ZIP_ENCODING_UNKNOWN) {
115
0
        return str->encoding;
116
0
    }
117
118
170k
    for (i = 0; i < str->length; i++) {
119
157k
        if (name[i] < 128) {
120
153k
            if (name[i] < 32 && name[i] != '\r' && name[i] != '\n' && name[i] != '\t') {
121
31.2k
                has_control_characters = true;
122
31.2k
            }
123
153k
            continue;
124
153k
        }
125
126
3.82k
        can_be_ascii = false;
127
3.82k
        if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) {
128
692
            ulen = 1;
129
692
        }
130
3.13k
        else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) {
131
569
            ulen = 2;
132
569
        }
133
2.56k
        else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) {
134
556
            ulen = 3;
135
556
        }
136
2.00k
        else {
137
2.00k
            can_be_utf8 = false;
138
2.00k
            break;
139
2.00k
        }
140
141
1.81k
        if (i + ulen >= str->length) {
142
282
            can_be_utf8 = false;
143
282
            break;
144
282
        }
145
146
3.42k
        for (j = 1; j <= ulen; j++) {
147
2.47k
            if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
148
578
                can_be_utf8 = false;
149
578
                goto done;
150
578
            }
151
2.47k
        }
152
957
        i += ulen;
153
957
    }
154
155
15.9k
 done:
156
15.9k
    enc = ZIP_ENCODING_CP437;
157
158
15.9k
    switch (expected_encoding) {
159
12.1k
    case ZIP_ENCODING_UTF8_KNOWN:
160
12.1k
    case ZIP_ENCODING_UTF8_GUESSED:
161
12.1k
        if (can_be_utf8) {
162
10.5k
            enc = ZIP_ENCODING_UTF8_KNOWN;
163
10.5k
        }
164
1.63k
        else {
165
1.63k
            enc = ZIP_ENCODING_ERROR;
166
1.63k
        }
167
12.1k
        break;
168
169
0
    case ZIP_ENCODING_ASCII:
170
0
        if (can_be_ascii && !has_control_characters) {
171
0
            enc = ZIP_ENCODING_ASCII;
172
0
        }
173
0
        else {
174
0
            enc = ZIP_ENCODING_ERROR;
175
0
        }
176
0
        break;
177
178
0
    case ZIP_ENCODING_CP437:
179
0
        enc = ZIP_ENCODING_CP437;
180
0
        break;
181
182
3.76k
    case ZIP_ENCODING_UNKNOWN:
183
3.76k
        if (can_be_ascii && !has_control_characters) {
184
            /* only bytes from 0x20-0x7F */
185
2.17k
            enc = ZIP_ENCODING_ASCII;
186
2.17k
        }
187
1.58k
        else if (can_be_ascii && has_control_characters) {
188
            /* only bytes from 0x00-0x7F */
189
326
            enc = ZIP_ENCODING_CP437;
190
326
        }
191
1.26k
        else if (can_be_utf8) {
192
            /* contains bytes from 0x80-0xFF and is valid UTF-8 */
193
30
            enc =  ZIP_ENCODING_UTF8_GUESSED;
194
30
        }
195
1.23k
        else {
196
            /* fallback */
197
1.23k
            enc = ZIP_ENCODING_CP437;
198
1.23k
        }
199
3.76k
        break;
200
0
    case ZIP_ENCODING_ERROR:
201
        /* invalid, shouldn't happen */
202
0
        enc = ZIP_ENCODING_ERROR;
203
0
        break;
204
15.9k
    }
205
206
15.9k
    str->encoding = enc;
207
15.9k
    return enc;
208
15.9k
}
209
210
211
static zip_uint32_t
212
128k
_zip_unicode_to_utf8_len(zip_uint32_t codepoint) {
213
128k
    if (codepoint < 0x0080) {
214
85.3k
        return 1;
215
85.3k
    }
216
43.4k
    if (codepoint < 0x0800) {
217
11.2k
        return 2;
218
11.2k
    }
219
32.2k
    if (codepoint < 0x10000) {
220
32.2k
        return 3;
221
32.2k
    }
222
0
    return 4;
223
32.2k
}
224
225
226
static zip_uint32_t
227
128k
_zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf) {
228
128k
    if (codepoint < 0x0080) {
229
85.3k
        buf[0] = codepoint & 0xff;
230
85.3k
        return 1;
231
85.3k
    }
232
43.4k
    if (codepoint < 0x0800) {
233
11.2k
        buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
234
11.2k
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
235
11.2k
        return 2;
236
11.2k
    }
237
32.2k
    if (codepoint < 0x10000) {
238
32.2k
        buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
239
32.2k
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
240
32.2k
        buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
241
32.2k
        return 3;
242
32.2k
    }
243
0
    buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
244
0
    buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
245
0
    buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
246
0
    buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
247
0
    return 4;
248
32.2k
}
249
250
251
zip_uint8_t *
252
1.55k
_zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error) {
253
1.55k
    zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
254
1.55k
    zip_uint8_t *utf8buf;
255
1.55k
    zip_uint32_t buflen, i, offset;
256
257
1.55k
    if (len == 0) {
258
0
        if (utf8_lenp) {
259
0
            *utf8_lenp = 0;
260
0
        }
261
0
        return NULL;
262
0
    }
263
264
1.55k
    buflen = 1;
265
130k
    for (i = 0; i < len; i++) {
266
128k
        buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
267
128k
    }
268
269
1.55k
    if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
270
0
        zip_error_set(error, ZIP_ER_MEMORY, 0);
271
0
        return NULL;
272
0
    }
273
274
1.55k
    offset = 0;
275
130k
    for (i = 0; i < len; i++) {
276
128k
        offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset);
277
128k
    }
278
279
1.55k
    utf8buf[buflen - 1] = 0;
280
1.55k
    if (utf8_lenp) {
281
1.55k
        *utf8_lenp = buflen - 1;
282
1.55k
    }
283
1.55k
    return utf8buf;
284
1.55k
}