Coverage Report

Created: 2025-12-31 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libzip/lib/zip_utf-8.c
Line
Count
Source
1
/*
2
  zip_utf-8.c -- UTF-8 support functions for libzip
3
  Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner
4
5
  This file is part of libzip, a library to manipulate ZIP archives.
6
  The authors can be contacted at <info@libzip.org>
7
8
  Redistribution and use in source and binary forms, with or without
9
  modification, are permitted provided that the following conditions
10
  are met:
11
  1. Redistributions of source code must retain the above copyright
12
     notice, this list of conditions and the following disclaimer.
13
  2. Redistributions in binary form must reproduce the above copyright
14
     notice, this list of conditions and the following disclaimer in
15
     the documentation and/or other materials provided with the
16
     distribution.
17
  3. The names of the authors may not be used to endorse or promote
18
     products derived from this software without specific prior
19
     written permission.
20
21
  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
*/
33
34
35
#include "zipint.h"
36
37
#include <stdlib.h>
38
39
40
/*
  Lookup table indexed by a CP437 byte value (0x00-0xFF); each entry is
  the Unicode code point that byte is converted to.  Entries 0x20-0x7E
  map to themselves.  Every entry fits in 16 bits.
*/
static const zip_uint16_t _cp437_to_unicode[256] = {
    /* 0x00 - 0x07 */ 0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    /* 0x08 - 0x0F */ 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,

    /* 0x10 - 0x17 */ 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
    /* 0x18 - 0x1F */ 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,

    /* 0x20 - 0x27 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    /* 0x28 - 0x2F */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,

    /* 0x30 - 0x37 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0x38 - 0x3F */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    /* 0x40 - 0x47 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0x48 - 0x4F */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,

    /* 0x50 - 0x57 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    /* 0x58 - 0x5F */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,

    /* 0x60 - 0x67 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x68 - 0x6F */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,

    /* 0x70 - 0x77 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    /* 0x78 - 0x7F */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,

    /* 0x80 - 0x87 */ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    /* 0x88 - 0x8F */ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,

    /* 0x90 - 0x97 */ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    /* 0x98 - 0x9F */ 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,

    /* 0xA0 - 0xA7 */ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    /* 0xA8 - 0xAF */ 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,

    /* 0xB0 - 0xB7 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    /* 0xB8 - 0xBF */ 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,

    /* 0xC0 - 0xC7 */ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
    /* 0xC8 - 0xCF */ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,

    /* 0xD0 - 0xD7 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
    /* 0xD8 - 0xDF */ 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,

    /* 0xE0 - 0xE7 */ 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
    /* 0xE8 - 0xEF */ 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,

    /* 0xF0 - 0xF7 */ 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
    /* 0xF8 - 0xFF */ 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0};
88
89
11.4k
/*
  Bit patterns used to classify UTF-8 bytes: a byte `b` belongs to a
  class when (b & <class>_MASK) == <class>_MATCH.
*/

/* lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MASK     0xe0
#define UTF_8_LEN_2_MATCH    0xc0

/* lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MASK     0xf0
#define UTF_8_LEN_3_MATCH    0xe0

/* lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MASK     0xf8
#define UTF_8_LEN_4_MATCH    0xf0

/* continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MASK  0xc0
#define UTF_8_CONTINUE_MATCH 0x80
97
98
99
zip_encoding_type_t
100
43.7k
_zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) {
101
43.7k
    zip_encoding_type_t enc;
102
43.7k
    const zip_uint8_t *name;
103
43.7k
    zip_uint32_t i, j, ulen;
104
43.7k
    bool can_be_ascii = true;
105
43.7k
    bool can_be_utf8 = true;
106
43.7k
    bool has_control_characters = false;
107
108
43.7k
    if (str == NULL) {
109
2.80k
        return ZIP_ENCODING_ASCII;
110
2.80k
    }
111
112
40.9k
    name = str->raw;
113
114
40.9k
    if (str->encoding != ZIP_ENCODING_UNKNOWN) {
115
2.80k
        return str->encoding;
116
2.80k
    }
117
118
554k
    for (i = 0; i < str->length; i++) {
119
525k
        if (name[i] < 128) {
120
513k
            if (name[i] < 32 && name[i] != '\r' && name[i] != '\n' && name[i] != '\t') {
121
56.8k
                has_control_characters = true;
122
56.8k
            }
123
513k
            continue;
124
513k
        }
125
126
11.4k
        can_be_ascii = false;
127
11.4k
        if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) {
128
1.45k
            ulen = 1;
129
1.45k
        }
130
10.0k
        else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) {
131
5.21k
            ulen = 2;
132
5.21k
        }
133
4.82k
        else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) {
134
1.17k
            ulen = 3;
135
1.17k
        }
136
3.64k
        else {
137
3.64k
            can_be_utf8 = false;
138
3.64k
            break;
139
3.64k
        }
140
141
7.84k
        if (i + ulen >= str->length) {
142
4.14k
            can_be_utf8 = false;
143
4.14k
            break;
144
4.14k
        }
145
146
9.12k
        for (j = 1; j <= ulen; j++) {
147
6.45k
            if ((name[i + j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
148
1.03k
                can_be_utf8 = false;
149
1.03k
                goto done;
150
1.03k
            }
151
6.45k
        }
152
2.67k
        i += ulen;
153
2.67k
    }
154
155
38.1k
 done:
156
38.1k
    enc = ZIP_ENCODING_CP437;
157
158
38.1k
    switch (expected_encoding) {
159
22.7k
    case ZIP_ENCODING_UTF8_KNOWN:
160
22.7k
    case ZIP_ENCODING_UTF8_GUESSED:
161
22.7k
        if (can_be_utf8) {
162
16.1k
            enc = ZIP_ENCODING_UTF8_KNOWN;
163
16.1k
        }
164
6.57k
        else {
165
6.57k
            enc = ZIP_ENCODING_ERROR;
166
6.57k
        }
167
22.7k
        break;
168
169
0
    case ZIP_ENCODING_ASCII:
170
0
        if (can_be_ascii && !has_control_characters) {
171
0
            enc = ZIP_ENCODING_ASCII;
172
0
        }
173
0
        else {
174
0
            enc = ZIP_ENCODING_ERROR;
175
0
        }
176
0
        break;
177
178
0
    case ZIP_ENCODING_CP437:
179
0
        enc = ZIP_ENCODING_CP437;
180
0
        break;
181
182
15.4k
    case ZIP_ENCODING_UNKNOWN:
183
15.4k
        if (can_be_ascii && !has_control_characters) {
184
            /* only bytes from 0x20-0x7F */
185
12.4k
            enc = ZIP_ENCODING_ASCII;
186
12.4k
        }
187
3.02k
        else if (can_be_ascii && has_control_characters) {
188
            /* only bytes from 0x00-0x7F */
189
687
            enc = ZIP_ENCODING_CP437;
190
687
        }
191
2.33k
        else if (can_be_utf8) {
192
            /* contains bytes from 0x80-0xFF and is valid UTF-8 */
193
81
            enc =  ZIP_ENCODING_UTF8_GUESSED;
194
81
        }
195
2.25k
        else {
196
            /* fallback */
197
2.25k
            enc = ZIP_ENCODING_CP437;
198
2.25k
        }
199
15.4k
        break;
200
0
    case ZIP_ENCODING_ERROR:
201
        /* invalid, shouldn't happen */
202
0
        enc = ZIP_ENCODING_ERROR;
203
0
        break;
204
38.1k
    }
205
206
38.1k
    str->encoding = enc;
207
38.1k
    return enc;
208
38.1k
}
209
210
211
static zip_uint32_t
212
175k
_zip_unicode_to_utf8_len(zip_uint32_t codepoint) {
213
175k
    if (codepoint < 0x0080) {
214
117k
        return 1;
215
117k
    }
216
58.5k
    if (codepoint < 0x0800) {
217
21.6k
        return 2;
218
21.6k
    }
219
36.8k
    if (codepoint < 0x10000) {
220
36.8k
        return 3;
221
36.8k
    }
222
0
    return 4;
223
36.8k
}
224
225
226
static zip_uint32_t
227
175k
_zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf) {
228
175k
    if (codepoint < 0x0080) {
229
117k
        buf[0] = codepoint & 0xff;
230
117k
        return 1;
231
117k
    }
232
58.5k
    if (codepoint < 0x0800) {
233
21.6k
        buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
234
21.6k
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
235
21.6k
        return 2;
236
21.6k
    }
237
36.8k
    if (codepoint < 0x10000) {
238
36.8k
        buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
239
36.8k
        buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
240
36.8k
        buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
241
36.8k
        return 3;
242
36.8k
    }
243
0
    buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
244
0
    buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
245
0
    buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
246
0
    buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
247
0
    return 4;
248
36.8k
}
249
250
251
zip_uint8_t *
252
2.93k
_zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error) {
253
2.93k
    zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
254
2.93k
    zip_uint8_t *utf8buf;
255
2.93k
    zip_uint32_t buflen, i, offset;
256
257
2.93k
    if (len == 0) {
258
0
        if (utf8_lenp) {
259
0
            *utf8_lenp = 0;
260
0
        }
261
0
        return NULL;
262
0
    }
263
264
2.93k
    buflen = 1;
265
178k
    for (i = 0; i < len; i++) {
266
175k
        buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
267
175k
    }
268
269
2.93k
    if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
270
0
        zip_error_set(error, ZIP_ER_MEMORY, 0);
271
0
        return NULL;
272
0
    }
273
274
2.93k
    offset = 0;
275
178k
    for (i = 0; i < len; i++) {
276
175k
        offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]], utf8buf + offset);
277
175k
    }
278
279
2.93k
    utf8buf[buflen - 1] = 0;
280
2.93k
    if (utf8_lenp) {
281
2.93k
        *utf8_lenp = buflen - 1;
282
2.93k
    }
283
2.93k
    return utf8buf;
284
2.93k
}