Coverage Report

Created: 2026-03-14 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libzip/lib/zip_utf-8.c
Line
Count
Source
1
/*
2
  zip_utf-8.c -- UTF-8 support functions for libzip
3
  Copyright (C) 2011-2024 Dieter Baron and Thomas Klausner
4
5
  This file is part of libzip, a library to manipulate ZIP archives.
6
  The authors can be contacted at <info@libzip.org>
7
8
  Redistribution and use in source and binary forms, with or without
9
  modification, are permitted provided that the following conditions
10
  are met:
11
  1. Redistributions of source code must retain the above copyright
12
     notice, this list of conditions and the following disclaimer.
13
  2. Redistributions in binary form must reproduce the above copyright
14
     notice, this list of conditions and the following disclaimer in
15
     the documentation and/or other materials provided with the
16
     distribution.
17
  3. The names of the authors may not be used to endorse or promote
18
     products derived from this software without specific prior
19
     written permission.
20
21
  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
*/
33
34
35
#include "zipint.h"
36
37
#include <stdlib.h>
38
39
40
/*
  Lookup table from CP437 byte value (the array index) to the 16-bit
  Unicode code point used for it by _zip_cp437_to_utf8().

  Indices 0x20-0x7E map to themselves; indices 0x01-0x1F and 0x7F map
  to symbol code points rather than to the identically numbered
  control codes.
*/
static const zip_uint16_t _cp437_to_unicode[256] = {
41
    /* 0x00 - 0x0F */
42
    0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
43
44
    /* 0x10 - 0x1F */
45
    0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
46
47
    /* 0x20 - 0x2F */
48
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
49
50
    /* 0x30 - 0x3F */
51
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
52
53
    /* 0x40 - 0x4F */
54
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
55
56
    /* 0x50 - 0x5F */
57
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
58
59
    /* 0x60 - 0x6F */
60
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
61
62
    /* 0x70 - 0x7F */
63
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
64
65
    /* 0x80 - 0x8F */
66
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
67
68
    /* 0x90 - 0x9F */
69
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
70
71
    /* 0xA0 - 0xAF */
72
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
73
74
    /* 0xB0 - 0xBF */
75
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
76
77
    /* 0xC0 - 0xCF */
78
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
79
80
    /* 0xD0 - 0xDF */
81
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
82
83
    /* 0xE0 - 0xEF */
84
    0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
85
86
    /* 0xF0 - 0xFF */
87
    0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0};
88
89
5.97k
/*
  Bit patterns for classifying UTF-8 bytes.  A byte b belongs to a class
  when (b & X_MASK) == X_MATCH.  LEN_2, LEN_3 and LEN_4 identify the
  lead byte of a two-, three- or four-byte sequence; CONTINUE identifies
  a continuation byte.  The MATCH values are also OR-ed with payload
  bits as byte prefixes when encoding.
*/
#define UTF_8_LEN_2_MASK 0xe0
90
27.0k
#define UTF_8_LEN_2_MATCH 0xc0
91
4.85k
#define UTF_8_LEN_3_MASK 0xf0
92
44.9k
#define UTF_8_LEN_3_MATCH 0xe0
93
2.97k
#define UTF_8_LEN_4_MASK 0xf8
94
2.97k
#define UTF_8_LEN_4_MATCH 0xf0
95
3.98k
#define UTF_8_CONTINUE_MASK 0xc0
96
105k
#define UTF_8_CONTINUE_MATCH 0x80
97
98
99
20.2k
/*
  Classify the encoding of STR and cache the verdict in str->encoding.

  A NULL string is reported as ZIP_ENCODING_ASCII.  A string whose
  encoding field is already something other than ZIP_ENCODING_UNKNOWN
  is not re-examined; the stored value is returned.

  Otherwise the raw bytes are scanned once to learn
    - whether every byte is below 0x80,
    - whether any byte below 0x20 other than CR, LF or TAB occurs,
    - whether all bytes at or above 0x80 form structurally valid UTF-8
      sequences (a 2-, 3- or 4-byte lead followed by the right number
      of continuation bytes; the decoded values are not range-checked).
  The scan stops at the first malformed sequence.

  EXPECTED_ENCODING then selects the result:
    - UTF8_KNOWN / UTF8_GUESSED: UTF8_KNOWN if the scan passed, else ERROR
    - ASCII:   ASCII if pure 7-bit without control characters, else ERROR
    - CP437:   always CP437
    - UNKNOWN: best guess among ASCII, CP437 and UTF8_GUESSED
    - ERROR:   ERROR
*/
zip_encoding_type_t _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding) {
    const zip_uint8_t *bytes;
    zip_uint32_t pos, k, trailing;
    zip_encoding_type_t result = ZIP_ENCODING_CP437;
    bool ascii_only = true;
    bool valid_utf8 = true;
    bool saw_control = false;

    if (str == NULL) {
        return ZIP_ENCODING_ASCII;
    }
    if (str->encoding != ZIP_ENCODING_UNKNOWN) {
        /* a verdict is already stored; reuse it */
        return str->encoding;
    }

    bytes = str->raw;
    pos = 0;
    while (valid_utf8 && pos < str->length) {
        const zip_uint8_t lead = bytes[pos];

        if (lead < 128) {
            if (lead < 32 && lead != '\r' && lead != '\n' && lead != '\t') {
                saw_control = true;
            }
            pos++;
            continue;
        }

        ascii_only = false;

        /* number of continuation bytes announced by the lead byte */
        if ((lead & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH) {
            trailing = 1;
        }
        else if ((lead & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH) {
            trailing = 2;
        }
        else if ((lead & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH) {
            trailing = 3;
        }
        else {
            /* stray continuation byte or unsupported lead byte */
            valid_utf8 = false;
            break;
        }

        if (pos + trailing >= str->length) {
            /* sequence would run past the end of the string */
            valid_utf8 = false;
            break;
        }

        for (k = 1; k <= trailing; k++) {
            if ((bytes[pos + k] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
                valid_utf8 = false;
                break;
            }
        }

        /* skip the whole sequence; the loop condition ends the scan if it was malformed */
        pos += trailing + 1;
    }

    switch (expected_encoding) {
    case ZIP_ENCODING_UTF8_KNOWN:
    case ZIP_ENCODING_UTF8_GUESSED:
        result = valid_utf8 ? ZIP_ENCODING_UTF8_KNOWN : ZIP_ENCODING_ERROR;
        break;

    case ZIP_ENCODING_ASCII:
        result = (ascii_only && !saw_control) ? ZIP_ENCODING_ASCII : ZIP_ENCODING_ERROR;
        break;

    case ZIP_ENCODING_CP437:
        /* accepted unconditionally; the initial value already says CP437 */
        break;

    case ZIP_ENCODING_UNKNOWN:
        if (ascii_only) {
            /* only bytes 0x00-0x7F: plain ASCII unless control characters appear */
            result = saw_control ? ZIP_ENCODING_CP437 : ZIP_ENCODING_ASCII;
        }
        else if (valid_utf8) {
            /* contains bytes from 0x80-0xFF and is valid UTF-8 */
            result = ZIP_ENCODING_UTF8_GUESSED;
        }
        else {
            /* fallback */
            result = ZIP_ENCODING_CP437;
        }
        break;

    case ZIP_ENCODING_ERROR:
        /* invalid, shouldn't happen */
        result = ZIP_ENCODING_ERROR;
        break;
    }

    str->encoding = result;
    return result;
}
208
209
210
166k
/*
  Return how many bytes the UTF-8 encoding of CODEPOINT occupies:
  1 below 0x80, 2 below 0x800, 3 below 0x10000, and 4 for everything
  else.  CODEPOINT is not range-checked.
*/
static zip_uint32_t _zip_unicode_to_utf8_len(zip_uint32_t codepoint) {
    zip_uint32_t nbytes = 4;

    /* tighten the answer step by step, from the widest form down */
    if (codepoint < 0x10000) {
        nbytes = 3;
    }
    if (codepoint < 0x0800) {
        nbytes = 2;
    }
    if (codepoint < 0x0080) {
        nbytes = 1;
    }

    return nbytes;
}
222
223
224
166k
/*
  Write the UTF-8 encoding of CODEPOINT to BUF and return the number
  of bytes written (1 to 4).  No terminating NUL is written, so BUF
  must have room for the returned number of bytes.

  CODEPOINT is not range-checked: any value of 0x10000 or more is
  emitted as a four-byte sequence built from its low 21 bits.
*/
static zip_uint32_t _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf) {
    zip_uint8_t *out = buf;

    if (codepoint < 0x0080) {
        /* single byte, value stored as is */
        *out++ = (zip_uint8_t)(codepoint & 0xff);
    }
    else if (codepoint < 0x0800) {
        /* 5 + 6 payload bits */
        *out++ = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
    }
    else if (codepoint < 0x10000) {
        /* 4 + 6 + 6 payload bits */
        *out++ = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
    }
    else {
        /* 3 + 6 + 6 + 6 payload bits */
        *out++ = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
        *out++ = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
    }

    return (zip_uint32_t)(out - buf);
}
246
247
248
1.79k
/*
  Convert LEN bytes of CP437 text at _CP437BUF into a newly allocated,
  NUL-terminated UTF-8 string.

  On success the malloc()ed buffer is returned; the caller owns it and
  must free() it.  If UTF8_LENP is not NULL, it receives the length of
  the UTF-8 text in bytes, not counting the terminating NUL.

  NULL is returned in these cases:
    - LEN is 0: *UTF8_LENP (if given) is set to 0 and ERROR is not
      touched, so this is not reported as an error.
    - The required buffer size does not fit in a zip_uint32_t, or
      malloc() fails: ERROR is set to ZIP_ER_MEMORY and *UTF8_LENP is
      left unchanged.
*/
zip_uint8_t *_zip_cp437_to_utf8(const zip_uint8_t *const _cp437buf, zip_uint32_t len, zip_uint32_t *utf8_lenp, zip_error_t *error) {
    const zip_uint32_t max_buflen = (zip_uint32_t)-1;
    zip_uint8_t *utf8buf;
    zip_uint32_t buflen, i, offset;

    if (len == 0) {
        if (utf8_lenp) {
            *utf8_lenp = 0;
        }
        return NULL;
    }

    /* First pass: size the output; start at 1 for the terminating NUL. */
    buflen = 1;
    for (i = 0; i < len; i++) {
        zip_uint32_t char_len = _zip_unicode_to_utf8_len(_cp437_to_unicode[_cp437buf[i]]);

        /* Refuse sizes that would wrap: a wrapped buflen would make the
           allocation too small for what the second pass writes. */
        if (char_len > max_buflen - buflen) {
            zip_error_set(error, ZIP_ER_MEMORY, 0);
            return NULL;
        }
        buflen += char_len;
    }

    if ((utf8buf = (zip_uint8_t *)malloc(buflen)) == NULL) {
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }

    /* Second pass: encode each character directly into the buffer. */
    offset = 0;
    for (i = 0; i < len; i++) {
        offset += _zip_unicode_to_utf8(_cp437_to_unicode[_cp437buf[i]], utf8buf + offset);
    }

    utf8buf[buflen - 1] = 0;
    if (utf8_lenp) {
        *utf8_lenp = buflen - 1;
    }
    return utf8buf;
}