/src/wget2/libwget/encoding.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2012-2015 Tim Ruehsen |
3 | | * Copyright (c) 2015-2024 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * a collection of charset encoding routines |
22 | | * |
23 | | * Changelog |
24 | | * 02.10.2013 Tim Ruehsen created |
25 | | * |
26 | | */ |
27 | | |
28 | | #include <config.h> |
29 | | |
30 | | #include <string.h> |
31 | | #include <errno.h> |
32 | | |
33 | | #ifdef HAVE_ICONV |
34 | | # include <iconv.h> |
35 | | #endif |
36 | | |
37 | | #include <langinfo.h> |
38 | | |
39 | | #if defined HAVE_IDN2_H && defined WITH_LIBIDN2 |
40 | | # include <idn2.h> |
41 | | #elif defined HAVE_IDNA_H && defined WITH_LIBIDN |
42 | | # include <idna.h> |
43 | | # ifdef _WIN32 |
44 | | # include <idn-free.h> |
45 | | # endif |
46 | | #elif defined HAVE_IDN_IDNA_H && defined WITH_LIBIDN |
47 | | // OpenSolaris uses the idn subdir |
48 | | # include <idn/idna.h> |
49 | | #endif |
50 | | |
51 | | #include <wget.h> |
52 | | #include "private.h" |
53 | | |
/*
 * Return a freshly duplicated (wget_strdup) name of the locale's character
 * encoding as reported by nl_langinfo(CODESET).
 * If that name is NULL or empty, a duplicate of "ASCII" is returned instead.
 */
const char *wget_local_charset_encoding(void)
{
	const char *codeset = nl_langinfo(CODESET);

	return wget_strdup((codeset && codeset[0] != '\0') ? codeset : "ASCII");
}
63 | | |
64 | | // void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding) |
65 | | int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen) |
66 | 0 | { |
67 | 0 | if (!src) |
68 | 0 | return WGET_E_INVALID; |
69 | | |
70 | 0 | #ifdef HAVE_ICONV |
71 | 0 | if (!src_encoding) |
72 | 0 | src_encoding = "iso-8859-1"; // default character-set for most browsers |
73 | 0 | if (!dst_encoding) |
74 | 0 | dst_encoding = "iso-8859-1"; // default character-set for most browsers |
75 | |
|
76 | 0 | if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) { |
77 | 0 | int ret = WGET_E_UNKNOWN; |
78 | 0 | iconv_t cd = iconv_open(dst_encoding, src_encoding); |
79 | |
|
80 | 0 | if (cd != (iconv_t)-1) { |
81 | 0 | char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself |
82 | 0 | size_t tmp_len = srclen; |
83 | 0 | size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len; |
84 | 0 | char *dst = wget_malloc(dst_len + 1), *dst_tmp = dst; |
85 | |
|
86 | 0 | if (!dst) { |
87 | 0 | iconv_close(cd); |
88 | 0 | return WGET_E_MEMORY; |
89 | 0 | } |
90 | | |
91 | 0 | errno = 0; |
92 | 0 | if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) == 0 |
93 | 0 | && iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) == 0) |
94 | 0 | { |
95 | 0 | debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding); |
96 | 0 | if (out) { |
97 | | // here we reduce the allocated memory size, if it fails we use the original memory chunk |
98 | 0 | tmp = wget_realloc(dst, dst_len - dst_len_tmp + 1); |
99 | 0 | if (!tmp) |
100 | 0 | tmp = dst; |
101 | 0 | tmp[dst_len - dst_len_tmp] = 0; |
102 | 0 | *out = tmp; |
103 | 0 | } else |
104 | 0 | xfree(dst); |
105 | |
|
106 | 0 | if (outlen) |
107 | 0 | *outlen = dst_len - dst_len_tmp; |
108 | |
|
109 | 0 | ret = WGET_E_SUCCESS; |
110 | 0 | } else { |
111 | | // erno == 0 means some codepoints were encoded non-reversible, treat as error |
112 | 0 | error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno); |
113 | 0 | xfree(dst); |
114 | |
|
115 | 0 | if (out) |
116 | 0 | *out = NULL; |
117 | |
|
118 | 0 | if (outlen) |
119 | 0 | *outlen = 0; |
120 | 0 | } |
121 | |
|
122 | 0 | iconv_close(cd); |
123 | 0 | } else |
124 | 0 | error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno); |
125 | | |
126 | 0 | return ret; |
127 | 0 | } |
128 | 0 | #endif |
129 | | |
130 | 0 | if (out) |
131 | 0 | *out = wget_strmemdup(src, srclen); |
132 | |
|
133 | 0 | if (outlen) |
134 | 0 | *outlen = srclen; |
135 | |
|
136 | 0 | return WGET_E_SUCCESS; |
137 | 0 | } |
138 | | |
/*
 * Transcode the 0-terminated string `src` from `src_encoding` into
 * `dst_encoding` using wget_memiconv().
 * src must be an ASCII compatible C string (its length is taken with strlen()).
 *
 * Returns the converted string, or NULL if src is NULL or wget_memiconv()
 * reports a non-zero status.
 */
char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
{
	char *converted;
	int rc;

	if (!src)
		return NULL;

	rc = wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &converted, NULL);
	if (rc != 0)
		return NULL;

	return converted;
}
151 | | |
/*
 * Tell whether the C string `s` contains at least one byte outside the
 * 7-bit ASCII range. A NULL pointer or an all-ASCII string yields false.
 */
bool wget_str_needs_encoding(const char *s)
{
	if (s) {
		for (; *s; s++) {
			// any bit above the low seven marks a non-ASCII byte
			if ((*s & ~0x7f) != 0)
				return true;
		}
	}

	return false;
}
161 | | |
/*
 * Check whether the C string `utf8` is a well-formed UTF-8 byte sequence
 * as defined by RFC 3629.
 *
 * Besides the lead/continuation byte structure this rejects
 *  - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 0x80..0x9F, 0xF0 0x80..0x8F),
 *  - UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF),
 *  - code points above U+10FFFF (0xF4 0x90.. and lead bytes 0xF5..0xFF).
 *
 * Returns true for a valid (possibly empty) string, false for NULL or any
 * malformed sequence. The terminating 0 byte never matches a continuation
 * byte, so a truncated sequence is rejected without reading past the end.
 */
bool wget_str_is_valid_utf8(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	if (!s)
		return false;

	while (*s) {
		if (*s < 0x80) {
			/* 0xxxxxxx ASCII char */
			s++;
		} else if (*s >= 0xC2 && *s <= 0xDF) {
			/* 110xxxxx 10xxxxxx (0xC0 and 0xC1 would be overlong) */
			if ((s[1] & 0xC0) != 0x80)
				return false;
			s += 2;
		} else if ((*s & 0xF0) == 0xE0) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			unsigned char lo = (*s == 0xE0) ? 0xA0 : 0x80; // exclude overlong forms
			unsigned char hi = (*s == 0xED) ? 0x9F : 0xBF; // exclude surrogates

			if (s[1] < lo || s[1] > hi || (s[2] & 0xC0) != 0x80)
				return false;
			s += 3;
		} else if (*s >= 0xF0 && *s <= 0xF4) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			unsigned char lo = (*s == 0xF0) ? 0x90 : 0x80; // exclude overlong forms
			unsigned char hi = (*s == 0xF4) ? 0x8F : 0xBF; // exclude > U+10FFFF

			if (s[1] < lo || s[1] > hi || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
				return false;
			s += 4;
		} else
			return false;
	}

	return true;
}
190 | | |
/*
 * Transcode the C string `src` from `encoding` into "utf-8".
 * Thin wrapper around wget_striconv(); its result is returned unchanged.
 */
char *wget_str_to_utf8(const char *src, const char *encoding)
{
	char *utf8 = wget_striconv(src, encoding, "utf-8");

	return utf8;
}
195 | | |
/*
 * Transcode the C string `src` from "utf-8" into `encoding`.
 * Thin wrapper around wget_striconv(); its result is returned unchanged.
 */
char *wget_utf8_to_str(const char *src, const char *encoding)
{
	char *str = wget_striconv(src, "utf-8", encoding);

	return str;
}
200 | | |
201 | | #ifdef WITH_LIBIDN |
202 | | /* |
203 | | * Work around a libidn <= 1.30 vulnerability. |
204 | | * |
205 | | * The function checks for a valid UTF-8 character sequence before |
206 | | * passing it to idna_to_ascii_8z(). |
207 | | * |
208 | | * [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html |
209 | | * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html |
210 | | * [3] https://curl.haxx.se/mail/lib-2015-06/0143.html |
211 | | */ |
212 | | static int WGET_GCC_PURE _utf8_is_valid(const char *utf8) |
213 | | { |
214 | | const unsigned char *s = (const unsigned char *) utf8; |
215 | | |
216 | | while (*s) { |
217 | | if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */ |
218 | | s++; |
219 | | else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ { |
220 | | if ((s[1] & 0xC0) != 0x80) |
221 | | return 0; |
222 | | s += 2; |
223 | | } else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ { |
224 | | if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80) |
225 | | return 0; |
226 | | s += 3; |
227 | | } else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ { |
228 | | if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80) |
229 | | return 0; |
230 | | s += 4; |
231 | | } else |
232 | | return 0; |
233 | | } |
234 | | |
235 | | return 1; |
236 | | } |
237 | | #endif |
238 | | |
/*
 * Convert a hostname containing non-ASCII bytes into its ASCII (IDNA) form.
 *
 * As we convert hostnames, the STD3 ASCII rules are applied
 * (IDN2_USE_STD3_ASCII_RULES / IDNA_USE_STD3_ASCII_RULES). Without them the
 * result could contain any ASCII character, e.g. 'evil.c\u2100.example.com'
 * would become 'evil.ca/c.example.com', which seems no good idea.
 *
 * Returns `src` itself if it needs no encoding, if the conversion fails or if
 * no IDN library is compiled in. Otherwise a newly allocated string is
 * returned: the library's buffer, or on _WIN32 a wget_strdup() copy of it.
 */
const char *wget_str_to_ascii(const char *src)
{
	// Nothing to do for NULL or plain ASCII input.
	if (!wget_str_needs_encoding(src))
		return src;

#ifdef WITH_LIBIDN2
	char *ascii = NULL;
	int rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_NONTRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

	// Retry with transitional processing if the non-transitional lookup failed.
	if (rc != IDN2_OK)
		rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_TRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

	if (rc != IDN2_OK) {
		error_printf(_("toASCII(%s) failed (%d): %s\n"), src, rc, idn2_strerror(rc));
		return src;
	}

	debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
# ifdef _WIN32
	// Hand out a wget_strdup() copy and release the library buffer via idn2_free().
	src = wget_strdup(ascii);
	idn2_free(ascii);
	return src;
# else
	return ascii;
# endif
#elif defined WITH_LIBIDN
	// Only pass checked UTF-8 on to idna_to_ascii_8z().
	if (!_utf8_is_valid(src)) {
		error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
		return src;
	}

	char *ascii = NULL;
	// idna_to_ascii_8z() automatically converts UTF-8 to lowercase
	int rc = idna_to_ascii_8z(src, &ascii, IDNA_USE_STD3_ASCII_RULES);

	if (rc != IDNA_SUCCESS) {
		error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));
		return src;
	}

# ifdef _WIN32
	// Hand out a wget_strdup() copy and release the library buffer via idn_free().
	src = wget_strdup(ascii);
	idn_free(ascii);
	return src;
# else
	return ascii;
# endif
#else
	error_printf(_("toASCII not available: '%s'\n"), src);
	return src;
#endif
}