/src/libunistring/lib/striconveha.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Character set conversion with error handling and autodetection. |
2 | | Copyright (C) 2002, 2005, 2007, 2009-2024 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | #include <config.h> |
19 | | |
20 | | /* Specification. */ |
21 | | #include "striconveha.h" |
22 | | |
23 | | #include <errno.h> |
24 | | #include <stdlib.h> |
25 | | #include <string.h> |
26 | | |
27 | | #include "malloca.h" |
28 | | #include "c-strcase.h" |
29 | | #include "striconveh.h" |
30 | | |
31 | | #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) |
32 | | |
33 | | |
34 | | /* Autodetection list. */ |
35 | | |
/* One node of the singly linked list of autodetection aliases.
   'next' points to the following node (NULL in the last node), 'name' is
   the alias string that is compared against the caller's from_codeset, and
   'encodings_to_try' is a NULL-terminated array of encoding names that are
   tried in order. */
36 | | struct autodetect_alias |
37 | | { |
38 | | struct autodetect_alias *next; |
39 | | const char *name; |
40 | | const char * const *encodings_to_try; |
41 | | }; |
42 | | |
/* NULL-terminated candidate list referenced by the predefined
   "autodetect_utf8" alias. */
43 | | static const char * const autodetect_utf8_try[] = |
44 | | { |
45 | | /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would |
46 | | be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */ |
47 | | "UTF-8", "ISO-8859-1", |
48 | | NULL |
49 | | }; |
/* NULL-terminated candidate list referenced by the predefined
   "autodetect_jp" alias. */
50 | | static const char * const autodetect_jp_try[] = |
51 | | { |
52 | | /* Try 7-bit encoding first. If the input contains bytes >= 0x80, |
53 | | it will fail. |
54 | | Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This |
55 | | is unavoidable. People will condemn SHIFT_JIS. |
56 | | If we tried SHIFT_JIS first, then some short EUC-JP inputs would |
57 | | come out wrong, and people would condemn EUC-JP and Unix, which |
58 | | would not be good. |
59 | | Finally try SHIFT_JIS. */ |
60 | | "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS", |
61 | | NULL |
62 | | }; |
/* NULL-terminated candidate list referenced by the predefined
   "autodetect_kr" alias. */
63 | | static const char * const autodetect_kr_try[] = |
64 | | { |
65 | | /* Try 7-bit encoding first. If the input contains bytes >= 0x80, |
66 | | it will fail. |
67 | | Finally try EUC-KR. */ |
68 | | "ISO-2022-KR", "EUC-KR", |
69 | | NULL |
70 | | }; |
71 | | |
/* The built-in aliases.  Each initializer stores the address of the next
   array element in its 'next' field, so the array itself forms the initial
   linked list; the last element's 'next' is NULL. */
72 | | static struct autodetect_alias autodetect_predefined[] = |
73 | | { |
74 | | { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try }, |
75 | | { &autodetect_predefined[2], "autodetect_jp", autodetect_jp_try }, |
76 | | { NULL, "autodetect_kr", autodetect_kr_try } |
77 | | }; |
78 | | |
/* autodetect_list is the head of the alias list.  autodetect_list_end is the
   address of the 'next' field of the list's last node (initially the last
   predefined entry); uniconv_register_autodetect appends a node by storing
   through it and then pointing it at the new node's 'next' field. */
79 | | static struct autodetect_alias *autodetect_list = &autodetect_predefined[0]; |
80 | | static struct autodetect_alias **autodetect_list_end = |
81 | | &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next; |
82 | | |
/* Registers NAME as an autodetection alias whose candidate encodings are the
   strings of the NULL-terminated array TRY_IN_ORDER.
   Returns 0 on success.  Returns -1 with errno = EINVAL if TRY_IN_ORDER is
   empty, or -1 with errno = ENOMEM if the allocation fails.
   NAME and every string of TRY_IN_ORDER are copied, so the caller's storage
   need not outlive the call.  NAME and TRY_IN_ORDER are dereferenced
   unconditionally and therefore must not be NULL.
   The new alias is appended at the end of the list and no check is made
   whether NAME is already registered; since the lookups in this file scan
   from the head and stop at the first match, an earlier entry with the same
   name takes precedence. */
83 | | int |
84 | | uniconv_register_autodetect (const char *name, |
85 | | const char * const *try_in_order) |
86 | 0 | { |
87 | 0 | size_t namelen; |
88 | 0 | size_t listlen; |
89 | 0 | size_t memneed; |
90 | 0 | size_t i; |
91 | |
92 | | /* The TRY_IN_ORDER list must not be empty. */ |
93 | 0 | if (try_in_order[0] == NULL) |
94 | 0 | { |
95 | 0 | errno = EINVAL; |
96 | 0 | return -1; |
97 | 0 | } |
98 | |
99 | | /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated |
100 | | with dynamic extent. */ |
101 | 0 | namelen = strlen (name) + 1; |
/* A single allocation holds everything: the struct, the copy of NAME
   (including its NUL), the terminating NULL slot of the pointer array, and,
   added by the loop below, one pointer slot plus one string copy per
   encoding. */
102 | 0 | memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *); |
103 | 0 | for (i = 0; try_in_order[i] != NULL; i++) |
104 | 0 | memneed += sizeof (char *) + strlen (try_in_order[i]) + 1; |
105 | 0 | listlen = i; |
106 |

107 | 0 | void *memory = malloc (memneed); |
108 | 0 | if (memory != NULL) |
109 | 0 | { |
/* Carve the block up in this order: the struct, then the pointer array of
   listlen + 1 slots, then the character data (NAME followed by the
   encoding names).  'memory' always points at the next unused byte. */
110 | 0 | struct autodetect_alias *new_alias = memory; |
111 | 0 | memory = new_alias + 1; |
112 |

113 | 0 | char const **new_try_in_order = memory; |
114 | 0 | memory = new_try_in_order + listlen + 1; |
115 |

116 | 0 | char *new_name = memcpy (memory, name, namelen); |
117 | 0 | memory = new_name + namelen; |
118 |

119 | 0 | for (i = 0; i < listlen; i++) |
120 | 0 | { |
121 | 0 | size_t len = strlen (try_in_order[i]) + 1; |
122 | 0 | char *copy = memcpy (memory, try_in_order[i], len); |
123 | 0 | new_try_in_order[i] = copy; |
124 | 0 | memory = copy + len; |
125 | 0 | } |
/* Here i == listlen: terminate the copied pointer array. */
126 | 0 | new_try_in_order[i] = NULL; |
127 | |
128 | | /* Now insert the new alias. */ |
129 | 0 | new_alias->name = new_name; |
130 | 0 | new_alias->encodings_to_try = new_try_in_order; |
131 | 0 | new_alias->next = NULL; |
132 | | /* FIXME: Not multithread-safe. */ |
/* Link the node in through the tail slot, then make the new node's 'next'
   field the tail slot for the following registration. */
133 | 0 | *autodetect_list_end = new_alias; |
134 | 0 | autodetect_list_end = &new_alias->next; |
135 | 0 | return 0; |
136 | 0 | } |
137 | 0 | else |
138 | 0 | { |
139 | 0 | errno = ENOMEM; |
140 | 0 | return -1; |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | | /* Like mem_iconveha, except no handling of transliteration.
   The conversion is first attempted with mem_iconveh and the arguments as
   given.  If that succeeds, or fails with an errno other than EINVAL, its
   result is returned unchanged.
   On EINVAL, FROM_CODESET is looked up in the autodetection alias list with
   an exact (case-sensitive) strcmp.  For the first matching alias, its
   encodings are tried in order through recursive calls, so a list entry may
   itself be an alias name:
     - if HANDLER is not iconveh_error, a first pass uses iconveh_error;
     - then a pass uses the caller's HANDLER.
   The first attempt that does not fail with errno == EILSEQ determines the
   return value.  If every attempt fails with EILSEQ, -1 is returned with
   errno as left by the last attempt.  If no alias matches, -1 is returned
   with errno = EINVAL.
   The do/while loops read *encodings before testing it for NULL, i.e. they
   rely on every alias having at least one encoding; the predefined lists
   are non-empty and uniconv_register_autodetect rejects empty lists. */ |
145 | | static int |
146 | | mem_iconveha_notranslit (const char *src, size_t srclen, |
147 | | const char *from_codeset, const char *to_codeset, |
148 | | enum iconv_ilseq_handler handler, |
149 | | size_t *offsets, |
150 | | char **resultp, size_t *lengthp) |
151 | 0 | { |
152 | 0 | int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler, |
153 | 0 | offsets, resultp, lengthp); |
154 | 0 | if (retval >= 0 || errno != EINVAL) |
155 | 0 | return retval; |
156 | 0 | else |
157 | 0 | { |
158 | 0 | struct autodetect_alias *alias; |
159 | |
160 | | /* Unsupported from_codeset or to_codeset. Check whether the caller |
161 | | requested autodetection. */ |
162 | 0 | for (alias = autodetect_list; alias != NULL; alias = alias->next) |
163 | 0 | if (strcmp (from_codeset, alias->name) == 0) |
164 | 0 | { |
165 | 0 | const char * const *encodings; |
166 |

/* When the caller already asked for iconveh_error, the strict pass would
   be identical to the pass below, so it is skipped. */
167 | 0 | if (handler != iconveh_error) |
168 | 0 | { |
169 | | /* First try all encodings without any forgiving. */ |
170 | 0 | encodings = alias->encodings_to_try; |
171 | 0 | do |
172 | 0 | { |
173 | 0 | retval = mem_iconveha_notranslit (src, srclen, |
174 | 0 | *encodings, to_codeset, |
175 | 0 | iconveh_error, offsets, |
176 | 0 | resultp, lengthp); |
/* Only an EILSEQ failure moves on to the next candidate; success and
   every other failure are returned to the caller immediately. */
177 | 0 | if (!(retval < 0 && errno == EILSEQ)) |
178 | 0 | return retval; |
179 | 0 | encodings++; |
180 | 0 | } |
181 | 0 | while (*encodings != NULL); |
182 | 0 | } |
183 | |
/* Pass with the caller's own handler, restarting from the first
   candidate. */
184 | 0 | encodings = alias->encodings_to_try; |
185 | 0 | do |
186 | 0 | { |
187 | 0 | retval = mem_iconveha_notranslit (src, srclen, |
188 | 0 | *encodings, to_codeset, |
189 | 0 | handler, offsets, |
190 | 0 | resultp, lengthp); |
191 | 0 | if (!(retval < 0 && errno == EILSEQ)) |
192 | 0 | return retval; |
193 | 0 | encodings++; |
194 | 0 | } |
195 | 0 | while (*encodings != NULL); |
196 | |
197 | | /* Return the last call's result. */ |
198 | 0 | return -1; |
199 | 0 | } |
200 | |
201 | | /* It wasn't an autodetection name. */ |
202 | 0 | errno = EINVAL; |
203 | 0 | return -1; |
204 | 0 | } |
205 | 0 | } |
206 | | |
/* Converts the SRCLEN bytes at SRC from FROM_CODESET to TO_CODESET, where
   FROM_CODESET may also be an autodetection alias name.
   When SRCLEN is 0, stores 0 in *LENGTHP and returns 0 without calling any
   converter and without writing *RESULTP.
   Otherwise the work is delegated to mem_iconveha_notranslit, whose return
   value is passed through.  When TRANSLITERATE is true and the preprocessor
   condition below holds, the target passed on is TO_CODESET with
   "//TRANSLIT" appended, built in a temporary malloca buffer; if that
   allocation fails, -1 is returned with errno = ENOMEM.  When the
   preprocessor condition does not hold, TRANSLITERATE is ignored.
   HANDLER, OFFSETS, RESULTP and LENGTHP are forwarded unchanged. */
207 | | int |
208 | | mem_iconveha (const char *src, size_t srclen, |
209 | | const char *from_codeset, const char *to_codeset, |
210 | | bool transliterate, |
211 | | enum iconv_ilseq_handler handler, |
212 | | size_t *offsets, |
213 | | char **resultp, size_t *lengthp) |
214 | 0 | { |
215 | 0 | if (srclen == 0) |
216 | 0 | { |
217 | | /* Nothing to convert. */ |
218 | 0 | *lengthp = 0; |
219 | 0 | return 0; |
220 | 0 | } |
221 | |
222 | | /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS |
223 | | iconv, we want to use transliteration. */ |
224 | 0 | #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ |
225 | 0 | && !defined __UCLIBC__) \ |
226 | 0 | || _LIBICONV_VERSION >= 0x0105 \ |
227 | 0 | || defined ICONV_SET_TRANSLITERATE |
228 | 0 | if (transliterate) |
229 | 0 | { |
230 | 0 | int retval; |
231 | 0 | size_t len = strlen (to_codeset); |
/* 10 is the length of "//TRANSLIT"; the extra 1 is for the terminating
   NUL, which the second memcpy below copies along with the suffix. */
232 | 0 | char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); |
233 | 0 | if (to_codeset_suffixed == NULL) |
234 | 0 | { |
235 | 0 | errno = ENOMEM; |
236 | 0 | return -1; |
237 | 0 | } |
238 | 0 | memcpy (to_codeset_suffixed, to_codeset, len); |
239 | 0 | memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); |
240 |

241 | 0 | retval = mem_iconveha_notranslit (src, srclen, |
242 | 0 | from_codeset, to_codeset_suffixed, |
243 | 0 | handler, offsets, resultp, lengthp); |
244 |

245 | 0 | freea (to_codeset_suffixed); |
246 |

247 | 0 | return retval; |
248 | 0 | } |
249 | 0 | else |
250 | 0 | #endif |
251 | 0 | return mem_iconveha_notranslit (src, srclen, |
252 | 0 | from_codeset, to_codeset, |
253 | 0 | handler, offsets, resultp, lengthp); |
254 | 0 | } |
255 | | |
256 | | /* Like str_iconveha, except no handling of transliteration.
   The conversion is first attempted with str_iconveh and the arguments as
   given.  If that returns non-NULL, or fails with an errno other than
   EINVAL, its result is returned unchanged.
   On EINVAL, FROM_CODESET is looked up in the autodetection alias list with
   an exact (case-sensitive) strcmp.  For the first matching alias, its
   encodings are tried in order through recursive calls, so a list entry may
   itself be an alias name:
     - if HANDLER is not iconveh_error, a first pass uses iconveh_error;
     - then a pass uses the caller's HANDLER.
   The first attempt that does not fail with errno == EILSEQ determines the
   return value.  If every attempt fails with EILSEQ, NULL is returned with
   errno as left by the last attempt.  If no alias matches, NULL is returned
   with errno = EINVAL.
   The do/while loops read *encodings before testing it for NULL, i.e. they
   rely on every alias having at least one encoding; the predefined lists
   are non-empty and uniconv_register_autodetect rejects empty lists. */ |
257 | | static char * |
258 | | str_iconveha_notranslit (const char *src, |
259 | | const char *from_codeset, const char *to_codeset, |
260 | | enum iconv_ilseq_handler handler) |
261 | 0 | { |
262 | 0 | char *result = str_iconveh (src, from_codeset, to_codeset, handler); |
263 |

264 | 0 | if (result != NULL || errno != EINVAL) |
265 | 0 | return result; |
266 | 0 | else |
267 | 0 | { |
268 | 0 | struct autodetect_alias *alias; |
269 | |
270 | | /* Unsupported from_codeset or to_codeset. Check whether the caller |
271 | | requested autodetection. */ |
272 | 0 | for (alias = autodetect_list; alias != NULL; alias = alias->next) |
273 | 0 | if (strcmp (from_codeset, alias->name) == 0) |
274 | 0 | { |
275 | 0 | const char * const *encodings; |
276 |

/* When the caller already asked for iconveh_error, the strict pass would
   be identical to the pass below, so it is skipped. */
277 | 0 | if (handler != iconveh_error) |
278 | 0 | { |
279 | | /* First try all encodings without any forgiving. */ |
280 | 0 | encodings = alias->encodings_to_try; |
281 | 0 | do |
282 | 0 | { |
283 | 0 | result = str_iconveha_notranslit (src, |
284 | 0 | *encodings, to_codeset, |
285 | 0 | iconveh_error); |
/* Only an EILSEQ failure moves on to the next candidate; success and
   every other failure are returned to the caller immediately. */
286 | 0 | if (!(result == NULL && errno == EILSEQ)) |
287 | 0 | return result; |
288 | 0 | encodings++; |
289 | 0 | } |
290 | 0 | while (*encodings != NULL); |
291 | 0 | } |
292 | |
/* Pass with the caller's own handler, restarting from the first
   candidate. */
293 | 0 | encodings = alias->encodings_to_try; |
294 | 0 | do |
295 | 0 | { |
296 | 0 | result = str_iconveha_notranslit (src, |
297 | 0 | *encodings, to_codeset, |
298 | 0 | handler); |
299 | 0 | if (!(result == NULL && errno == EILSEQ)) |
300 | 0 | return result; |
301 | 0 | encodings++; |
302 | 0 | } |
303 | 0 | while (*encodings != NULL); |
304 | |
305 | | /* Return the last call's result. */ |
306 | 0 | return NULL; |
307 | 0 | } |
308 | |
309 | | /* It wasn't an autodetection name. */ |
310 | 0 | errno = EINVAL; |
311 | 0 | return NULL; |
312 | 0 | } |
313 | 0 | } |
314 | | |
/* Converts the NUL-terminated string SRC from FROM_CODESET to TO_CODESET,
   where FROM_CODESET may also be an autodetection alias name.
   When SRC is the empty string, or FROM_CODESET and TO_CODESET compare equal
   under c_strcasecmp, the result is simply strdup (SRC) and no conversion
   call is made; NULL with errno = ENOMEM is returned if strdup fails.
   Otherwise the work is delegated to str_iconveha_notranslit, whose return
   value is passed through.  When TRANSLITERATE is true and the preprocessor
   condition below holds, the target passed on is TO_CODESET with
   "//TRANSLIT" appended, built in a temporary malloca buffer; if that
   allocation fails, NULL is returned with errno = ENOMEM.  When the
   preprocessor condition does not hold, TRANSLITERATE is ignored.
   NOTE(review): on the conversion path the result presumably is a freshly
   allocated string owned by the caller, as on the strdup path -- confirm
   against str_iconveh. */
315 | | char * |
316 | | str_iconveha (const char *src, |
317 | | const char *from_codeset, const char *to_codeset, |
318 | | bool transliterate, |
319 | | enum iconv_ilseq_handler handler) |
320 | 0 | { |
321 | 0 | if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0) |
322 | 0 | { |
323 | 0 | char *result = strdup (src); |
324 |

325 | 0 | if (result == NULL) |
326 | 0 | errno = ENOMEM; |
327 | 0 | return result; |
328 | 0 | } |
329 | |
330 | | /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS |
331 | | iconv, we want to use transliteration. */ |
332 | 0 | #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ |
333 | 0 | && !defined __UCLIBC__) \ |
334 | 0 | || _LIBICONV_VERSION >= 0x0105 \ |
335 | 0 | || defined ICONV_SET_TRANSLITERATE |
336 | 0 | if (transliterate) |
337 | 0 | { |
338 | 0 | char *result; |
339 | 0 | size_t len = strlen (to_codeset); |
/* 10 is the length of "//TRANSLIT"; the extra 1 is for the terminating
   NUL, which the second memcpy below copies along with the suffix. */
340 | 0 | char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); |
341 | 0 | if (to_codeset_suffixed == NULL) |
342 | 0 | { |
343 | 0 | errno = ENOMEM; |
344 | 0 | return NULL; |
345 | 0 | } |
346 | 0 | memcpy (to_codeset_suffixed, to_codeset, len); |
347 | 0 | memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); |
348 |

349 | 0 | result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed, |
350 | 0 | handler); |
351 |

352 | 0 | freea (to_codeset_suffixed); |
353 |

354 | 0 | return result; |
355 | 0 | } |
356 | 0 | else |
357 | 0 | #endif |
358 | 0 | return str_iconveha_notranslit (src, from_codeset, to_codeset, handler); |
359 | 0 | } |
359 | 0 | } |