/src/neomutt/mutt/charset.c
Line | Count | Source (jump to first uncovered line) |
1 | | /** |
2 | | * @file |
3 | | * Conversion between different character encodings |
4 | | * |
5 | | * @authors |
6 | | * Copyright (C) 2017 Tobias Angele <toogley@mailbox.org> |
7 | | * Copyright (C) 2017-2023 Richard Russon <rich@flatcap.org> |
8 | | * Copyright (C) 2018-2023 Pietro Cerutti <gahr@gahr.ch> |
9 | | * Copyright (C) 2023 Steinar H Gunderson <steinar+neomutt@gunderson.no> |
10 | | * |
11 | | * @copyright |
12 | | * This program is free software: you can redistribute it and/or modify it under |
13 | | * the terms of the GNU General Public License as published by the Free Software |
14 | | * Foundation, either version 2 of the License, or (at your option) any later |
15 | | * version. |
16 | | * |
17 | | * This program is distributed in the hope that it will be useful, but WITHOUT |
18 | | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
19 | | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
20 | | * details. |
21 | | * |
22 | | * You should have received a copy of the GNU General Public License along with |
23 | | * this program. If not, see <http://www.gnu.org/licenses/>. |
24 | | */ |
25 | | |
26 | | /** |
27 | | * @page mutt_charset Conversion between different character encodings |
28 | | * |
29 | | * Conversion between different character encodings |
30 | | */ |
31 | | |
32 | | #include "config.h" |
33 | | #include <errno.h> |
34 | | #include <iconv.h> |
35 | | #include <langinfo.h> |
36 | | #include <limits.h> |
37 | | #include <stdbool.h> |
38 | | #include <stdio.h> |
39 | | #include <string.h> |
40 | | #include "charset.h" |
41 | | #include "buffer.h" |
42 | | #include "list.h" |
43 | | #include "logging2.h" |
44 | | #include "memory.h" |
45 | | #include "pool.h" |
46 | | #include "queue.h" |
47 | | #include "regex3.h" |
48 | | #include "slist.h" |
49 | | #include "string2.h" |
50 | | #ifdef ENABLE_NLS |
51 | | #include <libintl.h> |
52 | | #endif |
53 | | |
54 | | #ifndef EILSEQ |
55 | | #define EILSEQ EINVAL |
56 | | #endif |
57 | | |
/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Initialised to '?'.
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Initialised to false.
 */
bool CharsetIsUtf8 = false;
67 | | |
/**
 * struct Lookup - Regex to String lookup table
 *
 * This is used by 'charset-hook' and 'iconv-hook'.
 * Each entry pairs a regular expression with the charset name that should be
 * used in place of any name the expression matches.
 */
struct Lookup
{
  enum LookupType type;        ///< Lookup type
  struct Regex regex;          ///< Regular expression (embedded in the Lookup, not a pointer)
  char *replacement;           ///< Alternative charset to use
  TAILQ_ENTRY(Lookup) entries; ///< Linked list
};
TAILQ_HEAD(LookupList, Lookup);

/// Lookup table of preferred character set names
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
84 | | |
/**
 * struct IconvCacheEntry - Cached iconv conversion descriptor
 *
 * The two names are the canonicalised charset names used as the cache key.
 */
struct IconvCacheEntry
{
  char *fromcode1; ///< Source character set
  char *tocode1;   ///< Destination character set
  iconv_t cd;      ///< iconv conversion descriptor
};

/// Max size of the iconv cache
#define ICONV_CACHE_SIZE 16
/// Cache of iconv conversion descriptors
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE];
/// Number of iconv descriptors in the cache
static int IconvCacheUsed = 0;
101 | | |
/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Alias (alternative spelling) of a character set name
  const char *pref; ///< Preferred MIME name for that character set
};

/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * The following list has been created manually from the data under:
 * http://www.isi.edu/in-notes/iana/assignments/character-sets
 * Last update: 2000-09-07
 *
 * The table is terminated by an entry whose key is NULL.
 *
 * @note It includes only the subset of character sets for which a preferred
 * MIME name is given.
 */
static const struct MimeNames PreferredMimeNames[] = {
  // clang-format off
  { "ansi_x3.4-1968",        "us-ascii"      },
  { "iso-ir-6",              "us-ascii"      },
  { "iso_646.irv:1991",      "us-ascii"      },
  { "ascii",                 "us-ascii"      },
  { "iso646-us",             "us-ascii"      },
  { "us",                    "us-ascii"      },
  { "ibm367",                "us-ascii"      },
  { "cp367",                 "us-ascii"      },
  { "csASCII",               "us-ascii"      },

  { "csISO2022KR",           "iso-2022-kr"   },
  { "csEUCKR",               "euc-kr"        },
  { "csISO2022JP",           "iso-2022-jp"   },
  { "csISO2022JP2",          "iso-2022-jp-2" },

  { "ISO_8859-1:1987",       "iso-8859-1"    },
  { "iso-ir-100",            "iso-8859-1"    },
  { "iso_8859-1",            "iso-8859-1"    },
  { "latin1",                "iso-8859-1"    },
  { "l1",                    "iso-8859-1"    },
  { "IBM819",                "iso-8859-1"    },
  { "CP819",                 "iso-8859-1"    },
  { "csISOLatin1",           "iso-8859-1"    },

  { "ISO_8859-2:1987",       "iso-8859-2"    },
  { "iso-ir-101",            "iso-8859-2"    },
  { "iso_8859-2",            "iso-8859-2"    },
  { "latin2",                "iso-8859-2"    },
  { "l2",                    "iso-8859-2"    },
  { "csISOLatin2",           "iso-8859-2"    },

  { "ISO_8859-3:1988",       "iso-8859-3"    },
  { "iso-ir-109",            "iso-8859-3"    },
  { "ISO_8859-3",            "iso-8859-3"    },
  { "latin3",                "iso-8859-3"    },
  { "l3",                    "iso-8859-3"    },
  { "csISOLatin3",           "iso-8859-3"    },

  { "ISO_8859-4:1988",       "iso-8859-4"    },
  { "iso-ir-110",            "iso-8859-4"    },
  { "ISO_8859-4",            "iso-8859-4"    },
  { "latin4",                "iso-8859-4"    },
  { "l4",                    "iso-8859-4"    },
  { "csISOLatin4",           "iso-8859-4"    },

  { "ISO_8859-6:1987",       "iso-8859-6"    },
  { "iso-ir-127",            "iso-8859-6"    },
  { "iso_8859-6",            "iso-8859-6"    },
  { "ECMA-114",              "iso-8859-6"    },
  { "ASMO-708",              "iso-8859-6"    },
  { "arabic",                "iso-8859-6"    },
  { "csISOLatinArabic",      "iso-8859-6"    },

  { "ISO_8859-7:1987",       "iso-8859-7"    },
  { "iso-ir-126",            "iso-8859-7"    },
  { "ISO_8859-7",            "iso-8859-7"    },
  { "ELOT_928",              "iso-8859-7"    },
  { "ECMA-118",              "iso-8859-7"    },
  { "greek",                 "iso-8859-7"    },
  { "greek8",                "iso-8859-7"    },
  { "csISOLatinGreek",       "iso-8859-7"    },

  { "ISO_8859-8:1988",       "iso-8859-8"    },
  { "iso-ir-138",            "iso-8859-8"    },
  { "ISO_8859-8",            "iso-8859-8"    },
  { "hebrew",                "iso-8859-8"    },
  { "csISOLatinHebrew",      "iso-8859-8"    },

  { "ISO_8859-5:1988",       "iso-8859-5"    },
  { "iso-ir-144",            "iso-8859-5"    },
  { "ISO_8859-5",            "iso-8859-5"    },
  { "cyrillic",              "iso-8859-5"    },
  { "csISOLatinCyrillic",    "iso-8859-5"    },

  { "ISO_8859-9:1989",       "iso-8859-9"    },
  { "iso-ir-148",            "iso-8859-9"    },
  { "ISO_8859-9",            "iso-8859-9"    },
  { "latin5",                "iso-8859-9"    },  /* this is not a bug */
  { "l5",                    "iso-8859-9"    },
  { "csISOLatin5",           "iso-8859-9"    },

  { "ISO_8859-10:1992",      "iso-8859-10"   },
  { "iso-ir-157",            "iso-8859-10"   },
  { "latin6",                "iso-8859-10"   },  /* this is not a bug */
  { "l6",                    "iso-8859-10"   },
  { "csISOLatin6",           "iso-8859-10"   },

  { "csKOI8r",               "koi8-r"        },

  { "MS_Kanji",              "Shift_JIS"     },  /* Note the underscore! */
  { "csShiftJis",            "Shift_JIS"     },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
                             "euc-jp"        },
  { "csEUCPkdFmtJapanese",   "euc-jp"        },

  { "csGB2312",              "gb2312"        },
  { "csbig5",                "big5"          },

  /* End of official brain damage.
   * What follows has been taken from glibc's localedata files. */

  { "iso_8859-13",           "iso-8859-13"   },
  { "iso-ir-179",            "iso-8859-13"   },
  { "latin7",                "iso-8859-13"   },  /* this is not a bug */
  { "l7",                    "iso-8859-13"   },

  { "iso_8859-14",           "iso-8859-14"   },
  { "latin8",                "iso-8859-14"   },  /* this is not a bug */
  { "l8",                    "iso-8859-14"   },

  { "iso_8859-15",           "iso-8859-15"   },
  { "latin9",                "iso-8859-15"   },  /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0",                "iso-8859-15"   },  /* this is not a bug */

  { "iso_8859-16",           "iso-8859-16"   },
  { "latin10",               "iso-8859-16"   },  /* this is not a bug */

  { "646",                   "us-ascii"      },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */

  { "eucJP",                 "euc-jp"        },
  { "PCK",                   "Shift_JIS"     },
  { "ko_KR-euc",             "euc-kr"        },
  { "zh_TW-big5",            "big5"          },

  /* seems to be common on some systems */

  { "sjis",                  "Shift_JIS"     },
  { "euc-jp-ms",             "eucJP-ms"      },

  /* If you happen to encounter system-specific brain-damage with respect to
   * character set naming, please add it above this comment, and submit a patch
   * to <neomutt-devel@neomutt.org> */

  { NULL, NULL },
  // clang-format on
};
264 | | |
265 | | /** |
266 | | * lookup_new - Create a new Lookup |
267 | | * @retval ptr New Lookup |
268 | | */ |
269 | | static struct Lookup *lookup_new(void) |
270 | 0 | { |
271 | 0 | return MUTT_MEM_CALLOC(1, struct Lookup); |
272 | 0 | } |
273 | | |
274 | | /** |
275 | | * lookup_free - Free a Lookup |
276 | | * @param ptr Lookup to free |
277 | | */ |
278 | | static void lookup_free(struct Lookup **ptr) |
279 | 0 | { |
280 | 0 | if (!ptr || !*ptr) |
281 | 0 | return; |
282 | | |
283 | 0 | struct Lookup *l = *ptr; |
284 | 0 | FREE(&l->replacement); |
285 | 0 | FREE(&l->regex.pattern); |
286 | 0 | if (l->regex.regex) |
287 | 0 | regfree(l->regex.regex); |
288 | 0 | FREE(&l->regex.regex); |
289 | 0 | FREE(&l->regex); |
290 | |
|
291 | 0 | FREE(ptr); |
292 | 0 | } |
293 | | |
294 | | /** |
295 | | * lookup_charset - Look for a preferred character set name |
296 | | * @param type Type, e.g. #MUTT_LOOKUP_CHARSET |
297 | | * @param cs Character set |
298 | | * @retval ptr Charset string |
299 | | * |
300 | | * If the character set matches one of the regexes, |
301 | | * then return the replacement name. |
302 | | */ |
303 | | static const char *lookup_charset(enum LookupType type, const char *cs) |
304 | 0 | { |
305 | 0 | if (!cs) |
306 | 0 | return NULL; |
307 | | |
308 | 0 | struct Lookup *l = NULL; |
309 | |
|
310 | 0 | TAILQ_FOREACH(l, &Lookups, entries) |
311 | 0 | { |
312 | 0 | if (l->type != type) |
313 | 0 | continue; |
314 | 0 | if (mutt_regex_match(&l->regex, cs)) |
315 | 0 | return l->replacement; |
316 | 0 | } |
317 | 0 | return NULL; |
318 | 0 | } |
319 | | |
320 | | /** |
321 | | * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets |
322 | | * @param[in] assumed_charset From $assumed_charset |
323 | | * @param[in] charset From $charset |
324 | | * @param[in,out] ps String to be converted |
325 | | * @retval 0 Success |
326 | | * @retval -1 Error |
327 | | * |
328 | | * Work through `$assumed_charset` looking for a character set conversion that |
329 | | * works. Failing that, try mutt_ch_get_default_charset(). |
330 | | */ |
331 | | int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset, |
332 | | const char *charset, char **ps) |
333 | 0 | { |
334 | 0 | if (!ps) |
335 | 0 | return -1; |
336 | | |
337 | 0 | char *u = *ps; |
338 | 0 | const size_t ulen = mutt_str_len(u); |
339 | 0 | if (ulen == 0) |
340 | 0 | return 0; |
341 | | |
342 | 0 | const struct ListNode *np = NULL; |
343 | 0 | STAILQ_FOREACH(np, &assumed_charset->head, entries) |
344 | 0 | { |
345 | 0 | char const *c = np->data; |
346 | 0 | size_t n = mutt_str_len(c); |
347 | 0 | char *fromcode = MUTT_MEM_MALLOC(n + 1, char); |
348 | 0 | mutt_str_copy(fromcode, c, n + 1); |
349 | 0 | char *s = mutt_strn_dup(u, ulen); |
350 | 0 | int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS); |
351 | 0 | FREE(&fromcode); |
352 | 0 | if (m == 0) |
353 | 0 | { |
354 | 0 | FREE(ps); |
355 | 0 | *ps = s; |
356 | 0 | return 0; |
357 | 0 | } |
358 | 0 | FREE(&s); |
359 | 0 | } |
360 | 0 | mutt_ch_convert_string(ps, mutt_ch_get_default_charset(assumed_charset), |
361 | 0 | charset, MUTT_ICONV_HOOK_FROM); |
362 | 0 | return -1; |
363 | 0 | } |
364 | | |
365 | | /** |
366 | | * mutt_ch_canonical_charset - Canonicalise the charset of a string |
367 | | * @param buf Buffer for canonical character set name |
368 | | * @param buflen Length of buffer |
369 | | * @param name Name to be canonicalised |
370 | | * |
371 | | * This first ties off any charset extension such as "//TRANSLIT", |
372 | | * canonicalizes the charset and re-adds the extension |
373 | | */ |
374 | | void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name) |
375 | 0 | { |
376 | 0 | if (!buf || !name) |
377 | 0 | return; |
378 | | |
379 | 0 | char in[1024] = { 0 }; |
380 | 0 | char scratch[1024 + 10] = { 0 }; |
381 | 0 | struct Buffer *canon = buf_pool_get(); |
382 | |
|
383 | 0 | mutt_str_copy(in, name, sizeof(in)); |
384 | 0 | char *ext = strchr(in, '/'); |
385 | 0 | if (ext) |
386 | 0 | *ext++ = '\0'; |
387 | |
|
388 | 0 | if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8")) |
389 | 0 | { |
390 | 0 | buf_strcpy(canon, "utf-8"); |
391 | 0 | goto out; |
392 | 0 | } |
393 | | |
394 | | /* catch some common iso-8859-something misspellings */ |
395 | 0 | size_t plen; |
396 | 0 | if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-')) |
397 | 0 | snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen); |
398 | 0 | else if ((plen = mutt_istr_startswith(in, "8859-"))) |
399 | 0 | snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen); |
400 | 0 | else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-')) |
401 | 0 | snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen); |
402 | 0 | else if ((plen = mutt_istr_startswith(in, "iso8859-"))) |
403 | 0 | snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen); |
404 | 0 | else |
405 | 0 | mutt_str_copy(scratch, in, sizeof(scratch)); |
406 | |
|
407 | 0 | for (size_t i = 0; PreferredMimeNames[i].key; i++) |
408 | 0 | { |
409 | 0 | if (mutt_istr_equal(scratch, PreferredMimeNames[i].key)) |
410 | 0 | { |
411 | 0 | buf_strcpy(canon, PreferredMimeNames[i].pref); |
412 | 0 | goto out; |
413 | 0 | } |
414 | 0 | } |
415 | | |
416 | 0 | buf_strcpy(canon, scratch); |
417 | 0 | buf_lower(canon); // for cosmetics' sake |
418 | |
|
419 | 0 | out: |
420 | 0 | if (ext && (*ext != '\0')) |
421 | 0 | { |
422 | 0 | buf_addch(canon, '/'); |
423 | 0 | buf_addstr(canon, ext); |
424 | 0 | } |
425 | |
|
426 | 0 | mutt_str_copy(buf, buf_string(canon), buflen); |
427 | 0 | buf_pool_release(&canon); |
428 | 0 | } |
429 | | |
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set
 * @param cs2 Second character set
 * @retval true  Names are equivalent
 * @retval false Names differ, or either name is NULL
 *
 * `cs1` is canonicalised with mutt_ch_canonical_charset(), which leaves any
 * extension intact.  `cs2` is expected to originate from neomutt code, not
 * user input (i.e. it does _not_ have an extension), so it is used as-is.
 * The names are considered equivalent if the shorter one is a
 * case-insensitive prefix of the longer one.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  int len1 = mutt_str_len(canon);
  int len2 = mutt_str_len(cs2);

  /* Compare only as many characters as the shorter name has */
  const char *longer = cs2;
  const char *shorter = canon;
  int n = len1;
  if (len1 > len2)
  {
    longer = canon;
    shorter = cs2;
    n = len2;
  }

  return mutt_istrn_equal(longer, shorter, n);
}
457 | | |
458 | | /** |
459 | | * mutt_ch_get_default_charset - Get the default character set |
460 | | * @param assumed_charset From $assumed_charset |
461 | | * @retval ptr Name of the default character set |
462 | | * |
463 | | * @warning This returns a pointer to a static buffer. Do not free it. |
464 | | */ |
465 | | const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset) |
466 | 843 | { |
467 | 843 | static char fcharset[128]; |
468 | 843 | const char *c = NULL; |
469 | | |
470 | 843 | if (assumed_charset && (assumed_charset->count > 0)) |
471 | 0 | c = STAILQ_FIRST(&assumed_charset->head)->data; |
472 | 843 | else |
473 | 843 | c = "us-ascii"; |
474 | | |
475 | 843 | mutt_str_copy(fcharset, c, sizeof(fcharset)); |
476 | 843 | return fcharset; |
477 | 843 | } |
478 | | |
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * Get the canonical character set used by the user's locale, as reported by
 * nl_langinfo(CODESET).  If that yields an empty name, "iso-8859-1" is used.
 * The caller must free the returned string.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char canon[1024] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), nl_langinfo(CODESET));

  const char *result = (canon[0] != '\0') ? canon : "iso-8859-1";
  return mutt_str_dup(result);
}
497 | | |
498 | | /** |
499 | | * mutt_ch_lookup_add - Add a new character set lookup |
500 | | * @param type Type of character set, e.g. #MUTT_LOOKUP_CHARSET |
501 | | * @param pat Pattern to match |
502 | | * @param replace Replacement string |
503 | | * @param err Buffer for error message |
504 | | * @retval true Lookup added to list |
505 | | * @retval false Regex string was invalid |
506 | | * |
507 | | * Add a regex for a character set and a replacement name. |
508 | | */ |
509 | | bool mutt_ch_lookup_add(enum LookupType type, const char *pat, |
510 | | const char *replace, struct Buffer *err) |
511 | 0 | { |
512 | 0 | if (!pat || !replace) |
513 | 0 | return false; |
514 | | |
515 | 0 | regex_t *rx = MUTT_MEM_CALLOC(1, regex_t); |
516 | 0 | int rc = REG_COMP(rx, pat, REG_ICASE); |
517 | 0 | if (rc != 0) |
518 | 0 | { |
519 | 0 | regerror(rc, rx, err->data, err->dsize); |
520 | 0 | FREE(&rx); |
521 | 0 | return false; |
522 | 0 | } |
523 | | |
524 | 0 | struct Lookup *l = lookup_new(); |
525 | 0 | l->type = type; |
526 | 0 | l->replacement = mutt_str_dup(replace); |
527 | 0 | l->regex.pattern = mutt_str_dup(pat); |
528 | 0 | l->regex.regex = rx; |
529 | 0 | l->regex.pat_not = false; |
530 | |
|
531 | 0 | TAILQ_INSERT_TAIL(&Lookups, l, entries); |
532 | |
|
533 | 0 | return true; |
534 | 0 | } |
535 | | |
536 | | /** |
537 | | * mutt_ch_lookup_remove - Remove all the character set lookups |
538 | | * |
539 | | * Empty the list of replacement character set names. |
540 | | */ |
541 | | void mutt_ch_lookup_remove(void) |
542 | 0 | { |
543 | 0 | struct Lookup *l = NULL; |
544 | 0 | struct Lookup *tmp = NULL; |
545 | |
|
546 | 0 | TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp) |
547 | 0 | { |
548 | 0 | TAILQ_REMOVE(&Lookups, l, entries); |
549 | 0 | lookup_free(&l); |
550 | 0 | } |
551 | 0 | } |
552 | | |
553 | | /** |
554 | | * mutt_ch_charset_lookup - Look for a replacement character set |
555 | | * @param chs Character set to lookup |
556 | | * @retval ptr Replacement character set (if a 'charset-hook' matches) |
557 | | * @retval NULL No matching hook |
558 | | * |
559 | | * Look through all the 'charset-hook's. |
560 | | * If one matches return the replacement character set. |
561 | | */ |
562 | | const char *mutt_ch_charset_lookup(const char *chs) |
563 | 0 | { |
564 | 0 | return lookup_charset(MUTT_LOOKUP_CHARSET, chs); |
565 | 0 | } |
566 | | |
567 | | /** |
568 | | * mutt_ch_iconv_open - Set up iconv for conversions |
569 | | * @param tocode Current character set |
570 | | * @param fromcode Target character set |
571 | | * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM |
572 | | * @retval ptr iconv handle for the conversion |
573 | | * |
574 | | * Like iconv_open, but canonicalises the charsets, applies charset-hooks, |
575 | | * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips |
576 | | * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers |
577 | | * should use flags=0 when fromcode can safely be considered true, either some |
578 | | * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be |
579 | | * used only when fromcode is unsure, taken from a possibly wrong incoming MIME |
580 | | * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions |
581 | | * in some setups. |
582 | | * |
583 | | * Since calling iconv_open() repeatedly can be expensive, we keep a cache of |
584 | | * the most recently used iconv_t objects, kept in LRU order. This means that |
585 | | * you should not call iconv_close() on the object yourself. All remaining |
586 | | * objects in the cache will exit when main() calls mutt_ch_cache_cleanup(). |
587 | | * |
588 | | * @note By design charset-hooks should never be, and are never, applied |
589 | | * to tocode. |
590 | | * |
591 | | * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks, |
592 | | * not at all on iconv-hooks. |
593 | | */ |
594 | | iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags) |
595 | 0 | { |
596 | 0 | char tocode1[128] = { 0 }; |
597 | 0 | char fromcode1[128] = { 0 }; |
598 | 0 | const char *tocode2 = NULL, *fromcode2 = NULL; |
599 | 0 | const char *tmp = NULL; |
600 | | |
601 | | /* transform to MIME preferred charset names */ |
602 | 0 | mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode); |
603 | 0 | mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode); |
604 | | |
605 | | /* maybe apply charset-hooks and recanonicalise fromcode, |
606 | | * but only when caller asked us to sanitize a potentially wrong |
607 | | * charset name incoming from the wild exterior. */ |
608 | 0 | if (flags & MUTT_ICONV_HOOK_FROM) |
609 | 0 | { |
610 | 0 | tmp = mutt_ch_charset_lookup(fromcode1); |
611 | 0 | if (tmp) |
612 | 0 | mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp); |
613 | 0 | } |
614 | | |
615 | | /* check if we have this pair cached already */ |
616 | 0 | for (int i = 0; i < IconvCacheUsed; i++) |
617 | 0 | { |
618 | 0 | if (strcmp(tocode1, IconvCache[i].tocode1) == 0 && |
619 | 0 | strcmp(fromcode1, IconvCache[i].fromcode1) == 0) |
620 | 0 | { |
621 | 0 | iconv_t cd = IconvCache[i].cd; |
622 | | |
623 | | /* make room for this one at the top */ |
624 | 0 | struct IconvCacheEntry top = IconvCache[i]; |
625 | 0 | for (int j = i - 1; j >= 0; j--) |
626 | 0 | { |
627 | 0 | IconvCache[j + 1] = IconvCache[j]; |
628 | 0 | } |
629 | 0 | IconvCache[0] = top; |
630 | |
|
631 | 0 | if (iconv_t_valid(cd)) |
632 | 0 | { |
633 | | /* reset state */ |
634 | 0 | iconv(cd, NULL, NULL, NULL, NULL); |
635 | 0 | } |
636 | 0 | return cd; |
637 | 0 | } |
638 | 0 | } |
639 | | |
640 | | /* not found in cache */ |
641 | | /* always apply iconv-hooks to suit system's iconv tastes */ |
642 | 0 | tocode2 = mutt_ch_iconv_lookup(tocode1); |
643 | 0 | tocode2 = tocode2 ? tocode2 : tocode1; |
644 | 0 | fromcode2 = mutt_ch_iconv_lookup(fromcode1); |
645 | 0 | fromcode2 = fromcode2 ? fromcode2 : fromcode1; |
646 | | |
647 | | /* call system iconv with names it appreciates */ |
648 | 0 | iconv_t cd = iconv_open(tocode2, fromcode2); |
649 | |
|
650 | 0 | if (IconvCacheUsed == ICONV_CACHE_SIZE) |
651 | 0 | { |
652 | 0 | mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n", |
653 | 0 | IconvCache[IconvCacheUsed - 1].fromcode1, |
654 | 0 | IconvCache[IconvCacheUsed - 1].tocode1); |
655 | | /* get rid of the oldest entry */ |
656 | 0 | FREE(&IconvCache[IconvCacheUsed - 1].fromcode1); |
657 | 0 | FREE(&IconvCache[IconvCacheUsed - 1].tocode1); |
658 | 0 | if (iconv_t_valid(IconvCache[IconvCacheUsed - 1].cd)) |
659 | 0 | { |
660 | 0 | iconv_close(IconvCache[IconvCacheUsed - 1].cd); |
661 | 0 | } |
662 | 0 | IconvCacheUsed--; |
663 | 0 | } |
664 | | |
665 | | /* make room for this one at the top */ |
666 | 0 | for (int j = IconvCacheUsed - 1; j >= 0; j--) |
667 | 0 | { |
668 | 0 | IconvCache[j + 1] = IconvCache[j]; |
669 | 0 | } |
670 | |
|
671 | 0 | IconvCacheUsed++; |
672 | |
|
673 | 0 | mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1); |
674 | 0 | IconvCache[0].fromcode1 = strdup(fromcode1); |
675 | 0 | IconvCache[0].tocode1 = strdup(tocode1); |
676 | 0 | IconvCache[0].cd = cd; |
677 | |
|
678 | 0 | return cd; |
679 | 0 | } |
680 | | |
681 | | /** |
682 | | * mutt_ch_iconv - Change the encoding of a string |
683 | | * @param[in] cd Iconv conversion descriptor |
684 | | * @param[in,out] inbuf Buffer to convert |
685 | | * @param[in,out] inbytesleft Length of buffer to convert |
686 | | * @param[in,out] outbuf Buffer for the result |
687 | | * @param[in,out] outbytesleft Length of result buffer |
688 | | * @param[in] inrepls Input replacement characters |
689 | | * @param[in] outrepl Output replacement characters |
690 | | * @param[out] iconverrno Errno if iconv() fails, 0 if it succeeds |
691 | | * @retval num Characters converted |
692 | | * |
693 | | * Like iconv, but keeps going even when the input is invalid |
694 | | * If you're supplying inrepls, the source charset should be stateless; |
695 | | * if you're supplying an outrepl, the target charset should be. |
696 | | */ |
697 | | size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, |
698 | | char **outbuf, size_t *outbytesleft, const char **inrepls, |
699 | | const char *outrepl, int *iconverrno) |
700 | 0 | { |
701 | 0 | size_t rc = 0; |
702 | 0 | const char *ib = *inbuf; |
703 | 0 | size_t ibl = *inbytesleft; |
704 | 0 | char *ob = *outbuf; |
705 | 0 | size_t obl = *outbytesleft; |
706 | |
|
707 | 0 | while (true) |
708 | 0 | { |
709 | 0 | errno = 0; |
710 | 0 | const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl); |
711 | 0 | if (ret1 != ICONV_ILLEGAL_SEQ) |
712 | 0 | rc += ret1; |
713 | 0 | if (iconverrno) |
714 | 0 | *iconverrno = errno; |
715 | |
|
716 | 0 | if (ibl && obl && (errno == EILSEQ)) |
717 | 0 | { |
718 | 0 | if (inrepls) |
719 | 0 | { |
720 | | /* Try replacing the input */ |
721 | 0 | const char **t = NULL; |
722 | 0 | for (t = inrepls; *t; t++) |
723 | 0 | { |
724 | 0 | const char *ib1 = *t; |
725 | 0 | size_t ibl1 = strlen(*t); |
726 | 0 | char *ob1 = ob; |
727 | 0 | size_t obl1 = obl; |
728 | 0 | iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1); |
729 | 0 | if (ibl1 == 0) |
730 | 0 | { |
731 | 0 | ib++; |
732 | 0 | ibl--; |
733 | 0 | ob = ob1; |
734 | 0 | obl = obl1; |
735 | 0 | rc++; |
736 | 0 | break; |
737 | 0 | } |
738 | 0 | } |
739 | 0 | if (*t) |
740 | 0 | continue; |
741 | 0 | } |
742 | | /* Replace the output */ |
743 | 0 | if (!outrepl) |
744 | 0 | outrepl = "?"; |
745 | 0 | iconv(cd, NULL, NULL, &ob, &obl); |
746 | 0 | if (obl) |
747 | 0 | { |
748 | 0 | int n = strlen(outrepl); |
749 | 0 | if (n > obl) |
750 | 0 | { |
751 | 0 | outrepl = "?"; |
752 | 0 | n = 1; |
753 | 0 | } |
754 | 0 | memcpy(ob, outrepl, n); |
755 | 0 | ib++; |
756 | 0 | ibl--; |
757 | 0 | ob += n; |
758 | 0 | obl -= n; |
759 | 0 | rc++; |
760 | 0 | iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */ |
761 | 0 | continue; |
762 | 0 | } |
763 | 0 | } |
764 | 0 | *inbuf = ib; |
765 | 0 | *inbytesleft = ibl; |
766 | 0 | *outbuf = ob; |
767 | 0 | *outbytesleft = obl; |
768 | 0 | return rc; |
769 | 0 | } |
770 | 0 | } |
771 | | |
772 | | /** |
773 | | * mutt_ch_iconv_lookup - Look for a replacement character set |
774 | | * @param chs Character set to lookup |
775 | | * @retval ptr Replacement character set (if a 'iconv-hook' matches) |
776 | | * @retval NULL No matching hook |
777 | | * |
778 | | * Look through all the 'iconv-hook's. |
779 | | * If one matches return the replacement character set. |
780 | | */ |
781 | | const char *mutt_ch_iconv_lookup(const char *chs) |
782 | 0 | { |
783 | 0 | return lookup_charset(MUTT_LOOKUP_ICONV, chs); |
784 | 0 | } |
785 | | |
786 | | /** |
787 | | * mutt_ch_check - Check whether a string can be converted between encodings |
788 | | * @param[in] s String to check |
789 | | * @param[in] slen Length of the string to check |
790 | | * @param[in] from Current character set |
791 | | * @param[in] to Target character set |
792 | | * @retval 0 Success |
793 | | * @retval -1 Error in iconv_open() |
794 | | * @retval >0 Errno as set by iconv() |
795 | | */ |
796 | | int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to) |
797 | 0 | { |
798 | 0 | if (!s || !from || !to) |
799 | 0 | return -1; |
800 | | |
801 | 0 | int rc = 0; |
802 | 0 | iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS); |
803 | 0 | if (!iconv_t_valid(cd)) |
804 | 0 | return -1; |
805 | | |
806 | 0 | size_t outlen = MB_LEN_MAX * slen; |
807 | 0 | char *out = MUTT_MEM_MALLOC(outlen + 1, char); |
808 | 0 | char *saved_out = out; |
809 | |
|
810 | 0 | const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen); |
811 | 0 | if (convlen == ICONV_ILLEGAL_SEQ) |
812 | 0 | rc = errno; |
813 | |
|
814 | 0 | FREE(&saved_out); |
815 | 0 | return rc; |
816 | 0 | } |
817 | | |
818 | | /** |
819 | | * mutt_ch_convert_string - Convert a string between encodings |
820 | | * @param[in,out] ps String to convert |
821 | | * @param[in] from Current character set |
822 | | * @param[in] to Target character set |
823 | | * @param[in] flags Flags, e.g. #MUTT_ICONV_HOOK_FROM |
824 | | * @retval 0 Success |
825 | | * @retval -1 Invalid arguments or failure to open an iconv channel |
826 | | * @retval errno Failure in iconv conversion |
827 | | * |
828 | | * Parameter flags is given as-is to mutt_ch_iconv_open(). |
829 | | * See there for its meaning and usage policy. |
830 | | */ |
831 | | int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags) |
832 | 6.43k | { |
833 | 6.43k | if (!ps) |
834 | 0 | return -1; |
835 | | |
836 | 6.43k | char *s = *ps; |
837 | | |
838 | 6.43k | if (!s || (*s == '\0')) |
839 | 2.43k | return 0; |
840 | | |
841 | 4.00k | if (!to || !from) |
842 | 4.00k | return -1; |
843 | | |
844 | 0 | const char *repls[] = { "\357\277\275", "?", 0 }; |
845 | 0 | int rc = 0; |
846 | |
|
847 | 0 | iconv_t cd = mutt_ch_iconv_open(to, from, flags); |
848 | 0 | if (!iconv_t_valid(cd)) |
849 | 0 | return -1; |
850 | | |
851 | 0 | const char **inrepls = NULL; |
852 | 0 | const char *outrepl = NULL; |
853 | |
|
854 | 0 | if (mutt_ch_is_utf8(to)) |
855 | 0 | outrepl = "\357\277\275"; |
856 | 0 | else if (mutt_ch_is_utf8(from)) |
857 | 0 | inrepls = repls; |
858 | 0 | else |
859 | 0 | outrepl = "?"; |
860 | |
|
861 | 0 | const char *ib = s; |
862 | 0 | size_t ibl = strlen(s); |
863 | 0 | if (ibl >= (SIZE_MAX / MB_LEN_MAX)) |
864 | 0 | { |
865 | 0 | return -1; |
866 | 0 | } |
867 | 0 | size_t obl = MB_LEN_MAX * ibl; |
868 | 0 | char *buf = MUTT_MEM_MALLOC(obl + 1, char); |
869 | 0 | char *ob = buf; |
870 | |
|
871 | 0 | mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc); |
872 | 0 | iconv(cd, 0, 0, &ob, &obl); |
873 | |
|
874 | 0 | *ob = '\0'; |
875 | |
|
876 | 0 | FREE(ps); |
877 | 0 | *ps = buf; |
878 | |
|
879 | 0 | mutt_str_adjust(ps); |
880 | 0 | return rc; |
881 | 0 | } |
882 | | |
883 | | /** |
884 | | * mutt_ch_check_charset - Does iconv understand a character set? |
885 | | * @param cs Character set to check |
886 | | * @param strict Check strictly by using iconv |
887 | | * @retval true Character set is valid |
888 | | * |
889 | | * If `strict` is false, then finding a matching character set in |
890 | | * #PreferredMimeNames will be enough. |
891 | | * If `strict` is true, or the charset is not in #PreferredMimeNames, then |
892 | | * iconv() with be run. |
893 | | */ |
894 | | bool mutt_ch_check_charset(const char *cs, bool strict) |
895 | 0 | { |
896 | 0 | if (!cs) |
897 | 0 | return false; |
898 | | |
899 | 0 | if (mutt_ch_is_utf8(cs)) |
900 | 0 | return true; |
901 | | |
902 | 0 | if (!strict) |
903 | 0 | { |
904 | 0 | for (int i = 0; PreferredMimeNames[i].key; i++) |
905 | 0 | { |
906 | 0 | if (mutt_istr_equal(PreferredMimeNames[i].key, cs) || |
907 | 0 | mutt_istr_equal(PreferredMimeNames[i].pref, cs)) |
908 | 0 | { |
909 | 0 | return true; |
910 | 0 | } |
911 | 0 | } |
912 | 0 | } |
913 | | |
914 | 0 | iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS); |
915 | 0 | if (iconv_t_valid(cd)) |
916 | 0 | { |
917 | 0 | return true; |
918 | 0 | } |
919 | | |
920 | 0 | return false; |
921 | 0 | } |
922 | | |
923 | | /** |
924 | | * mutt_ch_fgetconv_open - Prepare a file for charset conversion |
925 | | * @param fp FILE ptr to prepare |
926 | | * @param from Current character set |
927 | | * @param to Destination character set |
928 | | * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM |
929 | | * @retval ptr fgetconv handle |
930 | | * |
931 | | * Parameter flags is given as-is to mutt_ch_iconv_open(). |
932 | | */ |
933 | | struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags) |
934 | 0 | { |
935 | 0 | iconv_t cd = ICONV_T_INVALID; |
936 | |
|
937 | 0 | if (from && to) |
938 | 0 | cd = mutt_ch_iconv_open(to, from, flags); |
939 | |
|
940 | 0 | struct FgetConv *fc = MUTT_MEM_CALLOC(1, struct FgetConv); |
941 | 0 | fc->fp = fp; |
942 | 0 | fc->cd = cd; |
943 | |
|
944 | 0 | if (iconv_t_valid(cd)) |
945 | 0 | { |
946 | 0 | static const char *repls[] = { "\357\277\275", "?", 0 }; |
947 | |
|
948 | 0 | fc->p = fc->bufo; |
949 | 0 | fc->ob = fc->bufo; |
950 | 0 | fc->ib = fc->bufi; |
951 | 0 | fc->ibl = 0; |
952 | 0 | fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1; |
953 | 0 | } |
954 | |
|
955 | 0 | return fc; |
956 | 0 | } |
957 | | |
/**
 * mutt_ch_fgetconv_close - Close an fgetconv handle
 * @param[out] ptr fgetconv handle
 *
 * Only the handle itself is released here; the FILE pointer and the iconv
 * descriptor stored in it are not touched by this function.
 */
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
{
  if (ptr && *ptr)
  {
    FREE(ptr);
  }
}
969 | | |
970 | | /** |
971 | | * mutt_ch_fgetconv - Convert a file's character set |
972 | | * @param fc FgetConv handle |
973 | | * @retval num Next character in the converted file |
974 | | * @retval EOF Error |
975 | | * |
976 | | * A file is read into a buffer and its character set is converted. |
977 | | * Each call to this function will return one converted character. |
978 | | * The buffer is refilled automatically when empty. |
979 | | */ |
980 | | int mutt_ch_fgetconv(struct FgetConv *fc) |
981 | 0 | { |
982 | 0 | if (!fc) |
983 | 0 | return EOF; |
984 | 0 | if (!iconv_t_valid(fc->cd)) |
985 | 0 | return fgetc(fc->fp); |
986 | 0 | if (!fc->p) |
987 | 0 | return EOF; |
988 | 0 | if (fc->p < fc->ob) |
989 | 0 | return (unsigned char) *(fc->p)++; |
990 | | |
991 | | /* Try to convert some more */ |
992 | 0 | fc->p = fc->bufo; |
993 | 0 | fc->ob = fc->bufo; |
994 | 0 | if (fc->ibl) |
995 | 0 | { |
996 | 0 | size_t obl = sizeof(fc->bufo); |
997 | 0 | iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl); |
998 | 0 | if (fc->p < fc->ob) |
999 | 0 | return (unsigned char) *(fc->p)++; |
1000 | 0 | } |
1001 | | |
1002 | | /* If we trusted iconv a bit more, we would at this point |
1003 | | * ask why it had stopped converting ... */ |
1004 | | |
1005 | | /* Try to read some more */ |
1006 | 0 | if ((fc->ibl == sizeof(fc->bufi)) || |
1007 | 0 | (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi)))) |
1008 | 0 | { |
1009 | 0 | fc->p = 0; |
1010 | 0 | return EOF; |
1011 | 0 | } |
1012 | 0 | if (fc->ibl) |
1013 | 0 | memcpy(fc->bufi, fc->ib, fc->ibl); |
1014 | 0 | fc->ib = fc->bufi; |
1015 | 0 | fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp); |
1016 | | |
1017 | | /* Try harder this time to convert some */ |
1018 | 0 | if (fc->ibl) |
1019 | 0 | { |
1020 | 0 | size_t obl = sizeof(fc->bufo); |
1021 | 0 | mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl, |
1022 | 0 | fc->inrepls, 0, NULL); |
1023 | 0 | if (fc->p < fc->ob) |
1024 | 0 | return (unsigned char) *(fc->p)++; |
1025 | 0 | } |
1026 | | |
1027 | | /* Either the file has finished or one of the buffers is too small */ |
1028 | 0 | fc->p = 0; |
1029 | 0 | return EOF; |
1030 | 0 | } |
1031 | | |
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read one line from a file into a buffer, converting the character set as
 * it goes.  Reading stops after a newline (which is kept), at EOF, or when
 * `buflen - 1` characters have been stored.  The result is always
 * NUL-terminated, provided `buflen` is at least 1.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminator */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r = 0;
  while ((r + 1) < buflen)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  return (r > 0) ? buf : NULL;
}
1064 | | |
1065 | | /** |
1066 | | * mutt_ch_set_charset - Update the records for a new character set |
1067 | | * @param charset New character set |
1068 | | * |
1069 | | * Check if this character set is utf-8 and pick a suitable replacement |
1070 | | * character for unprintable characters. |
1071 | | * |
1072 | | * @note This calls `bind_textdomain_codeset()` which will affect future |
1073 | | * message translations. |
1074 | | */ |
1075 | | void mutt_ch_set_charset(const char *charset) |
1076 | 0 | { |
1077 | 0 | char buf[256] = { 0 }; |
1078 | |
|
1079 | 0 | mutt_ch_canonical_charset(buf, sizeof(buf), charset); |
1080 | |
|
1081 | 0 | if (mutt_ch_is_utf8(buf)) |
1082 | 0 | { |
1083 | 0 | CharsetIsUtf8 = true; |
1084 | 0 | ReplacementChar = 0xfffd; /* replacement character */ |
1085 | 0 | } |
1086 | 0 | else |
1087 | 0 | { |
1088 | 0 | CharsetIsUtf8 = false; |
1089 | 0 | ReplacementChar = '?'; |
1090 | 0 | } |
1091 | |
|
1092 | | #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS) |
1093 | | bind_textdomain_codeset(PACKAGE, buf); |
1094 | | #endif |
1095 | 0 | } |
1096 | | |
1097 | | /** |
1098 | | * mutt_ch_choose - Figure the best charset to encode a string |
1099 | | * @param[in] fromcode Original charset of the string |
1100 | | * @param[in] charsets List of potential charsets to use |
1101 | | * @param[in] u String to encode |
1102 | | * @param[in] ulen Length of the string to encode |
1103 | | * @param[out] d If not NULL, point it to the converted string |
1104 | | * @param[out] dlen If not NULL, point it to the length of the d string |
1105 | | * @retval ptr Best performing charset |
1106 | | * @retval NULL None could be found |
1107 | | */ |
1108 | | char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets, |
1109 | | const char *u, size_t ulen, char **d, size_t *dlen) |
1110 | 0 | { |
1111 | 0 | if (!fromcode || !charsets) |
1112 | 0 | return NULL; |
1113 | | |
1114 | 0 | char *e = NULL, *tocode = NULL; |
1115 | 0 | size_t elen = 0, bestn = 0; |
1116 | |
|
1117 | 0 | const struct ListNode *np = NULL; |
1118 | 0 | STAILQ_FOREACH(np, &charsets->head, entries) |
1119 | 0 | { |
1120 | 0 | char *t = mutt_str_dup(np->data); |
1121 | 0 | if (!t) |
1122 | 0 | continue; |
1123 | | |
1124 | 0 | size_t n = mutt_str_len(t); |
1125 | 0 | char *s = mutt_strn_dup(u, ulen); |
1126 | 0 | const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) : |
1127 | 0 | mutt_ch_check(s, ulen, fromcode, t); |
1128 | 0 | if (rc) |
1129 | 0 | { |
1130 | 0 | FREE(&t); |
1131 | 0 | FREE(&s); |
1132 | 0 | continue; |
1133 | 0 | } |
1134 | 0 | size_t slen = mutt_str_len(s); |
1135 | |
|
1136 | 0 | if (!tocode || (n < bestn)) |
1137 | 0 | { |
1138 | 0 | bestn = n; |
1139 | 0 | FREE(&tocode); |
1140 | 0 | tocode = t; |
1141 | 0 | if (d) |
1142 | 0 | { |
1143 | 0 | FREE(&e); |
1144 | 0 | e = s; |
1145 | 0 | } |
1146 | 0 | else |
1147 | 0 | { |
1148 | 0 | FREE(&s); |
1149 | 0 | } |
1150 | 0 | elen = slen; |
1151 | 0 | } |
1152 | 0 | else |
1153 | 0 | { |
1154 | 0 | FREE(&t); |
1155 | 0 | FREE(&s); |
1156 | 0 | } |
1157 | 0 | } |
1158 | 0 | if (tocode) |
1159 | 0 | { |
1160 | 0 | if (d) |
1161 | 0 | *d = e; |
1162 | 0 | if (dlen) |
1163 | 0 | *dlen = elen; |
1164 | |
|
1165 | 0 | char canonical_buf[1024] = { 0 }; |
1166 | 0 | mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode); |
1167 | 0 | mutt_str_replace(&tocode, canonical_buf); |
1168 | 0 | } |
1169 | 0 | return tocode; |
1170 | 0 | } |
1171 | | |
1172 | | /** |
1173 | | * mutt_ch_cache_cleanup - Clean up the cached iconv handles and charset strings |
1174 | | */ |
1175 | | void mutt_ch_cache_cleanup(void) |
1176 | 0 | { |
1177 | 0 | for (int i = 0; i < IconvCacheUsed; i++) |
1178 | 0 | { |
1179 | 0 | FREE(&IconvCache[i].fromcode1); |
1180 | 0 | FREE(&IconvCache[i].tocode1); |
1181 | 0 | if (iconv_t_valid(IconvCache[i].cd)) |
1182 | 0 | { |
1183 | 0 | iconv_close(IconvCache[i].cd); |
1184 | 0 | } |
1185 | 0 | } |
1186 | 0 | IconvCacheUsed = 0; |
1187 | 0 | } |