Coverage Report

Created: 2025-03-11 06:49

/src/neomutt/mutt/charset.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * @file
3
 * Conversion between different character encodings
4
 *
5
 * @authors
6
 * Copyright (C) 2017 Tobias Angele <toogley@mailbox.org>
7
 * Copyright (C) 2017-2023 Richard Russon <rich@flatcap.org>
8
 * Copyright (C) 2018-2023 Pietro Cerutti <gahr@gahr.ch>
9
 * Copyright (C) 2023 Steinar H Gunderson <steinar+neomutt@gunderson.no>
10
 *
11
 * @copyright
12
 * This program is free software: you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License as published by the Free Software
14
 * Foundation, either version 2 of the License, or (at your option) any later
15
 * version.
16
 *
17
 * This program is distributed in the hope that it will be useful, but WITHOUT
18
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
20
 * details.
21
 *
22
 * You should have received a copy of the GNU General Public License along with
23
 * this program.  If not, see <http://www.gnu.org/licenses/>.
24
 */
25
26
/**
27
 * @page mutt_charset Conversion between different character encodings
28
 *
29
 * Conversion between different character encodings
30
 */
31
32
#include "config.h"
33
#include <errno.h>
34
#include <iconv.h>
35
#include <langinfo.h>
36
#include <limits.h>
37
#include <stdbool.h>
38
#include <stdio.h>
39
#include <string.h>
40
#include "charset.h"
41
#include "buffer.h"
42
#include "list.h"
43
#include "logging2.h"
44
#include "memory.h"
45
#include "pool.h"
46
#include "queue.h"
47
#include "regex3.h"
48
#include "slist.h"
49
#include "string2.h"
50
#ifdef ENABLE_NLS
51
#include <libintl.h>
52
#endif
53
54
#ifndef EILSEQ
55
#define EILSEQ EINVAL
56
#endif
57
58
/**
59
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Initialised to '?'.
60
 */
61
wchar_t ReplacementChar = '?';
62
63
/**
64
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Initialised to false.
65
 */
66
bool CharsetIsUtf8 = false;
67
68
/**
69
 * struct Lookup - One regex-to-charset mapping
70
 *
71
 * This is used by 'charset-hook' and 'iconv-hook'.
 * Entries are created by mutt_ch_lookup_add() and destroyed by lookup_free().
72
 */
73
struct Lookup
74
{
75
  enum LookupType type;        ///< Lookup type, e.g. #MUTT_LOOKUP_CHARSET
76
  struct Regex regex;          ///< Pattern matched against a charset name (embedded, not a pointer)
77
  char *replacement;           ///< Charset name returned when the regex matches (heap copy)
78
  TAILQ_ENTRY(Lookup) entries; ///< Linkage in the #Lookups list
79
};
80
TAILQ_HEAD(LookupList, Lookup); ///< Declares struct LookupList, a tail queue of Lookup
81
82
/// All registered lookups, in the order they were added by mutt_ch_lookup_add()
83
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
84
85
/**
86
 * struct IconvCacheEntry - Cached iconv conversion descriptor
 *
 * The pair (fromcode1, tocode1) is the key that mutt_ch_iconv_open() compares
 * with strcmp() when searching the cache.
87
 */
88
struct IconvCacheEntry
89
{
90
  char *fromcode1; ///< Canonical source character set (heap copy)
91
  char *tocode1;   ///< Canonical destination character set (heap copy)
92
  iconv_t cd;      ///< iconv conversion descriptor; stored even if iconv_open() failed
93
};
94
95
/// Max size of the iconv cache
96
0
#define ICONV_CACHE_SIZE 16
97
/// Cache of iconv conversion descriptors, most recently used entry at index 0
98
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE];
99
/// Number of iconv descriptors in the cache (never more than ICONV_CACHE_SIZE)
100
static int IconvCacheUsed = 0;
101
102
/**
103
 * struct MimeNames - MIME name lookup entry
 *
 * One row of #PreferredMimeNames.
104
 */
105
struct MimeNames
106
{
107
  const char *key;  ///< Charset alias to look for; NULL marks the end of the table
108
  const char *pref; ///< Preferred name to use in place of `key`
109
};
110
111
/**
112
 * PreferredMimeNames - Lookup table of preferred charsets
113
 *
114
 * The following list has been created manually from the data under:
115
 * http://www.isi.edu/in-notes/iana/assignments/character-sets
116
 * Last update: 2000-09-07
117
 *
118
 * @note It includes only the subset of character sets for which a preferred
119
 * MIME name is given.
 *
 * Each entry maps an alias (`key`) to its preferred name (`pref`).  The code
 * in this file compares names using mutt_istr_equal() and stops scanning at
 * the terminating `{ NULL, NULL }` entry, which must therefore stay last.
120
 */
121
static const struct MimeNames PreferredMimeNames[] = {
122
  // clang-format off
123
  { "ansi_x3.4-1968",        "us-ascii"      },
124
  { "iso-ir-6",              "us-ascii"      },
125
  { "iso_646.irv:1991",      "us-ascii"      },
126
  { "ascii",                 "us-ascii"      },
127
  { "iso646-us",             "us-ascii"      },
128
  { "us",                    "us-ascii"      },
129
  { "ibm367",                "us-ascii"      },
130
  { "cp367",                 "us-ascii"      },
131
  { "csASCII",               "us-ascii"      },
132
133
  { "csISO2022KR",           "iso-2022-kr"   },
134
  { "csEUCKR",               "euc-kr"        },
135
  { "csISO2022JP",           "iso-2022-jp"   },
136
  { "csISO2022JP2",          "iso-2022-jp-2" },
137
138
  { "ISO_8859-1:1987",       "iso-8859-1"    },
139
  { "iso-ir-100",            "iso-8859-1"    },
140
  { "iso_8859-1",            "iso-8859-1"    },
141
  { "latin1",                "iso-8859-1"    },
142
  { "l1",                    "iso-8859-1"    },
143
  { "IBM819",                "iso-8859-1"    },
144
  { "CP819",                 "iso-8859-1"    },
145
  { "csISOLatin1",           "iso-8859-1"    },
146
147
  { "ISO_8859-2:1987",       "iso-8859-2"    },
148
  { "iso-ir-101",            "iso-8859-2"    },
149
  { "iso_8859-2",            "iso-8859-2"    },
150
  { "latin2",                "iso-8859-2"    },
151
  { "l2",                    "iso-8859-2"    },
152
  { "csISOLatin2",           "iso-8859-2"    },
153
154
  { "ISO_8859-3:1988",       "iso-8859-3"    },
155
  { "iso-ir-109",            "iso-8859-3"    },
156
  { "ISO_8859-3",            "iso-8859-3"    },
157
  { "latin3",                "iso-8859-3"    },
158
  { "l3",                    "iso-8859-3"    },
159
  { "csISOLatin3",           "iso-8859-3"    },
160
161
  { "ISO_8859-4:1988",       "iso-8859-4"    },
162
  { "iso-ir-110",            "iso-8859-4"    },
163
  { "ISO_8859-4",            "iso-8859-4"    },
164
  { "latin4",                "iso-8859-4"    },
165
  { "l4",                    "iso-8859-4"    },
166
  { "csISOLatin4",           "iso-8859-4"    },
167
168
  { "ISO_8859-6:1987",       "iso-8859-6"    },
169
  { "iso-ir-127",            "iso-8859-6"    },
170
  { "iso_8859-6",            "iso-8859-6"    },
171
  { "ECMA-114",              "iso-8859-6"    },
172
  { "ASMO-708",              "iso-8859-6"    },
173
  { "arabic",                "iso-8859-6"    },
174
  { "csISOLatinArabic",      "iso-8859-6"    },
175
176
  { "ISO_8859-7:1987",       "iso-8859-7"    },
177
  { "iso-ir-126",            "iso-8859-7"    },
178
  { "ISO_8859-7",            "iso-8859-7"    },
179
  { "ELOT_928",              "iso-8859-7"    },
180
  { "ECMA-118",              "iso-8859-7"    },
181
  { "greek",                 "iso-8859-7"    },
182
  { "greek8",                "iso-8859-7"    },
183
  { "csISOLatinGreek",       "iso-8859-7"    },
184
185
  { "ISO_8859-8:1988",       "iso-8859-8"    },
186
  { "iso-ir-138",            "iso-8859-8"    },
187
  { "ISO_8859-8",            "iso-8859-8"    },
188
  { "hebrew",                "iso-8859-8"    },
189
  { "csISOLatinHebrew",      "iso-8859-8"    },
190
191
  { "ISO_8859-5:1988",       "iso-8859-5"    },
192
  { "iso-ir-144",            "iso-8859-5"    },
193
  { "ISO_8859-5",            "iso-8859-5"    },
194
  { "cyrillic",              "iso-8859-5"    },
195
  { "csISOLatinCyrillic",    "iso-8859-5"    },
196
197
  { "ISO_8859-9:1989",       "iso-8859-9"    },
198
  { "iso-ir-148",            "iso-8859-9"    },
199
  { "ISO_8859-9",            "iso-8859-9"    },
200
  { "latin5",                "iso-8859-9"    },  /* this is not a bug */
201
  { "l5",                    "iso-8859-9"    },
202
  { "csISOLatin5",           "iso-8859-9"    },
203
204
  { "ISO_8859-10:1992",      "iso-8859-10"   },
205
  { "iso-ir-157",            "iso-8859-10"   },
206
  { "latin6",                "iso-8859-10"   },  /* this is not a bug */
207
  { "l6",                    "iso-8859-10"   },
208
  { "csISOLatin6",           "iso-8859-10"   },
209
210
  { "csKOI8r",               "koi8-r"        },
211
212
  { "MS_Kanji",              "Shift_JIS"     },  /* Note the underscore! */
213
  { "csShiftJis",            "Shift_JIS"     },
214
215
  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
216
                             "euc-jp"        },
217
  { "csEUCPkdFmtJapanese",   "euc-jp"        },
218
219
  { "csGB2312",              "gb2312"        },
220
  { "csbig5",                "big5"          },
221
222
  /* End of official brain damage.
223
   * What follows has been taken from glibc's localedata files.  */
224
225
  { "iso_8859-13",           "iso-8859-13"   },
226
  { "iso-ir-179",            "iso-8859-13"   },
227
  { "latin7",                "iso-8859-13"   },  /* this is not a bug */
228
  { "l7",                    "iso-8859-13"   },
229
230
  { "iso_8859-14",           "iso-8859-14"   },
231
  { "latin8",                "iso-8859-14"   },  /* this is not a bug */
232
  { "l8",                    "iso-8859-14"   },
233
234
  { "iso_8859-15",           "iso-8859-15"   },
235
  { "latin9",                "iso-8859-15"   },  /* this is not a bug */
236
237
  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
238
  { "latin0",                "iso-8859-15"   },  /* this is not a bug */
239
240
  { "iso_8859-16",           "iso-8859-16"   },
241
  { "latin10",               "iso-8859-16"   },  /* this is not a bug */
242
243
  { "646",                   "us-ascii"      },
244
245
  /* http://www.sun.com/software/white-papers/wp-unicode/ */
246
247
  { "eucJP",                 "euc-jp"        },
248
  { "PCK",                   "Shift_JIS"     },
249
  { "ko_KR-euc",             "euc-kr"        },
250
  { "zh_TW-big5",            "big5"          },
251
252
  /* seems to be common on some systems */
253
254
  { "sjis",                  "Shift_JIS"     },
255
  { "euc-jp-ms",             "eucJP-ms"      },
256
257
  /* If you happen to encounter system-specific brain-damage with respect to
258
   * character set naming, please add it above this comment, and submit a patch
259
   * to <neomutt-devel@neomutt.org> */
260
261
  { NULL, NULL },
262
  // clang-format on
263
};
264
265
/**
266
 * lookup_new - Create a new Lookup
267
 * @retval ptr New Lookup
268
 */
269
static struct Lookup *lookup_new(void)
270
0
{
271
0
  return MUTT_MEM_CALLOC(1, struct Lookup);
272
0
}
273
274
/**
275
 * lookup_free - Free a Lookup
276
 * @param ptr Lookup to free
277
 */
278
static void lookup_free(struct Lookup **ptr)
279
0
{
280
0
  if (!ptr || !*ptr)
281
0
    return;
282
283
0
  struct Lookup *l = *ptr;
284
0
  FREE(&l->replacement);
285
0
  FREE(&l->regex.pattern);
286
0
  if (l->regex.regex)
287
0
    regfree(l->regex.regex);
288
0
  FREE(&l->regex.regex);
289
0
  FREE(&l->regex);
290
291
0
  FREE(ptr);
292
0
}
293
294
/**
295
 * lookup_charset - Look for a preferred character set name
296
 * @param type Type, e.g. #MUTT_LOOKUP_CHARSET
297
 * @param cs   Character set
298
 * @retval ptr Charset string
299
 *
300
 * If the character set matches one of the regexes,
301
 * then return the replacement name.
302
 */
303
static const char *lookup_charset(enum LookupType type, const char *cs)
304
0
{
305
0
  if (!cs)
306
0
    return NULL;
307
308
0
  struct Lookup *l = NULL;
309
310
0
  TAILQ_FOREACH(l, &Lookups, entries)
311
0
  {
312
0
    if (l->type != type)
313
0
      continue;
314
0
    if (mutt_regex_match(&l->regex, cs))
315
0
      return l->replacement;
316
0
  }
317
0
  return NULL;
318
0
}
319
320
/**
321
 * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets
322
 * @param[in]     assumed_charset From $assumed_charset
323
 * @param[in]     charset         From $charset
324
 * @param[in,out] ps              String to be converted
325
 * @retval 0  Success
326
 * @retval -1 Error
327
 *
328
 * Work through `$assumed_charset` looking for a character set conversion that
329
 * works.  Failing that, try mutt_ch_get_default_charset().
330
 */
331
int mutt_ch_convert_nonmime_string(const struct Slist *const assumed_charset,
332
                                   const char *charset, char **ps)
333
0
{
334
0
  if (!ps)
335
0
    return -1;
336
337
0
  char *u = *ps;
338
0
  const size_t ulen = mutt_str_len(u);
339
0
  if (ulen == 0)
340
0
    return 0;
341
342
0
  const struct ListNode *np = NULL;
343
0
  STAILQ_FOREACH(np, &assumed_charset->head, entries)
344
0
  {
345
0
    char const *c = np->data;
346
0
    size_t n = mutt_str_len(c);
347
0
    char *fromcode = MUTT_MEM_MALLOC(n + 1, char);
348
0
    mutt_str_copy(fromcode, c, n + 1);
349
0
    char *s = mutt_strn_dup(u, ulen);
350
0
    int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351
0
    FREE(&fromcode);
352
0
    if (m == 0)
353
0
    {
354
0
      FREE(ps);
355
0
      *ps = s;
356
0
      return 0;
357
0
    }
358
0
    FREE(&s);
359
0
  }
360
0
  mutt_ch_convert_string(ps, mutt_ch_get_default_charset(assumed_charset),
361
0
                         charset, MUTT_ICONV_HOOK_FROM);
362
0
  return -1;
363
0
}
364
365
/**
366
 * mutt_ch_canonical_charset - Canonicalise the charset of a string
367
 * @param buf Buffer for canonical character set name
368
 * @param buflen Length of buffer
369
 * @param name Name to be canonicalised
370
 *
371
 * This first ties off any charset extension such as "//TRANSLIT",
372
 * canonicalizes the charset and re-adds the extension
373
 */
374
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
375
0
{
376
0
  if (!buf || !name)
377
0
    return;
378
379
0
  char in[1024] = { 0 };
380
0
  char scratch[1024 + 10] = { 0 };
381
0
  struct Buffer *canon = buf_pool_get();
382
383
0
  mutt_str_copy(in, name, sizeof(in));
384
0
  char *ext = strchr(in, '/');
385
0
  if (ext)
386
0
    *ext++ = '\0';
387
388
0
  if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
389
0
  {
390
0
    buf_strcpy(canon, "utf-8");
391
0
    goto out;
392
0
  }
393
394
  /* catch some common iso-8859-something misspellings */
395
0
  size_t plen;
396
0
  if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
397
0
    snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
398
0
  else if ((plen = mutt_istr_startswith(in, "8859-")))
399
0
    snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
400
0
  else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
401
0
    snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
402
0
  else if ((plen = mutt_istr_startswith(in, "iso8859-")))
403
0
    snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
404
0
  else
405
0
    mutt_str_copy(scratch, in, sizeof(scratch));
406
407
0
  for (size_t i = 0; PreferredMimeNames[i].key; i++)
408
0
  {
409
0
    if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
410
0
    {
411
0
      buf_strcpy(canon, PreferredMimeNames[i].pref);
412
0
      goto out;
413
0
    }
414
0
  }
415
416
0
  buf_strcpy(canon, scratch);
417
0
  buf_lower(canon); // for cosmetics' sake
418
419
0
out:
420
0
  if (ext && (*ext != '\0'))
421
0
  {
422
0
    buf_addch(canon, '/');
423
0
    buf_addstr(canon, ext);
424
0
  }
425
426
0
  mutt_str_copy(buf, buf_string(canon), buflen);
427
0
  buf_pool_release(&canon);
428
0
}
429
430
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set (canonicalised before comparing)
 * @param cs2 Second character set (compared as given)
 * @retval true  Names are equivalent
 * @retval false Names differ, or either argument is NULL
 *
 * mutt_ch_canonical_charset() keeps any extension on `cs1`, whereas `cs2` is
 * expected to come from neomutt code and carry none.  The names are therefore
 * considered equivalent when the shorter one is a prefix of the longer one.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256] = { 0 };
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  /* compare over the length of the shorter name */
  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cs2_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}
457
458
/**
459
 * mutt_ch_get_default_charset - Get the default character set
460
 * @param assumed_charset From $assumed_charset
461
 * @retval ptr Name of the default character set
462
 *
463
 * @warning This returns a pointer to a static buffer.  Do not free it.
464
 */
465
const char *mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
466
843
{
467
843
  static char fcharset[128];
468
843
  const char *c = NULL;
469
470
843
  if (assumed_charset && (assumed_charset->count > 0))
471
0
    c = STAILQ_FIRST(&assumed_charset->head)->data;
472
843
  else
473
843
    c = "us-ascii";
474
475
843
  mutt_str_copy(fcharset, c, sizeof(fcharset));
476
843
  return fcharset;
477
843
}
478
479
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * Canonicalises the codeset reported by nl_langinfo(CODESET).
 * If that yields an empty name, "iso-8859-1" is used instead.
 *
 * The caller must free the returned string.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char canon[1024] = { 0 };

  mutt_ch_canonical_charset(canon, sizeof(canon), nl_langinfo(CODESET));

  const char *chosen = (canon[0] == '\0') ? "iso-8859-1" : canon;
  return mutt_str_dup(chosen);
}
497
498
/**
499
 * mutt_ch_lookup_add - Add a new character set lookup
500
 * @param type    Type of character set, e.g. #MUTT_LOOKUP_CHARSET
501
 * @param pat     Pattern to match
502
 * @param replace Replacement string
503
 * @param err     Buffer for error message
504
 * @retval true  Lookup added to list
505
 * @retval false Regex string was invalid
506
 *
507
 * Add a regex for a character set and a replacement name.
508
 */
509
bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
510
                        const char *replace, struct Buffer *err)
511
0
{
512
0
  if (!pat || !replace)
513
0
    return false;
514
515
0
  regex_t *rx = MUTT_MEM_CALLOC(1, regex_t);
516
0
  int rc = REG_COMP(rx, pat, REG_ICASE);
517
0
  if (rc != 0)
518
0
  {
519
0
    regerror(rc, rx, err->data, err->dsize);
520
0
    FREE(&rx);
521
0
    return false;
522
0
  }
523
524
0
  struct Lookup *l = lookup_new();
525
0
  l->type = type;
526
0
  l->replacement = mutt_str_dup(replace);
527
0
  l->regex.pattern = mutt_str_dup(pat);
528
0
  l->regex.regex = rx;
529
0
  l->regex.pat_not = false;
530
531
0
  TAILQ_INSERT_TAIL(&Lookups, l, entries);
532
533
0
  return true;
534
0
}
535
536
/**
537
 * mutt_ch_lookup_remove - Remove all the character set lookups
538
 *
539
 * Empty the list of replacement character set names.
540
 */
541
void mutt_ch_lookup_remove(void)
542
0
{
543
0
  struct Lookup *l = NULL;
544
0
  struct Lookup *tmp = NULL;
545
546
0
  TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
547
0
  {
548
0
    TAILQ_REMOVE(&Lookups, l, entries);
549
0
    lookup_free(&l);
550
0
  }
551
0
}
552
553
/**
554
 * mutt_ch_charset_lookup - Look for a replacement character set
555
 * @param chs Character set to lookup
556
 * @retval ptr  Replacement character set (if a 'charset-hook' matches)
557
 * @retval NULL No matching hook
558
 *
559
 * Look through all the 'charset-hook's.
560
 * If one matches return the replacement character set.
561
 */
562
const char *mutt_ch_charset_lookup(const char *chs)
563
0
{
564
0
  return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
565
0
}
566
567
/**
568
 * mutt_ch_iconv_open - Set up iconv for conversions
569
 * @param tocode   Current character set
570
 * @param fromcode Target character set
571
 * @param flags    Flags, e.g. #MUTT_ICONV_HOOK_FROM
572
 * @retval ptr iconv handle for the conversion
573
 *
574
 * Like iconv_open, but canonicalises the charsets, applies charset-hooks,
575
 * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips
576
 * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers
577
 * should use flags=0 when fromcode can safely be considered true, either some
578
 * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be
579
 * used only when fromcode is unsure, taken from a possibly wrong incoming MIME
580
 * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
581
 * in some setups.
582
 *
583
 * Since calling iconv_open() repeatedly can be expensive, we keep a cache of
584
 * the most recently used iconv_t objects, kept in LRU order. This means that
585
 * you should not call iconv_close() on the object yourself. All remaining
586
 * objects in the cache will exit when main() calls mutt_ch_cache_cleanup().
587
 *
588
 * @note By design charset-hooks should never be, and are never, applied
589
 * to tocode.
590
 *
591
 * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks,
592
 * not at all on iconv-hooks.
593
 */
594
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
595
0
{
596
0
  char tocode1[128] = { 0 };
597
0
  char fromcode1[128] = { 0 };
598
0
  const char *tocode2 = NULL, *fromcode2 = NULL;
599
0
  const char *tmp = NULL;
600
601
  /* transform to MIME preferred charset names */
602
0
  mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
603
0
  mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
604
605
  /* maybe apply charset-hooks and recanonicalise fromcode,
606
   * but only when caller asked us to sanitize a potentially wrong
607
   * charset name incoming from the wild exterior. */
608
0
  if (flags & MUTT_ICONV_HOOK_FROM)
609
0
  {
610
0
    tmp = mutt_ch_charset_lookup(fromcode1);
611
0
    if (tmp)
612
0
      mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
613
0
  }
614
615
  /* check if we have this pair cached already */
616
0
  for (int i = 0; i < IconvCacheUsed; i++)
617
0
  {
618
0
    if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
619
0
        strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
620
0
    {
621
0
      iconv_t cd = IconvCache[i].cd;
622
623
      /* make room for this one at the top */
624
0
      struct IconvCacheEntry top = IconvCache[i];
625
0
      for (int j = i - 1; j >= 0; j--)
626
0
      {
627
0
        IconvCache[j + 1] = IconvCache[j];
628
0
      }
629
0
      IconvCache[0] = top;
630
631
0
      if (iconv_t_valid(cd))
632
0
      {
633
        /* reset state */
634
0
        iconv(cd, NULL, NULL, NULL, NULL);
635
0
      }
636
0
      return cd;
637
0
    }
638
0
  }
639
640
  /* not found in cache */
641
  /* always apply iconv-hooks to suit system's iconv tastes */
642
0
  tocode2 = mutt_ch_iconv_lookup(tocode1);
643
0
  tocode2 = tocode2 ? tocode2 : tocode1;
644
0
  fromcode2 = mutt_ch_iconv_lookup(fromcode1);
645
0
  fromcode2 = fromcode2 ? fromcode2 : fromcode1;
646
647
  /* call system iconv with names it appreciates */
648
0
  iconv_t cd = iconv_open(tocode2, fromcode2);
649
650
0
  if (IconvCacheUsed == ICONV_CACHE_SIZE)
651
0
  {
652
0
    mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
653
0
               IconvCache[IconvCacheUsed - 1].fromcode1,
654
0
               IconvCache[IconvCacheUsed - 1].tocode1);
655
    /* get rid of the oldest entry */
656
0
    FREE(&IconvCache[IconvCacheUsed - 1].fromcode1);
657
0
    FREE(&IconvCache[IconvCacheUsed - 1].tocode1);
658
0
    if (iconv_t_valid(IconvCache[IconvCacheUsed - 1].cd))
659
0
    {
660
0
      iconv_close(IconvCache[IconvCacheUsed - 1].cd);
661
0
    }
662
0
    IconvCacheUsed--;
663
0
  }
664
665
  /* make room for this one at the top */
666
0
  for (int j = IconvCacheUsed - 1; j >= 0; j--)
667
0
  {
668
0
    IconvCache[j + 1] = IconvCache[j];
669
0
  }
670
671
0
  IconvCacheUsed++;
672
673
0
  mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
674
0
  IconvCache[0].fromcode1 = strdup(fromcode1);
675
0
  IconvCache[0].tocode1 = strdup(tocode1);
676
0
  IconvCache[0].cd = cd;
677
678
0
  return cd;
679
0
}
680
681
/**
682
 * mutt_ch_iconv - Change the encoding of a string
683
 * @param[in]     cd           Iconv conversion descriptor
684
 * @param[in,out] inbuf        Buffer to convert
685
 * @param[in,out] inbytesleft  Length of buffer to convert
686
 * @param[in,out] outbuf       Buffer for the result
687
 * @param[in,out] outbytesleft Length of result buffer
688
 * @param[in]     inrepls      Input replacement characters
689
 * @param[in]     outrepl      Output replacement characters
690
 * @param[out]    iconverrno   Errno if iconv() fails, 0 if it succeeds
691
 * @retval num Characters converted
692
 *
693
 * Like iconv, but keeps going even when the input is invalid
694
 * If you're supplying inrepls, the source charset should be stateless;
695
 * if you're supplying an outrepl, the target charset should be.
696
 */
697
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
698
                     char **outbuf, size_t *outbytesleft, const char **inrepls,
699
                     const char *outrepl, int *iconverrno)
700
0
{
701
0
  size_t rc = 0;
702
0
  const char *ib = *inbuf;
703
0
  size_t ibl = *inbytesleft;
704
0
  char *ob = *outbuf;
705
0
  size_t obl = *outbytesleft;
706
707
0
  while (true)
708
0
  {
709
0
    errno = 0;
710
0
    const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
711
0
    if (ret1 != ICONV_ILLEGAL_SEQ)
712
0
      rc += ret1;
713
0
    if (iconverrno)
714
0
      *iconverrno = errno;
715
716
0
    if (ibl && obl && (errno == EILSEQ))
717
0
    {
718
0
      if (inrepls)
719
0
      {
720
        /* Try replacing the input */
721
0
        const char **t = NULL;
722
0
        for (t = inrepls; *t; t++)
723
0
        {
724
0
          const char *ib1 = *t;
725
0
          size_t ibl1 = strlen(*t);
726
0
          char *ob1 = ob;
727
0
          size_t obl1 = obl;
728
0
          iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
729
0
          if (ibl1 == 0)
730
0
          {
731
0
            ib++;
732
0
            ibl--;
733
0
            ob = ob1;
734
0
            obl = obl1;
735
0
            rc++;
736
0
            break;
737
0
          }
738
0
        }
739
0
        if (*t)
740
0
          continue;
741
0
      }
742
      /* Replace the output */
743
0
      if (!outrepl)
744
0
        outrepl = "?";
745
0
      iconv(cd, NULL, NULL, &ob, &obl);
746
0
      if (obl)
747
0
      {
748
0
        int n = strlen(outrepl);
749
0
        if (n > obl)
750
0
        {
751
0
          outrepl = "?";
752
0
          n = 1;
753
0
        }
754
0
        memcpy(ob, outrepl, n);
755
0
        ib++;
756
0
        ibl--;
757
0
        ob += n;
758
0
        obl -= n;
759
0
        rc++;
760
0
        iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
761
0
        continue;
762
0
      }
763
0
    }
764
0
    *inbuf = ib;
765
0
    *inbytesleft = ibl;
766
0
    *outbuf = ob;
767
0
    *outbytesleft = obl;
768
0
    return rc;
769
0
  }
770
0
}
771
772
/**
773
 * mutt_ch_iconv_lookup - Look for a replacement character set
774
 * @param chs Character set to lookup
775
 * @retval ptr  Replacement character set (if a 'iconv-hook' matches)
776
 * @retval NULL No matching hook
777
 *
778
 * Look through all the 'iconv-hook's.
779
 * If one matches return the replacement character set.
780
 */
781
const char *mutt_ch_iconv_lookup(const char *chs)
782
0
{
783
0
  return lookup_charset(MUTT_LOOKUP_ICONV, chs);
784
0
}
785
786
/**
787
 * mutt_ch_check - Check whether a string can be converted between encodings
788
 * @param[in] s     String to check
789
 * @param[in] slen  Length of the string to check
790
 * @param[in] from  Current character set
791
 * @param[in] to    Target character set
792
 * @retval 0  Success
793
 * @retval -1 Error in iconv_open()
794
 * @retval >0 Errno as set by iconv()
795
 */
796
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
797
0
{
798
0
  if (!s || !from || !to)
799
0
    return -1;
800
801
0
  int rc = 0;
802
0
  iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
803
0
  if (!iconv_t_valid(cd))
804
0
    return -1;
805
806
0
  size_t outlen = MB_LEN_MAX * slen;
807
0
  char *out = MUTT_MEM_MALLOC(outlen + 1, char);
808
0
  char *saved_out = out;
809
810
0
  const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
811
0
  if (convlen == ICONV_ILLEGAL_SEQ)
812
0
    rc = errno;
813
814
0
  FREE(&saved_out);
815
0
  return rc;
816
0
}
817
818
/**
819
 * mutt_ch_convert_string - Convert a string between encodings
820
 * @param[in,out] ps    String to convert
821
 * @param[in]     from  Current character set
822
 * @param[in]     to    Target character set
823
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
824
 * @retval 0      Success
825
 * @retval -1     Invalid arguments or failure to open an iconv channel
826
 * @retval errno  Failure in iconv conversion
827
 *
828
 * Parameter flags is given as-is to mutt_ch_iconv_open().
829
 * See there for its meaning and usage policy.
830
 */
831
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
832
6.43k
{
833
6.43k
  if (!ps)
834
0
    return -1;
835
836
6.43k
  char *s = *ps;
837
838
6.43k
  if (!s || (*s == '\0'))
839
2.43k
    return 0;
840
841
4.00k
  if (!to || !from)
842
4.00k
    return -1;
843
844
0
  const char *repls[] = { "\357\277\275", "?", 0 };
845
0
  int rc = 0;
846
847
0
  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
848
0
  if (!iconv_t_valid(cd))
849
0
    return -1;
850
851
0
  const char **inrepls = NULL;
852
0
  const char *outrepl = NULL;
853
854
0
  if (mutt_ch_is_utf8(to))
855
0
    outrepl = "\357\277\275";
856
0
  else if (mutt_ch_is_utf8(from))
857
0
    inrepls = repls;
858
0
  else
859
0
    outrepl = "?";
860
861
0
  const char *ib = s;
862
0
  size_t ibl = strlen(s);
863
0
  if (ibl >= (SIZE_MAX / MB_LEN_MAX))
864
0
  {
865
0
    return -1;
866
0
  }
867
0
  size_t obl = MB_LEN_MAX * ibl;
868
0
  char *buf = MUTT_MEM_MALLOC(obl + 1, char);
869
0
  char *ob = buf;
870
871
0
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
872
0
  iconv(cd, 0, 0, &ob, &obl);
873
874
0
  *ob = '\0';
875
876
0
  FREE(ps);
877
0
  *ps = buf;
878
879
0
  mutt_str_adjust(ps);
880
0
  return rc;
881
0
}
882
883
/**
884
 * mutt_ch_check_charset - Does iconv understand a character set?
885
 * @param cs     Character set to check
886
 * @param strict Check strictly by using iconv
887
 * @retval true Character set is valid
888
 *
889
 * If `strict` is false, then finding a matching character set in
890
 * #PreferredMimeNames will be enough.
891
 * If `strict` is true, or the charset is not in #PreferredMimeNames, then
892
 * iconv() with be run.
893
 */
894
bool mutt_ch_check_charset(const char *cs, bool strict)
895
0
{
896
0
  if (!cs)
897
0
    return false;
898
899
0
  if (mutt_ch_is_utf8(cs))
900
0
    return true;
901
902
0
  if (!strict)
903
0
  {
904
0
    for (int i = 0; PreferredMimeNames[i].key; i++)
905
0
    {
906
0
      if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
907
0
          mutt_istr_equal(PreferredMimeNames[i].pref, cs))
908
0
      {
909
0
        return true;
910
0
      }
911
0
    }
912
0
  }
913
914
0
  iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
915
0
  if (iconv_t_valid(cd))
916
0
  {
917
0
    return true;
918
0
  }
919
920
0
  return false;
921
0
}
922
923
/**
924
 * mutt_ch_fgetconv_open - Prepare a file for charset conversion
925
 * @param fp    FILE ptr to prepare
926
 * @param from  Current character set
927
 * @param to    Destination character set
928
 * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
929
 * @retval ptr fgetconv handle
930
 *
931
 * Parameter flags is given as-is to mutt_ch_iconv_open().
932
 */
933
struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
934
0
{
935
0
  iconv_t cd = ICONV_T_INVALID;
936
937
0
  if (from && to)
938
0
    cd = mutt_ch_iconv_open(to, from, flags);
939
940
0
  struct FgetConv *fc = MUTT_MEM_CALLOC(1, struct FgetConv);
941
0
  fc->fp = fp;
942
0
  fc->cd = cd;
943
944
0
  if (iconv_t_valid(cd))
945
0
  {
946
0
    static const char *repls[] = { "\357\277\275", "?", 0 };
947
948
0
    fc->p = fc->bufo;
949
0
    fc->ob = fc->bufo;
950
0
    fc->ib = fc->bufi;
951
0
    fc->ibl = 0;
952
0
    fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
953
0
  }
954
955
0
  return fc;
956
0
}
957
958
/**
 * mutt_ch_fgetconv_close - Close an fgetconv handle
 * @param[out] ptr fgetconv handle
 *
 * Only the handle itself is released, using FREE().
 * This function does not close the FILE pointer or the iconv descriptor
 * stored in the handle.
 */
void mutt_ch_fgetconv_close(struct FgetConv **ptr)
{
  if (ptr && *ptr)
  {
    FREE(ptr);
  }
}
969
970
/**
971
 * mutt_ch_fgetconv - Convert a file's character set
972
 * @param fc FgetConv handle
973
 * @retval num Next character in the converted file
974
 * @retval EOF Error
975
 *
976
 * A file is read into a buffer and its character set is converted.
977
 * Each call to this function will return one converted character.
978
 * The buffer is refilled automatically when empty.
979
 */
980
int mutt_ch_fgetconv(struct FgetConv *fc)
981
0
{
982
0
  if (!fc)
983
0
    return EOF;
984
0
  if (!iconv_t_valid(fc->cd))
985
0
    return fgetc(fc->fp);
986
0
  if (!fc->p)
987
0
    return EOF;
988
0
  if (fc->p < fc->ob)
989
0
    return (unsigned char) *(fc->p)++;
990
991
  /* Try to convert some more */
992
0
  fc->p = fc->bufo;
993
0
  fc->ob = fc->bufo;
994
0
  if (fc->ibl)
995
0
  {
996
0
    size_t obl = sizeof(fc->bufo);
997
0
    iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
998
0
    if (fc->p < fc->ob)
999
0
      return (unsigned char) *(fc->p)++;
1000
0
  }
1001
1002
  /* If we trusted iconv a bit more, we would at this point
1003
   * ask why it had stopped converting ... */
1004
1005
  /* Try to read some more */
1006
0
  if ((fc->ibl == sizeof(fc->bufi)) ||
1007
0
      (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1008
0
  {
1009
0
    fc->p = 0;
1010
0
    return EOF;
1011
0
  }
1012
0
  if (fc->ibl)
1013
0
    memcpy(fc->bufi, fc->ib, fc->ibl);
1014
0
  fc->ib = fc->bufi;
1015
0
  fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1016
1017
  /* Try harder this time to convert some */
1018
0
  if (fc->ibl)
1019
0
  {
1020
0
    size_t obl = sizeof(fc->bufo);
1021
0
    mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1022
0
                  fc->inrepls, 0, NULL);
1023
0
    if (fc->p < fc->ob)
1024
0
      return (unsigned char) *(fc->p)++;
1025
0
  }
1026
1027
  /* Either the file has finished or one of the buffers is too small */
1028
0
  fc->p = 0;
1029
0
  return EOF;
1030
0
}
1031
1032
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read a file into a buffer, converting the character set as it goes.
 *
 * Reading stops after a newline (which is stored), when mutt_ch_fgetconv()
 * returns EOF, or when only the space for the terminating NUL is left.
 * The result is always NUL-terminated, unless `buf` is NULL or `buflen` is 0,
 * in which case nothing is written.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer has no room even for the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r = 0;
  while ((r + 1) < buflen)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  return (r > 0) ? buf : NULL;
}
1064
1065
/**
1066
 * mutt_ch_set_charset - Update the records for a new character set
1067
 * @param charset New character set
1068
 *
1069
 * Check if this character set is utf-8 and pick a suitable replacement
1070
 * character for unprintable characters.
1071
 *
1072
 * @note This calls `bind_textdomain_codeset()` which will affect future
1073
 * message translations.
1074
 */
1075
void mutt_ch_set_charset(const char *charset)
1076
0
{
1077
0
  char buf[256] = { 0 };
1078
1079
0
  mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1080
1081
0
  if (mutt_ch_is_utf8(buf))
1082
0
  {
1083
0
    CharsetIsUtf8 = true;
1084
0
    ReplacementChar = 0xfffd; /* replacement character */
1085
0
  }
1086
0
  else
1087
0
  {
1088
0
    CharsetIsUtf8 = false;
1089
0
    ReplacementChar = '?';
1090
0
  }
1091
1092
#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1093
  bind_textdomain_codeset(PACKAGE, buf);
1094
#endif
1095
0
}
1096
1097
/**
1098
 * mutt_ch_choose - Figure the best charset to encode a string
1099
 * @param[in] fromcode Original charset of the string
1100
 * @param[in] charsets List of potential charsets to use
1101
 * @param[in] u        String to encode
1102
 * @param[in] ulen     Length of the string to encode
1103
 * @param[out] d       If not NULL, point it to the converted string
1104
 * @param[out] dlen    If not NULL, point it to the length of the d string
1105
 * @retval ptr  Best performing charset
1106
 * @retval NULL None could be found
1107
 */
1108
char *mutt_ch_choose(const char *fromcode, const struct Slist *charsets,
1109
                     const char *u, size_t ulen, char **d, size_t *dlen)
1110
0
{
1111
0
  if (!fromcode || !charsets)
1112
0
    return NULL;
1113
1114
0
  char *e = NULL, *tocode = NULL;
1115
0
  size_t elen = 0, bestn = 0;
1116
1117
0
  const struct ListNode *np = NULL;
1118
0
  STAILQ_FOREACH(np, &charsets->head, entries)
1119
0
  {
1120
0
    char *t = mutt_str_dup(np->data);
1121
0
    if (!t)
1122
0
      continue;
1123
1124
0
    size_t n = mutt_str_len(t);
1125
0
    char *s = mutt_strn_dup(u, ulen);
1126
0
    const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1127
0
                       mutt_ch_check(s, ulen, fromcode, t);
1128
0
    if (rc)
1129
0
    {
1130
0
      FREE(&t);
1131
0
      FREE(&s);
1132
0
      continue;
1133
0
    }
1134
0
    size_t slen = mutt_str_len(s);
1135
1136
0
    if (!tocode || (n < bestn))
1137
0
    {
1138
0
      bestn = n;
1139
0
      FREE(&tocode);
1140
0
      tocode = t;
1141
0
      if (d)
1142
0
      {
1143
0
        FREE(&e);
1144
0
        e = s;
1145
0
      }
1146
0
      else
1147
0
      {
1148
0
        FREE(&s);
1149
0
      }
1150
0
      elen = slen;
1151
0
    }
1152
0
    else
1153
0
    {
1154
0
      FREE(&t);
1155
0
      FREE(&s);
1156
0
    }
1157
0
  }
1158
0
  if (tocode)
1159
0
  {
1160
0
    if (d)
1161
0
      *d = e;
1162
0
    if (dlen)
1163
0
      *dlen = elen;
1164
1165
0
    char canonical_buf[1024] = { 0 };
1166
0
    mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1167
0
    mutt_str_replace(&tocode, canonical_buf);
1168
0
  }
1169
0
  return tocode;
1170
0
}
1171
1172
/**
1173
 * mutt_ch_cache_cleanup - Clean up the cached iconv handles and charset strings
1174
 */
1175
void mutt_ch_cache_cleanup(void)
1176
0
{
1177
0
  for (int i = 0; i < IconvCacheUsed; i++)
1178
0
  {
1179
0
    FREE(&IconvCache[i].fromcode1);
1180
0
    FREE(&IconvCache[i].tocode1);
1181
0
    if (iconv_t_valid(IconvCache[i].cd))
1182
0
    {
1183
0
      iconv_close(IconvCache[i].cd);
1184
0
    }
1185
0
  }
1186
0
  IconvCacheUsed = 0;
1187
0
}