Coverage Report

Created: 2023-09-25 07:17

/src/neomutt/convert/convert.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * @file
3
 * Conversion between different character encodings
4
 *
5
 * @authors
6
 * Copyright (C) 2022 Michal Siedlaczek <michal@siedlaczek.me>
7
 *
8
 * @copyright
9
 * This program is free software: you can redistribute it and/or modify it under
10
 * the terms of the GNU General Public License as published by the Free Software
11
 * Foundation, either version 2 of the License, or (at your option) any later
12
 * version.
13
 *
14
 * This program is distributed in the hope that it will be useful, but WITHOUT
15
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
17
 * details.
18
 *
19
 * You should have received a copy of the GNU General Public License along with
20
 * this program.  If not, see <http://www.gnu.org/licenses/>.
21
 */
22
23
/**
24
 * @page convert_convert File Charset Conversion
25
 *
26
 * Converting files between charsets.
27
 */
28
29
#include "config.h"
30
#include <errno.h>
31
#include <iconv.h>
32
#include <stdbool.h>
33
#include <stdio.h>
34
#include <string.h>
35
#include "mutt/lib.h"
36
#include "email/lib.h"
37
#include "lib.h"
38
39
/**
40
 * mutt_convert_file_to - Change the encoding of a file
41
 * @param[in]  fp         File to convert
42
 * @param[in]  fromcode   Original encoding
43
 * @param[in]  tocodes    List of target encodings
44
 * @param[out] tocode     Chosen encoding
45
 * @param[out] info       Encoding information
46
 * @retval -1 Error, no conversion was possible
47
 * @retval >0 Success, number of bytes converted
48
 *
49
 * Find the best charset conversion of the file from fromcode into one
50
 * of the tocodes. If successful, set *tocode and Content *info and
51
 * return the number of characters converted inexactly.
52
 *
53
 * We convert via UTF-8 in order to avoid the condition -1(EINVAL),
54
 * which would otherwise prevent us from knowing the number of inexact
55
 * conversions. Where the candidate target charset is UTF-8 we avoid
56
 * doing the second conversion because iconv_open("UTF-8", "UTF-8")
57
 * fails with some libraries.
58
 *
59
 * We assume that the output from iconv is never more than 4 times as
60
 * long as the input for any pair of charsets we might be interested
61
 * in.
62
 */
63
size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes,
64
                            int *tocode, struct Content *info)
65
0
{
66
0
  char bufi[256], bufu[512], bufo[4 * sizeof(bufi)];
67
0
  size_t rc;
68
69
0
  const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
70
0
  if (!iconv_t_valid(cd1))
71
0
    return -1;
72
73
0
  int ncodes = tocodes->count;
74
0
  iconv_t *cd = mutt_mem_calloc(ncodes, sizeof(iconv_t));
75
0
  size_t *score = mutt_mem_calloc(ncodes, sizeof(size_t));
76
0
  struct ContentState *states = mutt_mem_calloc(ncodes, sizeof(struct ContentState));
77
0
  struct Content *infos = mutt_mem_calloc(ncodes, sizeof(struct Content));
78
79
0
  struct ListNode *np = NULL;
80
0
  int ni = 0;
81
0
  STAILQ_FOREACH(np, &tocodes->head, entries)
82
0
  {
83
0
    if (!mutt_istr_equal(np->data, "utf-8"))
84
0
    {
85
0
      cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
86
0
    }
87
0
    else
88
0
    {
89
      /* Special case for conversion to UTF-8 */
90
0
      cd[ni] = ICONV_T_INVALID;
91
0
      score[ni] = ICONV_ILLEGAL_SEQ;
92
0
    }
93
0
    ni += 1;
94
0
  }
95
96
0
  rewind(fp);
97
0
  size_t ibl = 0;
98
0
  while (true)
99
0
  {
100
    /* Try to fill input buffer */
101
0
    size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
102
0
    ibl += n;
103
104
    /* Convert to UTF-8 */
105
0
    const char *ib = bufi;
106
0
    char *ob = bufu;
107
0
    size_t obl = sizeof(bufu);
108
0
    n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl);
109
0
    if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
110
0
    {
111
0
      rc = ICONV_ILLEGAL_SEQ;
112
0
      break;
113
0
    }
114
0
    const size_t ubl1 = ob - bufu;
115
116
    /* Convert from UTF-8 */
117
0
    for (int i = 0; i < ncodes; i++)
118
0
    {
119
0
      if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
120
0
      {
121
0
        const char *ub = bufu;
122
0
        size_t ubl = ubl1;
123
0
        ob = bufo;
124
0
        obl = sizeof(bufo);
125
0
        n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl);
126
0
        if (n == ICONV_ILLEGAL_SEQ)
127
0
        {
128
0
          score[i] = ICONV_ILLEGAL_SEQ;
129
0
        }
130
0
        else
131
0
        {
132
0
          score[i] += n;
133
0
          mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
134
0
        }
135
0
      }
136
0
      else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
137
0
      {
138
        /* Special case for conversion to UTF-8 */
139
0
        mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
140
0
      }
141
0
    }
142
143
0
    if (ibl)
144
0
    {
145
      /* Save unused input */
146
0
      memmove(bufi, ib, ibl);
147
0
    }
148
0
    else if (!ubl1 && (ib < bufi + sizeof(bufi)))
149
0
    {
150
0
      rc = 0;
151
0
      break;
152
0
    }
153
0
  }
154
155
0
  if (rc == 0)
156
0
  {
157
    /* Find best score */
158
0
    rc = ICONV_ILLEGAL_SEQ;
159
0
    for (int i = 0; i < ncodes; i++)
160
0
    {
161
0
      if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
162
0
      {
163
        /* Special case for conversion to UTF-8 */
164
0
        *tocode = i;
165
0
        rc = 0;
166
0
        break;
167
0
      }
168
0
      else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
169
0
      {
170
0
        continue;
171
0
      }
172
0
      else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
173
0
      {
174
0
        *tocode = i;
175
0
        rc = score[i];
176
0
        if (rc == 0)
177
0
          break;
178
0
      }
179
0
    }
180
0
    if (rc != ICONV_ILLEGAL_SEQ)
181
0
    {
182
0
      memcpy(info, &infos[*tocode], sizeof(struct Content));
183
0
      mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */
184
0
    }
185
0
  }
186
187
0
  FREE(&cd);
188
0
  FREE(&infos);
189
0
  FREE(&score);
190
0
  FREE(&states);
191
192
0
  return rc;
193
0
}
194
195
/**
196
 * mutt_convert_file_from_to - Convert a file between encodings
197
 * @param[in]  fp        File to read from
198
 * @param[in]  fromcodes Charsets to try converting FROM
199
 * @param[in]  tocodes   Charsets to try converting TO
200
 * @param[out] fromcode  From charset selected
201
 * @param[out] tocode    To charset selected
202
 * @param[out] info      Info about the file
203
 * @retval num               Characters converted
204
 * @retval ICONV_ILLEGAL_SEQ Error (as a size_t)
205
 *
206
 * Find the first of the fromcodes that gives a valid conversion and the best
207
 * charset conversion of the file into one of the tocodes. If successful, set
208
 * *fromcode and *tocode to dynamically allocated strings, set Content *info,
209
 * and return the number of characters converted inexactly. If no conversion
210
 * was possible, return -1.
211
 */
212
size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes,
213
                                 const struct Slist *tocodes, char **fromcode,
214
                                 char **tocode, struct Content *info)
215
0
{
216
0
  char **tcode = NULL;
217
0
  size_t rc;
218
0
  int cn;
219
0
  struct ListNode *np = NULL;
220
221
  /* Copy them */
222
0
  tcode = mutt_mem_calloc(tocodes->count, sizeof(char *));
223
0
  np = NULL;
224
0
  cn = 0;
225
0
  STAILQ_FOREACH(np, &tocodes->head, entries)
226
0
  {
227
0
    tcode[cn++] = mutt_str_dup(np->data);
228
0
  }
229
230
0
  rc = ICONV_ILLEGAL_SEQ;
231
0
  np = NULL;
232
0
  cn = 0;
233
0
  STAILQ_FOREACH(np, &fromcodes->head, entries)
234
0
  {
235
    /* Try each fromcode in turn */
236
0
    rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info);
237
0
    if (rc != ICONV_ILLEGAL_SEQ)
238
0
    {
239
0
      *fromcode = np->data;
240
0
      *tocode = tcode[cn];
241
0
      tcode[cn] = 0;
242
0
      break;
243
0
    }
244
0
  }
245
246
  /* Free memory */
247
0
  for (cn = 0; cn < tocodes->count; cn++)
248
0
    FREE(&tcode[cn]);
249
250
0
  FREE(&tcode);
251
252
0
  return rc;
253
0
}