/src/neomutt/convert/convert.c
Line | Count | Source (jump to first uncovered line) |
1 | | /** |
2 | | * @file |
3 | | * Conversion between different character encodings |
4 | | * |
5 | | * @authors |
6 | | * Copyright (C) 2022 Michal Siedlaczek <michal@siedlaczek.me> |
7 | | * |
8 | | * @copyright |
9 | | * This program is free software: you can redistribute it and/or modify it under |
10 | | * the terms of the GNU General Public License as published by the Free Software |
11 | | * Foundation, either version 2 of the License, or (at your option) any later |
12 | | * version. |
13 | | * |
14 | | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
17 | | * details. |
18 | | * |
19 | | * You should have received a copy of the GNU General Public License along with |
20 | | * this program. If not, see <http://www.gnu.org/licenses/>. |
21 | | */ |
22 | | |
23 | | /** |
24 | | * @page convert_convert File Charset Conversion |
25 | | * |
26 | | * Converting files between charsets. |
27 | | */ |
28 | | |
29 | | #include "config.h" |
30 | | #include <errno.h> |
31 | | #include <iconv.h> |
32 | | #include <stdbool.h> |
33 | | #include <stdio.h> |
34 | | #include <string.h> |
35 | | #include "mutt/lib.h" |
36 | | #include "email/lib.h" |
37 | | #include "lib.h" |
38 | | |
39 | | /** |
40 | | * mutt_convert_file_to - Change the encoding of a file |
41 | | * @param[in] fp File to convert |
42 | | * @param[in] fromcode Original encoding |
43 | | * @param[in] tocodes List of target encodings |
44 | | * @param[out] tocode Chosen encoding |
45 | | * @param[out] info Encoding information |
46 | | * @retval -1 Error, no conversion was possible |
47 | | * @retval >0 Success, number of bytes converted |
48 | | * |
49 | | * Find the best charset conversion of the file from fromcode into one |
50 | | * of the tocodes. If successful, set *tocode and Content *info and |
51 | | * return the number of characters converted inexactly. |
52 | | * |
53 | | * We convert via UTF-8 in order to avoid the condition -1(EINVAL), |
54 | | * which would otherwise prevent us from knowing the number of inexact |
55 | | * conversions. Where the candidate target charset is UTF-8 we avoid |
56 | | * doing the second conversion because iconv_open("UTF-8", "UTF-8") |
57 | | * fails with some libraries. |
58 | | * |
59 | | * We assume that the output from iconv is never more than 4 times as |
60 | | * long as the input for any pair of charsets we might be interested |
61 | | * in. |
62 | | */ |
63 | | size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes, |
64 | | int *tocode, struct Content *info) |
65 | 0 | { |
66 | 0 | char bufi[256], bufu[512], bufo[4 * sizeof(bufi)]; |
67 | 0 | size_t rc; |
68 | |
|
69 | 0 | const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS); |
70 | 0 | if (!iconv_t_valid(cd1)) |
71 | 0 | return -1; |
72 | | |
73 | 0 | int ncodes = tocodes->count; |
74 | 0 | iconv_t *cd = mutt_mem_calloc(ncodes, sizeof(iconv_t)); |
75 | 0 | size_t *score = mutt_mem_calloc(ncodes, sizeof(size_t)); |
76 | 0 | struct ContentState *states = mutt_mem_calloc(ncodes, sizeof(struct ContentState)); |
77 | 0 | struct Content *infos = mutt_mem_calloc(ncodes, sizeof(struct Content)); |
78 | |
|
79 | 0 | struct ListNode *np = NULL; |
80 | 0 | int ni = 0; |
81 | 0 | STAILQ_FOREACH(np, &tocodes->head, entries) |
82 | 0 | { |
83 | 0 | if (!mutt_istr_equal(np->data, "utf-8")) |
84 | 0 | { |
85 | 0 | cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS); |
86 | 0 | } |
87 | 0 | else |
88 | 0 | { |
89 | | /* Special case for conversion to UTF-8 */ |
90 | 0 | cd[ni] = ICONV_T_INVALID; |
91 | 0 | score[ni] = ICONV_ILLEGAL_SEQ; |
92 | 0 | } |
93 | 0 | ni += 1; |
94 | 0 | } |
95 | |
|
96 | 0 | rewind(fp); |
97 | 0 | size_t ibl = 0; |
98 | 0 | while (true) |
99 | 0 | { |
100 | | /* Try to fill input buffer */ |
101 | 0 | size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp); |
102 | 0 | ibl += n; |
103 | | |
104 | | /* Convert to UTF-8 */ |
105 | 0 | const char *ib = bufi; |
106 | 0 | char *ob = bufu; |
107 | 0 | size_t obl = sizeof(bufu); |
108 | 0 | n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl); |
109 | 0 | if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi))) |
110 | 0 | { |
111 | 0 | rc = ICONV_ILLEGAL_SEQ; |
112 | 0 | break; |
113 | 0 | } |
114 | 0 | const size_t ubl1 = ob - bufu; |
115 | | |
116 | | /* Convert from UTF-8 */ |
117 | 0 | for (int i = 0; i < ncodes; i++) |
118 | 0 | { |
119 | 0 | if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ)) |
120 | 0 | { |
121 | 0 | const char *ub = bufu; |
122 | 0 | size_t ubl = ubl1; |
123 | 0 | ob = bufo; |
124 | 0 | obl = sizeof(bufo); |
125 | 0 | n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl); |
126 | 0 | if (n == ICONV_ILLEGAL_SEQ) |
127 | 0 | { |
128 | 0 | score[i] = ICONV_ILLEGAL_SEQ; |
129 | 0 | } |
130 | 0 | else |
131 | 0 | { |
132 | 0 | score[i] += n; |
133 | 0 | mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo); |
134 | 0 | } |
135 | 0 | } |
136 | 0 | else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ)) |
137 | 0 | { |
138 | | /* Special case for conversion to UTF-8 */ |
139 | 0 | mutt_update_content_info(&infos[i], &states[i], bufu, ubl1); |
140 | 0 | } |
141 | 0 | } |
142 | |
|
143 | 0 | if (ibl) |
144 | 0 | { |
145 | | /* Save unused input */ |
146 | 0 | memmove(bufi, ib, ibl); |
147 | 0 | } |
148 | 0 | else if (!ubl1 && (ib < bufi + sizeof(bufi))) |
149 | 0 | { |
150 | 0 | rc = 0; |
151 | 0 | break; |
152 | 0 | } |
153 | 0 | } |
154 | |
|
155 | 0 | if (rc == 0) |
156 | 0 | { |
157 | | /* Find best score */ |
158 | 0 | rc = ICONV_ILLEGAL_SEQ; |
159 | 0 | for (int i = 0; i < ncodes; i++) |
160 | 0 | { |
161 | 0 | if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ)) |
162 | 0 | { |
163 | | /* Special case for conversion to UTF-8 */ |
164 | 0 | *tocode = i; |
165 | 0 | rc = 0; |
166 | 0 | break; |
167 | 0 | } |
168 | 0 | else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ)) |
169 | 0 | { |
170 | 0 | continue; |
171 | 0 | } |
172 | 0 | else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc)) |
173 | 0 | { |
174 | 0 | *tocode = i; |
175 | 0 | rc = score[i]; |
176 | 0 | if (rc == 0) |
177 | 0 | break; |
178 | 0 | } |
179 | 0 | } |
180 | 0 | if (rc != ICONV_ILLEGAL_SEQ) |
181 | 0 | { |
182 | 0 | memcpy(info, &infos[*tocode], sizeof(struct Content)); |
183 | 0 | mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */ |
184 | 0 | } |
185 | 0 | } |
186 | |
|
187 | 0 | FREE(&cd); |
188 | 0 | FREE(&infos); |
189 | 0 | FREE(&score); |
190 | 0 | FREE(&states); |
191 | |
|
192 | 0 | return rc; |
193 | 0 | } |
194 | | |
195 | | /** |
196 | | * mutt_convert_file_from_to - Convert a file between encodings |
197 | | * @param[in] fp File to read from |
198 | | * @param[in] fromcodes Charsets to try converting FROM |
199 | | * @param[in] tocodes Charsets to try converting TO |
200 | | * @param[out] fromcode From charset selected |
201 | | * @param[out] tocode To charset selected |
202 | | * @param[out] info Info about the file |
203 | | * @retval num Characters converted |
204 | | * @retval ICONV_ILLEGAL_SEQ Error (as a size_t) |
205 | | * |
206 | | * Find the first of the fromcodes that gives a valid conversion and the best |
207 | | * charset conversion of the file into one of the tocodes. If successful, set |
208 | | * *fromcode and *tocode to dynamically allocated strings, set Content *info, |
209 | | * and return the number of characters converted inexactly. If no conversion |
210 | | * was possible, return -1. |
211 | | */ |
212 | | size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes, |
213 | | const struct Slist *tocodes, char **fromcode, |
214 | | char **tocode, struct Content *info) |
215 | 0 | { |
216 | 0 | char **tcode = NULL; |
217 | 0 | size_t rc; |
218 | 0 | int cn; |
219 | 0 | struct ListNode *np = NULL; |
220 | | |
221 | | /* Copy them */ |
222 | 0 | tcode = mutt_mem_calloc(tocodes->count, sizeof(char *)); |
223 | 0 | np = NULL; |
224 | 0 | cn = 0; |
225 | 0 | STAILQ_FOREACH(np, &tocodes->head, entries) |
226 | 0 | { |
227 | 0 | tcode[cn++] = mutt_str_dup(np->data); |
228 | 0 | } |
229 | |
|
230 | 0 | rc = ICONV_ILLEGAL_SEQ; |
231 | 0 | np = NULL; |
232 | 0 | cn = 0; |
233 | 0 | STAILQ_FOREACH(np, &fromcodes->head, entries) |
234 | 0 | { |
235 | | /* Try each fromcode in turn */ |
236 | 0 | rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info); |
237 | 0 | if (rc != ICONV_ILLEGAL_SEQ) |
238 | 0 | { |
239 | 0 | *fromcode = np->data; |
240 | 0 | *tocode = tcode[cn]; |
241 | 0 | tcode[cn] = 0; |
242 | 0 | break; |
243 | 0 | } |
244 | 0 | } |
245 | | |
246 | | /* Free memory */ |
247 | 0 | for (cn = 0; cn < tocodes->count; cn++) |
248 | 0 | FREE(&tcode[cn]); |
249 | |
|
250 | 0 | FREE(&tcode); |
251 | |
|
252 | 0 | return rc; |
253 | 0 | } |