/src/neomutt/convert/convert.c

Source (jump to first uncovered line)
/**
 * @file
 * Conversion between different character encodings
 *
 * @authors
 * Copyright (C) 2022 Michal Siedlaczek <michal@siedlaczek.me>
 *
 * @copyright
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * @page convert_convert File Charset Conversion
 *
 * Converting files between charsets.
 */

#include "config.h"
#include <errno.h>
#include <iconv.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "mutt/lib.h"
#include "email/lib.h"
#include "lib.h"

/**
 * mutt_convert_file_to - Choose the best target encoding for a file
 * @param[in]  fp         File to examine (it is rewound and then read)
 * @param[in]  fromcode   Original encoding
 * @param[in]  tocodes    List of candidate target encodings
 * @param[out] tocode     Index, within tocodes, of the chosen encoding
 * @param[out] info       Content information for the chosen encoding
 * @retval num               Success, number of characters converted inexactly
 *                           (0 means the conversion was exact)
 * @retval ICONV_ILLEGAL_SEQ Error, no conversion was possible
 *
 * Find the best charset conversion of the file from fromcode into one
 * of the tocodes. If successful, set *tocode and Content *info and
 * return the number of characters converted inexactly.
 * On failure, *tocode and *info are left untouched.
 *
 * The file itself is never modified: the converted text only lives in local
 * buffers, where it is fed to mutt_update_content_info() and then discarded.
 *
 * We convert via UTF-8 in order to avoid the condition -1(EINVAL),
 * which would otherwise prevent us from knowing the number of inexact
 * conversions. Where the candidate target charset is UTF-8 we avoid
 * doing the second conversion because iconv_open("UTF-8", "UTF-8")
 * fails with some libraries.
 *
 * We assume that the output from iconv is never more than 4 times as
 * long as the input for any pair of charsets we might be interested
 * in.
 */
size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes,
                            int *tocode, struct Content *info)
{
  if (!fp || !tocodes || !tocode || !info)
    return ICONV_ILLEGAL_SEQ;

  char bufi[256], bufu[512], bufo[4 * sizeof(bufi)];
  size_t rc = ICONV_ILLEGAL_SEQ;

  /* NOTE(review): none of the descriptors opened in this function are closed
   * here; presumably mutt_ch_iconv_open() keeps ownership of them - confirm */
  const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
  if (!iconv_t_valid(cd1))
    return ICONV_ILLEGAL_SEQ;

  /* One slot per candidate charset, in the same order as the tocodes list */
  int ncodes = tocodes->count;
  iconv_t *cd = mutt_mem_calloc(ncodes, sizeof(iconv_t));
  size_t *score = mutt_mem_calloc(ncodes, sizeof(size_t));
  struct ContentState *states = mutt_mem_calloc(ncodes, sizeof(struct ContentState));
  struct Content *infos = mutt_mem_calloc(ncodes, sizeof(struct Content));

  struct ListNode *np = NULL;
  int ni = 0;
  STAILQ_FOREACH(np, &tocodes->head, entries)
  {
    if (!mutt_istr_equal(np->data, "utf-8"))
    {
      cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
    }
    else
    {
      /* Special case for conversion to UTF-8:
       * an invalid descriptor paired with a score of ICONV_ILLEGAL_SEQ marks
       * "no second conversion needed". A descriptor that merely failed to
       * open keeps its zeroed score and is skipped everywhere below. */
      cd[ni] = ICONV_T_INVALID;
      score[ni] = ICONV_ILLEGAL_SEQ;
    }
    ni += 1;
  }

  rewind(fp);
  size_t ibl = 0; /* Number of unconverted bytes currently held in bufi */
  while (true)
  {
    /* Try to fill input buffer */
    size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
    if (ferror(fp))
    {
      /* A read failure must not be mistaken for end-of-file, otherwise a
       * truncated file would be reported as successfully converted */
      rc = ICONV_ILLEGAL_SEQ;
      break;
    }
    ibl += n;

    /* Convert to UTF-8 (a NULL input pointer is passed once the input is exhausted) */
    const char *ib = bufi;
    char *ob = bufu;
    size_t obl = sizeof(bufu);
    n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : NULL), &ibl, &ob, &obl);
    /* EINVAL (incomplete input) and E2BIG (output full) are tolerated as long
     * as some input was consumed; anything else means fromcode doesn't fit */
    if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
    {
      rc = ICONV_ILLEGAL_SEQ;
      break;
    }
    const size_t ubl1 = ob - bufu; /* Number of UTF-8 bytes produced this round */

    /* Convert from UTF-8 */
    for (int i = 0; i < ncodes; i++)
    {
      if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
      {
        const char *ub = bufu;
        size_t ubl = ubl1;
        ob = bufo;
        obl = sizeof(bufo);
        n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : NULL), &ubl, &ob, &obl);
        if (n == ICONV_ILLEGAL_SEQ)
        {
          /* This candidate is out of the running for the rest of the file */
          score[i] = ICONV_ILLEGAL_SEQ;
        }
        else
        {
          score[i] += n;
          mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
        }
      }
      else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
      {
        /* Special case for conversion to UTF-8 */
        mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
      }
    }

    if (ibl)
    {
      /* Save unused input */
      memmove(bufi, ib, ibl);
    }
    else if (!ubl1 && (ib < bufi + sizeof(bufi)))
    {
      /* Nothing left over, nothing produced and the buffer wasn't full: done */
      rc = 0;
      break;
    }
  }

  if (rc == 0)
  {
    /* Find best score */
    rc = ICONV_ILLEGAL_SEQ;
    for (int i = 0; i < ncodes; i++)
    {
      if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
      {
        /* Special case for conversion to UTF-8 */
        *tocode = i;
        rc = 0;
        break;
      }
      else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
      {
        continue;
      }
      else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
      {
        *tocode = i;
        rc = score[i];
        if (rc == 0)
          break; /* An exact conversion cannot be beaten */
      }
    }
    if (rc != ICONV_ILLEGAL_SEQ)
    {
      *info = infos[*tocode];
      mutt_update_content_info(info, &states[*tocode], NULL, 0); /* EOF */
    }
  }

  FREE(&cd);
  FREE(&infos);
  FREE(&score);
  FREE(&states);

  return rc;
}

/**
 * mutt_convert_file_from_to - Convert a file between encodings
 * @param[in]  fp        File to read from
 * @param[in]  fromcodes Charsets to try converting FROM
 * @param[in]  tocodes   Charsets to try converting TO
 * @param[out] fromcode  From charset selected
 * @param[out] tocode    To charset selected
 * @param[out] info      Info about the file
 * @retval num               Characters converted
 * @retval ICONV_ILLEGAL_SEQ Error (as a size_t)
 *
 * Find the first of the fromcodes that gives a valid conversion and the best
 * charset conversion of the file into one of the tocodes. If successful, set
 * *fromcode and *tocode to dynamically allocated strings, set Content *info,
 * and return the number of characters converted inexactly. If no conversion
 * was possible, return -1.
 */
size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes,
                                 const struct Slist *tocodes, char **fromcode,
                                 char **tocode, struct Content *info)
{
  char **tcode = NULL;
  size_t rc;
  int cn;
  struct ListNode *np = NULL;

  /* Copy them */
  tcode = mutt_mem_calloc(tocodes->count, sizeof(char *));
  np = NULL;
  cn = 0;
  STAILQ_FOREACH(np, &tocodes->head, entries)
  {
    tcode[cn++] = mutt_str_dup(np->data);
  }

  rc = ICONV_ILLEGAL_SEQ;
  np = NULL;
  cn = 0;
  STAILQ_FOREACH(np, &fromcodes->head, entries)
  {
    /* Try each fromcode in turn */
    rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info);
    if (rc != ICONV_ILLEGAL_SEQ)
    {
      *fromcode = np->data;
      *tocode = tcode[cn];
      tcode[cn] = 0;
      break;
    }
  }

  /* Free memory */
  for (cn = 0; cn < tocodes->count; cn++)
    FREE(&tcode[cn]);

  FREE(&tcode);

  return rc;
}

Coverage Report

Created: 2023-09-25 07:17

Line	Count	Source (jump to first uncovered line)
1		/**
2		* @file
3		* Conversion between different character encodings
4		*
5		* @authors
6		* Copyright (C) 2022 Michal Siedlaczek <michal@siedlaczek.me>
7		*
8		* @copyright
9		* This program is free software: you can redistribute it and/or modify it under
10		* the terms of the GNU General Public License as published by the Free Software
11		* Foundation, either version 2 of the License, or (at your option) any later
12		* version.
13		*
14		* This program is distributed in the hope that it will be useful, but WITHOUT
15		* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16		* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
17		* details.
18		*
19		* You should have received a copy of the GNU General Public License along with
20		* this program. If not, see <http://www.gnu.org/licenses/>.
21		*/
22
23		/**
24		* @page convert_convert File Charset Conversion
25		*
26		* Converting files between charsets.
27		*/
28
29		#include "config.h"
30		#include <errno.h>
31		#include <iconv.h>
32		#include <stdbool.h>
33		#include <stdio.h>
34		#include <string.h>
35		#include "mutt/lib.h"
36		#include "email/lib.h"
37		#include "lib.h"
38
39		/**
40		* mutt_convert_file_to - Change the encoding of a file
41		* @param[in] fp File to convert
42		* @param[in] fromcode Original encoding
43		* @param[in] tocodes List of target encodings
44		* @param[out] tocode Chosen encoding
45		* @param[out] info Encoding information
46		* @retval -1 Error, no conversion was possible
47		* @retval >0 Success, number of bytes converted
48		*
49		* Find the best charset conversion of the file from fromcode into one
50		* of the tocodes. If successful, set *tocode and Content *info and
51		* return the number of characters converted inexactly.
52		*
53		* We convert via UTF-8 in order to avoid the condition -1(EINVAL),
54		* which would otherwise prevent us from knowing the number of inexact
55		* conversions. Where the candidate target charset is UTF-8 we avoid
56		* doing the second conversion because iconv_open("UTF-8", "UTF-8")
57		* fails with some libraries.
58		*
59		* We assume that the output from iconv is never more than 4 times as
60		* long as the input for any pair of charsets we might be interested
61		* in.
62		*/
63		size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes,
64		int *tocode, struct Content *info)
65	0	{
66	0	char bufi[256], bufu[512], bufo[4 * sizeof(bufi)];
67	0	size_t rc;
68
69	0	const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
70	0	if (!iconv_t_valid(cd1))
71	0	return -1;
72
73	0	int ncodes = tocodes->count;
74	0	iconv_t *cd = mutt_mem_calloc(ncodes, sizeof(iconv_t));
75	0	size_t *score = mutt_mem_calloc(ncodes, sizeof(size_t));
76	0	struct ContentState *states = mutt_mem_calloc(ncodes, sizeof(struct ContentState));
77	0	struct Content *infos = mutt_mem_calloc(ncodes, sizeof(struct Content));
78
79	0	struct ListNode *np = NULL;
80	0	int ni = 0;
81	0	STAILQ_FOREACH(np, &tocodes->head, entries)
82	0	{
83	0	if (!mutt_istr_equal(np->data, "utf-8"))
84	0	{
85	0	cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
86	0	}
87	0	else
88	0	{
89		/* Special case for conversion to UTF-8 */
90	0	cd[ni] = ICONV_T_INVALID;
91	0	score[ni] = ICONV_ILLEGAL_SEQ;
92	0	}
93	0	ni += 1;
94	0	}
95
96	0	rewind(fp);
97	0	size_t ibl = 0;
98	0	while (true)
99	0	{
100		/* Try to fill input buffer */
101	0	size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
102	0	ibl += n;
103
104		/* Convert to UTF-8 */
105	0	const char *ib = bufi;
106	0	char *ob = bufu;
107	0	size_t obl = sizeof(bufu);
108	0	n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl);
109	0	if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
110	0	{
111	0	rc = ICONV_ILLEGAL_SEQ;
112	0	break;
113	0	}
114	0	const size_t ubl1 = ob - bufu;
115
116		/* Convert from UTF-8 */
117	0	for (int i = 0; i < ncodes; i++)
118	0	{
119	0	if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
120	0	{
121	0	const char *ub = bufu;
122	0	size_t ubl = ubl1;
123	0	ob = bufo;
124	0	obl = sizeof(bufo);
125	0	n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl);
126	0	if (n == ICONV_ILLEGAL_SEQ)
127	0	{
128	0	score[i] = ICONV_ILLEGAL_SEQ;
129	0	}
130	0	else
131	0	{
132	0	score[i] += n;
133	0	mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
134	0	}
135	0	}
136	0	else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
137	0	{
138		/* Special case for conversion to UTF-8 */
139	0	mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
140	0	}
141	0	}
142
143	0	if (ibl)
144	0	{
145		/* Save unused input */
146	0	memmove(bufi, ib, ibl);
147	0	}
148	0	else if (!ubl1 && (ib < bufi + sizeof(bufi)))
149	0	{
150	0	rc = 0;
151	0	break;
152	0	}
153	0	}
154
155	0	if (rc == 0)
156	0	{
157		/* Find best score */
158	0	rc = ICONV_ILLEGAL_SEQ;
159	0	for (int i = 0; i < ncodes; i++)
160	0	{
161	0	if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
162	0	{
163		/* Special case for conversion to UTF-8 */
164	0	*tocode = i;
165	0	rc = 0;
166	0	break;
167	0	}
168	0	else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
169	0	{
170	0	continue;
171	0	}
172	0	else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
173	0	{
174	0	*tocode = i;
175	0	rc = score[i];
176	0	if (rc == 0)
177	0	break;
178	0	}
179	0	}
180	0	if (rc != ICONV_ILLEGAL_SEQ)
181	0	{
182	0	memcpy(info, &infos[*tocode], sizeof(struct Content));
183	0	mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */
184	0	}
185	0	}
186
187	0	FREE(&cd);
188	0	FREE(&infos);
189	0	FREE(&score);
190	0	FREE(&states);
191
192	0	return rc;
193	0	}
194
195		/**
196		* mutt_convert_file_from_to - Convert a file between encodings
197		* @param[in] fp File to read from
198		* @param[in] fromcodes Charsets to try converting FROM
199		* @param[in] tocodes Charsets to try converting TO
200		* @param[out] fromcode From charset selected
201		* @param[out] tocode To charset selected
202		* @param[out] info Info about the file
203		* @retval num Characters converted
204		* @retval ICONV_ILLEGAL_SEQ Error (as a size_t)
205		*
206		* Find the first of the fromcodes that gives a valid conversion and the best
207		* charset conversion of the file into one of the tocodes. If successful, set
208		* *fromcode and *tocode to dynamically allocated strings, set Content *info,
209		* and return the number of characters converted inexactly. If no conversion
210		* was possible, return -1.
211		*/
212		size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes,
213		const struct Slist *tocodes, char **fromcode,
214		char **tocode, struct Content *info)
215	0	{
216	0	char **tcode = NULL;
217	0	size_t rc;
218	0	int cn;
219	0	struct ListNode *np = NULL;
220
221		/* Copy them */
222	0	tcode = mutt_mem_calloc(tocodes->count, sizeof(char *));
223	0	np = NULL;
224	0	cn = 0;
225	0	STAILQ_FOREACH(np, &tocodes->head, entries)
226	0	{
227	0	tcode[cn++] = mutt_str_dup(np->data);
228	0	}
229
230	0	rc = ICONV_ILLEGAL_SEQ;
231	0	np = NULL;
232	0	cn = 0;
233	0	STAILQ_FOREACH(np, &fromcodes->head, entries)
234	0	{
235		/* Try each fromcode in turn */
236	0	rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info);
237	0	if (rc != ICONV_ILLEGAL_SEQ)
238	0	{
239	0	*fromcode = np->data;
240	0	*tocode = tcode[cn];
241	0	tcode[cn] = 0;
242	0	break;
243	0	}
244	0	}
245
246		/* Free memory */
247	0	for (cn = 0; cn < tocodes->count; cn++)
248	0	FREE(&tcode[cn]);
249
250	0	FREE(&tcode);
251
252	0	return rc;
253	0	}