/src/wget2/libwget/encoding.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2012-2015 Tim Ruehsen
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * a collection of charset encoding routines
 *
 * Changelog
 * 02.10.2013  Tim Ruehsen  created
 *
 */

#include <config.h>

#include <string.h>
#include <errno.h>

#ifdef HAVE_ICONV
# include <iconv.h>
#endif

#include <langinfo.h>

#if defined HAVE_IDN2_H && defined WITH_LIBIDN2
# include <idn2.h>
#elif defined HAVE_IDNA_H && defined WITH_LIBIDN
# include <idna.h>
# ifdef _WIN32
#   include <idn-free.h>
# endif
#elif defined HAVE_IDN_IDNA_H && defined WITH_LIBIDN
// OpenSolaris uses the idn subdir
# include <idn/idna.h>
#endif

#include <wget.h>
#include "private.h"

/*
 * Report the character encoding of the current locale.
 *
 * The name is taken from nl_langinfo(CODESET). If that call yields NULL
 * or an empty string, "ASCII" is used instead.
 *
 * Returns whatever wget_strdup() produces for the chosen name.
 */
const char *wget_local_charset_encoding(void)
{
  const char *codeset = nl_langinfo(CODESET);
  const char *name = "ASCII";

  if (codeset != NULL && codeset[0] != '\0')
    name = codeset;

  return wget_strdup(name);
}

// void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding)
int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen)
{
  if (!src)
    return WGET_E_INVALID;

#ifdef HAVE_ICONV
  if (!src_encoding)
    src_encoding = "iso-8859-1"; // default character-set for most browsers
  if (!dst_encoding)
    dst_encoding = "iso-8859-1"; // default character-set for most browsers

  if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
    int ret = WGET_E_UNKNOWN;
    iconv_t cd = iconv_open(dst_encoding, src_encoding);

    if (cd != (iconv_t)-1) {
      char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself
      size_t tmp_len = srclen;
      size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
      char *dst = wget_malloc(dst_len + 1), *dst_tmp = dst;

      if (!dst) {
        iconv_close(cd);
        return WGET_E_MEMORY;
      }

      errno = 0;
      if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) == 0
        && iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) == 0)
      {
        debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
        if (out) {
          // here we reduce the allocated memory size, if it fails we use the original memory chunk
          tmp = wget_realloc(dst, dst_len - dst_len_tmp + 1);
          if (!tmp)
            tmp = dst;
          tmp[dst_len - dst_len_tmp] = 0;
          *out = tmp;
        } else
          xfree(dst);

        if (outlen)
          *outlen = dst_len - dst_len_tmp;

        ret = WGET_E_SUCCESS;
      } else {
        // erno == 0 means some codepoints were encoded non-reversible, treat as error
        error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
        xfree(dst);

        if (out)
          *out = NULL;

        if (outlen)
          *outlen = 0;
      }

      iconv_close(cd);
    } else
      error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);

    return ret;
  }
#endif

  if (out)
    *out = wget_strmemdup(src, srclen);

  if (outlen)
    *outlen = srclen;

  return WGET_E_SUCCESS;
}

/*
 * Convert the C string 'src' from 'src_encoding' into 'dst_encoding'.
 *
 * src must be an ASCII compatible C string: its length is determined with
 * strlen(), so it has to be NUL-terminated.
 *
 * Returns the buffer filled in by wget_memiconv(), or NULL when src is NULL
 * or wget_memiconv() reports a non-zero status.
 */
char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
{
  char *converted;
  int status;

  if (src == NULL)
    return NULL;

  status = wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &converted, NULL);

  return status != 0 ? NULL : converted;
}

/*
 * Tell whether the C string 's' contains any byte outside the 7-bit
 * ASCII range (i.e. any byte with the high bit set).
 *
 * Returns false for NULL and for strings made of ASCII bytes only.
 */
bool wget_str_needs_encoding(const char *s)
{
  if (s == NULL)
    return false;

  for (const unsigned char *p = (const unsigned char *) s; *p != 0; p++) {
    if (*p >= 0x80)
      return true;
  }

  return false;
}

/*
 * Check whether the C string 'utf8' is a well-formed UTF-8 byte sequence
 * as defined by RFC 3629.
 *
 * Besides the lead/continuation byte structure this rejects
 *  - overlong encodings (C0/C1 lead bytes, E0 80..9F, F0 80..8F),
 *  - UTF-16 surrogates U+D800..U+DFFF (ED A0..BF),
 *  - code points above U+10FFFF (F4 90.. and lead bytes F5..F7).
 *
 * Returns false for NULL, true for the empty string.
 *
 * Continuation bytes are tested left to right with short-circuit evaluation,
 * so the scan never reads past the terminating NUL of a truncated sequence.
 */
bool wget_str_is_valid_utf8(const char *utf8)
{
  const unsigned char *s = (const unsigned char *) utf8;

  if (!s)
    return false;

  while (*s) {
    if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
      s++;
    } else if ((*s & 0xE0) == 0xC0) { /* 110xxxxx 10xxxxxx */
      // C0 and C1 could only encode U+0000..U+007F, which is overlong
      if (*s < 0xC2 || (s[1] & 0xC0) != 0x80)
        return false;
      s += 2;
    } else if ((*s & 0xF0) == 0xE0) { /* 1110xxxx 10xxxxxx 10xxxxxx */
      if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
        return false;
      // E0 80..9F is overlong, ED A0..BF is the surrogate range
      if ((*s == 0xE0 && s[1] < 0xA0) || (*s == 0xED && s[1] > 0x9F))
        return false;
      s += 3;
    } else if ((*s & 0xF8) == 0xF0) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
        return false;
      // F0 80..8F is overlong, anything from F4 90 upwards exceeds U+10FFFF
      if ((*s == 0xF0 && s[1] < 0x90) || (*s == 0xF4 && s[1] > 0x8F) || *s > 0xF4)
        return false;
      s += 4;
    } else
      return false;
  }

  return true;
}

/*
 * Convert the C string 'src' from 'encoding' into UTF-8.
 *
 * Thin wrapper that forwards to wget_striconv() with "utf-8" as the
 * destination encoding and returns its result unchanged.
 */
char *wget_str_to_utf8(const char *src, const char *encoding)
{
  return wget_striconv(src, encoding, "utf-8");
}

/*
 * Convert the UTF-8 C string 'src' into 'encoding'.
 *
 * Thin wrapper that forwards to wget_striconv() with "utf-8" as the
 * source encoding and returns its result unchanged.
 */
char *wget_utf8_to_str(const char *src, const char *encoding)
{
  return wget_striconv(src, "utf-8", encoding);
}

#ifdef WITH_LIBIDN
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * The function checks for a valid UTF-8 character sequence before
 * passing it to idna_to_ascii_8z().
 *
 * The check itself is delegated to wget_str_is_valid_utf8() so that this
 * file carries a single UTF-8 validator instead of two copies of one loop.
 *
 * Returns 1 if 'utf8' is accepted by wget_str_is_valid_utf8(), else 0.
 *
 * [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] https://curl.haxx.se/mail/lib-2015-06/0143.html
 */
static int WGET_GCC_PURE _utf8_is_valid(const char *utf8)
{
  return wget_str_is_valid_utf8(utf8) ? 1 : 0;
}
#endif

/*
 * Convert a hostname into its ASCII (IDNA) form.
 *
 * Input for which wget_str_needs_encoding() is false is returned as is.
 * Otherwise the string is handed to libidn2 or libidn, depending on the
 * build. On success the converted string is returned: the library's own
 * buffer, or on _WIN32 a wget_strdup() copy of it. On any failure an error
 * is printed and 'src' itself is returned.
 *
 * NOTE(review): callers presumably compare the result with 'src' to find
 * out whether a new string was allocated - confirm against the callers.
 *
 * We convert hostnames and thus have to apply the STD3 ASCII rules.
 * Without them the result could contain arbitrary ASCII characters,
 * e.g. 'evil.c\u2100.example.com' would be converted into
 * 'evil.ca/c.example.com', which is not a good idea.
 */
const char *wget_str_to_ascii(const char *src)
{
  // nothing to do for NULL or pure ASCII input
  if (!wget_str_needs_encoding(src))
    return src;

#ifdef WITH_LIBIDN2
  char *ascii = NULL;
  int status = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_NONTRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

  // fall back to transitional processing if the non-transitional lookup fails
  if (status != IDN2_OK)
    status = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_TRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

  if (status != IDN2_OK) {
    error_printf(_("toASCII(%s) failed (%d): %s\n"), src, status, idn2_strerror(status));
    return src;
  }

  debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
#  ifdef _WIN32
  const char *copy = wget_strdup(ascii);
  idn2_free(ascii);
  return copy;
#  else
  return ascii;
#  endif
#elif defined WITH_LIBIDN
  if (!_utf8_is_valid(src)) {
    error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
    return src;
  }

  char *ascii = NULL;
  // idna_to_ascii_8z() automatically converts UTF-8 to lowercase
  int status = idna_to_ascii_8z(src, &ascii, IDNA_USE_STD3_ASCII_RULES);

  if (status != IDNA_SUCCESS) {
    error_printf(_("toASCII failed (%d): %s\n"), status, idna_strerror(status));
    return src;
  }

# ifdef _WIN32
  const char *copy = wget_strdup(ascii);
  idn_free(ascii);
  return copy;
# else
  return ascii;
# endif
#else
  error_printf(_("toASCII not available: '%s'\n"), src);
  return src;
#endif
}

Coverage Report

Created: 2025-01-28 06:58

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2012-2015 Tim Ruehsen
3		* Copyright (c) 2015-2024 Free Software Foundation, Inc.
4		*
5		* This file is part of libwget.
6		*
7		* Libwget is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as published by
9		* the Free Software Foundation, either version 3 of the License, or
10		* (at your option) any later version.
11		*
12		* Libwget is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libwget. If not, see <https://www.gnu.org/licenses/>.
19		*
20		*
21		* a collection of charset encoding routines
22		*
23		* Changelog
24		* 02.10.2013 Tim Ruehsen created
25		*
26		*/
27
28		#include <config.h>
29
30		#include <string.h>
31		#include <errno.h>
32
33		#ifdef HAVE_ICONV
34		# include <iconv.h>
35		#endif
36
37		#include <langinfo.h>
38
39		#if defined HAVE_IDN2_H && defined WITH_LIBIDN2
40		# include <idn2.h>
41		#elif defined HAVE_IDNA_H && defined WITH_LIBIDN
42		# include <idna.h>
43		# ifdef _WIN32
44		# include <idn-free.h>
45		# endif
46		#elif defined HAVE_IDN_IDNA_H && defined WITH_LIBIDN
47		// OpenSolaris uses the idn subdir
48		# include <idn/idna.h>
49		#endif
50
51		#include <wget.h>
52		#include "private.h"
53
54		const char *wget_local_charset_encoding(void)
55	0	{
56	0	const char *encoding = nl_langinfo(CODESET);
57
58	0	if (encoding && *encoding)
59	0	return wget_strdup(encoding);
60
61	0	return wget_strdup("ASCII");
62	0	}
63
64		// void wget_memiconv(const void src, size_t length, const char src_encoding, const char dst_encoding)
65		int wget_memiconv(const char src_encoding, const void src, size_t srclen, const char dst_encoding, char out, size_t outlen)
66	0	{
67	0	if (!src)
68	0	return WGET_E_INVALID;
69
70	0	#ifdef HAVE_ICONV
71	0	if (!src_encoding)
72	0	src_encoding = "iso-8859-1"; // default character-set for most browsers
73	0	if (!dst_encoding)
74	0	dst_encoding = "iso-8859-1"; // default character-set for most browsers
75
76	0	if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
77	0	int ret = WGET_E_UNKNOWN;
78	0	iconv_t cd = iconv_open(dst_encoding, src_encoding);
79
80	0	if (cd != (iconv_t)-1) {
81	0	char tmp = (char ) src; // iconv won't change where src points to, but changes tmp itself
82	0	size_t tmp_len = srclen;
83	0	size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
84	0	char dst = wget_malloc(dst_len + 1), dst_tmp = dst;
85
86	0	if (!dst) {
87	0	iconv_close(cd);
88	0	return WGET_E_MEMORY;
89	0	}
90
91	0	errno = 0;
92	0	if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) == 0
93	0	&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) == 0)
94	0	{
95	0	debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
96	0	if (out) {
97		// here we reduce the allocated memory size, if it fails we use the original memory chunk
98	0	tmp = wget_realloc(dst, dst_len - dst_len_tmp + 1);
99	0	if (!tmp)
100	0	tmp = dst;
101	0	tmp[dst_len - dst_len_tmp] = 0;
102	0	*out = tmp;
103	0	} else
104	0	xfree(dst);
105
106	0	if (outlen)
107	0	*outlen = dst_len - dst_len_tmp;
108
109	0	ret = WGET_E_SUCCESS;
110	0	} else {
111		// erno == 0 means some codepoints were encoded non-reversible, treat as error
112	0	error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
113	0	xfree(dst);
114
115	0	if (out)
116	0	*out = NULL;
117
118	0	if (outlen)
119	0	*outlen = 0;
120	0	}
121
122	0	iconv_close(cd);
123	0	} else
124	0	error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
125
126	0	return ret;
127	0	}
128	0	#endif
129
130	0	if (out)
131	0	*out = wget_strmemdup(src, srclen);
132
133	0	if (outlen)
134	0	*outlen = srclen;
135
136	0	return WGET_E_SUCCESS;
137	0	}
138
139		// src must be a ASCII compatible C string
140		char wget_striconv(const char src, const char src_encoding, const char dst_encoding)
141	0	{
142	0	if (!src)
143	0	return NULL;
144
145	0	char *dst;
146	0	if (wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &dst, NULL))
147	0	return NULL;
148
149	0	return dst;
150	0	}
151
152		bool wget_str_needs_encoding(const char *s)
153	0	{
154	0	if (!s)
155	0	return false;
156
157	0	while (s && (s & ~0x7f) == 0) s++;
158
159	0	return *s != 0;
160	0	}
161
162		bool wget_str_is_valid_utf8(const char *utf8)
163	0	{
164	0	const unsigned char s = (const unsigned char ) utf8;
165
166	0	if (!s)
167	0	return 0;
168
169	0	while (*s) {
170	0	if ((s & 0x80) == 0) / 0xxxxxxx ASCII char */
171	0	s++;
172	0	else if ((s & 0xE0) == 0xC0) / 110xxxxx 10xxxxxx */ {
173	0	if ((s[1] & 0xC0) != 0x80)
174	0	return 0;
175	0	s += 2;
176	0	} else if ((s & 0xF0) == 0xE0) / 1110xxxx 10xxxxxx 10xxxxxx */ {
177	0	if ((s[1] & 0xC0) != 0x80 \|\| (s[2] & 0xC0) != 0x80)
178	0	return 0;
179	0	s += 3;
180	0	} else if ((s & 0xF8) == 0xF0) / 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
181	0	if ((s[1] & 0xC0) != 0x80 \|\| (s[2] & 0xC0) != 0x80 \|\| (s[3] & 0xC0) != 0x80)
182	0	return 0;
183	0	s += 4;
184	0	} else
185	0	return 0;
186	0	}
187
188	0	return 1;
189	0	}
190
191		char wget_str_to_utf8(const char src, const char *encoding)
192	0	{
193	0	return wget_striconv(src, encoding, "utf-8");
194	0	}
195
196		char wget_utf8_to_str(const char src, const char *encoding)
197	0	{
198	0	return wget_striconv(src, "utf-8", encoding);
199	0	}
200
201		#ifdef WITH_LIBIDN
202		/*
203		* Work around a libidn <= 1.30 vulnerability.
204		*
205		* The function checks for a valid UTF-8 character sequence before
206		* passing it to idna_to_ascii_8z().
207		*
208		* [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
209		* [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
210		* [3] https://curl.haxx.se/mail/lib-2015-06/0143.html
211		*/
212		static int WGET_GCC_PURE _utf8_is_valid(const char *utf8)
213		{
214		const unsigned char s = (const unsigned char ) utf8;
215
216		while (*s) {
217		if ((s & 0x80) == 0) / 0xxxxxxx ASCII char */
218		s++;
219		else if ((s & 0xE0) == 0xC0) / 110xxxxx 10xxxxxx */ {
220		if ((s[1] & 0xC0) != 0x80)
221		return 0;
222		s += 2;
223		} else if ((s & 0xF0) == 0xE0) / 1110xxxx 10xxxxxx 10xxxxxx */ {
224		if ((s[1] & 0xC0) != 0x80 \|\| (s[2] & 0xC0) != 0x80)
225		return 0;
226		s += 3;
227		} else if ((s & 0xF8) == 0xF0) / 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
228		if ((s[1] & 0xC0) != 0x80 \|\| (s[2] & 0xC0) != 0x80 \|\| (s[3] & 0xC0) != 0x80)
229		return 0;
230		s += 4;
231		} else
232		return 0;
233		}
234
235		return 1;
236		}
237		#endif
238
239		/* We convert hostnames and thus have to apply IDN2_USE_STD3_ASCII_RULES.
240		* If we don't do, the result could contain any ascii characters,
241		* e.g. 'evil.c\u2100.example.com' will be converted into
242		* 'evil.ca/c.example.com', which seems no good idea. */
243		const char wget_str_to_ascii(const char src)
244	0	{
245	0	#ifdef WITH_LIBIDN2
246	0	if (wget_str_needs_encoding(src)) {
247	0	char *asc = NULL;
248	0	int rc;
249	0	if ((rc = idn2_lookup_u8((uint8_t )src, (uint8_t *)&asc, IDN2_NONTRANSITIONAL\|IDN2_USE_STD3_ASCII_RULES)) != IDN2_OK)
250	0	rc = idn2_lookup_u8((uint8_t )src, (uint8_t *)&asc, IDN2_TRANSITIONAL\|IDN2_USE_STD3_ASCII_RULES);
251	0	if (rc == IDN2_OK)
252	0	{
253	0	debug_printf("idn2 '%s' -> '%s'\n", src, asc);
254		# ifdef _WIN32
255		src = wget_strdup(asc);
256		idn2_free(asc);
257		# else
258	0	src = asc;
259	0	# endif
260	0	} else
261	0	error_printf(_("toASCII(%s) failed (%d): %s\n"), src, rc, idn2_strerror(rc));
262	0	}
263		#elif defined WITH_LIBIDN
264		if (wget_str_needs_encoding(src)) {
265		char *asc = NULL;
266
267		if (_utf8_is_valid(src)) {
268		int rc;
269
270		// idna_to_ascii_8z() automatically converts UTF-8 to lowercase
271		if ((rc = idna_to_ascii_8z(src, &asc, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
272		// debug_printf("toASCII '%s' -> '%s'\n", src, asc);
273		# ifdef _WIN32
274		src = wget_strdup(asc);
275		idn_free(asc);
276		# else
277		src = asc;
278		# endif
279		} else
280		error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));
281		}
282		else
283		error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
284		}
285		#else
286		if (wget_str_needs_encoding(src)) {
287		error_printf(_("toASCII not available: '%s'\n"), src);
288		}
289		#endif
290
291	0	return src;
292	0	}