Coverage Report

Created: 2025-01-28 06:58

/src/wget2/libwget/encoding.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2012-2015 Tim Ruehsen
3
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
4
 *
5
 * This file is part of libwget.
6
 *
7
 * Libwget is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as published by
9
 * the Free Software Foundation, either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * Libwget is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19
 *
20
 *
21
 * a collection of charset encoding routines
22
 *
23
 * Changelog
24
 * 02.10.2013  Tim Ruehsen  created
25
 *
26
 */
27
28
#include <config.h>
29
30
#include <string.h>
31
#include <errno.h>
32
33
#ifdef HAVE_ICONV
34
# include <iconv.h>
35
#endif
36
37
#include <langinfo.h>
38
39
#if defined HAVE_IDN2_H && defined WITH_LIBIDN2
40
# include <idn2.h>
41
#elif defined HAVE_IDNA_H && defined WITH_LIBIDN
42
# include <idna.h>
43
# ifdef _WIN32
44
#   include <idn-free.h>
45
# endif
46
#elif defined HAVE_IDN_IDNA_H && defined WITH_LIBIDN
47
// OpenSolaris uses the idn subdir
48
# include <idn/idna.h>
49
#endif
50
51
#include <wget.h>
52
#include "private.h"
53
54
/**
 * Get the name of the character encoding of the current locale.
 *
 * The name is taken from nl_langinfo(CODESET). If that yields NULL or an
 * empty string, "ASCII" is used instead.
 *
 * \return A copy of the encoding name made with wget_strdup()
 *         (presumably to be freed by the caller - confirm against wget.h).
 */
const char *wget_local_charset_encoding(void)
{
  const char *codeset = nl_langinfo(CODESET);

  // No usable codeset reported by the locale: fall back to plain ASCII.
  if (!codeset || !*codeset)
    codeset = "ASCII";

  return wget_strdup(codeset);
}
63
64
// void *wget_memiconv(const void *src, size_t length, const char *src_encoding, const char *dst_encoding)
65
int wget_memiconv(const char *src_encoding, const void *src, size_t srclen, const char *dst_encoding, char **out, size_t *outlen)
66
0
{
67
0
  if (!src)
68
0
    return WGET_E_INVALID;
69
70
0
#ifdef HAVE_ICONV
71
0
  if (!src_encoding)
72
0
    src_encoding = "iso-8859-1"; // default character-set for most browsers
73
0
  if (!dst_encoding)
74
0
    dst_encoding = "iso-8859-1"; // default character-set for most browsers
75
76
0
  if (wget_strcasecmp_ascii(src_encoding, dst_encoding)) {
77
0
    int ret = WGET_E_UNKNOWN;
78
0
    iconv_t cd = iconv_open(dst_encoding, src_encoding);
79
80
0
    if (cd != (iconv_t)-1) {
81
0
      char *tmp = (char *) src; // iconv won't change where src points to, but changes tmp itself
82
0
      size_t tmp_len = srclen;
83
0
      size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
84
0
      char *dst = wget_malloc(dst_len + 1), *dst_tmp = dst;
85
86
0
      if (!dst) {
87
0
        iconv_close(cd);
88
0
        return WGET_E_MEMORY;
89
0
      }
90
91
0
      errno = 0;
92
0
      if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) == 0
93
0
        && iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) == 0)
94
0
      {
95
0
        debug_printf("transcoded %zu bytes from '%s' to '%s'\n", srclen, src_encoding, dst_encoding);
96
0
        if (out) {
97
          // here we reduce the allocated memory size, if it fails we use the original memory chunk
98
0
          tmp = wget_realloc(dst, dst_len - dst_len_tmp + 1);
99
0
          if (!tmp)
100
0
            tmp = dst;
101
0
          tmp[dst_len - dst_len_tmp] = 0;
102
0
          *out = tmp;
103
0
        } else
104
0
          xfree(dst);
105
106
0
        if (outlen)
107
0
          *outlen = dst_len - dst_len_tmp;
108
109
0
        ret = WGET_E_SUCCESS;
110
0
      } else {
111
        // erno == 0 means some codepoints were encoded non-reversible, treat as error
112
0
        error_printf(_("Failed to transcode '%s' string into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
113
0
        xfree(dst);
114
115
0
        if (out)
116
0
          *out = NULL;
117
118
0
        if (outlen)
119
0
          *outlen = 0;
120
0
      }
121
122
0
      iconv_close(cd);
123
0
    } else
124
0
      error_printf(_("Failed to prepare transcoding '%s' into '%s' (%d)\n"), src_encoding, dst_encoding, errno);
125
126
0
    return ret;
127
0
  }
128
0
#endif
129
130
0
  if (out)
131
0
    *out = wget_strmemdup(src, srclen);
132
133
0
  if (outlen)
134
0
    *outlen = srclen;
135
136
0
  return WGET_E_SUCCESS;
137
0
}
138
139
/**
 * Convert a C string from one character encoding into another.
 *
 * \p src must be an ASCII compatible, NUL-terminated C string.
 *
 * \param[in] src String to convert
 * \param[in] src_encoding Encoding of \p src, passed on to wget_memiconv()
 * \param[in] dst_encoding Encoding to convert to, passed on to wget_memiconv()
 * \return The converted string as produced by wget_memiconv(),
 *         or NULL if \p src is NULL or the conversion did not succeed
 */
char *wget_striconv(const char *src, const char *src_encoding, const char *dst_encoding)
{
  char *converted;

  // Any non-zero result of wget_memiconv() is an error.
  if (src && !wget_memiconv(src_encoding, src, strlen(src), dst_encoding, &converted, NULL))
    return converted;

  return NULL;
}
151
152
/**
 * Check whether a string contains any byte outside the 7-bit ASCII range.
 *
 * \param[in] s NUL-terminated string to inspect, may be NULL
 * \return true if at least one byte has bits set above 0x7f,
 *         false if all bytes are 7-bit or \p s is NULL
 */
bool wget_str_needs_encoding(const char *s)
{
  if (s) {
    for (; *s; s++) {
      // Any bit outside the low seven marks a non-ASCII byte.
      if (*s & ~0x7f)
        return true;
    }
  }

  return false;
}
161
162
/**
 * Check whether a C string is a well-formed UTF-8 byte sequence.
 *
 * Besides the lead/continuation byte structure, this rejects
 *  - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 0x80..0x9F, 0xF0 0x80..0x8F),
 *  - UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF),
 *  - code points above U+10FFFF (0xF4 0x90.. and lead bytes 0xF5..0xFF).
 *
 * \param[in] utf8 NUL-terminated string to check, may be NULL
 * \return true if \p utf8 is well-formed UTF-8 (the empty string is valid),
 *         false if it is malformed or NULL
 */
bool wget_str_is_valid_utf8(const char *utf8)
{
  const unsigned char *s = (const unsigned char *) utf8;

  if (!s)
    return false;

  while (*s) {
    const unsigned char lead = *s;

    if (lead < 0x80) { /* 0xxxxxxx ASCII char */
      s++;
    } else if (lead >= 0xC2 && lead <= 0xDF) { /* 110xxxxx 10xxxxxx, 0xC0/0xC1 would be overlong */
      if ((s[1] & 0xC0) != 0x80)
        return false;
      s += 2;
    } else if ((lead & 0xF0) == 0xE0) { /* 1110xxxx 10xxxxxx 10xxxxxx */
      // The allowed range of the second byte depends on the lead byte:
      // 0xE0 must not be overlong, 0xED must not encode a surrogate.
      const unsigned char lo = (lead == 0xE0) ? 0xA0 : 0x80;
      const unsigned char hi = (lead == 0xED) ? 0x9F : 0xBF;

      // A terminating NUL fails the first test, so we never read past the end.
      if (s[1] < lo || s[1] > hi || (s[2] & 0xC0) != 0x80)
        return false;
      s += 3;
    } else if (lead >= 0xF0 && lead <= 0xF4) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      // 0xF0 must not be overlong, 0xF4 must not exceed U+10FFFF.
      const unsigned char lo = (lead == 0xF0) ? 0x90 : 0x80;
      const unsigned char hi = (lead == 0xF4) ? 0x8F : 0xBF;

      if (s[1] < lo || s[1] > hi || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
        return false;
      s += 4;
    } else {
      // Stray continuation byte or a lead byte that can never be valid.
      return false;
    }
  }

  return true;
}
190
191
char *wget_str_to_utf8(const char *src, const char *encoding)
192
0
{
193
0
  return wget_striconv(src, encoding, "utf-8");
194
0
}
195
196
char *wget_utf8_to_str(const char *src, const char *encoding)
197
0
{
198
0
  return wget_striconv(src, "utf-8", encoding);
199
0
}
200
201
#ifdef WITH_LIBIDN
202
/*
203
 * Work around a libidn <= 1.30 vulnerability.
204
 *
205
 * The function checks for a valid UTF-8 character sequence before
206
 * passing it to idna_to_ascii_8z().
207
 *
208
 * [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
209
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
210
 * [3] https://curl.haxx.se/mail/lib-2015-06/0143.html
211
 */
212
/*
 * Tell whether the C string utf8 is a valid UTF-8 sequence.
 * Returns 1 if it is, 0 if not.
 */
static int WGET_GCC_PURE _utf8_is_valid(const char *utf8)
{
  // Same check as the public validator; delegate to it instead of keeping
  // a second copy of the decoding logic in this file.
  return wget_str_is_valid_utf8(utf8);
}
237
#endif
238
239
/* We convert hostnames and thus have to apply IDN2_USE_STD3_ASCII_RULES.
240
 * If we don't, the result could contain arbitrary ASCII characters,
241
 * e.g. 'evil.c\u2100.example.com' will be converted into
242
 * 'evil.ca/c.example.com', which is not a good idea. */
243
/**
 * Convert a string into its ASCII (IDNA) form.
 *
 * Strings for which wget_str_needs_encoding() returns false are returned
 * unchanged. Otherwise the conversion is done by libidn2 or libidn, depending
 * on the build configuration; without either library only an error is printed.
 *
 * \param[in] src String to convert
 * \return \p src itself if nothing was converted (no conversion needed,
 *         conversion failed, or no IDN library available), else a newly
 *         allocated string holding the converted value
 */
const char *wget_str_to_ascii(const char *src)
{
  // Nothing to do for pure 7-bit input.
  if (!wget_str_needs_encoding(src))
    return src;

#ifdef WITH_LIBIDN2
  char *ascii = NULL;
  int rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_NONTRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

  // Retry with transitional processing if the non-transitional lookup failed.
  if (rc != IDN2_OK)
    rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, IDN2_TRANSITIONAL|IDN2_USE_STD3_ASCII_RULES);

  if (rc != IDN2_OK) {
    error_printf(_("toASCII(%s) failed (%d): %s\n"), src, rc, idn2_strerror(rc));
    return src;
  }

  debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
#  ifdef _WIN32
  // Hand out a wget_strdup() copy and release the libidn2 buffer via idn2_free().
  src = wget_strdup(ascii);
  idn2_free(ascii);
#  else
  src = ascii;
#  endif
#elif defined WITH_LIBIDN
  // Only hand well-formed UTF-8 to libidn.
  if (!_utf8_is_valid(src)) {
    error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
    return src;
  }

  char *ascii = NULL;

  // idna_to_ascii_8z() automatically converts UTF-8 to lowercase
  int rc = idna_to_ascii_8z(src, &ascii, IDNA_USE_STD3_ASCII_RULES);

  if (rc != IDNA_SUCCESS) {
    error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));
    return src;
  }

# ifdef _WIN32
  // Hand out a wget_strdup() copy and release the libidn buffer via idn_free().
  src = wget_strdup(ascii);
  idn_free(ascii);
# else
  src = ascii;
# endif
#else
  error_printf(_("toASCII not available: '%s'\n"), src);
#endif

  return src;
}