Coverage Report

Created: 2025-03-06 06:58

/src/wget/lib/mbrtoc32.c
Line
Count
Source (jump to first uncovered line)
1
/* Convert multibyte character to 32-bit wide character.
2
   Copyright (C) 2020-2025 Free Software Foundation, Inc.
3
4
   This file is free software: you can redistribute it and/or modify
5
   it under the terms of the GNU Lesser General Public License as
6
   published by the Free Software Foundation; either version 2.1 of the
7
   License, or (at your option) any later version.
8
9
   This file is distributed in the hope that it will be useful,
10
   but WITHOUT ANY WARRANTY; without even the implied warranty of
11
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
   GNU Lesser General Public License for more details.
13
14
   You should have received a copy of the GNU Lesser General Public License
15
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16
17
/* Written by Bruno Haible <bruno@clisp.org>, 2020.  */
18
19
#include <config.h>
20
21
/* Specification.  */
22
#include <uchar.h>
23
24
#include "attribute.h"
25
26
#include <errno.h>
27
#include <stdlib.h>
28
29
#if GL_CHAR32_T_IS_UNICODE
30
# include "lc-charset-unicode.h"
31
#endif
32
33
#if GNULIB_defined_mbstate_t /* AIX, IRIX */
34
/* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
35
   and directly for the UTF-8 locales.  */
36
37
/* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways:
38
   - in a way that parallels the override of mbrtowc; this is the code branch
39
     here;
40
   - in a way that invokes the overridden mbrtowc; this would be the #else
41
     branch below.
42
   They are equivalent.  */
43
44
# if AVOID_ANY_THREADS
45
46
/* The option '--disable-threads' explicitly requests no locking.  */
47
48
# elif defined _WIN32 && !defined __CYGWIN__
49
50
#  define WIN32_LEAN_AND_MEAN  /* avoid including junk */
51
#  include <windows.h>
52
53
# elif HAVE_PTHREAD_API
54
55
#  include <pthread.h>
56
#  if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
57
#   include <threads.h>
58
#   pragma weak thrd_exit
59
#   define c11_threads_in_use() (thrd_exit != NULL)
60
#  else
61
#   define c11_threads_in_use() 0
62
#  endif
63
64
# elif HAVE_THREADS_H
65
66
#  include <threads.h>
67
68
# endif
69
70
# include "lc-charset-dispatch.h"
71
# include "mbtowc-lock.h"
72
73
/* Compile-time check that an mbstate_t offers at least 4 bytes, which is
   the size of the internal_state buffer defined below.  */
static_assert (sizeof (mbstate_t) >= 4);
74
/* File-local 4-byte state buffer.
   NOTE(review): presumably this is the state that the code in
   mbrtowc-impl.h falls back to when the caller passes ps == NULL;
   that header is not visible here -- confirm.  */
static char internal_state[4];
75
76
/* Variant of mbrtoc32 for platforms where GNULIB_defined_mbstate_t is set.
   The function has no code of its own: its entire body is the text of
   mbrtowc-impl.h, included with FITS_IN_CHAR_TYPE defined as shown.
   Parameters: PWC receives the 32-bit character, S/N describe the input
   bytes, PS is the conversion state.
   NOTE(review): exact return values and error handling are determined by
   mbrtowc-impl.h -- confirm there.  */
size_t
77
mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
78
{
79
/* Constant 1: every value WC is treated as fitting in the result type.  */
# define FITS_IN_CHAR_TYPE(wc)  1
80
# include "mbrtowc-impl.h"
81
}
82
83
#else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
84
85
/* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc().  */
86
87
# include <wchar.h>
88
89
# include "localcharset.h"
90
# include "streq.h"
91
92
# if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
93
#  include "hard-locale.h"
94
#  include <locale.h>
95
# endif
96
97
/* Conversion state used by mbrtoc32 below when the caller passes
   ps == NULL.  */
static mbstate_t internal_state;
98
99
/* Convert the multibyte character at the start of S (at most N bytes) to a
   32-bit wide character, using and updating the conversion state *PS.

   PWC  if non-NULL, receives the converted character on success.
   S    input bytes; S == NULL is handled as PWC = NULL, S = "", N = 1.
   N    number of bytes available at S.
   PS   conversion state; NULL selects the file-local internal_state.

   Return values visible in this function:
     - a value below (size_t) -2 is treated as success (only then is *PWC
       stored by the branches that copy from a local variable);
     - (size_t) -2 is returned for an incomplete character, and also for
       N == 0 when MBRTOC32_EMPTY_INPUT_BUG or _GL_SMALL_WCHAR_T is set;
     - (size_t) -1 is returned for failure, with errno set to EILSEQ or
       EINVAL by the code here where it detects the error itself.

   Exactly one of three implementations is compiled, selected by the
   configuration macros tested below: a wrapper around mbrtoc32(), a
   hand-written UTF-8 path plus mbrtowc() for small wchar_t, or a plain
   mbrtowc() wrapper.  */
size_t
100
mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
101
/* NOTE(review): presumably mbrtoc32 is a macro naming this replacement
   function up to this point; undefining it here would make the calls to
   mbrtoc32 in the body below reach the original function -- confirm
   against the build configuration.  */
# undef mbrtoc32
102
0
{
103
  /* It's simpler to handle the case s == NULL upfront, than to worry about
104
     this case later, before every test of pwc and n.  */
105
0
  if (s == NULL)
106
0
    {
107
0
      pwc = NULL;
108
0
      s = "";
109
0
      n = 1;
110
0
    }
111
112
# if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T
113
  /* With no input bytes available, report an incomplete character without
     calling any underlying conversion function.  */
  if (n == 0)
114
    return (size_t) -2;
115
# endif
116
117
0
  if (ps == NULL)
118
0
    ps = &internal_state;
119
120
0
# if HAVE_WORKING_MBRTOC32 && HAVE_WORKING_C32RTOMB && !MBRTOC32_MULTIBYTE_LOCALE_BUG
121
  /* mbrtoc32() may produce different values for wc than mbrtowc().  Therefore
122
     use mbrtoc32().  */
123
124
#  if defined _WIN32 && !defined __CYGWIN__
125
  /* On native Windows, convert into a local variable and copy it to *pwc
     only when the call succeeded and the caller asked for the result.  */
  char32_t wc;
126
  size_t ret = mbrtoc32 (&wc, s, n, ps);
127
  if (ret < (size_t) -2 && pwc != NULL)
128
    *pwc = wc;
129
#  else
130
0
  size_t ret = mbrtoc32 (pwc, s, n, ps);
131
0
#  endif
132
133
#  if GNULIB_MBRTOC32_REGULAR
134
  /* Verify that mbrtoc32 is regular.  */
135
  if (ret < (size_t) -3 && ! mbsinit (ps))
136
    /* This occurs on glibc 2.36.  */
137
    mbszero (ps);
138
  if (ret == (size_t) -3)
139
    abort ();
140
#  endif
141
142
0
#  if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
143
0
  /* Here a matching ret is (size_t) -2 or (size_t) -1.  When input was
     supplied and hard_locale (LC_CTYPE) is false, do not report that
     result: deliver the first byte of s as the character and report one
     byte consumed.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
144
0
    {
145
0
      if (pwc != NULL)
146
0
        *pwc = (unsigned char) *s;
147
0
      return 1;
148
0
    }
149
0
#  endif
150
151
0
  return ret;
152
153
# elif _GL_SMALL_WCHAR_T
154
155
  /* Special-case all encodings that may produce wide character values
156
     > WCHAR_MAX.  */
157
  const char *encoding = locale_charset ();
158
  if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
159
    {
160
      /* Special-case the UTF-8 encoding.  Assume that the wide-character
161
         encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16.  */
162
      /* Here n > 0.  */
163
      /* The state is accessed as a byte array: pstate[0] holds the number
         of bytes kept from earlier calls (0..3 are accepted below) and
         pstate[1..3] hold those bytes.  */
      char *pstate = (char *)ps;
164
      size_t nstate = pstate[0];
165
      char buf[4];
166
      const char *p;
167
      size_t m;
168
      int res;
169
170
      switch (nstate)
171
        {
172
        /* Nothing buffered: decode directly from the caller's bytes.  */
        case 0:
173
          p = s;
174
          m = n;
175
          break;
176
        /* Bytes are buffered: rebuild them in buf, then append up to three
           bytes from s, never exceeding four bytes in total.  */
        case 3:
177
          buf[2] = pstate[3];
178
          FALLTHROUGH;
179
        case 2:
180
          buf[1] = pstate[2];
181
          FALLTHROUGH;
182
        case 1:
183
          buf[0] = pstate[1];
184
          p = buf;
185
          m = nstate;
186
          buf[m++] = s[0];
187
          if (n >= 2 && m < 4)
188
            {
189
              buf[m++] = s[1];
190
              if (n >= 3 && m < 4)
191
                buf[m++] = s[2];
192
            }
193
          break;
194
        /* pstate[0] is outside 0..3: reject the state.  */
        default:
195
          errno = EINVAL;
196
          return (size_t)(-1);
197
        }
198
199
      /* Here m > 0.  */
200
201
      {
202
      /* NOTE(review): mbrtowc-impl-utf8.h is not visible here; presumably it
         decodes the m bytes at p, sets res, and jumps to one of the labels
         success / incomplete / invalid below -- confirm.  */
#  define FITS_IN_CHAR_TYPE(wc)  1
203
#  include "mbrtowc-impl-utf8.h"
204
      }
205
206
     success:
207
      /* Sanity check: res (presumably the byte length of the decoded
         character, with 0 counted as 1) must exceed the nstate bytes that
         were already buffered.  Then subtract nstate from res so that the
         buffered bytes are not counted in the return value.  */
      if (nstate >= (res > 0 ? res : 1))
208
        abort ();
209
      res -= nstate;
210
      /* Set *ps to an initial state.  */
211
#  if defined _WIN32 && !defined __CYGWIN__
212
      /* Native Windows.  */
213
      /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
214
         On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
215
         as an 8-byte struct, of which the first 4 bytes matter.  */
216
      *(unsigned int *)pstate = 0;
217
#  elif defined __CYGWIN__
218
      /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
219
         matter.  */
220
      ps->__count = 0;
221
#  else
222
      pstate[0] = 0;
223
#  endif
224
      return res;
225
226
     incomplete:
227
      /* Append the bytes just read from s after the nstate bytes already
         stored in the state, so that the state holds all m bytes seen so
         far, then record m as the new count.  */
      {
228
        size_t k = nstate;
229
        /* Here 0 <= k < m < 4.  */
230
        pstate[++k] = s[0];
231
        if (k < m)
232
          {
233
            pstate[++k] = s[1];
234
            if (k < m)
235
              pstate[++k] = s[2];
236
          }
237
        if (k != m)
238
          abort ();
239
      }
240
      pstate[0] = m;
241
      return (size_t)(-2);
242
243
     invalid:
244
      errno = EILSEQ;
245
      /* The conversion state is undefined, says POSIX.  */
246
      return (size_t)(-1);
247
    }
248
  else
249
    {
250
      /* Any other encoding: convert with mbrtowc() and store the wchar_t
         result into *pwc on success.  */
      wchar_t wc;
251
      size_t ret = mbrtowc (&wc, s, n, ps);
252
      if (ret < (size_t) -2 && pwc != NULL)
253
        *pwc = wc;
254
      return ret;
255
    }
256
257
# else
258
259
  /* char32_t and wchar_t are equivalent.  Use mbrtowc().  */
260
  wchar_t wc;
261
  size_t ret = mbrtowc (&wc, s, n, ps);
262
263
#  if GNULIB_MBRTOC32_REGULAR
264
  /* Ensure that mbrtoc32 is regular.  */
265
  if (ret < (size_t) -2 && ! mbsinit (ps))
266
    /* This occurs on glibc 2.12.  */
267
    mbszero (ps);
268
#  endif
269
270
#  if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
271
  /* Pass a successfully converted non-null character through
     locale_encoding_to_unicode (); a result of 0 from it is reported as
     an EILSEQ failure.  */
  if (ret < (size_t) -2 && wc != 0)
272
    {
273
      wc = locale_encoding_to_unicode (wc);
274
      if (wc == 0)
275
        {
276
          ret = (size_t) -1;
277
          errno = EILSEQ;
278
        }
279
    }
280
#  endif
281
  if (ret < (size_t) -2 && pwc != NULL)
282
    *pwc = wc;
283
  return ret;
284
285
# endif
286
0
}
287
288
#endif