Line | Count | Source (jump to first uncovered line) |
1 | | /* Convert multibyte character to 32-bit wide character. |
2 | | Copyright (C) 2020-2025 Free Software Foundation, Inc. |
3 | | |
4 | | This file is free software: you can redistribute it and/or modify |
5 | | it under the terms of the GNU Lesser General Public License as |
6 | | published by the Free Software Foundation; either version 2.1 of the |
7 | | License, or (at your option) any later version. |
8 | | |
9 | | This file is distributed in the hope that it will be useful, |
10 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | GNU Lesser General Public License for more details. |
13 | | |
14 | | You should have received a copy of the GNU Lesser General Public License |
15 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
16 | | |
17 | | /* Written by Bruno Haible <bruno@clisp.org>, 2020. */ |
18 | | |
19 | | #include <config.h> |
20 | | |
21 | | /* Specification. */ |
22 | | #include <uchar.h> |
23 | | |
24 | | #include "attribute.h" |
25 | | |
26 | | #include <errno.h> |
27 | | #include <stdlib.h> |
28 | | |
29 | | #if GL_CHAR32_T_IS_UNICODE |
30 | | # include "lc-charset-unicode.h" |
31 | | #endif |
32 | | |
33 | | #if GNULIB_defined_mbstate_t /* AIX, IRIX */ |
34 | | /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales |
35 | | and directly for the UTF-8 locales. */ |
36 | | |
37 | | /* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways: |
38 | | - in a way that parallels the override of mbrtowc; this is the code branch |
39 | | here; |
40 | | - in a way that invokes the overridden mbrtowc; this would be the #else |
41 | | branch below. |
42 | | They are equivalent. */ |
43 | | |
44 | | # if AVOID_ANY_THREADS |
45 | | |
46 | | /* The option '--disable-threads' explicitly requests no locking. */ |
47 | | |
48 | | # elif defined _WIN32 && !defined __CYGWIN__ |
49 | | |
50 | | # define WIN32_LEAN_AND_MEAN /* avoid including junk */ |
51 | | # include <windows.h> |
52 | | |
53 | | # elif HAVE_PTHREAD_API |
54 | | |
55 | | # include <pthread.h> |
56 | | # if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS |
57 | | # include <threads.h> |
58 | | # pragma weak thrd_exit |
59 | | # define c11_threads_in_use() (thrd_exit != NULL) |
60 | | # else |
61 | | # define c11_threads_in_use() 0 |
62 | | # endif |
63 | | |
64 | | # elif HAVE_THREADS_H |
65 | | |
66 | | # include <threads.h> |
67 | | |
68 | | # endif |
69 | | |
70 | | # include "lc-charset-dispatch.h" |
71 | | # include "mbtowc-lock.h" |
72 | | |
73 | | static_assert (sizeof (mbstate_t) >= 4); /* compile-time check that an mbstate_t provides at least 4 bytes */ |
74 | | static char internal_state[4]; /* NOTE(review): presumably the conversion state used when ps == NULL; that use is not visible in this file -- confirm in mbrtowc-impl.h */ |
75 | | /* mbrtoc32() as compiled when GNULIB_defined_mbstate_t is set. The entire function body is supplied by the included "mbrtowc-impl.h"; this file only provides the signature and the FITS_IN_CHAR_TYPE definition. */ |
76 | | size_t |
77 | | mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) |
78 | | { |
79 | | # define FITS_IN_CHAR_TYPE(wc) 1 /* evaluates to true for every WC; NOTE(review): presumably consulted by the included implementation -- confirm there */ |
80 | | # include "mbrtowc-impl.h" |
81 | | } |
82 | | |
83 | | #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */ |
84 | | |
85 | | /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc(). */ |
86 | | |
87 | | # include <wchar.h> |
88 | | |
89 | | # include "localcharset.h" |
90 | | # include "streq.h" |
91 | | |
92 | | # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ |
93 | | # include "hard-locale.h" |
94 | | # include <locale.h> |
95 | | # endif |
96 | | |
97 | | static mbstate_t internal_state; /* conversion state substituted when the caller passes ps == NULL */ |
98 | | /* Convert the multibyte character starting at S (looking at no more than N bytes) to a char32_t, stored through PWC when PWC is non-NULL. PS holds the conversion state; when it is NULL the file-static internal_state is used. The return value is the result of the underlying mbrtoc32()/mbrtowc() call, with (size_t) -1 and (size_t) -2 used as the error and incomplete-input codes, except where a branch below overrides it. Exactly one of the three implementation branches of the #if/#elif/#else chain is compiled. NOTE(review): the '# undef mbrtoc32' right after the signature presumably removes a macro that renamed this definition, so that the mbrtoc32 calls in the body reach the system function -- confirm in gnulib's uchar.h. */ |
99 | | size_t |
100 | | mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) |
101 | | # undef mbrtoc32 |
102 | 0 | { |
103 | | /* It's simpler to handle the case s == NULL upfront, than to worry about |
104 | | this case later, before every test of pwc and n. */ |
105 | 0 | if (s == NULL) |
106 | 0 | { |
107 | 0 | pwc = NULL; /* nothing is stored for the caller in this case */ |
108 | 0 | s = ""; /* convert the empty string instead ... */ |
109 | 0 | n = 1; /* ... whose single byte is the terminating NUL */ |
110 | 0 | } |
111 |

112 | | # if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T |
113 | | if (n == 0) /* no input bytes: answer "incomplete" here instead of calling the underlying function */ |
114 | | return (size_t) -2; |
115 | | # endif |
116 |

117 | 0 | if (ps == NULL) |
118 | 0 | ps = &internal_state; |
119 |

120 | 0 | # if HAVE_WORKING_MBRTOC32 && HAVE_WORKING_C32RTOMB && !MBRTOC32_MULTIBYTE_LOCALE_BUG |
121 | | /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore |
122 | | use mbrtoc32(). */ |
123 |

124 | | # if defined _WIN32 && !defined __CYGWIN__ |
125 | | char32_t wc; /* local result, so that *pwc is written only when a character was actually converted */ |
126 | | size_t ret = mbrtoc32 (&wc, s, n, ps); |
127 | | if (ret < (size_t) -2 && pwc != NULL) |
128 | | *pwc = wc; |
129 | | # else |
130 | 0 | size_t ret = mbrtoc32 (pwc, s, n, ps); |
131 | 0 | # endif |
132 |

133 | | # if GNULIB_MBRTOC32_REGULAR |
134 | | /* Verify that mbrtoc32 is regular. */ |
135 | | if (ret < (size_t) -3 && ! mbsinit (ps)) |
136 | | /* This occurs on glibc 2.36. */ |
137 | | mbszero (ps); /* a character was returned but *ps is not initial: reset it */ |
138 | | if (ret == (size_t) -3) /* a (size_t) -3 result is not tolerated in this configuration */ |
139 | | abort (); |
140 | | # endif |
141 |

142 | 0 | # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ |
143 | 0 | if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE)) |
144 | 0 | { /* the call reported error or incomplete input although hard_locale() is false: return the first byte as a one-byte character instead */ |
145 | 0 | if (pwc != NULL) |
146 | 0 | *pwc = (unsigned char) *s; |
147 | 0 | return 1; |
148 | 0 | } |
149 | 0 | # endif |
150 | | |
151 | 0 | return ret; |
152 |

153 | | # elif _GL_SMALL_WCHAR_T |
154 | | |
155 | | /* Special-case all encodings that may produce wide character values |
156 | | > WCHAR_MAX. */ |
157 | | const char *encoding = locale_charset (); |
158 | | if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) |
159 | | { |
160 | | /* Special-case the UTF-8 encoding. Assume that the wide-character |
161 | | encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */ |
162 | | /* Here n > 0. */ |
163 | | char *pstate = (char *)ps; /* *ps is accessed as raw bytes: pstate[0] is a count, pstate[1..3] hold saved input bytes */ |
164 | | size_t nstate = pstate[0]; /* number of bytes saved in *ps by earlier calls */ |
165 | | char buf[4]; /* saved bytes followed by fresh bytes from s, when nstate > 0 */ |
166 | | const char *p; /* start of the byte sequence to decode: s or buf */ |
167 | | size_t m; /* number of bytes available at p */ |
168 | | int res; /* not assigned in this file; NOTE(review): presumably set by the included mbrtowc-impl-utf8.h */ |
169 | | |
170 | | switch (nstate) |
171 | | { |
172 | | case 0: /* nothing saved: decode directly from the caller's buffer */ |
173 | | p = s; |
174 | | m = n; |
175 | | break; |
176 | | case 3: /* cases 3, 2, 1 fall through so that exactly nstate saved bytes are copied into buf */ |
177 | | buf[2] = pstate[3]; |
178 | | FALLTHROUGH; |
179 | | case 2: |
180 | | buf[1] = pstate[2]; |
181 | | FALLTHROUGH; |
182 | | case 1: |
183 | | buf[0] = pstate[1]; |
184 | | p = buf; |
185 | | m = nstate; |
186 | | buf[m++] = s[0]; /* n > 0 here, so s[0] is available */ |
187 | | if (n >= 2 && m < 4) /* append further input bytes while s has them and buf has room */ |
188 | | { |
189 | | buf[m++] = s[1]; |
190 | | if (n >= 3 && m < 4) |
191 | | buf[m++] = s[2]; |
192 | | } |
193 | | break; |
194 | | default: /* pstate[0] is not one of 0..3 */ |
195 | | errno = EINVAL; |
196 | | return (size_t)(-1); |
197 | | } |
198 | | |
199 | | /* Here m > 0. */ |
200 | | |
201 | | { |
202 | | # define FITS_IN_CHAR_TYPE(wc) 1 |
203 | | # include "mbrtowc-impl-utf8.h" |
204 | | } |
205 | | |
206 | | success: /* NOTE(review): no goto to this label, 'incomplete' or 'invalid' appears in this file; presumably they are jump targets of the code included from mbrtowc-impl-utf8.h */ |
207 | | if (nstate >= (res > 0 ? res : 1)) |
208 | | abort (); /* NOTE(review): assuming res is the decoded character's length within p, this fires when the character would end inside the bytes saved by earlier calls */ |
209 | | res -= nstate; /* discount the nstate bytes that came from *ps rather than from s */ |
210 | | /* Set *ps to an initial state. */ |
211 | | # if defined _WIN32 && !defined __CYGWIN__ |
212 | | /* Native Windows. */ |
213 | | /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter. |
214 | | On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined |
215 | | as an 8-byte struct, of which the first 4 bytes matter. */ |
216 | | *(unsigned int *)pstate = 0; |
217 | | # elif defined __CYGWIN__ |
218 | | /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes |
219 | | matter. */ |
220 | | ps->__count = 0; |
221 | | # else |
222 | | pstate[0] = 0; /* saved-byte count back to zero */ |
223 | | # endif |
224 | | return res; |
225 | | |
226 | | incomplete: /* save the input bytes consumed in this call into *ps, after the nstate bytes already there */ |
227 | | { |
228 | | size_t k = nstate; |
229 | | /* Here 0 <= k < m < 4. */ |
230 | | pstate[++k] = s[0]; |
231 | | if (k < m) |
232 | | { |
233 | | pstate[++k] = s[1]; |
234 | | if (k < m) |
235 | | pstate[++k] = s[2]; |
236 | | } |
237 | | if (k != m) /* sanity check: exactly m bytes must now be saved */ |
238 | | abort (); |
239 | | } |
240 | | pstate[0] = m; /* record the new saved-byte count */ |
241 | | return (size_t)(-2); |
242 | | |
243 | | invalid: |
244 | | errno = EILSEQ; |
245 | | /* The conversion state is undefined, says POSIX. */ |
246 | | return (size_t)(-1); |
247 | | } |
248 | | else |
249 | | { /* any other encoding: convert with mbrtowc() and copy the wchar_t result to *pwc only when a character was produced */ |
250 | | wchar_t wc; |
251 | | size_t ret = mbrtowc (&wc, s, n, ps); |
252 | | if (ret < (size_t) -2 && pwc != NULL) |
253 | | *pwc = wc; |
254 | | return ret; |
255 | | } |
256 | | |
257 | | # else |
258 | | |
259 | | /* char32_t and wchar_t are equivalent. Use mbrtowc(). */ |
260 | | wchar_t wc; |
261 | | size_t ret = mbrtowc (&wc, s, n, ps); |
262 | | |
263 | | # if GNULIB_MBRTOC32_REGULAR |
264 | | /* Ensure that mbrtoc32 is regular. */ |
265 | | if (ret < (size_t) -2 && ! mbsinit (ps)) |
266 | | /* This occurs on glibc 2.12. */ |
267 | | mbszero (ps); |
268 | | # endif |
269 | | |
270 | | # if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION |
271 | | if (ret < (size_t) -2 && wc != 0) /* only a successfully converted, non-null character is translated */ |
272 | | { |
273 | | wc = locale_encoding_to_unicode (wc); |
274 | | if (wc == 0) /* a zero result from the translation is reported as an invalid sequence */ |
275 | | { |
276 | | ret = (size_t) -1; |
277 | | errno = EILSEQ; |
278 | | } |
279 | | } |
280 | | # endif |
281 | | if (ret < (size_t) -2 && pwc != NULL) /* store only when a character was produced and the caller wants it */ |
282 | | *pwc = wc; |
283 | | return ret; |
284 | | |
285 | | # endif |
286 | 0 | } |
287 | | |
288 | | #endif |