Line | Count | Source (jump to first uncovered line) |
1 | | /* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */ |
2 | | /* |
3 | | * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> |
4 | | * |
5 | | * Permission to use, copy, modify, and distribute this software for any |
6 | | * purpose with or without fee is hereby granted, provided that the above |
7 | | * copyright notice and this permission notice appear in all copies. |
8 | | * |
9 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
10 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
11 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
12 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
13 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
14 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
15 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
16 | | */ |
17 | | |
18 | | /* |
19 | | * Utility functions for multibyte-character handling, |
20 | | * in particular to sanitize untrusted strings for terminal output. |
21 | | */ |
22 | | |
23 | | #include "includes.h" |
24 | | |
25 | | #include <sys/types.h> |
26 | | #ifdef HAVE_LANGINFO_H |
27 | | # include <langinfo.h> |
28 | | #endif |
29 | | #include <limits.h> |
30 | | #include <locale.h> |
31 | | #include <stdarg.h> |
32 | | #include <stdio.h> |
33 | | #include <stdlib.h> |
34 | | #include <string.h> |
35 | | #if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS) |
36 | | # include <vis.h> |
37 | | #endif |
38 | | #ifdef HAVE_WCHAR_H |
39 | | # include <wchar.h> |
40 | | #endif |
41 | | |
42 | | #include "utf8.h" |
43 | | |
44 | | static int dangerous_locale(void); |
45 | | static int grow_dst(char **, size_t *, size_t, char **, size_t); |
46 | | |
47 | | |
/*
 * Decide whether the current character encoding is one we cannot
 * sanitize safely.  Encoding errors and non-printable characters
 * are recoverable for US-ASCII and UTF-8 only.  For everything else
 * the caller has to abort parsing: with state-dependent encodings
 * recovery is impossible, and replacing non-printable characters
 * in arbitrary encodings would be non-trivial and too fragile.
 *
 * The table holds the codeset names accepted as safe; the comments
 * indicate what nl_langinfo(CODESET) returns for US-ASCII on
 * various operating systems.
 *
 * Returns 0 if the current codeset is in the table, 1 otherwise.
 */
static int
dangerous_locale(void)
{
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* Linux */
		"ISO8859-1",		/* AIX */
		"646",			/* Solaris, NetBSD */
		"",			/* Solaris 6 */
	};
	const char	*codeset;
	size_t		 i;

	codeset = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++) {
		if (strcmp(codeset, safe_codesets[i]) == 0)
			return 0;
	}
	return 1;
}
71 | | |
/*
 * Make sure that need bytes can be written at *dp while still
 * leaving at least one further byte inside the buffer *dst
 * (vasnmprintf stores the terminating '\0' there).
 *
 * dst   - in/out: start of the buffer; updated if it is reallocated.
 * sz    - in/out: number of bytes allocated for *dst.
 * maxsz - upper bound that *sz is clamped to when growing.
 * dp    - in/out: write position inside *dst; rebased if the
 *         buffer is reallocated.
 * need  - number of bytes about to be written at *dp.
 *
 * The buffer grows by 128 bytes at a time, limited to maxsz.
 * Returns 0 on success.  Returns -1 if the allocation fails or if
 * maxsz does not allow enough room; in that case *dst, *sz and *dp
 * are left untouched.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;	/* Offset of the write position within *dst. */
	size_t	 tsz;	/* Candidate size for the enlarged buffer. */

	/*
	 * Work with offsets instead of pointers: computing *dp + need
	 * beyond the end of the allocation is undefined behaviour, and
	 * the subtraction form below cannot wrap around either.
	 */
	used = (size_t)(*dp - *dst);
	if (need < *sz && used < *sz - need)
		return 0;

	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;

	/*
	 * Never shrink the buffer, and never claim success when the
	 * clamped size still cannot hold the requested bytes.
	 */
	if (tsz <= *sz || need >= tsz || used >= tsz - need)
		return -1;

	if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
		return -1;
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}
90 | | |
91 | | /* |
92 | | * The following two functions limit the number of bytes written, |
93 | | * including the terminating '\0', to sz. Unless wp is NULL, |
94 | | * they limit the number of display columns occupied to *wp. |
95 | | * Whichever is reached first terminates the output string. |
96 | | * To stay close to the standard interfaces, they return the number of |
97 | | * non-NUL bytes that would have been written if both were unlimited. |
98 | | * If wp is NULL, newline, carriage return, and tab are allowed; |
99 | | * otherwise, the actual number of columns occupied by what was |
100 | | * written is returned in *wp. |
101 | | */ |
102 | | |
/*
 * Format fmt/ap with vasprintf() and copy the result into a newly
 * allocated string, escaping what must not reach a terminal.
 *
 * str   - out: receives the result string, or NULL (see below).
 * maxsz - limit on the bytes written including the terminating
 *         '\0'; values above INT_MAX are clamped to INT_MAX.
 * wp    - NULL, or in: maximum number of display columns,
 *         out: number of columns occupied by what was written.
 * fmt, ap - printf-style format and arguments.
 *
 * Characters for which mbtowc() fails or wcwidth() returns -1 are
 * written byte by byte through vis(VIS_OCTAL | VIS_ALL) when the
 * locale is not dangerous; in a dangerous locale processing stops
 * and -1 is returned.  With wp == NULL, newline, carriage return
 * and tab are copied unchanged.
 *
 * Returns the number of non-NUL bytes that would have been written
 * without limits, or -1 on error or when the output was cut short
 * although that count is below maxsz.  Once the destination buffer
 * has been allocated, *str is set to it even when -1 is returned,
 * so it still has to be freed.  If vasprintf() fails or the
 * destination cannot be allocated, *str is NULL and *wp is 0; if
 * vasprintf() produces an empty string, that string is returned in
 * *str with a return value of 0.
 */
int
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
{
	char	*src;	/* Source string returned from vasprintf. */
	char	*sp;	/* Pointer into src. */
	char	*dst;	/* Destination string to be returned. */
	char	*dp;	/* Pointer into dst. */
	char	*tp;	/* Temporary pointer for dst. */
	size_t	 sz;	/* Number of bytes allocated for dst. */
	wchar_t	 wc;	/* Wide character at sp. */
	int	 len;	/* Number of bytes in the character at sp. */
	int	 ret;	/* Number of bytes needed to format src. */
	int	 width;	/* Display width of the character wc. */
	/*
	 * total_width: columns used by what was written so far.
	 * max_width: column limit (INT_MAX when wp is NULL).
	 * print: cleared once a limit is hit and never set again.
	 */
	int	 total_width, max_width, print;

	src = NULL;
	/* Both failure (< 0) and an empty result (0) are handled at fail. */
	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
		goto fail;

	/* Start with room for an unescaped copy; grow_dst() adds more. */
	sz = strlen(src) + 1;
	if ((dst = malloc(sz)) == NULL) {
		free(src);
		ret = -1;
		goto fail;
	}

	/* Keep maxsz representable as int for the comparisons below. */
	if (maxsz > INT_MAX)
		maxsz = INT_MAX;

	sp = src;
	dp = dst;
	ret = 0;
	print = 1;
	total_width = 0;
	max_width = wp == NULL ? INT_MAX : *wp;
	while (*sp != '\0') {
		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
			/* Reset the conversion state after the failure. */
			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
			if (dangerous_locale()) {
				ret = -1;
				break;
			}
			/* Escape just the one offending byte. */
			len = 1;
			width = -1;
		} else if (wp == NULL &&
		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
			/*
			 * Don't use width uninitialized; the actual
			 * value doesn't matter because total_width
			 * is only returned for wp != NULL.
			 */
			width = 0;
		} else if ((width = wcwidth(wc)) == -1 &&
		    dangerous_locale()) {
			ret = -1;
			break;
		}

		/* Valid, printable character. */

		if (width >= 0) {
			/* Stop printing once either limit would be exceeded. */
			if (print && (dp - dst >= (int)maxsz - len ||
			    total_width > max_width - width))
				print = 0;
			if (print) {
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, len) == -1) {
					ret = -1;
					break;
				}
				total_width += width;
				memcpy(dp, sp, len);
				dp += len;
			}
			/* Keep counting even when nothing is printed. */
			sp += len;
			if (ret >= 0)
				ret += len;
			continue;
		}

		/* Escaping required. */

		/* Each byte of the character is escaped on its own. */
		while (len > 0) {
			/* Reserve 4 bytes and 4 columns per escaped byte. */
			if (print && (dp - dst >= (int)maxsz - 4 ||
			    total_width > max_width - 4))
				print = 0;
			if (print) {
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, 4) == -1) {
					ret = -1;
					break;
				}
				/* vis() returns the end of what it wrote. */
				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
				width = tp - dp;
				total_width += width;
				dp = tp;
			} else
				width = 4;
			len--;
			sp++;
			if (ret >= 0)
				ret += width;
		}
		/* Bytes are left over only if grow_dst() failed above. */
		if (len > 0)
			break;
	}
	free(src);
	*dp = '\0';
	*str = dst;
	if (wp != NULL)
		*wp = total_width;

	/*
	 * If the string was truncated by the width limit but
	 * would have fit into the size limit, the only sane way
	 * to report the problem is using the return value, such
	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
	 * works as expected.
	 */

	if (ret < (int)maxsz && !print)
		ret = -1;
	return ret;

fail:
	if (wp != NULL)
		*wp = 0;
	if (ret == 0) {
		/* vasprintf() produced an empty string; return it as is. */
		*str = src;
		return 0;
	} else {
		*str = NULL;
		return -1;
	}
}
238 | | |
/*
 * Format into the caller-supplied buffer str of sz bytes.
 *
 * The string is produced by vasnmprintf() with sz as the size
 * limit and wp as the optional column limit, copied into str with
 * strlcpy() and the temporary copy is freed.  If vasnmprintf()
 * delivers no string, str is set to the empty string instead.
 *
 * Nothing is written to str when sz is 0, so a zero-sized buffer
 * may be passed safely.
 *
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		/* strlcpy() itself writes nothing when sz is 0. */
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Without this guard a zero-sized buffer would be overrun. */
		*str = '\0';
	}
	return ret;
}
256 | | |
/*
 * Variadic front end to vasnmprintf() that returns an allocated
 * string.  *outp is set to NULL first, then the arguments are
 * passed through unchanged: sz is the size limit and wp the
 * optional column limit.
 *
 * Returns the value returned by vasnmprintf().
 */
int
asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	*outp = NULL;
	nbytes = vasnmprintf(outp, sz, wp, fmt, args);
	va_end(args);

	return nbytes;
}
270 | | |
271 | | /* |
272 | | * To stay close to the standard interfaces, the following functions |
273 | | * return the number of non-NUL bytes written. |
274 | | */ |
275 | | |
/*
 * Format fmt/ap through vasnmprintf() with a size limit of INT_MAX
 * and no column limit, then write the result to stream with
 * fputs().  The temporary string is freed on every path.
 *
 * Returns the byte count reported by vasnmprintf(), or -1 if
 * formatting or writing fails.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*text = NULL;
	int	 nbytes;

	nbytes = vasnmprintf(&text, INT_MAX, NULL, fmt, ap);
	/* Only attempt the write when formatting succeeded. */
	if (nbytes >= 0 && fputs(text, stream) == EOF)
		nbytes = -1;
	free(text);
	return nbytes < 0 ? -1 : nbytes;
}
291 | | |
/*
 * Variadic front end to vfmprintf(): collects the arguments that
 * follow fmt and writes the formatted text to stream.
 *
 * Returns the value returned by vfmprintf().
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
303 | | |
/*
 * Variadic front end to vfmprintf() that writes to stdout.
 *
 * Returns the value returned by vfmprintf().
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}
315 | | |
/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion) so we
 * deliberately fall back to the C locale for now.  Longer term we should
 * always prefer to select C.[encoding] if possible, but there's no
 * standardisation in locales between systems, so we'll need to survey
 * what's out there first.
 *
 * LC_ALL, LC_CTYPE and LANG are examined in that order and the first
 * one holding a non-empty value decides.  If that value starts with
 * "TR" (case-insensitively), LC_CTYPE is set to "C.UTF-8" or
 * "POSIX.UTF-8" when the value mentions UTF-8 and one of them exists,
 * and to "C" otherwise.  In all other cases LC_CTYPE is taken from
 * the environment.
 */
void
msetlocale(void)
{
	const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
	char *cp;
	int i;

	/*
	 * We can't yet cope with dotless/dotted I in Turkish locales,
	 * so fall back to the C locale for these.
	 */
	for (i = 0; vars[i] != NULL; i++) {
		/*
		 * A variable set to the empty string does not select a
		 * locale; setlocale(LC_CTYPE, "") would skip it and use
		 * the next one, so this scan has to do the same or a
		 * Turkish locale further down the list goes unnoticed.
		 */
		if ((cp = getenv(vars[i])) == NULL || *cp == '\0')
			continue;
		if (strncasecmp(cp, "TR", 2) != 0)
			break;
		/*
		 * If we're in a UTF-8 locale then prefer to use
		 * the C.UTF-8 locale (or equivalent) if it exists.
		 */
		if ((strcasestr(cp, "UTF-8") != NULL ||
		    strcasestr(cp, "UTF8") != NULL) &&
		    (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
		    setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
			return;
		setlocale(LC_CTYPE, "C");
		return;
	}
	/* We can handle this locale */
	setlocale(LC_CTYPE, "");
}