Coverage Report

Created: 2025-07-12 06:14

/src/openssh/utf8.c
Line
Count
Source (jump to first uncovered line)
1
/* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */
2
/*
3
 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4
 *
5
 * Permission to use, copy, modify, and distribute this software for any
6
 * purpose with or without fee is hereby granted, provided that the above
7
 * copyright notice and this permission notice appear in all copies.
8
 *
9
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
 */
17
18
/*
19
 * Utility functions for multibyte-character handling,
20
 * in particular to sanitize untrusted strings for terminal output.
21
 */
22
23
#include "includes.h"
24
25
#include <sys/types.h>
26
#ifdef HAVE_LANGINFO_H
27
# include <langinfo.h>
28
#endif
29
#include <limits.h>
30
#include <locale.h>
31
#include <stdarg.h>
32
#include <stdio.h>
33
#include <stdlib.h>
34
#include <string.h>
35
#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
36
# include <vis.h>
37
#endif
38
#ifdef HAVE_WCHAR_H
39
# include <wchar.h>
40
#endif
41
42
#include "utf8.h"
43
44
/* File-local helpers; both are defined further down in this file. */
static int	 dangerous_locale(void);
static int	 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp,
		    size_t need);
46
47
48
/*
 * Report whether the character encoding of the current locale is one
 * we cannot sanitize safely.
 *
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err to the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if nl_langinfo(CODESET) names one of the encodings
 * listed below, 1 otherwise.  The comments in the table indicate
 * what nl_langinfo(CODESET) returns for US-ASCII on various
 * operating systems.
 */
static int
dangerous_locale(void) {
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* Linux */
		"ISO8859-1",		/* AIX */
		"646",			/* Solaris, NetBSD */
		"",			/* Solaris 6 */
	};
	const char	*codeset;
	size_t		 i;

	codeset = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++) {
		if (strcmp(codeset, safe_codesets[i]) == 0)
			return 0;
	}
	return 1;
}
71
72
/*
 * Make sure that "need" more bytes plus a terminating '\0' fit into
 * the buffer *dst of *sz bytes when written at the position *dp.
 *
 * If they already fit, nothing is changed.  Otherwise, the buffer is
 * enlarged by 128 bytes, but never beyond maxsz, and *dst, *sz, and
 * *dp are updated to refer to the new buffer.
 *
 * Returns 0 on success.  Returns -1 if the reallocation fails or if
 * the request cannot be satisfied within maxsz; in both cases the
 * buffer, its size, and the write position are left untouched.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 off;	/* Offset of the write position in the buffer. */
	size_t	 tsz;

	/*
	 * Compare offsets rather than pointers: *dp + need may lie
	 * beyond the end of the buffer, and merely forming such a
	 * pointer is undefined behaviour.
	 */
	off = (size_t)(*dp - *dst);
	if (need < *sz && off < *sz - need)
		return 0;
	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	/* Never report success for a buffer that is still too small. */
	if (need >= tsz || off >= tsz - need)
		return -1;
	if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
		return -1;
	*dp = tp + off;
	*dst = tp;
	*sz = tsz;
	return 0;
}
90
91
/*
92
 * The following two functions limit the number of bytes written,
93
 * including the terminating '\0', to sz.  Unless wp is NULL,
94
 * they limit the number of display columns occupied to *wp.
95
 * Whichever is reached first terminates the output string.
96
 * To stay close to the standard interfaces, they return the number of
97
 * non-NUL bytes that would have been written if both were unlimited.
98
 * If wp is NULL, newline, carriage return, and tab are allowed;
99
 * otherwise, the actual number of columns occupied by what was
100
 * written is returned in *wp.
101
 */
102
103
int
104
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
105
0
{
106
0
  char  *src; /* Source string returned from vasprintf. */
107
0
  char  *sp;  /* Pointer into src. */
108
0
  char  *dst; /* Destination string to be returned. */
109
0
  char  *dp;  /* Pointer into dst. */
110
0
  char  *tp;  /* Temporary pointer for dst. */
111
0
  size_t   sz;  /* Number of bytes allocated for dst. */
112
0
  wchar_t  wc;  /* Wide character at sp. */
113
0
  int  len; /* Number of bytes in the character at sp. */
114
0
  int  ret; /* Number of bytes needed to format src. */
115
0
  int  width; /* Display width of the character wc. */
116
0
  int  total_width, max_width, print;
117
118
0
  src = NULL;
119
0
  if ((ret = vasprintf(&src, fmt, ap)) <= 0)
120
0
    goto fail;
121
122
0
  sz = strlen(src) + 1;
123
0
  if ((dst = malloc(sz)) == NULL) {
124
0
    free(src);
125
0
    ret = -1;
126
0
    goto fail;
127
0
  }
128
129
0
  if (maxsz > INT_MAX)
130
0
    maxsz = INT_MAX;
131
132
0
  sp = src;
133
0
  dp = dst;
134
0
  ret = 0;
135
0
  print = 1;
136
0
  total_width = 0;
137
0
  max_width = wp == NULL ? INT_MAX : *wp;
138
0
  while (*sp != '\0') {
139
0
    if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
140
0
      (void)mbtowc(NULL, NULL, MB_CUR_MAX);
141
0
      if (dangerous_locale()) {
142
0
        ret = -1;
143
0
        break;
144
0
      }
145
0
      len = 1;
146
0
      width = -1;
147
0
    } else if (wp == NULL &&
148
0
        (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
149
      /*
150
       * Don't use width uninitialized; the actual
151
       * value doesn't matter because total_width
152
       * is only returned for wp != NULL.
153
       */
154
0
      width = 0;
155
0
    } else if ((width = wcwidth(wc)) == -1 &&
156
0
        dangerous_locale()) {
157
0
      ret = -1;
158
0
      break;
159
0
    }
160
161
    /* Valid, printable character. */
162
163
0
    if (width >= 0) {
164
0
      if (print && (dp - dst >= (int)maxsz - len ||
165
0
          total_width > max_width - width))
166
0
        print = 0;
167
0
      if (print) {
168
0
        if (grow_dst(&dst, &sz, maxsz,
169
0
            &dp, len) == -1) {
170
0
          ret = -1;
171
0
          break;
172
0
        }
173
0
        total_width += width;
174
0
        memcpy(dp, sp, len);
175
0
        dp += len;
176
0
      }
177
0
      sp += len;
178
0
      if (ret >= 0)
179
0
        ret += len;
180
0
      continue;
181
0
    }
182
183
    /* Escaping required. */
184
185
0
    while (len > 0) {
186
0
      if (print && (dp - dst >= (int)maxsz - 4 ||
187
0
          total_width > max_width - 4))
188
0
        print = 0;
189
0
      if (print) {
190
0
        if (grow_dst(&dst, &sz, maxsz,
191
0
            &dp, 4) == -1) {
192
0
          ret = -1;
193
0
          break;
194
0
        }
195
0
        tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
196
0
        width = tp - dp;
197
0
        total_width += width;
198
0
        dp = tp;
199
0
      } else
200
0
        width = 4;
201
0
      len--;
202
0
      sp++;
203
0
      if (ret >= 0)
204
0
        ret += width;
205
0
    }
206
0
    if (len > 0)
207
0
      break;
208
0
  }
209
0
  free(src);
210
0
  *dp = '\0';
211
0
  *str = dst;
212
0
  if (wp != NULL)
213
0
    *wp = total_width;
214
215
  /*
216
   * If the string was truncated by the width limit but
217
   * would have fit into the size limit, the only sane way
218
   * to report the problem is using the return value, such
219
   * that the usual idiom "if (ret < 0 || ret >= sz) error"
220
   * works as expected.
221
   */
222
223
0
  if (ret < (int)maxsz && !print)
224
0
    ret = -1;
225
0
  return ret;
226
227
0
fail:
228
0
  if (wp != NULL)
229
0
    *wp = 0;
230
0
  if (ret == 0) {
231
0
    *str = src;
232
0
    return 0;
233
0
  } else {
234
0
    *str = NULL;
235
0
    return -1;
236
0
  }
237
0
}
238
239
/*
 * Format into the caller-provided buffer str of sz bytes.
 *
 * The arguments sz, wp, and fmt are handed to vasnmprintf(), and its
 * result is copied into str with strlcpy(3), so that at most sz bytes
 * including the terminating '\0' are written.  If vasnmprintf()
 * returns no string, str is set to the empty string.  If sz is 0,
 * str is not written to at all.
 *
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* A buffer of size zero has no room even for the '\0'. */
		*str = '\0';
	}
	return ret;
}
256
257
/*
 * Variadic front end to vasnmprintf(): the result is returned in the
 * newly allocated string *outp, limited to sz bytes and, unless wp
 * is NULL, to *wp display columns.  *outp is cleared before the
 * arguments are formatted.
 *
 * Returns the value returned by vasnmprintf().
 */
int
asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	*outp = NULL;
	nbytes = vasnmprintf(outp, sz, wp, fmt, args);
	va_end(args);

	return nbytes;
}
270
271
/*
 * To stay close to the standard interfaces, vfmprintf() and the
 * wrappers built on it, fmprintf() and mprintf(), return the number
 * of non-NUL bytes written.
 *
 * vfmprintf() formats fmt and ap with vasnmprintf(), without a
 * column limit and with a size limit of INT_MAX, and writes the
 * result to stream with fputs(3).  It returns -1 if formatting
 * fails or if fputs(3) reports EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*buf = NULL;
	int	 nbytes;

	nbytes = vasnmprintf(&buf, INT_MAX, NULL, fmt, ap);
	/* Only attempt the write if formatting succeeded. */
	if (nbytes >= 0 && fputs(buf, stream) == EOF)
		nbytes = -1;
	free(buf);
	return nbytes < 0 ? -1 : nbytes;
}
291
292
/*
 * Variadic front end to vfmprintf(): format the arguments and write
 * the result to stream.  Returns the value returned by vfmprintf().
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
303
304
/*
 * Variadic front end to vfmprintf() that writes to stdout.
 * Returns the value returned by vfmprintf().
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}
315
316
/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * The environment variables LC_ALL, LC_CTYPE, and LANG are inspected
 * in that order, and the first one that is set to a non-empty value
 * decides.  If that value starts with "TR" in any letter case, the
 * user's locale is not used: LC_CTYPE is set to "C.UTF-8" or
 * "POSIX.UTF-8" if the value mentions UTF-8 and one of these locales
 * can be selected, and to "C" otherwise.  In all other cases,
 * LC_CTYPE is taken from the environment.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion) so we
 *      deliberately fall back to the C locale for now. Longer term we should
 *      always prefer to select C.[encoding] if possible, but there's no
 *      standardisation in locales between systems, so we'll need to survey
 *      what's out there first.
 */
void
msetlocale(void)
{
	const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
	char *cp;
	int i;

	/*
	 * We can't yet cope with dotless/dotted I in Turkish locales,
	 * so fall back to the C locale for these.
	 */
	for (i = 0; vars[i] != NULL; i++) {
		/*
		 * An empty value counts as unset, exactly as it does for
		 * setlocale(LC_CTYPE, "") below.  Otherwise, an empty
		 * LC_ALL would hide a Turkish LC_CTYPE or LANG from
		 * this check.
		 */
		if ((cp = getenv(vars[i])) == NULL || *cp == '\0')
			continue;
		if (strncasecmp(cp, "TR", 2) != 0)
			break;
		/*
		 * If we're in a UTF-8 locale then prefer to use
		 * the C.UTF-8 locale (or equivalent) if it exists.
		 */
		if ((strcasestr(cp, "UTF-8") != NULL ||
		    strcasestr(cp, "UTF8") != NULL) &&
		    (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
		    setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
			return;
		setlocale(LC_CTYPE, "C");
		return;
	}
	/* We can handle this locale */
	setlocale(LC_CTYPE, "");
}