/src/openssh/utf8.c

Source (jump to first uncovered line)
/* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */
/*
 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Utility functions for multibyte-character handling,
 * in particular to sanitize untrusted strings for terminal output.
 */

#include "includes.h"

#include <sys/types.h>
#ifdef HAVE_LANGINFO_H
# include <langinfo.h>
#endif
#include <limits.h>
#include <locale.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
# include <vis.h>
#endif
#ifdef HAVE_WCHAR_H
# include <wchar.h>
#endif

#include "utf8.h"

static int   dangerous_locale(void);
static int   grow_dst(char **, size_t *, size_t, char **, size_t);


/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 * The comments indicate what nl_langinfo(CODESET)
 * returns for US-ASCII on various operating systems.
 */

/*
 * Look up the current codeset as reported by nl_langinfo(CODESET) in
 * the list of codesets we know how to recover in.
 * Returns 0 when the codeset is on the list, 1 otherwise.
 */
static int
dangerous_locale(void)
{
  static const char *const recoverable[] = {
    "UTF-8",
    "US-ASCII",         /* OpenBSD */
    "ANSI_X3.4-1968",   /* Linux */
    "ISO8859-1",        /* AIX */
    "646",              /* Solaris, NetBSD */
    "",                 /* Solaris 6 */
  };
  const char *codeset;
  size_t idx;

  codeset = nl_langinfo(CODESET);
  for (idx = 0; idx < sizeof(recoverable) / sizeof(recoverable[0]); idx++) {
    if (strcmp(codeset, recoverable[idx]) == 0)
      return 0;
  }
  return 1;
}

/*
 * Make room in the output buffer for need more bytes at *dp.
 *
 * dst    in/out: start of the buffer; updated if the buffer moves.
 * sz     in/out: number of bytes currently allocated for *dst.
 * maxsz  upper bound for the allocation size.
 * dp     in/out: write position inside *dst; re-based if the buffer moves.
 * need   number of bytes the caller is about to write at *dp.
 *
 * If fewer than need + 1 bytes remain, the buffer is enlarged by 128
 * bytes, capped at maxsz.  The function does not re-check the room after
 * growing; the callers in vasnmprintf() test against maxsz beforehand.
 *
 * Returns 0 on success (with or without growing) and -1 if the
 * reallocation fails, in which case *dst, *sz and *dp are left untouched.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
  char  *tp;
  size_t   tsz;
  size_t   used;

  /*
   * Do the bookkeeping with integer offsets: forming *dp + need could
   * point beyond the end of the allocation, and the old *dst must not
   * be used in pointer arithmetic once recallocarray() has released it.
   */
  used = (size_t)(*dp - *dst);
  if (used < *sz && need < *sz - used)
    return 0;
  tsz = *sz + 128;
  if (tsz > maxsz)
    tsz = maxsz;
  if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
    return -1;
  *dp = tp + used;
  *dst = tp;
  *sz = tsz;
  return 0;
}

/*
 * The following three functions limit the number of bytes written,
 * including the terminating '\0', to sz.  Unless wp is NULL,
 * they limit the number of display columns occupied to *wp.
 * Whichever is reached first terminates the output string.
 * To stay close to the standard interfaces, they return the number of
 * non-NUL bytes that would have been written if both were unlimited.
 * If wp is NULL, newline, carriage return, and tab are allowed;
 * otherwise, the actual number of columns occupied by what was
 * written is returned in *wp.
 */

/*
 * Format fmt/ap with vasprintf(), then copy the result into a newly
 * allocated buffer one multibyte character at a time, replacing every
 * byte that fails to decode or has no display width by the output of
 * vis(..., VIS_OCTAL | VIS_ALL, 0).
 *
 * str    out: receives the allocated result (to be freed by the caller),
 *        or NULL when nothing could be produced.
 * maxsz  byte limit for the result including the terminating '\0';
 *        values above INT_MAX are clamped to INT_MAX.
 * wp     NULL, or in: column limit / out: columns actually used.
 *
 * Returns the byte count accumulated over all processed characters,
 * including those no longer written after a limit was hit, or -1 on
 * error (see the individual "ret = -1" sites below).
 */
int
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
{
  char  *src; /* Source string returned from vasprintf. */
  char  *sp;  /* Pointer into src. */
  char  *dst; /* Destination string to be returned. */
  char  *dp;  /* Pointer into dst. */
  char  *tp;  /* Temporary pointer for dst. */
  size_t   sz;  /* Number of bytes allocated for dst. */
  wchar_t  wc;  /* Wide character at sp. */
  int  len; /* Number of bytes in the character at sp. */
  int  ret; /* Number of bytes needed to format src. */
  int  width; /* Display width of the character wc. */
  int  total_width, max_width, print;

  /*
   * A result of 0 (empty output) or below (failure) needs no
   * sanitizing; the fail label tells the two cases apart via ret.
   */
  src = NULL;
  if ((ret = vasprintf(&src, fmt, ap)) <= 0)
    goto fail;

  /* Start with room for an unescaped copy; grow_dst() enlarges it. */
  sz = strlen(src) + 1;
  if ((dst = malloc(sz)) == NULL) {
    free(src);
    ret = -1;
    goto fail;
  }

  /* Keep maxsz representable for the (int) casts used below. */
  if (maxsz > INT_MAX)
    maxsz = INT_MAX;

  sp = src;
  dp = dst;
  ret = 0;
  /* print drops to 0 once a limit is hit; it is never set back to 1. */
  print = 1;
  total_width = 0;
  max_width = wp == NULL ? INT_MAX : *wp;
  while (*sp != '\0') {
    if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
      /* Invalid sequence: reset the decoder state first. */
      (void)mbtowc(NULL, NULL, MB_CUR_MAX);
      if (dangerous_locale()) {
        ret = -1;
        break;
      }
      /* Escape just this one byte and retry at the next one. */
      len = 1;
      width = -1;
    } else if (wp == NULL &&
        (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
      /*
       * Don't use width uninitialized; the actual
       * value doesn't matter because total_width
       * is only returned for wp != NULL.
       */
      width = 0;
    } else if ((width = wcwidth(wc)) == -1 &&
        dangerous_locale()) {
      ret = -1;
      break;
    }

    /* Valid, printable character. */

    if (width >= 0) {
      /* Stop writing if the bytes or the columns would not fit. */
      if (print && (dp - dst >= (int)maxsz - len ||
          total_width > max_width - width))
        print = 0;
      if (print) {
        if (grow_dst(&dst, &sz, maxsz,
            &dp, len) == -1) {
          ret = -1;
          break;
        }
        total_width += width;
        memcpy(dp, sp, len);
        dp += len;
      }
      /* Consume and count the character even when it was not written. */
      sp += len;
      if (ret >= 0)
        ret += len;
      continue;
    }

    /* Escaping required. */

    /*
     * Escape the character byte by byte, budgeting four output
     * bytes and four columns for each source byte.
     */
    while (len > 0) {
      if (print && (dp - dst >= (int)maxsz - 4 ||
          total_width > max_width - 4))
        print = 0;
      if (print) {
        if (grow_dst(&dst, &sz, maxsz,
            &dp, 4) == -1) {
          ret = -1;
          break;
        }
        /* vis() returns the new end of the output. */
        tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
        width = tp - dp;
        total_width += width;
        dp = tp;
      } else
        /* Not written: count the budgeted four bytes instead. */
        width = 4;
      len--;
      sp++;
      if (ret >= 0)
        ret += width;
    }
    /* The inner loop only ends early when grow_dst() failed. */
    if (len > 0)
      break;
  }
  /*
   * Every exit from the loop, including the ret = -1 breaks, ends up
   * here: the text written so far is terminated and handed to the caller.
   */
  free(src);
  *dp = '\0';
  *str = dst;
  if (wp != NULL)
    *wp = total_width;

  /*
   * If the string was truncated by the width limit but
   * would have fit into the size limit, the only sane way
   * to report the problem is using the return value, such
   * that the usual idiom "if (ret < 0 || ret >= sz) error"
   * works as expected.
   */

  if (ret < (int)maxsz && !print)
    ret = -1;
  return ret;

fail:
  /* Nothing was sanitized, so no columns are reported as used. */
  if (wp != NULL)
    *wp = 0;
  if (ret == 0) {
    /* vasprintf() produced an empty string: pass it on as is. */
    *str = src;
    return 0;
  } else {
    /* vasprintf() or malloc() failed. */
    *str = NULL;
    return -1;
  }
}

/*
 * Sanitizing counterpart of snprintf(): format into the caller-supplied
 * buffer str of sz bytes via vasnmprintf().
 *
 * str  destination buffer; receives a NUL-terminated string whenever
 *      sz is greater than zero.
 * sz   size of str in bytes, including the terminating '\0'.
 * wp   NULL, or column limit on input and columns used on output,
 *      passed straight through to vasnmprintf().
 *
 * Returns whatever vasnmprintf() returns.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
  va_list  ap;
  char  *cp = NULL;
  int  ret;

  va_start(ap, fmt);
  ret = vasnmprintf(&cp, sz, wp, fmt, ap);
  va_end(ap);
  if (cp != NULL) {
    (void)strlcpy(str, cp, sz);
    free(cp);
  } else if (sz > 0) {
    /*
     * No result at all: leave an empty string behind, but
     * never touch a buffer that has no room for even the '\0'.
     */
    *str = '\0';
  }
  return ret;
}

/*
 * Variadic front end to vasnmprintf() that hands the allocated result
 * back through outp.  *outp is cleared up front and then set by
 * vasnmprintf(); the return value is the one vasnmprintf() produced.
 */
int
asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...)
{
  va_list  args;
  int  nbytes;

  /* Start from a known value before delegating. */
  *outp = NULL;

  va_start(args, fmt);
  nbytes = vasnmprintf(outp, sz, wp, fmt, args);
  va_end(args);
  return nbytes;
}

/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Sanitize the formatted text with vasnmprintf() (size limit INT_MAX,
 * no column limit) and write it to stream with fputs().
 * Returns the value reported by vasnmprintf(), or -1 if sanitizing
 * or writing failed.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
  char  *buf = NULL;
  int  nbytes;

  nbytes = vasnmprintf(&buf, INT_MAX, NULL, fmt, ap);
  if (nbytes >= 0) {
    if (fputs(buf, stream) == EOF)
      nbytes = -1;
  } else
    nbytes = -1;

  /* The buffer is released on the success and the failure path alike. */
  free(buf);
  return nbytes;
}

/*
 * Variadic wrapper around vfmprintf() for an arbitrary stream;
 * returns vfmprintf()'s result unchanged.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
  va_list  args;
  int  nbytes;

  va_start(args, fmt);
  nbytes = vfmprintf(stream, fmt, args);
  va_end(args);

  return nbytes;
}

/*
 * Variadic wrapper around vfmprintf() that writes to stdout;
 * returns vfmprintf()'s result unchanged.
 */
int
mprintf(const char *fmt, ...)
{
  va_list  args;
  int  nbytes;

  va_start(args, fmt);
  nbytes = vfmprintf(stdout, fmt, args);
  va_end(args);

  return nbytes;
}

/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * LC_ALL, LC_CTYPE and LANG are inspected in that order and the first
 * one that is set to a non-empty value decides:
 *  - if it does not start with "TR" (case-insensitive), LC_CTYPE is
 *    taken from the environment via setlocale(LC_CTYPE, "");
 *  - if it does, LC_CTYPE is set to "C.UTF-8" or "POSIX.UTF-8" when the
 *    value mentions UTF-8 and one of those locales exists, and to "C"
 *    otherwise.
 * If none of the variables has a value, setlocale(LC_CTYPE, "") is used.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion) so we
 *      deliberately fall back to the C locale for now. Longer term we should
 *      always prefer to select C.[encoding] if possible, but there's no
 *      standardisation in locales between systems, so we'll need to survey
 *      what's out there first.
 */
void
msetlocale(void)
{
  const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
  char *cp;
  int i;

  /*
   * We can't yet cope with dotless/dotted I in Turkish locales,
   * so fall back to the C locale for these.
   */
  for (i = 0; vars[i] != NULL; i++) {
    /*
     * An empty variable counts as unset, just like for setlocale():
     * otherwise LC_ALL="" would end the scan here while
     * setlocale(LC_CTYPE, "") goes on to honour a Turkish LANG.
     */
    if ((cp = getenv(vars[i])) == NULL || *cp == '\0')
      continue;
    if (strncasecmp(cp, "TR", 2) != 0)
      break;
    /*
     * If we're in a UTF-8 locale then prefer to use
     * the C.UTF-8 locale (or equivalent) if it exists.
     */
    if ((strcasestr(cp, "UTF-8") != NULL ||
        strcasestr(cp, "UTF8") != NULL) &&
        (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
        setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
      return;
    setlocale(LC_CTYPE, "C");
    return;
  }
  /* We can handle this locale */
  setlocale(LC_CTYPE, "");
}

Coverage Report

Created: 2025-07-12 06:14

Line	Count	Source (jump to first uncovered line)
1		/* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */
2		/*
3		* Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4		*
5		* Permission to use, copy, modify, and distribute this software for any
6		* purpose with or without fee is hereby granted, provided that the above
7		* copyright notice and this permission notice appear in all copies.
8		*
9		* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10		* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11		* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12		* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13		* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14		* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15		* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16		*/
17
18		/*
19		* Utility functions for multibyte-character handling,
20		* in particular to sanitize untrusted strings for terminal output.
21		*/
22
23		#include "includes.h"
24
25		#include <sys/types.h>
26		#ifdef HAVE_LANGINFO_H
27		# include <langinfo.h>
28		#endif
29		#include <limits.h>
30		#include <locale.h>
31		#include <stdarg.h>
32		#include <stdio.h>
33		#include <stdlib.h>
34		#include <string.h>
35		#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
36		# include <vis.h>
37		#endif
38		#ifdef HAVE_WCHAR_H
39		# include <wchar.h>
40		#endif
41
42		#include "utf8.h"
43
44		static int dangerous_locale(void);
45		static int grow_dst(char *, size_t , size_t, char **, size_t);
46
47
48		/*
49		* For US-ASCII and UTF-8 encodings, we can safely recover from
50		* encoding errors and from non-printable characters. For any
51		* other encodings, err to the side of caution and abort parsing:
52		* For state-dependent encodings, recovery is impossible.
53		* For arbitrary encodings, replacement of non-printable
54		* characters would be non-trivial and too fragile.
55		* The comments indicate what nl_langinfo(CODESET)
56		* returns for US-ASCII on various operating systems.
57		*/
58
59		static int
60	0	dangerous_locale(void) {
61	0	char *loc;
62
63	0	loc = nl_langinfo(CODESET);
64	0	return strcmp(loc, "UTF-8") != 0 &&
65	0	strcmp(loc, "US-ASCII") != 0 && /* OpenBSD */
66	0	strcmp(loc, "ANSI_X3.4-1968") != 0 && /* Linux */
67	0	strcmp(loc, "ISO8859-1") != 0 && /* AIX */
68	0	strcmp(loc, "646") != 0 && /* Solaris, NetBSD */
69	0	strcmp(loc, "") != 0; /* Solaris 6 */
70	0	}
71
72		static int
73		grow_dst(char *dst, size_t sz, size_t maxsz, char **dp, size_t need)
74	0	{
75	0	char *tp;
76	0	size_t tsz;
77
78	0	if (dp + need < dst + *sz)
79	0	return 0;
80	0	tsz = *sz + 128;
81	0	if (tsz > maxsz)
82	0	tsz = maxsz;
83	0	if ((tp = recallocarray(dst, sz, tsz, 1)) == NULL)
84	0	return -1;
85	0	dp = tp + (dp - *dst);
86	0	*dst = tp;
87	0	*sz = tsz;
88	0	return 0;
89	0	}
90
91		/*
92		* The following two functions limit the number of bytes written,
93		* including the terminating '\0', to sz. Unless wp is NULL,
94		* they limit the number of display columns occupied to *wp.
95		* Whichever is reached first terminates the output string.
96		* To stay close to the standard interfaces, they return the number of
97		* non-NUL bytes that would have been written if both were unlimited.
98		* If wp is NULL, newline, carriage return, and tab are allowed;
99		* otherwise, the actual number of columns occupied by what was
100		* written is returned in *wp.
101		*/
102
103		int
104		vasnmprintf(char *str, size_t maxsz, int wp, const char *fmt, va_list ap)
105	0	{
106	0	char src; / Source string returned from vasprintf. */
107	0	char sp; / Pointer into src. */
108	0	char dst; / Destination string to be returned. */
109	0	char dp; / Pointer into dst. */
110	0	char tp; / Temporary pointer for dst. */
111	0	size_t sz; /* Number of bytes allocated for dst. */
112	0	wchar_t wc; /* Wide character at sp. */
113	0	int len; /* Number of bytes in the character at sp. */
114	0	int ret; /* Number of bytes needed to format src. */
115	0	int width; /* Display width of the character wc. */
116	0	int total_width, max_width, print;
117
118	0	src = NULL;
119	0	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
120	0	goto fail;
121
122	0	sz = strlen(src) + 1;
123	0	if ((dst = malloc(sz)) == NULL) {
124	0	free(src);
125	0	ret = -1;
126	0	goto fail;
127	0	}
128
129	0	if (maxsz > INT_MAX)
130	0	maxsz = INT_MAX;
131
132	0	sp = src;
133	0	dp = dst;
134	0	ret = 0;
135	0	print = 1;
136	0	total_width = 0;
137	0	max_width = wp == NULL ? INT_MAX : *wp;
138	0	while (*sp != '\0') {
139	0	if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
140	0	(void)mbtowc(NULL, NULL, MB_CUR_MAX);
141	0	if (dangerous_locale()) {
142	0	ret = -1;
143	0	break;
144	0	}
145	0	len = 1;
146	0	width = -1;
147	0	} else if (wp == NULL &&
148	0	(wc == L'\n' \|\| wc == L'\r' \|\| wc == L'\t')) {
149		/*
150		* Don't use width uninitialized; the actual
151		* value doesn't matter because total_width
152		* is only returned for wp != NULL.
153		*/
154	0	width = 0;
155	0	} else if ((width = wcwidth(wc)) == -1 &&
156	0	dangerous_locale()) {
157	0	ret = -1;
158	0	break;
159	0	}
160
161		/* Valid, printable character. */
162
163	0	if (width >= 0) {
164	0	if (print && (dp - dst >= (int)maxsz - len \|\|
165	0	total_width > max_width - width))
166	0	print = 0;
167	0	if (print) {
168	0	if (grow_dst(&dst, &sz, maxsz,
169	0	&dp, len) == -1) {
170	0	ret = -1;
171	0	break;
172	0	}
173	0	total_width += width;
174	0	memcpy(dp, sp, len);
175	0	dp += len;
176	0	}
177	0	sp += len;
178	0	if (ret >= 0)
179	0	ret += len;
180	0	continue;
181	0	}
182
183		/* Escaping required. */
184
185	0	while (len > 0) {
186	0	if (print && (dp - dst >= (int)maxsz - 4 \|\|
187	0	total_width > max_width - 4))
188	0	print = 0;
189	0	if (print) {
190	0	if (grow_dst(&dst, &sz, maxsz,
191	0	&dp, 4) == -1) {
192	0	ret = -1;
193	0	break;
194	0	}
195	0	tp = vis(dp, *sp, VIS_OCTAL \| VIS_ALL, 0);
196	0	width = tp - dp;
197	0	total_width += width;
198	0	dp = tp;
199	0	} else
200	0	width = 4;
201	0	len--;
202	0	sp++;
203	0	if (ret >= 0)
204	0	ret += width;
205	0	}
206	0	if (len > 0)
207	0	break;
208	0	}
209	0	free(src);
210	0	*dp = '\0';
211	0	*str = dst;
212	0	if (wp != NULL)
213	0	*wp = total_width;
214
215		/*
216		* If the string was truncated by the width limit but
217		* would have fit into the size limit, the only sane way
218		* to report the problem is using the return value, such
219		* that the usual idiom "if (ret < 0 \|\| ret >= sz) error"
220		* works as expected.
221		*/
222
223	0	if (ret < (int)maxsz && !print)
224	0	ret = -1;
225	0	return ret;
226
227	0	fail:
228	0	if (wp != NULL)
229	0	*wp = 0;
230	0	if (ret == 0) {
231	0	*str = src;
232	0	return 0;
233	0	} else {
234	0	*str = NULL;
235	0	return -1;
236	0	}
237	0	}
238
239		int
240		snmprintf(char str, size_t sz, int wp, const char *fmt, ...)
241	0	{
242	0	va_list ap;
243	0	char *cp = NULL;
244	0	int ret;
245
246	0	va_start(ap, fmt);
247	0	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
248	0	va_end(ap);
249	0	if (cp != NULL) {
250	0	(void)strlcpy(str, cp, sz);
251	0	free(cp);
252	0	} else
253	0	*str = '\0';
254	0	return ret;
255	0	}
256
257		int
258		asmprintf(char *outp, size_t sz, int wp, const char *fmt, ...)
259	0	{
260	0	va_list ap;
261	0	int ret;
262
263	0	*outp = NULL;
264	0	va_start(ap, fmt);
265	0	ret = vasnmprintf(outp, sz, wp, fmt, ap);
266	0	va_end(ap);
267
268	0	return ret;
269	0	}
270
271		/*
272		* To stay close to the standard interfaces, the following functions
273		* return the number of non-NUL bytes written.
274		*/
275
276		int
277		vfmprintf(FILE stream, const char fmt, va_list ap)
278	0	{
279	0	char *str = NULL;
280	0	int ret;
281
282	0	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
283	0	free(str);
284	0	return -1;
285	0	}
286	0	if (fputs(str, stream) == EOF)
287	0	ret = -1;
288	0	free(str);
289	0	return ret;
290	0	}
291
292		int
293		fmprintf(FILE stream, const char fmt, ...)
294	0	{
295	0	va_list ap;
296	0	int ret;
297
298	0	va_start(ap, fmt);
299	0	ret = vfmprintf(stream, fmt, ap);
300	0	va_end(ap);
301	0	return ret;
302	0	}
303
304		int
305		mprintf(const char *fmt, ...)
306	0	{
307	0	va_list ap;
308	0	int ret;
309
310	0	va_start(ap, fmt);
311	0	ret = vfmprintf(stdout, fmt, ap);
312	0	va_end(ap);
313	0	return ret;
314	0	}
315
316		/*
317		* Set up libc for multibyte output in the user's chosen locale.
318		*
319		* XXX: we are known to have problems with Turkish (i/I confusion) so we
320		* deliberately fall back to the C locale for now. Longer term we should
321		* always prefer to select C.[encoding] if possible, but there's no
322		* standardisation in locales between systems, so we'll need to survey
323		* what's out there first.
324		*/
325		void
326		msetlocale(void)
327	0	{
328	0	const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
329	0	char *cp;
330	0	int i;
331
332		/*
333		* We can't yet cope with dotless/dotted I in Turkish locales,
334		* so fall back to the C locale for these.
335		*/
336	0	for (i = 0; vars[i] != NULL; i++) {
337	0	if ((cp = getenv(vars[i])) == NULL)
338	0	continue;
339	0	if (strncasecmp(cp, "TR", 2) != 0)
340	0	break;
341		/*
342		* If we're in a UTF-8 locale then prefer to use
343		* the C.UTF-8 locale (or equivalent) if it exists.
344		*/
345	0	if ((strcasestr(cp, "UTF-8") != NULL \|\|
346	0	strcasestr(cp, "UTF8") != NULL) &&
347	0	(setlocale(LC_CTYPE, "C.UTF-8") != NULL \|\|
348	0	setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
349	0	return;
350	0	setlocale(LC_CTYPE, "C");
351	0	return;
352	0	}
353		/* We can handle this locale */
354	0	setlocale(LC_CTYPE, "");
355	0	}