Coverage Report

Created: 2025-08-24 07:01

/src/tmux/utf8.c
Line
Count
Source (jump to first uncovered line)
1
/* $OpenBSD$ */
2
3
/*
4
 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5
 *
6
 * Permission to use, copy, modify, and distribute this software for any
7
 * purpose with or without fee is hereby granted, provided that the above
8
 * copyright notice and this permission notice appear in all copies.
9
 *
10
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
 */
18
19
#include <sys/types.h>
20
21
#include <ctype.h>
22
#include <errno.h>
23
#include <stdlib.h>
24
#include <string.h>
25
#include <wchar.h>
26
27
#include "compat.h"
28
#include "tmux.h"
29
30
/*
 * One entry in the codepoint width cache tree, keyed by wc. The allocated
 * member is set to 1 on entries created with xcalloc so that they are freed
 * when removed; entries from the static default table leave it zero.
 */
struct utf8_width_item {
31
  wchar_t       wc;
32
  u_int       width;
33
  int       allocated;
34
35
  RB_ENTRY(utf8_width_item) entry;
36
};
37
38
static int
39
utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2)
40
0
{
41
0
  if (uw1->wc < uw2->wc)
42
0
    return (-1);
43
0
  if (uw1->wc > uw2->wc)
44
0
    return (1);
45
0
  return (0);
46
0
}
47
RB_HEAD(utf8_width_cache, utf8_width_item);
48
RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry,
49
    utf8_width_cache_cmp);
50
static struct utf8_width_cache utf8_width_cache =
51
    RB_INITIALIZER(utf8_width_cache);
52
53
static struct utf8_width_item utf8_default_width_cache[] = {
54
  { .wc = 0x0261D, .width = 2 },
55
  { .wc = 0x026F9, .width = 2 },
56
  { .wc = 0x0270A, .width = 2 },
57
  { .wc = 0x0270B, .width = 2 },
58
  { .wc = 0x0270C, .width = 2 },
59
  { .wc = 0x0270D, .width = 2 },
60
  { .wc = 0x1F1E6, .width = 2 },
61
  { .wc = 0x1F1E7, .width = 2 },
62
  { .wc = 0x1F1E8, .width = 2 },
63
  { .wc = 0x1F1E9, .width = 2 },
64
  { .wc = 0x1F1EA, .width = 2 },
65
  { .wc = 0x1F1EB, .width = 2 },
66
  { .wc = 0x1F1EC, .width = 2 },
67
  { .wc = 0x1F1ED, .width = 2 },
68
  { .wc = 0x1F1EE, .width = 2 },
69
  { .wc = 0x1F1EF, .width = 2 },
70
  { .wc = 0x1F1F0, .width = 2 },
71
  { .wc = 0x1F1F1, .width = 2 },
72
  { .wc = 0x1F1F2, .width = 2 },
73
  { .wc = 0x1F1F3, .width = 2 },
74
  { .wc = 0x1F1F4, .width = 2 },
75
  { .wc = 0x1F1F5, .width = 2 },
76
  { .wc = 0x1F1F6, .width = 2 },
77
  { .wc = 0x1F1F7, .width = 2 },
78
  { .wc = 0x1F1F8, .width = 2 },
79
  { .wc = 0x1F1F9, .width = 2 },
80
  { .wc = 0x1F1FA, .width = 2 },
81
  { .wc = 0x1F1FB, .width = 2 },
82
  { .wc = 0x1F1FC, .width = 2 },
83
  { .wc = 0x1F1FD, .width = 2 },
84
  { .wc = 0x1F1FE, .width = 2 },
85
  { .wc = 0x1F1FF, .width = 2 },
86
  { .wc = 0x1F385, .width = 2 },
87
  { .wc = 0x1F3C2, .width = 2 },
88
  { .wc = 0x1F3C3, .width = 2 },
89
  { .wc = 0x1F3C4, .width = 2 },
90
  { .wc = 0x1F3C7, .width = 2 },
91
  { .wc = 0x1F3CA, .width = 2 },
92
  { .wc = 0x1F3CB, .width = 2 },
93
  { .wc = 0x1F3CC, .width = 2 },
94
  { .wc = 0x1F3FB, .width = 2 },
95
  { .wc = 0x1F3FC, .width = 2 },
96
  { .wc = 0x1F3FD, .width = 2 },
97
  { .wc = 0x1F3FE, .width = 2 },
98
  { .wc = 0x1F3FF, .width = 2 },
99
  { .wc = 0x1F442, .width = 2 },
100
  { .wc = 0x1F443, .width = 2 },
101
  { .wc = 0x1F446, .width = 2 },
102
  { .wc = 0x1F447, .width = 2 },
103
  { .wc = 0x1F448, .width = 2 },
104
  { .wc = 0x1F449, .width = 2 },
105
  { .wc = 0x1F44A, .width = 2 },
106
  { .wc = 0x1F44B, .width = 2 },
107
  { .wc = 0x1F44C, .width = 2 },
108
  { .wc = 0x1F44D, .width = 2 },
109
  { .wc = 0x1F44E, .width = 2 },
110
  { .wc = 0x1F44F, .width = 2 },
111
  { .wc = 0x1F450, .width = 2 },
112
  { .wc = 0x1F466, .width = 2 },
113
  { .wc = 0x1F467, .width = 2 },
114
  { .wc = 0x1F468, .width = 2 },
115
  { .wc = 0x1F469, .width = 2 },
116
  { .wc = 0x1F46B, .width = 2 },
117
  { .wc = 0x1F46C, .width = 2 },
118
  { .wc = 0x1F46D, .width = 2 },
119
  { .wc = 0x1F46E, .width = 2 },
120
  { .wc = 0x1F470, .width = 2 },
121
  { .wc = 0x1F471, .width = 2 },
122
  { .wc = 0x1F472, .width = 2 },
123
  { .wc = 0x1F473, .width = 2 },
124
  { .wc = 0x1F474, .width = 2 },
125
  { .wc = 0x1F475, .width = 2 },
126
  { .wc = 0x1F476, .width = 2 },
127
  { .wc = 0x1F477, .width = 2 },
128
  { .wc = 0x1F478, .width = 2 },
129
  { .wc = 0x1F47C, .width = 2 },
130
  { .wc = 0x1F481, .width = 2 },
131
  { .wc = 0x1F482, .width = 2 },
132
  { .wc = 0x1F483, .width = 2 },
133
  { .wc = 0x1F485, .width = 2 },
134
  { .wc = 0x1F486, .width = 2 },
135
  { .wc = 0x1F487, .width = 2 },
136
  { .wc = 0x1F48F, .width = 2 },
137
  { .wc = 0x1F491, .width = 2 },
138
  { .wc = 0x1F4AA, .width = 2 },
139
  { .wc = 0x1F574, .width = 2 },
140
  { .wc = 0x1F575, .width = 2 },
141
  { .wc = 0x1F57A, .width = 2 },
142
  { .wc = 0x1F590, .width = 2 },
143
  { .wc = 0x1F595, .width = 2 },
144
  { .wc = 0x1F596, .width = 2 },
145
  { .wc = 0x1F645, .width = 2 },
146
  { .wc = 0x1F646, .width = 2 },
147
  { .wc = 0x1F647, .width = 2 },
148
  { .wc = 0x1F64B, .width = 2 },
149
  { .wc = 0x1F64C, .width = 2 },
150
  { .wc = 0x1F64D, .width = 2 },
151
  { .wc = 0x1F64E, .width = 2 },
152
  { .wc = 0x1F64F, .width = 2 },
153
  { .wc = 0x1F6A3, .width = 2 },
154
  { .wc = 0x1F6B4, .width = 2 },
155
  { .wc = 0x1F6B5, .width = 2 },
156
  { .wc = 0x1F6B6, .width = 2 },
157
  { .wc = 0x1F6C0, .width = 2 },
158
  { .wc = 0x1F6CC, .width = 2 },
159
  { .wc = 0x1F90C, .width = 2 },
160
  { .wc = 0x1F90F, .width = 2 },
161
  { .wc = 0x1F918, .width = 2 },
162
  { .wc = 0x1F919, .width = 2 },
163
  { .wc = 0x1F91A, .width = 2 },
164
  { .wc = 0x1F91B, .width = 2 },
165
  { .wc = 0x1F91C, .width = 2 },
166
  { .wc = 0x1F91D, .width = 2 },
167
  { .wc = 0x1F91E, .width = 2 },
168
  { .wc = 0x1F91F, .width = 2 },
169
  { .wc = 0x1F926, .width = 2 },
170
  { .wc = 0x1F930, .width = 2 },
171
  { .wc = 0x1F931, .width = 2 },
172
  { .wc = 0x1F932, .width = 2 },
173
  { .wc = 0x1F933, .width = 2 },
174
  { .wc = 0x1F934, .width = 2 },
175
  { .wc = 0x1F935, .width = 2 },
176
  { .wc = 0x1F936, .width = 2 },
177
  { .wc = 0x1F937, .width = 2 },
178
  { .wc = 0x1F938, .width = 2 },
179
  { .wc = 0x1F939, .width = 2 },
180
  { .wc = 0x1F93D, .width = 2 },
181
  { .wc = 0x1F93E, .width = 2 },
182
  { .wc = 0x1F977, .width = 2 },
183
  { .wc = 0x1F9B5, .width = 2 },
184
  { .wc = 0x1F9B6, .width = 2 },
185
  { .wc = 0x1F9B8, .width = 2 },
186
  { .wc = 0x1F9B9, .width = 2 },
187
  { .wc = 0x1F9BB, .width = 2 },
188
  { .wc = 0x1F9CD, .width = 2 },
189
  { .wc = 0x1F9CE, .width = 2 },
190
  { .wc = 0x1F9CF, .width = 2 },
191
  { .wc = 0x1F9D1, .width = 2 },
192
  { .wc = 0x1F9D2, .width = 2 },
193
  { .wc = 0x1F9D3, .width = 2 },
194
  { .wc = 0x1F9D4, .width = 2 },
195
  { .wc = 0x1F9D5, .width = 2 },
196
  { .wc = 0x1F9D6, .width = 2 },
197
  { .wc = 0x1F9D7, .width = 2 },
198
  { .wc = 0x1F9D8, .width = 2 },
199
  { .wc = 0x1F9D9, .width = 2 },
200
  { .wc = 0x1F9DA, .width = 2 },
201
  { .wc = 0x1F9DB, .width = 2 },
202
  { .wc = 0x1F9DC, .width = 2 },
203
  { .wc = 0x1F9DD, .width = 2 },
204
  { .wc = 0x1FAC3, .width = 2 },
205
  { .wc = 0x1FAC4, .width = 2 },
206
  { .wc = 0x1FAC5, .width = 2 },
207
  { .wc = 0x1FAF0, .width = 2 },
208
  { .wc = 0x1FAF1, .width = 2 },
209
  { .wc = 0x1FAF2, .width = 2 },
210
  { .wc = 0x1FAF3, .width = 2 },
211
  { .wc = 0x1FAF4, .width = 2 },
212
  { .wc = 0x1FAF5, .width = 2 },
213
  { .wc = 0x1FAF6, .width = 2 },
214
  { .wc = 0x1FAF7, .width = 2 },
215
  { .wc = 0x1FAF8, .width = 2 }
216
};
217
218
/*
 * A stored UTF-8 character. Each item is linked into two trees at once: one
 * ordered by index (index_entry) and one ordered by size and data bytes
 * (data_entry).
 */
struct utf8_item {
219
  RB_ENTRY(utf8_item) index_entry;
220
  u_int     index;
221
222
  RB_ENTRY(utf8_item) data_entry;
223
  char      data[UTF8_SIZE];
224
  u_char      size;
225
};
226
227
static int
228
utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
229
0
{
230
0
  if (ui1->size < ui2->size)
231
0
    return (-1);
232
0
  if (ui1->size > ui2->size)
233
0
    return (1);
234
0
  return (memcmp(ui1->data, ui2->data, ui1->size));
235
0
}
236
RB_HEAD(utf8_data_tree, utf8_item);
237
RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
238
static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
239
240
static int
241
utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
242
0
{
243
0
  if (ui1->index < ui2->index)
244
0
    return (-1);
245
0
  if (ui1->index > ui2->index)
246
0
    return (1);
247
0
  return (0);
248
0
}
249
RB_HEAD(utf8_index_tree, utf8_item);
250
RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
251
static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
252
253
static int  utf8_no_width;
254
static u_int  utf8_next_index;
255
256
3.25k
/*
 * Accessors for a packed utf8_char. The size is kept in the five bits
 * starting at bit 24 and the width, stored as width + 1, in the bits from
 * bit 29 upwards. The low 24 bits are not touched by these macros.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
257
3.25k
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
258
259
673k
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
260
673k
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
261
262
/* Get a UTF-8 item from data. */
263
static struct utf8_item *
264
utf8_item_by_data(const u_char *data, size_t size)
265
0
{
266
0
  struct utf8_item  ui;
267
268
0
  memcpy(ui.data, data, size);
269
0
  ui.size = size;
270
271
0
  return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
272
0
}
273
274
/* Get a UTF-8 item from data. */
275
static struct utf8_item *
276
utf8_item_by_index(u_int index)
277
0
{
278
0
  struct utf8_item  ui;
279
280
0
  ui.index = index;
281
282
0
  return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
283
0
}
284
285
/* Find a codepoint in the cache. */
286
static struct utf8_width_item *
287
utf8_find_in_width_cache(wchar_t wc)
288
0
{
289
0
  struct utf8_width_item  uw;
290
291
0
  uw.wc = wc;
292
0
  return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw);
293
0
}
294
295
/* Parse a single codepoint option. */
296
static void
297
utf8_add_to_width_cache(const char *s)
298
0
{
299
0
  struct utf8_width_item  *uw, *old;
300
0
  char      *copy, *cp, *endptr;
301
0
  u_int      width;
302
0
  const char    *errstr;
303
0
  struct utf8_data  *ud;
304
0
  wchar_t      wc;
305
0
  unsigned long long   n;
306
307
0
  copy = xstrdup(s);
308
0
  if ((cp = strchr(copy, '=')) == NULL) {
309
0
    free(copy);
310
0
    return;
311
0
  }
312
0
  *cp++ = '\0';
313
314
0
  width = strtonum(cp, 0, 2, &errstr);
315
0
  if (errstr != NULL) {
316
0
    free(copy);
317
0
    return;
318
0
  }
319
320
0
  if (strncmp(copy, "U+", 2) == 0) {
321
0
    errno = 0;
322
0
    n = strtoull(copy + 2, &endptr, 16);
323
0
    if (copy[2] == '\0' ||
324
0
        *endptr != '\0' ||
325
0
        n == 0 ||
326
0
        n > WCHAR_MAX ||
327
0
        (errno == ERANGE && n == ULLONG_MAX)) {
328
0
      free(copy);
329
0
      return;
330
0
    }
331
0
    wc = n;
332
0
  } else {
333
0
    utf8_no_width = 1;
334
0
    ud = utf8_fromcstr(copy);
335
0
    utf8_no_width = 0;
336
0
    if (ud[0].size == 0 || ud[1].size != 0) {
337
0
      free(ud);
338
0
      free(copy);
339
0
      return;
340
0
    }
341
#ifdef HAVE_UTF8PROC
342
    if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
343
#else
344
0
    if (mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
345
0
#endif
346
0
      free(ud);
347
0
      free(copy);
348
0
      return;
349
0
    }
350
0
    free(ud);
351
0
  }
352
353
0
  log_debug("Unicode width cache: %08X=%u", (u_int)wc, width);
354
355
0
  uw = xcalloc(1, sizeof *uw);
356
0
  uw->wc = wc;
357
0
  uw->width = width;
358
0
  uw->allocated = 1;
359
360
0
  old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
361
0
  if (old != NULL) {
362
0
    RB_REMOVE(utf8_width_cache, &utf8_width_cache, old);
363
0
    if (old->allocated)
364
0
      free(old);
365
0
    RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
366
0
  }
367
368
0
  free(copy);
369
0
}
370
371
/* Rebuild cache of widths. */
372
void
373
utf8_update_width_cache(void)
374
0
{
375
0
  struct utf8_width_item    *uw, *uw1;
376
0
  struct options_entry    *o;
377
0
  struct options_array_item *a;
378
0
  u_int        i;
379
380
0
  RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) {
381
0
    RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw);
382
0
    if (uw->allocated)
383
0
      free(uw);
384
0
  }
385
386
0
  for (i = 0; i < nitems(utf8_default_width_cache); i++) {
387
0
    RB_INSERT(utf8_width_cache, &utf8_width_cache,
388
0
        &utf8_default_width_cache[i]);
389
0
  }
390
391
0
  o = options_get(global_options, "codepoint-widths");
392
0
  a = options_array_first(o);
393
0
  while (a != NULL) {
394
0
    utf8_add_to_width_cache(options_array_item_value(a)->string);
395
0
    a = options_array_next(a);
396
0
  }
397
0
}
398
399
/* Add a UTF-8 item. */
400
static int
401
utf8_put_item(const u_char *data, size_t size, u_int *index)
402
0
{
403
0
  struct utf8_item  *ui;
404
405
0
  ui = utf8_item_by_data(data, size);
406
0
  if (ui != NULL) {
407
0
    *index = ui->index;
408
0
    log_debug("%s: found %.*s = %u", __func__, (int)size, data,
409
0
        *index);
410
0
    return (0);
411
0
  }
412
413
0
  if (utf8_next_index == 0xffffff + 1)
414
0
    return (-1);
415
416
0
  ui = xcalloc(1, sizeof *ui);
417
0
  ui->index = utf8_next_index++;
418
0
  RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
419
420
0
  memcpy(ui->data, data, size);
421
0
  ui->size = size;
422
0
  RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
423
424
0
  *index = ui->index;
425
0
  log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
426
0
  return (0);
427
0
}
428
429
/* Get UTF-8 character from data. */
430
enum utf8_state
431
utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
432
671k
{
433
671k
  u_int index;
434
435
671k
  if (ud->width > 2)
436
0
    fatalx("invalid UTF-8 width: %u", ud->width);
437
438
671k
  if (ud->size > UTF8_SIZE)
439
0
    goto fail;
440
671k
  if (ud->size <= 3) {
441
671k
    index = (((utf8_char)ud->data[2] << 16)|
442
671k
        ((utf8_char)ud->data[1] << 8)|
443
671k
        ((utf8_char)ud->data[0]));
444
671k
  } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
445
0
    goto fail;
446
671k
  *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
447
671k
  log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
448
671k
      (int)ud->size, ud->data, *uc);
449
671k
  return (UTF8_DONE);
450
451
0
fail:
452
0
  if (ud->width == 0)
453
0
    *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
454
0
  else if (ud->width == 1)
455
0
    *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
456
0
  else
457
0
    *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
458
0
  return (UTF8_ERROR);
459
671k
}
460
461
/* Get UTF-8 data from character. */
462
void
463
utf8_to_data(utf8_char uc, struct utf8_data *ud)
464
3.25k
{
465
3.25k
  struct utf8_item  *ui;
466
3.25k
  u_int      index;
467
468
3.25k
  memset(ud, 0, sizeof *ud);
469
3.25k
  ud->size = ud->have = UTF8_GET_SIZE(uc);
470
3.25k
  ud->width = UTF8_GET_WIDTH(uc);
471
472
3.25k
  if (ud->size <= 3) {
473
3.25k
    ud->data[2] = (uc >> 16);
474
3.25k
    ud->data[1] = ((uc >> 8) & 0xff);
475
3.25k
    ud->data[0] = (uc & 0xff);
476
3.25k
  } else {
477
0
    index = (uc & 0xffffff);
478
0
    if ((ui = utf8_item_by_index(index)) == NULL)
479
0
      memset(ud->data, ' ', ud->size);
480
0
    else
481
0
      memcpy(ud->data, ui->data, ud->size);
482
0
  }
483
484
3.25k
  log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
485
3.25k
      (int)ud->size, ud->data);
486
3.25k
}
487
488
/* Get UTF-8 character from a single ASCII character. */
489
u_int
490
utf8_build_one(u_char ch)
491
2.13k
{
492
2.13k
  return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
493
2.13k
}
494
495
/* Set a single character. */
496
void
497
utf8_set(struct utf8_data *ud, u_char ch)
498
113k
{
499
113k
  static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
500
501
113k
  memcpy(ud, &empty, sizeof *ud);
502
113k
  *ud->data = ch;
503
113k
}
504
505
/* Copy UTF-8 character. */
506
void
507
utf8_copy(struct utf8_data *to, const struct utf8_data *from)
508
17.5k
{
509
17.5k
  u_int i;
510
511
17.5k
  memcpy(to, from, sizeof *to);
512
513
557k
  for (i = to->size; i < sizeof to->data; i++)
514
540k
    to->data[i] = '\0';
515
17.5k
}
516
517
/* Get width of Unicode character. */
518
static enum utf8_state
519
utf8_width(struct utf8_data *ud, int *width)
520
696
{
521
696
  struct utf8_width_item  *uw;
522
696
  wchar_t      wc;
523
524
696
  if (utf8_towc(ud, &wc) != UTF8_DONE)
525
696
    return (UTF8_ERROR);
526
0
  uw = utf8_find_in_width_cache(wc);
527
0
  if (uw != NULL) {
528
0
    *width = uw->width;
529
0
    log_debug("cached width for %08X is %d", (u_int)wc, *width);
530
0
    return (UTF8_DONE);
531
0
  }
532
#ifdef HAVE_UTF8PROC
533
  *width = utf8proc_wcwidth(wc);
534
  log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
535
#else
536
0
  *width = wcwidth(wc);
537
0
  log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
538
0
  if (*width < 0) {
539
    /*
540
     * C1 control characters are nonprintable, so they are always
541
     * zero width.
542
     */
543
0
    *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
544
0
  }
545
0
#endif
546
0
  if (*width >= 0 && *width <= 0xff)
547
0
    return (UTF8_DONE);
548
0
  return (UTF8_ERROR);
549
0
}
550
551
/* Convert UTF-8 character to wide character. */
552
enum utf8_state
553
utf8_towc(const struct utf8_data *ud, wchar_t *wc)
554
3.85k
{
555
#ifdef HAVE_UTF8PROC
556
  switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
557
#else
558
3.85k
  switch (mbtowc(wc, ud->data, ud->size)) {
559
0
#endif
560
2.22k
  case -1:
561
2.22k
    log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
562
2.22k
        errno);
563
2.22k
    mbtowc(NULL, NULL, MB_CUR_MAX);
564
2.22k
    return (UTF8_ERROR);
565
0
  case 0:
566
0
    return (UTF8_ERROR);
567
3.85k
  }
568
1.62k
  log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
569
1.62k
  return (UTF8_DONE);
570
3.85k
}
571
572
/* Convert wide character to UTF-8 character. */
573
enum utf8_state
574
utf8_fromwc(wchar_t wc, struct utf8_data *ud)
575
0
{
576
0
  int size, width;
577
578
#ifdef HAVE_UTF8PROC
579
  size = utf8proc_wctomb(ud->data, wc);
580
#else
581
0
  size = wctomb(ud->data, wc);
582
0
#endif
583
0
  if (size < 0) {
584
0
    log_debug("UTF-8 %d, wctomb() %d", wc, errno);
585
0
    wctomb(NULL, 0);
586
0
    return (UTF8_ERROR);
587
0
  }
588
0
  if (size == 0)
589
0
    return (UTF8_ERROR);
590
0
  ud->size = ud->have = size;
591
0
  if (utf8_width(ud, &width) == UTF8_DONE) {
592
0
    ud->width = width;
593
0
    return (UTF8_DONE);
594
0
  }
595
0
  return (UTF8_ERROR);
596
0
}
597
598
/*
599
 * Open UTF-8 sequence.
600
 *
601
 * 11000010-11011111 C2-DF start of 2-byte sequence
602
 * 11100000-11101111 E0-EF start of 3-byte sequence
603
 * 11110000-11110100 F0-F4 start of 4-byte sequence
604
 */
605
enum utf8_state
606
utf8_open(struct utf8_data *ud, u_char ch)
607
165k
{
608
165k
  memset(ud, 0, sizeof *ud);
609
165k
  if (ch >= 0xc2 && ch <= 0xdf)
610
2.32k
    ud->size = 2;
611
163k
  else if (ch >= 0xe0 && ch <= 0xef)
612
662
    ud->size = 3;
613
162k
  else if (ch >= 0xf0 && ch <= 0xf4)
614
1.45k
    ud->size = 4;
615
160k
  else
616
160k
    return (UTF8_ERROR);
617
4.44k
  utf8_append(ud, ch);
618
4.44k
  return (UTF8_MORE);
619
165k
}
620
621
/* Append character to UTF-8, closing if finished. */
622
enum utf8_state
623
utf8_append(struct utf8_data *ud, u_char ch)
624
9.54k
{
625
9.54k
  int width;
626
627
9.54k
  if (ud->have >= ud->size)
628
0
    fatalx("UTF-8 character overflow");
629
9.54k
  if (ud->size > sizeof ud->data)
630
0
    fatalx("UTF-8 character size too large");
631
632
9.54k
  if (ud->have != 0 && (ch & 0xc0) != 0x80)
633
3.70k
    ud->width = 0xff;
634
635
9.54k
  ud->data[ud->have++] = ch;
636
9.54k
  if (ud->have != ud->size)
637
6.96k
    return (UTF8_MORE);
638
639
2.57k
  if (!utf8_no_width) {
640
2.57k
    if (ud->width == 0xff)
641
1.88k
      return (UTF8_ERROR);
642
696
    if (utf8_width(ud, &width) != UTF8_DONE)
643
696
      return (UTF8_ERROR);
644
0
    ud->width = width;
645
0
  }
646
647
0
  return (UTF8_DONE);
648
2.57k
}
649
650
/*
651
 * Encode len characters from src into dst, which is guaranteed to have four
652
 * bytes available for each character from src (for \abc or UTF-8) plus space
653
 * for \0.
654
 */
655
int
656
utf8_strvis(char *dst, const char *src, size_t len, int flag)
657
14.6k
{
658
14.6k
  struct utf8_data   ud;
659
14.6k
  const char    *start = dst, *end = src + len;
660
14.6k
  enum utf8_state    more;
661
14.6k
  size_t       i;
662
663
31.6k
  while (src < end) {
664
16.9k
    if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
665
5.44k
      while (++src < end && more == UTF8_MORE)
666
2.85k
        more = utf8_append(&ud, *src);
667
2.59k
      if (more == UTF8_DONE) {
668
        /* UTF-8 character finished. */
669
0
        for (i = 0; i < ud.size; i++)
670
0
          *dst++ = ud.data[i];
671
0
        continue;
672
0
      }
673
      /* Not a complete, valid UTF-8 character. */
674
2.59k
      src -= ud.have;
675
2.59k
    }
676
16.9k
    if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
677
0
      if (isalpha((u_char)src[1]) ||
678
0
          src[1] == '_' ||
679
0
          src[1] == '{')
680
0
        *dst++ = '\\';
681
0
      *dst++ = '$';
682
16.9k
    } else if (src < end - 1)
683
6.69k
      dst = vis(dst, src[0], flag, src[1]);
684
10.2k
    else if (src < end)
685
10.2k
      dst = vis(dst, src[0], flag, '\0');
686
16.9k
    src++;
687
16.9k
  }
688
14.6k
  *dst = '\0';
689
14.6k
  return (dst - start);
690
14.6k
}
691
692
/*
 * Same as utf8_strvis but allocate the buffer. The whole of src up to its
 * terminating \0 is encoded; the result is stored in *dst and its length is
 * returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
  size_t   srclen = strlen(src);
  char  *buf;
  int  len;

  /* Four bytes per source byte, including one for the \0. */
  buf = xreallocarray(NULL, 4, srclen + 1);
  len = utf8_strvis(buf, src, srclen, flag);

  /* Trim the buffer to what was actually written. */
  *dst = xrealloc(buf, len + 1);
  return (len);
}
705
706
/* Same as utf8_strvis but allocate the buffer. */
707
int
708
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
709
0
{
710
0
  char  *buf;
711
0
  int  len;
712
713
0
  buf = xreallocarray(NULL, 4, srclen + 1);
714
0
  len = utf8_strvis(buf, src, srclen, flag);
715
716
0
  *dst = xrealloc(buf, len + 1);
717
0
  return (len);
718
0
}
719
720
/* Does this string contain anything that isn't valid UTF-8? */
721
int
722
utf8_isvalid(const char *s)
723
17.4k
{
724
17.4k
  struct utf8_data ud;
725
17.4k
  const char  *end;
726
17.4k
  enum utf8_state  more;
727
728
17.4k
  end = s + strlen(s);
729
163k
  while (s < end) {
730
146k
    if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
731
1.32k
      while (++s < end && more == UTF8_MORE)
732
740
        more = utf8_append(&ud, *s);
733
582
      if (more == UTF8_DONE)
734
0
        continue;
735
582
      return (0);
736
582
    }
737
146k
    if (*s < 0x20 || *s > 0x7e)
738
530
      return (0);
739
145k
    s++;
740
145k
  }
741
16.3k
  return (1);
742
17.4k
}
743
744
/*
745
 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
746
 * the returned string. Anything not valid printable ASCII or UTF-8 is
747
 * stripped.
748
 */
749
char *
750
utf8_sanitize(const char *src)
751
0
{
752
0
  char    *dst = NULL;
753
0
  size_t     n = 0;
754
0
  enum utf8_state  more;
755
0
  struct utf8_data ud;
756
0
  u_int    i;
757
758
0
  while (*src != '\0') {
759
0
    dst = xreallocarray(dst, n + 1, sizeof *dst);
760
0
    if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
761
0
      while (*++src != '\0' && more == UTF8_MORE)
762
0
        more = utf8_append(&ud, *src);
763
0
      if (more == UTF8_DONE) {
764
0
        dst = xreallocarray(dst, n + ud.width,
765
0
            sizeof *dst);
766
0
        for (i = 0; i < ud.width; i++)
767
0
          dst[n++] = '_';
768
0
        continue;
769
0
      }
770
0
      src -= ud.have;
771
0
    }
772
0
    if (*src > 0x1f && *src < 0x7f)
773
0
      dst[n++] = *src;
774
0
    else
775
0
      dst[n++] = '_';
776
0
    src++;
777
0
  }
778
0
  dst = xreallocarray(dst, n + 1, sizeof *dst);
779
0
  dst[n] = '\0';
780
0
  return (dst);
781
0
}
782
783
/* Get UTF-8 buffer length. */
784
size_t
785
utf8_strlen(const struct utf8_data *s)
786
0
{
787
0
  size_t  i;
788
789
0
  for (i = 0; s[i].size != 0; i++)
790
0
    /* nothing */;
791
0
  return (i);
792
0
}
793
794
/* Get UTF-8 string width. */
795
u_int
796
utf8_strwidth(const struct utf8_data *s, ssize_t n)
797
0
{
798
0
  ssize_t i;
799
0
  u_int width = 0;
800
801
0
  for (i = 0; s[i].size != 0; i++) {
802
0
    if (n != -1 && n == i)
803
0
      break;
804
0
    width += s[i].width;
805
0
  }
806
0
  return (width);
807
0
}
808
809
/*
810
 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
811
 * Caller frees.
812
 */
813
struct utf8_data *
814
utf8_fromcstr(const char *src)
815
0
{
816
0
  struct utf8_data  *dst = NULL;
817
0
  size_t       n = 0;
818
0
  enum utf8_state    more;
819
820
0
  while (*src != '\0') {
821
0
    dst = xreallocarray(dst, n + 1, sizeof *dst);
822
0
    if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
823
0
      while (*++src != '\0' && more == UTF8_MORE)
824
0
        more = utf8_append(&dst[n], *src);
825
0
      if (more == UTF8_DONE) {
826
0
        n++;
827
0
        continue;
828
0
      }
829
0
      src -= dst[n].have;
830
0
    }
831
0
    utf8_set(&dst[n], *src);
832
0
    n++;
833
0
    src++;
834
0
  }
835
0
  dst = xreallocarray(dst, n + 1, sizeof *dst);
836
0
  dst[n].size = 0;
837
0
  return (dst);
838
0
}
839
840
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
841
char *
842
utf8_tocstr(struct utf8_data *src)
843
0
{
844
0
  char  *dst = NULL;
845
0
  size_t   n = 0;
846
847
0
  for(; src->size != 0; src++) {
848
0
    dst = xreallocarray(dst, n + src->size, 1);
849
0
    memcpy(dst + n, src->data, src->size);
850
0
    n += src->size;
851
0
  }
852
0
  dst = xreallocarray(dst, n + 1, 1);
853
0
  dst[n] = '\0';
854
0
  return (dst);
855
0
}
856
857
/* Get width of UTF-8 string. */
858
u_int
859
utf8_cstrwidth(const char *s)
860
0
{
861
0
  struct utf8_data  tmp;
862
0
  u_int     width;
863
0
  enum utf8_state   more;
864
865
0
  width = 0;
866
0
  while (*s != '\0') {
867
0
    if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
868
0
      while (*++s != '\0' && more == UTF8_MORE)
869
0
        more = utf8_append(&tmp, *s);
870
0
      if (more == UTF8_DONE) {
871
0
        width += tmp.width;
872
0
        continue;
873
0
      }
874
0
      s -= tmp.have;
875
0
    }
876
0
    if (*s > 0x1f && *s != 0x7f)
877
0
      width++;
878
0
    s++;
879
0
  }
880
0
  return (width);
881
0
}
882
883
/* Pad UTF-8 string to width on the left. Caller frees. */
884
char *
885
utf8_padcstr(const char *s, u_int width)
886
0
{
887
0
  size_t   slen;
888
0
  char  *out;
889
0
  u_int  n, i;
890
891
0
  n = utf8_cstrwidth(s);
892
0
  if (n >= width)
893
0
    return (xstrdup(s));
894
895
0
  slen = strlen(s);
896
0
  out = xmalloc(slen + 1 + (width - n));
897
0
  memcpy(out, s, slen);
898
0
  for (i = n; i < width; i++)
899
0
    out[slen++] = ' ';
900
0
  out[slen] = '\0';
901
0
  return (out);
902
0
}
903
904
/* Pad UTF-8 string to width on the right. Caller frees. */
905
char *
906
utf8_rpadcstr(const char *s, u_int width)
907
0
{
908
0
  size_t   slen;
909
0
  char  *out;
910
0
  u_int  n, i;
911
912
0
  n = utf8_cstrwidth(s);
913
0
  if (n >= width)
914
0
    return (xstrdup(s));
915
916
0
  slen = strlen(s);
917
0
  out = xmalloc(slen + 1 + (width - n));
918
0
  for (i = 0; i < width - n; i++)
919
0
    out[i] = ' ';
920
0
  memcpy(out + i, s, slen);
921
0
  out[i + slen] = '\0';
922
0
  return (out);
923
0
}
924
925
int
926
utf8_cstrhas(const char *s, const struct utf8_data *ud)
927
0
{
928
0
  struct utf8_data  *copy, *loop;
929
0
  int      found = 0;
930
931
0
  copy = utf8_fromcstr(s);
932
0
  for (loop = copy; loop->size != 0; loop++) {
933
0
    if (loop->size != ud->size)
934
0
      continue;
935
0
    if (memcmp(loop->data, ud->data, loop->size) == 0) {
936
0
      found = 1;
937
0
      break;
938
0
    }
939
0
  }
940
0
  free(copy);
941
942
0
  return (found);
943
0
}