Coverage Report

Created: 2026-06-10 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tmux/utf8.c
Line
Count
Source
1
/* $OpenBSD$ */
2
3
/*
4
 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5
 *
6
 * Permission to use, copy, modify, and distribute this software for any
7
 * purpose with or without fee is hereby granted, provided that the above
8
 * copyright notice and this permission notice appear in all copies.
9
 *
10
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
 */
18
19
#include <sys/types.h>
20
21
#include <ctype.h>
22
#include <errno.h>
23
#include <stdlib.h>
24
#include <string.h>
25
#include <wchar.h>
26
27
#include "compat.h"
28
#include "tmux.h"
29
30
struct utf8_width_item {
31
  wchar_t       wc;
32
  u_int       width;
33
  int       allocated;
34
35
  RB_ENTRY(utf8_width_item) entry;
36
};
37
38
static int
39
utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2)
40
0
{
41
0
  if (uw1->wc < uw2->wc)
42
0
    return (-1);
43
0
  if (uw1->wc > uw2->wc)
44
0
    return (1);
45
0
  return (0);
46
0
}
47
RB_HEAD(utf8_width_cache, utf8_width_item);
48
0
RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry,
Unexecuted instantiation: utf8.c:utf8_width_cache_RB_MINMAX
Unexecuted instantiation: utf8.c:utf8_width_cache_RB_REMOVE
Unexecuted instantiation: utf8.c:utf8_width_cache_RB_REMOVE_COLOR
Unexecuted instantiation: utf8.c:utf8_width_cache_RB_INSERT
Unexecuted instantiation: utf8.c:utf8_width_cache_RB_FIND
49
0
    utf8_width_cache_cmp);
50
0
static struct utf8_width_cache utf8_width_cache =
51
0
    RB_INITIALIZER(utf8_width_cache);
52
0
53
0
/*
 * Built-in width overrides. These items are static (allocated stays zero),
 * so they are never passed to free() when the cache is rebuilt.
 */
#define UTF8_W1(c) { .wc = (c), .width = 1 }
#define UTF8_W2(c) { .wc = (c), .width = 2 }
static struct utf8_width_item utf8_default_width_cache[] = {
  UTF8_W2(0x0261D), UTF8_W2(0x026F9), UTF8_W2(0x0270A), UTF8_W2(0x0270B),
  UTF8_W2(0x0270C), UTF8_W2(0x0270D),

  /* U+1F1E6 to U+1F1FF are forced to width 1. */
  UTF8_W1(0x1F1E6), UTF8_W1(0x1F1E7), UTF8_W1(0x1F1E8), UTF8_W1(0x1F1E9),
  UTF8_W1(0x1F1EA), UTF8_W1(0x1F1EB), UTF8_W1(0x1F1EC), UTF8_W1(0x1F1ED),
  UTF8_W1(0x1F1EE), UTF8_W1(0x1F1EF), UTF8_W1(0x1F1F0), UTF8_W1(0x1F1F1),
  UTF8_W1(0x1F1F2), UTF8_W1(0x1F1F3), UTF8_W1(0x1F1F4), UTF8_W1(0x1F1F5),
  UTF8_W1(0x1F1F6), UTF8_W1(0x1F1F7), UTF8_W1(0x1F1F8), UTF8_W1(0x1F1F9),
  UTF8_W1(0x1F1FA), UTF8_W1(0x1F1FB), UTF8_W1(0x1F1FC), UTF8_W1(0x1F1FD),
  UTF8_W1(0x1F1FE), UTF8_W1(0x1F1FF),

  UTF8_W2(0x1F385), UTF8_W2(0x1F3C2), UTF8_W2(0x1F3C3), UTF8_W2(0x1F3C4),
  UTF8_W2(0x1F3C7), UTF8_W2(0x1F3CA), UTF8_W2(0x1F3CB), UTF8_W2(0x1F3CC),
  UTF8_W2(0x1F3FB), UTF8_W2(0x1F3FC), UTF8_W2(0x1F3FD), UTF8_W2(0x1F3FE),
  UTF8_W2(0x1F3FF),

  UTF8_W2(0x1F442), UTF8_W2(0x1F443), UTF8_W2(0x1F446), UTF8_W2(0x1F447),
  UTF8_W2(0x1F448), UTF8_W2(0x1F449), UTF8_W2(0x1F44A), UTF8_W2(0x1F44B),
  UTF8_W2(0x1F44C), UTF8_W2(0x1F44D), UTF8_W2(0x1F44E), UTF8_W2(0x1F44F),
  UTF8_W2(0x1F450),

  UTF8_W2(0x1F466), UTF8_W2(0x1F467), UTF8_W2(0x1F468), UTF8_W2(0x1F469),
  UTF8_W2(0x1F46B), UTF8_W2(0x1F46C), UTF8_W2(0x1F46D), UTF8_W2(0x1F46E),
  UTF8_W2(0x1F470), UTF8_W2(0x1F471), UTF8_W2(0x1F472), UTF8_W2(0x1F473),
  UTF8_W2(0x1F474), UTF8_W2(0x1F475), UTF8_W2(0x1F476), UTF8_W2(0x1F477),
  UTF8_W2(0x1F478), UTF8_W2(0x1F47C),

  UTF8_W2(0x1F481), UTF8_W2(0x1F482), UTF8_W2(0x1F483), UTF8_W2(0x1F485),
  UTF8_W2(0x1F486), UTF8_W2(0x1F487), UTF8_W2(0x1F48F), UTF8_W2(0x1F491),
  UTF8_W2(0x1F4AA),

  UTF8_W2(0x1F574), UTF8_W2(0x1F575), UTF8_W2(0x1F57A), UTF8_W2(0x1F590),
  UTF8_W2(0x1F595), UTF8_W2(0x1F596),

  UTF8_W2(0x1F645), UTF8_W2(0x1F646), UTF8_W2(0x1F647), UTF8_W2(0x1F64B),
  UTF8_W2(0x1F64C), UTF8_W2(0x1F64D), UTF8_W2(0x1F64E), UTF8_W2(0x1F64F),

  UTF8_W2(0x1F6A3), UTF8_W2(0x1F6B4), UTF8_W2(0x1F6B5), UTF8_W2(0x1F6B6),
  UTF8_W2(0x1F6C0), UTF8_W2(0x1F6CC),

  UTF8_W2(0x1F90C), UTF8_W2(0x1F90F), UTF8_W2(0x1F918), UTF8_W2(0x1F919),
  UTF8_W2(0x1F91A), UTF8_W2(0x1F91B), UTF8_W2(0x1F91C), UTF8_W2(0x1F91D),
  UTF8_W2(0x1F91E), UTF8_W2(0x1F91F), UTF8_W2(0x1F926),

  UTF8_W2(0x1F930), UTF8_W2(0x1F931), UTF8_W2(0x1F932), UTF8_W2(0x1F933),
  UTF8_W2(0x1F934), UTF8_W2(0x1F935), UTF8_W2(0x1F936), UTF8_W2(0x1F937),
  UTF8_W2(0x1F938), UTF8_W2(0x1F939), UTF8_W2(0x1F93D), UTF8_W2(0x1F93E),
  UTF8_W2(0x1F977),

  UTF8_W2(0x1F9B5), UTF8_W2(0x1F9B6), UTF8_W2(0x1F9B8), UTF8_W2(0x1F9B9),
  UTF8_W2(0x1F9BB), UTF8_W2(0x1F9CD), UTF8_W2(0x1F9CE), UTF8_W2(0x1F9CF),

  UTF8_W2(0x1F9D1), UTF8_W2(0x1F9D2), UTF8_W2(0x1F9D3), UTF8_W2(0x1F9D4),
  UTF8_W2(0x1F9D5), UTF8_W2(0x1F9D6), UTF8_W2(0x1F9D7), UTF8_W2(0x1F9D8),
  UTF8_W2(0x1F9D9), UTF8_W2(0x1F9DA), UTF8_W2(0x1F9DB), UTF8_W2(0x1F9DC),
  UTF8_W2(0x1F9DD),

  UTF8_W2(0x1FAC3), UTF8_W2(0x1FAC4), UTF8_W2(0x1FAC5),

  UTF8_W2(0x1FAF0), UTF8_W2(0x1FAF1), UTF8_W2(0x1FAF2), UTF8_W2(0x1FAF3),
  UTF8_W2(0x1FAF4), UTF8_W2(0x1FAF5), UTF8_W2(0x1FAF6), UTF8_W2(0x1FAF7),
  UTF8_W2(0x1FAF8)
};
#undef UTF8_W1
#undef UTF8_W2
217
0
218
0
struct utf8_item {
219
0
  RB_ENTRY(utf8_item) index_entry;
220
0
  u_int     index;
221
0
222
0
  RB_ENTRY(utf8_item) data_entry;
223
0
  char      data[UTF8_SIZE];
224
0
  u_char      size;
225
0
};
226
0
227
0
static int
228
0
utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
229
0
{
230
0
  if (ui1->size < ui2->size)
231
0
    return (-1);
232
0
  if (ui1->size > ui2->size)
233
0
    return (1);
234
0
  return (memcmp(ui1->data, ui2->data, ui1->size));
235
0
}
236
RB_HEAD(utf8_data_tree, utf8_item);
237
0
RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
Unexecuted instantiation: utf8.c:utf8_data_tree_RB_FIND
Unexecuted instantiation: utf8.c:utf8_data_tree_RB_INSERT
238
0
static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
239
0
240
0
static int
241
0
utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
242
0
{
243
0
  if (ui1->index < ui2->index)
244
0
    return (-1);
245
0
  if (ui1->index > ui2->index)
246
0
    return (1);
247
0
  return (0);
248
0
}
249
RB_HEAD(utf8_index_tree, utf8_item);
250
0
RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
Unexecuted instantiation: utf8.c:utf8_index_tree_RB_INSERT
Unexecuted instantiation: utf8.c:utf8_index_tree_RB_FIND
251
0
static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
252
0
253
0
static int  utf8_no_width;
254
0
static u_int  utf8_next_index;
255
0
256
0
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
257
0
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
258
259
0
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
260
0
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
261
262
/* Get a UTF-8 item from data. */
263
static struct utf8_item *
264
utf8_item_by_data(const u_char *data, size_t size)
265
0
{
266
0
  struct utf8_item  ui;
267
268
0
  memcpy(ui.data, data, size);
269
0
  ui.size = size;
270
271
0
  return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
272
0
}
273
274
/* Get a UTF-8 item from data. */
275
static struct utf8_item *
276
utf8_item_by_index(u_int index)
277
0
{
278
0
  struct utf8_item  ui;
279
280
0
  ui.index = index;
281
282
0
  return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
283
0
}
284
285
/* Find a codepoint in the cache. */
286
static struct utf8_width_item *
287
utf8_find_in_width_cache(wchar_t wc)
288
0
{
289
0
  struct utf8_width_item  uw;
290
291
0
  uw.wc = wc;
292
0
  return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw);
293
0
}
294
295
/* Add to width cache. */
296
static void
297
utf8_insert_width_cache(wchar_t wc, u_int width)
298
0
{
299
0
  struct utf8_width_item  *uw, *old;
300
301
0
  log_debug("Unicode width cache: %08X=%u", (u_int)wc, width);
302
303
0
  uw = xcalloc(1, sizeof *uw);
304
0
  uw->wc = wc;
305
0
  uw->width = width;
306
0
  uw->allocated = 1;
307
308
0
  old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
309
0
  if (old != NULL) {
310
0
    RB_REMOVE(utf8_width_cache, &utf8_width_cache, old);
311
0
    if (old->allocated)
312
0
      free(old);
313
0
    RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
314
0
  }
315
0
}
316
317
/* Parse a single codepoint option. */
318
static void
319
utf8_add_to_width_cache(const char *s)
320
0
{
321
0
  char      *copy, *cp, *endptr;
322
0
  u_int      width;
323
0
  const char    *errstr;
324
0
  struct utf8_data  *ud;
325
0
  wchar_t      wc, wc_start, wc_end;
326
0
  unsigned long long   n;
327
328
0
  copy = xstrdup(s);
329
0
  if ((cp = strchr(copy, '=')) == NULL) {
330
0
    free(copy);
331
0
    return;
332
0
  }
333
0
  *cp++ = '\0';
334
335
0
  width = strtonum(cp, 0, 2, &errstr);
336
0
  if (errstr != NULL) {
337
0
    free(copy);
338
0
    return;
339
0
  }
340
341
0
  if (strncmp(copy, "U+", 2) == 0) {
342
0
    errno = 0;
343
0
    n = strtoull(copy + 2, &endptr, 16);
344
0
    if (copy[2] == '\0' ||
345
0
        n == 0 ||
346
0
        n > WCHAR_MAX ||
347
0
        (errno == ERANGE && n == ULLONG_MAX)) {
348
0
      free(copy);
349
0
      return;
350
0
    }
351
0
    wc_start = n;
352
0
    if (*endptr == '-') {
353
0
      endptr++;
354
0
      if (strncmp(endptr, "U+", 2) != 0) {
355
0
        free(copy);
356
0
        return;
357
0
      }
358
0
      errno = 0;
359
0
      n = strtoull(endptr + 2, &endptr, 16);
360
0
      if (*endptr != '\0' ||
361
0
          n == 0 ||
362
0
          n > WCHAR_MAX ||
363
0
          (errno == ERANGE && n == ULLONG_MAX) ||
364
0
          (wchar_t)n < wc_start) {
365
0
        free(copy);
366
0
        return;
367
0
      }
368
0
      wc_end = n;
369
0
    } else {
370
0
      if (*endptr != '\0') {
371
0
        free(copy);
372
0
        return;
373
0
      }
374
0
      wc_end = wc_start;
375
0
    }
376
377
0
    for (wc = wc_start; wc <= wc_end; wc++)
378
0
      utf8_insert_width_cache(wc, width);
379
0
  } else {
380
0
    utf8_no_width = 1;
381
0
    ud = utf8_fromcstr(copy);
382
0
    utf8_no_width = 0;
383
0
    if (ud[0].size == 0 || ud[1].size != 0) {
384
0
      free(ud);
385
0
      free(copy);
386
0
      return;
387
0
    }
388
#ifdef HAVE_UTF8PROC
389
    if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
390
#else
391
0
    if (mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
392
0
#endif
393
0
      free(ud);
394
0
      free(copy);
395
0
      return;
396
0
    }
397
0
    free(ud);
398
399
0
    utf8_insert_width_cache(wc, width);
400
0
  }
401
402
0
  free(copy);
403
0
}
404
405
/* Rebuild cache of widths. */
406
void
407
utf8_update_width_cache(void)
408
0
{
409
0
  struct utf8_width_item    *uw, *uw1;
410
0
  struct options_entry    *o;
411
0
  struct options_array_item *a;
412
0
  u_int        i;
413
414
0
  RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) {
415
0
    RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw);
416
0
    if (uw->allocated)
417
0
      free(uw);
418
0
  }
419
420
0
  for (i = 0; i < nitems(utf8_default_width_cache); i++) {
421
0
    RB_INSERT(utf8_width_cache, &utf8_width_cache,
422
0
        &utf8_default_width_cache[i]);
423
0
  }
424
425
0
  o = options_get(global_options, "codepoint-widths");
426
0
  a = options_array_first(o);
427
0
  while (a != NULL) {
428
0
    utf8_add_to_width_cache(options_array_item_value(a)->string);
429
0
    a = options_array_next(a);
430
0
  }
431
0
}
432
433
/* Add a UTF-8 item. */
434
static int
435
utf8_put_item(const u_char *data, size_t size, u_int *index)
436
0
{
437
0
  struct utf8_item  *ui;
438
439
0
  ui = utf8_item_by_data(data, size);
440
0
  if (ui != NULL) {
441
0
    *index = ui->index;
442
0
    log_debug("%s: found %.*s = %u", __func__, (int)size, data,
443
0
        *index);
444
0
    return (0);
445
0
  }
446
447
0
  if (utf8_next_index == 0xffffff + 1)
448
0
    return (-1);
449
450
0
  ui = xcalloc(1, sizeof *ui);
451
0
  ui->index = utf8_next_index++;
452
0
  RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
453
454
0
  memcpy(ui->data, data, size);
455
0
  ui->size = size;
456
0
  RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
457
458
0
  *index = ui->index;
459
0
  log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
460
0
  return (0);
461
0
}
462
463
/* Get UTF-8 character from data. */
464
enum utf8_state
465
utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
466
0
{
467
0
  u_int index;
468
469
0
  if (ud->width > 2)
470
0
    fatalx("invalid UTF-8 width: %u", ud->width);
471
472
0
  if (ud->size > UTF8_SIZE)
473
0
    goto fail;
474
0
  if (ud->size <= 3) {
475
0
    index = (((utf8_char)ud->data[2] << 16)|
476
0
        ((utf8_char)ud->data[1] << 8)|
477
0
        ((utf8_char)ud->data[0]));
478
0
  } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
479
0
    goto fail;
480
0
  *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
481
0
  log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
482
0
      (int)ud->size, ud->data, *uc);
483
0
  return (UTF8_DONE);
484
485
0
fail:
486
0
  if (ud->width == 0)
487
0
    *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
488
0
  else if (ud->width == 1)
489
0
    *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
490
0
  else
491
0
    *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
492
0
  return (UTF8_ERROR);
493
0
}
494
495
/* Get UTF-8 data from character. */
496
void
497
utf8_to_data(utf8_char uc, struct utf8_data *ud)
498
0
{
499
0
  struct utf8_item  *ui;
500
0
  u_int      index;
501
502
0
  memset(ud, 0, sizeof *ud);
503
0
  ud->size = ud->have = UTF8_GET_SIZE(uc);
504
0
  ud->width = UTF8_GET_WIDTH(uc);
505
506
0
  if (ud->size <= 3) {
507
0
    ud->data[2] = (uc >> 16);
508
0
    ud->data[1] = ((uc >> 8) & 0xff);
509
0
    ud->data[0] = (uc & 0xff);
510
0
  } else {
511
0
    index = (uc & 0xffffff);
512
0
    if ((ui = utf8_item_by_index(index)) == NULL)
513
0
      memset(ud->data, ' ', ud->size);
514
0
    else
515
0
      memcpy(ud->data, ui->data, ud->size);
516
0
  }
517
518
0
  log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
519
0
      (int)ud->size, ud->data);
520
0
}
521
522
/* Get UTF-8 character from a single ASCII character. */
523
u_int
524
utf8_build_one(u_char ch)
525
0
{
526
0
  return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
527
0
}
528
529
/* Set a single character. */
530
void
531
utf8_set(struct utf8_data *ud, u_char ch)
532
0
{
533
0
  static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
534
535
0
  memcpy(ud, &empty, sizeof *ud);
536
0
  *ud->data = ch;
537
0
}
538
539
/* Copy UTF-8 character. */
540
void
541
utf8_copy(struct utf8_data *to, const struct utf8_data *from)
542
0
{
543
0
  u_int i;
544
545
0
  memcpy(to, from, sizeof *to);
546
547
0
  for (i = to->size; i < sizeof to->data; i++)
548
0
    to->data[i] = '\0';
549
0
}
550
551
/* Get width of Unicode character. */
552
static enum utf8_state
553
utf8_width(struct utf8_data *ud, int *width)
554
0
{
555
0
  struct utf8_width_item  *uw;
556
0
  wchar_t      wc;
557
558
0
  if (utf8_towc(ud, &wc) != UTF8_DONE)
559
0
    return (UTF8_ERROR);
560
0
  uw = utf8_find_in_width_cache(wc);
561
0
  if (uw != NULL) {
562
0
    *width = uw->width;
563
0
    log_debug("cached width for %08X is %d", (u_int)wc, *width);
564
0
    return (UTF8_DONE);
565
0
  }
566
#ifdef HAVE_UTF8PROC
567
  *width = utf8proc_wcwidth(wc);
568
  log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
569
#else
570
0
  *width = wcwidth(wc);
571
0
  log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
572
0
  if (*width < 0) {
573
    /*
574
     * C1 control characters are nonprintable, so they are always
575
     * zero width.
576
     */
577
0
    *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
578
0
  }
579
0
#endif
580
0
  if (*width >= 0 && *width <= 0xff)
581
0
    return (UTF8_DONE);
582
0
  return (UTF8_ERROR);
583
0
}
584
585
/* Convert UTF-8 character to wide character. */
586
enum utf8_state
587
utf8_towc(const struct utf8_data *ud, wchar_t *wc)
588
0
{
589
#ifdef HAVE_UTF8PROC
590
  switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
591
#else
592
0
  switch (mbtowc(wc, ud->data, ud->size)) {
593
0
#endif
594
0
  case -1:
595
0
    log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
596
0
        errno);
597
0
    mbtowc(NULL, NULL, MB_CUR_MAX);
598
0
    return (UTF8_ERROR);
599
0
  case 0:
600
0
    return (UTF8_ERROR);
601
0
  }
602
0
  log_debug("UTF-8 %.*s is U+%06X", (int)ud->size, ud->data, (u_int)*wc);
603
0
  return (UTF8_DONE);
604
0
}
605
606
/* Convert wide character to UTF-8 character. */
607
enum utf8_state
608
utf8_fromwc(wchar_t wc, struct utf8_data *ud)
609
0
{
610
0
  int size, width;
611
612
#ifdef HAVE_UTF8PROC
613
  size = utf8proc_wctomb(ud->data, wc);
614
#else
615
0
  size = wctomb(ud->data, wc);
616
0
#endif
617
0
  if (size < 0) {
618
0
    log_debug("UTF-8 %d, wctomb() %d", wc, errno);
619
0
    wctomb(NULL, 0);
620
0
    return (UTF8_ERROR);
621
0
  }
622
0
  if (size == 0)
623
0
    return (UTF8_ERROR);
624
0
  ud->size = ud->have = size;
625
0
  if (utf8_width(ud, &width) == UTF8_DONE) {
626
0
    ud->width = width;
627
0
    return (UTF8_DONE);
628
0
  }
629
0
  return (UTF8_ERROR);
630
0
}
631
632
/*
633
 * Open UTF-8 sequence.
634
 *
635
 * 11000010-11011111 C2-DF start of 2-byte sequence
636
 * 11100000-11101111 E0-EF start of 3-byte sequence
637
 * 11110000-11110100 F0-F4 start of 4-byte sequence
638
 */
639
enum utf8_state
640
utf8_open(struct utf8_data *ud, u_char ch)
641
0
{
642
0
  memset(ud, 0, sizeof *ud);
643
0
  if (ch >= 0xc2 && ch <= 0xdf)
644
0
    ud->size = 2;
645
0
  else if (ch >= 0xe0 && ch <= 0xef)
646
0
    ud->size = 3;
647
0
  else if (ch >= 0xf0 && ch <= 0xf4)
648
0
    ud->size = 4;
649
0
  else
650
0
    return (UTF8_ERROR);
651
0
  utf8_append(ud, ch);
652
0
  return (UTF8_MORE);
653
0
}
654
655
/* Append character to UTF-8, closing if finished. */
656
enum utf8_state
657
utf8_append(struct utf8_data *ud, u_char ch)
658
0
{
659
0
  int width;
660
661
0
  if (ud->have >= ud->size)
662
0
    fatalx("UTF-8 character overflow");
663
0
  if (ud->size > sizeof ud->data)
664
0
    fatalx("UTF-8 character size too large");
665
666
0
  if (ud->have != 0 && (ch & 0xc0) != 0x80)
667
0
    ud->width = 0xff;
668
669
0
  ud->data[ud->have++] = ch;
670
0
  if (ud->have != ud->size)
671
0
    return (UTF8_MORE);
672
673
0
  if (!utf8_no_width) {
674
0
    if (ud->width == 0xff)
675
0
      return (UTF8_ERROR);
676
0
    if (utf8_width(ud, &width) != UTF8_DONE)
677
0
      return (UTF8_ERROR);
678
0
    ud->width = width;
679
0
  }
680
681
0
  return (UTF8_DONE);
682
0
}
683
684
/*
685
 * Encode len characters from src into dst, which is guaranteed to have four
686
 * bytes available for each character from src (for \abc or UTF-8) plus space
687
 * for \0.
688
 */
689
size_t
690
utf8_strvis(char *dst, const char *src, size_t len, int flag)
691
0
{
692
0
  struct utf8_data   ud;
693
0
  const char    *start = dst, *end = src + len;
694
0
  enum utf8_state    more;
695
0
  size_t       i;
696
697
0
  while (src < end) {
698
0
    if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
699
0
      while (++src < end && more == UTF8_MORE)
700
0
        more = utf8_append(&ud, *src);
701
0
      if (more == UTF8_DONE) {
702
        /* UTF-8 character finished. */
703
0
        for (i = 0; i < ud.size; i++)
704
0
          *dst++ = ud.data[i];
705
0
        continue;
706
0
      }
707
      /* Not a complete, valid UTF-8 character. */
708
0
      src -= ud.have;
709
0
    }
710
0
    if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
711
0
      if (isalpha((u_char)src[1]) ||
712
0
          src[1] == '_' ||
713
0
          src[1] == '{')
714
0
        *dst++ = '\\';
715
0
      *dst++ = '$';
716
0
    } else if (src < end - 1)
717
0
      dst = vis(dst, src[0], flag, src[1]);
718
0
    else if (src < end)
719
0
      dst = vis(dst, src[0], flag, '\0');
720
0
    src++;
721
0
  }
722
0
  *dst = '\0';
723
0
  return (dst - start);
724
0
}
725
726
/*
 * Same as utf8_strvis but for a \0-terminated string and with the buffer
 * allocated here. The result is stored in *dst and the length is returned.
 */
size_t
utf8_stravis(char **dst, const char *src, int flag)
{
  char  *out;
  size_t   srclen, outlen;

  srclen = strlen(src);

  /* Four bytes for each source byte, plus the terminator. */
  out = xreallocarray(NULL, 4, srclen + 1);
  outlen = utf8_strvis(out, src, srclen, flag);

  /* Trim the buffer to what was used. */
  *dst = xrealloc(out, outlen + 1);
  return (outlen);
}
739
740
/* Same as utf8_strvis but allocate the buffer. */
741
size_t
742
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
743
0
{
744
0
  char  *buf;
745
0
  size_t   len;
746
747
0
  buf = xreallocarray(NULL, 4, srclen + 1);
748
0
  len = utf8_strvis(buf, src, srclen, flag);
749
750
0
  *dst = xrealloc(buf, len + 1);
751
0
  return (len);
752
0
}
753
754
/* Does this string contain anything that isn't valid UTF-8? */
755
int
756
utf8_isvalid(const char *s)
757
0
{
758
0
  struct utf8_data ud;
759
0
  const char  *end;
760
0
  enum utf8_state  more;
761
762
0
  end = s + strlen(s);
763
0
  while (s < end) {
764
0
    if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
765
0
      while (++s < end && more == UTF8_MORE)
766
0
        more = utf8_append(&ud, *s);
767
0
      if (more == UTF8_DONE)
768
0
        continue;
769
0
      return (0);
770
0
    }
771
0
    if (*s < 0x20 || *s > 0x7e)
772
0
      return (0);
773
0
    s++;
774
0
  }
775
0
  return (1);
776
0
}
777
778
/*
779
 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
780
 * the returned string. Anything not valid printable ASCII or UTF-8 is
781
 * stripped.
782
 */
783
char *
784
utf8_sanitize(const char *src)
785
0
{
786
0
  char    *dst = NULL;
787
0
  size_t     n = 0;
788
0
  enum utf8_state  more;
789
0
  struct utf8_data ud;
790
0
  u_int    i;
791
792
0
  while (*src != '\0') {
793
0
    dst = xreallocarray(dst, n + 1, sizeof *dst);
794
0
    if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
795
0
      while (*++src != '\0' && more == UTF8_MORE)
796
0
        more = utf8_append(&ud, *src);
797
0
      if (more == UTF8_DONE) {
798
0
        dst = xreallocarray(dst, n + ud.width,
799
0
            sizeof *dst);
800
0
        for (i = 0; i < ud.width; i++)
801
0
          dst[n++] = '_';
802
0
        continue;
803
0
      }
804
0
      src -= ud.have;
805
0
    }
806
0
    if (*src > 0x1f && *src < 0x7f)
807
0
      dst[n++] = *src;
808
0
    else
809
0
      dst[n++] = '_';
810
0
    src++;
811
0
  }
812
0
  dst = xreallocarray(dst, n + 1, sizeof *dst);
813
0
  dst[n] = '\0';
814
0
  return (dst);
815
0
}
816
817
/* Get UTF-8 buffer length. */
818
size_t
819
utf8_strlen(const struct utf8_data *s)
820
0
{
821
0
  size_t  i;
822
823
0
  for (i = 0; s[i].size != 0; i++)
824
0
    /* nothing */;
825
0
  return (i);
826
0
}
827
828
/* Get UTF-8 string width. */
829
u_int
830
utf8_strwidth(const struct utf8_data *s, ssize_t n)
831
0
{
832
0
  ssize_t i;
833
0
  u_int width = 0;
834
835
0
  for (i = 0; s[i].size != 0; i++) {
836
0
    if (n != -1 && n == i)
837
0
      break;
838
0
    width += s[i].width;
839
0
  }
840
0
  return (width);
841
0
}
842
843
/*
844
 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
845
 * Caller frees.
846
 */
847
struct utf8_data *
848
utf8_fromcstr(const char *src)
849
0
{
850
0
  struct utf8_data  *dst = NULL;
851
0
  size_t       n = 0;
852
0
  enum utf8_state    more;
853
854
0
  while (*src != '\0') {
855
0
    dst = xreallocarray(dst, n + 1, sizeof *dst);
856
0
    if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
857
0
      while (*++src != '\0' && more == UTF8_MORE)
858
0
        more = utf8_append(&dst[n], *src);
859
0
      if (more == UTF8_DONE) {
860
0
        n++;
861
0
        continue;
862
0
      }
863
0
      src -= dst[n].have;
864
0
    }
865
0
    utf8_set(&dst[n], *src);
866
0
    n++;
867
0
    src++;
868
0
  }
869
0
  dst = xreallocarray(dst, n + 1, sizeof *dst);
870
0
  dst[n].size = 0;
871
0
  return (dst);
872
0
}
873
874
/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
875
char *
876
utf8_tocstr(struct utf8_data *src)
877
0
{
878
0
  char  *dst = NULL;
879
0
  size_t   n = 0;
880
881
0
  for(; src->size != 0; src++) {
882
0
    dst = xreallocarray(dst, n + src->size, 1);
883
0
    memcpy(dst + n, src->data, src->size);
884
0
    n += src->size;
885
0
  }
886
0
  dst = xreallocarray(dst, n + 1, 1);
887
0
  dst[n] = '\0';
888
0
  return (dst);
889
0
}
890
891
/* Get width of UTF-8 string. */
892
u_int
893
utf8_cstrwidth(const char *s)
894
0
{
895
0
  struct utf8_data  tmp;
896
0
  u_int     width;
897
0
  enum utf8_state   more;
898
899
0
  width = 0;
900
0
  while (*s != '\0') {
901
0
    if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
902
0
      while (*++s != '\0' && more == UTF8_MORE)
903
0
        more = utf8_append(&tmp, *s);
904
0
      if (more == UTF8_DONE) {
905
0
        width += tmp.width;
906
0
        continue;
907
0
      }
908
0
      s -= tmp.have;
909
0
    }
910
0
    if (*s > 0x1f && *s != 0x7f)
911
0
      width++;
912
0
    s++;
913
0
  }
914
0
  return (width);
915
0
}
916
917
/* Pad UTF-8 string to width on the left. Caller frees. */
918
char *
919
utf8_padcstr(const char *s, u_int width)
920
0
{
921
0
  size_t   slen;
922
0
  char  *out;
923
0
  u_int  n, i;
924
925
0
  n = utf8_cstrwidth(s);
926
0
  if (n >= width)
927
0
    return (xstrdup(s));
928
929
0
  slen = strlen(s);
930
0
  out = xmalloc(slen + 1 + (width - n));
931
0
  memcpy(out, s, slen);
932
0
  for (i = n; i < width; i++)
933
0
    out[slen++] = ' ';
934
0
  out[slen] = '\0';
935
0
  return (out);
936
0
}
937
938
/* Pad UTF-8 string to width on the right. Caller frees. */
939
char *
940
utf8_rpadcstr(const char *s, u_int width)
941
0
{
942
0
  size_t   slen;
943
0
  char  *out;
944
0
  u_int  n, i;
945
946
0
  n = utf8_cstrwidth(s);
947
0
  if (n >= width)
948
0
    return (xstrdup(s));
949
950
0
  slen = strlen(s);
951
0
  out = xmalloc(slen + 1 + (width - n));
952
0
  for (i = 0; i < width - n; i++)
953
0
    out[i] = ' ';
954
0
  memcpy(out + i, s, slen);
955
0
  out[i + slen] = '\0';
956
0
  return (out);
957
0
}
958
959
int
960
utf8_cstrhas(const char *s, const struct utf8_data *ud)
961
0
{
962
0
  struct utf8_data  *copy, *loop;
963
0
  int      found = 0;
964
965
0
  copy = utf8_fromcstr(s);
966
0
  for (loop = copy; loop->size != 0; loop++) {
967
0
    if (loop->size != ud->size)
968
0
      continue;
969
0
    if (memcmp(loop->data, ud->data, loop->size) == 0) {
970
0
      found = 1;
971
0
      break;
972
0
    }
973
0
  }
974
0
  free(copy);
975
976
0
  return (found);
977
0
}