Coverage Report

Created: 2026-01-10 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/w3m/libwc/utf8.c
Line
Count
Source
1
2
#ifdef USE_UNICODE
3
4
#include "wc.h"
5
#include "ucs.h"
6
#include "utf8.h"
7
#include "wtf.h"
8
9
/*
 * Byte classification table for the UTF-8 decoders in this file, indexed
 * by byte value:
 *   0     continuation byte (0x80-0xBF)
 *   1     0x20-0x7E, a complete one-byte character
 *   2..6  lead byte; the value is the total length of the sequence
 *   7     0xFE / 0xFF, reported as unknown by wc_conv_from_utf8()
 *   8     0x00-0x1F and 0x7F, appended to the output unchanged
 */
wc_uint8 WC_UTF8_MAP[ 0x100 ] = {
    /* 0x00 - 0x0F */ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    /* 0x10 - 0x1F */ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8,

    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7,
};
28
29
/*
 * Scratch buffer handed to wc_ucs_to_utf8() by the encoders in this file.
 * Seven bytes: the longest sequence written is 6 bytes plus the NUL
 * terminator.  It is file-scope and overwritten by every conversion, so
 * its contents must be consumed before the next call.
 */
static wc_uchar utf8_buf[7];
30
31
size_t
32
wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
33
51.0M
{
34
51.0M
    if (ucs < WC_C_UTF8_L2) {
35
2.71M
  utf8[0] =   ucs;
36
2.71M
  utf8[1] = 0;
37
2.71M
  return 1;
38
48.3M
    } else if (ucs < WC_C_UTF8_L3) {
39
40.7k
  utf8[0] =  (ucs >> 6)          | 0xc0;
40
40.7k
  utf8[1] =  (ucs        & 0x3f) | 0x80;
41
40.7k
  utf8[2] = 0;
42
40.7k
  return 2;
43
48.3M
    } else if (ucs < WC_C_UTF8_L4) {
44
60.2k
  utf8[0] =  (ucs >> 12)         | 0xe0;
45
60.2k
  utf8[1] = ((ucs >> 6)  & 0x3f) | 0x80;
46
60.2k
  utf8[2] =  (ucs        & 0x3f) | 0x80;
47
60.2k
  utf8[3] = 0;
48
60.2k
  return 3;
49
48.2M
    } else if (ucs < WC_C_UTF8_L5) {
50
48.2M
  utf8[0] =  (ucs >> 18)         | 0xf0;
51
48.2M
  utf8[1] = ((ucs >> 12) & 0x3f) | 0x80;
52
48.2M
  utf8[2] = ((ucs >> 6)  & 0x3f) | 0x80;
53
48.2M
  utf8[3] =  (ucs        & 0x3f) | 0x80;
54
48.2M
  utf8[4] = 0;
55
48.2M
  return 4;
56
48.2M
    } else if (ucs < WC_C_UTF8_L6) {
57
257
  utf8[0] =  (ucs >> 24)         | 0xf8;
58
257
  utf8[1] = ((ucs >> 18) & 0x3f) | 0x80;
59
257
  utf8[2] = ((ucs >> 12) & 0x3f) | 0x80;
60
257
  utf8[3] = ((ucs >> 6)  & 0x3f) | 0x80;
61
257
  utf8[4] =  (ucs        & 0x3f) | 0x80;
62
257
  utf8[5] = 0;
63
257
  return 5;
64
1.07k
    } else if (ucs <= WC_C_UCS4_END) {
65
605
  utf8[0] =  (ucs >> 30)         | 0xfc;
66
605
  utf8[1] = ((ucs >> 24) & 0x3f) | 0x80;
67
605
  utf8[2] = ((ucs >> 18) & 0x3f) | 0x80;
68
605
  utf8[3] = ((ucs >> 12) & 0x3f) | 0x80;
69
605
  utf8[4] = ((ucs >> 6)  & 0x3f) | 0x80;
70
605
  utf8[5] =  (ucs        & 0x3f) | 0x80;
71
605
  utf8[6] = 0;
72
605
  return 6;
73
605
    } else {
74
471
  utf8[0] = 0;
75
471
  return 0;
76
471
    }
77
51.0M
}
78
79
wc_uint32
80
wc_utf8_to_ucs(wc_uchar *utf8)
81
274k
{
82
274k
    wc_uint32 ucs;
83
84
274k
    switch (WC_UTF8_MAP[utf8[0]]) {
85
0
    case 1:
86
0
  ucs =  (wc_uint32) utf8[0];
87
0
  if (ucs >= WC_C_UTF8_L2)
88
0
      break;
89
0
  return ucs;
90
194k
    case 2:
91
194k
  ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6)
92
194k
      |  (wc_uint32)(utf8[1] & 0x3f);
93
194k
  if (ucs < WC_C_UTF8_L2)
94
13.3k
      break;
95
181k
  return ucs;
96
28.4k
    case 3:
97
28.4k
  ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12)
98
28.4k
      | ((wc_uint32)(utf8[1] & 0x3f) << 6)
99
28.4k
      |  (wc_uint32)(utf8[2] & 0x3f);
100
28.4k
  if (ucs < WC_C_UTF8_L3)
101
1.03k
      break;
102
27.3k
  return ucs;
103
48.4k
    case 4:
104
48.4k
  ucs = ((wc_uint32)(utf8[0] & 0x07) << 18)
105
48.4k
      | ((wc_uint32)(utf8[1] & 0x3f) << 12)
106
48.4k
      | ((wc_uint32)(utf8[2] & 0x3f) << 6)
107
48.4k
      |  (wc_uint32)(utf8[3] & 0x3f);
108
48.4k
  if (ucs < WC_C_UTF8_L4)
109
313
      break;
110
48.0k
  return ucs;
111
1.10k
    case 5:
112
1.10k
  ucs = ((wc_uint32)(utf8[0] & 0x03) << 24)
113
1.10k
      | ((wc_uint32)(utf8[1] & 0x3f) << 18)
114
1.10k
      | ((wc_uint32)(utf8[2] & 0x3f) << 12)
115
1.10k
      | ((wc_uint32)(utf8[3] & 0x3f) << 6)
116
1.10k
      |  (wc_uint32)(utf8[4] & 0x3f);
117
1.10k
  if (ucs < WC_C_UTF8_L5)
118
210
      break;
119
893
  return ucs;
120
1.54k
    case 6:
121
1.54k
  ucs = ((wc_uint32)(utf8[0] & 0x01) << 30)
122
1.54k
      | ((wc_uint32)(utf8[1] & 0x3f) << 24)
123
1.54k
      | ((wc_uint32)(utf8[2] & 0x3f) << 18)
124
1.54k
      | ((wc_uint32)(utf8[3] & 0x3f) << 12)
125
1.54k
      | ((wc_uint32)(utf8[4] & 0x3f) << 6)
126
1.54k
      |  (wc_uint32)(utf8[5] & 0x3f);
127
1.54k
  if (ucs < WC_C_UTF8_L6)
128
201
      break;
129
1.34k
  return ucs;
130
0
    default:
131
0
  break;
132
274k
    }
133
15.0k
    return WC_C_UCS4_ERROR;
134
274k
}
135
136
Str
137
wc_conv_from_utf8(Str is, wc_ces ces)
138
1.86k
{
139
1.86k
    Str os;
140
1.86k
    wc_uchar *sp = (wc_uchar *)is->ptr;
141
1.86k
    wc_uchar *ep = sp + is->length;
142
1.86k
    wc_uchar *p;
143
1.86k
    wc_uchar *q = NULL;
144
1.86k
    int state = WC_UTF8_NOSTATE;
145
1.86k
    size_t next = 0;
146
1.86k
    wc_uint32 ucs;
147
1.86k
    wc_status st;
148
149
4.22k
    for (p = sp; p < ep && *p < 0x80; p++)
150
2.36k
  ;
151
1.86k
    if (p == ep)
152
73
  return is;
153
1.78k
    os = Strnew_size(is->length + is->length / 3);
154
1.78k
    if (p > sp)
155
355
  Strcat_charp_n(os, is->ptr, (int)(p - sp));
156
157
1.78k
    st.tag = NULL;
158
1.78k
    st.ntag = 0;
159
70.9M
    for (; p < ep; p++) {
160
70.9M
  switch (state) {
161
68.6M
  case WC_UTF8_NOSTATE:
162
68.6M
      next = WC_UTF8_MAP[*p];
163
68.6M
      switch (next) {
164
41.2M
      case 1:
165
41.2M
    wtf_push_ucs(os, (wc_uint32)*p, &st);
166
41.2M
    break;
167
4.79M
      case 8:
168
4.79M
    Strcat_char(os, (char)*p);
169
4.79M
    break;
170
20.4M
      case 0:
171
20.5M
      case 7:
172
20.5M
    wtf_push_unknown(os, p, 1);
173
20.5M
    break;
174
2.02M
      default:
175
2.02M
    q = p;
176
2.02M
    next--;
177
2.02M
    state = WC_UTF8_NEXT;
178
2.02M
    break;
179
68.6M
      }
180
68.6M
      break;
181
68.6M
  case WC_UTF8_NEXT:
182
2.34M
      if (WC_UTF8_MAP[*p]) {
183
1.75M
    wtf_push_unknown(os, q, p - q + 1);
184
1.75M
    state = WC_UTF8_NOSTATE;
185
1.75M
    break;
186
1.75M
      }
187
595k
      if (--next)
188
321k
    break;
189
274k
      state = WC_UTF8_NOSTATE;
190
274k
      ucs = wc_utf8_to_ucs(q);
191
274k
      if (ucs == WC_C_UCS4_ERROR ||
192
258k
    (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
193
15.9k
    wtf_push_unknown(os, q, p - q + 1);
194
258k
      else if (ucs != WC_C_UCS2_BOM)
195
257k
    wtf_push_ucs(os, ucs, &st);
196
274k
      break;
197
70.9M
  }
198
70.9M
    }
199
1.78k
    switch (state) {
200
52
    case WC_UTF8_NEXT:
201
52
  wtf_push_unknown(os, q, p - q);
202
52
  break;
203
1.78k
    }
204
1.78k
    return os;
205
1.78k
}
206
207
static int
208
wc_push_tag_to_utf8(Str os, int ntag)
209
5.08M
{
210
5.08M
    char *p;
211
212
5.08M
    if (ntag) {
213
2.54M
  p = wc_ucs_get_tag(ntag);
214
2.54M
  if (p == NULL)
215
508
      ntag = 0;
216
2.54M
    }
217
5.08M
    if (ntag) {
218
2.54M
  wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf);
219
2.54M
  Strcat_charp(os, (char *)utf8_buf);
220
45.7M
  for (; *p; p++) {
221
43.1M
      wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf);
222
43.1M
      Strcat_charp(os, (char *)utf8_buf);
223
43.1M
  }
224
2.54M
    } else {
225
2.54M
  wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf);
226
2.54M
  Strcat_charp(os, (char *)utf8_buf);
227
2.54M
    }
228
5.08M
    return ntag;
229
5.08M
}
230
231
void
232
wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
233
6.51M
{
234
6.60M
  while (1) {
235
6.60M
    switch (WC_CCS_SET(cc.ccs)) {
236
1.79M
    case WC_CCS_US_ASCII:
237
1.79M
  if (st->ntag)
238
887k
      st->ntag = wc_push_tag_to_utf8(os, 0);
239
1.79M
  Strcat_char(os, (char)(cc.code & 0x7f));
240
1.79M
  return;
241
100k
    case WC_CCS_UCS2:
242
102k
    case WC_CCS_UCS4:
243
102k
  if (st->ntag)
244
226
      st->ntag = wc_push_tag_to_utf8(os, 0);
245
102k
  wc_ucs_to_utf8(cc.code, utf8_buf);
246
102k
  Strcat_charp(os, (char *)utf8_buf);
247
102k
  return;
248
2.71M
    case WC_CCS_UCS_TAG:
249
2.71M
  if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag)
250
2.54M
      st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code));
251
2.71M
  wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf);
252
2.71M
  Strcat_charp(os, (char *)utf8_buf);
253
2.71M
  return;
254
474
    case WC_CCS_ISO_8859_1:
255
474
  if (st->ntag)
256
262
      st->ntag = wc_push_tag_to_utf8(os, 0);
257
474
  wc_ucs_to_utf8((cc.code | 0x80), utf8_buf);
258
474
  Strcat_charp(os, (char *)utf8_buf);
259
474
  return;
260
3.00k
    case WC_CCS_UNKNOWN_W:
261
3.00k
  if (!WcOption.no_replace) {
262
3.00k
      if (st->ntag)
263
359
          st->ntag = wc_push_tag_to_utf8(os, 0);
264
3.00k
      Strcat_charp(os, WC_REPLACE_W);
265
3.00k
  }
266
3.00k
  return;
267
1.89M
    case WC_CCS_UNKNOWN:
268
1.89M
  if (!WcOption.no_replace) {
269
1.89M
      if (st->ntag)
270
1.65M
          st->ntag = wc_push_tag_to_utf8(os, 0);
271
1.89M
      Strcat_charp(os, WC_REPLACE);
272
1.89M
  }
273
1.89M
  return;
274
91.8k
    default:
275
91.8k
  if (WcOption.ucs_conv &&
276
91.8k
    (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR)
277
85.8k
      cc.ccs = WC_CCS_UCS2;
278
5.95k
  else
279
5.95k
      cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
280
91.8k
  continue;
281
6.60M
    }
282
6.60M
  }
283
6.51M
}
284
285
void
286
wc_push_to_utf8_end(Str os, wc_status *st)
287
492
{
288
492
    if (st->ntag)
289
94
  st->ntag = wc_push_tag_to_utf8(os, 0);
290
492
    return;
291
492
}
292
293
Str
294
wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
295
0
{
296
0
    static Str os;
297
0
    static wc_uchar buf[6];
298
0
    static size_t nbuf, next;
299
0
    wc_uint32 ucs;
300
301
0
    if (st->state == -1) {
302
0
  st->state = WC_UTF8_NOSTATE;
303
0
  os = Strnew_size(8);
304
0
  st->tag = NULL;
305
0
  st->ntag = 0;
306
0
  nbuf = 0;
307
0
    }
308
309
0
    switch (st->state) {
310
0
    case WC_UTF8_NOSTATE:
311
0
  switch (next = WC_UTF8_MAP[c]) {
312
0
  case 1:
313
0
      wtf_push_ucs(os, (wc_uint32)c, st);
314
0
      break;
315
0
  case 8:
316
0
      Strcat_char(os, (char)c);
317
0
      break;
318
0
  case 0:
319
0
  case 7:
320
0
      break;
321
0
  default:
322
0
      buf[nbuf++] = c;
323
0
      next--;
324
0
      st->state = WC_UTF8_NEXT;
325
0
      return NULL;
326
0
  }
327
0
  break;
328
0
    case WC_UTF8_NEXT:
329
0
  if (WC_UTF8_MAP[c])
330
0
      break;
331
0
  buf[nbuf++] = c;
332
0
  if (--next)
333
0
      return NULL;
334
0
  ucs = wc_utf8_to_ucs(buf);
335
0
  if (ucs == WC_C_UCS4_ERROR ||
336
0
      (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
337
0
      break;
338
0
  if (ucs != WC_C_UCS2_BOM)
339
0
      wtf_push_ucs(os, ucs, st);
340
0
  break;
341
0
    }
342
0
    st->state = -1;
343
0
    return os;
344
0
}
345
346
#endif