Coverage Report

Created: 2026-06-28 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/moddable/modules/data/text/decoder/textdecoder.c
Line
Count
Source
1
/*
2
* Copyright (c) 2021-2022  Moddable Tech, Inc.
3
*
4
*   This file is part of the Moddable SDK Runtime.
5
*
6
*   The Moddable SDK Runtime is free software: you can redistribute it and/or modify
7
*   it under the terms of the GNU Lesser General Public License as published by
8
*   the Free Software Foundation, either version 3 of the License, or
9
*   (at your option) any later version.
10
*
11
*   The Moddable SDK Runtime is distributed in the hope that it will be useful,
12
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
*   GNU Lesser General Public License for more details.
15
*
16
*   You should have received a copy of the GNU Lesser General Public License
17
*   along with the Moddable SDK Runtime.  If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
#include "xsmc.h"
22
#include "xsHost.h"
23
#ifdef kPocoRotation
24
  // Moddable SDK
25
  #include "mc.xs.h"      // for xsID_ values
26
27
  #define VALIDATE 1
28
#else
29
  // xst, xsnap, etc
30
  #include <stdbool.h>
31
32
  #define xsID_ignoreBOM (xsID("ignoreBOM"))
33
  #define xsID_fatal (xsID("fatal"))
34
  #define xsID_stream (xsID("stream"))
35
#endif
36
37
/*
  State kept for each TextDecoder instance. The constructor fills one of these
  on the stack and copies it into the instance's host chunk.
*/
typedef struct modTextDecoderRecord modTextDecoderRecord;
typedef struct modTextDecoderRecord *modTextDecoder;

struct modTextDecoderRecord {
  uint8_t   ignoreBOM;      // non-zero: a leading BOM is left in the decoded output
  uint8_t   fatal;          // non-zero: malformed input throws instead of producing U+FFFD

  // bytes of an incomplete sequence carried over between streaming decode() calls
  uint8_t   bufferLength;   // number of valid bytes in buffer
  uint8_t   buffer[12];
};
45
46
static uint8_t isLegalUTF8(const uint8_t *source, int length);
47
48
/*
  Host destructor for TextDecoder instances. There is nothing to release, so
  it does nothing; in VALIDATE builds its address is the tag passed to
  xsmcGetHostChunkValidate.
*/
void xs_textdecoder_destructor(void *data)
{
  (void)data;   // intentionally unused
}
51
52
/*
  TextDecoder constructor: new TextDecoder([label [, options]])

  label   - only "utf-8" is supported; any other string throws a RangeError.
            An omitted or undefined label selects utf-8.
  options - optional object; its "ignoreBOM" and "fatal" properties are read
            as booleans and both default to false.

  The decoder state is built on the stack and copied into the instance's host
  chunk.
*/
void xs_textdecoder(xsMachine *the)
{
  modTextDecoderRecord decoder;
  int argc = xsmcArgc;

  // undefined must not be stringified to "undefined" and then rejected
  if (argc && (xsUndefinedType != xsmcTypeOf(xsArg(0))) && c_strcmp(xsmcToString(xsArg(0)), "utf-8"))
    xsRangeError("unsupported encoding");

#if !VALIDATE
  // no pre-built instance in this configuration: create it from the constructor's prototype
  xsmcGet(xsResult, xsTarget, xsID("prototype"));
  xsResult = xsNewHostInstance(xsResult);
  xsThis = xsResult;
  xsmcSetHostDestructor(xsThis, NULL);
#endif

  // Zero the whole record in every configuration so that no uninitialized
  // stack bytes (the streaming buffer) are copied into the host chunk.
  // This also sets ignoreBOM, fatal and bufferLength to their defaults.
  c_memset(&decoder, 0, sizeof(decoder));

  if (argc >= 2) {
    xsmcVars(1);

    xsmcGet(xsVar(0), xsArg(1), xsID_ignoreBOM);
    decoder.ignoreBOM = xsmcTest(xsVar(0));

    xsmcGet(xsVar(0), xsArg(1), xsID_fatal);
    decoder.fatal = xsmcTest(xsVar(0));
  }

  xsmcSetHostChunk(xsThis, &decoder, sizeof(decoder));
}
83
84
/*
85
  UTF-8 BOM is sequence 0xEF,0xBB,0xBF
86
  Replacement character sequence in UTF-8 is 0xEF 0xBF 0xBD
87
  null character maps to 0xC0, 0x80
88
  
89
  implementation overallocates by 3 bytes if BOM is present and ignoreBOM is false
90
*/
91
92
void xs_textdecoder_decode(xsMachine *the)
93
25.5k
{
94
25.5k
  uint8_t *src, *srcEnd, *dst, *dst3;
95
25.5k
  uint8_t *buffer;
96
25.5k
  xsUnsignedValue srcLength, bufferLength;
97
25.5k
  modTextDecoder td;
98
25.5k
  uint8_t srcOffset = 0;
99
25.5k
  uint32_t outLength = 0;
100
25.5k
  uint8_t stream = 0;
101
25.5k
  int argc = xsmcArgc;
102
103
25.5k
  if (argc > 1) {
104
0
    xsmcVars(1);
105
106
0
    xsmcGet(xsVar(0), xsArg(1), xsID_stream);
107
0
    stream = xsmcToBoolean(xsVar(0));
108
0
  }
109
110
25.5k
  if (argc) {
111
25.5k
    xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength);
112
25.5k
    srcEnd = src + srcLength;
113
25.5k
  }
114
0
  else
115
0
    src = srcEnd = NULL;
116
117
#if VALIDATE
118
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
119
#else
120
25.5k
  td = xsmcGetHostChunk(xsThis);
121
25.5k
#endif
122
25.5k
  buffer = td->buffer;
123
25.5k
  bufferLength = td->bufferLength;
124
125
39.8M
  while ((src < srcEnd) || bufferLength) {
126
39.8M
    unsigned char first, clen, i;
127
39.8M
    uint8_t utf8[4];
128
129
39.8M
    if (bufferLength) {
130
0
      bufferLength--;
131
0
      first = *buffer++;
132
0
    }
133
39.8M
    else
134
39.8M
      first = c_read8(src++);
135
39.8M
    if (first < 0x80) {
136
34.3M
      outLength += (0 == first) ? 2 : 1;
137
34.3M
      continue;
138
34.3M
    }
139
140
5.54M
    if (0xC0 == (first & 0xE0))
141
1.12M
      clen = 1;
142
4.42M
    else if (0xE0 == (first & 0xF0))
143
436k
      clen = 2;
144
3.98M
    else if (0xF0 == (first & 0xF0))
145
2.41M
      clen = 3;
146
1.57M
    else if (td->fatal)
147
0
      goto fatal;
148
1.57M
    else {
149
1.57M
      outLength += 3;
150
1.57M
      continue;
151
1.57M
    }
152
153
3.97M
    if (clen > ((srcEnd - src) + bufferLength)) {
154
770
      if (stream)
155
0
        break; // decode to here. remainder saved below
156
157
770
      if (td->fatal)
158
0
        goto fatal;
159
160
770
      outLength += 3;
161
770
      if (!src)    // flush
162
0
        break;
163
770
      continue;
164
770
    }
165
166
3.97M
    utf8[0] = first;
167
13.2M
    for (i = 0; i < clen; i++) {
168
9.22M
      if (i < bufferLength)
169
0
        utf8[i + 1] = buffer[i];
170
9.22M
      else
171
9.22M
        utf8[i + 1] = c_read8(src + i - bufferLength);
172
9.22M
    }
173
174
3.97M
    if (!isLegalUTF8(utf8, clen + 1)) {
175
3.73M
      if (td->fatal)
176
0
        goto fatal;
177
178
3.73M
      uint8_t lower = 0x80, upper = 0xBF;
179
3.73M
      if (0xE0 == first)
180
9.26k
        lower = 0xA0;
181
3.72M
      else if (0xED == first)
182
33.3k
        lower = 0x9F;
183
3.69M
      else if (0xF0 == first)
184
23.0k
        lower = 0x90;
185
3.66M
      else if (0xF4 == first)
186
8.70k
        lower = 0x8F;
187
3.65M
      else if (first > 0xF4)  // no valid next byte
188
2.31M
        clen = 0;
189
190
3.73M
      const uint8_t *s = &utf8[1];
191
3.76M
      while (clen-- > 0) {
192
1.45M
        uint8_t c = *s++;
193
1.45M
        if ((lower <= c) && (c <= upper)) {
194
34.1k
          if (bufferLength) {
195
0
            bufferLength--;
196
0
            buffer++;
197
0
          }
198
34.1k
          else
199
34.1k
            src++;
200
34.1k
        }
201
1.41M
        else
202
1.41M
          break;
203
1.45M
      }
204
205
3.73M
      outLength += 3;
206
3.73M
      continue;
207
3.73M
    }
208
209
239k
#if mxCESU8
210
239k
    outLength += (3 == clen) ? 6 : (clen + 1);
211
#else
212
    outLength += clen + 1;
213
#endif
214
239k
    if (bufferLength) {
215
0
      if (bufferLength >= clen) {
216
0
        bufferLength -= clen;
217
0
        buffer += clen;
218
0
      }
219
0
      else {
220
0
        src += clen - bufferLength; 
221
0
        bufferLength = 0;
222
0
      }
223
0
    }
224
239k
    else
225
239k
      src += clen;
226
239k
  }
227
228
25.5k
  xsmcSetStringBuffer(xsResult, NULL, outLength + 1);
229
230
25.5k
  if (argc) {
231
25.5k
    xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength);
232
25.5k
    srcEnd = src + srcLength;
233
25.5k
    src += srcOffset;
234
25.5k
  }
235
0
  else
236
0
    src = srcEnd = NULL;
237
238
#if VALIDATE
239
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
240
#else
241
25.5k
  td = xsmcGetHostChunk(xsThis);
242
25.5k
#endif
243
25.5k
  buffer = td->buffer;
244
25.5k
  bufferLength = td->bufferLength;
245
246
25.5k
  dst = (uint8_t *)xsmcToString(xsResult);
247
25.5k
  dst3 = td->ignoreBOM ? NULL : (dst + 3);
248
249
39.8M
  while ((src < srcEnd) || bufferLength) {
250
39.8M
    unsigned char first, clen, i, firstFromBuffer;
251
39.8M
    uint8_t utf8[4];
252
253
39.8M
    if (bufferLength) {
254
0
      bufferLength--;
255
0
      first = *buffer++;
256
0
      firstFromBuffer = 1;
257
0
    }
258
39.8M
    else {
259
39.8M
      first = c_read8(src++);
260
39.8M
      firstFromBuffer = 0;
261
39.8M
    }
262
39.8M
    if (first < 0x80) {
263
34.3M
      if (first)
264
32.0M
        *dst++ = first;
265
2.25M
      else {
266
2.25M
        *dst++ = 0xC0;
267
2.25M
        *dst++ = 0x80;
268
2.25M
      }
269
34.3M
      continue;
270
34.3M
    }
271
272
5.54M
    if (0xC0 == (first & 0xE0))
273
1.12M
      clen = 1;
274
4.42M
    else if (0xE0 == (first & 0xF0))
275
436k
      clen = 2;
276
3.98M
    else if (0xF0 == (first & 0xF0))
277
2.41M
      clen = 3;
278
1.57M
    else {
279
1.57M
      *dst++ = 0xEF;
280
1.57M
      *dst++ = 0xBF;
281
1.57M
      *dst++ = 0xBD;
282
1.57M
      continue;
283
1.57M
    }
284
285
3.97M
    if (clen > ((srcEnd - src) + bufferLength)) {
286
770
      if (stream) {
287
        // put back "first". remainder saved below.
288
0
        if (firstFromBuffer) {
289
0
          buffer--;
290
0
          bufferLength++;
291
0
        }
292
0
        else
293
0
          src--;
294
0
        break;
295
        
296
0
      }
297
298
770
      *dst++ = 0xEF;
299
770
      *dst++ = 0xBF;
300
770
      *dst++ = 0xBD;
301
770
      if (!src)
302
0
        break; // flush
303
770
      continue;
304
770
    }
305
306
3.97M
    utf8[0] = first;
307
13.2M
    for (i = 0; i < clen; i++) {
308
9.22M
      if (i < bufferLength)
309
0
        utf8[i + 1] = buffer[i];
310
9.22M
      else
311
9.22M
        utf8[i + 1] = c_read8(src + i - bufferLength);
312
9.22M
    }
313
314
3.97M
    if (!isLegalUTF8(utf8, clen + 1)) {
315
3.73M
      *dst++ = 0xEF;
316
3.73M
      *dst++ = 0xBF;
317
3.73M
      *dst++ = 0xBD;
318
319
3.73M
      uint8_t lower = 0x80, upper = 0xBF;
320
3.73M
      if (0xE0 == first)
321
9.26k
        lower = 0xA0;
322
3.72M
      else if (0xED == first)
323
33.3k
        lower = 0x9F;
324
3.69M
      else if (0xF0 == first)
325
23.0k
        lower = 0x90;
326
3.66M
      else if (0xF4 == first)
327
8.70k
        lower = 0x8F;
328
3.65M
      else if (first > 0xF4)  // no valid next byte
329
2.31M
        clen = 0;
330
331
3.73M
      const uint8_t *s = &utf8[1];
332
3.76M
      while (clen-- > 0) {
333
1.45M
        uint8_t c = *s++;
334
1.45M
        if ((lower <= c) && (c <= upper)) {
335
34.1k
          if (bufferLength) {
336
0
            bufferLength--;
337
0
            buffer++;
338
0
          }
339
34.1k
          else
340
34.1k
            src++;
341
34.1k
        }
342
1.41M
        else
343
1.41M
          break;
344
1.45M
      }
345
346
3.73M
      continue;
347
3.73M
    }
348
349
239k
#if mxCESU8
350
239k
    if (3 != clen) {
351
215k
      *dst++ = first;
352
570k
      for (i = 0; i < clen; i++)
353
355k
        *dst++ = utf8[i + 1];
354
215k
    }
355
24.1k
    else {
356
24.1k
      xsIntegerValue c;
357
24.1k
      fxUTF8Decode((xsStringValue)utf8, &c);
358
24.1k
      c -= 0x10000;
359
24.1k
      fxUTF8Encode((xsStringValue)dst, 0xD800 + (c >> 10));
360
24.1k
      dst += 3;
361
24.1k
      fxUTF8Encode((xsStringValue)dst, 0xDC00 + (c & 0x3FF));
362
24.1k
      dst += 3;
363
24.1k
    }
364
#else
365
    *dst++ = first;
366
    for (i = 0; i < clen; i++)
367
      *dst++ = utf8[i + 1];
368
#endif
369
    
370
239k
    if ((0xEF == first) && (dst == dst3)) {
371
49
      if ((0xBF == dst[-1]) && (0xBB == dst[-2]))
372
46
        dst -= 3;
373
49
    }
374
375
239k
    if (bufferLength) {
376
0
      if (bufferLength >= clen) {
377
0
        bufferLength -= clen;
378
0
        buffer += clen;
379
0
      }
380
0
      else {
381
0
        src += clen - bufferLength; 
382
0
        bufferLength = 0;
383
0
      }
384
0
    }
385
239k
    else
386
239k
      src += clen;
387
239k
  }
388
25.5k
  *dst++ = 0;
389
390
25.5k
  if (src) {
391
25.5k
    c_memcpy(td->buffer, buffer, bufferLength);
392
25.5k
    c_memcpy(td->buffer + bufferLength, src, srcEnd - src);
393
25.5k
    td->bufferLength = bufferLength + (srcEnd - src);
394
25.5k
  }
395
0
  else 
396
0
    td->bufferLength =  0;   // flush
397
398
25.5k
  return;
399
400
0
fatal:
401
0
  xsTypeError("invalid utf-8");
402
25.5k
}
403
404
/*
  Getter for the "encoding" property: the result is always the string "utf-8".
*/
void xs_textdecoder_get_encoding(xsMachine *the)
{
  xsmcSetString(xsResult, "utf-8");   // the only encoding the constructor accepts
}
408
409
/*
  Getter for the "ignoreBOM" property: returns, as a boolean, the ignoreBOM
  flag stored in the decoder's host chunk.
*/
void xs_textdecoder_get_ignoreBOM(xsMachine *the)
{
  modTextDecoder td;

#if VALIDATE
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
#else
  td = xsmcGetHostChunk(xsThis);
#endif

  xsmcSetBoolean(xsResult, td->ignoreBOM);
}
418
419
/*
  Getter for the "fatal" property: returns, as a boolean, the fatal flag
  stored in the decoder's host chunk.
*/
void xs_textdecoder_get_fatal(xsMachine *the)
{
  modTextDecoder td;

#if VALIDATE
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
#else
  td = xsmcGetHostChunk(xsThis);
#endif

  xsmcSetBoolean(xsResult, td->fatal);
}
428
429
#if !VALIDATE
430
/*
  Defines the global "TextDecoder" constructor on the given machine, with a
  prototype holding the "decode" method and the "encoding", "ignoreBOM" and
  "fatal" getters. All properties are defined with xsDontEnum.
*/
void modInstallTextDecoder(xsMachine *the)
{
  #define kPrototype (0)
  #define kConstructor (1)
  #define kScratch (2)

  // read-only accessors, installed in this order
  static const struct {
    char        *name;
    xsCallback  getter;
  } accessors[] = {
    { "encoding",   xs_textdecoder_get_encoding },
    { "ignoreBOM",  xs_textdecoder_get_ignoreBOM },
    { "fatal",      xs_textdecoder_get_fatal },
  };
  unsigned int index;

  xsBeginHost(the);
  xsmcVars(3);

  // prototype object and the constructor that refers to it
  xsVar(kPrototype) = xsNewHostObject(NULL);
  xsVar(kConstructor) = xsNewHostConstructor(xs_textdecoder, 2, xsVar(kPrototype));
  xsmcDefine(xsGlobal, xsID("TextDecoder"), xsVar(kConstructor), xsDontEnum);

  // the single method
  xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_decode, 1);
  xsmcDefine(xsVar(kPrototype), xsID("decode"), xsVar(kScratch), xsDontEnum);

  for (index = 0; index < (sizeof(accessors) / sizeof(accessors[0])); index++) {
    xsVar(kScratch) = xsNewHostFunction(accessors[index].getter, 0);
    xsmcDefine(xsVar(kPrototype), xsID(accessors[index].name), xsVar(kScratch), xsIsGetter | xsDontEnum);
  }

  xsEndHost(the);
}
454
#endif
455
456
/*
457
 * Copyright 2001-2004 Unicode, Inc.
458
 * 
459
 * Disclaimer
460
 * 
461
 * This source code is provided as is by Unicode, Inc. No claims are
462
 * made as to fitness for any particular purpose. No warranties of any
463
 * kind are expressed or implied. The recipient agrees to determine
464
 * applicability of information provided. If this file has been
465
 * purchased on magnetic or optical media from Unicode, Inc., the
466
 * sole remedy for any claim will be exchange of defective media
467
 * within 90 days of receipt.
468
 * 
469
 * Limitations on Rights to Redistribute This Code
470
 * 
471
 * Unicode, Inc. hereby grants the right to freely use the information
472
 * supplied in this file in the creation of products supporting the
473
 * Unicode Standard, and to make copies of this file in any form
474
 * for internal or external distribution as long as this notice
475
 * remains attached.
476
 */
477
 
478
7.94M
/*
  Reports whether the "length" bytes at "source" form one well-formed UTF-8
  sequence.

  source - first byte of the candidate sequence
  length - number of bytes in the sequence, lead byte included (1 to 4);
           any other value is rejected. The caller chooses length; it is not
           checked against the lead byte here.

  Returns true (non-zero) when well formed, false otherwise.

  The second byte is checked against both ends of the range allowed for the
  lead byte. For lead bytes 0xED and 0xF4 only the upper end used to be
  checked, so a second byte below 0x80 (for example 0xED 0x41 0x80) was
  accepted.
*/
uint8_t isLegalUTF8(const uint8_t *source, int length) {
  uint8_t lead, second;
  uint8_t lower = 0x80, upper = 0xBF;
  int i;

  if ((length < 1) || (length > 4))
    return false;

  lead = source[0];
  if ((lead >= 0x80) && (lead < 0xC2))
    return false;   // stray continuation byte or overlong two-byte lead
  if (lead > 0xF4)
    return false;   // would encode a value above U+10FFFF

  if (1 == length)
    return true;

  // the range of the second byte depends on the lead byte
  switch (lead) {
    case 0xE0: lower = 0xA0; break;   // no overlong three-byte forms
    case 0xED: upper = 0x9F; break;   // no surrogates
    case 0xF0: lower = 0x90; break;   // no overlong four-byte forms
    case 0xF4: upper = 0x8F; break;   // nothing above U+10FFFF
    default: break;
  }

  second = source[1];
  if ((second < lower) || (second > upper))
    return false;

  // remaining bytes are plain continuation bytes
  for (i = 2; i < length; i++) {
    if ((source[i] < 0x80) || (source[i] > 0xBF))
      return false;
  }

  return true;
}
502