Coverage Report

Created: 2025-06-13 06:17

/src/moddable/modules/data/text/decoder/textdecoder.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Copyright (c) 2021-2022  Moddable Tech, Inc.
3
*
4
*   This file is part of the Moddable SDK Runtime.
5
*
6
*   The Moddable SDK Runtime is free software: you can redistribute it and/or modify
7
*   it under the terms of the GNU Lesser General Public License as published by
8
*   the Free Software Foundation, either version 3 of the License, or
9
*   (at your option) any later version.
10
*
11
*   The Moddable SDK Runtime is distributed in the hope that it will be useful,
12
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
*   GNU Lesser General Public License for more details.
15
*
16
*   You should have received a copy of the GNU Lesser General Public License
17
*   along with the Moddable SDK Runtime.  If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
#include "xsmc.h"
22
#include "xsHost.h"
23
#ifdef kPocoRotation
24
  // Moddable SDK
25
  #include "mc.xs.h"      // for xsID_ values
26
27
  #define VALIDATE 1
28
#else
29
  // xst, xsnap, etc
30
  #include <stdbool.h>
31
32
  #define xsID_ignoreBOM (xsID("ignoreBOM"))
33
  #define xsID_fatal (xsID("fatal"))
34
  #define xsID_stream (xsID("stream"))
35
#endif
36
37
/*
  Per-instance decoder state. The constructor stores a copy of this record
  as the instance's host chunk; decode and the getters read it back.
*/
typedef struct {
  uint8_t   ignoreBOM;      // non-zero: a leading BOM is kept in the output
  uint8_t   fatal;          // non-zero: invalid input throws instead of being replaced

  // bytes of an incomplete sequence carried over between streaming calls
  uint8_t   bufferLength;   // number of valid bytes in buffer
  uint8_t   buffer[12];
} modTextDecoderRecord, *modTextDecoder;
45
46
static uint8_t isLegalUTF8(const uint8_t *source, int length);
47
48
/*
  Host destructor for TextDecoder instances. It does nothing; in VALIDATE
  builds its address is also passed to xsmcGetHostChunkValidate by the
  methods in this file.
*/
void xs_textdecoder_destructor(void *data)
{
  (void)data;   // intentionally unused
}
51
52
/*
  TextDecoder constructor: new TextDecoder([label [, options]])

  label   - optional; only "utf-8" is supported. Omitted or undefined selects
            the default encoding. Any other value throws RangeError.
  options - optional object; its ignoreBOM and fatal properties are read as
            booleans. Omitted, undefined or null leaves both false.

  The resulting state is stored as the instance's host chunk.
*/
void xs_textdecoder(xsMachine *the)
{
  modTextDecoderRecord decoder;
  int argc = xsmcArgc;

  // an explicit undefined label means "use the default", not the string "undefined"
  if (argc && (xsUndefinedType != xsmcTypeOf(xsArg(0))) && c_strcmp(xsmcToString(xsArg(0)), "utf-8"))
    xsRangeError("unsupported encoding");

#if !VALIDATE
  // no generated binding here: build the host instance from new.target's prototype
  xsmcGet(xsResult, xsTarget, xsID("prototype"));
  xsResult = xsNewHostInstance(xsResult);
  xsThis = xsResult;
  xsmcSetHostDestructor(xsThis, NULL);
#endif

  // zero the whole record (flags, carry-over length and carry-over bytes) in
  // every build, so no uninitialized stack bytes are copied into the chunk
  c_memset(&decoder, 0, sizeof(decoder));

  // skip undefined / null options instead of reading properties from them
  if ((argc >= 2) && xsmcTest(xsArg(1))) {
    xsmcVars(1);

    xsmcGet(xsVar(0), xsArg(1), xsID_ignoreBOM);
    decoder.ignoreBOM = xsmcTest(xsVar(0));

    xsmcGet(xsVar(0), xsArg(1), xsID_fatal);
    decoder.fatal = xsmcTest(xsVar(0));
  }

  xsmcSetHostChunk(xsThis, &decoder, sizeof(decoder));
}
83
84
/*
85
  UTF-8 BOM is sequence 0xEF,0xBB,0xBF
86
  Replacement character sequence in UTF-8 is 0xEF 0xBF 0xBD
87
  null character maps to 0xC0, 0x80
88
  
89
  implementation overallocates by 3 bytes if BOM is present and ignoreBOM is false
90
*/
91
92
void xs_textdecoder_decode(xsMachine *the)
93
35.3k
{
94
35.3k
  uint8_t *src, *srcEnd, *dst, *dst3;
95
35.3k
  uint8_t *buffer;
96
35.3k
  xsUnsignedValue srcLength, bufferLength;
97
35.3k
  modTextDecoder td;
98
35.3k
  uint8_t srcOffset = 0;
99
35.3k
  uint32_t outLength = 0;
100
35.3k
  uint8_t stream = 0;
101
35.3k
  int argc = xsmcArgc;
102
103
35.3k
  if (argc > 1) {
104
0
    xsmcVars(1);
105
106
0
    xsmcGet(xsVar(0), xsArg(1), xsID_stream);
107
0
    stream = xsmcToBoolean(xsVar(0));
108
0
  }
109
110
35.3k
  if (argc) {
111
35.3k
    xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength);
112
35.3k
    srcEnd = src + srcLength;
113
35.3k
  }
114
0
  else
115
0
    src = srcEnd = NULL;
116
117
#if VALIDATE
118
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
119
#else
120
35.3k
  td = xsmcGetHostChunk(xsThis);
121
35.3k
#endif
122
35.3k
  buffer = td->buffer;
123
35.3k
  bufferLength = td->bufferLength;
124
125
193M
  while ((src < srcEnd) || bufferLength) {
126
193M
    unsigned char first, clen, i;
127
193M
    uint8_t utf8[4];
128
129
193M
    if (bufferLength) {
130
0
      bufferLength--;
131
0
      first = *buffer++;
132
0
    }
133
193M
    else
134
193M
      first = c_read8(src++);
135
193M
    if (first < 0x80) {
136
177M
      outLength += (0 == first) ? 2 : 1;
137
177M
      continue;
138
177M
    }
139
140
15.7M
    if (0xC0 == (first & 0xE0))
141
2.29M
      clen = 1;
142
13.4M
    else if (0xE0 == (first & 0xF0))
143
1.74M
      clen = 2;
144
11.7M
    else if (0xF0 == (first & 0xF0))
145
4.47M
      clen = 3;
146
7.24M
    else if (td->fatal)
147
0
      goto fatal;
148
7.24M
    else {
149
7.24M
      outLength += 3;
150
7.24M
      continue;
151
7.24M
    }
152
153
8.51M
    if (clen > ((srcEnd - src) + bufferLength)) {
154
1.74k
      if (stream)
155
0
        break; // decode to here. remainder saved below
156
157
1.74k
      if (td->fatal)
158
0
        goto fatal;
159
160
1.74k
      outLength += 3;
161
1.74k
      if (!src)    // flush
162
0
        break;
163
1.74k
      continue;
164
1.74k
    }
165
166
8.51M
    utf8[0] = first;
167
27.7M
    for (i = 0; i < clen; i++) {
168
19.2M
      if (i < bufferLength)
169
0
        utf8[i + 1] = buffer[i];
170
19.2M
      else
171
19.2M
        utf8[i + 1] = c_read8(src + i - bufferLength);
172
19.2M
    }
173
174
8.51M
    if (!isLegalUTF8(utf8, clen + 1)) {
175
7.80M
      if (td->fatal)
176
0
        goto fatal;
177
178
7.80M
      uint8_t lower = 0x80, upper = 0xBF;
179
7.80M
      if (0xE0 == first)
180
37.6k
        lower = 0xA0;
181
7.76M
      else if (0xED == first)
182
59.1k
        lower = 0x9F;
183
7.70M
      else if (0xF0 == first)
184
56.7k
        lower = 0x90;
185
7.65M
      else if (0xF4 == first)
186
47.2k
        lower = 0x8F;
187
7.60M
      else if (first > 0xF4)  // no valid next byte
188
4.15M
        clen = 0;
189
190
8.08M
      while (clen-- > 0) {
191
3.90M
        uint8_t c = c_read8(src);
192
3.90M
        if ((lower <= c) && (c <= upper))
193
278k
          src++;
194
3.62M
        else
195
3.62M
          break;
196
3.90M
      }
197
198
7.80M
      outLength += 3;
199
7.80M
      continue;
200
7.80M
    }
201
202
713k
#if mxCESU8
203
713k
    outLength += (3 == clen) ? 6 : (clen + 1);
204
#else
205
    outLength += clen + 1;
206
#endif
207
713k
    if (bufferLength) {
208
0
      if (bufferLength >= clen) {
209
0
        bufferLength -= clen;
210
0
        buffer += clen;
211
0
      }
212
0
      else {
213
0
        src += clen - bufferLength; 
214
0
        bufferLength = 0;
215
0
      }
216
0
    }
217
713k
    else
218
713k
      src += clen;
219
713k
  }
220
221
35.3k
  xsmcSetStringBuffer(xsResult, NULL, outLength + 1);
222
223
35.3k
  if (argc) {
224
35.3k
    xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength);
225
35.3k
    srcEnd = src + srcLength;
226
35.3k
    src += srcOffset;
227
35.3k
  }
228
0
  else
229
0
    src = srcEnd = NULL;
230
231
#if VALIDATE
232
  td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
233
#else
234
35.3k
  td = xsmcGetHostChunk(xsThis);
235
35.3k
#endif
236
35.3k
  buffer = td->buffer;
237
35.3k
  bufferLength = td->bufferLength;
238
239
35.3k
  dst = (uint8_t *)xsmcToString(xsResult);
240
35.3k
  dst3 = td->ignoreBOM ? NULL : (dst + 3);
241
242
193M
  while ((src < srcEnd) || bufferLength) {
243
193M
    unsigned char first, clen, i, firstFromBuffer;
244
193M
    uint8_t utf8[4];
245
246
193M
    if (bufferLength) {
247
0
      bufferLength--;
248
0
      first = *buffer++;
249
0
      firstFromBuffer = 1;
250
0
    }
251
193M
    else {
252
193M
      first = c_read8(src++);
253
193M
      firstFromBuffer = 0;
254
193M
    }
255
193M
    if (first < 0x80) {
256
177M
      if (first)
257
167M
        *dst++ = first;
258
10.3M
      else {
259
10.3M
        *dst++ = 0xC0;
260
10.3M
        *dst++ = 0x80;
261
10.3M
      }
262
177M
      continue;
263
177M
    }
264
265
15.7M
    if (0xC0 == (first & 0xE0))
266
2.29M
      clen = 1;
267
13.4M
    else if (0xE0 == (first & 0xF0))
268
1.74M
      clen = 2;
269
11.7M
    else if (0xF0 == (first & 0xF0))
270
4.47M
      clen = 3;
271
7.24M
    else {
272
7.24M
      *dst++ = 0xEF;
273
7.24M
      *dst++ = 0xBF;
274
7.24M
      *dst++ = 0xBD;
275
7.24M
      continue;
276
7.24M
    }
277
278
8.51M
    if (clen > ((srcEnd - src) + bufferLength)) {
279
1.74k
      if (stream) {
280
        // put back "first". remainder saved below.
281
0
        if (firstFromBuffer) {
282
0
          buffer--;
283
0
          bufferLength++;
284
0
        }
285
0
        else
286
0
          src--;
287
0
        break;
288
        
289
0
      }
290
291
1.74k
      *dst++ = 0xEF;
292
1.74k
      *dst++ = 0xBF;
293
1.74k
      *dst++ = 0xBD;
294
1.74k
      if (!src)
295
0
        break; // flush
296
1.74k
      continue;
297
1.74k
    }
298
299
8.51M
    utf8[0] = first;
300
27.7M
    for (i = 0; i < clen; i++) {
301
19.2M
      if (i < bufferLength)
302
0
        utf8[i + 1] = buffer[i];
303
19.2M
      else
304
19.2M
        utf8[i + 1] = c_read8(src + i - bufferLength);
305
19.2M
    }
306
307
8.51M
    if (!isLegalUTF8(utf8, clen + 1)) {
308
7.80M
      *dst++ = 0xEF;
309
7.80M
      *dst++ = 0xBF;
310
7.80M
      *dst++ = 0xBD;
311
312
7.80M
      uint8_t lower = 0x80, upper = 0xBF;
313
7.80M
      if (0xE0 == first)
314
37.6k
        lower = 0xA0;
315
7.76M
      else if (0xED == first)
316
59.1k
        lower = 0x9F;
317
7.70M
      else if (0xF0 == first)
318
56.7k
        lower = 0x90;
319
7.65M
      else if (0xF4 == first)
320
47.2k
        lower = 0x8F;
321
7.60M
      else if (first > 0xF4)  // no valid next byte
322
4.15M
        clen = 0;
323
324
8.08M
      while (clen-- > 0) {
325
3.90M
        uint8_t c = c_read8(src);
326
3.90M
        if ((lower <= c) && (c <= upper))
327
278k
          src++;
328
3.62M
        else
329
3.62M
          break;
330
3.90M
      }
331
332
7.80M
      continue;
333
7.80M
    }
334
335
713k
#if mxCESU8
336
713k
    if (3 != clen) {
337
667k
      *dst++ = first;
338
1.68M
      for (i = 0; i < clen; i++)
339
1.02M
        *dst++ = utf8[i + 1];
340
667k
    }
341
45.8k
    else {
342
45.8k
      xsIntegerValue c;
343
45.8k
      fxUTF8Decode((xsStringValue)utf8, &c);
344
45.8k
      c -= 0x10000;
345
45.8k
      fxUTF8Encode((xsStringValue)dst, 0xD800 + (c >> 10));
346
45.8k
      dst += 3;
347
45.8k
      fxUTF8Encode((xsStringValue)dst, 0xDC00 + (c & 0x3FF));
348
45.8k
      dst += 3;
349
45.8k
    }
350
#else
351
    *dst++ = first;
352
    for (i = 0; i < clen; i++)
353
      *dst++ = utf8[i + 1];
354
#endif
355
    
356
713k
    if ((0xEF == first) && (dst == dst3)) {
357
272
      if ((0xBF == dst[-1]) && (0xBB == dst[-2]))
358
247
        dst -= 3;
359
272
    }
360
361
713k
    if (bufferLength) {
362
0
      if (bufferLength >= clen) {
363
0
        bufferLength -= clen;
364
0
        buffer += clen;
365
0
      }
366
0
      else {
367
0
        src += clen - bufferLength; 
368
0
        bufferLength = 0;
369
0
      }
370
0
    }
371
713k
    else
372
713k
      src += clen;
373
713k
  }
374
35.3k
  *dst++ = 0;
375
376
35.3k
  if (src) {
377
35.3k
    c_memcpy(td->buffer, buffer, bufferLength);
378
35.3k
    c_memcpy(td->buffer + bufferLength, src, srcEnd - src);
379
35.3k
    td->bufferLength = bufferLength + (srcEnd - src);
380
35.3k
  }
381
0
  else 
382
0
    td->bufferLength =  0;   // flush
383
384
35.3k
  return;
385
386
0
fatal:
387
0
  xsTypeError("invalid utf-8");
388
35.3k
}
389
390
/*
  Getter for TextDecoder.prototype.encoding.
  Always sets the result to "utf-8", the only encoding the constructor accepts.
*/
void xs_textdecoder_get_encoding(xsMachine *the)
{
  xsmcSetString(xsResult, "utf-8");
}
394
395
/*
  Getter for TextDecoder.prototype.ignoreBOM.
  Sets the result to the ignoreBOM flag stored in this instance's host chunk.
*/
void xs_textdecoder_get_ignoreBOM(xsMachine *the)
{
  modTextDecoder decoder;

#if VALIDATE
  decoder = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
#else
  decoder = xsmcGetHostChunk(xsThis);
#endif

  xsmcSetBoolean(xsResult, decoder->ignoreBOM);
}
404
405
/*
  Getter for TextDecoder.prototype.fatal.
  Sets the result to the fatal flag stored in this instance's host chunk.
*/
void xs_textdecoder_get_fatal(xsMachine *the)
{
  modTextDecoder decoder;

#if VALIDATE
  decoder = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor);
#else
  decoder = xsmcGetHostChunk(xsThis);
#endif

  xsmcSetBoolean(xsResult, decoder->fatal);
}
414
415
#if !VALIDATE
416
void modInstallTextDecoder(xsMachine *the)
417
35.3k
{
418
35.3k
  #define kPrototype (0)
419
35.3k
  #define kConstructor (1)
420
35.3k
  #define kScratch (2)
421
422
70.6k
  xsBeginHost(the);
423
70.6k
  xsmcVars(3);
424
425
70.6k
  xsVar(kPrototype) = xsNewHostObject(NULL);
426
70.6k
  xsVar(kConstructor) = xsNewHostConstructor(xs_textdecoder, 2, xsVar(kPrototype));
427
70.6k
  xsmcDefine(xsGlobal, xsID("TextDecoder"), xsVar(kConstructor), xsDontEnum);
428
429
70.6k
  xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_decode, 1);
430
70.6k
  xsmcDefine(xsVar(kPrototype), xsID("decode"), xsVar(kScratch), xsDontEnum);
431
70.6k
  xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_encoding, 0);
432
70.6k
  xsmcDefine(xsVar(kPrototype), xsID("encoding"), xsVar(kScratch), xsIsGetter | xsDontEnum);
433
70.6k
  xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_ignoreBOM, 0);
434
70.6k
  xsmcDefine(xsVar(kPrototype), xsID("ignoreBOM"), xsVar(kScratch), xsIsGetter | xsDontEnum);
435
70.6k
  xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_fatal, 0);
436
70.6k
  xsmcDefine(xsVar(kPrototype), xsID("fatal"), xsVar(kScratch), xsIsGetter | xsDontEnum);
437
438
70.6k
  xsEndHost(the);
439
35.3k
}
440
#endif
441
442
/*
443
 * Copyright 2001-2004 Unicode, Inc.
444
 * 
445
 * Disclaimer
446
 * 
447
 * This source code is provided as is by Unicode, Inc. No claims are
448
 * made as to fitness for any particular purpose. No warranties of any
449
 * kind are expressed or implied. The recipient agrees to determine
450
 * applicability of information provided. If this file has been
451
 * purchased on magnetic or optical media from Unicode, Inc., the
452
 * sole remedy for any claim will be exchange of defective media
453
 * within 90 days of receipt.
454
 * 
455
 * Limitations on Rights to Redistribute This Code
456
 * 
457
 * Unicode, Inc. hereby grants the right to freely use the information
458
 * supplied in this file in the creation of products supporting the
459
 * Unicode Standard, and to make copies of this file in any form
460
 * for internal or external distribution as long as this notice
461
 * remains attached.
462
 */
463
 
464
17.0M
/*
 * Returns true when the `length` bytes at `source` are exactly one
 * well-formed UTF-8 sequence, false otherwise.
 *
 * source - first byte of the candidate sequence (the lead byte)
 * length - total number of bytes in the candidate, lead byte included
 *
 * Rejected: a NULL source, a length outside 1..4, a length that differs from
 * the one the lead byte announces, lead bytes 0x80..0xC1 and above 0xF4, and
 * continuation bytes outside the range their position allows.
 */
static uint8_t isLegalUTF8(const uint8_t *source, int length)
{
    uint8_t lead, second;
    int expected, i;

    if (!source || (length < 1) || (length > 4))
        return false;

    lead = source[0];
    if (lead < 0x80)
        expected = 1;
    else if (lead < 0xC2)
        return false;   /* continuation byte, or overlong lead 0xC0 / 0xC1 */
    else if (lead < 0xE0)
        expected = 2;
    else if (lead < 0xF0)
        expected = 3;
    else if (lead <= 0xF4)
        expected = 4;
    else
        return false;   /* would encode a value above U+10FFFF */

    /* the caller's length must agree with what the lead byte announces */
    if (expected != length)
        return false;
    if (1 == length)
        return true;

    /* the second byte has a narrower range after certain lead bytes */
    second = source[1];
    switch (lead) {
        case 0xE0:
            if ((second < 0xA0) || (second > 0xBF)) return false;   /* overlong */
            break;
        case 0xED:
            if ((second < 0x80) || (second > 0x9F)) return false;   /* surrogates */
            break;
        case 0xF0:
            if ((second < 0x90) || (second > 0xBF)) return false;   /* overlong */
            break;
        case 0xF4:
            if ((second < 0x80) || (second > 0x8F)) return false;   /* above U+10FFFF */
            break;
        default:
            if ((second < 0x80) || (second > 0xBF)) return false;
            break;
    }

    /* any remaining bytes are plain continuation bytes */
    for (i = 2; i < length; i++) {
        if ((source[i] < 0x80) || (source[i] > 0xBF))
            return false;
    }

    return true;
}
488