/src/moddable/modules/data/text/decoder/textdecoder.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2021-2022 Moddable Tech, Inc. |
3 | | * |
4 | | * This file is part of the Moddable SDK Runtime. |
5 | | * |
6 | | * The Moddable SDK Runtime is free software: you can redistribute it and/or modify |
7 | | * it under the terms of the GNU Lesser General Public License as published by |
8 | | * the Free Software Foundation, either version 3 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * The Moddable SDK Runtime is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with the Moddable SDK Runtime. If not, see <http://www.gnu.org/licenses/>. |
18 | | * |
19 | | */ |
20 | | |
21 | | #include "xsmc.h" |
22 | | #include "xsHost.h" |
23 | | #ifdef kPocoRotation |
24 | | // Moddable SDK |
25 | | #include "mc.xs.h" // for xsID_ values |
26 | | |
27 | | #define VALIDATE 1 |
28 | | #else |
29 | | // xst, xsnap, etc |
30 | | #include <stdbool.h> |
31 | | |
32 | | #define xsID_ignoreBOM (xsID("ignoreBOM")) |
33 | | #define xsID_fatal (xsID("fatal")) |
34 | | #define xsID_stream (xsID("stream")) |
35 | | #endif |
36 | | |
/*
	Per-instance TextDecoder state, stored in the instance's host chunk.
*/
typedef struct modTextDecoderRecord {
	uint8_t		ignoreBOM;		// non-zero: a leading BOM is kept in the output
	uint8_t		fatal;			// non-zero: malformed input raises an error instead of being replaced

	// bytes of an unfinished sequence carried over between streaming calls
	uint8_t		bufferLength;	// number of valid bytes in buffer
	uint8_t		buffer[12];
} modTextDecoderRecord;

typedef modTextDecoderRecord *modTextDecoder;
45 | | |
46 | | static uint8_t isLegalUTF8(const uint8_t *source, int length); |
47 | | |
/*
	Host destructor for TextDecoder instances. It releases nothing; in this file
	it is only passed to xsmcGetHostChunkValidate when VALIDATE is set.
*/
void xs_textdecoder_destructor(void *data)
{
	(void)data;		// intentionally unused
}
51 | | |
52 | | void xs_textdecoder(xsMachine *the) |
53 | 27.8k | { |
54 | 27.8k | modTextDecoderRecord decoder; |
55 | 27.8k | int argc = xsmcArgc; |
56 | | |
57 | 27.8k | if (argc && c_strcmp(xsmcToString(xsArg(0)), "utf-8")) |
58 | 0 | xsRangeError("unsupported encoding"); |
59 | | |
60 | 27.8k | #if !VALIDATE |
61 | 27.8k | xsmcGet(xsResult, xsTarget, xsID("prototype")); |
62 | 27.8k | xsResult = xsNewHostInstance(xsResult); |
63 | 27.8k | xsThis = xsResult; |
64 | 27.8k | xsmcSetHostDestructor(xsThis, NULL); |
65 | 27.8k | c_memset(&decoder, 0, sizeof(decoder)); |
66 | 27.8k | #endif |
67 | | |
68 | 27.8k | decoder.ignoreBOM = false; |
69 | 27.8k | decoder.fatal = false; |
70 | 27.8k | decoder.bufferLength = 0; |
71 | 27.8k | if (argc >= 2) { |
72 | 0 | xsmcVars(1); |
73 | |
|
74 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_ignoreBOM); |
75 | 0 | decoder.ignoreBOM = xsmcTest(xsVar(0)); |
76 | |
|
77 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_fatal); |
78 | 0 | decoder.fatal = xsmcTest(xsVar(0)); |
79 | 0 | } |
80 | | |
81 | 27.8k | xsmcSetHostChunk(xsThis, &decoder, sizeof(decoder)); |
82 | 27.8k | } |
83 | | |
84 | | /* |
85 | | UTF-8 BOM is sequence 0xEF,0xBB,0xBF |
86 | | Replacement character sequence in UTF-8 is 0xEF 0xBF 0xBD |
87 | | null character maps to 0xC0, 0x80 |
88 | | |
89 | | implementation overallocates by 3 bytes if BOM is present and ignoreBOM is false |
90 | | */ |
91 | | |
92 | | void xs_textdecoder_decode(xsMachine *the) |
93 | 27.8k | { |
94 | 27.8k | uint8_t *src, *srcEnd, *dst, *dst3; |
95 | 27.8k | uint8_t *buffer; |
96 | 27.8k | xsUnsignedValue srcLength, bufferLength; |
97 | 27.8k | modTextDecoder td; |
98 | 27.8k | uint8_t srcOffset = 0; |
99 | 27.8k | uint32_t outLength = 0; |
100 | 27.8k | uint8_t stream = 0; |
101 | 27.8k | int argc = xsmcArgc; |
102 | | |
103 | 27.8k | if (argc > 1) { |
104 | 0 | xsmcVars(1); |
105 | |
|
106 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_stream); |
107 | 0 | stream = xsmcToBoolean(xsVar(0)); |
108 | 0 | } |
109 | | |
110 | 27.8k | if (argc) { |
111 | 27.8k | xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength); |
112 | 27.8k | srcEnd = src + srcLength; |
113 | 27.8k | } |
114 | 0 | else |
115 | 0 | src = srcEnd = NULL; |
116 | | |
117 | | #if VALIDATE |
118 | | td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
119 | | #else |
120 | 27.8k | td = xsmcGetHostChunk(xsThis); |
121 | 27.8k | #endif |
122 | 27.8k | buffer = td->buffer; |
123 | 27.8k | bufferLength = td->bufferLength; |
124 | | |
125 | 179M | while ((src < srcEnd) || bufferLength) { |
126 | 179M | unsigned char first, clen, i; |
127 | 179M | uint8_t utf8[4]; |
128 | | |
129 | 179M | if (bufferLength) { |
130 | 0 | bufferLength--; |
131 | 0 | first = *buffer++; |
132 | 0 | } |
133 | 179M | else |
134 | 179M | first = c_read8(src++); |
135 | 179M | if (first < 0x80) { |
136 | 168M | outLength += (0 == first) ? 2 : 1; |
137 | 168M | continue; |
138 | 168M | } |
139 | | |
140 | 10.9M | if (0xC0 == (first & 0xE0)) |
141 | 1.71M | clen = 1; |
142 | 9.27M | else if (0xE0 == (first & 0xF0)) |
143 | 882k | clen = 2; |
144 | 8.38M | else if (0xF0 == (first & 0xF0)) |
145 | 3.42M | clen = 3; |
146 | 4.96M | else if (td->fatal) |
147 | 0 | goto fatal; |
148 | 4.96M | else { |
149 | 4.96M | outLength += 3; |
150 | 4.96M | continue; |
151 | 4.96M | } |
152 | | |
153 | 6.02M | if (clen > ((srcEnd - src) + bufferLength)) { |
154 | 1.16k | if (stream) |
155 | 0 | break; // decode to here. remainder saved below |
156 | | |
157 | 1.16k | if (td->fatal) |
158 | 0 | goto fatal; |
159 | | |
160 | 1.16k | outLength += 3; |
161 | 1.16k | if (!src) // flush |
162 | 0 | break; |
163 | 1.16k | continue; |
164 | 1.16k | } |
165 | | |
166 | 6.02M | utf8[0] = first; |
167 | 19.7M | for (i = 0; i < clen; i++) { |
168 | 13.7M | if (i < bufferLength) |
169 | 0 | utf8[i + 1] = buffer[i]; |
170 | 13.7M | else |
171 | 13.7M | utf8[i + 1] = c_read8(src + i - bufferLength); |
172 | 13.7M | } |
173 | | |
174 | 6.02M | if (!isLegalUTF8(utf8, clen + 1)) { |
175 | 5.54M | if (td->fatal) |
176 | 0 | goto fatal; |
177 | | |
178 | 5.54M | uint8_t lower = 0x80, upper = 0xBF; |
179 | 5.54M | if (0xE0 == first) |
180 | 16.4k | lower = 0xA0; |
181 | 5.52M | else if (0xED == first) |
182 | 26.9k | lower = 0x9F; |
183 | 5.49M | else if (0xF0 == first) |
184 | 23.2k | lower = 0x90; |
185 | 5.47M | else if (0xF4 == first) |
186 | 20.8k | lower = 0x8F; |
187 | 5.45M | else if (first > 0xF4) // no valid next byte |
188 | 3.25M | clen = 0; |
189 | | |
190 | 5.70M | while (clen-- > 0) { |
191 | 2.43M | uint8_t c = c_read8(src); |
192 | 2.43M | if ((lower <= c) && (c <= upper)) |
193 | 161k | src++; |
194 | 2.27M | else |
195 | 2.27M | break; |
196 | 2.43M | } |
197 | | |
198 | 5.54M | outLength += 3; |
199 | 5.54M | continue; |
200 | 5.54M | } |
201 | | |
202 | 478k | #if mxCESU8 |
203 | 478k | outLength += (3 == clen) ? 6 : (clen + 1); |
204 | | #else |
205 | | outLength += clen + 1; |
206 | | #endif |
207 | 478k | if (bufferLength) { |
208 | 0 | if (bufferLength >= clen) { |
209 | 0 | bufferLength -= clen; |
210 | 0 | buffer += clen; |
211 | 0 | } |
212 | 0 | else { |
213 | 0 | src += clen - bufferLength; |
214 | 0 | bufferLength = 0; |
215 | 0 | } |
216 | 0 | } |
217 | 478k | else |
218 | 478k | src += clen; |
219 | 478k | } |
220 | | |
221 | 27.8k | xsmcSetStringBuffer(xsResult, NULL, outLength + 1); |
222 | | |
223 | 27.8k | if (argc) { |
224 | 27.8k | xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength); |
225 | 27.8k | srcEnd = src + srcLength; |
226 | 27.8k | src += srcOffset; |
227 | 27.8k | } |
228 | 0 | else |
229 | 0 | src = srcEnd = NULL; |
230 | | |
231 | | #if VALIDATE |
232 | | td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
233 | | #else |
234 | 27.8k | td = xsmcGetHostChunk(xsThis); |
235 | 27.8k | #endif |
236 | 27.8k | buffer = td->buffer; |
237 | 27.8k | bufferLength = td->bufferLength; |
238 | | |
239 | 27.8k | dst = (uint8_t *)xsmcToString(xsResult); |
240 | 27.8k | dst3 = td->ignoreBOM ? NULL : (dst + 3); |
241 | | |
242 | 179M | while ((src < srcEnd) || bufferLength) { |
243 | 179M | unsigned char first, clen, i, firstFromBuffer; |
244 | 179M | uint8_t utf8[4]; |
245 | | |
246 | 179M | if (bufferLength) { |
247 | 0 | bufferLength--; |
248 | 0 | first = *buffer++; |
249 | 0 | firstFromBuffer = 1; |
250 | 0 | } |
251 | 179M | else { |
252 | 179M | first = c_read8(src++); |
253 | 179M | firstFromBuffer = 0; |
254 | 179M | } |
255 | 179M | if (first < 0x80) { |
256 | 168M | if (first) |
257 | 160M | *dst++ = first; |
258 | 7.43M | else { |
259 | 7.43M | *dst++ = 0xC0; |
260 | 7.43M | *dst++ = 0x80; |
261 | 7.43M | } |
262 | 168M | continue; |
263 | 168M | } |
264 | | |
265 | 10.9M | if (0xC0 == (first & 0xE0)) |
266 | 1.71M | clen = 1; |
267 | 9.27M | else if (0xE0 == (first & 0xF0)) |
268 | 882k | clen = 2; |
269 | 8.38M | else if (0xF0 == (first & 0xF0)) |
270 | 3.42M | clen = 3; |
271 | 4.96M | else { |
272 | 4.96M | *dst++ = 0xEF; |
273 | 4.96M | *dst++ = 0xBF; |
274 | 4.96M | *dst++ = 0xBD; |
275 | 4.96M | continue; |
276 | 4.96M | } |
277 | | |
278 | 6.02M | if (clen > ((srcEnd - src) + bufferLength)) { |
279 | 1.16k | if (stream) { |
280 | | // put back "first". remainder saved below. |
281 | 0 | if (firstFromBuffer) { |
282 | 0 | buffer--; |
283 | 0 | bufferLength++; |
284 | 0 | } |
285 | 0 | else |
286 | 0 | src--; |
287 | 0 | break; |
288 | | |
289 | 0 | } |
290 | | |
291 | 1.16k | *dst++ = 0xEF; |
292 | 1.16k | *dst++ = 0xBF; |
293 | 1.16k | *dst++ = 0xBD; |
294 | 1.16k | if (!src) |
295 | 0 | break; // flush |
296 | 1.16k | continue; |
297 | 1.16k | } |
298 | | |
299 | 6.02M | utf8[0] = first; |
300 | 19.7M | for (i = 0; i < clen; i++) { |
301 | 13.7M | if (i < bufferLength) |
302 | 0 | utf8[i + 1] = buffer[i]; |
303 | 13.7M | else |
304 | 13.7M | utf8[i + 1] = c_read8(src + i - bufferLength); |
305 | 13.7M | } |
306 | | |
307 | 6.02M | if (!isLegalUTF8(utf8, clen + 1)) { |
308 | 5.54M | *dst++ = 0xEF; |
309 | 5.54M | *dst++ = 0xBF; |
310 | 5.54M | *dst++ = 0xBD; |
311 | | |
312 | 5.54M | uint8_t lower = 0x80, upper = 0xBF; |
313 | 5.54M | if (0xE0 == first) |
314 | 16.4k | lower = 0xA0; |
315 | 5.52M | else if (0xED == first) |
316 | 26.9k | lower = 0x9F; |
317 | 5.49M | else if (0xF0 == first) |
318 | 23.2k | lower = 0x90; |
319 | 5.47M | else if (0xF4 == first) |
320 | 20.8k | lower = 0x8F; |
321 | 5.45M | else if (first > 0xF4) // no valid next byte |
322 | 3.25M | clen = 0; |
323 | | |
324 | 5.70M | while (clen-- > 0) { |
325 | 2.43M | uint8_t c = c_read8(src); |
326 | 2.43M | if ((lower <= c) && (c <= upper)) |
327 | 161k | src++; |
328 | 2.27M | else |
329 | 2.27M | break; |
330 | 2.43M | } |
331 | | |
332 | 5.54M | continue; |
333 | 5.54M | } |
334 | | |
335 | 478k | #if mxCESU8 |
336 | 478k | if (3 != clen) { |
337 | 447k | *dst++ = first; |
338 | 1.18M | for (i = 0; i < clen; i++) |
339 | 736k | *dst++ = utf8[i + 1]; |
340 | 447k | } |
341 | 31.0k | else { |
342 | 31.0k | xsIntegerValue c; |
343 | 31.0k | fxUTF8Decode((xsStringValue)utf8, &c); |
344 | 31.0k | c -= 0x10000; |
345 | 31.0k | fxUTF8Encode((xsStringValue)dst, 0xD800 + (c >> 10)); |
346 | 31.0k | dst += 3; |
347 | 31.0k | fxUTF8Encode((xsStringValue)dst, 0xDC00 + (c & 0x3FF)); |
348 | 31.0k | dst += 3; |
349 | 31.0k | } |
350 | | #else |
351 | | *dst++ = first; |
352 | | for (i = 0; i < clen; i++) |
353 | | *dst++ = utf8[i + 1]; |
354 | | #endif |
355 | | |
356 | 478k | if ((0xEF == first) && (dst == dst3)) { |
357 | 266 | if ((0xBF == dst[-1]) && (0xBB == dst[-2])) |
358 | 239 | dst -= 3; |
359 | 266 | } |
360 | | |
361 | 478k | if (bufferLength) { |
362 | 0 | if (bufferLength >= clen) { |
363 | 0 | bufferLength -= clen; |
364 | 0 | buffer += clen; |
365 | 0 | } |
366 | 0 | else { |
367 | 0 | src += clen - bufferLength; |
368 | 0 | bufferLength = 0; |
369 | 0 | } |
370 | 0 | } |
371 | 478k | else |
372 | 478k | src += clen; |
373 | 478k | } |
374 | 27.8k | *dst++ = 0; |
375 | | |
376 | 27.8k | if (src) { |
377 | 27.8k | c_memcpy(td->buffer, buffer, bufferLength); |
378 | 27.8k | c_memcpy(td->buffer + bufferLength, src, srcEnd - src); |
379 | 27.8k | td->bufferLength = bufferLength + (srcEnd - src); |
380 | 27.8k | } |
381 | 0 | else |
382 | 0 | td->bufferLength = 0; // flush |
383 | | |
384 | 27.8k | return; |
385 | | |
386 | 0 | fatal: |
387 | 0 | xsTypeError("invalid utf-8"); |
388 | 27.8k | } |
389 | | |
390 | | void xs_textdecoder_get_encoding(xsMachine *the) |
391 | 0 | { |
392 | 0 | xsmcSetString(xsResult, "utf-8"); |
393 | 0 | } |
394 | | |
395 | | void xs_textdecoder_get_ignoreBOM(xsMachine *the) |
396 | 0 | { |
397 | | #if VALIDATE |
398 | | modTextDecoder td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
399 | | #else |
400 | 0 | modTextDecoder td = xsmcGetHostChunk(xsThis); |
401 | 0 | #endif |
402 | 0 | xsmcSetBoolean(xsResult, td->ignoreBOM); |
403 | 0 | } |
404 | | |
405 | | void xs_textdecoder_get_fatal(xsMachine *the) |
406 | 0 | { |
407 | | #if VALIDATE |
408 | | modTextDecoder td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
409 | | #else |
410 | 0 | modTextDecoder td = xsmcGetHostChunk(xsThis); |
411 | 0 | #endif |
412 | 0 | xsmcSetBoolean(xsResult, td->fatal); |
413 | 0 | } |
414 | | |
415 | | #if !VALIDATE |
416 | | void modInstallTextDecoder(xsMachine *the) |
417 | 27.8k | { |
418 | 27.8k | #define kPrototype (0) |
419 | 27.8k | #define kConstructor (1) |
420 | 27.8k | #define kScratch (2) |
421 | | |
422 | 55.7k | xsBeginHost(the); |
423 | 55.7k | xsmcVars(3); |
424 | | |
425 | 55.7k | xsVar(kPrototype) = xsNewHostObject(NULL); |
426 | 55.7k | xsVar(kConstructor) = xsNewHostConstructor(xs_textdecoder, 2, xsVar(kPrototype)); |
427 | 55.7k | xsmcDefine(xsGlobal, xsID("TextDecoder"), xsVar(kConstructor), xsDontEnum); |
428 | | |
429 | 55.7k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_decode, 1); |
430 | 55.7k | xsmcDefine(xsVar(kPrototype), xsID("decode"), xsVar(kScratch), xsDontEnum); |
431 | 55.7k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_encoding, 0); |
432 | 55.7k | xsmcDefine(xsVar(kPrototype), xsID("encoding"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
433 | 55.7k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_ignoreBOM, 0); |
434 | 55.7k | xsmcDefine(xsVar(kPrototype), xsID("ignoreBOM"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
435 | 55.7k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_fatal, 0); |
436 | 55.7k | xsmcDefine(xsVar(kPrototype), xsID("fatal"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
437 | | |
438 | 55.7k | xsEndHost(the); |
439 | 27.8k | } |
440 | | #endif |
441 | | |
442 | | /* |
443 | | * Copyright 2001-2004 Unicode, Inc. |
444 | | * |
445 | | * Disclaimer |
446 | | * |
447 | | * This source code is provided as is by Unicode, Inc. No claims are |
448 | | * made as to fitness for any particular purpose. No warranties of any |
449 | | * kind are expressed or implied. The recipient agrees to determine |
450 | | * applicability of information provided. If this file has been |
451 | | * purchased on magnetic or optical media from Unicode, Inc., the |
452 | | * sole remedy for any claim will be exchange of defective media |
453 | | * within 90 days of receipt. |
454 | | * |
455 | | * Limitations on Rights to Redistribute This Code |
456 | | * |
457 | | * Unicode, Inc. hereby grants the right to freely use the information |
458 | | * supplied in this file in the creation of products supporting the |
459 | | * Unicode Standard, and to make copies of this file in any form |
460 | | * for internal or external distribution as long as this notice |
461 | | * remains attached. |
462 | | */ |
463 | | |
/*
	Returns true when source[0..length-1] is one well-formed UTF-8 sequence.

	source - first byte of the candidate sequence
	length - number of bytes in the sequence (1 to 4); any other value is
		rejected without reading source

	Checks performed:
	- third and fourth bytes, when present, must be 0x80..0xBF
	- the second byte must be 0x80..0xBF, narrowed for lead bytes 0xE0 and
	  0xF0 (overlong forms), 0xED (surrogates) and 0xF4 (above U+10FFFF).
	  Both ends of the range are enforced for every lead byte, so a second
	  byte below 0x80 after 0xED or 0xF4 is rejected.
	- the lead byte must not be 0x80..0xC1 or above 0xF4

	The caller is expected to pass the length implied by the lead byte; this
	function does not check that the two agree.
*/
static uint8_t isLegalUTF8(const uint8_t *source, int length)
{
	uint8_t lead;
	int i;

	if ((length < 1) || (length > 4))
		return false;

	lead = source[0];

	// trailing bytes beyond the second are plain continuation bytes
	for (i = length - 1; i >= 2; i--) {
		if ((source[i] < 0x80) || (source[i] > 0xBF))
			return false;
	}

	if (length >= 2) {
		uint8_t lower = 0x80, upper = 0xBF;

		switch (lead) {
			case 0xE0: lower = 0xA0; break;
			case 0xED: upper = 0x9F; break;
			case 0xF0: lower = 0x90; break;
			case 0xF4: upper = 0x8F; break;
			default: break;
		}
		if ((source[1] < lower) || (source[1] > upper))
			return false;
	}

	if ((lead >= 0x80) && (lead < 0xC2))
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
488 | | |