/src/moddable/modules/data/text/decoder/textdecoder.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2021-2022 Moddable Tech, Inc. |
3 | | * |
4 | | * This file is part of the Moddable SDK Runtime. |
5 | | * |
6 | | * The Moddable SDK Runtime is free software: you can redistribute it and/or modify |
7 | | * it under the terms of the GNU Lesser General Public License as published by |
8 | | * the Free Software Foundation, either version 3 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * The Moddable SDK Runtime is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with the Moddable SDK Runtime. If not, see <http://www.gnu.org/licenses/>. |
18 | | * |
19 | | */ |
20 | | |
21 | | #include "xsmc.h" |
22 | | #include "xsHost.h" |
23 | | #ifdef kPocoRotation |
24 | | // Moddable SDK |
25 | | #include "mc.xs.h" // for xsID_ values |
26 | | |
27 | | #define VALIDATE 1 |
28 | | #else |
29 | | // xst, xsnap, etc |
30 | | #include <stdbool.h> |
31 | | |
32 | | #define xsID_ignoreBOM (xsID("ignoreBOM")) |
33 | | #define xsID_fatal (xsID("fatal")) |
34 | | #define xsID_stream (xsID("stream")) |
35 | | #endif |
36 | | |
/*
	State kept for each TextDecoder instance. The constructor copies this
	record into the instance's host chunk and decode() reads it back.
*/
typedef struct {
	uint8_t		ignoreBOM;		// set from the "ignoreBOM" constructor option
	uint8_t		fatal;			// set from the "fatal" constructor option

	// left over when streaming
	uint8_t		bufferLength;	// number of bytes of buffer in use
	uint8_t		buffer[12];		// input bytes carried over to the next decode() call
} modTextDecoderRecord, *modTextDecoder;
45 | | |
46 | | static uint8_t isLegalUTF8(const uint8_t *source, int length); |
47 | | |
/*
	Host destructor for TextDecoder instances. It does nothing; its address is
	what the VALIDATE builds pass to xsmcGetHostChunkValidate.
*/
void xs_textdecoder_destructor(void *data)
{
	(void)data;		// intentionally unused
}
51 | | |
52 | | void xs_textdecoder(xsMachine *the) |
53 | 31.7k | { |
54 | 31.7k | modTextDecoderRecord decoder; |
55 | 31.7k | int argc = xsmcArgc; |
56 | | |
57 | 31.7k | if (argc && c_strcmp(xsmcToString(xsArg(0)), "utf-8")) |
58 | 1 | xsRangeError("unsupported encoding"); |
59 | | |
60 | 31.7k | #if !VALIDATE |
61 | 31.7k | xsmcGet(xsResult, xsTarget, xsID("prototype")); |
62 | 31.7k | xsResult = xsNewHostInstance(xsResult); |
63 | 31.7k | xsThis = xsResult; |
64 | 31.7k | xsmcSetHostDestructor(xsThis, NULL); |
65 | 31.7k | c_memset(&decoder, 0, sizeof(decoder)); |
66 | 31.7k | #endif |
67 | | |
68 | 31.7k | decoder.ignoreBOM = false; |
69 | 31.7k | decoder.fatal = false; |
70 | 31.7k | decoder.bufferLength = 0; |
71 | 31.7k | if (argc >= 2) { |
72 | 0 | xsmcVars(1); |
73 | |
|
74 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_ignoreBOM); |
75 | 0 | decoder.ignoreBOM = xsmcTest(xsVar(0)); |
76 | |
|
77 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_fatal); |
78 | 0 | decoder.fatal = xsmcTest(xsVar(0)); |
79 | 0 | } |
80 | | |
81 | 31.7k | xsmcSetHostChunk(xsThis, &decoder, sizeof(decoder)); |
82 | 31.7k | } |
83 | | |
84 | | /* |
85 | | UTF-8 BOM is sequence 0xEF,0xBB,0xBF |
86 | | Replacement character sequence in UTF-8 is 0xEF 0xBF 0xBD |
87 | | null character maps to 0xC0, 0x80 |
88 | | |
89 | | implementation overallocates by 3 bytes if BOM is present and ignoreBOM is false |
90 | | */ |
91 | | |
92 | | void xs_textdecoder_decode(xsMachine *the) |
93 | 31.7k | { |
94 | 31.7k | uint8_t *src, *srcEnd, *dst, *dst3; |
95 | 31.7k | uint8_t *buffer; |
96 | 31.7k | xsUnsignedValue srcLength, bufferLength; |
97 | 31.7k | modTextDecoder td; |
98 | 31.7k | uint8_t srcOffset = 0; |
99 | 31.7k | uint32_t outLength = 0; |
100 | 31.7k | uint8_t stream = 0; |
101 | 31.7k | int argc = xsmcArgc; |
102 | | |
103 | 31.7k | if (argc > 1) { |
104 | 0 | xsmcVars(1); |
105 | |
|
106 | 0 | xsmcGet(xsVar(0), xsArg(1), xsID_stream); |
107 | 0 | stream = xsmcToBoolean(xsVar(0)); |
108 | 0 | } |
109 | | |
110 | 31.7k | if (argc) { |
111 | 31.7k | xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength); |
112 | 31.7k | srcEnd = src + srcLength; |
113 | 31.7k | } |
114 | 0 | else |
115 | 0 | src = srcEnd = NULL; |
116 | | |
117 | | #if VALIDATE |
118 | | td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
119 | | #else |
120 | 31.7k | td = xsmcGetHostChunk(xsThis); |
121 | 31.7k | #endif |
122 | 31.7k | buffer = td->buffer; |
123 | 31.7k | bufferLength = td->bufferLength; |
124 | | |
125 | 156M | while ((src < srcEnd) || bufferLength) { |
126 | 156M | unsigned char first, clen, i; |
127 | 156M | uint8_t utf8[4]; |
128 | | |
129 | 156M | if (bufferLength) { |
130 | 0 | bufferLength--; |
131 | 0 | first = *buffer++; |
132 | 0 | } |
133 | 156M | else |
134 | 156M | first = c_read8(src++); |
135 | 156M | if (first < 0x80) { |
136 | 141M | outLength += (0 == first) ? 2 : 1; |
137 | 141M | continue; |
138 | 141M | } |
139 | | |
140 | 14.1M | if (0xC0 == (first & 0xE0)) |
141 | 1.39M | clen = 1; |
142 | 12.7M | else if (0xE0 == (first & 0xF0)) |
143 | 588k | clen = 2; |
144 | 12.1M | else if (0xF0 == (first & 0xF0)) |
145 | 3.76M | clen = 3; |
146 | 8.42M | else if (td->fatal) |
147 | 0 | goto fatal; |
148 | 8.42M | else { |
149 | 8.42M | outLength += 3; |
150 | 8.42M | continue; |
151 | 8.42M | } |
152 | | |
153 | 5.74M | if (clen > ((srcEnd - src) + bufferLength)) { |
154 | 1.13k | if (stream) |
155 | 0 | break; // decode to here. remainder saved below |
156 | | |
157 | 1.13k | if (td->fatal) |
158 | 0 | goto fatal; |
159 | | |
160 | 1.13k | outLength += 3; |
161 | 1.13k | if (!src) // flush |
162 | 0 | break; |
163 | 1.13k | continue; |
164 | 1.13k | } |
165 | | |
166 | 5.74M | utf8[0] = first; |
167 | 19.5M | for (i = 0; i < clen; i++) { |
168 | 13.8M | if (i < bufferLength) |
169 | 0 | utf8[i + 1] = buffer[i]; |
170 | 13.8M | else |
171 | 13.8M | utf8[i + 1] = c_read8(src + i - bufferLength); |
172 | 13.8M | } |
173 | | |
174 | 5.74M | if (!isLegalUTF8(utf8, clen + 1)) { |
175 | 5.40M | if (td->fatal) |
176 | 0 | goto fatal; |
177 | | |
178 | 5.40M | uint8_t lower = 0x80, upper = 0xBF; |
179 | 5.40M | if (0xE0 == first) |
180 | 11.8k | lower = 0xA0; |
181 | 5.39M | else if (0xED == first) |
182 | 36.0k | lower = 0x9F; |
183 | 5.35M | else if (0xF0 == first) |
184 | 25.7k | lower = 0x90; |
185 | 5.33M | else if (0xF4 == first) |
186 | 11.4k | lower = 0x8F; |
187 | 5.32M | else if (first > 0xF4) // no valid next byte |
188 | 3.63M | clen = 0; |
189 | | |
190 | 5.40M | const uint8_t *s = &utf8[1]; |
191 | 5.48M | while (clen-- > 0) { |
192 | 1.84M | uint8_t c = *s++; |
193 | 1.84M | if ((lower <= c) && (c <= upper)) { |
194 | 82.0k | if (bufferLength) { |
195 | 0 | bufferLength--; |
196 | 0 | buffer++; |
197 | 0 | } |
198 | 82.0k | else |
199 | 82.0k | src++; |
200 | 82.0k | } |
201 | 1.76M | else |
202 | 1.76M | break; |
203 | 1.84M | } |
204 | | |
205 | 5.40M | outLength += 3; |
206 | 5.40M | continue; |
207 | 5.40M | } |
208 | | |
209 | 334k | #if mxCESU8 |
210 | 334k | outLength += (3 == clen) ? 6 : (clen + 1); |
211 | | #else |
212 | | outLength += clen + 1; |
213 | | #endif |
214 | 334k | if (bufferLength) { |
215 | 0 | if (bufferLength >= clen) { |
216 | 0 | bufferLength -= clen; |
217 | 0 | buffer += clen; |
218 | 0 | } |
219 | 0 | else { |
220 | 0 | src += clen - bufferLength; |
221 | 0 | bufferLength = 0; |
222 | 0 | } |
223 | 0 | } |
224 | 334k | else |
225 | 334k | src += clen; |
226 | 334k | } |
227 | | |
228 | 31.7k | xsmcSetStringBuffer(xsResult, NULL, outLength + 1); |
229 | | |
230 | 31.7k | if (argc) { |
231 | 31.7k | xsmcGetBufferReadable(xsArg(0), (void **)&src, &srcLength); |
232 | 31.7k | srcEnd = src + srcLength; |
233 | 31.7k | src += srcOffset; |
234 | 31.7k | } |
235 | 0 | else |
236 | 0 | src = srcEnd = NULL; |
237 | | |
238 | | #if VALIDATE |
239 | | td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
240 | | #else |
241 | 31.7k | td = xsmcGetHostChunk(xsThis); |
242 | 31.7k | #endif |
243 | 31.7k | buffer = td->buffer; |
244 | 31.7k | bufferLength = td->bufferLength; |
245 | | |
246 | 31.7k | dst = (uint8_t *)xsmcToString(xsResult); |
247 | 31.7k | dst3 = td->ignoreBOM ? NULL : (dst + 3); |
248 | | |
249 | 156M | while ((src < srcEnd) || bufferLength) { |
250 | 156M | unsigned char first, clen, i, firstFromBuffer; |
251 | 156M | uint8_t utf8[4]; |
252 | | |
253 | 156M | if (bufferLength) { |
254 | 0 | bufferLength--; |
255 | 0 | first = *buffer++; |
256 | 0 | firstFromBuffer = 1; |
257 | 0 | } |
258 | 156M | else { |
259 | 156M | first = c_read8(src++); |
260 | 156M | firstFromBuffer = 0; |
261 | 156M | } |
262 | 156M | if (first < 0x80) { |
263 | 141M | if (first) |
264 | 136M | *dst++ = first; |
265 | 5.26M | else { |
266 | 5.26M | *dst++ = 0xC0; |
267 | 5.26M | *dst++ = 0x80; |
268 | 5.26M | } |
269 | 141M | continue; |
270 | 141M | } |
271 | | |
272 | 14.1M | if (0xC0 == (first & 0xE0)) |
273 | 1.39M | clen = 1; |
274 | 12.7M | else if (0xE0 == (first & 0xF0)) |
275 | 588k | clen = 2; |
276 | 12.1M | else if (0xF0 == (first & 0xF0)) |
277 | 3.76M | clen = 3; |
278 | 8.42M | else { |
279 | 8.42M | *dst++ = 0xEF; |
280 | 8.42M | *dst++ = 0xBF; |
281 | 8.42M | *dst++ = 0xBD; |
282 | 8.42M | continue; |
283 | 8.42M | } |
284 | | |
285 | 5.74M | if (clen > ((srcEnd - src) + bufferLength)) { |
286 | 1.13k | if (stream) { |
287 | | // put back "first". remainder saved below. |
288 | 0 | if (firstFromBuffer) { |
289 | 0 | buffer--; |
290 | 0 | bufferLength++; |
291 | 0 | } |
292 | 0 | else |
293 | 0 | src--; |
294 | 0 | break; |
295 | | |
296 | 0 | } |
297 | | |
298 | 1.13k | *dst++ = 0xEF; |
299 | 1.13k | *dst++ = 0xBF; |
300 | 1.13k | *dst++ = 0xBD; |
301 | 1.13k | if (!src) |
302 | 0 | break; // flush |
303 | 1.13k | continue; |
304 | 1.13k | } |
305 | | |
306 | 5.74M | utf8[0] = first; |
307 | 19.5M | for (i = 0; i < clen; i++) { |
308 | 13.8M | if (i < bufferLength) |
309 | 0 | utf8[i + 1] = buffer[i]; |
310 | 13.8M | else |
311 | 13.8M | utf8[i + 1] = c_read8(src + i - bufferLength); |
312 | 13.8M | } |
313 | | |
314 | 5.74M | if (!isLegalUTF8(utf8, clen + 1)) { |
315 | 5.40M | *dst++ = 0xEF; |
316 | 5.40M | *dst++ = 0xBF; |
317 | 5.40M | *dst++ = 0xBD; |
318 | | |
319 | 5.40M | uint8_t lower = 0x80, upper = 0xBF; |
320 | 5.40M | if (0xE0 == first) |
321 | 11.8k | lower = 0xA0; |
322 | 5.39M | else if (0xED == first) |
323 | 36.0k | lower = 0x9F; |
324 | 5.35M | else if (0xF0 == first) |
325 | 25.7k | lower = 0x90; |
326 | 5.33M | else if (0xF4 == first) |
327 | 11.4k | lower = 0x8F; |
328 | 5.32M | else if (first > 0xF4) // no valid next byte |
329 | 3.63M | clen = 0; |
330 | | |
331 | 5.40M | const uint8_t *s = &utf8[1]; |
332 | 5.48M | while (clen-- > 0) { |
333 | 1.84M | uint8_t c = *s++; |
334 | 1.84M | if ((lower <= c) && (c <= upper)) { |
335 | 82.0k | if (bufferLength) { |
336 | 0 | bufferLength--; |
337 | 0 | buffer++; |
338 | 0 | } |
339 | 82.0k | else |
340 | 82.0k | src++; |
341 | 82.0k | } |
342 | 1.76M | else |
343 | 1.76M | break; |
344 | 1.84M | } |
345 | | |
346 | 5.40M | continue; |
347 | 5.40M | } |
348 | | |
349 | 334k | #if mxCESU8 |
350 | 334k | if (3 != clen) { |
351 | 296k | *dst++ = first; |
352 | 792k | for (i = 0; i < clen; i++) |
353 | 496k | *dst++ = utf8[i + 1]; |
354 | 296k | } |
355 | 38.4k | else { |
356 | 38.4k | xsIntegerValue c; |
357 | 38.4k | fxUTF8Decode((xsStringValue)utf8, &c); |
358 | 38.4k | c -= 0x10000; |
359 | 38.4k | fxUTF8Encode((xsStringValue)dst, 0xD800 + (c >> 10)); |
360 | 38.4k | dst += 3; |
361 | 38.4k | fxUTF8Encode((xsStringValue)dst, 0xDC00 + (c & 0x3FF)); |
362 | 38.4k | dst += 3; |
363 | 38.4k | } |
364 | | #else |
365 | | *dst++ = first; |
366 | | for (i = 0; i < clen; i++) |
367 | | *dst++ = utf8[i + 1]; |
368 | | #endif |
369 | | |
370 | 334k | if ((0xEF == first) && (dst == dst3)) { |
371 | 263 | if ((0xBF == dst[-1]) && (0xBB == dst[-2])) |
372 | 241 | dst -= 3; |
373 | 263 | } |
374 | | |
375 | 334k | if (bufferLength) { |
376 | 0 | if (bufferLength >= clen) { |
377 | 0 | bufferLength -= clen; |
378 | 0 | buffer += clen; |
379 | 0 | } |
380 | 0 | else { |
381 | 0 | src += clen - bufferLength; |
382 | 0 | bufferLength = 0; |
383 | 0 | } |
384 | 0 | } |
385 | 334k | else |
386 | 334k | src += clen; |
387 | 334k | } |
388 | 31.7k | *dst++ = 0; |
389 | | |
390 | 31.7k | if (src) { |
391 | 31.7k | c_memcpy(td->buffer, buffer, bufferLength); |
392 | 31.7k | c_memcpy(td->buffer + bufferLength, src, srcEnd - src); |
393 | 31.7k | td->bufferLength = bufferLength + (srcEnd - src); |
394 | 31.7k | } |
395 | 0 | else |
396 | 0 | td->bufferLength = 0; // flush |
397 | | |
398 | 31.7k | return; |
399 | | |
400 | 0 | fatal: |
401 | 0 | xsTypeError("invalid utf-8"); |
402 | 31.7k | } |
403 | | |
404 | | void xs_textdecoder_get_encoding(xsMachine *the) |
405 | 0 | { |
406 | 0 | xsmcSetString(xsResult, "utf-8"); |
407 | 0 | } |
408 | | |
409 | | void xs_textdecoder_get_ignoreBOM(xsMachine *the) |
410 | 0 | { |
411 | | #if VALIDATE |
412 | | modTextDecoder td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
413 | | #else |
414 | 0 | modTextDecoder td = xsmcGetHostChunk(xsThis); |
415 | 0 | #endif |
416 | 0 | xsmcSetBoolean(xsResult, td->ignoreBOM); |
417 | 0 | } |
418 | | |
419 | | void xs_textdecoder_get_fatal(xsMachine *the) |
420 | 0 | { |
421 | | #if VALIDATE |
422 | | modTextDecoder td = xsmcGetHostChunkValidate(xsThis, xs_textdecoder_destructor); |
423 | | #else |
424 | 0 | modTextDecoder td = xsmcGetHostChunk(xsThis); |
425 | 0 | #endif |
426 | 0 | xsmcSetBoolean(xsResult, td->fatal); |
427 | 0 | } |
428 | | |
429 | | #if !VALIDATE |
430 | | void modInstallTextDecoder(xsMachine *the) |
431 | 31.7k | { |
432 | 31.7k | #define kPrototype (0) |
433 | 31.7k | #define kConstructor (1) |
434 | 31.7k | #define kScratch (2) |
435 | | |
436 | 63.5k | xsBeginHost(the); |
437 | 63.5k | xsmcVars(3); |
438 | | |
439 | 63.5k | xsVar(kPrototype) = xsNewHostObject(NULL); |
440 | 63.5k | xsVar(kConstructor) = xsNewHostConstructor(xs_textdecoder, 2, xsVar(kPrototype)); |
441 | 63.5k | xsmcDefine(xsGlobal, xsID("TextDecoder"), xsVar(kConstructor), xsDontEnum); |
442 | | |
443 | 63.5k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_decode, 1); |
444 | 63.5k | xsmcDefine(xsVar(kPrototype), xsID("decode"), xsVar(kScratch), xsDontEnum); |
445 | 63.5k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_encoding, 0); |
446 | 63.5k | xsmcDefine(xsVar(kPrototype), xsID("encoding"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
447 | 63.5k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_ignoreBOM, 0); |
448 | 63.5k | xsmcDefine(xsVar(kPrototype), xsID("ignoreBOM"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
449 | 63.5k | xsVar(kScratch) = xsNewHostFunction(xs_textdecoder_get_fatal, 0); |
450 | 63.5k | xsmcDefine(xsVar(kPrototype), xsID("fatal"), xsVar(kScratch), xsIsGetter | xsDontEnum); |
451 | | |
452 | 63.5k | xsEndHost(the); |
453 | 31.7k | } |
454 | | #endif |
455 | | |
456 | | /* |
457 | | * Copyright 2001-2004 Unicode, Inc. |
458 | | * |
459 | | * Disclaimer |
460 | | * |
461 | | * This source code is provided as is by Unicode, Inc. No claims are |
462 | | * made as to fitness for any particular purpose. No warranties of any |
463 | | * kind are expressed or implied. The recipient agrees to determine |
464 | | * applicability of information provided. If this file has been |
465 | | * purchased on magnetic or optical media from Unicode, Inc., the |
466 | | * sole remedy for any claim will be exchange of defective media |
467 | | * within 90 days of receipt. |
468 | | * |
469 | | * Limitations on Rights to Redistribute This Code |
470 | | * |
471 | | * Unicode, Inc. hereby grants the right to freely use the information |
472 | | * supplied in this file in the creation of products supporting the |
473 | | * Unicode Standard, and to make copies of this file in any form |
474 | | * for internal or external distribution as long as this notice |
475 | | * remains attached. |
476 | | */ |
477 | | |
/*
	Reports whether the "length" bytes at "source" form one well-formed UTF-8
	sequence of exactly that length.

	source - first byte of the candidate sequence (the lead byte)
	length - number of bytes to examine; anything outside 1..4 is rejected

	Returns true when every byte after the lead is a continuation byte
	(0x80..0xBF), the second byte is inside the narrower range required by the
	leads 0xE0, 0xED, 0xF0 and 0xF4, and the lead itself is neither a
	continuation/overlong lead (0x80..0xC1) nor above 0xF4. Returns false
	otherwise. The caller chooses "length" from the lead byte; that match is
	not verified here.
*/
uint8_t isLegalUTF8(const uint8_t *source, int length)
{
	uint8_t lead, second;
	int i;

	if ((length < 1) || (length > 4))
		return false;

	// every trailing byte, the second one included, must be a continuation byte
	for (i = length - 1; i >= 1; i--) {
		if ((source[i] < 0x80) || (source[i] > 0xBF))
			return false;
	}

	lead = source[0];
	if (length >= 2) {
		// leads that exclude overlong forms, surrogates, and values past U+10FFFF
		second = source[1];
		switch (lead) {
			case 0xE0: if (second < 0xA0) return false; break;
			case 0xED: if (second > 0x9F) return false; break;
			case 0xF0: if (second < 0x90) return false; break;
			case 0xF4: if (second > 0x8F) return false; break;
			default: break;
		}
	}

	if ((lead >= 0x80) && (lead < 0xC2))
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
502 | | |