/src/ghostpdl/gpdl/txttop.c
Line | Count | Source |
1 | | /* Copyright (C) 2026 Artifex Software, Inc. |
2 | | All Rights Reserved. |
3 | | |
4 | | This software is provided AS-IS with no warranty, either express or |
5 | | implied. |
6 | | |
7 | | This software is distributed under license and may not be copied, |
8 | | modified or distributed except as expressly authorized under the terms |
9 | | of the license contained in the file LICENSE in this distribution. |
10 | | |
11 | | Refer to licensing information at http://www.artifex.com or contact |
12 | | Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
13 | | CA 94129, USA, for further information. |
14 | | */ |
15 | | |
16 | | /* Top-level API implementation for text file handling */ |
17 | | |
18 | | /* Language wrapper implementation (see pltop.h) */ |
19 | | |
20 | | |
21 | | /* Enable the following for a dump of the codepoints to stdout. */ |
22 | | /* #define DEBUG_CODEPOINTS */ |
23 | | |
24 | | /* Enable the following for a hacky dump of the output PCL to file. */ |
25 | | /* #define DEBUG_DUMP_PCL */ |
26 | | |
#ifdef DEBUG_DUMP_PCL
#include <stdio.h>
#include <stdlib.h> /* atexit */

/* Debug-only stream receiving a copy of every byte passed to
 * debug_as_pcl().  Opened lazily on the first write. */
static FILE *debug_pcl_out = NULL;

/* atexit() handler: close the dump file if it was ever opened. */
static void wipe(void)
{
    if (debug_pcl_out != NULL)
    {
        fclose(debug_pcl_out);
        debug_pcl_out = NULL;
    }
}

/*
 * Append the n bytes at p to the file "debug_pcl_out" in the current
 * directory.  The file is created on first use and closed at exit.
 * This is a best-effort debugging aid: if the file cannot be opened
 * the data is silently discarded (and the open is retried next call).
 */
static void
debug_as_pcl(const char *p, int n)
{
    if (debug_pcl_out == NULL)
    {
        debug_pcl_out = fopen("debug_pcl_out", "wb");
        if (debug_pcl_out == NULL)
            return;
        /* Only register the close handler once the open has succeeded. */
        atexit(wipe);
    }
    if (n > 0)
        fwrite(p, (size_t)n, 1, debug_pcl_out);
}
#endif
46 | | |
47 | | #include "pltop.h" |
48 | | #include "plmain.h" |
49 | | |
50 | | #include "plparse.h" /* for e_ExitLanguage */ |
51 | | #include "plmain.h" |
52 | | #include "gxdevice.h" /* so we can include gxht.h below */ |
53 | | #include "gserrors.h" |
54 | | #include "gp.h" |
55 | | #include "assert_.h" |
56 | | |
57 | | /* |
58 | | * The TXT interpreter is identical to pl_interp_t. |
59 | | * The TXT interpreter instance is derived from pl_interp_implementation_t. |
60 | | */ |
61 | | |
/*
 * Decoder states used by process_block().
 *
 * A job starts in TXT_STATE_INIT, where the first bytes are examined
 * for a byte order mark.  From there the decoder moves to one of the
 * encoding-specific states and stays there, except that
 * TXT_STATE_UTF8_MAYBE falls back to TXT_STATE_ASCII on the first
 * malformed UTF-8 sequence.
 */
typedef enum
{
    TXT_STATE_INIT       = 0,   /* still looking for a BOM */
    TXT_STATE_UTF8       = 1,   /* UTF-8 BOM seen */
    TXT_STATE_UTF8_MAYBE = 2,   /* no BOM; treated as UTF-8 until proven otherwise */
    TXT_STATE_UTF16_LE   = 3,   /* FF FE BOM seen */
    TXT_STATE_UTF16_BE   = 4,   /* FE FF BOM seen */
    TXT_STATE_ASCII      = 5    /* one byte per character */
} txt_state_t;
71 | | |
72 | | typedef struct txt_interp_instance_s txt_interp_instance_t; |
73 | | |
74 | | struct txt_interp_instance_s |
75 | | { |
76 | | gs_memory_t *memory; /* memory allocator to use */ |
77 | | |
78 | | pl_interp_implementation_t *sub; |
79 | | gx_device *device; |
80 | | |
81 | | int buffered; |
82 | | byte buffer[4]; |
83 | | |
84 | | int state; |
85 | | int detected; |
86 | | int just_had_lf; |
87 | | int just_had_cr; |
88 | | int col; |
89 | | int sent; |
90 | | }; |
91 | | |
/* Classification results produced by identify_from_buffer(). */
enum
{
    TXT_UNDETECTED = -1,    /* no classification has been attempted yet */
    TXT_UNKNOWN,            /* does not look like text */
    TXT_UTF8,               /* UTF-8 byte order mark present */
    TXT_UTF8_MAYBE,         /* no BOM, but every multi-byte sequence seen is well formed */
    TXT_UTF16_LE,           /* FF FE byte order mark */
    TXT_UTF16_BE,           /* FE FF byte order mark */
    TXT_ASCII               /* single-byte text with few top-bit-set bytes */
};

/*
 * Guess the encoding of the len bytes at s.
 *
 * A byte order mark decides the matter immediately.  Otherwise the
 * buffer is scanned once: tab, form feed, CR and LF are accepted, any
 * other control character (or DEL) makes the result TXT_UNKNOWN, and
 * bytes >= 0x80 are checked for UTF-8 lead/continuation structure.
 * A multi-byte sequence cut off by the end of the buffer is not held
 * against the data.
 *
 * Returns one of the TXT_* values above (never TXT_UNDETECTED).
 */
static int
identify_from_buffer(const unsigned char *s, int len)
{
    int n_controls = 0;     /* disallowed control characters seen */
    int n_high = 0;         /* sequences starting with a byte >= 0x80 */
    int utf8_ok = 1;        /* cleared on the first malformed sequence */
    int pos;

    /* Byte order marks. */
    if (len >= 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf)
        return TXT_UTF8;
    if (len >= 2)
    {
        if (s[0] == 0xff && s[1] == 0xfe)
            return TXT_UTF16_LE;
        if (s[0] == 0xfe && s[1] == 0xff)
            return TXT_UTF16_BE;
    }

    /* Gather statistics over the whole buffer. */
    for (pos = 0; pos < len; pos++)
    {
        const unsigned char c = s[pos];
        int tail;           /* continuation bytes the lead byte calls for */
        int k;

        /* Tab and form feed are fine in text. */
        if (c == 9 || c == 12)
            continue;

        /* CR and LF: also swallow the other half of a CR/LF or LF/CR pair. */
        if (c == 10 || c == 13)
        {
            const unsigned char partner = (c == 10) ? 13 : 10;

            if (pos + 1 < len && s[pos + 1] == partner)
                pos++;
            continue;
        }

        if (c < 32 || c == 0x7f)
        {
            n_controls++;
            continue;
        }

        /* Printable ASCII. */
        if (c < 0x7f)
            continue;

        /* Top bit set. */
        n_high++;
        if ((c & 0xF8) == 0xF0)
            tail = 3;
        else if ((c & 0xF0) == 0xE0)
            tail = 2;
        else if ((c & 0xE0) == 0xC0)
            tail = 1;
        else
        {
            /* Stray continuation byte, or 0xF8..0xFF. */
            utf8_ok = 0;
            continue;
        }

        /* Every continuation byte that lies inside the buffer must be 10xxxxxx. */
        for (k = 1; k <= tail; k++)
            if (pos + k < len && (s[pos + k] & 0xC0) != 0x80)
                break;

        if (k <= tail)
            utf8_ok = 0;        /* malformed: rescan from the next byte */
        else
            pos += tail;        /* well formed: skip the continuation bytes */
    }

    /* Any (non tab/cr/lf/ff) control character probably means this isn't text. */
    if (n_controls > 0)
        return TXT_UNKNOWN;
    /* Everything decoded as UTF-8 without a problem, so it's probably text. */
    if (utf8_ok)
        return TXT_UTF8_MAYBE;
    /* Too many top-bit-set bytes for plain single-byte text: give up. */
    if (n_high > len/10)
        return TXT_UNKNOWN;

    return TXT_ASCII;
}
201 | | |
202 | | static int |
203 | | txt_detect_language(const char *t, int len) |
204 | 17.7k | { |
205 | 17.7k | const unsigned char *s = (const unsigned char *)t; |
206 | | |
207 | 17.7k | switch (identify_from_buffer(s, len)) |
208 | 17.7k | { |
209 | 0 | case TXT_UTF8: |
210 | 45 | case TXT_UTF16_LE: |
211 | 112 | case TXT_UTF16_BE: |
212 | | /* PCL spots files with lots of ESCs in them at confidence |
213 | | * level 80. We'll use 70, cos we don't want to override that. */ |
214 | 112 | return 70; |
215 | 447 | case TXT_UTF8_MAYBE: |
216 | 634 | case TXT_ASCII: |
217 | 634 | return 60; |
218 | 0 | default: |
219 | 16.9k | case TXT_UNKNOWN: |
220 | 16.9k | break; |
221 | 17.7k | } |
222 | | |
223 | 16.9k | return 0; |
224 | 17.7k | } |
225 | | |
226 | | static const pl_interp_characteristics_t * |
227 | | txt_impl_characteristics(const pl_interp_implementation_t *pimpl) |
228 | 37.5k | { |
229 | 37.5k | static pl_interp_characteristics_t txt_characteristics = |
230 | 37.5k | { |
231 | 37.5k | "TXT", |
232 | 37.5k | txt_detect_language, |
233 | 37.5k | }; |
234 | 37.5k | return &txt_characteristics; |
235 | 37.5k | } |
236 | | |
237 | | /* Do per-instance interpreter allocation/init. No device is set yet */ |
238 | | static int |
239 | | txt_impl_allocate_interp_instance(pl_interp_implementation_t *impl, |
240 | | gs_memory_t *pmem) |
241 | 8.09k | { |
242 | 8.09k | txt_interp_instance_t *instance; |
243 | | |
244 | 8.09k | instance = (txt_interp_instance_t *) gs_alloc_bytes(pmem, |
245 | 8.09k | sizeof(txt_interp_instance_t), "txt_impl_allocate_interp_instance"); |
246 | | |
247 | 8.09k | if (!instance) |
248 | 0 | return_error(gs_error_VMerror); |
249 | | |
250 | 8.09k | instance->memory = pmem; |
251 | 8.09k | instance->sub = NULL; |
252 | | |
253 | 8.09k | impl->interp_client_data = instance; |
254 | | |
255 | 8.09k | return 0; |
256 | 8.09k | } |
257 | | |
258 | | /* Prepare interp instance for the next "job" */ |
259 | | static int |
260 | | txt_impl_init_job(pl_interp_implementation_t *impl, |
261 | | gx_device *pdevice) |
262 | 570 | { |
263 | 570 | txt_interp_instance_t *instance = impl->interp_client_data; |
264 | | |
265 | 570 | instance->device = pdevice; |
266 | 570 | instance->state = TXT_STATE_INIT; |
267 | 570 | instance->buffered = 0; |
268 | 570 | instance->detected = TXT_UNDETECTED; |
269 | 570 | instance->just_had_lf = 0; |
270 | 570 | instance->just_had_cr = 0; |
271 | 570 | instance->col = 0; |
272 | | |
273 | 570 | instance->sub = pl_main_get_pcl_instance(instance->memory); |
274 | | |
275 | 570 | return pl_init_job(instance->sub, instance->device); |
276 | 570 | } |
277 | | |
/* ASCII escape character (decimal 27), the lead-in byte of PCL escape sequences. */
#define ESC 27
279 | | |
280 | | static int |
281 | | send_bytes(txt_interp_instance_t *instance, const byte *p, int n) |
282 | 662k | { |
283 | 662k | stream_cursor_read cursor; |
284 | | |
285 | | #ifdef DEBUG_DUMP_PCL |
286 | | debug_as_pcl(p, n); |
287 | | #endif |
288 | | |
289 | 662k | stream_cursor_read_init(&cursor, p, n); |
290 | | |
291 | 662k | return instance->sub->proc_process(instance->sub, &cursor); |
292 | 662k | } |
293 | | |
294 | | static void |
295 | | drop_buffered(txt_interp_instance_t *instance, int n) |
296 | 657k | { |
297 | 657k | assert(instance->buffered >= n); |
298 | 657k | instance->buffered -= n; |
299 | 657k | if (instance->buffered > 0) |
300 | 705 | memmove(instance->buffer, &instance->buffer[n], instance->buffered); |
301 | 657k | } |
302 | | |
303 | | static int |
304 | | send_pcl_init(txt_interp_instance_t *instance) |
305 | 407 | { |
306 | 407 | static byte init[] = { |
307 | 407 | ESC, 'E', /* Reset */ |
308 | 407 | ESC, '&', 'l', '0', 'O', /* Orientation */ |
309 | 407 | ESC, '&', 'k', '1', '0', 'H', /* Horizontal spacing 10/120 of an inch. */ |
310 | 407 | ESC, '&', 'l', '8', 'C', /* Vertical line spacing 8/48 of an inch. */ |
311 | 407 | ESC, '&', 't', '8', '3', 'P', /* &t = double byte parsing, 83 = utf-8, P = ? */ |
312 | 407 | ESC, '(', '1', '8', 'N', /* Primary symbol set = 18N = Unicode */ |
313 | 407 | ESC, '(', 's', '0', 'P', /* Fixed pitch */ |
314 | 407 | ESC, '(', 's', '1', '2', 'H', /* Secondary fixed pitch 12cpi */ |
315 | 407 | ESC, '(', 's', '8', 'V', /* Point size 8 */ |
316 | 407 | ESC, '(', 's', '3', 'T', /* Typeface number 3 */ |
317 | 407 | ESC, '&', 's', '0', 'C' /* Wrappity wrap wrap */ |
318 | 407 | }; |
319 | | |
320 | 407 | return send_bytes(instance, init, sizeof(init)); |
321 | 407 | } |
322 | | |
323 | | static int |
324 | | send_urc(txt_interp_instance_t *instance, int n) |
325 | 125 | { |
326 | 125 | static byte unicode_replacement_char_as_utf8[] = { 0xe3, 0xbf, 0xbd }; |
327 | | |
328 | 125 | if (instance->state == TXT_STATE_UTF8_MAYBE) |
329 | 20 | { |
330 | | /* We were guessing that this was UTF8. Now we know it's not. Drop back to ascii. */ |
331 | 20 | instance->state = TXT_STATE_ASCII; |
332 | 20 | return 0; |
333 | 20 | } |
334 | | |
335 | 105 | drop_buffered(instance, n); |
336 | | |
337 | 105 | instance->sent = 1; |
338 | 105 | return send_bytes(instance, unicode_replacement_char_as_utf8, sizeof(unicode_replacement_char_as_utf8)); |
339 | 125 | } |
340 | | |
341 | | static int |
342 | | send_utf8(txt_interp_instance_t *instance, int val) |
343 | 661k | { |
344 | 661k | byte buf[4]; |
345 | 661k | int n; |
346 | | |
347 | | /* Finally, send the val! */ |
348 | 661k | if (val < 0x80) |
349 | 619k | { |
350 | 619k | buf[0] = val; |
351 | 619k | n = 1; |
352 | 619k | } |
353 | 42.4k | else if (val < 0x800) |
354 | 37.3k | { |
355 | 37.3k | buf[0] = 0xC0 + (val>>6); |
356 | 37.3k | buf[1] = 0x80 + (val & 0x3F); |
357 | 37.3k | n = 2; |
358 | 37.3k | } |
359 | 5.10k | else if (val < 0x10000) |
360 | 5.09k | { |
361 | 5.09k | buf[0] = 0xE0 + (val>>12); |
362 | 5.09k | buf[1] = 0x80 + ((val>>6) & 0x3F); |
363 | 5.09k | buf[2] = 0x80 + (val & 0x3F); |
364 | 5.09k | n = 3; |
365 | 5.09k | } |
366 | 4 | else |
367 | 4 | { |
368 | 4 | buf[0] = 0xF0 + (val>>18); |
369 | 4 | buf[1] = 0x80 + ((val>>12) & 0x3F); |
370 | 4 | buf[2] = 0x80 + ((val>>6) & 0x3F); |
371 | 4 | buf[3] = 0x80 + (val & 0x3F); |
372 | 4 | n = 4; |
373 | 4 | } |
374 | 661k | return send_bytes(instance, buf, n); |
375 | 661k | } |
376 | | |
377 | | /* All our actual codepoints should flow through here. So this is where |
378 | | * we do the housekeeping. */ |
379 | | static int |
380 | | send_codepoint(txt_interp_instance_t *instance, int val) |
381 | 657k | { |
382 | 657k | int code; |
383 | | |
384 | | #ifdef DEBUG_CODEPOINTS |
385 | | dprintf3("Sending codepoint %d (%x) %c\n", val, val, val >= 32 && val <= 255 && val != 127 ? val : '.'); |
386 | | #endif |
387 | | |
388 | 657k | instance->sent = 1; |
389 | | /* Tidy up whatever mess of CR/LF we are passed. */ |
390 | 657k | if (val == '\r') |
391 | 525 | { |
392 | | /* If we've got a CR and we've just had a LF, swallow this. */ |
393 | 525 | if (instance->just_had_lf) |
394 | 0 | { |
395 | 0 | instance->just_had_lf = 0; |
396 | 0 | return 0; |
397 | 0 | } |
398 | 525 | instance->just_had_cr = 1; |
399 | 525 | val = '\n'; |
400 | 525 | } |
401 | 656k | else if (val == '\n') |
402 | 3.69k | { |
403 | | /* If we've got a LF and we've just had a CR, swallow this. */ |
404 | 3.69k | if (instance->just_had_cr) |
405 | 157 | { |
406 | 157 | instance->just_had_cr = 0; |
407 | 157 | return 0; |
408 | 157 | } |
409 | 3.53k | instance->just_had_lf = 1; |
410 | 3.53k | } |
411 | 652k | else |
412 | 652k | { |
413 | 652k | instance->just_had_cr = 0; |
414 | 652k | instance->just_had_lf = 0; |
415 | 652k | } |
416 | | |
417 | | /* Keep track of what column we're at to so we can do tab handling. */ |
418 | 656k | if (val == '\n') |
419 | 4.05k | { |
420 | 4.05k | instance->col = 0; |
421 | 4.05k | code = send_utf8(instance, '\n'); |
422 | 4.05k | if (code < 0) |
423 | 0 | return code; |
424 | 4.05k | return send_utf8(instance, '\r'); |
425 | 4.05k | } |
426 | 652k | if (val == '\t') |
427 | 200 | { |
428 | 200 | int spaces = 8 - (instance->col & 7); |
429 | 1.12k | while (spaces--) |
430 | 924 | { |
431 | 924 | int code = send_utf8(instance, ' '); |
432 | 924 | if (code < 0) |
433 | 0 | return code; |
434 | 924 | instance->col++; |
435 | 924 | } |
436 | 200 | return 0; |
437 | 200 | } |
438 | 652k | instance->col++; |
439 | | |
440 | | #if 0 |
441 | | /* No need for this as PCL line wrapping works for us. If PCL ever |
442 | | * decides to wrap at a number of columns that aren't a multiple of |
443 | | * 8 then we'll need to do it manually again!. */ |
444 | | if (instance->col == 80) |
445 | | { |
446 | | instance->col = 0; |
447 | | code = send_utf8(instance, '\n'); |
448 | | if (code < 0) |
449 | | return code; |
450 | | return send_utf8(instance, '\r'); |
451 | | } |
452 | | #endif |
453 | | |
454 | 652k | return send_utf8(instance, val); |
455 | 652k | } |
456 | | |
457 | | static int |
458 | | process_block(txt_interp_instance_t *instance, const byte *ptr, int n) |
459 | 866 | { |
460 | 866 | int code; |
461 | 866 | byte *s = &instance->buffer[0]; |
462 | 866 | int old_state = instance->state; |
463 | 866 | int val; |
464 | | |
465 | 866 | if (instance->detected == TXT_UNDETECTED) |
466 | 570 | { |
467 | 570 | instance->detected = identify_from_buffer(ptr, n); |
468 | | /* If we're thinking we're ASCII, go straight there. Otherwise, we'll let the |
469 | | * BOM detection below run its course. */ |
470 | 570 | if (instance->detected == TXT_ASCII) |
471 | 145 | instance->state = TXT_STATE_ASCII; |
472 | 570 | } |
473 | | |
474 | 866 | instance->sent = 0; |
475 | 679k | while (n) |
476 | 679k | { |
477 | | /* instance->sent records whether we pulled anything out of the buffer |
478 | | * last time round the loop. If we changed state, then don't refill the |
479 | | * buffer. Otherwise only fill the buffer if we didn't a char last time |
480 | | * (maybe we need char 2 of a 2 char sequence?) or if we haven't got |
481 | | * anything in the buffer already. */ |
482 | 679k | if (instance->state == old_state && (!instance->sent || instance->buffered == 0)) |
483 | 677k | { |
484 | 677k | assert(instance->buffered < 4); |
485 | 677k | s[instance->buffered++] = *ptr++; |
486 | 677k | n--; |
487 | 677k | } |
488 | 679k | old_state = instance->state; |
489 | | |
490 | 679k | instance->sent = 0; |
491 | 679k | switch (instance->state) |
492 | 679k | { |
493 | 1.14k | case TXT_STATE_INIT: |
494 | | |
495 | 1.14k | if (instance->buffered == 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf) |
496 | 0 | { |
497 | 0 | instance->state = TXT_STATE_UTF8; |
498 | 0 | drop_buffered(instance, 3); |
499 | 0 | } |
500 | 1.14k | else if (instance->buffered == 2 && s[0] == 0xff && s[1] == 0xfe) |
501 | 37 | { |
502 | 37 | instance->state = TXT_STATE_UTF16_LE; |
503 | 37 | drop_buffered(instance, 2); |
504 | 37 | } |
505 | 1.10k | else if (instance->buffered == 2 && s[0] == 0xfe && s[1] == 0xff) |
506 | 64 | { |
507 | 64 | instance->state = TXT_STATE_UTF16_BE; |
508 | 64 | drop_buffered(instance, 2); |
509 | 64 | } |
510 | 1.03k | else if (instance->buffered >= 3) |
511 | 306 | { |
512 | | /* We haven't found a BOM, try for utf8. */ |
513 | 306 | instance->state = TXT_STATE_UTF8_MAYBE; |
514 | 306 | } |
515 | | |
516 | | /* If we've recognised the BOM, then send the init string. */ |
517 | 1.14k | if (instance->state != TXT_STATE_INIT) |
518 | 407 | { |
519 | 407 | code = send_pcl_init(instance); |
520 | 407 | if (code < 0) |
521 | 0 | return code; |
522 | 407 | } |
523 | 1.14k | break; |
524 | 1.14k | case TXT_STATE_UTF8: |
525 | 308k | case TXT_STATE_UTF8_MAYBE: |
526 | 308k | if ((s[0] & 0xF8) == 0xF0) |
527 | 26 | { |
528 | | /* 3 following bytes */ |
529 | 26 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
530 | 0 | { |
531 | 0 | code = send_urc(instance, 1); |
532 | 0 | if (code < 0) |
533 | 0 | return code; |
534 | 0 | } |
535 | 26 | else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80) |
536 | 0 | { |
537 | 0 | code = send_urc(instance, 2); |
538 | 0 | if (code < 0) |
539 | 0 | return code; |
540 | 0 | } |
541 | 26 | else if (instance->buffered == 4 && (s[3] & 0xC0) != 0x80) |
542 | 0 | { |
543 | 0 | code = send_urc(instance, 3); |
544 | 0 | if (code < 0) |
545 | 0 | return code; |
546 | 0 | } |
547 | 26 | else if (instance->buffered == 4) |
548 | 4 | { |
549 | | /* Valid encoding of 4 bytes */ |
550 | 4 | val = ((s[0] & 0x7)<<18) | ((s[1] & 0x3f)<<12) | ((s[2] & 0x3f)<<6) | (s[3] & 0x3f); |
551 | 4 | drop_buffered(instance, 4); |
552 | 4 | code = send_codepoint(instance, val); |
553 | 4 | if (code < 0) |
554 | 0 | return code; |
555 | 4 | } |
556 | 22 | else if (instance->buffered != 1 && instance->buffered != 2 && instance->buffered != 3) |
557 | 0 | { |
558 | | /* Should never happen. */ |
559 | 0 | return_error(gs_error_Fatal); |
560 | 0 | } |
561 | 26 | } |
562 | 308k | else if ((s[0] & 0xF0) == 0xE0) |
563 | 21 | { |
564 | | /* 2 following bytes */ |
565 | 21 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
566 | 1 | { |
567 | 1 | code = send_urc(instance, 1); |
568 | 1 | if (code < 0) |
569 | 0 | return code; |
570 | 1 | } |
571 | 20 | else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80) |
572 | 0 | { |
573 | 0 | code = send_urc(instance, 2); |
574 | 0 | if (code < 0) |
575 | 0 | return code; |
576 | 0 | } |
577 | 20 | else if (instance->buffered == 3) |
578 | 4 | { |
579 | | /* Valid encoding of 3 bytes */ |
580 | 4 | val = ((s[0] & 0xF)<<12) | ((s[1] & 0x3f)<<6) | (s[2] & 0x3f); |
581 | 4 | drop_buffered(instance, 3); |
582 | 4 | code = send_codepoint(instance, val); |
583 | 4 | if (code < 0) |
584 | 0 | return code; |
585 | 4 | } |
586 | 16 | else if (instance->buffered != 1 && instance->buffered != 2) |
587 | 0 | { |
588 | | /* Should never happen. */ |
589 | 0 | return_error(gs_error_Fatal); |
590 | 0 | } |
591 | 21 | } |
592 | 308k | else if ((s[0] & 0xE0) == 0xC0) |
593 | 27 | { |
594 | | /* 1 following bytes */ |
595 | 27 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
596 | 4 | { |
597 | 4 | code = send_urc(instance, 1); |
598 | 4 | if (code < 0) |
599 | 0 | return code; |
600 | 4 | } |
601 | 23 | else if (instance->buffered == 2) |
602 | 7 | { |
603 | | /* Valid encoding of 2 bytes */ |
604 | 7 | val = ((s[0] & 0x1F)<<6) | (s[1] & 0x3f); |
605 | 7 | drop_buffered(instance, 2); |
606 | 7 | code = send_codepoint(instance, val); |
607 | 7 | if (code < 0) |
608 | 0 | return code; |
609 | 7 | } |
610 | 16 | else if (instance->buffered != 1) |
611 | 2 | { |
612 | | /* Should never happen. */ |
613 | 2 | return_error(gs_error_Fatal); |
614 | 2 | } |
615 | 27 | } |
616 | 308k | else if ((s[0] & 0xC0) == 0x80) |
617 | 11 | { |
618 | | /* A continuation byte at the start. Should never see this. */ |
619 | 11 | code = send_urc(instance, 1); |
620 | 11 | if (code < 0) |
621 | 0 | return code; |
622 | 11 | } |
623 | 307k | else if (s[0] < 0x80) |
624 | 307k | { |
625 | | /* Simple byte. */ |
626 | 307k | val = s[0]; |
627 | 307k | drop_buffered(instance, 1); |
628 | 307k | code = send_codepoint(instance, val); |
629 | 307k | if (code < 0) |
630 | 0 | return code; |
631 | 307k | } |
632 | 4 | else |
633 | 4 | { |
634 | | /* Bytes we should never see in a UTF-8 file! (0xf8-0xff) */ |
635 | 4 | code = send_urc(instance, 1); |
636 | 4 | if (code < 0) |
637 | 0 | return code; |
638 | 4 | } |
639 | 308k | break; |
640 | 308k | case TXT_STATE_UTF16_LE: |
641 | 2.39k | if (instance->buffered < 2) |
642 | 1.22k | break; |
643 | 1.17k | if (s[1] >= 0xD8 && s[1] < 0xDC) |
644 | 0 | { |
645 | | /* High surrogate */ |
646 | 0 | if (instance->buffered < 4) |
647 | 0 | break; |
648 | 0 | if (s[3] < 0xDC || s[3] > 0xDF) |
649 | 0 | { |
650 | | /* Not followed by a low surrogate! Ignore the high surrogate. */ |
651 | 0 | code = send_urc(instance, 2); |
652 | 0 | if (code < 0) |
653 | 0 | return code; |
654 | 0 | } |
655 | 0 | val = (((s[0] | (s[1]<<8)) - 0xdc00)<<10) + (s[2] | (s[3]<<8)) - 0xdc00 + 0x10000; |
656 | 0 | drop_buffered(instance, 4); |
657 | 0 | } |
658 | 1.17k | else |
659 | 1.17k | { |
660 | 1.17k | val = s[0] | (s[1]<<8); |
661 | 1.17k | drop_buffered(instance, 2); |
662 | 1.17k | } |
663 | 1.17k | code = send_codepoint(instance, val); |
664 | 1.17k | if (code < 0) |
665 | 0 | return code; |
666 | 1.17k | break; |
667 | 38.6k | case TXT_STATE_UTF16_BE: |
668 | 38.6k | if (instance->buffered < 2) |
669 | 19.2k | break; |
670 | 19.4k | if (s[0] >= 0xD8 && s[0] < 0xDC) |
671 | 315 | { |
672 | | /* High surrogate */ |
673 | 315 | if (instance->buffered < 4) |
674 | 210 | break; |
675 | 105 | if (s[2] < 0xDC || s[2] > 0xDF) |
676 | 105 | { |
677 | | /* Not followed by a low surrogate! Ignore the high surrogate. */ |
678 | 105 | code = send_urc(instance, 2); |
679 | 105 | if (code < 0) |
680 | 0 | return code; |
681 | 105 | break; |
682 | 105 | } |
683 | 0 | val = (((s[1] | (s[0]<<8)) - 0xdc00)<<10) + (s[3] | (s[2]<<8)) - 0xdc00 + 0x10000; |
684 | 0 | drop_buffered(instance, 4); |
685 | 0 | } |
686 | 19.1k | else |
687 | 19.1k | { |
688 | 19.1k | val = s[1] | (s[0]<<8); |
689 | 19.1k | drop_buffered(instance, 2); |
690 | 19.1k | } |
691 | 19.1k | code = send_codepoint(instance, val); |
692 | 19.1k | if (code < 0) |
693 | 0 | return code; |
694 | 19.1k | break; |
695 | 328k | case TXT_STATE_ASCII: |
696 | 657k | while (instance->buffered > 0) |
697 | 328k | { |
698 | 328k | code = send_codepoint(instance, s[0]); |
699 | 328k | if (code < 0) |
700 | 0 | return code; |
701 | 328k | drop_buffered(instance, 1); |
702 | 328k | } |
703 | 328k | break; |
704 | 328k | default: |
705 | 0 | return_error(gs_error_Fatal); |
706 | 679k | } |
707 | 679k | } |
708 | 864 | return 0; |
709 | 866 | } |
710 | | |
/* Parse an entire random access file */
/* Disabled: everything inside this #if 0 is skipped by the preprocessor,
 * and the process_file slot of txt_implementation is NULL.
 * NOTE(review): as written this would pass the file to the PCL
 * interpreter without any of the text decoding done in process_block(),
 * and the gp_file it opens is only ever closed again - confirm before
 * re-enabling. */
#if 0
static int
txt_impl_process_file(pl_interp_implementation_t *impl, const char *filename)
{
    txt_interp_instance_t *instance = impl->interp_client_data;
    int code, code1;
    gp_file *file;

    file = gp_fopen(instance->memory, filename, "rb");
    if (file == 0)
        return_error(gs_error_ioerror);

    instance->sub = pl_main_get_pcl_instance(instance->memory);

    code = pl_init_job(instance->sub, instance->device);
    if (code >= 0)
    {
        code = pl_process_file(instance->sub, filename);
    }

    /* Always end the job; the first error encountered wins. */
    code1 = pl_dnit_job(instance->sub);
    if (code >= 0)
        code = code1;

    gp_fclose(file);

    return code;
}
#endif
741 | | |
742 | | /* Do any setup for parser per-cursor */ |
743 | | static int /* ret 0 or +ve if ok, else -ve error code */ |
744 | | txt_impl_process_begin(pl_interp_implementation_t * impl) |
745 | 570 | { |
746 | 570 | return 0; |
747 | 570 | } |
748 | | |
749 | | /* Parse a cursor-full of data */ |
750 | | static int |
751 | | txt_impl_process(pl_interp_implementation_t *impl, stream_cursor_read *cursor) |
752 | 866 | { |
753 | 866 | txt_interp_instance_t *instance = impl->interp_client_data; |
754 | 866 | int avail; |
755 | 866 | int code; |
756 | | |
757 | 866 | avail = cursor->limit - cursor->ptr; |
758 | 866 | code = process_block(instance, cursor->ptr + 1, avail); |
759 | 866 | cursor->ptr = cursor->limit; |
760 | | |
761 | 866 | return code; |
762 | 866 | } |
763 | | |
764 | | static int /* ret 0 or +ve if ok, else -ve error code */ |
765 | | txt_impl_process_end(pl_interp_implementation_t * impl) |
766 | 570 | { |
767 | 570 | return 0; |
768 | 570 | } |
769 | | |
770 | | /* Skip to end of job. |
771 | | * Return 1 if done, 0 ok but EOJ not found, else negative error code. |
772 | | */ |
773 | | static int |
774 | | txt_impl_flush_to_eoj(pl_interp_implementation_t *impl, stream_cursor_read *pcursor) |
775 | 2 | { |
776 | | /* assume SO files cannot be pjl embedded */ |
777 | 2 | pcursor->ptr = pcursor->limit; |
778 | 2 | return 0; |
779 | 2 | } |
780 | | |
781 | | /* Parser action for end-of-file */ |
782 | | static int |
783 | | txt_impl_process_eof(pl_interp_implementation_t *impl) |
784 | 568 | { |
785 | 568 | txt_interp_instance_t *instance = impl->interp_client_data; |
786 | | |
787 | 568 | if (instance->sub) |
788 | 568 | return pl_process_eof(instance->sub); |
789 | | |
790 | 0 | return 0; |
791 | 568 | } |
792 | | |
793 | | /* Report any errors after running a job */ |
794 | | static int |
795 | | txt_impl_report_errors(pl_interp_implementation_t *impl, |
796 | | int code, /* prev termination status */ |
797 | | long file_position, /* file position of error, -1 if unknown */ |
798 | | bool force_to_cout /* force errors to cout */ |
799 | | ) |
800 | 2 | { |
801 | 2 | txt_interp_instance_t *instance = impl->interp_client_data; |
802 | 2 | int ret = 0; |
803 | | |
804 | 2 | if (instance->sub) |
805 | 2 | ret = pl_report_errors(instance->sub, code, file_position, force_to_cout); |
806 | | |
807 | 2 | return ret; |
808 | 2 | } |
809 | | |
810 | | /* Wrap up interp instance after a "job" */ |
811 | | static int |
812 | | txt_impl_dnit_job(pl_interp_implementation_t *impl) |
813 | 570 | { |
814 | 570 | txt_interp_instance_t *instance = impl->interp_client_data; |
815 | 570 | int code = 0; |
816 | | |
817 | 570 | if (instance->sub) |
818 | 570 | code = pl_dnit_job(instance->sub); |
819 | 570 | instance->sub = NULL; |
820 | 570 | instance->device = NULL; |
821 | | |
822 | 570 | return code; |
823 | 570 | } |
824 | | |
825 | | /* Deallocate a interpreter instance */ |
826 | | static int |
827 | | txt_impl_deallocate_interp_instance(pl_interp_implementation_t *impl) |
828 | 8.09k | { |
829 | 8.09k | txt_interp_instance_t *instance = impl->interp_client_data; |
830 | | |
831 | 8.09k | gs_free_object(instance->memory, instance, "so_impl_deallocate_interp_instance"); |
832 | | |
833 | 8.09k | return 0; |
834 | 8.09k | } |
835 | | |
/* Parser implementation descriptor */
/* The initializer is positional, so the order of the entries must
 * follow the member order of pl_interp_implementation_t (see pltop.h).
 * NULL entries are hooks this interpreter does not provide. */
pl_interp_implementation_t txt_implementation =
{
    txt_impl_characteristics,
    txt_impl_allocate_interp_instance,
    NULL,                       /* get_device_memory */
    NULL,                       /* set_param */
    NULL,                       /* add_path */
    NULL,                       /* post_args_init */
    txt_impl_init_job,
    NULL,                       /* run_prefix_commands */
    NULL,                       /* txt_impl_process_file, */
    txt_impl_process_begin,
    txt_impl_process,
    txt_impl_process_end,
    txt_impl_flush_to_eoj,
    txt_impl_process_eof,
    txt_impl_report_errors,
    txt_impl_dnit_job,
    txt_impl_deallocate_interp_instance,
    NULL,                       /* presumably interp_client_data, which
                                 * txt_impl_allocate_interp_instance fills in
                                 * - confirm against pltop.h */
};