/src/ghostpdl/gpdl/txttop.c
Line | Count | Source |
1 | | /* Copyright (C) 2026 Artifex Software, Inc. |
2 | | All Rights Reserved. |
3 | | |
4 | | This software is provided AS-IS with no warranty, either express or |
5 | | implied. |
6 | | |
7 | | This software is distributed under license and may not be copied, |
8 | | modified or distributed except as expressly authorized under the terms |
9 | | of the license contained in the file LICENSE in this distribution. |
10 | | |
11 | | Refer to licensing information at http://www.artifex.com or contact |
12 | | Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
13 | | CA 94129, USA, for further information. |
14 | | */ |
15 | | |
16 | | /* Top-level API implementation for text file handling */ |
17 | | |
18 | | /* Language wrapper implementation (see pltop.h) */ |
19 | | |
20 | | |
21 | | /* Enable the following for a dump of the codepoints to stdout. */ |
22 | | /* #define DEBUG_CODEPOINTS */ |
23 | | |
24 | | /* Enable the following for a hacky dump of the output PCL to file. */ |
25 | | /* #define DEBUG_DUMP_PCL */ |
26 | | |
#ifdef DEBUG_DUMP_PCL
#include <stdio.h>
#include <stdlib.h> /* atexit */

/* Debug-only sink for the byte stream handed to the sub-interpreter. */
static FILE *debug_pcl_out = NULL;

/* atexit handler: close the dump file if it is currently open. */
static void wipe(void)
{
    if (debug_pcl_out != NULL)
    {
        fclose(debug_pcl_out);
        debug_pcl_out = NULL;
    }
}

/* Append n bytes starting at p to the file "debug_pcl_out".
 * The file is opened on first use and closed at process exit. If it
 * cannot be opened the data is dropped and the next call retries. */
static void
debug_as_pcl(const char *p, int n)
{
    static int cleanup_registered = 0;

    if (n <= 0)
        return;

    if (debug_pcl_out == NULL)
    {
        debug_pcl_out = fopen("debug_pcl_out", "wb");
        if (debug_pcl_out == NULL)
            return;
        /* Register the handler once only, and only after a successful open. */
        if (!cleanup_registered)
        {
            atexit(wipe);
            cleanup_registered = 1;
        }
    }
    (void)fwrite(p, 1, (size_t)n, debug_pcl_out);
}
#endif
46 | | |
47 | | #include "pltop.h" |
48 | | #include "plmain.h" |
49 | | |
50 | | #include "plparse.h" /* for e_ExitLanguage */ |
51 | | #include "plmain.h" |
52 | | #include "gxdevice.h" /* so we can include gxht.h below */ |
53 | | #include "gserrors.h" |
54 | | #include "gp.h" |
55 | | #include "assert_.h" |
56 | | |
57 | | /* |
58 | | * The TXT interpreter is identical to pl_interp_t. |
59 | | * The TXT interpreter instance is derived from pl_interp_implementation_t. |
60 | | */ |
61 | | |
/* Decoder states; the current one is kept in the instance's 'state' field. */
typedef enum
{
    TXT_STATE_INIT       = 0, /* Still looking for a byte order mark */
    TXT_STATE_UTF8       = 1, /* UTF-8 byte order mark seen */
    TXT_STATE_UTF8_MAYBE = 2, /* No mark found; trying UTF-8 */
    TXT_STATE_UTF16_LE   = 3, /* FF FE mark seen */
    TXT_STATE_UTF16_BE   = 4, /* FE FF mark seen */
    TXT_STATE_ASCII      = 5  /* Bytes are forwarded one at a time */
} txt_state_t;
71 | | |
72 | | typedef struct txt_interp_instance_s txt_interp_instance_t; |
73 | | |
74 | | struct txt_interp_instance_s |
75 | | { |
76 | | gs_memory_t *memory; /* memory allocator to use */ |
77 | | |
78 | | pl_interp_implementation_t *sub; |
79 | | gx_device *device; |
80 | | |
81 | | int buffered; |
82 | | byte buffer[4]; |
83 | | |
84 | | int state; |
85 | | int detected; |
86 | | int just_had_lf; |
87 | | int just_had_cr; |
88 | | int col; |
89 | | int sent; |
90 | | }; |
91 | | |
/* Results of content sniffing (see identify_from_buffer), plus the
 * "not sniffed yet" marker stored in the instance's 'detected' field. */
enum
{
    TXT_UNDETECTED = -1, /* sniffing has not run for this job */
    TXT_UNKNOWN    = 0,  /* does not look like text */
    TXT_UTF8       = 1,  /* UTF-8 byte order mark present */
    TXT_UTF8_MAYBE = 2,  /* no mark, but consistent with UTF-8 */
    TXT_UTF16_LE   = 3,  /* FF FE byte order mark present */
    TXT_UTF16_BE   = 4,  /* FE FF byte order mark present */
    TXT_ASCII      = 5   /* mostly 7-bit text */
};
102 | | |
103 | | static int |
104 | | identify_from_buffer(const unsigned char *s, int len) |
105 | 19.9k | { |
106 | 19.9k | int count_controls = 0; |
107 | 19.9k | int count_hi = 0; |
108 | 19.9k | int count_tabs = 0; |
109 | 19.9k | int plausibly_utf8 = 1; |
110 | 19.9k | int i; |
111 | | |
112 | | /* UTF-8 with a BOM */ |
113 | 19.9k | if (len >= 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf) |
114 | 2 | return TXT_UTF8; |
115 | | /* UTF-16 (little endian) */ |
116 | 19.9k | if (len >= 2 && s[0] == 0xff && s[1] == 0xfe) |
117 | 118 | return TXT_UTF16_LE; |
118 | | /* UTF-16 (big endian) */ |
119 | 19.7k | if (len >= 2 && s[0] == 0xfe && s[1] == 0xff) |
120 | 204 | return TXT_UTF16_BE; |
121 | | |
122 | | /* Gather some stats. */ |
123 | 24.4M | for (i = 0; i < len; i++) |
124 | 24.3M | { |
125 | 24.3M | if (s[i] == 9) |
126 | 91.3k | { |
127 | 91.3k | count_tabs++; |
128 | 91.3k | } |
129 | 24.2M | else if (s[i] == 12) |
130 | 140k | { |
131 | | /* Form feed. We'll let that slide. */ |
132 | 140k | } |
133 | 24.1M | else if (s[i] == 10) |
134 | 167k | { |
135 | 167k | if (i+1 < len && s[i+1] == 13) |
136 | 687 | i++; |
137 | 167k | } |
138 | 23.9M | else if (s[i] == 13) |
139 | 139k | { |
140 | 139k | if (i+1 < len && s[i+1] == 10) |
141 | 81.6k | i++; |
142 | 139k | } |
143 | 23.8M | else if (s[i] < 32 || s[i] == 0x7f) |
144 | 5.69M | { |
145 | 5.69M | count_controls++; |
146 | 5.69M | } |
147 | 18.1M | else if (s[i] < 0x7f) |
148 | 12.7M | { |
149 | | /* Seems like a reasonable ASCII value. */ |
150 | 12.7M | } |
151 | 5.43M | else |
152 | 5.43M | { |
153 | 5.43M | count_hi++; |
154 | 5.43M | if ((s[i] & 0xF8) == 0xF0) |
155 | 321k | { |
156 | | /* 3 following bytes */ |
157 | 321k | if (i+1 < len && (s[i+1] & 0xC0) != 0x80) |
158 | 308k | plausibly_utf8 = 0; |
159 | 13.5k | else if (i+2 < len && (s[i+2] & 0xC0) != 0x80) |
160 | 8.51k | plausibly_utf8 = 0; |
161 | 5.02k | else if (i+3 < len && (s[i+3] & 0xC0) != 0x80) |
162 | 2.50k | plausibly_utf8 = 0; |
163 | 2.51k | else |
164 | 2.51k | i+=3; |
165 | 321k | } |
166 | 5.11M | else if ((s[i] & 0xF0) == 0xE0) |
167 | 358k | { |
168 | | /* 2 following bytes */ |
169 | 358k | if (i+1 < len && (s[i+1] & 0xC0) != 0x80) |
170 | 327k | plausibly_utf8 = 0; |
171 | 31.4k | else if (i+2 < len && (s[i+2] & 0xC0) != 0x80) |
172 | 21.2k | plausibly_utf8 = 0; |
173 | 10.1k | else |
174 | 10.1k | i+=2; |
175 | 358k | } |
176 | 4.75M | else if ((s[i] & 0xE0) == 0xC0) |
177 | 1.42M | { |
178 | | /* 1 following bytes */ |
179 | 1.42M | if (i+1 < len && (s[i+1] & 0xC0) != 0x80) |
180 | 1.31M | plausibly_utf8 = 0; |
181 | 110k | else |
182 | 110k | i++; |
183 | 1.42M | } |
184 | 3.33M | else |
185 | 3.33M | plausibly_utf8 = 0; |
186 | 5.43M | } |
187 | 24.3M | } |
188 | | |
189 | | /* Any (non tab/cr/lf/ff) control characters probably means this isn't text. */ |
190 | 19.5k | if (count_controls > 0) |
191 | 18.1k | return TXT_UNKNOWN; |
192 | | /* If we've managed to decode all that as utf8 without problem, it's probably text. */ |
193 | 1.45k | if (plausibly_utf8) |
194 | 886 | return TXT_UTF8_MAYBE; |
195 | | /* If we're hitting too many top bit set chars, give up. */ |
196 | 565 | if (count_hi > len/10) |
197 | 206 | return TXT_UNKNOWN; |
198 | | |
199 | 359 | return TXT_ASCII; |
200 | 565 | } |
201 | | |
202 | | static int |
203 | | txt_detect_language(const char *t, int len) |
204 | 19.2k | { |
205 | 19.2k | const unsigned char *s = (const unsigned char *)t; |
206 | | |
207 | 19.2k | switch (identify_from_buffer(s, len)) |
208 | 19.2k | { |
209 | 1 | case TXT_UTF8: |
210 | 64 | case TXT_UTF16_LE: |
211 | 168 | case TXT_UTF16_BE: |
212 | | /* PCL spots files with lots of ESCs in them at confidence |
213 | | * level 80. We'll use 70, cos we don't want to override that. */ |
214 | 168 | return 70; |
215 | 520 | case TXT_UTF8_MAYBE: |
216 | 727 | case TXT_ASCII: |
217 | 727 | return 60; |
218 | 0 | default: |
219 | 18.3k | case TXT_UNKNOWN: |
220 | 18.3k | break; |
221 | 19.2k | } |
222 | | |
223 | 18.3k | return 0; |
224 | 19.2k | } |
225 | | |
226 | | static const pl_interp_characteristics_t * |
227 | | txt_impl_characteristics(const pl_interp_implementation_t *pimpl) |
228 | 40.9k | { |
229 | 40.9k | static pl_interp_characteristics_t txt_characteristics = |
230 | 40.9k | { |
231 | 40.9k | "TXT", |
232 | 40.9k | txt_detect_language, |
233 | 40.9k | }; |
234 | 40.9k | return &txt_characteristics; |
235 | 40.9k | } |
236 | | |
237 | | /* Do per-instance interpreter allocation/init. No device is set yet */ |
238 | | static int |
239 | | txt_impl_allocate_interp_instance(pl_interp_implementation_t *impl, |
240 | | gs_memory_t *pmem) |
241 | 8.97k | { |
242 | 8.97k | txt_interp_instance_t *instance; |
243 | | |
244 | 8.97k | instance = (txt_interp_instance_t *) gs_alloc_bytes(pmem, |
245 | 8.97k | sizeof(txt_interp_instance_t), "txt_impl_allocate_interp_instance"); |
246 | | |
247 | 8.97k | if (!instance) |
248 | 0 | return_error(gs_error_VMerror); |
249 | | |
250 | 8.97k | instance->memory = pmem; |
251 | 8.97k | instance->sub = NULL; |
252 | | |
253 | 8.97k | impl->interp_client_data = instance; |
254 | | |
255 | 8.97k | return 0; |
256 | 8.97k | } |
257 | | |
258 | | /* Prepare interp instance for the next "job" */ |
259 | | static int |
260 | | txt_impl_init_job(pl_interp_implementation_t *impl, |
261 | | gx_device *pdevice) |
262 | 674 | { |
263 | 674 | txt_interp_instance_t *instance = impl->interp_client_data; |
264 | | |
265 | 674 | instance->device = pdevice; |
266 | 674 | instance->state = TXT_STATE_INIT; |
267 | 674 | instance->buffered = 0; |
268 | 674 | instance->detected = TXT_UNDETECTED; |
269 | 674 | instance->just_had_lf = 0; |
270 | 674 | instance->just_had_cr = 0; |
271 | 674 | instance->col = 0; |
272 | | |
273 | 674 | instance->sub = pl_main_get_pcl_instance(instance->memory); |
274 | | |
275 | 674 | return pl_init_job(instance->sub, instance->device); |
276 | 674 | } |
277 | | |
#define ESC 0x1b /* ASCII escape (decimal 27) */
279 | | |
280 | | static int |
281 | | send_bytes(txt_interp_instance_t *instance, const byte *p, int n) |
282 | 718k | { |
283 | 718k | stream_cursor_read cursor; |
284 | | |
285 | | #ifdef DEBUG_DUMP_PCL |
286 | | debug_as_pcl(p, n); |
287 | | #endif |
288 | | |
289 | 718k | stream_cursor_read_init(&cursor, p, n); |
290 | | |
291 | 718k | return instance->sub->proc_process(instance->sub, &cursor); |
292 | 718k | } |
293 | | |
294 | | static void |
295 | | drop_buffered(txt_interp_instance_t *instance, int n) |
296 | 711k | { |
297 | 711k | assert(instance->buffered >= n); |
298 | 711k | instance->buffered -= n; |
299 | 711k | if (instance->buffered > 0) |
300 | 1.26k | memmove(instance->buffer, &instance->buffer[n], instance->buffered); |
301 | 711k | } |
302 | | |
303 | | static int |
304 | | send_pcl_init(txt_interp_instance_t *instance) |
305 | 504 | { |
306 | 504 | static byte init[] = { |
307 | 504 | ESC, 'E', /* Reset */ |
308 | 504 | ESC, '&', 'l', '0', 'O', /* Orientation */ |
309 | 504 | ESC, '&', 'k', '1', '0', 'H', /* Horizontal spacing 10/120 of an inch. */ |
310 | 504 | ESC, '&', 'l', '8', 'C', /* Vertical line spacing 8/48 of an inch. */ |
311 | 504 | ESC, '&', 't', '8', '3', 'P', /* &t = double byte parsing, 83 = utf-8, P = ? */ |
312 | 504 | ESC, '(', '1', '8', 'N', /* Primary symbol set = 18N = Unicode */ |
313 | 504 | ESC, '(', 's', '0', 'P', /* Fixed pitch */ |
314 | 504 | ESC, '(', 's', '1', '2', 'H', /* Secondary fixed pitch 12cpi */ |
315 | 504 | ESC, '(', 's', '8', 'V', /* Point size 8 */ |
316 | 504 | ESC, '(', 's', '3', 'T', /* Typeface number 3 */ |
317 | 504 | ESC, '&', 's', '0', 'C' /* Wrappity wrap wrap */ |
318 | 504 | }; |
319 | | |
320 | 504 | return send_bytes(instance, init, sizeof(init)); |
321 | 504 | } |
322 | | |
323 | | static int |
324 | | send_urc(txt_interp_instance_t *instance, int n) |
325 | 257 | { |
326 | 257 | static byte unicode_replacement_char_as_utf8[] = { 0xe3, 0xbf, 0xbd }; |
327 | | |
328 | 257 | if (instance->state == TXT_STATE_UTF8_MAYBE) |
329 | 30 | { |
330 | | /* We were guessing that this was UTF8. Now we know it's not. Drop back to ascii. */ |
331 | 30 | instance->state = TXT_STATE_ASCII; |
332 | 30 | return 0; |
333 | 30 | } |
334 | | |
335 | 227 | drop_buffered(instance, n); |
336 | | |
337 | 227 | instance->sent = 1; |
338 | 227 | return send_bytes(instance, unicode_replacement_char_as_utf8, sizeof(unicode_replacement_char_as_utf8)); |
339 | 257 | } |
340 | | |
341 | | static int |
342 | | send_utf8(txt_interp_instance_t *instance, int val) |
343 | 717k | { |
344 | 717k | byte buf[4]; |
345 | 717k | int n; |
346 | | |
347 | | /* Finally, send the val! */ |
348 | 717k | if (val < 0x80) |
349 | 611k | { |
350 | 611k | buf[0] = val; |
351 | 611k | n = 1; |
352 | 611k | } |
353 | 106k | else if (val < 0x800) |
354 | 69.8k | { |
355 | 69.8k | buf[0] = 0xC0 + (val>>6); |
356 | 69.8k | buf[1] = 0x80 + (val & 0x3F); |
357 | 69.8k | n = 2; |
358 | 69.8k | } |
359 | 36.9k | else if (val < 0x10000) |
360 | 36.9k | { |
361 | 36.9k | buf[0] = 0xE0 + (val>>12); |
362 | 36.9k | buf[1] = 0x80 + ((val>>6) & 0x3F); |
363 | 36.9k | buf[2] = 0x80 + (val & 0x3F); |
364 | 36.9k | n = 3; |
365 | 36.9k | } |
366 | 7 | else |
367 | 7 | { |
368 | 7 | buf[0] = 0xF0 + (val>>18); |
369 | 7 | buf[1] = 0x80 + ((val>>12) & 0x3F); |
370 | 7 | buf[2] = 0x80 + ((val>>6) & 0x3F); |
371 | 7 | buf[3] = 0x80 + (val & 0x3F); |
372 | 7 | n = 4; |
373 | 7 | } |
374 | 717k | return send_bytes(instance, buf, n); |
375 | 717k | } |
376 | | |
377 | | /* All our actual codepoints should flow through here. So this is where |
378 | | * we do the housekeeping. */ |
379 | | static int |
380 | | send_codepoint(txt_interp_instance_t *instance, int val) |
381 | 711k | { |
382 | 711k | int code; |
383 | | |
384 | | #ifdef DEBUG_CODEPOINTS |
385 | | dprintf3("Sending codepoint %d (%x) %c\n", val, val, val >= 32 && val <= 255 && val != 127 ? val : '.'); |
386 | | #endif |
387 | | |
388 | 711k | instance->sent = 1; |
389 | | /* Tidy up whatever mess of CR/LF we are passed. */ |
390 | 711k | if (val == '\r') |
391 | 1.46k | { |
392 | | /* If we've got a CR and we've just had a LF, swallow this. */ |
393 | 1.46k | if (instance->just_had_lf) |
394 | 196 | { |
395 | 196 | instance->just_had_lf = 0; |
396 | 196 | return 0; |
397 | 196 | } |
398 | 1.26k | instance->just_had_cr = 1; |
399 | 1.26k | val = '\n'; |
400 | 1.26k | } |
401 | 710k | else if (val == '\n') |
402 | 3.27k | { |
403 | | /* If we've got a LF and we've just had a CR, swallow this. */ |
404 | 3.27k | if (instance->just_had_cr) |
405 | 554 | { |
406 | 554 | instance->just_had_cr = 0; |
407 | 554 | return 0; |
408 | 554 | } |
409 | 2.71k | instance->just_had_lf = 1; |
410 | 2.71k | } |
411 | 707k | else |
412 | 707k | { |
413 | 707k | instance->just_had_cr = 0; |
414 | 707k | instance->just_had_lf = 0; |
415 | 707k | } |
416 | | |
417 | | /* Keep track of what column we're at to so we can do tab handling. */ |
418 | 710k | if (val == '\n') |
419 | 3.98k | { |
420 | 3.98k | instance->col = 0; |
421 | 3.98k | code = send_utf8(instance, '\n'); |
422 | 3.98k | if (code < 0 && code != gs_error_NeedInput) |
423 | 0 | return code; |
424 | 3.98k | return send_utf8(instance, '\r'); |
425 | 3.98k | } |
426 | 707k | if (val == '\t') |
427 | 665 | { |
428 | 665 | int spaces = 8 - (instance->col & 7); |
429 | 4.29k | while (spaces--) |
430 | 3.62k | { |
431 | 3.62k | int code = send_utf8(instance, ' '); |
432 | 3.62k | if (code < 0 && code != gs_error_NeedInput) |
433 | 0 | return code; |
434 | 3.62k | instance->col++; |
435 | 3.62k | } |
436 | 665 | return 0; |
437 | 665 | } |
438 | 706k | instance->col++; |
439 | | |
440 | | #if 0 |
441 | | /* No need for this as PCL line wrapping works for us. If PCL ever |
442 | | * decides to wrap at a number of columns that aren't a multiple of |
443 | | * 8 then we'll need to do it manually again!. */ |
444 | | if (instance->col == 80) |
445 | | { |
446 | | instance->col = 0; |
447 | | code = send_utf8(instance, '\n'); |
448 | | if (code < 0 && code != gs_error_NeedInput)) |
449 | | return code; |
450 | | return send_utf8(instance, '\r'); |
451 | | } |
452 | | #endif |
453 | | |
454 | 706k | return send_utf8(instance, val); |
455 | 707k | } |
456 | | |
457 | | static int |
458 | | process_block(txt_interp_instance_t *instance, const byte *ptr, int n) |
459 | 1.17k | { |
460 | 1.17k | int code; |
461 | 1.17k | byte *s = &instance->buffer[0]; |
462 | 1.17k | int old_state = instance->state; |
463 | 1.17k | int val; |
464 | | |
465 | 1.17k | if (instance->detected == TXT_UNDETECTED) |
466 | 674 | { |
467 | 674 | instance->detected = identify_from_buffer(ptr, n); |
468 | | /* If we're thinking we're ASCII, go straight there. Otherwise, we'll let the |
469 | | * BOM detection below run its course. */ |
470 | 674 | if (instance->detected == TXT_ASCII) |
471 | 152 | instance->state = TXT_STATE_ASCII; |
472 | 674 | } |
473 | | |
474 | 1.17k | instance->sent = 0; |
475 | 783k | while (n) |
476 | 783k | { |
477 | | /* instance->sent records whether we pulled anything out of the buffer |
478 | | * last time round the loop. If we changed state, then don't refill the |
479 | | * buffer. Otherwise only fill the buffer if we didn't a char last time |
480 | | * (maybe we need char 2 of a 2 char sequence?) or if we haven't got |
481 | | * anything in the buffer already. */ |
482 | 783k | if (instance->state == old_state && (!instance->sent || instance->buffered == 0)) |
483 | 781k | { |
484 | 781k | assert(instance->buffered < 4); |
485 | 781k | s[instance->buffered++] = *ptr++; |
486 | 781k | n--; |
487 | 781k | } |
488 | 783k | old_state = instance->state; |
489 | | |
490 | 783k | instance->sent = 0; |
491 | 783k | switch (instance->state) |
492 | 783k | { |
493 | 1.37k | case TXT_STATE_INIT: |
494 | | |
495 | 1.37k | if (instance->buffered == 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf) |
496 | 1 | { |
497 | 1 | instance->state = TXT_STATE_UTF8; |
498 | 1 | drop_buffered(instance, 3); |
499 | 1 | } |
500 | 1.37k | else if (instance->buffered == 2 && s[0] == 0xff && s[1] == 0xfe) |
501 | 55 | { |
502 | 55 | instance->state = TXT_STATE_UTF16_LE; |
503 | 55 | drop_buffered(instance, 2); |
504 | 55 | } |
505 | 1.32k | else if (instance->buffered == 2 && s[0] == 0xfe && s[1] == 0xff) |
506 | 100 | { |
507 | 100 | instance->state = TXT_STATE_UTF16_BE; |
508 | 100 | drop_buffered(instance, 2); |
509 | 100 | } |
510 | 1.22k | else if (instance->buffered >= 3) |
511 | 348 | { |
512 | | /* We haven't found a BOM, try for utf8. */ |
513 | 348 | instance->state = TXT_STATE_UTF8_MAYBE; |
514 | 348 | } |
515 | | |
516 | | /* If we've recognised the BOM, then send the init string. */ |
517 | 1.37k | if (instance->state != TXT_STATE_INIT) |
518 | 504 | { |
519 | 504 | code = send_pcl_init(instance); |
520 | 504 | if (code < 0) { |
521 | 504 | if (code != gs_error_NeedInput || n == 0) |
522 | 4 | return code; |
523 | 504 | } |
524 | 504 | } |
525 | 1.37k | break; |
526 | 1.37k | case TXT_STATE_UTF8: |
527 | 246k | case TXT_STATE_UTF8_MAYBE: |
528 | 246k | if ((s[0] & 0xF8) == 0xF0) |
529 | 38 | { |
530 | | /* 3 following bytes */ |
531 | 38 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
532 | 1 | { |
533 | 1 | code = send_urc(instance, 1); |
534 | 1 | if (code < 0) { |
535 | 0 | if (code != gs_error_NeedInput || n == 0) |
536 | 0 | return code; |
537 | 0 | } |
538 | 1 | } |
539 | 37 | else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80) |
540 | 0 | { |
541 | 0 | code = send_urc(instance, 2); |
542 | 0 | if (code < 0) { |
543 | 0 | if (code != gs_error_NeedInput || n == 0) |
544 | 0 | return code; |
545 | 0 | } |
546 | 0 | } |
547 | 37 | else if (instance->buffered == 4 && (s[3] & 0xC0) != 0x80) |
548 | 0 | { |
549 | 0 | code = send_urc(instance, 3); |
550 | 0 | if (code < 0) { |
551 | 0 | if (code != gs_error_NeedInput || n == 0) |
552 | 0 | return code; |
553 | 0 | } |
554 | 0 | } |
555 | 37 | else if (instance->buffered == 4) |
556 | 7 | { |
557 | | /* Valid encoding of 4 bytes */ |
558 | 7 | val = ((s[0] & 0x7)<<18) | ((s[1] & 0x3f)<<12) | ((s[2] & 0x3f)<<6) | (s[3] & 0x3f); |
559 | 7 | drop_buffered(instance, 4); |
560 | 7 | code = send_codepoint(instance, val); |
561 | 7 | if (code < 0) { |
562 | 7 | if (code != gs_error_NeedInput || n == 0) |
563 | 1 | return code; |
564 | 7 | } |
565 | 7 | } |
566 | 30 | else if (instance->buffered != 1 && instance->buffered != 2 && instance->buffered != 3) |
567 | 0 | { |
568 | | /* Should never happen. */ |
569 | 0 | return_error(gs_error_Fatal); |
570 | 0 | } |
571 | 38 | } |
572 | 246k | else if ((s[0] & 0xF0) == 0xE0) |
573 | 31 | { |
574 | | /* 2 following bytes */ |
575 | 31 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
576 | 3 | { |
577 | 3 | code = send_urc(instance, 1); |
578 | 3 | if (code < 0) { |
579 | 0 | if (code != gs_error_NeedInput || n == 0) |
580 | 0 | return code; |
581 | 0 | } |
582 | 3 | } |
583 | 28 | else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80) |
584 | 0 | { |
585 | 0 | code = send_urc(instance, 2); |
586 | 0 | if (code < 0) { |
587 | 0 | if (code != gs_error_NeedInput || n == 0) |
588 | 0 | return code; |
589 | 0 | } |
590 | 0 | } |
591 | 28 | else if (instance->buffered == 3) |
592 | 6 | { |
593 | | /* Valid encoding of 3 bytes */ |
594 | 6 | val = ((s[0] & 0xF)<<12) | ((s[1] & 0x3f)<<6) | (s[2] & 0x3f); |
595 | 6 | drop_buffered(instance, 3); |
596 | 6 | code = send_codepoint(instance, val); |
597 | 6 | if (code < 0) { |
598 | 6 | if (code != gs_error_NeedInput || n == 0) |
599 | 0 | return code; |
600 | 6 | } |
601 | 6 | } |
602 | 22 | else if (instance->buffered != 1 && instance->buffered != 2) |
603 | 1 | { |
604 | | /* Should never happen. */ |
605 | 1 | return_error(gs_error_Fatal); |
606 | 1 | } |
607 | 31 | } |
608 | 246k | else if ((s[0] & 0xE0) == 0xC0) |
609 | 29 | { |
610 | | /* 1 following bytes */ |
611 | 29 | if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80) |
612 | 4 | { |
613 | 4 | code = send_urc(instance, 1); |
614 | 4 | if (code < 0) { |
615 | 0 | if (code != gs_error_NeedInput || n == 0) |
616 | 0 | return code; |
617 | 0 | } |
618 | 4 | } |
619 | 25 | else if (instance->buffered == 2) |
620 | 7 | { |
621 | | /* Valid encoding of 2 bytes */ |
622 | 7 | val = ((s[0] & 0x1F)<<6) | (s[1] & 0x3f); |
623 | 7 | drop_buffered(instance, 2); |
624 | 7 | code = send_codepoint(instance, val); |
625 | 7 | if (code < 0) { |
626 | 7 | if (code != gs_error_NeedInput || n == 0) |
627 | 1 | return code; |
628 | 7 | } |
629 | 7 | } |
630 | 18 | else if (instance->buffered != 1) |
631 | 2 | { |
632 | | /* Should never happen. */ |
633 | 2 | return_error(gs_error_Fatal); |
634 | 2 | } |
635 | 29 | } |
636 | 246k | else if ((s[0] & 0xC0) == 0x80) |
637 | 13 | { |
638 | | /* A continuation byte at the start. Should never see this. */ |
639 | 13 | code = send_urc(instance, 1); |
640 | 13 | if (code < 0) { |
641 | 0 | if (code != gs_error_NeedInput || n == 0) |
642 | 0 | return code; |
643 | 0 | } |
644 | 13 | } |
645 | 246k | else if (s[0] < 0x80) |
646 | 246k | { |
647 | | /* Simple byte. */ |
648 | 246k | val = s[0]; |
649 | 246k | drop_buffered(instance, 1); |
650 | 246k | code = send_codepoint(instance, val); |
651 | 246k | if (code < 0) { |
652 | 245k | if (code != gs_error_NeedInput || n == 0) |
653 | 386 | return code; |
654 | 245k | } |
655 | 246k | } |
656 | 9 | else |
657 | 9 | { |
658 | | /* Bytes we should never see in a UTF-8 file! (0xf8-0xff) */ |
659 | 9 | code = send_urc(instance, 1); |
660 | 9 | if (code < 0) { |
661 | 0 | if (code != gs_error_NeedInput || n == 0) |
662 | 0 | return code; |
663 | 0 | } |
664 | 9 | } |
665 | 245k | break; |
666 | 245k | case TXT_STATE_UTF16_LE: |
667 | 21.4k | if (instance->buffered < 2) |
668 | 10.7k | break; |
669 | 10.7k | if (s[1] >= 0xD8 && s[1] < 0xDC) |
670 | 102 | { |
671 | | /* High surrogate */ |
672 | 102 | if (instance->buffered < 4) |
673 | 68 | break; |
674 | 34 | if (s[3] < 0xDC || s[3] > 0xDF) |
675 | 29 | { |
676 | | /* Not followed by a low surrogate! Ignore the high surrogate. */ |
677 | 29 | code = send_urc(instance, 2); |
678 | 29 | if (code < 0) |
679 | 29 | return code; |
680 | 0 | break; |
681 | 29 | } |
682 | 5 | val = (((s[0] | (s[1]<<8)) - 0xdc00)<<10) + (s[2] | (s[3]<<8)) - 0xdc00 + 0x10000; |
683 | 5 | drop_buffered(instance, 4); |
684 | 5 | } |
685 | 10.6k | else |
686 | 10.6k | { |
687 | 10.6k | val = s[0] | (s[1]<<8); |
688 | 10.6k | drop_buffered(instance, 2); |
689 | 10.6k | } |
690 | 10.6k | code = send_codepoint(instance, val); |
691 | 10.6k | if (code < 0) { |
692 | 10.6k | if (code != gs_error_NeedInput || n == 0) |
693 | 31 | return code; |
694 | 10.6k | } |
695 | 10.6k | break; |
696 | 117k | case TXT_STATE_UTF16_BE: |
697 | 117k | if (instance->buffered < 2) |
698 | 58.4k | break; |
699 | 58.9k | if (s[0] >= 0xD8 && s[0] < 0xDC) |
700 | 604 | { |
701 | | /* High surrogate */ |
702 | 604 | if (instance->buffered < 4) |
703 | 402 | break; |
704 | 202 | if (s[2] < 0xDC || s[2] > 0xDF) |
705 | 198 | { |
706 | | /* Not followed by a low surrogate! Ignore the high surrogate. */ |
707 | 198 | code = send_urc(instance, 2); |
708 | 198 | if (code < 0) |
709 | 198 | return code; |
710 | 0 | break; |
711 | 198 | } |
712 | 4 | val = (((s[1] | (s[0]<<8)) - 0xdc00)<<10) + (s[3] | (s[2]<<8)) - 0xdc00 + 0x10000; |
713 | 4 | drop_buffered(instance, 4); |
714 | 4 | } |
715 | 58.3k | else |
716 | 58.3k | { |
717 | 58.3k | val = s[1] | (s[0]<<8); |
718 | 58.3k | drop_buffered(instance, 2); |
719 | 58.3k | } |
720 | 58.3k | code = send_codepoint(instance, val); |
721 | 58.3k | if (code < 0) { |
722 | 58.3k | if (code != gs_error_NeedInput || n == 0) |
723 | 77 | return code; |
724 | 58.3k | } |
725 | 58.2k | break; |
726 | 396k | case TXT_STATE_ASCII: |
727 | 792k | while (instance->buffered > 0) |
728 | 396k | { |
729 | 396k | code = send_codepoint(instance, s[0]); |
730 | 396k | if (code < 0) { |
731 | 395k | if (code != gs_error_NeedInput || n == 0) |
732 | 349 | return code; |
733 | 395k | } |
734 | 396k | drop_buffered(instance, 1); |
735 | 396k | } |
736 | 396k | break; |
737 | 396k | default: |
738 | 0 | return_error(gs_error_Fatal); |
739 | 783k | } |
740 | 783k | } |
741 | 99 | return 0; |
742 | 1.17k | } |
743 | | |
/* Parse an entire random access file.
 *
 * This whole-file entry point is compiled out; the corresponding slot in
 * the implementation descriptor below is NULL. As written it opens the
 * file only to check that it can be opened (the handle is closed again
 * without being read) and passes the filename to the PCL interpreter
 * via pl_process_file. */
#if 0
static int
txt_impl_process_file(pl_interp_implementation_t *impl, const char *filename)
{
    txt_interp_instance_t *instance = impl->interp_client_data;
    int code, code1;
    gp_file *file;

    file = gp_fopen(instance->memory, filename, "rb");
    if (file == 0)
        return_error(gs_error_ioerror);

    instance->sub = pl_main_get_pcl_instance(instance->memory);

    code = pl_init_job(instance->sub, instance->device);
    if (code >= 0)
    {
        code = pl_process_file(instance->sub, filename);
    }

    /* Always wind the job down; keep the first error seen. */
    code1 = pl_dnit_job(instance->sub);
    if (code >= 0)
        code = code1;

    gp_fclose(file);

    return code;
}
#endif
774 | | |
775 | | /* Do any setup for parser per-cursor */ |
776 | | static int /* ret 0 or +ve if ok, else -ve error code */ |
777 | | txt_impl_process_begin(pl_interp_implementation_t * impl) |
778 | 674 | { |
779 | 674 | return 0; |
780 | 674 | } |
781 | | |
782 | | /* Parse a cursor-full of data */ |
783 | | static int |
784 | | txt_impl_process(pl_interp_implementation_t *impl, stream_cursor_read *cursor) |
785 | 1.17k | { |
786 | 1.17k | txt_interp_instance_t *instance = impl->interp_client_data; |
787 | 1.17k | int avail; |
788 | 1.17k | int code; |
789 | | |
790 | 1.17k | avail = cursor->limit - cursor->ptr; |
791 | 1.17k | code = process_block(instance, cursor->ptr + 1, avail); |
792 | 1.17k | cursor->ptr = cursor->limit; |
793 | | |
794 | 1.17k | return code; |
795 | 1.17k | } |
796 | | |
797 | | static int /* ret 0 or +ve if ok, else -ve error code */ |
798 | | txt_impl_process_end(pl_interp_implementation_t * impl) |
799 | 674 | { |
800 | 674 | return 0; |
801 | 674 | } |
802 | | |
803 | | /* Skip to end of job. |
804 | | * Return 1 if done, 0 ok but EOJ not found, else negative error code. |
805 | | */ |
806 | | static int |
807 | | txt_impl_flush_to_eoj(pl_interp_implementation_t *impl, stream_cursor_read *pcursor) |
808 | 3 | { |
809 | | /* assume SO files cannot be pjl embedded */ |
810 | 3 | pcursor->ptr = pcursor->limit; |
811 | 3 | return 0; |
812 | 3 | } |
813 | | |
814 | | /* Parser action for end-of-file */ |
815 | | static int |
816 | | txt_impl_process_eof(pl_interp_implementation_t *impl) |
817 | 671 | { |
818 | 671 | txt_interp_instance_t *instance = impl->interp_client_data; |
819 | | |
820 | 671 | if (instance->sub) |
821 | 671 | return pl_process_eof(instance->sub); |
822 | | |
823 | 0 | return 0; |
824 | 671 | } |
825 | | |
826 | | /* Report any errors after running a job */ |
827 | | static int |
828 | | txt_impl_report_errors(pl_interp_implementation_t *impl, |
829 | | int code, /* prev termination status */ |
830 | | long file_position, /* file position of error, -1 if unknown */ |
831 | | bool force_to_cout /* force errors to cout */ |
832 | | ) |
833 | 3 | { |
834 | 3 | txt_interp_instance_t *instance = impl->interp_client_data; |
835 | 3 | int ret = 0; |
836 | | |
837 | 3 | if (instance->sub) |
838 | 3 | ret = pl_report_errors(instance->sub, code, file_position, force_to_cout); |
839 | | |
840 | 3 | return ret; |
841 | 3 | } |
842 | | |
843 | | /* Wrap up interp instance after a "job" */ |
844 | | static int |
845 | | txt_impl_dnit_job(pl_interp_implementation_t *impl) |
846 | 674 | { |
847 | 674 | txt_interp_instance_t *instance = impl->interp_client_data; |
848 | 674 | int code = 0; |
849 | | |
850 | 674 | if (instance->sub) |
851 | 674 | code = pl_dnit_job(instance->sub); |
852 | 674 | instance->sub = NULL; |
853 | 674 | instance->device = NULL; |
854 | | |
855 | 674 | return code; |
856 | 674 | } |
857 | | |
858 | | /* Deallocate a interpreter instance */ |
859 | | static int |
860 | | txt_impl_deallocate_interp_instance(pl_interp_implementation_t *impl) |
861 | 8.97k | { |
862 | 8.97k | txt_interp_instance_t *instance = impl->interp_client_data; |
863 | | |
864 | 8.97k | gs_free_object(instance->memory, instance, "so_impl_deallocate_interp_instance"); |
865 | | |
866 | 8.97k | return 0; |
867 | 8.97k | } |
868 | | |
/* Parser implementation descriptor.
 * The entries are positional, so their order must match the member order
 * of pl_interp_implementation_t (see pltop.h). NULL marks an entry point
 * this language does not provide. */
pl_interp_implementation_t txt_implementation =
{
    txt_impl_characteristics,
    txt_impl_allocate_interp_instance,
    NULL,                       /* get_device_memory */
    NULL,                       /* set_param */
    NULL,                       /* add_path */
    NULL,                       /* post_args_init */
    txt_impl_init_job,
    NULL,                       /* run_prefix_commands */
    NULL,                       /* txt_impl_process_file, */
    txt_impl_process_begin,
    txt_impl_process,
    txt_impl_process_end,
    txt_impl_flush_to_eoj,
    txt_impl_process_eof,
    txt_impl_report_errors,
    txt_impl_dnit_job,
    txt_impl_deallocate_interp_instance,
    NULL,                       /* NOTE(review): unlabelled in the original; presumably the
                                 * interp_client_data slot filled in by allocate - confirm
                                 * against pltop.h */
};