Coverage Report

Created: 2026-04-09 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ghostpdl/gpdl/txttop.c
Line
Count
Source
1
/* Copyright (C) 2026 Artifex Software, Inc.
2
   All Rights Reserved.
3
4
   This software is provided AS-IS with no warranty, either express or
5
   implied.
6
7
   This software is distributed under license and may not be copied,
8
   modified or distributed except as expressly authorized under the terms
9
   of the license contained in the file LICENSE in this distribution.
10
11
   Refer to licensing information at http://www.artifex.com or contact
12
   Artifex Software, Inc.,  39 Mesa Street, Suite 108A, San Francisco,
13
   CA 94129, USA, for further information.
14
*/
15
16
/* Top-level API implementation for text file handling */
17
18
/* Language wrapper implementation (see pltop.h) */
19
20
21
/* Enable the following for a dump of the codepoints to stdout. */
22
/* #define DEBUG_CODEPOINTS */
23
24
/* Enable the following for a hacky dump of the output PCL to file. */
25
/* #define DEBUG_DUMP_PCL */
26
27
#ifdef DEBUG_DUMP_PCL
#include <stdio.h>
#include <stdlib.h> /* for atexit */

/* Dump file for the generated PCL; opened lazily by debug_as_pcl(). */
static FILE *debug_pcl_out = NULL;

/* atexit() handler: close the dump file if it was ever opened. */
static void wipe(void)
{
        if (debug_pcl_out != NULL)
        {
            fclose(debug_pcl_out);
            debug_pcl_out = NULL;
        }
}

/*
 * Append n bytes starting at p to the file "debug_pcl_out" in the current
 * directory. The file is opened on first use. This is a best-effort debug
 * aid: if the file cannot be opened the data is silently discarded rather
 * than passing a NULL stream to fwrite().
 */
static void
debug_as_pcl(const char *p, int n)
{
        if (n <= 0)
            return;
        if (debug_pcl_out == NULL)
        {
            debug_pcl_out = fopen("debug_pcl_out", "wb");
            if (debug_pcl_out == NULL)
                return;
            /* Register the close handler only once, after a successful open. */
            atexit(wipe);
        }
        fwrite(p, n, 1, debug_pcl_out);
}
#endif
46
47
#include "pltop.h"
48
#include "plmain.h"
49
50
#include "plparse.h" /* for e_ExitLanguage */
51
#include "plmain.h"
52
#include "gxdevice.h" /* so we can include gxht.h below */
53
#include "gserrors.h"
54
#include "gp.h"
55
#include "assert_.h"
56
57
/*
58
 * The TXT interpreter is identical to pl_interp_t.
59
 * The TXT interpreter instance is derived from pl_interp_implementation_t.
60
 */
61
62
/* Decoder states for the incoming byte stream; process_block() switches on
 * the current value held in txt_interp_instance_s.state. */
typedef enum
63
{
64
    TXT_STATE_INIT = 0,     /* Collecting the first bytes to look for a BOM. */
65
    TXT_STATE_UTF8,         /* UTF-8 BOM seen. */
66
    TXT_STATE_UTF8_MAYBE,   /* No BOM; decoded as UTF-8 until send_urc() falls back to ASCII. */
67
    TXT_STATE_UTF16_LE,     /* FF FE BOM seen. */
68
    TXT_STATE_UTF16_BE,     /* FE FF BOM seen. */
69
    TXT_STATE_ASCII         /* Each input byte is sent as one codepoint. */
70
} txt_state_t;
71
72
/* Per-instance state of the TXT interpreter. Allocated by
 * txt_impl_allocate_interp_instance() and reset for each job by
 * txt_impl_init_job(). */
typedef struct txt_interp_instance_s txt_interp_instance_t;
73
74
struct txt_interp_instance_s
75
{
76
    gs_memory_t *memory;                /* memory allocator to use */
77
78
    pl_interp_implementation_t *sub;    /* interpreter obtained from pl_main_get_pcl_instance(); send_bytes() feeds it */
79
    gx_device *device;                  /* device handed to txt_impl_init_job() */
80
81
    int buffered;                       /* number of valid bytes in buffer[] */
82
    byte buffer[4];                     /* input bytes not yet decoded */
83
84
    int state;                          /* current txt_state_t value */
85
    int detected;                       /* identify_from_buffer() result; TXT_UNDETECTED until the first block */
86
    int just_had_lf;                    /* last codepoint was an emitted LF (send_codepoint swallows a following CR) */
87
    int just_had_cr;                    /* last codepoint was an emitted CR (send_codepoint swallows a following LF) */
88
    int col;                            /* column counter used by send_codepoint() for tab expansion */
89
    int sent;                           /* set by send_codepoint()/send_urc(); cleared each process_block() loop pass */
90
};
91
92
/* Results of identify_from_buffer(). TXT_UNDETECTED is never returned by
 * it; it marks "detection has not been run yet". */
enum
{
    TXT_UNDETECTED = -1,
    TXT_UNKNOWN,        /* Does not look like text. */
    TXT_UTF8,           /* Starts with a UTF-8 BOM. */
    TXT_UTF8_MAYBE,     /* No BOM, but decodes cleanly as UTF-8. */
    TXT_UTF16_LE,       /* Starts with FF FE. */
    TXT_UTF16_BE,       /* Starts with FE FF. */
    TXT_ASCII,          /* Not UTF-8, but few enough top-bit-set bytes. */
};

/*
 * Guess the text encoding of the len bytes at s.
 *
 * A byte order mark decides immediately. Otherwise the buffer is scanned
 * once: any control character other than tab, form feed, CR or LF gives
 * TXT_UNKNOWN; a buffer in which every top-bit-set byte is part of a well
 * formed UTF-8 sequence gives TXT_UTF8_MAYBE (this includes pure 7-bit
 * text and an empty buffer); otherwise the result is TXT_ASCII unless more
 * than a tenth of the bytes have the top bit set, in which case it is
 * TXT_UNKNOWN.
 *
 * A multi-byte sequence cut short by the end of the buffer is not held
 * against the UTF-8 hypothesis.
 */
static int
identify_from_buffer(const unsigned char *s, int len)
{
    int count_controls = 0;
    int count_hi = 0;
    int plausibly_utf8 = 1;
    int i;

    /* UTF-8 with a BOM */
    if (len >= 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf)
        return TXT_UTF8;
    /* UTF-16 (little endian) */
    if (len >= 2 && s[0] == 0xff && s[1] == 0xfe)
        return TXT_UTF16_LE;
    /* UTF-16 (big endian) */
    if (len >= 2 && s[0] == 0xfe && s[1] == 0xff)
        return TXT_UTF16_BE;

    /* Gather some stats. */
    for (i = 0; i < len; i++)
    {
        unsigned char c = s[i];

        if (c == 9 || c == 12)
        {
            /* Tab or form feed. We'll let those slide. */
        }
        else if (c == 10)
        {
            /* LF, optionally followed by CR: treat the pair as one. */
            if (i+1 < len && s[i+1] == 13)
                i++;
        }
        else if (c == 13)
        {
            /* CR, optionally followed by LF: treat the pair as one. */
            if (i+1 < len && s[i+1] == 10)
                i++;
        }
        else if (c < 32 || c == 0x7f)
        {
            count_controls++;
        }
        else if (c < 0x7f)
        {
            /* Seems like a reasonable ASCII value. */
        }
        else
        {
            int follow; /* continuation bytes the lead byte calls for */
            int k;

            count_hi++;
            if ((c & 0xF8) == 0xF0)
                follow = 3;
            else if ((c & 0xF0) == 0xE0)
                follow = 2;
            else if ((c & 0xE0) == 0xC0)
                follow = 1;
            else
                follow = 0; /* stray continuation byte, or 0xF8-0xFF */

            if (follow == 0)
            {
                plausibly_utf8 = 0;
            }
            else
            {
                /* Stop at the first byte that is present in the buffer
                 * but is not a continuation byte. */
                for (k = 1; k <= follow; k++)
                {
                    if (i+k < len && (s[i+k] & 0xC0) != 0x80)
                        break;
                }
                if (k <= follow)
                    plausibly_utf8 = 0;
                else
                    i += follow;
            }
        }
    }

    /* Any (non tab/cr/lf/ff) control characters probably means this isn't text. */
    if (count_controls > 0)
        return TXT_UNKNOWN;
    /* If we've managed to decode all that as utf8 without problem, it's probably text. */
    if (plausibly_utf8)
        return TXT_UTF8_MAYBE;
    /* If we're hitting too many top bit set chars, give up. */
    if (count_hi > len/10)
        return TXT_UNKNOWN;

    return TXT_ASCII;
}
201
202
static int
203
txt_detect_language(const char *t, int len)
204
19.2k
{
205
19.2k
    const unsigned char *s = (const unsigned char *)t;
206
207
19.2k
    switch (identify_from_buffer(s, len))
208
19.2k
    {
209
1
    case TXT_UTF8:
210
64
    case TXT_UTF16_LE:
211
168
    case TXT_UTF16_BE:
212
        /* PCL spots files with lots of ESCs in them at confidence
213
         * level 80. We'll use 70, cos we don't want to override that. */
214
168
        return 70;
215
520
    case TXT_UTF8_MAYBE:
216
727
    case TXT_ASCII:
217
727
        return 60;
218
0
    default:
219
18.3k
    case TXT_UNKNOWN:
220
18.3k
        break;
221
19.2k
    }
222
223
18.3k
    return 0;
224
19.2k
}
225
226
static const pl_interp_characteristics_t *
227
txt_impl_characteristics(const pl_interp_implementation_t *pimpl)
228
40.9k
{
229
40.9k
    static pl_interp_characteristics_t txt_characteristics =
230
40.9k
    {
231
40.9k
        "TXT",
232
40.9k
        txt_detect_language,
233
40.9k
    };
234
40.9k
    return &txt_characteristics;
235
40.9k
}
236
237
/* Do per-instance interpreter allocation/init. No device is set yet */
238
static int
239
txt_impl_allocate_interp_instance(pl_interp_implementation_t *impl,
240
                                  gs_memory_t *pmem)
241
8.97k
{
242
8.97k
    txt_interp_instance_t *instance;
243
244
8.97k
    instance = (txt_interp_instance_t *) gs_alloc_bytes(pmem,
245
8.97k
            sizeof(txt_interp_instance_t), "txt_impl_allocate_interp_instance");
246
247
8.97k
    if (!instance)
248
0
        return_error(gs_error_VMerror);
249
250
8.97k
    instance->memory = pmem;
251
8.97k
    instance->sub = NULL;
252
253
8.97k
    impl->interp_client_data = instance;
254
255
8.97k
    return 0;
256
8.97k
}
257
258
/* Prepare interp instance for the next "job" */
259
static int
260
txt_impl_init_job(pl_interp_implementation_t *impl,
261
                  gx_device                  *pdevice)
262
674
{
263
674
    txt_interp_instance_t *instance = impl->interp_client_data;
264
265
674
    instance->device = pdevice;
266
674
    instance->state = TXT_STATE_INIT;
267
674
    instance->buffered = 0;
268
674
    instance->detected = TXT_UNDETECTED;
269
674
    instance->just_had_lf = 0;
270
674
    instance->just_had_cr = 0;
271
674
    instance->col = 0;
272
273
674
    instance->sub = pl_main_get_pcl_instance(instance->memory);
274
275
674
    return pl_init_job(instance->sub, instance->device);
276
674
}
277
278
5.54k
#define ESC 27
279
280
static int
281
send_bytes(txt_interp_instance_t *instance, const byte *p, int n)
282
718k
{
283
718k
    stream_cursor_read cursor;
284
285
#ifdef DEBUG_DUMP_PCL
286
    debug_as_pcl(p, n);
287
#endif
288
289
718k
    stream_cursor_read_init(&cursor, p, n);
290
291
718k
    return instance->sub->proc_process(instance->sub, &cursor);
292
718k
}
293
294
static void
295
drop_buffered(txt_interp_instance_t *instance, int n)
296
711k
{
297
711k
    assert(instance->buffered >= n);
298
711k
    instance->buffered -= n;
299
711k
    if (instance->buffered > 0)
300
1.26k
        memmove(instance->buffer, &instance->buffer[n], instance->buffered);
301
711k
}
302
303
static int
304
send_pcl_init(txt_interp_instance_t *instance)
305
504
{
306
504
    static byte init[] = {
307
504
            ESC, 'E',                     /* Reset */
308
504
            ESC, '&', 'l', '0', 'O',      /* Orientation */
309
504
            ESC, '&', 'k', '1', '0', 'H', /* Horizontal spacing 10/120 of an inch. */
310
504
            ESC, '&', 'l', '8', 'C',      /* Vertical line spacing 8/48 of an inch. */
311
504
            ESC, '&', 't', '8', '3', 'P', /* &t = double byte parsing, 83 = utf-8, P = ? */
312
504
            ESC, '(', '1', '8', 'N',      /* Primary symbol set = 18N = Unicode */
313
504
            ESC, '(', 's', '0', 'P',      /* Fixed pitch */
314
504
            ESC, '(', 's', '1', '2', 'H', /* Secondary fixed pitch 12cpi */
315
504
            ESC, '(', 's', '8', 'V',      /* Point size 8 */
316
504
            ESC, '(', 's', '3', 'T',      /* Typeface number 3 */
317
504
            ESC, '&', 's', '0', 'C'       /* Wrappity wrap wrap */
318
504
    };
319
320
504
    return send_bytes(instance, init, sizeof(init));
321
504
}
322
323
static int
324
send_urc(txt_interp_instance_t *instance, int n)
325
257
{
326
257
    static byte unicode_replacement_char_as_utf8[] = { 0xe3, 0xbf, 0xbd };
327
328
257
    if (instance->state == TXT_STATE_UTF8_MAYBE)
329
30
    {
330
        /* We were guessing that this was UTF8. Now we know it's not. Drop back to ascii. */
331
30
        instance->state = TXT_STATE_ASCII;
332
30
        return 0;
333
30
    }
334
335
227
    drop_buffered(instance, n);
336
337
227
    instance->sent = 1;
338
227
    return send_bytes(instance, unicode_replacement_char_as_utf8, sizeof(unicode_replacement_char_as_utf8));
339
257
}
340
341
static int
342
send_utf8(txt_interp_instance_t *instance, int val)
343
717k
{
344
717k
    byte buf[4];
345
717k
    int n;
346
347
    /* Finally, send the val! */
348
717k
    if (val < 0x80)
349
611k
    {
350
611k
        buf[0] = val;
351
611k
        n = 1;
352
611k
    }
353
106k
    else if (val < 0x800)
354
69.8k
    {
355
69.8k
        buf[0] = 0xC0 + (val>>6);
356
69.8k
        buf[1] = 0x80 + (val & 0x3F);
357
69.8k
        n = 2;
358
69.8k
    }
359
36.9k
    else if (val < 0x10000)
360
36.9k
    {
361
36.9k
        buf[0] = 0xE0 + (val>>12);
362
36.9k
        buf[1] = 0x80 + ((val>>6) & 0x3F);
363
36.9k
        buf[2] = 0x80 + (val & 0x3F);
364
36.9k
        n = 3;
365
36.9k
    }
366
7
    else
367
7
    {
368
7
        buf[0] = 0xF0 + (val>>18);
369
7
        buf[1] = 0x80 + ((val>>12) & 0x3F);
370
7
        buf[2] = 0x80 + ((val>>6) & 0x3F);
371
7
        buf[3] = 0x80 + (val & 0x3F);
372
7
        n = 4;
373
7
    }
374
717k
    return send_bytes(instance, buf, n);
375
717k
}
376
377
/* All our actual codepoints should flow through here. So this is where
378
 * we do the housekeeping. */
379
static int
380
send_codepoint(txt_interp_instance_t *instance, int val)
381
711k
{
382
711k
    int code;
383
384
#ifdef DEBUG_CODEPOINTS
385
    dprintf3("Sending codepoint %d (%x) %c\n", val, val, val >= 32 && val <= 255 && val != 127 ? val : '.');
386
#endif
387
388
711k
    instance->sent = 1;
389
    /* Tidy up whatever mess of CR/LF we are passed. */
390
711k
    if (val == '\r')
391
1.46k
    {
392
        /* If we've got a CR and we've just had a LF, swallow this. */
393
1.46k
        if (instance->just_had_lf)
394
196
        {
395
196
            instance->just_had_lf = 0;
396
196
            return 0;
397
196
        }
398
1.26k
        instance->just_had_cr = 1;
399
1.26k
        val = '\n';
400
1.26k
    }
401
710k
    else if (val == '\n')
402
3.27k
    {
403
        /* If we've got a LF and we've just had a CR, swallow this. */
404
3.27k
        if (instance->just_had_cr)
405
554
        {
406
554
            instance->just_had_cr = 0;
407
554
            return 0;
408
554
        }
409
2.71k
        instance->just_had_lf = 1;
410
2.71k
    }
411
707k
    else
412
707k
    {
413
707k
        instance->just_had_cr = 0;
414
707k
        instance->just_had_lf = 0;
415
707k
    }
416
417
    /* Keep track of what column we're at to so we can do tab handling. */
418
710k
    if (val == '\n')
419
3.98k
    {
420
3.98k
        instance->col = 0;
421
3.98k
        code = send_utf8(instance, '\n');
422
3.98k
        if (code < 0 && code != gs_error_NeedInput)
423
0
            return code;
424
3.98k
        return send_utf8(instance, '\r');
425
3.98k
    }
426
707k
    if (val == '\t')
427
665
    {
428
665
        int spaces = 8 - (instance->col & 7);
429
4.29k
        while (spaces--)
430
3.62k
        {
431
3.62k
            int code = send_utf8(instance, ' ');
432
3.62k
            if (code < 0 && code != gs_error_NeedInput)
433
0
                return code;
434
3.62k
            instance->col++;
435
3.62k
        }
436
665
        return 0;
437
665
    }
438
706k
    instance->col++;
439
440
#if 0
441
    /* No need for this as PCL line wrapping works for us. If PCL ever
442
     * decides to wrap at a number of columns that aren't a multiple of
443
     * 8 then we'll need to do it manually again!. */
444
    if (instance->col == 80)
445
    {
446
        instance->col = 0;
447
        code = send_utf8(instance, '\n');
448
        if (code < 0 && code != gs_error_NeedInput))
449
            return code;
450
        return send_utf8(instance, '\r');
451
    }
452
#endif
453
454
706k
    return send_utf8(instance, val);
455
707k
}
456
457
static int
458
process_block(txt_interp_instance_t *instance, const byte *ptr, int n)
459
1.17k
{
460
1.17k
    int code;
461
1.17k
    byte *s = &instance->buffer[0];
462
1.17k
    int old_state = instance->state;
463
1.17k
    int val;
464
465
1.17k
    if (instance->detected == TXT_UNDETECTED)
466
674
    {
467
674
        instance->detected = identify_from_buffer(ptr, n);
468
        /* If we're thinking we're ASCII, go straight there. Otherwise, we'll let the
469
         * BOM detection below run its course. */
470
674
        if (instance->detected == TXT_ASCII)
471
152
            instance->state = TXT_STATE_ASCII;
472
674
    }
473
474
1.17k
    instance->sent = 0;
475
783k
    while (n)
476
783k
    {
477
        /* instance->sent records whether we pulled anything out of the buffer
478
         * last time round the loop. If we changed state, then don't refill the
479
         * buffer. Otherwise only fill the buffer if we didn't a char last time
480
         * (maybe we need char 2 of a 2 char sequence?) or if we haven't got
481
         * anything in the buffer already. */
482
783k
        if (instance->state == old_state && (!instance->sent || instance->buffered == 0))
483
781k
        {
484
781k
            assert(instance->buffered < 4);
485
781k
            s[instance->buffered++] = *ptr++;
486
781k
            n--;
487
781k
        }
488
783k
        old_state = instance->state;
489
490
783k
        instance->sent = 0;
491
783k
        switch (instance->state)
492
783k
        {
493
1.37k
        case TXT_STATE_INIT:
494
495
1.37k
            if (instance->buffered == 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf)
496
1
            {
497
1
                instance->state = TXT_STATE_UTF8;
498
1
                drop_buffered(instance, 3);
499
1
            }
500
1.37k
            else if (instance->buffered == 2 && s[0] == 0xff && s[1] == 0xfe)
501
55
            {
502
55
                instance->state = TXT_STATE_UTF16_LE;
503
55
                drop_buffered(instance, 2);
504
55
            }
505
1.32k
            else if (instance->buffered == 2 && s[0] == 0xfe && s[1] == 0xff)
506
100
            {
507
100
                instance->state = TXT_STATE_UTF16_BE;
508
100
                drop_buffered(instance, 2);
509
100
            }
510
1.22k
            else if (instance->buffered >= 3)
511
348
            {
512
                /* We haven't found a BOM, try for utf8. */
513
348
                instance->state = TXT_STATE_UTF8_MAYBE;
514
348
            }
515
516
            /* If we've recognised the BOM, then send the init string. */
517
1.37k
            if (instance->state != TXT_STATE_INIT)
518
504
            {
519
504
                code = send_pcl_init(instance);
520
504
                if (code < 0) {
521
504
                    if (code != gs_error_NeedInput || n == 0)
522
4
                        return code;
523
504
                }
524
504
            }
525
1.37k
            break;
526
1.37k
        case TXT_STATE_UTF8:
527
246k
        case TXT_STATE_UTF8_MAYBE:
528
246k
            if ((s[0] & 0xF8) == 0xF0)
529
38
            {
530
                /* 3 following bytes */
531
38
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
532
1
                {
533
1
                    code = send_urc(instance, 1);
534
1
                    if (code < 0) {
535
0
                        if (code != gs_error_NeedInput || n == 0)
536
0
                            return code;
537
0
                    }
538
1
                }
539
37
                else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80)
540
0
                {
541
0
                    code = send_urc(instance, 2);
542
0
                    if (code < 0) {
543
0
                        if (code != gs_error_NeedInput || n == 0)
544
0
                            return code;
545
0
                    }
546
0
                }
547
37
                else if (instance->buffered == 4 && (s[3] & 0xC0) != 0x80)
548
0
                {
549
0
                    code = send_urc(instance, 3);
550
0
                    if (code < 0) {
551
0
                        if (code != gs_error_NeedInput || n == 0)
552
0
                            return code;
553
0
                    }
554
0
                }
555
37
                else if (instance->buffered == 4)
556
7
                {
557
                    /* Valid encoding of 4 bytes */
558
7
                    val = ((s[0] & 0x7)<<18) | ((s[1] & 0x3f)<<12) | ((s[2] & 0x3f)<<6) |  (s[3] & 0x3f);
559
7
                    drop_buffered(instance, 4);
560
7
                    code = send_codepoint(instance, val);
561
7
                    if (code < 0) {
562
7
                        if (code != gs_error_NeedInput || n == 0)
563
1
                            return code;
564
7
                    }
565
7
                }
566
30
                else if (instance->buffered != 1 && instance->buffered != 2 && instance->buffered != 3)
567
0
                {
568
                    /* Should never happen. */
569
0
                    return_error(gs_error_Fatal);
570
0
                }
571
38
            }
572
246k
            else if ((s[0] & 0xF0) == 0xE0)
573
31
            {
574
                /* 2 following bytes */
575
31
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
576
3
                {
577
3
                    code = send_urc(instance, 1);
578
3
                    if (code < 0) {
579
0
                        if (code != gs_error_NeedInput || n == 0)
580
0
                            return code;
581
0
                    }
582
3
                }
583
28
                else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80)
584
0
                {
585
0
                    code = send_urc(instance, 2);
586
0
                    if (code < 0) {
587
0
                        if (code != gs_error_NeedInput || n == 0)
588
0
                            return code;
589
0
                    }
590
0
                }
591
28
                else if (instance->buffered == 3)
592
6
                {
593
                    /* Valid encoding of 3 bytes */
594
6
                    val = ((s[0] & 0xF)<<12) | ((s[1] & 0x3f)<<6) | (s[2] & 0x3f);
595
6
                    drop_buffered(instance, 3);
596
6
                    code = send_codepoint(instance, val);
597
6
                    if (code < 0) {
598
6
                        if (code != gs_error_NeedInput || n == 0)
599
0
                            return code;
600
6
                    }
601
6
                }
602
22
                else if (instance->buffered != 1 && instance->buffered != 2)
603
1
                {
604
                    /* Should never happen. */
605
1
                    return_error(gs_error_Fatal);
606
1
                }
607
31
            }
608
246k
            else if ((s[0] & 0xE0) == 0xC0)
609
29
            {
610
                /* 1 following bytes */
611
29
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
612
4
                {
613
4
                    code = send_urc(instance, 1);
614
4
                    if (code < 0) {
615
0
                        if (code != gs_error_NeedInput || n == 0)
616
0
                            return code;
617
0
                    }
618
4
                }
619
25
                else if (instance->buffered == 2)
620
7
                {
621
                    /* Valid encoding of 2 bytes */
622
7
                    val = ((s[0] & 0x1F)<<6) | (s[1] & 0x3f);
623
7
                    drop_buffered(instance, 2);
624
7
                    code = send_codepoint(instance, val);
625
7
                    if (code < 0) {
626
7
                        if (code != gs_error_NeedInput || n == 0)
627
1
                            return code;
628
7
                    }
629
7
                }
630
18
                else if (instance->buffered != 1)
631
2
                {
632
                    /* Should never happen. */
633
2
                    return_error(gs_error_Fatal);
634
2
                }
635
29
            }
636
246k
            else if ((s[0] & 0xC0) == 0x80)
637
13
            {
638
                /* A continuation byte at the start. Should never see this. */
639
13
                code = send_urc(instance, 1);
640
13
                if (code < 0) {
641
0
                    if (code != gs_error_NeedInput || n == 0)
642
0
                        return code;
643
0
                }
644
13
            }
645
246k
            else if (s[0] < 0x80)
646
246k
            {
647
                /* Simple byte. */
648
246k
                val = s[0];
649
246k
                drop_buffered(instance, 1);
650
246k
                code = send_codepoint(instance, val);
651
246k
                if (code < 0) {
652
245k
                    if (code != gs_error_NeedInput || n == 0)
653
386
                        return code;
654
245k
                }
655
246k
            }
656
9
            else
657
9
            {
658
                /* Bytes we should never see in a UTF-8 file! (0xf8-0xff) */
659
9
                code = send_urc(instance, 1);
660
9
                if (code < 0) {
661
0
                    if (code != gs_error_NeedInput || n == 0)
662
0
                        return code;
663
0
                }
664
9
            }
665
245k
            break;
666
245k
        case TXT_STATE_UTF16_LE:
667
21.4k
            if (instance->buffered < 2)
668
10.7k
                break;
669
10.7k
            if (s[1] >= 0xD8 && s[1] < 0xDC)
670
102
            {
671
                /* High surrogate */
672
102
                if (instance->buffered < 4)
673
68
                    break;
674
34
                if (s[3] < 0xDC || s[3] > 0xDF)
675
29
                {
676
                    /* Not followed by a low surrogate! Ignore the high surrogate. */
677
29
                    code = send_urc(instance, 2);
678
29
                    if (code < 0)
679
29
                        return code;
680
0
                    break;
681
29
                }
682
5
                val = (((s[0] | (s[1]<<8)) - 0xdc00)<<10) + (s[2] | (s[3]<<8)) - 0xdc00 + 0x10000;
683
5
                drop_buffered(instance, 4);
684
5
            }
685
10.6k
            else
686
10.6k
            {
687
10.6k
                val = s[0] | (s[1]<<8);
688
10.6k
                drop_buffered(instance, 2);
689
10.6k
            }
690
10.6k
            code = send_codepoint(instance, val);
691
10.6k
            if (code < 0) {
692
10.6k
                if (code != gs_error_NeedInput || n == 0)
693
31
                    return code;
694
10.6k
            }
695
10.6k
            break;
696
117k
        case TXT_STATE_UTF16_BE:
697
117k
            if (instance->buffered < 2)
698
58.4k
                break;
699
58.9k
            if (s[0] >= 0xD8 && s[0] < 0xDC)
700
604
            {
701
                /* High surrogate */
702
604
                if (instance->buffered < 4)
703
402
                    break;
704
202
                if (s[2] < 0xDC || s[2] > 0xDF)
705
198
                {
706
                    /* Not followed by a low surrogate! Ignore the high surrogate. */
707
198
                    code = send_urc(instance, 2);
708
198
                    if (code < 0)
709
198
                        return code;
710
0
                    break;
711
198
                }
712
4
                val = (((s[1] | (s[0]<<8)) - 0xdc00)<<10) + (s[3] | (s[2]<<8)) - 0xdc00 + 0x10000;
713
4
                drop_buffered(instance, 4);
714
4
            }
715
58.3k
            else
716
58.3k
            {
717
58.3k
                val = s[1] | (s[0]<<8);
718
58.3k
                drop_buffered(instance, 2);
719
58.3k
            }
720
58.3k
            code = send_codepoint(instance, val);
721
58.3k
            if (code < 0) {
722
58.3k
                if (code != gs_error_NeedInput || n == 0)
723
77
                    return code;
724
58.3k
            }
725
58.2k
            break;
726
396k
        case TXT_STATE_ASCII:
727
792k
            while (instance->buffered > 0)
728
396k
            {
729
396k
                code = send_codepoint(instance, s[0]);
730
396k
                if (code < 0) {
731
395k
                    if (code != gs_error_NeedInput || n == 0)
732
349
                        return code;
733
395k
                }
734
396k
                drop_buffered(instance, 1);
735
396k
            }
736
396k
            break;
737
396k
        default:
738
0
            return_error(gs_error_Fatal);
739
783k
        }
740
783k
    }
741
99
    return 0;
742
1.17k
}
743
744
/* Parse an entire random access file */
745
/* Disabled: the implementation table has NULL in this slot. Kept for
 * reference only. */
#if 0
746
static int
747
txt_impl_process_file(pl_interp_implementation_t *impl, const char *filename)
748
{
749
    txt_interp_instance_t *instance = impl->interp_client_data;
750
    int code, code1;
751
    gp_file *file;
752
753
    /* NOTE(review): the file is opened and closed here but never read;
     * the filename itself is what gets passed on below. */
    file = gp_fopen(instance->memory, filename, "rb");
754
    if (file == 0)
755
        return_error(gs_error_ioerror);
756
757
    instance->sub = pl_main_get_pcl_instance(instance->memory);
758
759
    code = pl_init_job(instance->sub, instance->device);
760
    if (code >= 0)
761
    {
762
        code = pl_process_file(instance->sub, filename);
763
    }
764
765
    /* The job is always wound down; its result is only reported if
     * nothing failed earlier. */
    code1 = pl_dnit_job(instance->sub);
766
    if (code >= 0)
767
        code = code1;
768
769
    gp_fclose(file);
770
771
    return code;
772
}
773
#endif
774
775
/* Do any setup for parser per-cursor */
776
static int                      /* ret 0 or +ve if ok, else -ve error code */
777
txt_impl_process_begin(pl_interp_implementation_t * impl)
778
674
{
779
674
    return 0;
780
674
}
781
782
/* Parse a cursor-full of data */
783
static int
784
txt_impl_process(pl_interp_implementation_t *impl, stream_cursor_read *cursor)
785
1.17k
{
786
1.17k
    txt_interp_instance_t *instance = impl->interp_client_data;
787
1.17k
    int avail;
788
1.17k
    int code;
789
790
1.17k
    avail = cursor->limit - cursor->ptr;
791
1.17k
    code = process_block(instance, cursor->ptr + 1, avail);
792
1.17k
    cursor->ptr = cursor->limit;
793
794
1.17k
    return code;
795
1.17k
}
796
797
static int                      /* ret 0 or +ve if ok, else -ve error code */
798
txt_impl_process_end(pl_interp_implementation_t * impl)
799
674
{
800
674
    return 0;
801
674
}
802
803
/* Skip to end of job.
804
 * Return 1 if done, 0 ok but EOJ not found, else negative error code.
805
 */
806
static int
807
txt_impl_flush_to_eoj(pl_interp_implementation_t *impl, stream_cursor_read *pcursor)
808
3
{
809
    /* assume SO files cannot be pjl embedded */
810
3
    pcursor->ptr = pcursor->limit;
811
3
    return 0;
812
3
}
813
814
/* Parser action for end-of-file */
815
static int
816
txt_impl_process_eof(pl_interp_implementation_t *impl)
817
671
{
818
671
    txt_interp_instance_t *instance = impl->interp_client_data;
819
820
671
    if (instance->sub)
821
671
        return pl_process_eof(instance->sub);
822
823
0
    return 0;
824
671
}
825
826
/* Report any errors after running a job */
827
static int
828
txt_impl_report_errors(pl_interp_implementation_t *impl,
829
                       int code,           /* prev termination status */
830
                       long file_position, /* file position of error, -1 if unknown */
831
                       bool force_to_cout  /* force errors to cout */
832
                       )
833
3
{
834
3
    txt_interp_instance_t *instance = impl->interp_client_data;
835
3
    int ret = 0;
836
837
3
    if (instance->sub)
838
3
        ret = pl_report_errors(instance->sub, code, file_position, force_to_cout);
839
840
3
    return ret;
841
3
}
842
843
/* Wrap up interp instance after a "job" */
844
static int
845
txt_impl_dnit_job(pl_interp_implementation_t *impl)
846
674
{
847
674
    txt_interp_instance_t *instance = impl->interp_client_data;
848
674
    int code = 0;
849
850
674
    if (instance->sub)
851
674
        code = pl_dnit_job(instance->sub);
852
674
    instance->sub = NULL;
853
674
    instance->device = NULL;
854
855
674
    return code;
856
674
}
857
858
/* Deallocate a interpreter instance */
859
static int
860
txt_impl_deallocate_interp_instance(pl_interp_implementation_t *impl)
861
8.97k
{
862
8.97k
    txt_interp_instance_t *instance = impl->interp_client_data;
863
864
8.97k
    gs_free_object(instance->memory, instance, "so_impl_deallocate_interp_instance");
865
866
8.97k
    return 0;
867
8.97k
}
868
869
/* Parser implementation descriptor */
870
/* The initializer is positional, so the entries must stay in the order of
 * the members of pl_interp_implementation_t (see pltop.h). NULL marks a
 * procedure this interpreter does not provide. */
pl_interp_implementation_t txt_implementation =
871
{
872
    txt_impl_characteristics,
873
    txt_impl_allocate_interp_instance,
874
    NULL,                       /* get_device_memory */
875
    NULL,                       /* set_param */
876
    NULL,                       /* add_path */
877
    NULL,                       /* post_args_init */
878
    txt_impl_init_job,
879
    NULL,                       /* run_prefix_commands */
880
    NULL,                       /* txt_impl_process_file, */
881
    txt_impl_process_begin,
882
    txt_impl_process,
883
    txt_impl_process_end,
884
    txt_impl_flush_to_eoj,
885
    txt_impl_process_eof,
886
    txt_impl_report_errors,
887
    txt_impl_dnit_job,
888
    txt_impl_deallocate_interp_instance,
889
    NULL,                       /* presumably interp_client_data, set at allocation time - confirm against pltop.h */
890
};