Coverage Report

Created: 2026-04-01 07:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ghostpdl/gpdl/txttop.c
Line
Count
Source
1
/* Copyright (C) 2026 Artifex Software, Inc.
2
   All Rights Reserved.
3
4
   This software is provided AS-IS with no warranty, either express or
5
   implied.
6
7
   This software is distributed under license and may not be copied,
8
   modified or distributed except as expressly authorized under the terms
9
   of the license contained in the file LICENSE in this distribution.
10
11
   Refer to licensing information at http://www.artifex.com or contact
12
   Artifex Software, Inc.,  39 Mesa Street, Suite 108A, San Francisco,
13
   CA 94129, USA, for further information.
14
*/
15
16
/* Top-level API implementation for text file handling */
17
18
/* Language wrapper implementation (see pltop.h) */
19
20
21
/* Enable the following for a dump of the codepoints to stdout. */
22
/* #define DEBUG_CODEPOINTS */
23
24
/* Enable the following for a hacky dump of the output PCL to file. */
25
/* #define DEBUG_DUMP_PCL */
26
27
#ifdef DEBUG_DUMP_PCL
#include <stdio.h>
#include <stdlib.h> /* atexit */

/* Debug-only dump file that receives a copy of every byte handed to
 * debug_as_pcl. NULL until the first successful open. */
static FILE *debug_pcl_out = NULL;

/* atexit handler: close the dump file if it was ever opened. */
static void wipe(void)
{
        if (debug_pcl_out != NULL)
        {
            fclose(debug_pcl_out);
            debug_pcl_out = NULL;
        }
}

/* Append n bytes starting at p to the file "debug_pcl_out" in the current
 * directory, opening it on first use. This is a best-effort debugging
 * aid: if the file cannot be opened the data is silently dropped rather
 * than crashing the interpreter. */
static void
debug_as_pcl(const char *p, int n)
{
        static int exit_hook_registered = 0;

        if (p == NULL || n <= 0)
            return;

        if (debug_pcl_out == NULL)
        {
            debug_pcl_out = fopen("debug_pcl_out", "wb");
            if (debug_pcl_out == NULL)
                return;
            /* Only register the close handler once, however many times
             * the file ends up being (re)opened. */
            if (!exit_hook_registered)
            {
                atexit(wipe);
                exit_hook_registered = 1;
            }
        }
        fwrite(p, (size_t)n, 1, debug_pcl_out);
}
#endif
46
47
#include "pltop.h"
48
#include "plmain.h"
49
50
#include "plparse.h" /* for e_ExitLanguage */
51
#include "plmain.h"
52
#include "gxdevice.h" /* so we can include gxht.h below */
53
#include "gserrors.h"
54
#include "gp.h"
55
#include "assert_.h"
56
57
/*
58
 * The TXT interpreter is identical to pl_interp_t.
59
 * The TXT interpreter instance is derived from pl_interp_implementation_t.
60
 */
61
62
/* Which input encoding process_block is currently decoding. */
typedef enum
{
    TXT_STATE_INIT       = 0, /* Nothing decided yet; still looking for a BOM. */
    TXT_STATE_UTF8       = 1, /* UTF-8, announced by a BOM. */
    TXT_STATE_UTF8_MAYBE = 2, /* No BOM; treated as UTF-8 until a bad sequence turns up. */
    TXT_STATE_UTF16_LE   = 3, /* UTF-16, little endian. */
    TXT_STATE_UTF16_BE   = 4, /* UTF-16, big endian. */
    TXT_STATE_ASCII      = 5  /* Each input byte is forwarded as one code point. */
} txt_state_t;
71
72
typedef struct txt_interp_instance_s txt_interp_instance_t;
73
74
struct txt_interp_instance_s
75
{
76
    gs_memory_t *memory;                /* memory allocator to use */
77
78
    pl_interp_implementation_t *sub;
79
    gx_device *device;
80
81
    int buffered;
82
    byte buffer[4];
83
84
    int state;
85
    int detected;
86
    int just_had_lf;
87
    int just_had_cr;
88
    int col;
89
    int sent;
90
};
91
92
/* Classifications produced by identify_from_buffer (TXT_UNDETECTED marks
 * "not classified yet"). */
enum
{
    TXT_UNDETECTED = -1,
    TXT_UNKNOWN    = 0,
    TXT_UTF8       = 1,
    TXT_UTF8_MAYBE = 2,
    TXT_UTF16_LE   = 3,
    TXT_UTF16_BE   = 4,
    TXT_ASCII      = 5
};
102
103
static int
104
identify_from_buffer(const unsigned char *s, int len)
105
18.3k
{
106
18.3k
    int count_controls = 0;
107
18.3k
    int count_hi = 0;
108
18.3k
    int count_tabs = 0;
109
18.3k
    int plausibly_utf8 = 1;
110
18.3k
    int i;
111
112
    /* UTF-8 with a BOM */
113
18.3k
    if (len >= 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf)
114
0
        return TXT_UTF8;
115
    /* UTF-16 (little endian) */
116
18.3k
    if (len >= 2 && s[0] == 0xff && s[1] == 0xfe)
117
82
        return TXT_UTF16_LE;
118
    /* UTF-16 (big endian) */
119
18.2k
    if (len >= 2 && s[0] == 0xfe && s[1] == 0xff)
120
131
        return TXT_UTF16_BE;
121
122
    /* Gather some stats. */
123
22.5M
    for (i = 0; i < len; i++)
124
22.5M
    {
125
22.5M
        if (s[i] == 9)
126
87.0k
        {
127
87.0k
            count_tabs++;
128
87.0k
        }
129
22.4M
        else if (s[i] == 12)
130
133k
        {
131
            /* Form feed. We'll let that slide. */
132
133k
        }
133
22.2M
        else if (s[i] == 10)
134
150k
        {
135
150k
           if (i+1 < len && s[i+1] == 13)
136
316
                i++;
137
150k
        }
138
22.1M
        else if (s[i] == 13)
139
126k
        {
140
126k
           if (i+1 < len && s[i+1] == 10)
141
72.4k
                i++;
142
126k
        }
143
22.0M
        else if (s[i] < 32 || s[i] == 0x7f)
144
5.36M
        {
145
5.36M
            count_controls++;
146
5.36M
        }
147
16.6M
        else if (s[i] < 0x7f)
148
11.5M
        {
149
            /* Seems like a reasonable ASCII value. */
150
11.5M
        }
151
5.06M
        else
152
5.06M
        {
153
5.06M
            count_hi++;
154
5.06M
            if ((s[i] & 0xF8) == 0xF0)
155
311k
            {
156
                /* 3 following bytes */
157
311k
                if (i+1 < len && (s[i+1] & 0xC0) != 0x80)
158
302k
                    plausibly_utf8 = 0;
159
9.52k
                else if (i+2 < len && (s[i+2] & 0xC0) != 0x80)
160
6.00k
                    plausibly_utf8 = 0;
161
3.52k
                else if (i+3 < len && (s[i+3] & 0xC0) != 0x80)
162
1.72k
                    plausibly_utf8 = 0;
163
1.79k
                else
164
1.79k
                    i+=3;
165
311k
            }
166
4.75M
            else if ((s[i] & 0xF0) == 0xE0)
167
331k
            {
168
                /* 2 following bytes */
169
331k
                if (i+1 < len && (s[i+1] & 0xC0) != 0x80)
170
305k
                    plausibly_utf8 = 0;
171
25.4k
                else if (i+2 < len && (s[i+2] & 0xC0) != 0x80)
172
16.8k
                    plausibly_utf8 = 0;
173
8.59k
                else
174
8.59k
                    i+=2;
175
331k
            }
176
4.42M
            else if ((s[i] & 0xE0) == 0xC0)
177
1.34M
            {
178
                /* 1 following bytes */
179
1.34M
                if (i+1 < len && (s[i+1] & 0xC0) != 0x80)
180
1.23M
                    plausibly_utf8 = 0;
181
102k
                else
182
102k
                    i++;
183
1.34M
            }
184
3.08M
            else
185
3.08M
                plausibly_utf8 = 0;
186
5.06M
        }
187
22.5M
    }
188
189
    /* Any (non tab/cr/lf/ff) control characters probably means this isn't text. */
190
18.0k
    if (count_controls > 0)
191
16.8k
        return TXT_UNKNOWN;
192
    /* If we've managed to decode all that as utf8 without problem, it's probably text. */
193
1.28k
    if (plausibly_utf8)
194
771
        return TXT_UTF8_MAYBE;
195
    /* If we're hitting too many top bit set chars, give up. */
196
515
    if (count_hi > len/10)
197
183
        return TXT_UNKNOWN;
198
199
332
    return TXT_ASCII;
200
515
}
201
202
static int
203
txt_detect_language(const char *t, int len)
204
17.7k
{
205
17.7k
    const unsigned char *s = (const unsigned char *)t;
206
207
17.7k
    switch (identify_from_buffer(s, len))
208
17.7k
    {
209
0
    case TXT_UTF8:
210
45
    case TXT_UTF16_LE:
211
112
    case TXT_UTF16_BE:
212
        /* PCL spots files with lots of ESCs in them at confidence
213
         * level 80. We'll use 70, cos we don't want to override that. */
214
112
        return 70;
215
447
    case TXT_UTF8_MAYBE:
216
634
    case TXT_ASCII:
217
634
        return 60;
218
0
    default:
219
16.9k
    case TXT_UNKNOWN:
220
16.9k
        break;
221
17.7k
    }
222
223
16.9k
    return 0;
224
17.7k
}
225
226
static const pl_interp_characteristics_t *
227
txt_impl_characteristics(const pl_interp_implementation_t *pimpl)
228
37.5k
{
229
37.5k
    static pl_interp_characteristics_t txt_characteristics =
230
37.5k
    {
231
37.5k
        "TXT",
232
37.5k
        txt_detect_language,
233
37.5k
    };
234
37.5k
    return &txt_characteristics;
235
37.5k
}
236
237
/* Do per-instance interpreter allocation/init. No device is set yet */
238
static int
239
txt_impl_allocate_interp_instance(pl_interp_implementation_t *impl,
240
                                  gs_memory_t *pmem)
241
8.09k
{
242
8.09k
    txt_interp_instance_t *instance;
243
244
8.09k
    instance = (txt_interp_instance_t *) gs_alloc_bytes(pmem,
245
8.09k
            sizeof(txt_interp_instance_t), "txt_impl_allocate_interp_instance");
246
247
8.09k
    if (!instance)
248
0
        return_error(gs_error_VMerror);
249
250
8.09k
    instance->memory = pmem;
251
8.09k
    instance->sub = NULL;
252
253
8.09k
    impl->interp_client_data = instance;
254
255
8.09k
    return 0;
256
8.09k
}
257
258
/* Prepare interp instance for the next "job" */
259
static int
260
txt_impl_init_job(pl_interp_implementation_t *impl,
261
                  gx_device                  *pdevice)
262
570
{
263
570
    txt_interp_instance_t *instance = impl->interp_client_data;
264
265
570
    instance->device = pdevice;
266
570
    instance->state = TXT_STATE_INIT;
267
570
    instance->buffered = 0;
268
570
    instance->detected = TXT_UNDETECTED;
269
570
    instance->just_had_lf = 0;
270
570
    instance->just_had_cr = 0;
271
570
    instance->col = 0;
272
273
570
    instance->sub = pl_main_get_pcl_instance(instance->memory);
274
275
570
    return pl_init_job(instance->sub, instance->device);
276
570
}
277
278
4.47k
#define ESC 0x1b /* ASCII escape; starts each PCL command in send_pcl_init */
279
280
static int
281
send_bytes(txt_interp_instance_t *instance, const byte *p, int n)
282
662k
{
283
662k
    stream_cursor_read cursor;
284
285
#ifdef DEBUG_DUMP_PCL
286
    debug_as_pcl(p, n);
287
#endif
288
289
662k
    stream_cursor_read_init(&cursor, p, n);
290
291
662k
    return instance->sub->proc_process(instance->sub, &cursor);
292
662k
}
293
294
static void
295
drop_buffered(txt_interp_instance_t *instance, int n)
296
657k
{
297
657k
    assert(instance->buffered >= n);
298
657k
    instance->buffered -= n;
299
657k
    if (instance->buffered > 0)
300
705
        memmove(instance->buffer, &instance->buffer[n], instance->buffered);
301
657k
}
302
303
static int
304
send_pcl_init(txt_interp_instance_t *instance)
305
407
{
306
407
    static byte init[] = {
307
407
            ESC, 'E',                     /* Reset */
308
407
            ESC, '&', 'l', '0', 'O',      /* Orientation */
309
407
            ESC, '&', 'k', '1', '0', 'H', /* Horizontal spacing 10/120 of an inch. */
310
407
            ESC, '&', 'l', '8', 'C',      /* Vertical line spacing 8/48 of an inch. */
311
407
            ESC, '&', 't', '8', '3', 'P', /* &t = double byte parsing, 83 = utf-8, P = ? */
312
407
            ESC, '(', '1', '8', 'N',      /* Primary symbol set = 18N = Unicode */
313
407
            ESC, '(', 's', '0', 'P',      /* Fixed pitch */
314
407
            ESC, '(', 's', '1', '2', 'H', /* Secondary fixed pitch 12cpi */
315
407
            ESC, '(', 's', '8', 'V',      /* Point size 8 */
316
407
            ESC, '(', 's', '3', 'T',      /* Typeface number 3 */
317
407
            ESC, '&', 's', '0', 'C'       /* Wrappity wrap wrap */
318
407
    };
319
320
407
    return send_bytes(instance, init, sizeof(init));
321
407
}
322
323
static int
324
send_urc(txt_interp_instance_t *instance, int n)
325
125
{
326
125
    static byte unicode_replacement_char_as_utf8[] = { 0xe3, 0xbf, 0xbd };
327
328
125
    if (instance->state == TXT_STATE_UTF8_MAYBE)
329
20
    {
330
        /* We were guessing that this was UTF8. Now we know it's not. Drop back to ascii. */
331
20
        instance->state = TXT_STATE_ASCII;
332
20
        return 0;
333
20
    }
334
335
105
    drop_buffered(instance, n);
336
337
105
    instance->sent = 1;
338
105
    return send_bytes(instance, unicode_replacement_char_as_utf8, sizeof(unicode_replacement_char_as_utf8));
339
125
}
340
341
static int
342
send_utf8(txt_interp_instance_t *instance, int val)
343
661k
{
344
661k
    byte buf[4];
345
661k
    int n;
346
347
    /* Finally, send the val! */
348
661k
    if (val < 0x80)
349
619k
    {
350
619k
        buf[0] = val;
351
619k
        n = 1;
352
619k
    }
353
42.4k
    else if (val < 0x800)
354
37.3k
    {
355
37.3k
        buf[0] = 0xC0 + (val>>6);
356
37.3k
        buf[1] = 0x80 + (val & 0x3F);
357
37.3k
        n = 2;
358
37.3k
    }
359
5.10k
    else if (val < 0x10000)
360
5.09k
    {
361
5.09k
        buf[0] = 0xE0 + (val>>12);
362
5.09k
        buf[1] = 0x80 + ((val>>6) & 0x3F);
363
5.09k
        buf[2] = 0x80 + (val & 0x3F);
364
5.09k
        n = 3;
365
5.09k
    }
366
4
    else
367
4
    {
368
4
        buf[0] = 0xF0 + (val>>18);
369
4
        buf[1] = 0x80 + ((val>>12) & 0x3F);
370
4
        buf[2] = 0x80 + ((val>>6) & 0x3F);
371
4
        buf[3] = 0x80 + (val & 0x3F);
372
4
        n = 4;
373
4
    }
374
661k
    return send_bytes(instance, buf, n);
375
661k
}
376
377
/* All our actual codepoints should flow through here. So this is where
378
 * we do the housekeeping. */
379
static int
380
send_codepoint(txt_interp_instance_t *instance, int val)
381
657k
{
382
657k
    int code;
383
384
#ifdef DEBUG_CODEPOINTS
385
    dprintf3("Sending codepoint %d (%x) %c\n", val, val, val >= 32 && val <= 255 && val != 127 ? val : '.');
386
#endif
387
388
657k
    instance->sent = 1;
389
    /* Tidy up whatever mess of CR/LF we are passed. */
390
657k
    if (val == '\r')
391
525
    {
392
        /* If we've got a CR and we've just had a LF, swallow this. */
393
525
        if (instance->just_had_lf)
394
0
        {
395
0
            instance->just_had_lf = 0;
396
0
            return 0;
397
0
        }
398
525
        instance->just_had_cr = 1;
399
525
        val = '\n';
400
525
    }
401
656k
    else if (val == '\n')
402
3.69k
    {
403
        /* If we've got a LF and we've just had a CR, swallow this. */
404
3.69k
        if (instance->just_had_cr)
405
157
        {
406
157
            instance->just_had_cr = 0;
407
157
            return 0;
408
157
        }
409
3.53k
        instance->just_had_lf = 1;
410
3.53k
    }
411
652k
    else
412
652k
    {
413
652k
        instance->just_had_cr = 0;
414
652k
        instance->just_had_lf = 0;
415
652k
    }
416
417
    /* Keep track of what column we're at to so we can do tab handling. */
418
656k
    if (val == '\n')
419
4.05k
    {
420
4.05k
        instance->col = 0;
421
4.05k
        code = send_utf8(instance, '\n');
422
4.05k
        if (code < 0)
423
0
            return code;
424
4.05k
        return send_utf8(instance, '\r');
425
4.05k
    }
426
652k
    if (val == '\t')
427
200
    {
428
200
        int spaces = 8 - (instance->col & 7);
429
1.12k
        while (spaces--)
430
924
        {
431
924
            int code = send_utf8(instance, ' ');
432
924
            if (code < 0)
433
0
                return code;
434
924
            instance->col++;
435
924
        }
436
200
        return 0;
437
200
    }
438
652k
    instance->col++;
439
440
#if 0
441
    /* No need for this as PCL line wrapping works for us. If PCL ever
442
     * decides to wrap at a number of columns that aren't a multiple of
443
     * 8 then we'll need to do it manually again!. */
444
    if (instance->col == 80)
445
    {
446
        instance->col = 0;
447
        code = send_utf8(instance, '\n');
448
        if (code < 0)
449
            return code;
450
        return send_utf8(instance, '\r');
451
    }
452
#endif
453
454
652k
    return send_utf8(instance, val);
455
652k
}
456
457
static int
458
process_block(txt_interp_instance_t *instance, const byte *ptr, int n)
459
866
{
460
866
    int code;
461
866
    byte *s = &instance->buffer[0];
462
866
    int old_state = instance->state;
463
866
    int val;
464
465
866
    if (instance->detected == TXT_UNDETECTED)
466
570
    {
467
570
        instance->detected = identify_from_buffer(ptr, n);
468
        /* If we're thinking we're ASCII, go straight there. Otherwise, we'll let the
469
         * BOM detection below run its course. */
470
570
        if (instance->detected == TXT_ASCII)
471
145
            instance->state = TXT_STATE_ASCII;
472
570
    }
473
474
866
    instance->sent = 0;
475
679k
    while (n)
476
679k
    {
477
        /* instance->sent records whether we pulled anything out of the buffer
478
         * last time round the loop. If we changed state, then don't refill the
479
         * buffer. Otherwise only fill the buffer if we didn't a char last time
480
         * (maybe we need char 2 of a 2 char sequence?) or if we haven't got
481
         * anything in the buffer already. */
482
679k
        if (instance->state == old_state && (!instance->sent || instance->buffered == 0))
483
677k
        {
484
677k
            assert(instance->buffered < 4);
485
677k
            s[instance->buffered++] = *ptr++;
486
677k
            n--;
487
677k
        }
488
679k
        old_state = instance->state;
489
490
679k
        instance->sent = 0;
491
679k
        switch (instance->state)
492
679k
        {
493
1.14k
        case TXT_STATE_INIT:
494
495
1.14k
            if (instance->buffered == 3 && s[0] == 0xef && s[1] == 0xbb && s[2] == 0xbf)
496
0
            {
497
0
                instance->state = TXT_STATE_UTF8;
498
0
                drop_buffered(instance, 3);
499
0
            }
500
1.14k
            else if (instance->buffered == 2 && s[0] == 0xff && s[1] == 0xfe)
501
37
            {
502
37
                instance->state = TXT_STATE_UTF16_LE;
503
37
                drop_buffered(instance, 2);
504
37
            }
505
1.10k
            else if (instance->buffered == 2 && s[0] == 0xfe && s[1] == 0xff)
506
64
            {
507
64
                instance->state = TXT_STATE_UTF16_BE;
508
64
                drop_buffered(instance, 2);
509
64
            }
510
1.03k
            else if (instance->buffered >= 3)
511
306
            {
512
                /* We haven't found a BOM, try for utf8. */
513
306
                instance->state = TXT_STATE_UTF8_MAYBE;
514
306
            }
515
516
            /* If we've recognised the BOM, then send the init string. */
517
1.14k
            if (instance->state != TXT_STATE_INIT)
518
407
            {
519
407
                code = send_pcl_init(instance);
520
407
                if (code < 0)
521
0
                    return code;
522
407
            }
523
1.14k
            break;
524
1.14k
        case TXT_STATE_UTF8:
525
308k
        case TXT_STATE_UTF8_MAYBE:
526
308k
            if ((s[0] & 0xF8) == 0xF0)
527
26
            {
528
                /* 3 following bytes */
529
26
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
530
0
                {
531
0
                    code = send_urc(instance, 1);
532
0
                    if (code < 0)
533
0
                        return code;
534
0
                }
535
26
                else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80)
536
0
                {
537
0
                    code = send_urc(instance, 2);
538
0
                    if (code < 0)
539
0
                        return code;
540
0
                }
541
26
                else if (instance->buffered == 4 && (s[3] & 0xC0) != 0x80)
542
0
                {
543
0
                    code = send_urc(instance, 3);
544
0
                    if (code < 0)
545
0
                        return code;
546
0
                }
547
26
                else if (instance->buffered == 4)
548
4
                {
549
                    /* Valid encoding of 4 bytes */
550
4
                    val = ((s[0] & 0x7)<<18) | ((s[1] & 0x3f)<<12) | ((s[2] & 0x3f)<<6) |  (s[3] & 0x3f);
551
4
                    drop_buffered(instance, 4);
552
4
                    code = send_codepoint(instance, val);
553
4
                    if (code < 0)
554
0
                        return code;
555
4
                }
556
22
                else if (instance->buffered != 1 && instance->buffered != 2 && instance->buffered != 3)
557
0
                {
558
                    /* Should never happen. */
559
0
                    return_error(gs_error_Fatal);
560
0
                }
561
26
            }
562
308k
            else if ((s[0] & 0xF0) == 0xE0)
563
21
            {
564
                /* 2 following bytes */
565
21
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
566
1
                {
567
1
                    code = send_urc(instance, 1);
568
1
                    if (code < 0)
569
0
                        return code;
570
1
                }
571
20
                else if (instance->buffered >= 3 && (s[2] & 0xC0) != 0x80)
572
0
                {
573
0
                    code = send_urc(instance, 2);
574
0
                    if (code < 0)
575
0
                        return code;
576
0
                }
577
20
                else if (instance->buffered == 3)
578
4
                {
579
                    /* Valid encoding of 3 bytes */
580
4
                    val = ((s[0] & 0xF)<<12) | ((s[1] & 0x3f)<<6) | (s[2] & 0x3f);
581
4
                    drop_buffered(instance, 3);
582
4
                    code = send_codepoint(instance, val);
583
4
                    if (code < 0)
584
0
                        return code;
585
4
                }
586
16
                else if (instance->buffered != 1 && instance->buffered != 2)
587
0
                {
588
                    /* Should never happen. */
589
0
                    return_error(gs_error_Fatal);
590
0
                }
591
21
            }
592
308k
            else if ((s[0] & 0xE0) == 0xC0)
593
27
            {
594
                /* 1 following bytes */
595
27
                if (instance->buffered >= 2 && (s[1] & 0xC0) != 0x80)
596
4
                {
597
4
                    code = send_urc(instance, 1);
598
4
                    if (code < 0)
599
0
                        return code;
600
4
                }
601
23
                else if (instance->buffered == 2)
602
7
                {
603
                    /* Valid encoding of 2 bytes */
604
7
                    val = ((s[0] & 0x1F)<<6) | (s[1] & 0x3f);
605
7
                    drop_buffered(instance, 2);
606
7
                    code = send_codepoint(instance, val);
607
7
                    if (code < 0)
608
0
                        return code;
609
7
                }
610
16
                else if (instance->buffered != 1)
611
2
                {
612
                    /* Should never happen. */
613
2
                    return_error(gs_error_Fatal);
614
2
                }
615
27
            }
616
308k
            else if ((s[0] & 0xC0) == 0x80)
617
11
            {
618
                /* A continuation byte at the start. Should never see this. */
619
11
                code = send_urc(instance, 1);
620
11
                if (code < 0)
621
0
                    return code;
622
11
            }
623
307k
            else if (s[0] < 0x80)
624
307k
            {
625
                /* Simple byte. */
626
307k
                val = s[0];
627
307k
                drop_buffered(instance, 1);
628
307k
                code = send_codepoint(instance, val);
629
307k
                if (code < 0)
630
0
                    return code;
631
307k
            }
632
4
            else
633
4
            {
634
                /* Bytes we should never see in a UTF-8 file! (0xf8-0xff) */
635
4
                code = send_urc(instance, 1);
636
4
                if (code < 0)
637
0
                    return code;
638
4
            }
639
308k
            break;
640
308k
        case TXT_STATE_UTF16_LE:
641
2.39k
            if (instance->buffered < 2)
642
1.22k
                break;
643
1.17k
            if (s[1] >= 0xD8 && s[1] < 0xDC)
644
0
            {
645
                /* High surrogate */
646
0
                if (instance->buffered < 4)
647
0
                    break;
648
0
                if (s[3] < 0xDC || s[3] > 0xDF)
649
0
                {
650
                    /* Not followed by a low surrogate! Ignore the high surrogate. */
651
0
                    code = send_urc(instance, 2);
652
0
                    if (code < 0)
653
0
                        return code;
654
0
                }
655
0
                val = (((s[0] | (s[1]<<8)) - 0xdc00)<<10) + (s[2] | (s[3]<<8)) - 0xdc00 + 0x10000;
656
0
                drop_buffered(instance, 4);
657
0
            }
658
1.17k
            else
659
1.17k
            {
660
1.17k
                val = s[0] | (s[1]<<8);
661
1.17k
                drop_buffered(instance, 2);
662
1.17k
            }
663
1.17k
            code = send_codepoint(instance, val);
664
1.17k
            if (code < 0)
665
0
                return code;
666
1.17k
            break;
667
38.6k
        case TXT_STATE_UTF16_BE:
668
38.6k
            if (instance->buffered < 2)
669
19.2k
                break;
670
19.4k
            if (s[0] >= 0xD8 && s[0] < 0xDC)
671
315
            {
672
                /* High surrogate */
673
315
                if (instance->buffered < 4)
674
210
                    break;
675
105
                if (s[2] < 0xDC || s[2] > 0xDF)
676
105
                {
677
                    /* Not followed by a low surrogate! Ignore the high surrogate. */
678
105
                    code = send_urc(instance, 2);
679
105
                    if (code < 0)
680
0
                        return code;
681
105
                    break;
682
105
                }
683
0
                val = (((s[1] | (s[0]<<8)) - 0xdc00)<<10) + (s[3] | (s[2]<<8)) - 0xdc00 + 0x10000;
684
0
                drop_buffered(instance, 4);
685
0
            }
686
19.1k
            else
687
19.1k
            {
688
19.1k
                val = s[1] | (s[0]<<8);
689
19.1k
                drop_buffered(instance, 2);
690
19.1k
            }
691
19.1k
            code = send_codepoint(instance, val);
692
19.1k
            if (code < 0)
693
0
                return code;
694
19.1k
            break;
695
328k
        case TXT_STATE_ASCII:
696
657k
            while (instance->buffered > 0)
697
328k
            {
698
328k
                code = send_codepoint(instance, s[0]);
699
328k
                if (code < 0)
700
0
                    return code;
701
328k
                drop_buffered(instance, 1);
702
328k
            }
703
328k
            break;
704
328k
        default:
705
0
            return_error(gs_error_Fatal);
706
679k
        }
707
679k
    }
708
864
    return 0;
709
866
}
710
711
/* Parse an entire random access file.
 * Compiled out: the implementation table passes NULL for this entry. */
#if 0
static int
txt_impl_process_file(pl_interp_implementation_t *impl, const char *filename)
{
    txt_interp_instance_t *txt = impl->interp_client_data;
    gp_file *file = gp_fopen(txt->memory, filename, "rb");
    int code;
    int dnit_code;

    if (file == 0)
        return_error(gs_error_ioerror);

    txt->sub = pl_main_get_pcl_instance(txt->memory);

    code = pl_init_job(txt->sub, txt->device);
    if (code >= 0)
        code = pl_process_file(txt->sub, filename);

    /* The job is wound up whatever happened; the first error wins. */
    dnit_code = pl_dnit_job(txt->sub);
    if (code >= 0)
        code = dnit_code;

    gp_fclose(file);

    return code;
}
#endif
741
742
/* Do any setup for parser per-cursor */
743
static int                      /* ret 0 or +ve if ok, else -ve error code */
744
txt_impl_process_begin(pl_interp_implementation_t * impl)
745
570
{
746
570
    return 0;
747
570
}
748
749
/* Parse a cursor-full of data */
750
static int
751
txt_impl_process(pl_interp_implementation_t *impl, stream_cursor_read *cursor)
752
866
{
753
866
    txt_interp_instance_t *instance = impl->interp_client_data;
754
866
    int avail;
755
866
    int code;
756
757
866
    avail = cursor->limit - cursor->ptr;
758
866
    code = process_block(instance, cursor->ptr + 1, avail);
759
866
    cursor->ptr = cursor->limit;
760
761
866
    return code;
762
866
}
763
764
static int                      /* ret 0 or +ve if ok, else -ve error code */
765
txt_impl_process_end(pl_interp_implementation_t * impl)
766
570
{
767
570
    return 0;
768
570
}
769
770
/* Skip to end of job.
771
 * Return 1 if done, 0 ok but EOJ not found, else negative error code.
772
 */
773
static int
774
txt_impl_flush_to_eoj(pl_interp_implementation_t *impl, stream_cursor_read *pcursor)
775
2
{
776
    /* assume SO files cannot be pjl embedded */
777
2
    pcursor->ptr = pcursor->limit;
778
2
    return 0;
779
2
}
780
781
/* Parser action for end-of-file */
782
static int
783
txt_impl_process_eof(pl_interp_implementation_t *impl)
784
568
{
785
568
    txt_interp_instance_t *instance = impl->interp_client_data;
786
787
568
    if (instance->sub)
788
568
        return pl_process_eof(instance->sub);
789
790
0
    return 0;
791
568
}
792
793
/* Report any errors after running a job */
794
static int
795
txt_impl_report_errors(pl_interp_implementation_t *impl,
796
                       int code,           /* prev termination status */
797
                       long file_position, /* file position of error, -1 if unknown */
798
                       bool force_to_cout  /* force errors to cout */
799
                       )
800
2
{
801
2
    txt_interp_instance_t *instance = impl->interp_client_data;
802
2
    int ret = 0;
803
804
2
    if (instance->sub)
805
2
        ret = pl_report_errors(instance->sub, code, file_position, force_to_cout);
806
807
2
    return ret;
808
2
}
809
810
/* Wrap up interp instance after a "job" */
811
static int
812
txt_impl_dnit_job(pl_interp_implementation_t *impl)
813
570
{
814
570
    txt_interp_instance_t *instance = impl->interp_client_data;
815
570
    int code = 0;
816
817
570
    if (instance->sub)
818
570
        code = pl_dnit_job(instance->sub);
819
570
    instance->sub = NULL;
820
570
    instance->device = NULL;
821
822
570
    return code;
823
570
}
824
825
/* Deallocate a interpreter instance */
826
static int
827
txt_impl_deallocate_interp_instance(pl_interp_implementation_t *impl)
828
8.09k
{
829
8.09k
    txt_interp_instance_t *instance = impl->interp_client_data;
830
831
8.09k
    gs_free_object(instance->memory, instance, "so_impl_deallocate_interp_instance");
832
833
8.09k
    return 0;
834
8.09k
}
835
836
/* Parser implementation descriptor */
837
pl_interp_implementation_t txt_implementation =
838
{
839
    txt_impl_characteristics,
840
    txt_impl_allocate_interp_instance,
841
    NULL,                       /* get_device_memory */
842
    NULL,                       /* set_param */
843
    NULL,                       /* add_path */
844
    NULL,                       /* post_args_init */
845
    txt_impl_init_job,
846
    NULL,                       /* run_prefix_commands */
847
    NULL,                       /* txt_impl_process_file, */
848
    txt_impl_process_begin,
849
    txt_impl_process,
850
    txt_impl_process_end,
851
    txt_impl_flush_to_eoj,
852
    txt_impl_process_eof,
853
    txt_impl_report_errors,
854
    txt_impl_dnit_job,
855
    txt_impl_deallocate_interp_instance,
856
    NULL,
857
};