Coverage Report

Created: 2025-11-09 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/streamio.c
Line
Count
Source
1
/* streamio.c -- handles character stream I/O
2
3
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
  Wrapper around Tidy input source and output sink
7
  that calls appropriate interfaces, and applies
8
  necessary char encoding transformations: to/from
9
  ISO-10646 and/or UTF-8.
10
11
*/
12
13
#include <stdio.h>
14
#include <errno.h>
15
16
#include "streamio.h"
17
#include "tidy-int.h"
18
#include "lexer.h"
19
#include "message.h"
20
#include "utf8.h"
21
#include "tmbstr.h"
22
23
24
/************************
25
** Forward Declarations
26
************************/
27
28
static uint ReadCharFromStream( StreamIn* in );
29
30
static uint ReadByte( StreamIn* in );
31
static void UngetByte( StreamIn* in, uint byteValue );
32
33
static void PutByte( uint byteValue, StreamOut* out );
34
35
static void EncodeWin1252( uint c, StreamOut* out );
36
static void EncodeMacRoman( uint c, StreamOut* out );
37
static void EncodeIbm858( uint c, StreamOut* out );
38
static void EncodeLatin0( uint c, StreamOut* out );
39
40
static uint DecodeIbm850(uint c);
41
static uint DecodeLatin0(uint c);
42
43
static uint PopChar( StreamIn *in );
44
45
/******************************
46
** Static (duration) Globals
47
******************************/
48
49
static StreamOut stderrStreamOut = 
50
{
51
    ASCII,
52
    FSM_ASCII,
53
    DEFAULT_NL_CONFIG,
54
    FileIO,
55
    { 0, TY_(filesink_putByte) }
56
};
57
58
static StreamOut stdoutStreamOut = 
59
{
60
    ASCII,
61
    FSM_ASCII,
62
    DEFAULT_NL_CONFIG,
63
    FileIO,
64
    { 0, TY_(filesink_putByte) }
65
};
66
67
StreamOut* TY_(StdErrOutput)(void)
68
64
{
69
64
  if ( stderrStreamOut.sink.sinkData == 0 )
70
1
      stderrStreamOut.sink.sinkData = stderr;
71
64
  return &stderrStreamOut;
72
64
}
73
74
void  TY_(ReleaseStreamOut)( TidyDocImpl *doc,  StreamOut* out )
75
128
{
76
128
    if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
77
64
    {
78
64
        if ( out->iotype == FileIO )
79
0
            fclose( (FILE*) out->sink.sinkData );
80
64
        TidyDocFree( doc, out );
81
64
    }
82
128
}
83
84
/************************
85
** Source
86
************************/
87
88
static void InitLastPos( StreamIn *in );
89
90
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
91
64
{
92
64
    StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
93
94
64
    TidyClearMemory( in, sizeof(StreamIn) );
95
64
    in->curline = 1;
96
64
    in->curcol = 1;
97
64
    in->encoding = encoding;
98
64
    in->state = FSM_ASCII;
99
64
    in->doc = doc;
100
64
    in->bufsize = CHARBUF_SIZE;
101
64
    in->allocator = doc->allocator;
102
64
    in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
103
64
    InitLastPos( in );
104
64
    return in;
105
64
}
106
107
void TY_(freeStreamIn)(StreamIn* in)
108
64
{
109
64
    TidyFree(in->allocator, in->charbuf);
110
64
    TidyFree(in->allocator, in);
111
64
}
112
113
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
114
0
{
115
0
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
116
0
    if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
117
0
    {
118
0
        TY_(freeStreamIn)( in );
119
0
        return NULL;
120
0
    }
121
0
    in->iotype = FileIO;
122
0
    return in;
123
0
}
124
125
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
126
64
{
127
64
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
128
64
    tidyInitInputBuffer( &in->source, buf );
129
64
    in->iotype = BufferIO;
130
64
    return in;
131
64
}
132
133
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
134
0
{
135
0
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
136
0
    memcpy( &in->source, source, sizeof(TidyInputSource) );
137
0
    in->iotype = UserIO;
138
0
    return in;
139
0
}
140
141
int TY_(ReadBOMEncoding)(StreamIn *in)
142
64
{
143
64
    uint c, c1;
144
64
    uint bom;
145
146
64
    c = ReadByte(in);
147
64
    if (c == EndOfStream)
148
0
        return -1;
149
150
64
    c1 = ReadByte( in );
151
64
    if (c1 == EndOfStream)
152
0
    {
153
0
        UngetByte(in, c);
154
0
        return -1;
155
0
    }
156
157
    /* todo: dont warn about mismatch for auto input encoding */
158
    /* todo: let the user override the encoding found here */
159
160
64
    bom = (c << 8) + c1;
161
162
64
    if ( bom == UNICODE_BOM_BE )
163
0
    {
164
        /* big-endian UTF-16 */
165
0
        if ( in->encoding != UTF16 && in->encoding != UTF16BE )
166
0
            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
167
168
0
        return UTF16BE; /* return decoded BOM */
169
0
    }
170
64
    else if (bom == UNICODE_BOM_LE)
171
1
    {
172
        /* little-endian UTF-16 */
173
1
        if (in->encoding != UTF16 && in->encoding != UTF16LE)
174
1
            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
175
176
1
        return UTF16LE; /* return decoded BOM */
177
1
    }
178
63
    else
179
63
    {
180
63
        uint c2 = ReadByte(in);
181
182
63
        if (c2 == EndOfStream)
183
0
        {
184
0
            UngetByte(in, c1);
185
0
            UngetByte(in, c);
186
0
            return -1;
187
0
        }
188
189
63
        if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
190
0
        {
191
            /* UTF-8 */
192
0
            if (in->encoding != UTF8)
193
0
                TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
194
195
0
            return UTF8;
196
0
        }
197
63
        else
198
63
            UngetByte( in, c2 );
199
63
    }
200
201
63
    UngetByte(in, c1);
202
63
    UngetByte(in, c);
203
204
63
    return -1;
205
64
}
206
207
static void InitLastPos( StreamIn *in )
208
64
{
209
64
    in->curlastpos = 0;
210
64
    in->firstlastpos = 0;
211
64
}
212
213
static void PopLastPos( StreamIn *in )
214
140M
{
215
140M
    in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
216
140M
    if ( in->curlastpos == in->firstlastpos )
217
139M
        in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
218
140M
}
219
220
static void SaveLastPos( StreamIn *in )
221
139M
{
222
139M
    PopLastPos( in );
223
139M
    in->lastcols[in->curlastpos] = in->curcol;
224
139M
}
225
226
static void RestoreLastPos( StreamIn *in )
227
1.03M
{
228
1.03M
    if ( in->firstlastpos == in->curlastpos )
229
0
        in->curcol = 0;
230
1.03M
    else
231
1.03M
    {
232
1.03M
        in->curcol = in->lastcols[in->curlastpos];
233
1.03M
        if ( in->curlastpos == 0 )
234
16.0k
            in->curlastpos = LASTPOS_SIZE;
235
1.03M
        in->curlastpos--;
236
1.03M
    }
237
1.03M
}
238
239
uint TY_(ReadChar)( StreamIn *in )
240
140M
{
241
140M
    uint c = EndOfStream;
242
243
140M
    if ( in->pushed )
244
1.03M
        return PopChar( in );
245
246
139M
    SaveLastPos( in );
247
248
139M
    if ( in->tabs > 0 )
249
96.0M
    {
250
96.0M
        in->curcol++;
251
96.0M
        in->tabs--;
252
96.0M
        return ' ';
253
96.0M
    }
254
    
255
42.9M
    for (;;)
256
46.5M
    {
257
46.5M
        c = ReadCharFromStream(in);
258
259
46.5M
        if ( EndOfStream == c )
260
1.59k
            return EndOfStream;
261
262
46.5M
        if (c == '\n')
263
1.18M
        {
264
1.18M
            in->curcol = 1;
265
1.18M
            in->curline++;
266
1.18M
            break;
267
1.18M
        }
268
269
45.3M
        if (c == '\t')
270
13.7M
        {
271
13.7M
            Bool keeptabs = cfg( in->doc, TidyKeepTabs );
272
13.7M
            if (!keeptabs) {
273
13.7M
                uint tabsize = cfg(in->doc, TidyTabSize);
274
13.7M
                in->tabs = tabsize > 0 ?
275
13.7M
                    tabsize - ((in->curcol - 1) % tabsize) - 1
276
13.7M
                    : 0;
277
13.7M
                c = ' ';
278
13.7M
            }
279
13.7M
            in->curcol++;
280
13.7M
            break;
281
13.7M
        }
282
283
        /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
284
31.6M
        if (c == '\r')
285
13.3k
        {
286
13.3k
            c = ReadCharFromStream(in);
287
13.3k
            if (c != '\n')
288
13.1k
            {
289
13.1k
                TY_(UngetChar)( c, in );
290
13.1k
                c = '\n';
291
13.1k
            }
292
222
            else
293
222
            {
294
222
            }
295
13.3k
            in->curcol = 1;
296
13.3k
            in->curline++;
297
13.3k
            break;
298
13.3k
        }
299
300
31.6M
#ifndef NO_NATIVE_ISO2022_SUPPORT
301
        /* strip control characters, except for Esc */
302
31.6M
        if (c == '\033')
303
294
            break;
304
31.6M
#endif
305
306
        /* Form Feed is allowed in HTML */
307
31.6M
        if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
308
0
            break;
309
            
310
31.6M
        if ( c < 32 )
311
3.59M
            continue; /* discard control char */
312
313
        /* watch out for chars that have already been decoded such as */
314
        /* IS02022, UTF-8 etc, that don't require further decoding */
315
316
28.0M
        if (
317
28.0M
            in->encoding == RAW
318
28.0M
#ifndef NO_NATIVE_ISO2022_SUPPORT
319
28.0M
         || in->encoding == ISO2022
320
28.0M
#endif
321
28.0M
         || in->encoding == UTF8
322
286k
         || in->encoding == SHIFTJIS /* #431953 - RJ */
323
286k
         || in->encoding == BIG5     /* #431953 - RJ */
324
28.0M
           )
325
27.7M
        {
326
27.7M
            in->curcol++;
327
27.7M
            break;
328
27.7M
        }
329
330
        /* handle surrogate pairs */
331
286k
        if ( in->encoding == UTF16LE ||
332
0
             in->encoding == UTF16   ||
333
0
             in->encoding == UTF16BE )
334
286k
        {
335
286k
            if ( !TY_(IsValidUTF16FromUCS4)(c) )
336
0
            {
337
                /* invalid UTF-16 value */
338
0
                TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
339
0
                c = 0;
340
0
            }
341
286k
            else if ( TY_(IsLowSurrogate)(c) )
342
0
            {
343
0
                uint n = c;
344
0
                uint m = ReadCharFromStream( in );
345
0
                if ( m == EndOfStream )
346
0
                   return EndOfStream;
347
348
0
                c = 0;
349
0
                if ( TY_(IsHighSurrogate)(m) )
350
0
                {
351
0
                    n = TY_(CombineSurrogatePair)( m, n );
352
0
                    if ( TY_(IsValidCombinedChar)(n) )
353
0
                        c = n;
354
0
                }
355
                /* not a valid pair */
356
0
                if ( 0 == c )
357
0
                    TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
358
0
            }
359
286k
        }
360
361
        /* Do first: acts on range 128 - 255 */
362
286k
        switch ( in->encoding )
363
286k
        {
364
0
        case MACROMAN:
365
0
            c = TY_(DecodeMacRoman)( c );
366
0
            break;
367
0
        case IBM858:
368
0
            c = DecodeIbm850( c );
369
0
            break;
370
0
        case LATIN0:
371
0
            c = DecodeLatin0( c );
372
0
            break;
373
286k
        }
374
375
        /* produced e.g. as a side-effect of smart quotes in Word */
376
        /* but can't happen if using MACROMAN encoding */
377
286k
        if ( 127 < c && c < 160 )
378
0
        {
379
0
            uint c1 = 0, replMode = DISCARDED_CHAR;
380
0
            Bool isVendorChar = ( in->encoding == WIN1252 ||
381
0
                                  in->encoding == MACROMAN );
382
0
            Bool isMacChar    = ( in->encoding == MACROMAN );
383
            
384
            /* set error position just before offending character */
385
0
            if (in->doc->lexer)
386
0
            {
387
0
                in->doc->lexer->lines = in->curline;
388
0
                in->doc->lexer->columns = in->curcol;
389
0
            }
390
                
391
0
            if ( isMacChar )
392
0
                c1 = TY_(DecodeMacRoman)( c );
393
0
            else
394
0
                c1 = TY_(DecodeWin1252)( c );
395
0
            if ( c1 )
396
0
                replMode = REPLACED_CHAR;
397
                
398
0
            if ( c1 == 0 && isVendorChar )
399
0
                TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
400
0
            else if ( ! isVendorChar )
401
0
                TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
402
                
403
0
            c = c1;
404
0
        }
405
406
286k
        if ( c == 0 )
407
0
            continue; /* illegal char is discarded */
408
        
409
286k
        in->curcol++;
410
286k
        break;
411
286k
    }
412
413
42.9M
    return c;
414
42.9M
}
415
416
static uint PopChar( StreamIn *in )
417
1.03M
{
418
1.03M
    uint c = EndOfStream;
419
1.03M
    if ( in->pushed )
420
1.03M
    {
421
1.03M
        assert( in->bufpos > 0 );
422
1.03M
        c = in->charbuf[ --in->bufpos ];
423
1.03M
        if ( in->bufpos == 0 )
424
852k
            in->pushed = no;
425
426
1.03M
        if ( c == '\n' )
427
1.48k
        {
428
1.48k
            in->curcol = 1;
429
1.48k
            in->curline++;
430
1.48k
            PopLastPos( in );
431
1.48k
            return c;
432
1.48k
        }
433
1.03M
        in->curcol++;
434
1.03M
        PopLastPos( in );
435
1.03M
    }
436
1.03M
    return c;
437
1.03M
}
438
439
void TY_(UngetChar)( uint c, StreamIn *in )
440
1.03M
{
441
1.03M
    if (c == EndOfStream)
442
41
    {
443
        /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
444
41
        return;
445
41
    }
446
    
447
1.03M
    in->pushed = yes;
448
449
1.03M
    if (in->bufpos + 1 >= in->bufsize)
450
35
        in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
451
452
1.03M
    in->charbuf[(in->bufpos)++] = c;
453
454
1.03M
    if (c == '\n')
455
1.48k
        --(in->curline);
456
457
1.03M
    RestoreLastPos( in );
458
1.03M
}
459
460
461
462
/************************
463
** Sink
464
************************/
465
466
static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
467
128
{
468
128
    StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
469
128
    TidyClearMemory( out, sizeof(StreamOut) );
470
128
    out->encoding = encoding;
471
128
    out->state = FSM_ASCII;
472
128
    out->nl = nl;
473
128
    return out;
474
128
}
475
476
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
477
0
{
478
0
    StreamOut* out = initStreamOut( doc, encoding, nl );
479
0
    TY_(initFileSink)( &out->sink, fp );
480
0
    out->iotype = FileIO;
481
0
    return out;
482
0
}
483
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
484
128
{
485
128
    StreamOut* out = initStreamOut( doc, encoding, nl );
486
128
    tidyInitOutputBuffer( &out->sink, buf );
487
128
    out->iotype = BufferIO;
488
128
    return out;
489
128
}
490
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
491
0
{
492
0
    StreamOut* out = initStreamOut( doc, encoding, nl );
493
0
    memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
494
0
    out->iotype = UserIO;
495
0
    return out;
496
0
}
497
498
void TY_(WriteChar)( uint c, StreamOut* out )
499
107M
{
500
    /* Translate outgoing newlines */
501
107M
    if ( LF == c )
502
683k
    {
503
683k
      if ( out->nl == TidyCRLF )
504
0
          TY_(WriteChar)( CR, out );
505
683k
      else if ( out->nl == TidyCR )
506
0
          c = CR;
507
683k
    }
508
509
107M
    if (out->encoding == MACROMAN)
510
0
    {
511
0
        EncodeMacRoman( c, out );
512
0
    }
513
107M
    else if (out->encoding == WIN1252)
514
0
    {
515
0
        EncodeWin1252( c, out );
516
0
    }
517
107M
    else if (out->encoding == IBM858)
518
0
    {
519
0
        EncodeIbm858( c, out );
520
0
    }
521
107M
    else if (out->encoding == LATIN0)
522
0
    {
523
0
        EncodeLatin0( c, out );
524
0
    }
525
526
107M
    else if (out->encoding == UTF8)
527
107M
    {
528
107M
        int count = 0;
529
        
530
107M
        TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
531
107M
        if (count <= 0)
532
0
        {
533
            /* replacement char 0xFFFD encoded as UTF-8 */
534
0
            PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
535
0
        }
536
107M
    }
537
0
#ifndef NO_NATIVE_ISO2022_SUPPORT
538
0
    else if (out->encoding == ISO2022)
539
0
    {
540
0
        if (c == 0x1b)  /* ESC */
541
0
            out->state = FSM_ESC;
542
0
        else
543
0
        {
544
0
            switch (out->state)
545
0
            {
546
0
            case FSM_ESC:
547
0
                if (c == '$')
548
0
                    out->state = FSM_ESCD;
549
0
                else if (c == '(')
550
0
                    out->state = FSM_ESCP;
551
0
                else
552
0
                    out->state = FSM_ASCII;
553
0
                break;
554
555
0
            case FSM_ESCD:
556
0
                if (c == '(')
557
0
                    out->state = FSM_ESCDP;
558
0
                else
559
0
                    out->state = FSM_NONASCII;
560
0
                break;
561
562
0
            case FSM_ESCDP:
563
0
                out->state = FSM_NONASCII;
564
0
                break;
565
566
0
            case FSM_ESCP:
567
0
                out->state = FSM_ASCII;
568
0
                break;
569
570
0
            case FSM_NONASCII:
571
0
                c &= 0x7F;
572
0
                break;
573
574
0
            case FSM_ASCII:
575
0
                break;
576
0
            }
577
0
        }
578
579
0
        PutByte(c, out);
580
0
    }
581
0
#endif /* NO_NATIVE_ISO2022_SUPPORT */
582
583
0
    else if ( out->encoding == UTF16LE ||
584
0
              out->encoding == UTF16BE ||
585
0
              out->encoding == UTF16 )
586
0
    {
587
0
        int i, numChars = 1;
588
0
        uint theChars[2];
589
        
590
0
        if ( !TY_(IsValidUTF16FromUCS4)(c) )
591
0
        {
592
            /* invalid UTF-16 value */
593
0
            numChars = 0;
594
0
        }
595
0
        else if ( TY_(IsCombinedChar)(c) )
596
0
        {
597
            /* output both, unless something goes wrong */
598
0
            numChars = 2;
599
0
            if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
600
0
            {
601
0
                numChars = 0;
602
0
            }
603
0
        }
604
0
        else
605
0
        {
606
            /* just put the char out */
607
0
            theChars[0] = c;
608
0
        }
609
        
610
0
        for (i = 0; i < numChars; i++)
611
0
        {
612
0
            c = theChars[i];
613
            
614
0
            if (out->encoding == UTF16LE)
615
0
            {
616
0
                uint ch = c & 0xFF; PutByte(ch, out); 
617
0
                ch = (c >> 8) & 0xFF; PutByte(ch, out); 
618
0
            }
619
    
620
0
            else if (out->encoding == UTF16BE || out->encoding == UTF16)
621
0
            {
622
0
                uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
623
0
                ch = c & 0xFF; PutByte(ch, out); 
624
0
            }
625
0
        }
626
0
    }
627
0
    else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
628
0
    {
629
0
        if (c < 128)
630
0
            PutByte(c, out);
631
0
        else
632
0
        {
633
0
            uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
634
0
            ch = c & 0xFF; PutByte(ch, out); 
635
0
        }
636
0
    }
637
0
    else
638
0
        PutByte( c, out );
639
107M
}
640
641
642
643
/****************************
644
** Miscellaneous / Helpers
645
****************************/
646
647
/* Mapping for Windows Western character set CP 1252
648
** (chars 128-159/U+0080-U+009F) to Unicode.
649
*/
650
static const uint Win2Unicode[32] =
651
{
652
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
653
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
654
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
655
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
656
};
657
658
/* Function for conversion from Windows-1252 to Unicode */
659
uint TY_(DecodeWin1252)(uint c)
660
0
{
661
0
    if (127 < c && c < 160)
662
0
        c = Win2Unicode[c - 128];
663
        
664
0
    return c;
665
0
}
666
667
static void EncodeWin1252( uint c, StreamOut* out )
668
0
{
669
0
    if (c < 128 || (c > 159 && c < 256))
670
0
        PutByte(c, out);
671
0
    else
672
0
    {
673
0
        int i;
674
675
0
        for (i = 128; i < 160; i++)
676
0
            if (Win2Unicode[i - 128] == c)
677
0
            {
678
0
                PutByte(i, out);
679
0
                break;
680
0
            }
681
0
    }
682
0
}
683
684
/*
685
   John Love-Jensen contributed this table for mapping MacRoman
686
   character set to Unicode
687
*/
688
689
/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
690
static const uint Mac2Unicode[128] = 
691
{
692
    /* x7F = DEL */
693
    
694
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
695
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
696
697
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
698
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
699
700
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
701
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
702
703
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
704
                                            /* =BD U+2126 OHM SIGN */
705
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
706
707
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
708
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
709
710
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
711
                            /* =DB U+00A4 CURRENCY SIGN */
712
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
713
714
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
715
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
716
    /* xF0 = Apple Logo */
717
    /* =F0 U+2665 BLACK HEART SUIT */
718
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
719
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
720
};
721
722
/* Function to convert from MacRoman to Unicode */
723
uint TY_(DecodeMacRoman)(uint c)
724
0
{
725
0
    if (127 < c && c < 256) /* Is. #891 */
726
0
        c = Mac2Unicode[c - 128];
727
0
    return c;
728
0
}
729
730
static void EncodeMacRoman( uint c, StreamOut* out )
731
0
{
732
0
        if (c < 128)
733
0
            PutByte(c, out);
734
0
        else
735
0
        {
736
            /* For mac users, map Unicode back to MacRoman. */
737
0
            int i;
738
0
            for (i = 128; i < 256; i++)
739
0
            {
740
0
                if (Mac2Unicode[i - 128] == c)
741
0
                {
742
0
                    PutByte(i, out);
743
0
                    break;
744
0
                }
745
0
            }
746
0
        }
747
0
}
748
749
/* Mapping for OS/2 Western character set CP 850
750
** (chars 128-255) to Unicode.
751
*/
752
static const uint IBM2Unicode[128] =
753
{
754
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
755
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
756
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
757
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
758
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
759
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
760
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
761
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
762
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
763
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
764
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
765
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
766
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
767
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
768
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
769
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
770
};
771
772
/* Function for conversion from OS/2-850 to Unicode */
773
static uint DecodeIbm850(uint c)
774
0
{
775
0
    if (127 < c && c < 256)
776
0
        c = IBM2Unicode[c - 128];
777
778
0
    return c;
779
0
}
780
781
/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
782
static void EncodeIbm858( uint c, StreamOut* out )
783
0
{
784
0
    if (c < 128)
785
0
        PutByte(c, out);
786
0
    else
787
0
    {
788
0
        int i;
789
0
        for (i = 128; i < 256; i++)
790
0
        {
791
0
            if (IBM2Unicode[i - 128] == c)
792
0
            {
793
0
                PutByte(i, out);
794
0
                break;
795
0
            }
796
0
        }
797
0
    }
798
0
}
799
800
801
/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
802
static uint DecodeLatin0(uint c)
803
0
{
804
0
    if (163 < c && c < 191)
805
0
    {
806
0
        switch (c)
807
0
        {
808
0
        case 0xA4: c = 0x20AC; break;
809
0
        case 0xA6: c = 0x0160; break;
810
0
        case 0xA8: c = 0x0161; break;
811
0
        case 0xB4: c = 0x017D; break;
812
0
        case 0xB8: c = 0x017E; break;
813
0
        case 0xBC: c = 0x0152; break;
814
0
        case 0xBD: c = 0x0153; break;
815
0
        case 0xBE: c = 0x0178; break;
816
0
        }
817
0
    }
818
0
    return c;
819
0
}
820
821
/* Map Unicode back to ISO-8859-15. */
822
static void EncodeLatin0( uint c, StreamOut* out )
823
0
{
824
0
    switch (c)
825
0
    {
826
0
    case 0x20AC: c = 0xA4; break;
827
0
    case 0x0160: c = 0xA6; break;
828
0
    case 0x0161: c = 0xA8; break;
829
0
    case 0x017D: c = 0xB4; break;
830
0
    case 0x017E: c = 0xB8; break;
831
0
    case 0x0152: c = 0xBC; break;
832
0
    case 0x0153: c = 0xBD; break;
833
0
    case 0x0178: c = 0xBE; break;
834
0
    }
835
0
    PutByte(c, out);
836
0
}
837
838
/* Facilitates user defined source by providing
839
** an entry point to marshal pointers-to-functions.
840
** Needed by .NET and possibly other language bindings.
841
*/
842
Bool TIDY_CALL tidyInitSource( TidyInputSource*  source,
843
                               void*             srcData,
844
                               TidyGetByteFunc   gbFunc,
845
                               TidyUngetByteFunc ugbFunc,
846
                               TidyEOFFunc       endFunc )
847
0
{
848
0
  Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
849
850
0
  if ( status )
851
0
  {
852
0
    source->sourceData = srcData;
853
0
    source->getByte    = gbFunc;
854
0
    source->ungetByte  = ugbFunc;
855
0
    source->eof        = endFunc;
856
0
  }
857
858
0
  return status;
859
0
}
860
861
Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
862
                             void*           snkData,
863
                             TidyPutByteFunc pbFunc )
864
0
{
865
0
  Bool status = ( sink && snkData && pbFunc );
866
0
  if ( status )
867
0
  {
868
0
    sink->sinkData = snkData;
869
0
    sink->putByte  = pbFunc;
870
0
  }
871
0
  return status;
872
0
}
873
874
/* GetByte must return a byte value in a signed
875
** integer so that a negative value can signal EOF
876
** without interfering w/ 0-255 legitimate byte values.
877
*/
878
uint TIDY_CALL tidyGetByte( TidyInputSource* source )
879
46.8M
{
880
46.8M
  int bv = source->getByte( source->sourceData );
881
46.8M
  return (uint) bv;
882
46.8M
}
883
Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
884
46.6M
{
885
46.6M
  return source->eof( source->sourceData );
886
46.6M
}
887
void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
888
189
{
889
189
    source->ungetByte( source->sourceData, (byte) ch );
890
189
}
891
void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
892
0
{
893
0
    sink->putByte( sink->sinkData, (byte) ch );
894
0
}
895
896
static uint ReadByte( StreamIn* in )
897
46.8M
{
898
46.8M
    return tidyGetByte( &in->source );
899
46.8M
}
900
Bool TY_(IsEOF)( StreamIn* in )
901
46.6M
{
902
46.6M
    return tidyIsEOF( &in->source );
903
46.6M
}
904
static void UngetByte( StreamIn* in, uint byteValue )
905
189
{
906
189
    tidyUngetByte( &in->source, byteValue );
907
189
}
908
static void PutByte( uint byteValue, StreamOut* out )
909
0
{
910
0
    tidyPutByte( &out->sink, byteValue );
911
0
}
912
913
/* read char from stream */
914
static uint ReadCharFromStream( StreamIn* in )
915
46.5M
{
916
46.5M
    uint c, n;
917
918
46.5M
    if ( TY_(IsEOF)(in) )
919
1.58k
        return EndOfStream;
920
    
921
46.5M
    c = ReadByte( in );
922
923
46.5M
    if (c == EndOfStream)
924
0
        return c;
925
926
46.5M
#ifndef NO_NATIVE_ISO2022_SUPPORT
927
    /*
928
       A document in ISO-2022 based encoding uses some ESC sequences
929
       called "designator" to switch character sets. The designators
930
       defined and used in ISO-2022-JP are:
931
932
        "ESC" + "(" + ?     for ISO646 variants
933
934
        "ESC" + "$" + ?     and
935
        "ESC" + "$" + "(" + ?   for multibyte character sets
936
937
       Where ? stands for a single character used to indicate the
938
       character set for multibyte characters.
939
940
       Tidy handles this by preserving the escape sequence and
941
       setting the top bit of each byte for non-ascii chars. This
942
       bit is then cleared on output. The input stream keeps track
943
       of the state to determine when to set/clear the bit.
944
    */
945
946
46.5M
    if (in->encoding == ISO2022)
947
0
    {
948
0
        if (c == 0x1b)  /* ESC */
949
0
        {
950
0
            in->state = FSM_ESC;
951
0
            return c;
952
0
        }
953
954
0
        switch (in->state)
955
0
        {
956
0
        case FSM_ESC:
957
0
            if (c == '$')
958
0
                in->state = FSM_ESCD;
959
0
            else if (c == '(')
960
0
                in->state = FSM_ESCP;
961
0
            else
962
0
                in->state = FSM_ASCII;
963
0
            break;
964
965
0
        case FSM_ESCD:
966
0
            if (c == '(')
967
0
                in->state = FSM_ESCDP;
968
0
            else
969
0
                in->state = FSM_NONASCII;
970
0
            break;
971
972
0
        case FSM_ESCDP:
973
0
            in->state = FSM_NONASCII;
974
0
            break;
975
976
0
        case FSM_ESCP:
977
0
            in->state = FSM_ASCII;
978
0
            break;
979
980
0
        case FSM_NONASCII:
981
0
            c |= 0x80;
982
0
            break;
983
984
0
        case FSM_ASCII:
985
0
            break;
986
0
        }
987
988
0
        return c;
989
0
    }
990
46.5M
#endif /* NO_NATIVE_ISO2022_SUPPORT */
991
992
46.5M
    if ( in->encoding == UTF16LE )
993
286k
    {
994
286k
        uint c1 = ReadByte( in );
995
286k
        if ( EndOfStream == c1 )
996
1
            return EndOfStream;
997
286k
        n = (c1 << 8) + c;
998
286k
        return n;
999
286k
    }
1000
1001
46.3M
    if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1002
0
    {
1003
0
        uint c1 = ReadByte( in );
1004
0
        if ( EndOfStream == c1 )
1005
0
            return EndOfStream;
1006
0
        n = (c << 8) + c1;
1007
0
        return n;
1008
0
    }
1009
1010
46.3M
    if ( in->encoding == UTF8 )
1011
46.3M
    {
1012
        /* deal with UTF-8 encoded char */
1013
1014
46.3M
        int err, count = 0;
1015
        
1016
        /* first byte "c" is passed in separately */
1017
46.3M
        err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1018
46.3M
        if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1019
0
            return EndOfStream;
1020
46.3M
        else if (err)
1021
932k
        {
1022
            /* set error position just before offending character */
1023
932k
            in->doc->lexer->lines = in->curline;
1024
932k
            in->doc->lexer->columns = in->curcol;
1025
1026
932k
            TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1027
932k
            n = 0xFFFD; /* replacement char */
1028
932k
        }
1029
        
1030
46.3M
        return n;
1031
46.3M
    }
1032
    
1033
    /*
1034
       This section is suitable for any "multibyte" variable-width 
1035
       character encoding in which a one-byte code is less than
1036
       128, and the first byte of a two-byte code is greater or
1037
       equal to 128. Note that Big5 and ShiftJIS fit into this
1038
       kind, even though their second byte may be less than 128
1039
    */
1040
0
    if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1041
0
    {
1042
0
        if (c < 128)
1043
0
            return c;
1044
0
        else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1045
0
        {
1046
            /*
1047
              Rick Cameron pointed out that for Shift_JIS, the values from
1048
              0xa1 through 0xdf represent singe-byte characters
1049
              (U+FF61 to U+FF9F - half-shift Katakana)
1050
            */
1051
0
            return c;
1052
0
        }
1053
0
        else
1054
0
        {
1055
0
            uint c1 = ReadByte( in );
1056
0
            if ( EndOfStream == c1 )
1057
0
                return EndOfStream;
1058
0
            n = (c << 8) + c1;
1059
0
            return n;
1060
0
        }
1061
0
    }
1062
0
    else
1063
0
        n = c;
1064
        
1065
0
    return n;
1066
0
}
1067
1068
/* Output a Byte Order Mark if required */
1069
void TY_(outBOM)( StreamOut *out )
1070
0
{
1071
0
    if ( out->encoding == UTF8
1072
0
         || out->encoding == UTF16LE
1073
0
         || out->encoding == UTF16BE
1074
0
         || out->encoding == UTF16
1075
0
       )
1076
0
    {
1077
        /* this will take care of encoding the BOM correctly */
1078
0
        TY_(WriteChar)( UNICODE_BOM, out );
1079
0
    }
1080
0
}
1081
1082
/* this is in intermediate fix for various problems in the */
1083
/* long term code and data in charsets.c should be used    */
1084
static struct _enc2iana
1085
{
1086
    uint id;
1087
    ctmbstr name;
1088
    ctmbstr tidyOptName;
1089
} const enc2iana[] =
1090
{
1091
  { ASCII,    "us-ascii",     "ascii"   },
1092
  { LATIN0,   "iso-8859-15",  "latin0"  },
1093
  { LATIN1,   "iso-8859-1",   "latin1"  },
1094
  { UTF8,     "utf-8",        "utf8"   },
1095
  { MACROMAN, "macintosh",    "mac"     },
1096
  { WIN1252,  "windows-1252", "win1252" },
1097
  { IBM858,   "ibm00858",     "ibm858"  },
1098
  { UTF16LE,  "utf-16",       "utf16le" },
1099
  { UTF16BE,  "utf-16",       "utf16be" },
1100
  { UTF16,    "utf-16",       "utf16"   },
1101
  { BIG5,     "big5",         "big5"    },
1102
  { SHIFTJIS, "shift_jis",    "shiftjis"},
1103
#ifndef NO_NATIVE_ISO2022_SUPPORT
1104
  { ISO2022,  NULL,           "iso2022" },
1105
#endif
1106
  { RAW,      NULL,           "raw"     }
1107
};
1108
1109
ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1110
66
{
1111
66
    uint i;
1112
1113
268
    for (i = 0; enc2iana[i].name; ++i)
1114
268
        if (enc2iana[i].id == id)
1115
66
            return enc2iana[i].name;
1116
1117
0
    return NULL;
1118
66
}
1119
1120
ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1121
0
{
1122
0
    uint i;
1123
1124
0
    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1125
0
        if (enc2iana[i].id == id)
1126
0
            return enc2iana[i].tidyOptName;
1127
1128
0
    return NULL;
1129
0
}
1130
1131
int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1132
0
{
1133
0
    uint i;
1134
1135
0
    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1136
0
        if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1137
0
            return enc2iana[i].id;
1138
1139
0
    return -1;
1140
0
}
1141
1142
/*
1143
 * local variables:
1144
 * mode: c
1145
 * indent-tabs-mode: nil
1146
 * c-basic-offset: 4
1147
 * eval: (c-set-offset 'substatement-open 0)
1148
 * end:
1149
 */