Coverage Report

Created: 2025-07-18 06:12

/src/tidy-html5/src/streamio.c
Line
Count
Source (jump to first uncovered line)
1
/* streamio.c -- handles character stream I/O
2
3
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
  Wrapper around Tidy input source and output sink
7
  that calls appropriate interfaces, and applies
8
  necessary char encoding transformations: to/from
9
  ISO-10646 and/or UTF-8.
10
11
*/
12
13
#include <stdio.h>
14
#include <errno.h>
15
16
#include "streamio.h"
17
#include "tidy-int.h"
18
#include "lexer.h"
19
#include "message.h"
20
#include "utf8.h"
21
#include "tmbstr.h"
22
23
24
/************************
25
** Forward Declarations
26
************************/
27
28
static uint ReadCharFromStream( StreamIn* in );
29
30
static uint ReadByte( StreamIn* in );
31
static void UngetByte( StreamIn* in, uint byteValue );
32
33
static void PutByte( uint byteValue, StreamOut* out );
34
35
static void EncodeWin1252( uint c, StreamOut* out );
36
static void EncodeMacRoman( uint c, StreamOut* out );
37
static void EncodeIbm858( uint c, StreamOut* out );
38
static void EncodeLatin0( uint c, StreamOut* out );
39
40
static uint DecodeIbm850(uint c);
41
static uint DecodeLatin0(uint c);
42
43
static uint PopChar( StreamIn *in );
44
45
/******************************
46
** Static (duration) Globals
47
******************************/
48
49
static StreamOut stderrStreamOut = 
50
{
51
    ASCII,
52
    FSM_ASCII,
53
    DEFAULT_NL_CONFIG,
54
    FileIO,
55
    { 0, TY_(filesink_putByte) }
56
};
57
58
static StreamOut stdoutStreamOut = 
59
{
60
    ASCII,
61
    FSM_ASCII,
62
    DEFAULT_NL_CONFIG,
63
    FileIO,
64
    { 0, TY_(filesink_putByte) }
65
};
66
67
StreamOut* TY_(StdErrOutput)(void)
68
861
{
69
861
  if ( stderrStreamOut.sink.sinkData == 0 )
70
1
      stderrStreamOut.sink.sinkData = stderr;
71
861
  return &stderrStreamOut;
72
861
}
73
74
void  TY_(ReleaseStreamOut)( TidyDocImpl *doc,  StreamOut* out )
75
1.72k
{
76
1.72k
    if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
77
861
    {
78
861
        if ( out->iotype == FileIO )
79
0
            fclose( (FILE*) out->sink.sinkData );
80
861
        TidyDocFree( doc, out );
81
861
    }
82
1.72k
}
83
84
/************************
85
** Source
86
************************/
87
88
static void InitLastPos( StreamIn *in );
89
90
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
91
861
{
92
861
    StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
93
94
861
    TidyClearMemory( in, sizeof(StreamIn) );
95
861
    in->curline = 1;
96
861
    in->curcol = 1;
97
861
    in->encoding = encoding;
98
861
    in->state = FSM_ASCII;
99
861
    in->doc = doc;
100
861
    in->bufsize = CHARBUF_SIZE;
101
861
    in->allocator = doc->allocator;
102
861
    in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
103
861
    InitLastPos( in );
104
861
    return in;
105
861
}
106
107
void TY_(freeStreamIn)(StreamIn* in)
108
861
{
109
861
    TidyFree(in->allocator, in->charbuf);
110
861
    TidyFree(in->allocator, in);
111
861
}
112
113
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
114
828
{
115
828
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
116
828
    if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
117
0
    {
118
0
        TY_(freeStreamIn)( in );
119
0
        return NULL;
120
0
    }
121
828
    in->iotype = FileIO;
122
828
    return in;
123
828
}
124
125
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
126
33
{
127
33
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
128
33
    tidyInitInputBuffer( &in->source, buf );
129
33
    in->iotype = BufferIO;
130
33
    return in;
131
33
}
132
133
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
134
0
{
135
0
    StreamIn *in = TY_(initStreamIn)( doc, encoding );
136
0
    memcpy( &in->source, source, sizeof(TidyInputSource) );
137
0
    in->iotype = UserIO;
138
0
    return in;
139
0
}
140
141
int TY_(ReadBOMEncoding)(StreamIn *in)
142
861
{
143
861
    uint c, c1;
144
861
    uint bom;
145
146
861
    c = ReadByte(in);
147
861
    if (c == EndOfStream)
148
0
        return -1;
149
150
861
    c1 = ReadByte( in );
151
861
    if (c1 == EndOfStream)
152
1
    {
153
1
        UngetByte(in, c);
154
1
        return -1;
155
1
    }
156
157
    /* todo: dont warn about mismatch for auto input encoding */
158
    /* todo: let the user override the encoding found here */
159
160
860
    bom = (c << 8) + c1;
161
162
860
    if ( bom == UNICODE_BOM_BE )
163
29
    {
164
        /* big-endian UTF-16 */
165
29
        if ( in->encoding != UTF16 && in->encoding != UTF16BE )
166
29
            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
167
168
29
        return UTF16BE; /* return decoded BOM */
169
29
    }
170
831
    else if (bom == UNICODE_BOM_LE)
171
24
    {
172
        /* little-endian UTF-16 */
173
24
        if (in->encoding != UTF16 && in->encoding != UTF16LE)
174
24
            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
175
176
24
        return UTF16LE; /* return decoded BOM */
177
24
    }
178
807
    else
179
807
    {
180
807
        uint c2 = ReadByte(in);
181
182
807
        if (c2 == EndOfStream)
183
0
        {
184
0
            UngetByte(in, c1);
185
0
            UngetByte(in, c);
186
0
            return -1;
187
0
        }
188
189
807
        if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
190
0
        {
191
            /* UTF-8 */
192
0
            if (in->encoding != UTF8)
193
0
                TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
194
195
0
            return UTF8;
196
0
        }
197
807
        else
198
807
            UngetByte( in, c2 );
199
807
    }
200
201
807
    UngetByte(in, c1);
202
807
    UngetByte(in, c);
203
204
807
    return -1;
205
860
}
206
207
static void InitLastPos( StreamIn *in )
208
861
{
209
861
    in->curlastpos = 0;
210
861
    in->firstlastpos = 0;
211
861
}
212
213
static void PopLastPos( StreamIn *in )
214
152M
{
215
152M
    in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
216
152M
    if ( in->curlastpos == in->firstlastpos )
217
148M
        in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
218
152M
}
219
220
static void SaveLastPos( StreamIn *in )
221
148M
{
222
148M
    PopLastPos( in );
223
148M
    in->lastcols[in->curlastpos] = in->curcol;
224
148M
}
225
226
static void RestoreLastPos( StreamIn *in )
227
4.32M
{
228
4.32M
    if ( in->firstlastpos == in->curlastpos )
229
122k
        in->curcol = 0;
230
4.20M
    else
231
4.20M
    {
232
4.20M
        in->curcol = in->lastcols[in->curlastpos];
233
4.20M
        if ( in->curlastpos == 0 )
234
65.7k
            in->curlastpos = LASTPOS_SIZE;
235
4.20M
        in->curlastpos--;
236
4.20M
    }
237
4.32M
}
238
239
uint TY_(ReadChar)( StreamIn *in )
240
152M
{
241
152M
    uint c = EndOfStream;
242
243
152M
    if ( in->pushed )
244
4.32M
        return PopChar( in );
245
246
148M
    SaveLastPos( in );
247
248
148M
    if ( in->tabs > 0 )
249
105M
    {
250
105M
        in->curcol++;
251
105M
        in->tabs--;
252
105M
        return ' ';
253
105M
    }
254
    
255
42.4M
    for (;;)
256
47.4M
    {
257
47.4M
        c = ReadCharFromStream(in);
258
259
47.4M
        if ( EndOfStream == c )
260
1.00M
            return EndOfStream;
261
262
46.4M
        if (c == '\n')
263
793k
        {
264
793k
            in->curcol = 1;
265
793k
            in->curline++;
266
793k
            break;
267
793k
        }
268
269
45.6M
        if (c == '\t')
270
15.2M
        {
271
15.2M
            Bool keeptabs = cfg( in->doc, TidyKeepTabs );
272
15.2M
            if (!keeptabs) {
273
15.1M
                uint tabsize = cfg(in->doc, TidyTabSize);
274
15.1M
                in->tabs = tabsize > 0 ?
275
15.1M
                    tabsize - ((in->curcol - 1) % tabsize) - 1
276
15.1M
                    : 0;
277
15.1M
                c = ' ';
278
15.1M
            }
279
15.2M
            in->curcol++;
280
15.2M
            break;
281
15.2M
        }
282
283
        /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
284
30.3M
        if (c == '\r')
285
104k
        {
286
104k
            c = ReadCharFromStream(in);
287
104k
            if (c != '\n')
288
85.4k
            {
289
85.4k
                TY_(UngetChar)( c, in );
290
85.4k
                c = '\n';
291
85.4k
            }
292
19.4k
            else
293
19.4k
            {
294
19.4k
            }
295
104k
            in->curcol = 1;
296
104k
            in->curline++;
297
104k
            break;
298
104k
        }
299
300
30.2M
#ifndef NO_NATIVE_ISO2022_SUPPORT
301
        /* strip control characters, except for Esc */
302
30.2M
        if (c == '\033')
303
8.84k
            break;
304
30.2M
#endif
305
306
        /* Form Feed is allowed in HTML */
307
30.2M
        if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
308
0
            break;
309
            
310
30.2M
        if ( c < 32 )
311
4.95M
            continue; /* discard control char */
312
313
        /* watch out for chars that have already been decoded such as */
314
        /* IS02022, UTF-8 etc, that don't require further decoding */
315
316
25.2M
        if (
317
25.2M
            in->encoding == RAW
318
25.2M
#ifndef NO_NATIVE_ISO2022_SUPPORT
319
25.2M
         || in->encoding == ISO2022
320
25.2M
#endif
321
25.2M
         || in->encoding == UTF8
322
25.2M
         || in->encoding == SHIFTJIS /* #431953 - RJ */
323
25.2M
         || in->encoding == BIG5     /* #431953 - RJ */
324
25.2M
           )
325
24.6M
        {
326
24.6M
            in->curcol++;
327
24.6M
            break;
328
24.6M
        }
329
330
        /* handle surrogate pairs */
331
605k
        if ( in->encoding == UTF16LE ||
332
605k
             in->encoding == UTF16   ||
333
605k
             in->encoding == UTF16BE )
334
605k
        {
335
605k
            if ( !TY_(IsValidUTF16FromUCS4)(c) )
336
0
            {
337
                /* invalid UTF-16 value */
338
0
                TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
339
0
                c = 0;
340
0
            }
341
605k
            else if ( TY_(IsLowSurrogate)(c) )
342
2.44k
            {
343
2.44k
                uint n = c;
344
2.44k
                uint m = ReadCharFromStream( in );
345
2.44k
                if ( m == EndOfStream )
346
0
                   return EndOfStream;
347
348
2.44k
                c = 0;
349
2.44k
                if ( TY_(IsHighSurrogate)(m) )
350
65
                {
351
65
                    n = TY_(CombineSurrogatePair)( m, n );
352
65
                    if ( TY_(IsValidCombinedChar)(n) )
353
62
                        c = n;
354
65
                }
355
                /* not a valid pair */
356
2.44k
                if ( 0 == c )
357
2.38k
                    TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
358
2.44k
            }
359
605k
        }
360
361
        /* Do first: acts on range 128 - 255 */
362
605k
        switch ( in->encoding )
363
605k
        {
364
0
        case MACROMAN:
365
0
            c = TY_(DecodeMacRoman)( c );
366
0
            break;
367
0
        case IBM858:
368
0
            c = DecodeIbm850( c );
369
0
            break;
370
0
        case LATIN0:
371
0
            c = DecodeLatin0( c );
372
0
            break;
373
605k
        }
374
375
        /* produced e.g. as a side-effect of smart quotes in Word */
376
        /* but can't happen if using MACROMAN encoding */
377
605k
        if ( 127 < c && c < 160 )
378
3.56k
        {
379
3.56k
            uint c1 = 0, replMode = DISCARDED_CHAR;
380
3.56k
            Bool isVendorChar = ( in->encoding == WIN1252 ||
381
3.56k
                                  in->encoding == MACROMAN );
382
3.56k
            Bool isMacChar    = ( in->encoding == MACROMAN );
383
            
384
            /* set error position just before offending character */
385
3.56k
            if (in->doc->lexer)
386
3.56k
            {
387
3.56k
                in->doc->lexer->lines = in->curline;
388
3.56k
                in->doc->lexer->columns = in->curcol;
389
3.56k
            }
390
                
391
3.56k
            if ( isMacChar )
392
0
                c1 = TY_(DecodeMacRoman)( c );
393
3.56k
            else
394
3.56k
                c1 = TY_(DecodeWin1252)( c );
395
3.56k
            if ( c1 )
396
3.04k
                replMode = REPLACED_CHAR;
397
                
398
3.56k
            if ( c1 == 0 && isVendorChar )
399
0
                TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
400
3.56k
            else if ( ! isVendorChar )
401
3.56k
                TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
402
                
403
3.56k
            c = c1;
404
3.56k
        }
405
406
605k
        if ( c == 0 )
407
2.89k
            continue; /* illegal char is discarded */
408
        
409
602k
        in->curcol++;
410
602k
        break;
411
605k
    }
412
413
41.4M
    return c;
414
42.4M
}
415
416
static uint PopChar( StreamIn *in )
417
4.32M
{
418
4.32M
    uint c = EndOfStream;
419
4.32M
    if ( in->pushed )
420
4.32M
    {
421
4.32M
        assert( in->bufpos > 0 );
422
4.32M
        c = in->charbuf[ --in->bufpos ];
423
4.32M
        if ( in->bufpos == 0 )
424
2.82M
            in->pushed = no;
425
426
4.32M
        if ( c == '\n' )
427
13.9k
        {
428
13.9k
            in->curcol = 1;
429
13.9k
            in->curline++;
430
13.9k
            PopLastPos( in );
431
13.9k
            return c;
432
13.9k
        }
433
4.31M
        in->curcol++;
434
4.31M
        PopLastPos( in );
435
4.31M
    }
436
4.31M
    return c;
437
4.32M
}
438
439
void TY_(UngetChar)( uint c, StreamIn *in )
440
4.32M
{
441
4.32M
    if (c == EndOfStream)
442
1.13k
    {
443
        /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
444
1.13k
        return;
445
1.13k
    }
446
    
447
4.32M
    in->pushed = yes;
448
449
4.32M
    if (in->bufpos + 1 >= in->bufsize)
450
115k
        in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
451
452
4.32M
    in->charbuf[(in->bufpos)++] = c;
453
454
4.32M
    if (c == '\n')
455
13.9k
        --(in->curline);
456
457
4.32M
    RestoreLastPos( in );
458
4.32M
}
459
460
461
462
/************************
463
** Sink
464
************************/
465
466
static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
467
861
{
468
861
    StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
469
861
    TidyClearMemory( out, sizeof(StreamOut) );
470
861
    out->encoding = encoding;
471
861
    out->state = FSM_ASCII;
472
861
    out->nl = nl;
473
861
    return out;
474
861
}
475
476
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
477
0
{
478
0
    StreamOut* out = initStreamOut( doc, encoding, nl );
479
0
    TY_(initFileSink)( &out->sink, fp );
480
0
    out->iotype = FileIO;
481
0
    return out;
482
0
}
483
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
484
861
{
485
861
    StreamOut* out = initStreamOut( doc, encoding, nl );
486
861
    tidyInitOutputBuffer( &out->sink, buf );
487
861
    out->iotype = BufferIO;
488
861
    return out;
489
861
}
490
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
491
0
{
492
0
    StreamOut* out = initStreamOut( doc, encoding, nl );
493
0
    memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
494
0
    out->iotype = UserIO;
495
0
    return out;
496
0
}
497
498
void TY_(WriteChar)( uint c, StreamOut* out )
499
1.65M
{
500
    /* Translate outgoing newlines */
501
1.65M
    if ( LF == c )
502
1.65M
    {
503
1.65M
      if ( out->nl == TidyCRLF )
504
0
          TY_(WriteChar)( CR, out );
505
1.65M
      else if ( out->nl == TidyCR )
506
0
          c = CR;
507
1.65M
    }
508
509
1.65M
    if (out->encoding == MACROMAN)
510
0
    {
511
0
        EncodeMacRoman( c, out );
512
0
    }
513
1.65M
    else if (out->encoding == WIN1252)
514
0
    {
515
0
        EncodeWin1252( c, out );
516
0
    }
517
1.65M
    else if (out->encoding == IBM858)
518
0
    {
519
0
        EncodeIbm858( c, out );
520
0
    }
521
1.65M
    else if (out->encoding == LATIN0)
522
0
    {
523
0
        EncodeLatin0( c, out );
524
0
    }
525
526
1.65M
    else if (out->encoding == UTF8)
527
1.65M
    {
528
1.65M
        int count = 0;
529
        
530
1.65M
        TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
531
1.65M
        if (count <= 0)
532
0
        {
533
            /* replacement char 0xFFFD encoded as UTF-8 */
534
0
            PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
535
0
        }
536
1.65M
    }
537
0
#ifndef NO_NATIVE_ISO2022_SUPPORT
538
0
    else if (out->encoding == ISO2022)
539
0
    {
540
0
        if (c == 0x1b)  /* ESC */
541
0
            out->state = FSM_ESC;
542
0
        else
543
0
        {
544
0
            switch (out->state)
545
0
            {
546
0
            case FSM_ESC:
547
0
                if (c == '$')
548
0
                    out->state = FSM_ESCD;
549
0
                else if (c == '(')
550
0
                    out->state = FSM_ESCP;
551
0
                else
552
0
                    out->state = FSM_ASCII;
553
0
                break;
554
555
0
            case FSM_ESCD:
556
0
                if (c == '(')
557
0
                    out->state = FSM_ESCDP;
558
0
                else
559
0
                    out->state = FSM_NONASCII;
560
0
                break;
561
562
0
            case FSM_ESCDP:
563
0
                out->state = FSM_NONASCII;
564
0
                break;
565
566
0
            case FSM_ESCP:
567
0
                out->state = FSM_ASCII;
568
0
                break;
569
570
0
            case FSM_NONASCII:
571
0
                c &= 0x7F;
572
0
                break;
573
574
0
            case FSM_ASCII:
575
0
                break;
576
0
            }
577
0
        }
578
579
0
        PutByte(c, out);
580
0
    }
581
0
#endif /* NO_NATIVE_ISO2022_SUPPORT */
582
583
0
    else if ( out->encoding == UTF16LE ||
584
0
              out->encoding == UTF16BE ||
585
0
              out->encoding == UTF16 )
586
0
    {
587
0
        int i, numChars = 1;
588
0
        uint theChars[2];
589
        
590
0
        if ( !TY_(IsValidUTF16FromUCS4)(c) )
591
0
        {
592
            /* invalid UTF-16 value */
593
0
            numChars = 0;
594
0
        }
595
0
        else if ( TY_(IsCombinedChar)(c) )
596
0
        {
597
            /* output both, unless something goes wrong */
598
0
            numChars = 2;
599
0
            if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
600
0
            {
601
0
                numChars = 0;
602
0
            }
603
0
        }
604
0
        else
605
0
        {
606
            /* just put the char out */
607
0
            theChars[0] = c;
608
0
        }
609
        
610
0
        for (i = 0; i < numChars; i++)
611
0
        {
612
0
            c = theChars[i];
613
            
614
0
            if (out->encoding == UTF16LE)
615
0
            {
616
0
                uint ch = c & 0xFF; PutByte(ch, out); 
617
0
                ch = (c >> 8) & 0xFF; PutByte(ch, out); 
618
0
            }
619
    
620
0
            else if (out->encoding == UTF16BE || out->encoding == UTF16)
621
0
            {
622
0
                uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
623
0
                ch = c & 0xFF; PutByte(ch, out); 
624
0
            }
625
0
        }
626
0
    }
627
0
    else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
628
0
    {
629
0
        if (c < 128)
630
0
            PutByte(c, out);
631
0
        else
632
0
        {
633
0
            uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
634
0
            ch = c & 0xFF; PutByte(ch, out); 
635
0
        }
636
0
    }
637
0
    else
638
0
        PutByte( c, out );
639
1.65M
}
640
641
642
643
/****************************
644
** Miscellaneous / Helpers
645
****************************/
646
647
/* Mapping for Windows Western character set CP 1252
648
** (chars 128-159/U+0080-U+009F) to Unicode.
649
*/
650
static const uint Win2Unicode[32] =
651
{
652
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
653
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
654
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
655
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
656
};
657
658
/* Function for conversion from Windows-1252 to Unicode */
659
uint TY_(DecodeWin1252)(uint c)
660
3.71k
{
661
3.71k
    if (127 < c && c < 160)
662
3.71k
        c = Win2Unicode[c - 128];
663
        
664
3.71k
    return c;
665
3.71k
}
666
667
static void EncodeWin1252( uint c, StreamOut* out )
668
0
{
669
0
    if (c < 128 || (c > 159 && c < 256))
670
0
        PutByte(c, out);
671
0
    else
672
0
    {
673
0
        int i;
674
675
0
        for (i = 128; i < 160; i++)
676
0
            if (Win2Unicode[i - 128] == c)
677
0
            {
678
0
                PutByte(i, out);
679
0
                break;
680
0
            }
681
0
    }
682
0
}
683
684
/*
685
   John Love-Jensen contributed this table for mapping MacRoman
686
   character set to Unicode
687
*/
688
689
/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
690
static const uint Mac2Unicode[128] = 
691
{
692
    /* x7F = DEL */
693
    
694
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
695
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
696
697
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
698
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
699
700
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
701
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
702
703
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
704
                                            /* =BD U+2126 OHM SIGN */
705
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
706
707
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
708
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
709
710
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
711
                            /* =DB U+00A4 CURRENCY SIGN */
712
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
713
714
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
715
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
716
    /* xF0 = Apple Logo */
717
    /* =F0 U+2665 BLACK HEART SUIT */
718
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
719
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
720
};
721
722
/* Function to convert from MacRoman to Unicode */
723
uint TY_(DecodeMacRoman)(uint c)
724
0
{
725
0
    if (127 < c && c < 256) /* Is. #891 */
726
0
        c = Mac2Unicode[c - 128];
727
0
    return c;
728
0
}
729
730
static void EncodeMacRoman( uint c, StreamOut* out )
731
0
{
732
0
        if (c < 128)
733
0
            PutByte(c, out);
734
0
        else
735
0
        {
736
            /* For mac users, map Unicode back to MacRoman. */
737
0
            int i;
738
0
            for (i = 128; i < 256; i++)
739
0
            {
740
0
                if (Mac2Unicode[i - 128] == c)
741
0
                {
742
0
                    PutByte(i, out);
743
0
                    break;
744
0
                }
745
0
            }
746
0
        }
747
0
}
748
749
/* Mapping for OS/2 Western character set CP 850
750
** (chars 128-255) to Unicode.
751
*/
752
static const uint IBM2Unicode[128] =
753
{
754
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
755
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
756
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
757
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
758
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
759
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
760
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
761
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
762
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
763
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
764
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
765
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
766
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
767
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
768
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
769
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
770
};
771
772
/* Function for conversion from OS/2-850 to Unicode */
773
static uint DecodeIbm850(uint c)
774
0
{
775
0
    if (127 < c && c < 256)
776
0
        c = IBM2Unicode[c - 128];
777
778
0
    return c;
779
0
}
780
781
/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
782
static void EncodeIbm858( uint c, StreamOut* out )
783
0
{
784
0
    if (c < 128)
785
0
        PutByte(c, out);
786
0
    else
787
0
    {
788
0
        int i;
789
0
        for (i = 128; i < 256; i++)
790
0
        {
791
0
            if (IBM2Unicode[i - 128] == c)
792
0
            {
793
0
                PutByte(i, out);
794
0
                break;
795
0
            }
796
0
        }
797
0
    }
798
0
}
799
800
801
/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
802
static uint DecodeLatin0(uint c)
803
0
{
804
0
    if (163 < c && c < 191)
805
0
    {
806
0
        switch (c)
807
0
        {
808
0
        case 0xA4: c = 0x20AC; break;
809
0
        case 0xA6: c = 0x0160; break;
810
0
        case 0xA8: c = 0x0161; break;
811
0
        case 0xB4: c = 0x017D; break;
812
0
        case 0xB8: c = 0x017E; break;
813
0
        case 0xBC: c = 0x0152; break;
814
0
        case 0xBD: c = 0x0153; break;
815
0
        case 0xBE: c = 0x0178; break;
816
0
        }
817
0
    }
818
0
    return c;
819
0
}
820
821
/* Map Unicode back to ISO-8859-15. */
822
static void EncodeLatin0( uint c, StreamOut* out )
823
0
{
824
0
    switch (c)
825
0
    {
826
0
    case 0x20AC: c = 0xA4; break;
827
0
    case 0x0160: c = 0xA6; break;
828
0
    case 0x0161: c = 0xA8; break;
829
0
    case 0x017D: c = 0xB4; break;
830
0
    case 0x017E: c = 0xB8; break;
831
0
    case 0x0152: c = 0xBC; break;
832
0
    case 0x0153: c = 0xBD; break;
833
0
    case 0x0178: c = 0xBE; break;
834
0
    }
835
0
    PutByte(c, out);
836
0
}
837
838
/* Facilitates user defined source by providing
839
** an entry point to marshal pointers-to-functions.
840
** Needed by .NET and possibly other language bindings.
841
*/
842
Bool TIDY_CALL tidyInitSource( TidyInputSource*  source,
843
                               void*             srcData,
844
                               TidyGetByteFunc   gbFunc,
845
                               TidyUngetByteFunc ugbFunc,
846
                               TidyEOFFunc       endFunc )
847
0
{
848
0
  Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
849
850
0
  if ( status )
851
0
  {
852
0
    source->sourceData = srcData;
853
0
    source->getByte    = gbFunc;
854
0
    source->ungetByte  = ugbFunc;
855
0
    source->eof        = endFunc;
856
0
  }
857
858
0
  return status;
859
0
}
860
861
Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
862
                             void*           snkData,
863
                             TidyPutByteFunc pbFunc )
864
0
{
865
0
  Bool status = ( sink && snkData && pbFunc );
866
0
  if ( status )
867
0
  {
868
0
    sink->sinkData = snkData;
869
0
    sink->putByte  = pbFunc;
870
0
  }
871
0
  return status;
872
0
}
873
874
/* GetByte must return a byte value in a signed
875
** integer so that a negative value can signal EOF
876
** without interfering w/ 0-255 legitimate byte values.
877
*/
878
uint TIDY_CALL tidyGetByte( TidyInputSource* source )
879
47.2M
{
880
47.2M
  int bv = source->getByte( source->sourceData );
881
47.2M
  return (uint) bv;
882
47.2M
}
883
Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
884
47.8M
{
885
47.8M
  return source->eof( source->sourceData );
886
47.8M
}
887
void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
888
2.42k
{
889
2.42k
    source->ungetByte( source->sourceData, (byte) ch );
890
2.42k
}
891
void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
892
0
{
893
0
    sink->putByte( sink->sinkData, (byte) ch );
894
0
}
895
896
static uint ReadByte( StreamIn* in )
897
47.2M
{
898
47.2M
    return tidyGetByte( &in->source );
899
47.2M
}
900
Bool TY_(IsEOF)( StreamIn* in )
901
47.8M
{
902
47.8M
    return tidyIsEOF( &in->source );
903
47.8M
}
904
static void UngetByte( StreamIn* in, uint byteValue )
905
2.42k
{
906
2.42k
    tidyUngetByte( &in->source, byteValue );
907
2.42k
}
908
static void PutByte( uint byteValue, StreamOut* out )
909
0
{
910
0
    tidyPutByte( &out->sink, byteValue );
911
0
}
912
913
/* read char from stream */
914
static uint ReadCharFromStream( StreamIn* in )
915
47.5M
{
916
47.5M
    uint c, n;
917
918
47.5M
    if ( TY_(IsEOF)(in) )
919
1.00M
        return EndOfStream;
920
    
921
46.5M
    c = ReadByte( in );
922
923
46.5M
    if (c == EndOfStream)
924
0
        return c;
925
926
46.5M
#ifndef NO_NATIVE_ISO2022_SUPPORT
927
    /*
928
       A document in ISO-2022 based encoding uses some ESC sequences
929
       called "designator" to switch character sets. The designators
930
       defined and used in ISO-2022-JP are:
931
932
        "ESC" + "(" + ?     for ISO646 variants
933
934
        "ESC" + "$" + ?     and
935
        "ESC" + "$" + "(" + ?   for multibyte character sets
936
937
       Where ? stands for a single character used to indicate the
938
       character set for multibyte characters.
939
940
       Tidy handles this by preserving the escape sequence and
941
       setting the top bit of each byte for non-ascii chars. This
942
       bit is then cleared on output. The input stream keeps track
943
       of the state to determine when to set/clear the bit.
944
    */
945
946
46.5M
    if (in->encoding == ISO2022)
947
0
    {
948
0
        if (c == 0x1b)  /* ESC */
949
0
        {
950
0
            in->state = FSM_ESC;
951
0
            return c;
952
0
        }
953
954
0
        switch (in->state)
955
0
        {
956
0
        case FSM_ESC:
957
0
            if (c == '$')
958
0
                in->state = FSM_ESCD;
959
0
            else if (c == '(')
960
0
                in->state = FSM_ESCP;
961
0
            else
962
0
                in->state = FSM_ASCII;
963
0
            break;
964
965
0
        case FSM_ESCD:
966
0
            if (c == '(')
967
0
                in->state = FSM_ESCDP;
968
0
            else
969
0
                in->state = FSM_NONASCII;
970
0
            break;
971
972
0
        case FSM_ESCDP:
973
0
            in->state = FSM_NONASCII;
974
0
            break;
975
976
0
        case FSM_ESCP:
977
0
            in->state = FSM_ASCII;
978
0
            break;
979
980
0
        case FSM_NONASCII:
981
0
            c |= 0x80;
982
0
            break;
983
984
0
        case FSM_ASCII:
985
0
            break;
986
0
        }
987
988
0
        return c;
989
0
    }
990
46.5M
#endif /* NO_NATIVE_ISO2022_SUPPORT */
991
992
46.5M
    if ( in->encoding == UTF16LE )
993
283k
    {
994
283k
        uint c1 = ReadByte( in );
995
283k
        if ( EndOfStream == c1 )
996
0
            return EndOfStream;
997
283k
        n = (c1 << 8) + c;
998
283k
        return n;
999
283k
    }
1000
1001
46.2M
    if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1002
400k
    {
1003
400k
        uint c1 = ReadByte( in );
1004
400k
        if ( EndOfStream == c1 )
1005
0
            return EndOfStream;
1006
400k
        n = (c << 8) + c1;
1007
400k
        return n;
1008
400k
    }
1009
1010
45.8M
    if ( in->encoding == UTF8 )
1011
45.8M
    {
1012
        /* deal with UTF-8 encoded char */
1013
1014
45.8M
        int err, count = 0;
1015
        
1016
        /* first byte "c" is passed in separately */
1017
45.8M
        err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1018
45.8M
        if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1019
0
            return EndOfStream;
1020
45.8M
        else if (err)
1021
3.30M
        {
1022
            /* set error position just before offending character */
1023
3.30M
            in->doc->lexer->lines = in->curline;
1024
3.30M
            in->doc->lexer->columns = in->curcol;
1025
1026
3.30M
            TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1027
3.30M
            n = 0xFFFD; /* replacement char */
1028
3.30M
        }
1029
        
1030
45.8M
        return n;
1031
45.8M
    }
1032
    
1033
    /*
1034
       This section is suitable for any "multibyte" variable-width 
1035
       character encoding in which a one-byte code is less than
1036
       128, and the first byte of a two-byte code is greater or
1037
       equal to 128. Note that Big5 and ShiftJIS fit into this
1038
       kind, even though their second byte may be less than 128
1039
    */
1040
0
    if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1041
0
    {
1042
0
        if (c < 128)
1043
0
            return c;
1044
0
        else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1045
0
        {
1046
            /*
1047
              Rick Cameron pointed out that for Shift_JIS, the values from
1048
              0xa1 through 0xdf represent singe-byte characters
1049
              (U+FF61 to U+FF9F - half-shift Katakana)
1050
            */
1051
0
            return c;
1052
0
        }
1053
0
        else
1054
0
        {
1055
0
            uint c1 = ReadByte( in );
1056
0
            if ( EndOfStream == c1 )
1057
0
                return EndOfStream;
1058
0
            n = (c << 8) + c1;
1059
0
            return n;
1060
0
        }
1061
0
    }
1062
0
    else
1063
0
        n = c;
1064
        
1065
0
    return n;
1066
0
}
1067
1068
/* Output a Byte Order Mark if required */
1069
void TY_(outBOM)( StreamOut *out )
1070
0
{
1071
0
    if ( out->encoding == UTF8
1072
0
         || out->encoding == UTF16LE
1073
0
         || out->encoding == UTF16BE
1074
0
         || out->encoding == UTF16
1075
0
       )
1076
0
    {
1077
        /* this will take care of encoding the BOM correctly */
1078
0
        TY_(WriteChar)( UNICODE_BOM, out );
1079
0
    }
1080
0
}
1081
1082
/* this is in intermediate fix for various problems in the */
1083
/* long term code and data in charsets.c should be used    */
1084
static struct _enc2iana
1085
{
1086
    uint id;
1087
    ctmbstr name;
1088
    ctmbstr tidyOptName;
1089
} const enc2iana[] =
1090
{
1091
  { ASCII,    "us-ascii",     "ascii"   },
1092
  { LATIN0,   "iso-8859-15",  "latin0"  },
1093
  { LATIN1,   "iso-8859-1",   "latin1"  },
1094
  { UTF8,     "utf-8",        "utf8"   },
1095
  { MACROMAN, "macintosh",    "mac"     },
1096
  { WIN1252,  "windows-1252", "win1252" },
1097
  { IBM858,   "ibm00858",     "ibm858"  },
1098
  { UTF16LE,  "utf-16",       "utf16le" },
1099
  { UTF16BE,  "utf-16",       "utf16be" },
1100
  { UTF16,    "utf-16",       "utf16"   },
1101
  { BIG5,     "big5",         "big5"    },
1102
  { SHIFTJIS, "shift_jis",    "shiftjis"},
1103
#ifndef NO_NATIVE_ISO2022_SUPPORT
1104
  { ISO2022,  NULL,           "iso2022" },
1105
#endif
1106
  { RAW,      NULL,           "raw"     }
1107
};
1108
1109
ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1110
106
{
1111
106
    uint i;
1112
1113
665
    for (i = 0; enc2iana[i].name; ++i)
1114
665
        if (enc2iana[i].id == id)
1115
106
            return enc2iana[i].name;
1116
1117
0
    return NULL;
1118
106
}
1119
1120
ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1121
0
{
1122
0
    uint i;
1123
1124
0
    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1125
0
        if (enc2iana[i].id == id)
1126
0
            return enc2iana[i].tidyOptName;
1127
1128
0
    return NULL;
1129
0
}
1130
1131
int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1132
0
{
1133
0
    uint i;
1134
1135
0
    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1136
0
        if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1137
0
            return enc2iana[i].id;
1138
1139
0
    return -1;
1140
0
}
1141
1142
/*
1143
 * local variables:
1144
 * mode: c
1145
 * indent-tabs-mode: nil
1146
 * c-basic-offset: 4
1147
 * eval: (c-set-offset 'substatement-open 0)
1148
 * end:
1149
 */