/src/tidy-html5/src/streamio.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* streamio.c -- handles character stream I/O |
2 | | |
3 | | (c) 1998-2008 (W3C) MIT, ERCIM, Keio University |
4 | | See tidy.h for the copyright notice. |
5 | | |
6 | | Wrapper around Tidy input source and output sink |
7 | | that calls appropriate interfaces, and applies |
8 | | necessary char encoding transformations: to/from |
9 | | ISO-10646 and/or UTF-8. |
10 | | |
11 | | */ |
12 | | |
13 | | #include <stdio.h> |
14 | | #include <errno.h> |
15 | | |
16 | | #include "streamio.h" |
17 | | #include "tidy-int.h" |
18 | | #include "lexer.h" |
19 | | #include "message.h" |
20 | | #include "utf8.h" |
21 | | #include "tmbstr.h" |
22 | | |
23 | | |
24 | | /************************ |
25 | | ** Forward Declarations |
26 | | ************************/ |
27 | | |
28 | | static uint ReadCharFromStream( StreamIn* in ); |
29 | | |
30 | | static uint ReadByte( StreamIn* in ); |
31 | | static void UngetByte( StreamIn* in, uint byteValue ); |
32 | | |
33 | | static void PutByte( uint byteValue, StreamOut* out ); |
34 | | |
35 | | static void EncodeWin1252( uint c, StreamOut* out ); |
36 | | static void EncodeMacRoman( uint c, StreamOut* out ); |
37 | | static void EncodeIbm858( uint c, StreamOut* out ); |
38 | | static void EncodeLatin0( uint c, StreamOut* out ); |
39 | | |
40 | | static uint DecodeIbm850(uint c); |
41 | | static uint DecodeLatin0(uint c); |
42 | | |
43 | | static uint PopChar( StreamIn *in ); |
44 | | |
45 | | /****************************** |
46 | | ** Static (duration) Globals |
47 | | ******************************/ |
48 | | |
/* Statically allocated output stream used for stderr.
** The sink's data pointer starts out null and is filled in with the
** C stderr handle by TY_(StdErrOutput) on first use.
** TY_(ReleaseStreamOut) recognises this object by address and never
** frees it.
*/
static StreamOut stderrStreamOut =
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
    FileIO,
    { 0, TY_(filesink_putByte) }
};
57 | | |
/* Statically allocated output stream record for stdout.
** TY_(ReleaseStreamOut) recognises this object by address and never
** frees it.  Its sink data pointer is null in this initializer.
*/
static StreamOut stdoutStreamOut =
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
    FileIO,
    { 0, TY_(filesink_putByte) }
};
66 | | |
67 | | StreamOut* TY_(StdErrOutput)(void) |
68 | 861 | { |
69 | 861 | if ( stderrStreamOut.sink.sinkData == 0 ) |
70 | 1 | stderrStreamOut.sink.sinkData = stderr; |
71 | 861 | return &stderrStreamOut; |
72 | 861 | } |
73 | | |
74 | | void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out ) |
75 | 1.72k | { |
76 | 1.72k | if ( out && out != &stderrStreamOut && out != &stdoutStreamOut ) |
77 | 861 | { |
78 | 861 | if ( out->iotype == FileIO ) |
79 | 0 | fclose( (FILE*) out->sink.sinkData ); |
80 | 861 | TidyDocFree( doc, out ); |
81 | 861 | } |
82 | 1.72k | } |
83 | | |
84 | | /************************ |
85 | | ** Source |
86 | | ************************/ |
87 | | |
88 | | static void InitLastPos( StreamIn *in ); |
89 | | |
90 | | StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding ) |
91 | 861 | { |
92 | 861 | StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) ); |
93 | | |
94 | 861 | TidyClearMemory( in, sizeof(StreamIn) ); |
95 | 861 | in->curline = 1; |
96 | 861 | in->curcol = 1; |
97 | 861 | in->encoding = encoding; |
98 | 861 | in->state = FSM_ASCII; |
99 | 861 | in->doc = doc; |
100 | 861 | in->bufsize = CHARBUF_SIZE; |
101 | 861 | in->allocator = doc->allocator; |
102 | 861 | in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize); |
103 | 861 | InitLastPos( in ); |
104 | 861 | return in; |
105 | 861 | } |
106 | | |
107 | | void TY_(freeStreamIn)(StreamIn* in) |
108 | 861 | { |
109 | 861 | TidyFree(in->allocator, in->charbuf); |
110 | 861 | TidyFree(in->allocator, in); |
111 | 861 | } |
112 | | |
113 | | StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding ) |
114 | 828 | { |
115 | 828 | StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
116 | 828 | if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 ) |
117 | 0 | { |
118 | 0 | TY_(freeStreamIn)( in ); |
119 | 0 | return NULL; |
120 | 0 | } |
121 | 828 | in->iotype = FileIO; |
122 | 828 | return in; |
123 | 828 | } |
124 | | |
125 | | StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding ) |
126 | 33 | { |
127 | 33 | StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
128 | 33 | tidyInitInputBuffer( &in->source, buf ); |
129 | 33 | in->iotype = BufferIO; |
130 | 33 | return in; |
131 | 33 | } |
132 | | |
133 | | StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding ) |
134 | 0 | { |
135 | 0 | StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
136 | 0 | memcpy( &in->source, source, sizeof(TidyInputSource) ); |
137 | 0 | in->iotype = UserIO; |
138 | 0 | return in; |
139 | 0 | } |
140 | | |
141 | | int TY_(ReadBOMEncoding)(StreamIn *in) |
142 | 861 | { |
143 | 861 | uint c, c1; |
144 | 861 | uint bom; |
145 | | |
146 | 861 | c = ReadByte(in); |
147 | 861 | if (c == EndOfStream) |
148 | 0 | return -1; |
149 | | |
150 | 861 | c1 = ReadByte( in ); |
151 | 861 | if (c1 == EndOfStream) |
152 | 1 | { |
153 | 1 | UngetByte(in, c); |
154 | 1 | return -1; |
155 | 1 | } |
156 | | |
157 | | /* todo: dont warn about mismatch for auto input encoding */ |
158 | | /* todo: let the user override the encoding found here */ |
159 | | |
160 | 860 | bom = (c << 8) + c1; |
161 | | |
162 | 860 | if ( bom == UNICODE_BOM_BE ) |
163 | 29 | { |
164 | | /* big-endian UTF-16 */ |
165 | 29 | if ( in->encoding != UTF16 && in->encoding != UTF16BE ) |
166 | 29 | TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE); |
167 | | |
168 | 29 | return UTF16BE; /* return decoded BOM */ |
169 | 29 | } |
170 | 831 | else if (bom == UNICODE_BOM_LE) |
171 | 24 | { |
172 | | /* little-endian UTF-16 */ |
173 | 24 | if (in->encoding != UTF16 && in->encoding != UTF16LE) |
174 | 24 | TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE); |
175 | | |
176 | 24 | return UTF16LE; /* return decoded BOM */ |
177 | 24 | } |
178 | 807 | else |
179 | 807 | { |
180 | 807 | uint c2 = ReadByte(in); |
181 | | |
182 | 807 | if (c2 == EndOfStream) |
183 | 0 | { |
184 | 0 | UngetByte(in, c1); |
185 | 0 | UngetByte(in, c); |
186 | 0 | return -1; |
187 | 0 | } |
188 | | |
189 | 807 | if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8) |
190 | 0 | { |
191 | | /* UTF-8 */ |
192 | 0 | if (in->encoding != UTF8) |
193 | 0 | TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8); |
194 | |
|
195 | 0 | return UTF8; |
196 | 0 | } |
197 | 807 | else |
198 | 807 | UngetByte( in, c2 ); |
199 | 807 | } |
200 | | |
201 | 807 | UngetByte(in, c1); |
202 | 807 | UngetByte(in, c); |
203 | | |
204 | 807 | return -1; |
205 | 860 | } |
206 | | |
207 | | static void InitLastPos( StreamIn *in ) |
208 | 861 | { |
209 | 861 | in->curlastpos = 0; |
210 | 861 | in->firstlastpos = 0; |
211 | 861 | } |
212 | | |
213 | | static void PopLastPos( StreamIn *in ) |
214 | 152M | { |
215 | 152M | in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE; |
216 | 152M | if ( in->curlastpos == in->firstlastpos ) |
217 | 148M | in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE; |
218 | 152M | } |
219 | | |
220 | | static void SaveLastPos( StreamIn *in ) |
221 | 148M | { |
222 | 148M | PopLastPos( in ); |
223 | 148M | in->lastcols[in->curlastpos] = in->curcol; |
224 | 148M | } |
225 | | |
226 | | static void RestoreLastPos( StreamIn *in ) |
227 | 4.32M | { |
228 | 4.32M | if ( in->firstlastpos == in->curlastpos ) |
229 | 122k | in->curcol = 0; |
230 | 4.20M | else |
231 | 4.20M | { |
232 | 4.20M | in->curcol = in->lastcols[in->curlastpos]; |
233 | 4.20M | if ( in->curlastpos == 0 ) |
234 | 65.7k | in->curlastpos = LASTPOS_SIZE; |
235 | 4.20M | in->curlastpos--; |
236 | 4.20M | } |
237 | 4.32M | } |
238 | | |
239 | | uint TY_(ReadChar)( StreamIn *in ) |
240 | 152M | { |
241 | 152M | uint c = EndOfStream; |
242 | | |
243 | 152M | if ( in->pushed ) |
244 | 4.32M | return PopChar( in ); |
245 | | |
246 | 148M | SaveLastPos( in ); |
247 | | |
248 | 148M | if ( in->tabs > 0 ) |
249 | 105M | { |
250 | 105M | in->curcol++; |
251 | 105M | in->tabs--; |
252 | 105M | return ' '; |
253 | 105M | } |
254 | | |
255 | 42.4M | for (;;) |
256 | 47.4M | { |
257 | 47.4M | c = ReadCharFromStream(in); |
258 | | |
259 | 47.4M | if ( EndOfStream == c ) |
260 | 1.00M | return EndOfStream; |
261 | | |
262 | 46.4M | if (c == '\n') |
263 | 793k | { |
264 | 793k | in->curcol = 1; |
265 | 793k | in->curline++; |
266 | 793k | break; |
267 | 793k | } |
268 | | |
269 | 45.6M | if (c == '\t') |
270 | 15.2M | { |
271 | 15.2M | Bool keeptabs = cfg( in->doc, TidyKeepTabs ); |
272 | 15.2M | if (!keeptabs) { |
273 | 15.1M | uint tabsize = cfg(in->doc, TidyTabSize); |
274 | 15.1M | in->tabs = tabsize > 0 ? |
275 | 15.1M | tabsize - ((in->curcol - 1) % tabsize) - 1 |
276 | 15.1M | : 0; |
277 | 15.1M | c = ' '; |
278 | 15.1M | } |
279 | 15.2M | in->curcol++; |
280 | 15.2M | break; |
281 | 15.2M | } |
282 | | |
283 | | /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */ |
284 | 30.3M | if (c == '\r') |
285 | 104k | { |
286 | 104k | c = ReadCharFromStream(in); |
287 | 104k | if (c != '\n') |
288 | 85.4k | { |
289 | 85.4k | TY_(UngetChar)( c, in ); |
290 | 85.4k | c = '\n'; |
291 | 85.4k | } |
292 | 19.4k | else |
293 | 19.4k | { |
294 | 19.4k | } |
295 | 104k | in->curcol = 1; |
296 | 104k | in->curline++; |
297 | 104k | break; |
298 | 104k | } |
299 | | |
300 | 30.2M | #ifndef NO_NATIVE_ISO2022_SUPPORT |
301 | | /* strip control characters, except for Esc */ |
302 | 30.2M | if (c == '\033') |
303 | 8.84k | break; |
304 | 30.2M | #endif |
305 | | |
306 | | /* Form Feed is allowed in HTML */ |
307 | 30.2M | if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) ) |
308 | 0 | break; |
309 | | |
310 | 30.2M | if ( c < 32 ) |
311 | 4.95M | continue; /* discard control char */ |
312 | | |
313 | | /* watch out for chars that have already been decoded such as */ |
314 | | /* IS02022, UTF-8 etc, that don't require further decoding */ |
315 | | |
316 | 25.2M | if ( |
317 | 25.2M | in->encoding == RAW |
318 | 25.2M | #ifndef NO_NATIVE_ISO2022_SUPPORT |
319 | 25.2M | || in->encoding == ISO2022 |
320 | 25.2M | #endif |
321 | 25.2M | || in->encoding == UTF8 |
322 | 25.2M | || in->encoding == SHIFTJIS /* #431953 - RJ */ |
323 | 25.2M | || in->encoding == BIG5 /* #431953 - RJ */ |
324 | 25.2M | ) |
325 | 24.6M | { |
326 | 24.6M | in->curcol++; |
327 | 24.6M | break; |
328 | 24.6M | } |
329 | | |
330 | | /* handle surrogate pairs */ |
331 | 605k | if ( in->encoding == UTF16LE || |
332 | 605k | in->encoding == UTF16 || |
333 | 605k | in->encoding == UTF16BE ) |
334 | 605k | { |
335 | 605k | if ( !TY_(IsValidUTF16FromUCS4)(c) ) |
336 | 0 | { |
337 | | /* invalid UTF-16 value */ |
338 | 0 | TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes); |
339 | 0 | c = 0; |
340 | 0 | } |
341 | 605k | else if ( TY_(IsLowSurrogate)(c) ) |
342 | 2.44k | { |
343 | 2.44k | uint n = c; |
344 | 2.44k | uint m = ReadCharFromStream( in ); |
345 | 2.44k | if ( m == EndOfStream ) |
346 | 0 | return EndOfStream; |
347 | | |
348 | 2.44k | c = 0; |
349 | 2.44k | if ( TY_(IsHighSurrogate)(m) ) |
350 | 65 | { |
351 | 65 | n = TY_(CombineSurrogatePair)( m, n ); |
352 | 65 | if ( TY_(IsValidCombinedChar)(n) ) |
353 | 62 | c = n; |
354 | 65 | } |
355 | | /* not a valid pair */ |
356 | 2.44k | if ( 0 == c ) |
357 | 2.38k | TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes ); |
358 | 2.44k | } |
359 | 605k | } |
360 | | |
361 | | /* Do first: acts on range 128 - 255 */ |
362 | 605k | switch ( in->encoding ) |
363 | 605k | { |
364 | 0 | case MACROMAN: |
365 | 0 | c = TY_(DecodeMacRoman)( c ); |
366 | 0 | break; |
367 | 0 | case IBM858: |
368 | 0 | c = DecodeIbm850( c ); |
369 | 0 | break; |
370 | 0 | case LATIN0: |
371 | 0 | c = DecodeLatin0( c ); |
372 | 0 | break; |
373 | 605k | } |
374 | | |
375 | | /* produced e.g. as a side-effect of smart quotes in Word */ |
376 | | /* but can't happen if using MACROMAN encoding */ |
377 | 605k | if ( 127 < c && c < 160 ) |
378 | 3.56k | { |
379 | 3.56k | uint c1 = 0, replMode = DISCARDED_CHAR; |
380 | 3.56k | Bool isVendorChar = ( in->encoding == WIN1252 || |
381 | 3.56k | in->encoding == MACROMAN ); |
382 | 3.56k | Bool isMacChar = ( in->encoding == MACROMAN ); |
383 | | |
384 | | /* set error position just before offending character */ |
385 | 3.56k | if (in->doc->lexer) |
386 | 3.56k | { |
387 | 3.56k | in->doc->lexer->lines = in->curline; |
388 | 3.56k | in->doc->lexer->columns = in->curcol; |
389 | 3.56k | } |
390 | | |
391 | 3.56k | if ( isMacChar ) |
392 | 0 | c1 = TY_(DecodeMacRoman)( c ); |
393 | 3.56k | else |
394 | 3.56k | c1 = TY_(DecodeWin1252)( c ); |
395 | 3.56k | if ( c1 ) |
396 | 3.04k | replMode = REPLACED_CHAR; |
397 | | |
398 | 3.56k | if ( c1 == 0 && isVendorChar ) |
399 | 0 | TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR); |
400 | 3.56k | else if ( ! isVendorChar ) |
401 | 3.56k | TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR); |
402 | | |
403 | 3.56k | c = c1; |
404 | 3.56k | } |
405 | | |
406 | 605k | if ( c == 0 ) |
407 | 2.89k | continue; /* illegal char is discarded */ |
408 | | |
409 | 602k | in->curcol++; |
410 | 602k | break; |
411 | 605k | } |
412 | | |
413 | 41.4M | return c; |
414 | 42.4M | } |
415 | | |
416 | | static uint PopChar( StreamIn *in ) |
417 | 4.32M | { |
418 | 4.32M | uint c = EndOfStream; |
419 | 4.32M | if ( in->pushed ) |
420 | 4.32M | { |
421 | 4.32M | assert( in->bufpos > 0 ); |
422 | 4.32M | c = in->charbuf[ --in->bufpos ]; |
423 | 4.32M | if ( in->bufpos == 0 ) |
424 | 2.82M | in->pushed = no; |
425 | | |
426 | 4.32M | if ( c == '\n' ) |
427 | 13.9k | { |
428 | 13.9k | in->curcol = 1; |
429 | 13.9k | in->curline++; |
430 | 13.9k | PopLastPos( in ); |
431 | 13.9k | return c; |
432 | 13.9k | } |
433 | 4.31M | in->curcol++; |
434 | 4.31M | PopLastPos( in ); |
435 | 4.31M | } |
436 | 4.31M | return c; |
437 | 4.32M | } |
438 | | |
439 | | void TY_(UngetChar)( uint c, StreamIn *in ) |
440 | 4.32M | { |
441 | 4.32M | if (c == EndOfStream) |
442 | 1.13k | { |
443 | | /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */ |
444 | 1.13k | return; |
445 | 1.13k | } |
446 | | |
447 | 4.32M | in->pushed = yes; |
448 | | |
449 | 4.32M | if (in->bufpos + 1 >= in->bufsize) |
450 | 115k | in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize)); |
451 | | |
452 | 4.32M | in->charbuf[(in->bufpos)++] = c; |
453 | | |
454 | 4.32M | if (c == '\n') |
455 | 13.9k | --(in->curline); |
456 | | |
457 | 4.32M | RestoreLastPos( in ); |
458 | 4.32M | } |
459 | | |
460 | | |
461 | | |
462 | | /************************ |
463 | | ** Sink |
464 | | ************************/ |
465 | | |
466 | | static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl ) |
467 | 861 | { |
468 | 861 | StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) ); |
469 | 861 | TidyClearMemory( out, sizeof(StreamOut) ); |
470 | 861 | out->encoding = encoding; |
471 | 861 | out->state = FSM_ASCII; |
472 | 861 | out->nl = nl; |
473 | 861 | return out; |
474 | 861 | } |
475 | | |
476 | | StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl ) |
477 | 0 | { |
478 | 0 | StreamOut* out = initStreamOut( doc, encoding, nl ); |
479 | 0 | TY_(initFileSink)( &out->sink, fp ); |
480 | 0 | out->iotype = FileIO; |
481 | 0 | return out; |
482 | 0 | } |
483 | | StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl ) |
484 | 861 | { |
485 | 861 | StreamOut* out = initStreamOut( doc, encoding, nl ); |
486 | 861 | tidyInitOutputBuffer( &out->sink, buf ); |
487 | 861 | out->iotype = BufferIO; |
488 | 861 | return out; |
489 | 861 | } |
490 | | StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl ) |
491 | 0 | { |
492 | 0 | StreamOut* out = initStreamOut( doc, encoding, nl ); |
493 | 0 | memcpy( &out->sink, sink, sizeof(TidyOutputSink) ); |
494 | 0 | out->iotype = UserIO; |
495 | 0 | return out; |
496 | 0 | } |
497 | | |
498 | | void TY_(WriteChar)( uint c, StreamOut* out ) |
499 | 1.65M | { |
500 | | /* Translate outgoing newlines */ |
501 | 1.65M | if ( LF == c ) |
502 | 1.65M | { |
503 | 1.65M | if ( out->nl == TidyCRLF ) |
504 | 0 | TY_(WriteChar)( CR, out ); |
505 | 1.65M | else if ( out->nl == TidyCR ) |
506 | 0 | c = CR; |
507 | 1.65M | } |
508 | | |
509 | 1.65M | if (out->encoding == MACROMAN) |
510 | 0 | { |
511 | 0 | EncodeMacRoman( c, out ); |
512 | 0 | } |
513 | 1.65M | else if (out->encoding == WIN1252) |
514 | 0 | { |
515 | 0 | EncodeWin1252( c, out ); |
516 | 0 | } |
517 | 1.65M | else if (out->encoding == IBM858) |
518 | 0 | { |
519 | 0 | EncodeIbm858( c, out ); |
520 | 0 | } |
521 | 1.65M | else if (out->encoding == LATIN0) |
522 | 0 | { |
523 | 0 | EncodeLatin0( c, out ); |
524 | 0 | } |
525 | | |
526 | 1.65M | else if (out->encoding == UTF8) |
527 | 1.65M | { |
528 | 1.65M | int count = 0; |
529 | | |
530 | 1.65M | TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count ); |
531 | 1.65M | if (count <= 0) |
532 | 0 | { |
533 | | /* replacement char 0xFFFD encoded as UTF-8 */ |
534 | 0 | PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out); |
535 | 0 | } |
536 | 1.65M | } |
537 | 0 | #ifndef NO_NATIVE_ISO2022_SUPPORT |
538 | 0 | else if (out->encoding == ISO2022) |
539 | 0 | { |
540 | 0 | if (c == 0x1b) /* ESC */ |
541 | 0 | out->state = FSM_ESC; |
542 | 0 | else |
543 | 0 | { |
544 | 0 | switch (out->state) |
545 | 0 | { |
546 | 0 | case FSM_ESC: |
547 | 0 | if (c == '$') |
548 | 0 | out->state = FSM_ESCD; |
549 | 0 | else if (c == '(') |
550 | 0 | out->state = FSM_ESCP; |
551 | 0 | else |
552 | 0 | out->state = FSM_ASCII; |
553 | 0 | break; |
554 | | |
555 | 0 | case FSM_ESCD: |
556 | 0 | if (c == '(') |
557 | 0 | out->state = FSM_ESCDP; |
558 | 0 | else |
559 | 0 | out->state = FSM_NONASCII; |
560 | 0 | break; |
561 | | |
562 | 0 | case FSM_ESCDP: |
563 | 0 | out->state = FSM_NONASCII; |
564 | 0 | break; |
565 | | |
566 | 0 | case FSM_ESCP: |
567 | 0 | out->state = FSM_ASCII; |
568 | 0 | break; |
569 | | |
570 | 0 | case FSM_NONASCII: |
571 | 0 | c &= 0x7F; |
572 | 0 | break; |
573 | | |
574 | 0 | case FSM_ASCII: |
575 | 0 | break; |
576 | 0 | } |
577 | 0 | } |
578 | | |
579 | 0 | PutByte(c, out); |
580 | 0 | } |
581 | 0 | #endif /* NO_NATIVE_ISO2022_SUPPORT */ |
582 | | |
583 | 0 | else if ( out->encoding == UTF16LE || |
584 | 0 | out->encoding == UTF16BE || |
585 | 0 | out->encoding == UTF16 ) |
586 | 0 | { |
587 | 0 | int i, numChars = 1; |
588 | 0 | uint theChars[2]; |
589 | | |
590 | 0 | if ( !TY_(IsValidUTF16FromUCS4)(c) ) |
591 | 0 | { |
592 | | /* invalid UTF-16 value */ |
593 | 0 | numChars = 0; |
594 | 0 | } |
595 | 0 | else if ( TY_(IsCombinedChar)(c) ) |
596 | 0 | { |
597 | | /* output both, unless something goes wrong */ |
598 | 0 | numChars = 2; |
599 | 0 | if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) ) |
600 | 0 | { |
601 | 0 | numChars = 0; |
602 | 0 | } |
603 | 0 | } |
604 | 0 | else |
605 | 0 | { |
606 | | /* just put the char out */ |
607 | 0 | theChars[0] = c; |
608 | 0 | } |
609 | | |
610 | 0 | for (i = 0; i < numChars; i++) |
611 | 0 | { |
612 | 0 | c = theChars[i]; |
613 | | |
614 | 0 | if (out->encoding == UTF16LE) |
615 | 0 | { |
616 | 0 | uint ch = c & 0xFF; PutByte(ch, out); |
617 | 0 | ch = (c >> 8) & 0xFF; PutByte(ch, out); |
618 | 0 | } |
619 | | |
620 | 0 | else if (out->encoding == UTF16BE || out->encoding == UTF16) |
621 | 0 | { |
622 | 0 | uint ch = (c >> 8) & 0xFF; PutByte(ch, out); |
623 | 0 | ch = c & 0xFF; PutByte(ch, out); |
624 | 0 | } |
625 | 0 | } |
626 | 0 | } |
627 | 0 | else if (out->encoding == BIG5 || out->encoding == SHIFTJIS) |
628 | 0 | { |
629 | 0 | if (c < 128) |
630 | 0 | PutByte(c, out); |
631 | 0 | else |
632 | 0 | { |
633 | 0 | uint ch = (c >> 8) & 0xFF; PutByte(ch, out); |
634 | 0 | ch = c & 0xFF; PutByte(ch, out); |
635 | 0 | } |
636 | 0 | } |
637 | 0 | else |
638 | 0 | PutByte( c, out ); |
639 | 1.65M | } |
640 | | |
641 | | |
642 | | |
643 | | /**************************** |
644 | | ** Miscellaneous / Helpers |
645 | | ****************************/ |
646 | | |
/* Mapping for Windows Western character set CP 1252
** (chars 128-159/U+0080-U+009F) to Unicode.
** Indexed by (byte - 128).  An entry of 0x0000 marks a byte with no
** mapping in this table (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
*/
static const uint Win2Unicode[32] =
{
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
};
657 | | |
658 | | /* Function for conversion from Windows-1252 to Unicode */ |
659 | | uint TY_(DecodeWin1252)(uint c) |
660 | 3.71k | { |
661 | 3.71k | if (127 < c && c < 160) |
662 | 3.71k | c = Win2Unicode[c - 128]; |
663 | | |
664 | 3.71k | return c; |
665 | 3.71k | } |
666 | | |
667 | | static void EncodeWin1252( uint c, StreamOut* out ) |
668 | 0 | { |
669 | 0 | if (c < 128 || (c > 159 && c < 256)) |
670 | 0 | PutByte(c, out); |
671 | 0 | else |
672 | 0 | { |
673 | 0 | int i; |
674 | |
|
675 | 0 | for (i = 128; i < 160; i++) |
676 | 0 | if (Win2Unicode[i - 128] == c) |
677 | 0 | { |
678 | 0 | PutByte(i, out); |
679 | 0 | break; |
680 | 0 | } |
681 | 0 | } |
682 | 0 | } |
683 | | |
/*
   John Love-Jensen contributed this table for mapping MacRoman
   character set to Unicode
*/

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
/* Indexed by (byte - 128); each group of two rows below covers 16 bytes. */
static const uint Mac2Unicode[128] =
{
    /* x7F = DEL */

    /* 0x80 - 0x8F */
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

    /* 0x90 - 0x9F */
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

    /* 0xA0 - 0xAF */
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

    /* 0xB0 - 0xBF */
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
                                            /* =BD U+2126 OHM SIGN */
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

    /* 0xC0 - 0xCF */
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

    /* 0xD0 - 0xDF */
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
                            /* =DB U+00A4 CURRENCY SIGN */
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

    /* 0xE0 - 0xEF */
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,

    /* 0xF0 - 0xFF */
    /* xF0 = Apple Logo */
    /* =F0 U+2665 BLACK HEART SUIT */
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
721 | | |
722 | | /* Function to convert from MacRoman to Unicode */ |
723 | | uint TY_(DecodeMacRoman)(uint c) |
724 | 0 | { |
725 | 0 | if (127 < c && c < 256) /* Is. #891 */ |
726 | 0 | c = Mac2Unicode[c - 128]; |
727 | 0 | return c; |
728 | 0 | } |
729 | | |
730 | | static void EncodeMacRoman( uint c, StreamOut* out ) |
731 | 0 | { |
732 | 0 | if (c < 128) |
733 | 0 | PutByte(c, out); |
734 | 0 | else |
735 | 0 | { |
736 | | /* For mac users, map Unicode back to MacRoman. */ |
737 | 0 | int i; |
738 | 0 | for (i = 128; i < 256; i++) |
739 | 0 | { |
740 | 0 | if (Mac2Unicode[i - 128] == c) |
741 | 0 | { |
742 | 0 | PutByte(i, out); |
743 | 0 | break; |
744 | 0 | } |
745 | 0 | } |
746 | 0 | } |
747 | 0 | } |
748 | | |
/* Mapping for OS/2 Western character set CP 850
** (chars 128-255) to Unicode.
** Indexed by (byte - 128).  Note that the entry for byte 0xD5 is
** U+20AC EURO SIGN, i.e. the table is the IBM858 variant (IBM850 + Euro)
** that EncodeIbm858 refers to.
*/
static const uint IBM2Unicode[128] =
{
    /* 0x80 - 0x8F */
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
    /* 0x90 - 0x9F */
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
    /* 0xA0 - 0xAF */
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
    /* 0xB0 - 0xBF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
    /* 0xC0 - 0xCF */
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
    /* 0xD0 - 0xDF */
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
    /* 0xE0 - 0xEF */
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
    /* 0xF0 - 0xFF */
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
};
771 | | |
772 | | /* Function for conversion from OS/2-850 to Unicode */ |
773 | | static uint DecodeIbm850(uint c) |
774 | 0 | { |
775 | 0 | if (127 < c && c < 256) |
776 | 0 | c = IBM2Unicode[c - 128]; |
777 | |
|
778 | 0 | return c; |
779 | 0 | } |
780 | | |
781 | | /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */ |
782 | | static void EncodeIbm858( uint c, StreamOut* out ) |
783 | 0 | { |
784 | 0 | if (c < 128) |
785 | 0 | PutByte(c, out); |
786 | 0 | else |
787 | 0 | { |
788 | 0 | int i; |
789 | 0 | for (i = 128; i < 256; i++) |
790 | 0 | { |
791 | 0 | if (IBM2Unicode[i - 128] == c) |
792 | 0 | { |
793 | 0 | PutByte(i, out); |
794 | 0 | break; |
795 | 0 | } |
796 | 0 | } |
797 | 0 | } |
798 | 0 | } |
799 | | |
800 | | |
801 | | /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */ |
802 | | static uint DecodeLatin0(uint c) |
803 | 0 | { |
804 | 0 | if (163 < c && c < 191) |
805 | 0 | { |
806 | 0 | switch (c) |
807 | 0 | { |
808 | 0 | case 0xA4: c = 0x20AC; break; |
809 | 0 | case 0xA6: c = 0x0160; break; |
810 | 0 | case 0xA8: c = 0x0161; break; |
811 | 0 | case 0xB4: c = 0x017D; break; |
812 | 0 | case 0xB8: c = 0x017E; break; |
813 | 0 | case 0xBC: c = 0x0152; break; |
814 | 0 | case 0xBD: c = 0x0153; break; |
815 | 0 | case 0xBE: c = 0x0178; break; |
816 | 0 | } |
817 | 0 | } |
818 | 0 | return c; |
819 | 0 | } |
820 | | |
821 | | /* Map Unicode back to ISO-8859-15. */ |
822 | | static void EncodeLatin0( uint c, StreamOut* out ) |
823 | 0 | { |
824 | 0 | switch (c) |
825 | 0 | { |
826 | 0 | case 0x20AC: c = 0xA4; break; |
827 | 0 | case 0x0160: c = 0xA6; break; |
828 | 0 | case 0x0161: c = 0xA8; break; |
829 | 0 | case 0x017D: c = 0xB4; break; |
830 | 0 | case 0x017E: c = 0xB8; break; |
831 | 0 | case 0x0152: c = 0xBC; break; |
832 | 0 | case 0x0153: c = 0xBD; break; |
833 | 0 | case 0x0178: c = 0xBE; break; |
834 | 0 | } |
835 | 0 | PutByte(c, out); |
836 | 0 | } |
837 | | |
838 | | /* Facilitates user defined source by providing |
839 | | ** an entry point to marshal pointers-to-functions. |
840 | | ** Needed by .NET and possibly other language bindings. |
841 | | */ |
842 | | Bool TIDY_CALL tidyInitSource( TidyInputSource* source, |
843 | | void* srcData, |
844 | | TidyGetByteFunc gbFunc, |
845 | | TidyUngetByteFunc ugbFunc, |
846 | | TidyEOFFunc endFunc ) |
847 | 0 | { |
848 | 0 | Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc ); |
849 | |
|
850 | 0 | if ( status ) |
851 | 0 | { |
852 | 0 | source->sourceData = srcData; |
853 | 0 | source->getByte = gbFunc; |
854 | 0 | source->ungetByte = ugbFunc; |
855 | 0 | source->eof = endFunc; |
856 | 0 | } |
857 | |
|
858 | 0 | return status; |
859 | 0 | } |
860 | | |
861 | | Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink, |
862 | | void* snkData, |
863 | | TidyPutByteFunc pbFunc ) |
864 | 0 | { |
865 | 0 | Bool status = ( sink && snkData && pbFunc ); |
866 | 0 | if ( status ) |
867 | 0 | { |
868 | 0 | sink->sinkData = snkData; |
869 | 0 | sink->putByte = pbFunc; |
870 | 0 | } |
871 | 0 | return status; |
872 | 0 | } |
873 | | |
874 | | /* GetByte must return a byte value in a signed |
875 | | ** integer so that a negative value can signal EOF |
876 | | ** without interfering w/ 0-255 legitimate byte values. |
877 | | */ |
878 | | uint TIDY_CALL tidyGetByte( TidyInputSource* source ) |
879 | 47.2M | { |
880 | 47.2M | int bv = source->getByte( source->sourceData ); |
881 | 47.2M | return (uint) bv; |
882 | 47.2M | } |
883 | | Bool TIDY_CALL tidyIsEOF( TidyInputSource* source ) |
884 | 47.8M | { |
885 | 47.8M | return source->eof( source->sourceData ); |
886 | 47.8M | } |
887 | | void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch ) |
888 | 2.42k | { |
889 | 2.42k | source->ungetByte( source->sourceData, (byte) ch ); |
890 | 2.42k | } |
891 | | void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch ) |
892 | 0 | { |
893 | 0 | sink->putByte( sink->sinkData, (byte) ch ); |
894 | 0 | } |
895 | | |
896 | | static uint ReadByte( StreamIn* in ) |
897 | 47.2M | { |
898 | 47.2M | return tidyGetByte( &in->source ); |
899 | 47.2M | } |
900 | | Bool TY_(IsEOF)( StreamIn* in ) |
901 | 47.8M | { |
902 | 47.8M | return tidyIsEOF( &in->source ); |
903 | 47.8M | } |
904 | | static void UngetByte( StreamIn* in, uint byteValue ) |
905 | 2.42k | { |
906 | 2.42k | tidyUngetByte( &in->source, byteValue ); |
907 | 2.42k | } |
908 | | static void PutByte( uint byteValue, StreamOut* out ) |
909 | 0 | { |
910 | 0 | tidyPutByte( &out->sink, byteValue ); |
911 | 0 | } |
912 | | |
913 | | /* read char from stream */ |
914 | | static uint ReadCharFromStream( StreamIn* in ) |
915 | 47.5M | { |
916 | 47.5M | uint c, n; |
917 | | |
918 | 47.5M | if ( TY_(IsEOF)(in) ) |
919 | 1.00M | return EndOfStream; |
920 | | |
921 | 46.5M | c = ReadByte( in ); |
922 | | |
923 | 46.5M | if (c == EndOfStream) |
924 | 0 | return c; |
925 | | |
926 | 46.5M | #ifndef NO_NATIVE_ISO2022_SUPPORT |
927 | | /* |
928 | | A document in ISO-2022 based encoding uses some ESC sequences |
929 | | called "designator" to switch character sets. The designators |
930 | | defined and used in ISO-2022-JP are: |
931 | | |
932 | | "ESC" + "(" + ? for ISO646 variants |
933 | | |
934 | | "ESC" + "$" + ? and |
935 | | "ESC" + "$" + "(" + ? for multibyte character sets |
936 | | |
937 | | Where ? stands for a single character used to indicate the |
938 | | character set for multibyte characters. |
939 | | |
940 | | Tidy handles this by preserving the escape sequence and |
941 | | setting the top bit of each byte for non-ascii chars. This |
942 | | bit is then cleared on output. The input stream keeps track |
943 | | of the state to determine when to set/clear the bit. |
944 | | */ |
945 | | |
946 | 46.5M | if (in->encoding == ISO2022) |
947 | 0 | { |
948 | 0 | if (c == 0x1b) /* ESC */ |
949 | 0 | { |
950 | 0 | in->state = FSM_ESC; |
951 | 0 | return c; |
952 | 0 | } |
953 | | |
954 | 0 | switch (in->state) |
955 | 0 | { |
956 | 0 | case FSM_ESC: |
957 | 0 | if (c == '$') |
958 | 0 | in->state = FSM_ESCD; |
959 | 0 | else if (c == '(') |
960 | 0 | in->state = FSM_ESCP; |
961 | 0 | else |
962 | 0 | in->state = FSM_ASCII; |
963 | 0 | break; |
964 | | |
965 | 0 | case FSM_ESCD: |
966 | 0 | if (c == '(') |
967 | 0 | in->state = FSM_ESCDP; |
968 | 0 | else |
969 | 0 | in->state = FSM_NONASCII; |
970 | 0 | break; |
971 | | |
972 | 0 | case FSM_ESCDP: |
973 | 0 | in->state = FSM_NONASCII; |
974 | 0 | break; |
975 | | |
976 | 0 | case FSM_ESCP: |
977 | 0 | in->state = FSM_ASCII; |
978 | 0 | break; |
979 | | |
980 | 0 | case FSM_NONASCII: |
981 | 0 | c |= 0x80; |
982 | 0 | break; |
983 | | |
984 | 0 | case FSM_ASCII: |
985 | 0 | break; |
986 | 0 | } |
987 | | |
988 | 0 | return c; |
989 | 0 | } |
990 | 46.5M | #endif /* NO_NATIVE_ISO2022_SUPPORT */ |
991 | | |
992 | 46.5M | if ( in->encoding == UTF16LE ) |
993 | 283k | { |
994 | 283k | uint c1 = ReadByte( in ); |
995 | 283k | if ( EndOfStream == c1 ) |
996 | 0 | return EndOfStream; |
997 | 283k | n = (c1 << 8) + c; |
998 | 283k | return n; |
999 | 283k | } |
1000 | | |
1001 | 46.2M | if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */ |
1002 | 400k | { |
1003 | 400k | uint c1 = ReadByte( in ); |
1004 | 400k | if ( EndOfStream == c1 ) |
1005 | 0 | return EndOfStream; |
1006 | 400k | n = (c << 8) + c1; |
1007 | 400k | return n; |
1008 | 400k | } |
1009 | | |
1010 | 45.8M | if ( in->encoding == UTF8 ) |
1011 | 45.8M | { |
1012 | | /* deal with UTF-8 encoded char */ |
1013 | | |
1014 | 45.8M | int err, count = 0; |
1015 | | |
1016 | | /* first byte "c" is passed in separately */ |
1017 | 45.8M | err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count ); |
1018 | 45.8M | if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */ |
1019 | 0 | return EndOfStream; |
1020 | 45.8M | else if (err) |
1021 | 3.30M | { |
1022 | | /* set error position just before offending character */ |
1023 | 3.30M | in->doc->lexer->lines = in->curline; |
1024 | 3.30M | in->doc->lexer->columns = in->curcol; |
1025 | | |
1026 | 3.30M | TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no); |
1027 | 3.30M | n = 0xFFFD; /* replacement char */ |
1028 | 3.30M | } |
1029 | | |
1030 | 45.8M | return n; |
1031 | 45.8M | } |
1032 | | |
1033 | | /* |
1034 | | This section is suitable for any "multibyte" variable-width |
1035 | | character encoding in which a one-byte code is less than |
1036 | | 128, and the first byte of a two-byte code is greater or |
1037 | | equal to 128. Note that Big5 and ShiftJIS fit into this |
1038 | | kind, even though their second byte may be less than 128 |
1039 | | */ |
1040 | 0 | if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS)) |
1041 | 0 | { |
1042 | 0 | if (c < 128) |
1043 | 0 | return c; |
1044 | 0 | else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */ |
1045 | 0 | { |
1046 | | /* |
1047 | | Rick Cameron pointed out that for Shift_JIS, the values from |
1048 | | 0xa1 through 0xdf represent single-byte characters |
1049 | | (U+FF61 to U+FF9F - half-width Katakana) |
1050 | | */ |
1051 | 0 | return c; |
1052 | 0 | } |
1053 | 0 | else |
1054 | 0 | { |
1055 | 0 | uint c1 = ReadByte( in ); |
1056 | 0 | if ( EndOfStream == c1 ) |
1057 | 0 | return EndOfStream; |
1058 | 0 | n = (c << 8) + c1; |
1059 | 0 | return n; |
1060 | 0 | } |
1061 | 0 | } |
1062 | 0 | else |
1063 | 0 | n = c; |
1064 | | |
1065 | 0 | return n; |
1066 | 0 | } |
1067 | | |
1068 | | /* Output a Byte Order Mark if required */ |
1069 | | void TY_(outBOM)( StreamOut *out ) |
1070 | 0 | { |
1071 | 0 | if ( out->encoding == UTF8 |
1072 | 0 | || out->encoding == UTF16LE |
1073 | 0 | || out->encoding == UTF16BE |
1074 | 0 | || out->encoding == UTF16 |
1075 | 0 | ) |
1076 | 0 | { |
1077 | | /* this will take care of encoding the BOM correctly */ |
1078 | 0 | TY_(WriteChar)( UNICODE_BOM, out ); |
1079 | 0 | } |
1080 | 0 | } |
1081 | | |
1082 | | /* this is in intermediate fix for various problems in the */ |
1083 | | /* long term code and data in charsets.c should be used */ |
1084 | | static struct _enc2iana |
1085 | | { |
1086 | | uint id; |
1087 | | ctmbstr name; |
1088 | | ctmbstr tidyOptName; |
1089 | | } const enc2iana[] = |
1090 | | { |
1091 | | { ASCII, "us-ascii", "ascii" }, |
1092 | | { LATIN0, "iso-8859-15", "latin0" }, |
1093 | | { LATIN1, "iso-8859-1", "latin1" }, |
1094 | | { UTF8, "utf-8", "utf8" }, |
1095 | | { MACROMAN, "macintosh", "mac" }, |
1096 | | { WIN1252, "windows-1252", "win1252" }, |
1097 | | { IBM858, "ibm00858", "ibm858" }, |
1098 | | { UTF16LE, "utf-16", "utf16le" }, |
1099 | | { UTF16BE, "utf-16", "utf16be" }, |
1100 | | { UTF16, "utf-16", "utf16" }, |
1101 | | { BIG5, "big5", "big5" }, |
1102 | | { SHIFTJIS, "shift_jis", "shiftjis"}, |
1103 | | #ifndef NO_NATIVE_ISO2022_SUPPORT |
1104 | | { ISO2022, NULL, "iso2022" }, |
1105 | | #endif |
1106 | | { RAW, NULL, "raw" } |
1107 | | }; |
1108 | | |
1109 | | ctmbstr TY_(GetEncodingNameFromTidyId)(uint id) |
1110 | 106 | { |
1111 | 106 | uint i; |
1112 | | |
1113 | 665 | for (i = 0; enc2iana[i].name; ++i) |
1114 | 665 | if (enc2iana[i].id == id) |
1115 | 106 | return enc2iana[i].name; |
1116 | | |
1117 | 0 | return NULL; |
1118 | 106 | } |
1119 | | |
1120 | | ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id) |
1121 | 0 | { |
1122 | 0 | uint i; |
1123 | |
|
1124 | 0 | for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) |
1125 | 0 | if (enc2iana[i].id == id) |
1126 | 0 | return enc2iana[i].tidyOptName; |
1127 | | |
1128 | 0 | return NULL; |
1129 | 0 | } |
1130 | | |
1131 | | int TY_(GetCharEncodingFromOptName)( ctmbstr charenc ) |
1132 | 0 | { |
1133 | 0 | uint i; |
1134 | |
|
1135 | 0 | for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) |
1136 | 0 | if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 ) |
1137 | 0 | return enc2iana[i].id; |
1138 | | |
1139 | 0 | return -1; |
1140 | 0 | } |
1141 | | |
1142 | | /* |
1143 | | * local variables: |
1144 | | * mode: c |
1145 | | * indent-tabs-mode: nil |
1146 | | * c-basic-offset: 4 |
1147 | | * eval: (c-set-offset 'substatement-open 0) |
1148 | | * end: |
1149 | | */ |