/src/tidy-html5/src/utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* utf8.c -- convert characters to/from UTF-8 |
2 | | |
3 | | (c) 1998-2007 (W3C) MIT, ERCIM, Keio University |
4 | | See tidy.h for the copyright notice. |
5 | | |
6 | | Uses public interfaces to abstract input source and output |
7 | | sink, which may be user supplied or either FILE* or memory |
8 | | based Tidy implementations. Encoding support is uniform |
9 | | regardless of I/O mechanism. |
10 | | |
11 | | Note, UTF-8 encoding, by itself, does not affect the actual |
12 | | "codepoints" of the underlying character encoding. In the |
13 | | cases of ASCII, Latin1, Unicode (16-bit, BMP), these all |
14 | | refer to ISO-10646 "codepoints". For anything else, they |
15 | | refer to some other "codepoint" set. |
16 | | |
17 | | Put another way, UTF-8 is a variable length method to |
18 | | represent any non-negative integer value. The glyph |
19 | | that an integer value represents is unchanged and defined |
20 | | externally (e.g. by ISO-10646, Big5, Win1252, MacRoman, |
21 | | Latin2-9, and so on). |
22 | | |
23 | | Put still another way, UTF-8 is more of a _transfer_ encoding |
24 | | than a _character_ encoding, per se. |
25 | | */ |
26 | | |
27 | | #include "tidy.h" |
28 | | #include "forward.h" |
29 | | #include "utf8.h" |
30 | | |
31 | | /* |
32 | | UTF-8 encoding/decoding functions |
33 | | Return 0 on success, -1 if illegal sequence; # of bytes in UTF-8 sequence is stored in *count |
34 | | |
35 | | Also see below for UTF-16 encoding/decoding functions |
36 | | |
37 | | References : |
38 | | |
39 | | 1) UCS Transformation Format 8 (UTF-8): |
40 | | ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D |
41 | | <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335> |
42 | | <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html> |
43 | | |
44 | | Table 4 - Mapping from UCS-4 to UTF-8 |
45 | | |
46 | | 2) Unicode standards: |
47 | | <https://www.unicode.org/standard/standard.html> |
48 | | |
49 | | 3) Legal UTF-8 byte sequences: |
50 | | <https://www.unicode.org/versions/corrigendum1.html> |
51 | | |
52 | | Code point 1st byte 2nd byte 3rd byte 4th byte |
53 | | ---------- -------- -------- -------- -------- |
54 | | U+0000..U+007F 00..7F |
55 | | U+0080..U+07FF C2..DF 80..BF |
56 | | U+0800..U+0FFF E0 A0..BF 80..BF |
57 | | U+1000..U+FFFF E1..EF 80..BF 80..BF |
58 | | U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
59 | | U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
60 | | U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
61 | | |
62 | | The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also |
63 | | allows for the use of five- and six-byte sequences to encode |
64 | | characters that are outside the range of the Unicode character |
65 | | set; those five- and six-byte sequences are illegal for the use |
66 | | of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646 |
67 | | does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF |
68 | | (but it does allow other noncharacters). |
69 | | |
70 | | 4) RFC 2279: UTF-8, a transformation format of ISO 10646: |
71 | | <http://www.ietf.org/rfc/rfc2279.txt> |
72 | | |
73 | | 5) UTF-8 and Unicode FAQ: |
74 | | <http://www.cl.cam.ac.uk/~mgk25/unicode.html> |
75 | | |
76 | | 6) Markus Kuhn's UTF-8 decoder stress test file: |
77 | | <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> |
78 | | |
79 | | 7) UTF-8 Demo: |
80 | | <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt> |
81 | | |
82 | | 8) UTF-8 Sampler: |
83 | | <http://www.columbia.edu/kermit/utf8.html> |
84 | | |
85 | | 9) Transformation Format for 16 Planes of Group 00 (UTF-16): |
86 | | ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C |
87 | | <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf> |
88 | | <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html> |
89 | | |
90 | | 10) RFC 2781: UTF-16, an encoding of ISO 10646: |
91 | | <http://www.ietf.org/rfc/rfc2781.txt> |
92 | | |
93 | | 11) UTF-16 invalid surrogate pairs: |
94 | | <https://www.unicode.org/faq/utf_bom.html#16> |
95 | | |
96 | | UTF-16 UTF-8 UCS-4 |
97 | | D83F DFF* F0 9F BF B* 0001FFF* |
98 | | D87F DFF* F0 AF BF B* 0002FFF* |
99 | | D8BF DFF* F0 BF BF B* 0003FFF* |
100 | | D8FF DFF* F1 8F BF B* 0004FFF* |
101 | | D93F DFF* F1 9F BF B* 0005FFF* |
102 | | D97F DFF* F1 AF BF B* 0006FFF* |
103 | | ... |
104 | | DBBF DFF* F3 BF BF B* 000FFFF* |
105 | | DBFF DFF* F4 8F BF B* 0010FFF* |
106 | | |
107 | | * = E or F |
108 | | |
109 | | 1010 A |
110 | | 1011 B |
111 | | 1100 C |
112 | | 1101 D |
113 | | 1110 E |
114 | | 1111 F |
115 | | |
116 | | */ |
117 | | |
/* Number of rows in the validUTF8 table, and the number of distinct
** sequence lengths (1..4 bytes) that table describes.
*/
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values that are rejected by both the decoder and the encoder
** even though they fit in three UTF-8 bytes.
*/
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value accepted as the result of a UTF-8 conversion */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values at or above kUTF16SurrogatesBegin are treated as "combined"
** characters, i.e. ones that are split into a surrogate pair;
** kMaxUTF16FromUCS4 is the largest value IsValidUTF16FromUCS4 accepts.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas
**
** NOTE: the naming used here is the reverse of the Unicode Standard's
** terminology, where 0xD800..0xDBFF are the "high" (leading) surrogates
** and 0xDC00..0xDFFF are the "low" (trailing) surrogates. In this file
** "Low" names the 0xD800..0xDBFF range, which carries the upper bits of
** the combined value, and "High" names the 0xDC00..0xDFFF range.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
134 | | |
135 | | |
/* offsets into validUTF8 table below
**
** Indexed by (sequence length in bytes - 1): entry k is the index of the
** first validUTF8 row that describes sequences of k+1 bytes. The decoder
** reads entries [bytes-1] and [bytes] to obtain the inclusive row range
** lo..hi for a given length, which is why the array carries one extra,
** terminating entry equal to kNumUTF8Sequences.
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
    0, /* 1 byte */
    1, /* 2 bytes */
    2, /* 3 bytes */
    4, /* 4 bytes */
    kNumUTF8Sequences /* must be last */
};
145 | | |
/* Table of legal UTF-8 byte sequences, one row per code point range.
**
**   lowChar, highChar - inclusive range of values the row encodes; the
**                       decoder uses these to reject overlong sequences
**   numBytes          - length of the sequences in this row (not read by
**                       the code visible in this file; rows are located
**                       through offsetUTF8Sequences instead)
**   validBytes        - (min, max) pairs, one pair per byte position:
**                       validBytes[2*k] and validBytes[2*k + 1] bound
**                       the k-th byte of the sequence; unused positions
**                       are zero
**
** Rows must stay grouped by numBytes in ascending order so that the
** indices in offsetUTF8Sequences remain correct.
*/
static const struct validUTF8Sequence
{
    uint lowChar;
    uint highChar;
    int numBytes;
    byte validBytes[8];
} validUTF8[kNumUTF8Sequences] =
{
    /* low       high      #bytes  byte 1      byte 2      byte 3      byte 4 */
    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
};
163 | | |
164 | | int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes, |
165 | | TidyInputSource* inp, int* count ) |
166 | 24.0M | { |
167 | 24.0M | byte tempbuf[10]; |
168 | 24.0M | byte *buf = &tempbuf[0]; |
169 | 24.0M | uint ch = 0, n = 0; |
170 | 24.0M | int i, bytes = 0; |
171 | 24.0M | Bool hasError = no; |
172 | | |
173 | 24.0M | if ( successorBytes ) |
174 | 31.4k | buf = (byte*) successorBytes; |
175 | | |
176 | | /* special check if we have been passed an EOF char */ |
177 | 24.0M | if ( firstByte == EndOfStream ) |
178 | 0 | { |
179 | | /* at present */ |
180 | 0 | *c = firstByte; |
181 | 0 | *count = 1; |
182 | 0 | return 0; |
183 | 0 | } |
184 | | |
185 | 24.0M | ch = firstByte; /* first byte is passed in separately */ |
186 | | |
187 | 24.0M | if (ch <= 0x7F) /* 0XXX XXXX one byte */ |
188 | 23.7M | { |
189 | 23.7M | n = ch; |
190 | 23.7M | bytes = 1; |
191 | 23.7M | } |
192 | 305k | else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */ |
193 | 15.1k | { |
194 | 15.1k | n = ch & 31; |
195 | 15.1k | bytes = 2; |
196 | 15.1k | } |
197 | 290k | else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */ |
198 | 54.9k | { |
199 | 54.9k | n = ch & 15; |
200 | 54.9k | bytes = 3; |
201 | 54.9k | } |
202 | 235k | else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */ |
203 | 8.79k | { |
204 | 8.79k | n = ch & 7; |
205 | 8.79k | bytes = 4; |
206 | 8.79k | } |
207 | 226k | else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */ |
208 | 1.20k | { |
209 | 1.20k | n = ch & 3; |
210 | 1.20k | bytes = 5; |
211 | 1.20k | hasError = yes; |
212 | 1.20k | } |
213 | 225k | else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */ |
214 | 198 | { |
215 | 198 | n = ch & 1; |
216 | 198 | bytes = 6; |
217 | 198 | hasError = yes; |
218 | 198 | } |
219 | 225k | else |
220 | 225k | { |
221 | | /* not a valid first byte of a UTF-8 sequence */ |
222 | 225k | n = ch; |
223 | 225k | bytes = 1; |
224 | 225k | hasError = yes; |
225 | 225k | } |
226 | | |
227 | | /* successor bytes should have the form 10XX XXXX */ |
228 | | |
229 | | /* If caller supplied buffer, use it. Else see if caller |
230 | | ** supplied an input source, use that. |
231 | | */ |
232 | 24.0M | if ( successorBytes ) |
233 | 31.4k | { |
234 | 81.9k | for ( i=0; i < bytes-1; ++i ) |
235 | 53.3k | { |
236 | 53.3k | if ( !buf[i] || (buf[i] & 0xC0) != 0x80 ) |
237 | 2.80k | { |
238 | 2.80k | hasError = yes; |
239 | 2.80k | bytes = i+1; |
240 | 2.80k | break; |
241 | 2.80k | } |
242 | 50.5k | n = (n << 6) | (buf[i] & 0x3F); |
243 | 50.5k | } |
244 | 31.4k | } |
245 | 24.0M | else if ( inp ) |
246 | 24.0M | { |
247 | 24.1M | for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i ) |
248 | 70.8k | { |
249 | 70.8k | int b = inp->getByte( inp->sourceData ); |
250 | 70.8k | buf[i] = (tmbchar) b; |
251 | | |
252 | | /* End of data or illegal successor byte value */ |
253 | 70.8k | if ( b == EOF || (buf[i] & 0xC0) != 0x80 ) |
254 | 37.1k | { |
255 | 37.1k | hasError = yes; |
256 | 37.1k | bytes = i+1; |
257 | 37.1k | if ( b != EOF ) |
258 | 37.1k | inp->ungetByte( inp->sourceData, buf[i] ); |
259 | 37.1k | break; |
260 | 37.1k | } |
261 | 33.7k | n = (n << 6) | (buf[i] & 0x3F); |
262 | 33.7k | } |
263 | 24.0M | } |
264 | 0 | else if ( bytes > 1 ) |
265 | 0 | { |
266 | 0 | hasError = yes; |
267 | 0 | bytes = 1; |
268 | 0 | } |
269 | | |
270 | 24.0M | if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar))) |
271 | 4 | hasError = yes; |
272 | | |
273 | 24.0M | if (!hasError && (n > kMaxUTF8FromUCS4)) |
274 | 14 | hasError = yes; |
275 | | |
276 | 24.0M | if (!hasError) |
277 | 23.8M | { |
278 | 23.8M | int lo, hi; |
279 | | |
280 | 23.8M | lo = offsetUTF8Sequences[bytes - 1]; |
281 | 23.8M | hi = offsetUTF8Sequences[bytes] - 1; |
282 | | |
283 | | /* check for overlong sequences */ |
284 | 23.8M | if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar)) |
285 | 648 | hasError = yes; |
286 | 23.8M | else |
287 | 23.8M | { |
288 | 23.8M | hasError = yes; /* assume error until proven otherwise */ |
289 | | |
290 | 47.7M | for (i = lo; i <= hi; i++) |
291 | 23.8M | { |
292 | 23.8M | int tempCount; |
293 | 23.8M | byte theByte; |
294 | | |
295 | 47.8M | for (tempCount = 0; tempCount < bytes; tempCount++) |
296 | 23.9M | { |
297 | 23.9M | if (!tempCount) |
298 | 23.8M | theByte = (tmbchar) firstByte; |
299 | 94.6k | else |
300 | 94.6k | theByte = buf[tempCount - 1]; |
301 | | |
302 | 23.9M | if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] && |
303 | 23.9M | theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] ) |
304 | 23.9M | hasError = no; |
305 | 23.9M | if (hasError) |
306 | 31.1k | break; |
307 | 23.9M | } |
308 | 23.8M | } |
309 | 23.8M | } |
310 | 23.8M | } |
311 | | |
312 | | #if 1 && defined(_DEBUG) |
313 | | if ( hasError ) |
314 | | { |
315 | | /* debug */ |
316 | | fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes ); |
317 | | fprintf( stderr, "0x%02x ", firstByte ); |
318 | | for (i = 1; i < bytes; i++) |
319 | | fprintf( stderr, "0x%02x ", buf[i - 1] ); |
320 | | fprintf( stderr, " = U+%04X\n", n ); |
321 | | } |
322 | | #endif |
323 | | |
324 | 24.0M | *count = bytes; |
325 | 24.0M | *c = n; |
326 | 24.0M | if ( hasError ) |
327 | 265k | return -1; |
328 | 23.8M | return 0; |
329 | 24.0M | } |
330 | | |
331 | | int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf, |
332 | | TidyOutputSink* outp, int* count ) |
333 | 229M | { |
334 | 229M | byte tempbuf[10] = {0}; |
335 | 229M | byte* buf = &tempbuf[0]; |
336 | 229M | int bytes = 0; |
337 | 229M | Bool hasError = no; |
338 | | |
339 | 229M | if ( encodebuf ) |
340 | 125M | buf = (byte*) encodebuf; |
341 | | |
342 | 229M | if (c <= 0x7F) /* 0XXX XXXX one byte */ |
343 | 227M | { |
344 | 227M | buf[0] = (tmbchar) c; |
345 | 227M | bytes = 1; |
346 | 227M | } |
347 | 1.22M | else if (c <= 0x7FF) /* 110X XXXX two bytes */ |
348 | 2.12k | { |
349 | 2.12k | buf[0] = (tmbchar) ( 0xC0 | (c >> 6) ); |
350 | 2.12k | buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) ); |
351 | 2.12k | bytes = 2; |
352 | 2.12k | } |
353 | 1.22M | else if (c <= 0xFFFF) /* 1110 XXXX three bytes */ |
354 | 714k | { |
355 | 714k | buf[0] = (tmbchar) (0xE0 | (c >> 12)); |
356 | 714k | buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); |
357 | 714k | buf[2] = (tmbchar) (0x80 | (c & 0x3F)); |
358 | 714k | bytes = 3; |
359 | 714k | if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar ) |
360 | 966 | hasError = yes; |
361 | 714k | } |
362 | 511k | else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */ |
363 | 1.08k | { |
364 | 1.08k | buf[0] = (tmbchar) (0xF0 | (c >> 18)); |
365 | 1.08k | buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); |
366 | 1.08k | buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); |
367 | 1.08k | buf[3] = (tmbchar) (0x80 | (c & 0x3F)); |
368 | 1.08k | bytes = 4; |
369 | 1.08k | if (c > kMaxUTF8FromUCS4) |
370 | 0 | hasError = yes; |
371 | 1.08k | } |
372 | 510k | else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */ |
373 | 0 | { |
374 | 0 | buf[0] = (tmbchar) (0xF8 | (c >> 24)); |
375 | 0 | buf[1] = (tmbchar) (0x80 | (c >> 18)); |
376 | 0 | buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); |
377 | 0 | buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); |
378 | 0 | buf[4] = (tmbchar) (0x80 | (c & 0x3F)); |
379 | 0 | bytes = 5; |
380 | 0 | hasError = yes; |
381 | 0 | } |
382 | 510k | else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */ |
383 | 0 | { |
384 | 0 | buf[0] = (tmbchar) (0xFC | (c >> 30)); |
385 | 0 | buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F)); |
386 | 0 | buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F)); |
387 | 0 | buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); |
388 | 0 | buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); |
389 | 0 | buf[5] = (tmbchar) (0x80 | (c & 0x3F)); |
390 | 0 | bytes = 6; |
391 | 0 | hasError = yes; |
392 | 0 | } |
393 | 510k | else |
394 | 510k | hasError = yes; |
395 | | |
396 | | /* don't output invalid UTF-8 byte sequence to a stream */ |
397 | 229M | if ( !hasError && outp != NULL ) |
398 | 103M | { |
399 | 103M | int ix; |
400 | 207M | for ( ix=0; ix < bytes; ++ix ) |
401 | 103M | outp->putByte( outp->sinkData, buf[ix] ); |
402 | 103M | } |
403 | | |
404 | | #if 1 && defined(_DEBUG) |
405 | | if ( hasError ) |
406 | | { |
407 | | int i; |
408 | | fprintf( stderr, "UTF-8 encoding error for U+%x : ", c ); |
409 | | for (i = 0; i < bytes; i++) |
410 | | fprintf( stderr, "0x%02x ", buf[i] ); |
411 | | fprintf( stderr, "\n" ); |
412 | | } |
413 | | #endif |
414 | | |
415 | 229M | *count = bytes; |
416 | 229M | if (hasError) |
417 | 511k | return -1; |
418 | 228M | return 0; |
419 | 229M | } |
420 | | |
421 | | |
422 | | /* return one less than the number of bytes used by the UTF-8 byte sequence */ |
423 | | /* str points to the UTF-8 byte sequence */ |
424 | | /* the Unicode char is returned in *ch */ |
425 | | uint TY_(GetUTF8)( ctmbstr str, uint *ch ) |
426 | 31.4k | { |
427 | 31.4k | uint n; |
428 | 31.4k | int bytes; |
429 | | |
430 | 31.4k | int err; |
431 | | |
432 | 31.4k | bytes = 0; |
433 | | |
434 | | /* first byte "str[0]" is passed in separately from the */ |
435 | | /* rest of the UTF-8 byte sequence starting at "str[1]" */ |
436 | 31.4k | err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes ); |
437 | 31.4k | if (err) |
438 | 5.09k | { |
439 | | #if 1 && defined(_DEBUG) |
440 | | fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n); |
441 | | #endif |
442 | 5.09k | n = 0xFFFD; /* replacement char */ |
443 | 5.09k | } |
444 | | |
445 | 31.4k | *ch = n; |
446 | 31.4k | return bytes - 1; |
447 | 31.4k | } |
448 | | |
449 | | /* store char c as UTF-8 encoded byte stream */ |
450 | | tmbstr TY_(PutUTF8)( tmbstr buf, uint c ) |
451 | 0 | { |
452 | 0 | int err, count = 0; |
453 | | |
454 | 0 | err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); |
455 | 0 | if (err) |
456 | 0 | { |
457 | | #if 1 && defined(_DEBUG) |
458 | | fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c); |
459 | | #endif |
460 | | /* replacement char 0xFFFD encoded as UTF-8 */ |
461 | 0 | buf[0] = (byte) 0xEF; |
462 | 0 | buf[1] = (byte) 0xBF; |
463 | 0 | buf[2] = (byte) 0xBD; |
464 | 0 | count = 3; |
465 | 0 | } |
466 | | |
467 | 0 | buf += count; |
468 | 0 | return buf; |
469 | 0 | } |
470 | | |
471 | | Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 ) |
472 | 413k | { |
473 | 413k | return ( ucs4 <= kMaxUTF16FromUCS4 ); |
474 | 413k | } |
475 | | |
476 | | Bool TY_(IsHighSurrogate)( tchar ch ) |
477 | 339 | { |
478 | 339 | return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd ); |
479 | 339 | } |
480 | | Bool TY_(IsLowSurrogate)( tchar ch ) |
481 | 414k | { |
482 | 414k | return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd ); |
483 | 414k | } |
484 | | |
485 | | tchar TY_(CombineSurrogatePair)( tchar high, tchar low ) |
486 | 9 | { |
487 | 9 | assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) ); |
488 | 9 | return ( ((low - kUTF16LowSurrogateBegin) * 0x400) + |
489 | 9 | high - kUTF16HighSurrogateBegin + 0x10000 ); |
490 | 9 | } |
491 | | |
492 | | Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high ) |
493 | 0 | { |
494 | 0 | Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low ); |
495 | 0 | if ( status ) |
496 | 0 | { |
497 | 0 | *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin; |
498 | 0 | *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin; |
499 | 0 | } |
500 | 0 | return status; |
501 | 0 | } |
502 | | |
503 | | Bool TY_(IsValidCombinedChar)( tchar ch ) |
504 | 9 | { |
505 | 9 | return ( ch >= kUTF16SurrogatesBegin && |
506 | 9 | (ch & 0x0000FFFE) != 0x0000FFFE && |
507 | 9 | (ch & 0x0000FFFF) != 0x0000FFFF ); |
508 | 9 | } |
509 | | |
510 | | Bool TY_(IsCombinedChar)( tchar ch ) |
511 | 0 | { |
512 | 0 | return ( ch >= kUTF16SurrogatesBegin ); |
513 | 0 | } |
514 | | |
515 | | /* |
516 | | * local variables: |
517 | | * mode: c |
518 | | * indent-tabs-mode: nil |
519 | | * c-basic-offset: 4 |
520 | | * eval: (c-set-offset 'substatement-open 0) |
521 | | * end: |
522 | | */ |