Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) Ian F. Darwin 1986-1995. |
3 | | * Software written by Ian F. Darwin and others; |
4 | | * maintained 1995-present by Christos Zoulas and others. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions |
8 | | * are met: |
9 | | * 1. Redistributions of source code must retain the above copyright |
10 | | * notice immediately at the beginning of the file, without modification, |
11 | | * this list of conditions, and the following disclaimer. |
12 | | * 2. Redistributions in binary form must reproduce the above copyright |
13 | | * notice, this list of conditions and the following disclaimer in the |
14 | | * documentation and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | | * SUCH DAMAGE. |
27 | | */ |
28 | | /* |
29 | | * Encoding -- determine the character encoding of a text file. |
30 | | * |
31 | | * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit |
32 | | * international characters. |
33 | | */ |
34 | | |
35 | | #include "file.h" |
36 | | |
37 | | #ifndef lint |
38 | | FILE_RCSID("@(#)$File: encoding.c,v 1.44 2024/12/26 18:41:27 christos Exp $") |
39 | | #endif /* lint */ |
40 | | |
41 | | #include "magic.h" |
42 | | #include <string.h> |
43 | | #include <stdlib.h> |
44 | | |
45 | | |
46 | | file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *, |
47 | | size_t *); |
48 | | file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *, |
49 | | size_t *); |
50 | | file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *, |
51 | | size_t *); |
52 | | file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *, |
53 | | size_t *); |
54 | | file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *, |
55 | | size_t *); |
56 | | file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *, |
57 | | size_t *); |
58 | | file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *, |
59 | | size_t *); |
60 | | file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *); |
61 | | |
62 | | #ifdef DEBUG_ENCODING |
63 | | #define DPRINTF(a) printf a |
64 | | #else |
65 | | #define DPRINTF(a) |
66 | | #endif |
67 | | |
68 | | /* |
69 | | * Try to determine whether text is in some character code we can |
70 | | * identify. Each of these tests, if it succeeds, will leave |
71 | | * the text converted into one-file_unichar_t-per-character Unicode in |
72 | | * ubuf, and the number of characters converted in ulen. |
73 | | */ |
74 | | file_protected int |
75 | | file_encoding(struct magic_set *ms, const struct buffer *b, |
76 | | file_unichar_t **ubuf, size_t *ulen, const char **code, |
77 | | const char **code_mime, const char **type) |
78 | 56.6k | { |
79 | 56.6k | const unsigned char *buf = CAST(const unsigned char *, b->fbuf); |
80 | 56.6k | size_t nbytes = b->flen; |
81 | 56.6k | size_t mlen; |
82 | 56.6k | int rv = 1, ucs_type; |
83 | 56.6k | file_unichar_t *udefbuf; |
84 | 56.6k | size_t udeflen; |
85 | | |
86 | 56.6k | if (ubuf == NULL) |
87 | 31.1k | ubuf = &udefbuf; |
88 | 56.6k | if (ulen == NULL) |
89 | 31.1k | ulen = &udeflen; |
90 | | |
91 | 56.6k | *type = "text"; |
92 | 56.6k | *ulen = 0; |
93 | 56.6k | *code = "unknown"; |
94 | 56.6k | *code_mime = "binary"; |
95 | | |
96 | 56.6k | if (nbytes > ms->encoding_max) |
97 | 2.56k | nbytes = ms->encoding_max; |
98 | | |
99 | 56.6k | mlen = (nbytes + 1) * sizeof((*ubuf)[0]); |
100 | 56.6k | *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen)); |
101 | 56.6k | if (*ubuf == NULL) { |
102 | 0 | file_oomem(ms, mlen); |
103 | 0 | goto done; |
104 | 0 | } |
105 | 56.6k | if (looks_ascii(buf, nbytes, *ubuf, ulen)) { |
106 | 4.00k | if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { |
107 | 28 | DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); |
108 | 28 | *code = "Unicode text, UTF-7"; |
109 | 28 | *code_mime = "utf-7"; |
110 | 3.97k | } else { |
111 | 3.97k | DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); |
112 | 3.97k | *code = "ASCII"; |
113 | 3.97k | *code_mime = "us-ascii"; |
114 | 3.97k | } |
115 | 52.6k | } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { |
116 | 16 | DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); |
117 | 16 | *code = "Unicode text, UTF-8 (with BOM)"; |
118 | 16 | *code_mime = "utf-8"; |
119 | 52.6k | } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { |
120 | 431 | DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); |
121 | 431 | *code = "Unicode text, UTF-8"; |
122 | 431 | *code_mime = "utf-8"; |
123 | 52.2k | } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { |
124 | 822 | if (ucs_type == 1) { |
125 | 373 | *code = "Unicode text, UTF-32, little-endian"; |
126 | 373 | *code_mime = "utf-32le"; |
127 | 449 | } else { |
128 | 449 | *code = "Unicode text, UTF-32, big-endian"; |
129 | 449 | *code_mime = "utf-32be"; |
130 | 449 | } |
131 | 822 | DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); |
132 | 51.4k | } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { |
133 | 741 | if (ucs_type == 1) { |
134 | 415 | *code = "Unicode text, UTF-16, little-endian"; |
135 | 415 | *code_mime = "utf-16le"; |
136 | 415 | } else { |
137 | 326 | *code = "Unicode text, UTF-16, big-endian"; |
138 | 326 | *code_mime = "utf-16be"; |
139 | 326 | } |
140 | 741 | DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); |
141 | 50.6k | } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { |
142 | 1.87k | DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); |
143 | 1.87k | *code = "ISO-8859"; |
144 | 1.87k | *code_mime = "iso-8859-1"; |
145 | 48.8k | } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { |
146 | 928 | DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); |
147 | 928 | *code = "Non-ISO extended-ASCII"; |
148 | 928 | *code_mime = "unknown-8bit"; |
149 | 47.8k | } else { |
150 | 47.8k | unsigned char *nbuf; |
151 | | |
152 | 47.8k | mlen = (nbytes + 1) * sizeof(nbuf[0]); |
153 | 47.8k | if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) { |
154 | 0 | file_oomem(ms, mlen); |
155 | 0 | goto done; |
156 | 0 | } |
157 | 47.8k | from_ebcdic(buf, nbytes, nbuf); |
158 | | |
159 | 47.8k | if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { |
160 | 182 | DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); |
161 | 182 | *code = "EBCDIC"; |
162 | 182 | *code_mime = "ebcdic"; |
163 | 47.6k | } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { |
164 | 175 | DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", |
165 | 175 | *ulen)); |
166 | 175 | *code = "International EBCDIC"; |
167 | 175 | *code_mime = "ebcdic"; |
168 | 47.5k | } else { /* Doesn't look like text at all */ |
169 | 47.5k | DPRINTF(("binary\n")); |
170 | 47.5k | rv = 0; |
171 | 47.5k | *type = "binary"; |
172 | 47.5k | } |
173 | 47.8k | free(nbuf); |
174 | 47.8k | } |
175 | | |
176 | 56.6k | done: |
177 | 56.6k | if (ubuf == &udefbuf) |
178 | 31.1k | free(udefbuf); |
179 | | |
180 | 56.6k | return rv; |
181 | 56.6k | } |
182 | | |
183 | | /* |
184 | | * This table reflects a particular philosophy about what constitutes |
185 | | * "text," and there is room for disagreement about it. |
186 | | * |
187 | | * Version 3.31 of the file command considered a file to be ASCII if |
188 | | * each of its characters was approved by either the isascii() or |
189 | | * isalpha() function. On most systems, this would mean that any |
190 | | * file consisting only of characters in the range 0x00 ... 0x7F |
191 | | * would be called ASCII text, but many systems might reasonably |
192 | | * consider some characters outside this range to be alphabetic, |
193 | | * so the file command would call such characters ASCII. It might |
194 | | * have been more accurate to call this "considered textual on the |
195 | | * local system" than "ASCII." |
196 | | * |
197 | | * It considered a file to be "International language text" if each |
198 | | * of its characters was either an ASCII printing character (according |
199 | | * to the real ASCII standard, not the above test), a character in |
200 | | * the range 0x80 ... 0xFF, or one of the following control characters: |
201 | | * backspace, tab, line feed, vertical tab, form feed, carriage return, |
202 | | * escape. No attempt was made to determine the language in which files |
203 | | * of this type were written. |
204 | | * |
205 | | * |
206 | | * The table below considers a file to be ASCII if all of its characters |
207 | | * are either ASCII printing characters (again, according to the X3.4 |
208 | | * standard, not isascii()) or any of these controls: bell, backspace, tab, |
209 | | * line feed, vertical tab, form feed, carriage return, SUB, esc, nextline. |
210 | | * |
211 | | * I include bell because some programs (particularly shell scripts) |
212 | | * use it literally, even though it is rare in normal text. Vertical tab |
213 | | * is accepted too, even though it hardly ever appears in real text. I also |
214 | | * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), |
215 | | * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline |
216 | | * character to. It might be more appropriate to include it in the 8859 |
217 | | * set instead of the ASCII set, but it's got to be included in *something* |
218 | | * we recognize or EBCDIC files aren't going to be considered textual. |
219 | | * Some old Unix source files use SO/SI (^N/^O) to shift between Greek |
220 | | * and Latin characters, so these should possibly be allowed. But they |
221 | | * make a real mess on VT100-style displays if they're not paired properly, |
222 | | * so we are probably better off not calling them text. |
223 | | * |
224 | | * A file is considered to be ISO-8859 text if its characters are all |
225 | | * either ASCII, according to the above definition, or printing characters |
226 | | * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. |
227 | | * |
228 | | * Finally, a file is considered to be international text from some other |
229 | | * character code if its characters are all either ISO-8859 (according to |
230 | | * the above definition) or characters in the range 0x80 ... 0x9F, which |
231 | | * ISO-8859 considers to be control characters but the IBM PC and Macintosh |
232 | | * consider to be printing characters. |
233 | | */ |
234 | | |
235 | | #define F 0 /* character never appears in text */ |
236 | 37.2M | #define T 1 /* character appears in plain ASCII text */ |
237 | | #define I 2 /* character appears in ISO-8859 text */ |
238 | | #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ |
239 | | |
240 | | /* |
241 | | * SUB (substitute character ^Z) was used as EOF in DOS and early Windows |
242 | | * NEL (next line 0x85) is considered in ECMAScript as whitespace |
243 | | */ |
244 | | file_private char text_chars[256] = { |
245 | | /* BEL BS HT LF VT FF CR */ |
246 | | F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ |
247 | | /* SUB ESC */ |
248 | | F, F, F, F, F, F, F, F, F, F, T, T, F, F, F, F, /* 0x1X */ |
249 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ |
250 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ |
251 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ |
252 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ |
253 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ |
254 | | T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ |
255 | | /* NEL */ |
256 | | X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ |
257 | | X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ |
258 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ |
259 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ |
260 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ |
261 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ |
262 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ |
263 | | I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ |
264 | | }; |
265 | | |
266 | | #define LOOKS(NAME, COND) \ |
267 | | file_private int \ |
268 | | looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \ |
269 | 251k | size_t *ulen) \ |
270 | 251k | { \ |
271 | 251k | size_t i; \ |
272 | 251k | \ |
273 | 251k | *ulen = 0; \ |
274 | 251k | \ |
275 | 66.4M | for (i = 0; i < nbytes; i++) { \ |
276 | 66.4M | int t = text_chars[buf[i]]; \ |
277 | 66.4M | \ |
278 | 112M | if (COND) \ |
279 | 66.4M | return 0; \ |
280 | 66.4M | \ |
281 | 66.4M | ubuf[(*ulen)++] = buf[i]; \ |
282 | 66.2M | } \ |
283 | 251k | return 1; \ |
284 | 251k | } Line | Count | Source | 269 | 104k | size_t *ulen) \ | 270 | 104k | { \ | 271 | 104k | size_t i; \ | 272 | 104k | \ | 273 | 104k | *ulen = 0; \ | 274 | 104k | \ | 275 | 42.4M | for (i = 0; i < nbytes; i++) { \ | 276 | 42.4M | int t = text_chars[buf[i]]; \ | 277 | 42.4M | \ | 278 | 42.4M | if (COND) \ | 279 | 42.4M | return 0; \ | 280 | 42.4M | \ | 281 | 42.4M | ubuf[(*ulen)++] = buf[i]; \ | 282 | 42.3M | } \ | 283 | 104k | return 1; \ | 284 | 104k | } |
Line | Count | Source | 269 | 98.3k | size_t *ulen) \ | 270 | 98.3k | { \ | 271 | 98.3k | size_t i; \ | 272 | 98.3k | \ | 273 | 98.3k | *ulen = 0; \ | 274 | 98.3k | \ | 275 | 19.4M | for (i = 0; i < nbytes; i++) { \ | 276 | 19.4M | int t = text_chars[buf[i]]; \ | 277 | 19.4M | \ | 278 | 52.8M | if (COND) \ | 279 | 19.4M | return 0; \ | 280 | 19.4M | \ | 281 | 19.4M | ubuf[(*ulen)++] = buf[i]; \ | 282 | 19.3M | } \ | 283 | 98.3k | return 1; \ | 284 | 98.3k | } |
encoding.c:looks_extended Line | Count | Source | 269 | 48.8k | size_t *ulen) \ | 270 | 48.8k | { \ | 271 | 48.8k | size_t i; \ | 272 | 48.8k | \ | 273 | 48.8k | *ulen = 0; \ | 274 | 48.8k | \ | 275 | 4.53M | for (i = 0; i < nbytes; i++) { \ | 276 | 4.52M | int t = text_chars[buf[i]]; \ | 277 | 4.52M | \ | 278 | 16.9M | if (COND) \ | 279 | 4.52M | return 0; \ | 280 | 4.52M | \ | 281 | 4.52M | ubuf[(*ulen)++] = buf[i]; \ | 282 | 4.48M | } \ | 283 | 48.8k | return 1; \ | 284 | 48.8k | } |
|
285 | | |
286 | | LOOKS(ascii, t != T) |
287 | | LOOKS(latin1, t != T && t != I) |
288 | | LOOKS(extended, t != T && t != I && t != X) |
289 | | |
290 | | /* |
291 | | * Decide whether some text looks like UTF-8. Returns: |
292 | | * |
293 | | * -1: invalid UTF-8 |
294 | | * 0: uses odd control characters, so doesn't look like text |
295 | | * 1: 7-bit text |
296 | | * 2: definitely UTF-8 text (valid high-bit set bytes) |
297 | | * |
298 | | * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; |
299 | | * ubuf must be big enough! |
300 | | */ |
301 | | |
302 | | // from: https://golang.org/src/unicode/utf8/utf8.go |
303 | | |
304 | 229k | #define XX 0xF1 // invalid: size 1 |
305 | | #define AS 0xF0 // ASCII: size 1 |
306 | | #define S1 0x02 // accept 0, size 2 |
307 | | #define S2 0x13 // accept 1, size 3 |
308 | | #define S3 0x03 // accept 0, size 3 |
309 | | #define S4 0x23 // accept 2, size 3 |
310 | | #define S5 0x34 // accept 3, size 4 |
311 | | #define S6 0x04 // accept 0, size 4 |
312 | | #define S7 0x44 // accept 4, size 4 |
313 | | |
314 | | #define LOCB 0x80 |
315 | | #define HICB 0xBF |
316 | | |
317 | | // first is information about the first byte in a UTF-8 sequence. |
318 | | static const uint8_t first[] = { |
319 | | // 1 2 3 4 5 6 7 8 9 A B C D E F |
320 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F |
321 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F |
322 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F |
323 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F |
324 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F |
325 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F |
326 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F |
327 | | AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F |
328 | | // 1 2 3 4 5 6 7 8 9 A B C D E F |
329 | | XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F |
330 | | XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F |
331 | | XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF |
332 | | XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF |
333 | | XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF |
334 | | S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF |
335 | | S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF |
336 | | S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF |
337 | | }; |
338 | | |
339 | | // accept_ranges gives the range of valid values for the second byte in a |
340 | | // UTF-8 sequence; it is indexed by the high nibble of the first[] entry. |
341 | | static struct accept_range { |
342 | | uint8_t lo; // lowest value for second byte. |
343 | | uint8_t hi; // highest value for second byte. |
344 | | } accept_ranges[16] = { |
345 | | // acceptRanges has size 16 to avoid bounds checks in the code that uses it. |
346 | | { LOCB, HICB }, |
347 | | { 0xA0, HICB }, |
348 | | { LOCB, 0x9F }, |
349 | | { 0x90, HICB }, |
350 | | { LOCB, 0x8F }, |
351 | | }; |
352 | | |
353 | | file_protected int |
354 | | file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, |
355 | | size_t *ulen) |
356 | 58.3k | { |
357 | 58.3k | size_t i; |
358 | 58.3k | int n; |
359 | 58.3k | file_unichar_t c; |
360 | 58.3k | int gotone = 0, ctrl = 0; |
361 | | |
362 | 58.3k | if (ubuf) |
363 | 52.8k | *ulen = 0; |
364 | | |
365 | 37.5M | for (i = 0; i < nbytes; i++) { |
366 | 37.4M | if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ |
367 | | /* |
368 | | * Even if the whole file is valid UTF-8 sequences, |
369 | | * still reject it if it uses weird control characters. |
370 | | */ |
371 | | |
372 | 37.2M | if (text_chars[buf[i]] != T) |
373 | 31.5M | ctrl = 1; |
374 | | |
375 | 37.2M | if (ubuf) |
376 | 37.2M | ubuf[(*ulen)++] = buf[i]; |
377 | 37.2M | } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ |
378 | 23.7k | return -1; |
379 | 229k | } else { /* 11xxxxxx begins UTF-8 */ |
380 | 229k | int following; |
381 | 229k | uint8_t x = first[buf[i]]; |
382 | 229k | const struct accept_range *ar = |
383 | 229k | &accept_ranges[(unsigned int)x >> 4]; |
384 | 229k | if (x == XX) |
385 | 9.82k | return -1; |
386 | | |
387 | 220k | if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ |
388 | 149k | c = buf[i] & 0x1f; |
389 | 149k | following = 1; |
390 | 149k | } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ |
391 | 54.2k | c = buf[i] & 0x0f; |
392 | 54.2k | following = 2; |
393 | 54.2k | } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ |
394 | 15.8k | c = buf[i] & 0x07; |
395 | 15.8k | following = 3; |
396 | 15.8k | } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ |
397 | 0 | c = buf[i] & 0x03; |
398 | 0 | following = 4; |
399 | 0 | } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ |
400 | 0 | c = buf[i] & 0x01; |
401 | 0 | following = 5; |
402 | 0 | } else |
403 | 0 | return -1; |
404 | | |
405 | 500k | for (n = 0; n < following; n++) { |
406 | 298k | i++; |
407 | 298k | if (i >= nbytes) |
408 | 706 | goto done; |
409 | | |
410 | 297k | if (n == 0 && |
411 | 297k | (buf[i] < ar->lo || buf[i] > ar->hi)) |
412 | 16.2k | return -1; |
413 | | |
414 | 281k | if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) |
415 | 1.00k | return -1; |
416 | | |
417 | 280k | c = (c << 6) + (buf[i] & 0x3f); |
418 | 280k | } |
419 | | |
420 | 202k | if (ubuf) |
421 | 201k | ubuf[(*ulen)++] = c; |
422 | 202k | gotone = 1; |
423 | 202k | } |
424 | 37.4M | } |
425 | 7.50k | done: |
426 | 7.50k | return ctrl ? 0 : (gotone ? 2 : 1); |
427 | 58.3k | } |
428 | | |
429 | | /* |
430 | | * Decide whether some text looks like UTF-8 with BOM. If there is no |
431 | | * BOM (or nothing follows it), return -1; otherwise return the result |
432 | | * of file_looks_utf8 on the rest of the text. |
433 | | */ |
434 | | file_private int |
435 | | looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, |
436 | | file_unichar_t *ubuf, size_t *ulen) |
437 | 52.6k | { |
438 | 52.6k | if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) |
439 | 134 | return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); |
440 | 52.5k | else |
441 | 52.5k | return -1; |
442 | 52.6k | } |
443 | | |
444 | | file_private int |
445 | | looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, |
446 | | size_t *ulen) |
447 | 4.00k | { |
448 | 4.00k | if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') |
449 | 34 | switch (buf[3]) { |
450 | 8 | case '8': |
451 | 14 | case '9': |
452 | 20 | case '+': |
453 | 28 | case '/': |
454 | 28 | if (ubuf) |
455 | 28 | *ulen = 0; |
456 | 28 | return 1; |
457 | 6 | default: |
458 | 6 | return -1; |
459 | 34 | } |
460 | 3.96k | else |
461 | 3.96k | return -1; |
462 | 4.00k | } |
463 | | |
464 | 845k | #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef) |
465 | 845k | #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff) |
466 | 871k | #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff) |
467 | | |
468 | | file_private int |
469 | | looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, |
470 | | size_t *ulen) |
471 | 51.4k | { |
472 | 51.4k | int bigend; |
473 | 51.4k | uint32_t hi; |
474 | 51.4k | size_t i; |
475 | | |
476 | 51.4k | if (nbytes < 2) |
477 | 166 | return 0; |
478 | | |
479 | 51.2k | if (bf[0] == 0xff && bf[1] == 0xfe) |
480 | 689 | bigend = 0; |
481 | 50.5k | else if (bf[0] == 0xfe && bf[1] == 0xff) |
482 | 474 | bigend = 1; |
483 | 50.0k | else |
484 | 50.0k | return 0; |
485 | | |
486 | 1.16k | *ulen = 0; |
487 | 1.16k | hi = 0; |
488 | | |
489 | 846k | for (i = 2; i + 1 < nbytes; i += 2) { |
490 | 845k | uint32_t uc; |
491 | | |
492 | 845k | if (bigend) |
493 | 291k | uc = CAST(uint32_t, |
494 | 845k | bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8)); |
495 | 554k | else |
496 | 554k | uc = CAST(uint32_t, |
497 | 845k | bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8)); |
498 | | |
499 | 845k | uc &= 0xffff; |
500 | | |
501 | 845k | switch (uc) { |
502 | 2 | case 0xfffe: |
503 | 30 | case 0xffff: |
504 | 30 | return 0; |
505 | 845k | default: |
506 | 845k | if (UCS16_NOCHAR(uc)) |
507 | 32 | return 0; |
508 | 845k | break; |
509 | 845k | } |
510 | 845k | if (hi) { |
511 | 26.7k | if (!UCS16_LOSURR(uc)) |
512 | 144 | return 0; |
513 | 26.5k | uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00); |
514 | 26.5k | hi = 0; |
515 | 26.5k | } |
516 | 845k | if (uc < 128 && text_chars[CAST(size_t, uc)] != T) |
517 | 142 | return 0; |
518 | 845k | ubf[(*ulen)++] = uc; |
519 | 845k | if (UCS16_HISURR(uc)) |
520 | 26.8k | hi = uc - 0xd800 + 1; |
521 | 845k | if (UCS16_LOSURR(uc)) |
522 | 74 | return 0; |
523 | 845k | } |
524 | | |
525 | 741 | return 1 + bigend; |
526 | 1.16k | } |
527 | | |
528 | | file_private int |
529 | | looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, |
530 | | size_t *ulen) |
531 | 52.2k | { |
532 | 52.2k | int bigend; |
533 | 52.2k | size_t i; |
534 | | |
535 | 52.2k | if (nbytes < 4) |
536 | 2.54k | return 0; |
537 | | |
538 | 49.7k | if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) |
539 | 434 | bigend = 0; |
540 | 49.2k | else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) |
541 | 466 | bigend = 1; |
542 | 48.8k | else |
543 | 48.8k | return 0; |
544 | | |
545 | 900 | *ulen = 0; |
546 | | |
547 | 728k | for (i = 4; i + 3 < nbytes; i += 4) { |
548 | | /* XXX fix to properly handle chars > 65536 */ |
549 | | |
550 | 727k | if (bigend) |
551 | 237k | ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3]) |
552 | 237k | | (CAST(file_unichar_t, bf[i + 2]) << 8) |
553 | 237k | | (CAST(file_unichar_t, bf[i + 1]) << 16) |
554 | 237k | | (CAST(file_unichar_t, bf[i]) << 24); |
555 | 490k | else |
556 | 490k | ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0]) |
557 | 490k | | (CAST(file_unichar_t, bf[i + 1]) << 8) |
558 | 490k | | (CAST(file_unichar_t, bf[i + 2]) << 16) |
559 | 490k | | (CAST(file_unichar_t, bf[i + 3]) << 24); |
560 | | |
561 | 727k | if (ubf[*ulen - 1] == 0xfffe) |
562 | 14 | return 0; |
563 | 727k | if (ubf[*ulen - 1] < 128 && |
564 | 727k | text_chars[CAST(size_t, ubf[*ulen - 1])] != T) |
565 | 64 | return 0; |
566 | 727k | } |
567 | | |
568 | 822 | return 1 + bigend; |
569 | 900 | } |
570 | | #undef F |
571 | | #undef T |
572 | | #undef I |
573 | | #undef X |
574 | | |
575 | | /* |
576 | | * This table maps each EBCDIC character to an (8-bit extended) ASCII |
577 | | * character, as specified in the rationale for the dd(1) command in |
578 | | * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. |
579 | | * |
580 | | * Unfortunately it does not seem to correspond exactly to any of the |
581 | | * five variants of EBCDIC documented in IBM's _Enterprise Systems |
582 | | * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh |
583 | | * Edition, July, 1999, pp. I-1 - I-4. |
584 | | * |
585 | | * Fortunately, though, all versions of EBCDIC, including this one, agree |
586 | | * on most of the printing characters that also appear in (7-bit) ASCII. |
587 | | * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. |
588 | | * |
589 | | * Fortunately too, there is general agreement that codes 0x00 through |
590 | | * 0x3F represent control characters, 0x41 a nonbreaking space, and the |
591 | | * remainder printing characters. |
592 | | * |
593 | | * This is sufficient to allow us to identify EBCDIC text and to distinguish |
594 | | * between old-style and internationalized examples of text. |
595 | | */ |
596 | | |
597 | | file_private unsigned char ebcdic_to_ascii[] = { |
598 | | 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, |
599 | | 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, |
600 | | 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, |
601 | | 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, |
602 | | ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', |
603 | | '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', |
604 | | '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', |
605 | | 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', |
606 | | 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, |
607 | | 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, |
608 | | 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, |
609 | | 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, |
610 | | '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, |
611 | | '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, |
612 | | '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, |
613 | | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 |
614 | | }; |
615 | | |
616 | | #ifdef notdef |
617 | | /* |
618 | | * The following EBCDIC-to-ASCII table may relate more closely to reality, |
619 | | * or at least to modern reality. It comes from |
620 | | * |
621 | | * http://ftp.s390.ibm.com/products/oe/bpxqp9.html |
622 | | * |
623 | | * and maps the characters of EBCDIC code page 1047 (the code used for |
624 | | * Unix-derived software on IBM's 390 systems) to the corresponding |
625 | | * characters from ISO 8859-1. |
626 | | * |
627 | | * If this table is used instead of the above one, some of the special |
628 | | * cases for the NEL character can be taken out of the code. |
629 | | */ |
630 | | |
631 | | file_private unsigned char ebcdic_1047_to_8859[] = { |
632 | | 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, |
633 | | 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, |
634 | | 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, |
635 | | 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, |
636 | | 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, |
637 | | 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, |
638 | | 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, |
639 | | 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, |
640 | | 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, |
641 | | 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, |
642 | | 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, |
643 | | 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, |
644 | | 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, |
645 | | 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, |
646 | | 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, |
647 | | 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F |
648 | | }; |
649 | | #endif |
650 | | |
651 | | /* |
652 | | * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. |
653 | | */ |
654 | | file_private void |
655 | | from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) |
656 | 47.8k | { |
657 | 47.8k | size_t i; |
658 | | |
659 | 236M | for (i = 0; i < nbytes; i++) { |
660 | 236M | out[i] = ebcdic_to_ascii[buf[i]]; |
661 | 236M | } |
662 | 47.8k | } |