Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) Ian F. Darwin 1986-1995. |
3 | | * Software written by Ian F. Darwin and others; |
4 | | * maintained 1995-present by Christos Zoulas and others. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions |
8 | | * are met: |
9 | | * 1. Redistributions of source code must retain the above copyright |
10 | | * notice immediately at the beginning of the file, without modification, |
11 | | * this list of conditions, and the following disclaimer. |
12 | | * 2. Redistributions in binary form must reproduce the above copyright |
13 | | * notice, this list of conditions and the following disclaimer in the |
14 | | * documentation and/or other materials provided with the distribution. |
15 | | * |
16 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR |
20 | | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | | * SUCH DAMAGE. |
27 | | */ |
28 | | /* |
29 | | * ASCII magic -- try to detect text encoding. |
30 | | * |
31 | | * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, |
32 | | * to handle character codes other than ASCII on a unified basis. |
33 | | */ |
34 | | |
35 | | #include "file.h" |
36 | | |
37 | | #ifndef lint |
38 | | FILE_RCSID("@(#)$File: ascmagic.c,v 1.116 2023/05/21 16:08:50 christos Exp $") |
39 | | #endif /* lint */ |
40 | | |
41 | | #include "magic.h" |
42 | | #include <string.h> |
43 | | #include <ctype.h> |
44 | | #include <stdlib.h> |
45 | | #ifdef HAVE_UNISTD_H |
46 | | #include <unistd.h> |
47 | | #endif |
48 | | |
/*
 * Line-length threshold: file_ascmagic_with_encoding() below remembers any
 * line longer than this and mentions it as "with very long lines".
 */
#define MAXLINELEN 300	/* longest sane line length */
/*
 * True when x is a blank, tab, CR, LF, form feed or 0x85 (the "next line"
 * character).  x is expanded several times, so do not pass an expression
 * with side effects.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		|| (x) == 0x85 || (x) == '\f')

/* Helpers local to this file; declared here because they are used before
   (or independently of) their definitions further down. */
file_private unsigned char *encode_utf8(unsigned char *, size_t, file_unichar_t *,
    size_t);
file_private size_t trim_nuls(const unsigned char *, size_t);
56 | | |
57 | | /* |
58 | | * Undo the NUL-termination kindly provided by process() |
59 | | * but leave at least one byte to look at |
60 | | */ |
61 | | file_private size_t |
62 | | trim_nuls(const unsigned char *buf, size_t nbytes) |
63 | 12.4k | { |
64 | 158M | while (nbytes > 1 && buf[nbytes - 1] == '\0') |
65 | 158M | nbytes--; |
66 | | |
67 | 12.4k | return nbytes; |
68 | 12.4k | } |
69 | | |
70 | | file_protected int |
71 | | file_ascmagic(struct magic_set *ms, const struct buffer *b, int text) |
72 | 10.2k | { |
73 | 10.2k | file_unichar_t *ubuf = NULL; |
74 | 10.2k | size_t ulen = 0; |
75 | 10.2k | int rv = 1; |
76 | 10.2k | struct buffer bb; |
77 | | |
78 | 10.2k | const char *code = NULL; |
79 | 10.2k | const char *code_mime = NULL; |
80 | 10.2k | const char *type = NULL; |
81 | | |
82 | 10.2k | bb = *b; |
83 | 10.2k | bb.flen = trim_nuls(CAST(const unsigned char *, b->fbuf), b->flen); |
84 | | /* |
85 | | * Avoid trimming at an odd byte if the original buffer was evenly |
86 | | * sized; this avoids losing the last character on UTF-16 LE text |
87 | | */ |
88 | 10.2k | if ((bb.flen & 1) && !(b->flen & 1)) |
89 | 952 | bb.flen++; |
90 | | |
91 | | /* If file doesn't look like any sort of text, give up. */ |
92 | 10.2k | if (file_encoding(ms, &bb, &ubuf, &ulen, &code, &code_mime, |
93 | 10.2k | &type) == 0) |
94 | 8.04k | rv = 0; |
95 | 2.21k | else |
96 | 2.21k | rv = file_ascmagic_with_encoding(ms, &bb, |
97 | 2.21k | ubuf, ulen, code, type, text); |
98 | | |
99 | 10.2k | free(ubuf); |
100 | | |
101 | 10.2k | return rv; |
102 | 10.2k | } |
103 | | |
104 | | file_protected int |
105 | | file_ascmagic_with_encoding(struct magic_set *ms, const struct buffer *b, |
106 | | file_unichar_t *ubuf, size_t ulen, const char *code, const char *type, |
107 | | int text) |
108 | 2.21k | { |
109 | 2.21k | struct buffer bb; |
110 | 2.21k | const unsigned char *buf = CAST(const unsigned char *, b->fbuf); |
111 | 2.21k | size_t nbytes = b->flen; |
112 | 2.21k | unsigned char *utf8_buf = NULL, *utf8_end; |
113 | 2.21k | size_t mlen, i, len; |
114 | 2.21k | int rv = -1; |
115 | 2.21k | int mime = ms->flags & MAGIC_MIME; |
116 | 2.21k | int need_separator = 0; |
117 | | |
118 | 2.21k | const char *subtype = NULL; |
119 | | |
120 | 2.21k | int has_escapes = 0; |
121 | 2.21k | int has_backspace = 0; |
122 | 2.21k | int seen_cr = 0; |
123 | | |
124 | 2.21k | size_t n_crlf = 0; |
125 | 2.21k | size_t n_lf = 0; |
126 | 2.21k | size_t n_cr = 0; |
127 | 2.21k | size_t n_nel = 0; |
128 | 2.21k | int executable = 0; |
129 | | |
130 | 2.21k | size_t last_line_end = CAST(size_t, -1); |
131 | 2.21k | size_t has_long_lines = 0; |
132 | | |
133 | 2.21k | nbytes = trim_nuls(buf, nbytes); |
134 | | |
135 | | /* If we have fewer than 2 bytes, give up. */ |
136 | 2.21k | if (nbytes <= 1) { |
137 | 16 | rv = 0; |
138 | 16 | goto done; |
139 | 16 | } |
140 | | |
141 | 2.20k | if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) { |
142 | | /* Convert ubuf to UTF-8 and try text soft magic */ |
143 | | /* malloc size is a conservative overestimate; could be |
144 | | improved, or at least realloced after conversion. */ |
145 | 2.13k | mlen = ulen * 6; |
146 | 2.13k | if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) { |
147 | 0 | file_oomem(ms, mlen); |
148 | 0 | goto done; |
149 | 0 | } |
150 | 2.13k | if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) |
151 | 2.13k | == NULL) { |
152 | 44 | rv = 0; |
153 | 44 | goto done; |
154 | 44 | } |
155 | 2.08k | buffer_init(&bb, b->fd, &b->st, utf8_buf, |
156 | 2.08k | CAST(size_t, utf8_end - utf8_buf)); |
157 | | |
158 | 2.08k | if ((rv = file_softmagic(ms, &bb, NULL, NULL, |
159 | 2.08k | TEXTTEST, text)) == 0) |
160 | 1.98k | rv = -1; |
161 | 100 | else |
162 | 100 | need_separator = 1; |
163 | 2.08k | buffer_fini(&bb); |
164 | 2.08k | if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) { |
165 | 0 | rv = rv == -1 ? 0 : 1; |
166 | 0 | goto done; |
167 | 0 | } |
168 | 2.08k | } |
169 | | |
170 | 2.15k | if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) { |
171 | 0 | rv = 0; |
172 | 0 | goto done; |
173 | 0 | } |
174 | | |
175 | | /* Now try to discover other details about the file. */ |
176 | 12.2M | for (i = 0; i < ulen; i++) { |
177 | 12.2M | if (ubuf[i] == '\n') { |
178 | 2.01M | if (seen_cr) |
179 | 181k | n_crlf++; |
180 | 1.83M | else |
181 | 1.83M | n_lf++; |
182 | 2.01M | last_line_end = i; |
183 | 10.2M | } else if (seen_cr) |
184 | 871k | n_cr++; |
185 | | |
186 | 12.2M | seen_cr = (ubuf[i] == '\r'); |
187 | 12.2M | if (seen_cr) |
188 | 1.05M | last_line_end = i; |
189 | | |
190 | 12.2M | if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ |
191 | 994k | n_nel++; |
192 | 994k | last_line_end = i; |
193 | 994k | } |
194 | | |
195 | | /* If this line is _longer_ than MAXLINELEN, remember it. */ |
196 | 12.2M | if (i > last_line_end + MAXLINELEN) { |
197 | 7.94M | size_t ll = i - last_line_end; |
198 | 7.94M | if (ll > has_long_lines) |
199 | 7.74M | has_long_lines = ll; |
200 | 7.94M | } |
201 | | |
202 | 12.2M | if (ubuf[i] == '\033') |
203 | 50.3k | has_escapes = 1; |
204 | 12.2M | if (ubuf[i] == '\b') |
205 | 96.7k | has_backspace = 1; |
206 | 12.2M | } |
207 | | |
208 | 2.15k | if (strcmp(type, "binary") == 0) { |
209 | 0 | rv = 0; |
210 | 0 | goto done; |
211 | 0 | } |
212 | 2.15k | len = file_printedlen(ms); |
213 | 2.15k | if (mime) { |
214 | 0 | if ((mime & MAGIC_MIME_TYPE) != 0) { |
215 | 0 | if (len) { |
216 | | /* |
217 | | * Softmagic printed something, we |
218 | | * are either done, or we need a separator |
219 | | */ |
220 | 0 | if ((ms->flags & MAGIC_CONTINUE) == 0) { |
221 | 0 | rv = 1; |
222 | 0 | goto done; |
223 | 0 | } |
224 | 0 | if (need_separator && file_separator(ms) == -1) |
225 | 0 | goto done; |
226 | 0 | } |
227 | 0 | if (file_printf(ms, "text/plain") == -1) |
228 | 0 | goto done; |
229 | 0 | } |
230 | 2.15k | } else { |
231 | 2.15k | if (len) { |
232 | 699 | switch (file_replace(ms, " text$", ", ")) { |
233 | 671 | case 0: |
234 | 671 | switch (file_replace(ms, " text executable$", |
235 | 671 | ", ")) { |
236 | 611 | case 0: |
237 | 611 | if (file_printf(ms, ", ") == -1) |
238 | 0 | goto done; |
239 | 611 | break; |
240 | 611 | case -1: |
241 | 0 | goto done; |
242 | 60 | default: |
243 | 60 | executable = 1; |
244 | 60 | break; |
245 | 671 | } |
246 | 671 | break; |
247 | 671 | case -1: |
248 | 0 | goto done; |
249 | 28 | default: |
250 | 28 | break; |
251 | 699 | } |
252 | 699 | } |
253 | | |
254 | 2.15k | if (file_printf(ms, "%s", code) == -1) |
255 | 0 | goto done; |
256 | | |
257 | 2.15k | if (subtype) { |
258 | 0 | if (file_printf(ms, " %s", subtype) == -1) |
259 | 0 | goto done; |
260 | 0 | } |
261 | | |
262 | 2.15k | if (file_printf(ms, " %s", type) == -1) |
263 | 0 | goto done; |
264 | | |
265 | 2.15k | if (executable) |
266 | 60 | if (file_printf(ms, " executable") == -1) |
267 | 0 | goto done; |
268 | | |
269 | 2.15k | if (has_long_lines) |
270 | 219 | if (file_printf(ms, ", with very long lines (%" |
271 | 219 | SIZE_T_FORMAT "u)", has_long_lines) == -1) |
272 | 0 | goto done; |
273 | | |
274 | | /* |
275 | | * Only report line terminators if we find one other than LF, |
276 | | * or if we find none at all. |
277 | | */ |
278 | 2.15k | if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || |
279 | 2.15k | (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { |
280 | 2.00k | if (file_printf(ms, ", with") == -1) |
281 | 0 | goto done; |
282 | | |
283 | 2.00k | if (n_crlf == 0 && n_cr == 0 && |
284 | 2.00k | n_nel == 0 && n_lf == 0) { |
285 | 1.81k | if (file_printf(ms, " no") == -1) |
286 | 0 | goto done; |
287 | 1.81k | } else { |
288 | 190 | if (n_crlf) { |
289 | 46 | if (file_printf(ms, " CRLF") == -1) |
290 | 0 | goto done; |
291 | 46 | if (n_cr || n_lf || n_nel) |
292 | 37 | if (file_printf(ms, ",") == -1) |
293 | 0 | goto done; |
294 | 46 | } |
295 | 190 | if (n_cr) { |
296 | 107 | if (file_printf(ms, " CR") == -1) |
297 | 0 | goto done; |
298 | 107 | if (n_lf || n_nel) |
299 | 53 | if (file_printf(ms, ",") == -1) |
300 | 0 | goto done; |
301 | 107 | } |
302 | 190 | if (n_lf) { |
303 | 63 | if (file_printf(ms, " LF") == -1) |
304 | 0 | goto done; |
305 | 63 | if (n_nel) |
306 | 17 | if (file_printf(ms, ",") == -1) |
307 | 0 | goto done; |
308 | 63 | } |
309 | 190 | if (n_nel) |
310 | 81 | if (file_printf(ms, " NEL") == -1) |
311 | 0 | goto done; |
312 | 190 | } |
313 | | |
314 | 2.00k | if (file_printf(ms, " line terminators") == -1) |
315 | 0 | goto done; |
316 | 2.00k | } |
317 | | |
318 | 2.15k | if (has_escapes) |
319 | 32 | if (file_printf(ms, ", with escape sequences") == -1) |
320 | 0 | goto done; |
321 | 2.15k | if (has_backspace) |
322 | 47 | if (file_printf(ms, ", with overstriking") == -1) |
323 | 0 | goto done; |
324 | 2.15k | } |
325 | 2.15k | rv = 1; |
326 | 2.21k | done: |
327 | 2.21k | free(utf8_buf); |
328 | | |
329 | 2.21k | return rv; |
330 | 2.15k | } |
331 | | |
332 | | /* |
333 | | * Encode Unicode string as UTF-8, returning pointer to character |
334 | | * after end of string, or NULL if an invalid character is found. |
335 | | */ |
336 | | file_private unsigned char * |
337 | | encode_utf8(unsigned char *buf, size_t len, file_unichar_t *ubuf, size_t ulen) |
338 | 2.13k | { |
339 | 2.13k | size_t i; |
340 | 2.13k | unsigned char *end = buf + len; |
341 | | |
342 | 12.2M | for (i = 0; i < ulen; i++) { |
343 | 12.2M | if (ubuf[i] <= 0x7f) { |
344 | 9.15M | if (end - buf < 1) |
345 | 0 | return NULL; |
346 | 9.15M | *buf++ = CAST(unsigned char, ubuf[i]); |
347 | 9.15M | continue; |
348 | 9.15M | } |
349 | 3.07M | if (ubuf[i] <= 0x7ff) { |
350 | 2.77M | if (end - buf < 2) |
351 | 0 | return NULL; |
352 | 2.77M | *buf++ = CAST(unsigned char, (ubuf[i] >> 6) + 0xc0); |
353 | 2.77M | goto out1; |
354 | 2.77M | } |
355 | 307k | if (ubuf[i] <= 0xffff) { |
356 | 188k | if (end - buf < 3) |
357 | 0 | return NULL; |
358 | 188k | *buf++ = CAST(unsigned char, (ubuf[i] >> 12) + 0xe0); |
359 | 188k | goto out2; |
360 | 188k | } |
361 | 118k | if (ubuf[i] <= 0x1fffff) { |
362 | 4.18k | if (end - buf < 4) |
363 | 0 | return NULL; |
364 | 4.18k | *buf++ = CAST(unsigned char, (ubuf[i] >> 18) + 0xf0); |
365 | 4.18k | goto out3; |
366 | 4.18k | } |
367 | 114k | if (ubuf[i] <= 0x3ffffff) { |
368 | 25.7k | if (end - buf < 5) |
369 | 0 | return NULL; |
370 | 25.7k | *buf++ = CAST(unsigned char, (ubuf[i] >> 24) + 0xf8); |
371 | 25.7k | goto out4; |
372 | 25.7k | } |
373 | 89.0k | if (ubuf[i] <= 0x7fffffff) { |
374 | 88.9k | if (end - buf < 6) |
375 | 0 | return NULL; |
376 | 88.9k | *buf++ = CAST(unsigned char, (ubuf[i] >> 30) + 0xfc); |
377 | 88.9k | goto out5; |
378 | 88.9k | } |
379 | | /* Invalid character */ |
380 | 44 | return NULL; |
381 | 88.9k | out5: *buf++ = CAST(unsigned char, ((ubuf[i] >> 24) & 0x3f) + 0x80); |
382 | 114k | out4: *buf++ = CAST(unsigned char, ((ubuf[i] >> 18) & 0x3f) + 0x80); |
383 | 118k | out3: *buf++ = CAST(unsigned char, ((ubuf[i] >> 12) & 0x3f) + 0x80); |
384 | 307k | out2: *buf++ = CAST(unsigned char, ((ubuf[i] >> 6) & 0x3f) + 0x80); |
385 | 3.07M | out1: *buf++ = CAST(unsigned char, ((ubuf[i] >> 0) & 0x3f) + 0x80); |
386 | 3.07M | } |
387 | | |
388 | 2.08k | return buf; |
389 | 2.13k | } |