/src/libsoup/libsoup/content-sniffer/soup-content-sniffer.c
Line | Count | Source |
1 | | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 8 -*- */ |
2 | | /* |
3 | | * soup-content-sniffer.c |
4 | | * |
5 | | * Copyright (C) 2009, 2013 Gustavo Noronha Silva. |
6 | | * |
7 | | * This code implements the following specification: |
8 | | * |
9 | | * http://mimesniff.spec.whatwg.org/ as of 11 June 2013 |
10 | | */ |
11 | | |
12 | | #ifdef HAVE_CONFIG_H |
13 | | #include <config.h> |
14 | | #endif |
15 | | |
16 | | #include <string.h> |
17 | | |
18 | | #include "soup-content-sniffer.h" |
19 | | #include "soup-session-feature-private.h" |
20 | | #include "soup-content-processor.h" |
21 | | #include "soup-content-sniffer-stream.h" |
22 | | #include "soup-message-private.h" |
23 | | #include "soup-message-headers-private.h" |
24 | | #include "soup-session-feature-private.h" |
25 | | |
26 | | /** |
27 | | * SoupContentSniffer: |
28 | | * |
29 | | * Sniffs the mime type of messages. |
30 | | * |
31 | | * A [class@ContentSniffer] tries to detect the actual content type of |
32 | | * the files that are being downloaded by looking at some of the data |
33 | | * before the [class@Message] emits its [signal@Message::got-headers] signal. |
34 | | * [class@ContentSniffer] implements [iface@SessionFeature], so you can add |
35 | | * content sniffing to a session with [method@Session.add_feature] or |
36 | | * [method@Session.add_feature_by_type]. |
37 | | **/ |
38 | | |
39 | | static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data); |
40 | | |
41 | | static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface; |
42 | | static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data); |
43 | | |
/* Instance structure: the sniffer keeps no state of its own beyond the
 * GObject parent.
 */
struct _SoupContentSniffer {
        GObject parent_instance;
};
47 | | |
/* Registers SoupContentSniffer as a final type derived from G_TYPE_OBJECT
 * that implements both the session-feature and content-processor
 * interfaces, using the init functions declared above.
 */
G_DEFINE_FINAL_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
                               G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
                                                      soup_content_sniffer_session_feature_init)
                               G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
                                                      soup_content_sniffer_content_processor_init))
53 | 1.20k | |
54 | 1.20k | |
55 | 1.20k | static GInputStream * |
56 | 1.20k | soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor, |
57 | 1.20k | GInputStream *base_stream, |
58 | 1.20k | SoupMessage *msg, |
59 | 1.20k | GError **error) |
60 | 1.20k | { |
61 | 0 | return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM, |
62 | 0 | "base-stream", base_stream, |
63 | 0 | "message", msg, |
64 | 0 | "sniffer", SOUP_CONTENT_SNIFFER (processor), |
65 | 0 | NULL); |
66 | 0 | } |
67 | | |
68 | | static void |
69 | | soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface, |
70 | | gpointer interface_data) |
71 | 1 | { |
72 | 1 | soup_content_sniffer_default_content_processor_interface = |
73 | 1 | g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR); |
74 | | |
75 | 1 | processor_interface->processing_stage = SOUP_STAGE_BODY_DATA; |
76 | 1 | processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input; |
77 | 1 | } |
78 | | |
/* Instance initializer: intentionally empty, the sniffer has no
 * per-instance fields to set up.
 */
static void
soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
{
}
83 | | |
/* One row of a media signature table, as consumed by sniff_media(). */
typedef struct {
        /* Byte mask ANDed with the resource before comparing. */
        const guchar *mask;
        /* Expected bytes after masking. */
        const guchar *pattern;
        /* Number of bytes of @mask / @pattern that are compared. */
        guint         pattern_length;
        /* MIME type reported when the row matches. */
        const char   *sniffed_type;
} SoupContentSnifferMediaPattern;
90 | | |
91 | | static char* |
92 | | sniff_media (SoupContentSniffer *sniffer, |
93 | | GBytes *buffer, |
94 | | SoupContentSnifferMediaPattern table[], |
95 | | int table_length) |
96 | 8.08k | { |
97 | | |
98 | 8.08k | gsize resource_length; |
99 | 8.08k | const guchar *resource = g_bytes_get_data (buffer, &resource_length); |
100 | 8.08k | resource_length = MIN (512, resource_length); |
101 | 8.08k | int i; |
102 | | |
103 | 72.1k | for (i = 0; i < table_length; i++) { |
104 | 64.1k | SoupContentSnifferMediaPattern *type_row = &(table[i]); |
105 | 64.1k | guint j; |
106 | | |
107 | 64.1k | if (resource_length < type_row->pattern_length) |
108 | 15.7k | continue; |
109 | | |
110 | 51.2k | for (j = 0; j < type_row->pattern_length; j++) { |
111 | 51.1k | if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) |
112 | 48.3k | break; |
113 | 51.1k | } |
114 | | |
115 | | /* This means our comparison above matched completely */ |
116 | 48.3k | if (j == type_row->pattern_length) |
117 | 84 | return g_strdup (type_row->sniffed_type); |
118 | 48.3k | } |
119 | | |
120 | 8.00k | return NULL; |
121 | 8.08k | } |
122 | | |
123 | | /* This table is based on the MIMESNIFF spec; |
124 | | * See 6.1 Matching an image type pattern |
125 | | */ |
126 | | static SoupContentSnifferMediaPattern image_types_table[] = { |
127 | | |
128 | | /* Windows icon signature. */ |
129 | | { (const guchar *)"\xFF\xFF\xFF\xFF", |
130 | | (const guchar *)"\x00\x00\x01\x00", |
131 | | 4, |
132 | | "image/x-icon" }, |
133 | | |
134 | | /* Windows cursor signature. */ |
135 | | { (const guchar *)"\xFF\xFF\xFF\xFF", |
136 | | (const guchar *)"\x00\x00\x02\x00", |
137 | | 4, |
138 | | "image/x-icon" }, |
139 | | |
140 | | /* BMP. */ |
141 | | { (const guchar *)"\xFF\xFF", |
142 | | (const guchar *)"BM", |
143 | | 2, |
144 | | "image/bmp" }, |
145 | | |
146 | | /* GIFs. */ |
147 | | { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", |
148 | | (const guchar *)"GIF87a", |
149 | | 6, |
150 | | "image/gif" }, |
151 | | |
152 | | { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", |
153 | | (const guchar *)"GIF89a", |
154 | | 6, |
155 | | "image/gif" }, |
156 | | |
157 | | /* WEBP. */ |
158 | | { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", |
159 | | (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP", |
160 | | 14, |
161 | | "image/webp" }, |
162 | | |
163 | | /* PNG. */ |
164 | | { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
165 | | (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A", |
166 | | 8, |
167 | | "image/png" }, |
168 | | |
169 | | /* JPEG. */ |
170 | | { (const guchar *)"\xFF\xFF\xFF", |
171 | | (const guchar *)"\xFF\xD8\xFF", |
172 | | 3, |
173 | | "image/jpeg" }, |
174 | | }; |
175 | | |
176 | | static char* |
177 | | sniff_images (SoupContentSniffer *sniffer, GBytes *buffer) |
178 | 4.07k | { |
179 | 4.07k | return sniff_media (sniffer, |
180 | 4.07k | buffer, |
181 | 4.07k | image_types_table, |
182 | 4.07k | G_N_ELEMENTS (image_types_table)); |
183 | 4.07k | } |
184 | | |
185 | | /* This table is based on the MIMESNIFF spec; |
186 | | * See 6.2 Matching an audio or video type pattern |
187 | | */ |
188 | | static SoupContentSnifferMediaPattern audio_video_types_table[] = { |
189 | | { (const guchar *)"\xFF\xFF\xFF\xFF", |
190 | | (const guchar *)"\x1A\x45\xDF\xA3", |
191 | | 4, |
192 | | "video/webm" }, |
193 | | |
194 | | { (const guchar *)"\xFF\xFF\xFF\xFF", |
195 | | (const guchar *)".snd", |
196 | | 4, |
197 | | "audio/basic" }, |
198 | | |
199 | | |
200 | | { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
201 | | (const guchar *)"FORM\0\0\0\0AIFF", |
202 | | 12, |
203 | | "audio/aiff" }, |
204 | | |
205 | | { (const guchar *)"\xFF\xFF\xFF", |
206 | | (const guchar *)"ID3", |
207 | | 3, |
208 | | "audio/mpeg" }, |
209 | | |
210 | | { (const guchar *)"\xFF\xFF\xFF\xFF\xFF", |
211 | | (const guchar *)"OggS\0", |
212 | | 5, |
213 | | "application/ogg" }, |
214 | | |
215 | | { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
216 | | (const guchar *)"MThd\x00\x00\x00\x06", |
217 | | 8, |
218 | | "audio/midi" }, |
219 | | |
220 | | { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
221 | | (const guchar *)"RIFF\x00\x00\x00\x00AVI ", |
222 | | 12, |
223 | | "video/avi" }, |
224 | | |
225 | | { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
226 | | (const guchar *)"RIFF\x00\x00\x00\x00WAVE", |
227 | | 12, |
228 | | "audio/wave" }, |
229 | | }; |
230 | | |
231 | | static gboolean |
232 | | data_has_prefix (const char *data, const char *prefix, gsize max_length) |
233 | 16.9k | { |
234 | 16.9k | if (strlen (prefix) > max_length) |
235 | 182 | return FALSE; |
236 | | |
237 | 16.7k | return memcmp (data, prefix, strlen (prefix)) == 0; |
238 | 16.9k | } |
239 | | |
240 | | static gboolean |
241 | | sniff_mp4 (SoupContentSniffer *sniffer, GBytes *buffer) |
242 | 4.00k | { |
243 | 4.00k | gsize resource_length; |
244 | 4.00k | const char *resource = g_bytes_get_data (buffer, &resource_length); |
245 | 4.00k | resource_length = MIN (512, resource_length); |
246 | 4.00k | guint32 box_size; |
247 | 4.00k | guint i; |
248 | | |
249 | 4.00k | if (resource_length < sizeof (guint32)) |
250 | 584 | return FALSE; |
251 | | |
252 | 3.42k | box_size = *((guint32*)resource); |
253 | | |
254 | 3.42k | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
255 | 3.42k | box_size = ((box_size >> 24) | |
256 | 3.42k | ((box_size << 8) & 0x00FF0000) | |
257 | 3.42k | ((box_size >> 8) & 0x0000FF00) | |
258 | 3.42k | (box_size << 24)); |
259 | 3.42k | #endif |
260 | | |
261 | 3.42k | if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0) |
262 | 2.83k | return FALSE; |
263 | | |
264 | 590 | if (!data_has_prefix (resource + 4, "ftyp", resource_length - 4)) |
265 | 109 | return FALSE; |
266 | | |
267 | 481 | if (!data_has_prefix (resource + 8, "mp4", resource_length - 8)) |
268 | 190 | return FALSE; |
269 | | |
270 | 2.21k | for (i = 16; i < box_size && i < resource_length; i = i + 4) { |
271 | 1.93k | if (data_has_prefix (resource + i, "mp4", resource_length - i)) |
272 | 13 | return TRUE; |
273 | 1.93k | } |
274 | | |
275 | 278 | return FALSE; |
276 | 291 | } |
277 | | |
278 | | static char* |
279 | | sniff_audio_video (SoupContentSniffer *sniffer, GBytes *buffer) |
280 | 4.01k | { |
281 | 4.01k | char *sniffed_type; |
282 | | |
283 | 4.01k | sniffed_type = sniff_media (sniffer, |
284 | 4.01k | buffer, |
285 | 4.01k | audio_video_types_table, |
286 | 4.01k | G_N_ELEMENTS (audio_video_types_table)); |
287 | | |
288 | 4.01k | if (sniffed_type != NULL) |
289 | 7 | return sniffed_type; |
290 | | |
291 | 4.00k | if (sniff_mp4 (sniffer, buffer)) |
292 | 13 | return g_strdup ("video/mp4"); |
293 | | |
294 | 3.99k | return NULL; |
295 | 4.00k | } |
296 | | |
297 | | /* This table is based on the MIMESNIFF spec; |
298 | | * See 7.1 Identifying a resource with an unknown MIME type |
299 | | */ |
/* One row of types_table, as consumed by sniff_unknown(). */
typedef struct {
        /* @has_ws is TRUE if @pattern contains "generic" whitespace */
        gboolean      has_ws;
        /* @has_tag_termination is TRUE if we should check for a tag-terminating
         * byte (0x20 " " or 0x3E ">") after the pattern match.
         */
        gboolean      has_tag_termination;
        /* Byte mask ANDed with the resource before comparing. */
        const guchar *mask;
        /* Expected bytes after masking. */
        const guchar *pattern;
        /* For rows without @has_ws this is the number of bytes compared;
         * for rows with @has_ws, sniff_unknown() walks pattern indices
         * 0..@pattern_length inclusive.
         */
        guint         pattern_length;
        /* MIME type reported when the row matches. */
        const char   *sniffed_type;
        /* Rows with @scriptable set are skipped when sniff_unknown() is
         * called with sniff_scriptable == FALSE.
         */
        gboolean      scriptable;
} SoupContentSnifferPattern;
313 | | |
314 | | |
315 | | /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space |
316 | | * is allowed. Those spaces are marked with \x00 on the mask. |
317 | | */ |
318 | | static SoupContentSnifferPattern types_table[] = { |
319 | | /* Scriptable types. */ |
320 | | |
321 | | { TRUE, TRUE, |
322 | | (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", |
323 | | (const guchar *)" <!DOCTYPE HTML", |
324 | | 14, |
325 | | "text/html", |
326 | | TRUE }, |
327 | | |
328 | | { TRUE, TRUE, |
329 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", |
330 | | (const guchar *)" <HTML", |
331 | | 5, |
332 | | "text/html", |
333 | | TRUE }, |
334 | | |
335 | | { TRUE, TRUE, |
336 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", |
337 | | (const guchar *)" <HEAD", |
338 | | 5, |
339 | | "text/html", |
340 | | TRUE }, |
341 | | |
342 | | { TRUE, TRUE, |
343 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF", |
344 | | (const guchar *)" <SCRIPT", |
345 | | 7, |
346 | | "text/html", |
347 | | TRUE }, |
348 | | |
349 | | { TRUE, TRUE, |
350 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF", |
351 | | (const guchar *)" <IFRAME", |
352 | | 7, |
353 | | "text/html", |
354 | | TRUE }, |
355 | | |
356 | | { TRUE, TRUE, |
357 | | (const guchar *)"\x00\xFF\xDF\xFF", |
358 | | (const guchar *)" <H1", |
359 | | 3, |
360 | | "text/html", |
361 | | TRUE }, |
362 | | |
363 | | { TRUE, TRUE, |
364 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF", |
365 | | (const guchar *)" <DIV", |
366 | | 4, |
367 | | "text/html", |
368 | | TRUE }, |
369 | | |
370 | | { TRUE, TRUE, |
371 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", |
372 | | (const guchar *)" <FONT", |
373 | | 5, |
374 | | "text/html", |
375 | | TRUE }, |
376 | | |
377 | | { TRUE, TRUE, |
378 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", |
379 | | (const guchar *)" <TABLE", |
380 | | 6, |
381 | | "text/html", |
382 | | TRUE }, |
383 | | |
384 | | { TRUE, TRUE, |
385 | | (const guchar *)"\x00\xFF\xDF", |
386 | | (const guchar *)" <A", |
387 | | 2, |
388 | | "text/html", |
389 | | TRUE }, |
390 | | |
391 | | { TRUE, TRUE, |
392 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", |
393 | | (const guchar *)" <STYLE", |
394 | | 6, |
395 | | "text/html", |
396 | | TRUE }, |
397 | | |
398 | | { TRUE, TRUE, |
399 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", |
400 | | (const guchar *)" <TITLE", |
401 | | 6, |
402 | | "text/html", |
403 | | TRUE }, |
404 | | |
405 | | { TRUE, TRUE, |
406 | | (const guchar *)"\x00\xFF\xDF", |
407 | | (const guchar *)" <B", |
408 | | 2, |
409 | | "text/html", |
410 | | TRUE }, |
411 | | |
412 | | { TRUE, TRUE, |
413 | | (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", |
414 | | (const guchar *)" <BODY", |
415 | | 5, |
416 | | "text/html", |
417 | | TRUE }, |
418 | | |
419 | | { TRUE, TRUE, |
420 | | (const guchar *)"\x00\xFF\xDF\xDF", |
421 | | (const guchar *)" <BR", |
422 | | 3, |
423 | | "text/html", |
424 | | TRUE }, |
425 | | |
426 | | { TRUE, TRUE, |
427 | | (const guchar *)"\x00\xFF\xDF", |
428 | | (const guchar *)" <P", |
429 | | 2, |
430 | | "text/html", |
431 | | TRUE }, |
432 | | |
433 | | { TRUE, TRUE, |
434 | | (const guchar *)"\x00\xFF\xFF\xFF\xFF", |
435 | | (const guchar *)" <!--", |
436 | | 4, |
437 | | "text/html", |
438 | | TRUE }, |
439 | | |
440 | | { TRUE, FALSE, |
441 | | (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF", |
442 | | (const guchar *)" <?xml", |
443 | | 5, |
444 | | "text/xml", |
445 | | TRUE }, |
446 | | |
447 | | { FALSE, FALSE, |
448 | | (const guchar *)"\xFF\xFF\xFF\xFF\xFF", |
449 | | (const guchar *)"%PDF-", |
450 | | 5, |
451 | | "application/pdf", |
452 | | TRUE }, |
453 | | |
454 | | /* Non-scriptable types. */ |
455 | | { FALSE, FALSE, |
456 | | (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
457 | | (const guchar *)"%!PS-Adobe-", |
458 | | 11, |
459 | | "application/postscript", |
460 | | FALSE }, |
461 | | |
462 | | { FALSE, FALSE, /* UTF-16BE BOM */ |
463 | | (const guchar *)"\xFF\xFF\x00\x00", |
464 | | (const guchar *)"\xFE\xFF\x00\x00", |
465 | | 4, |
466 | | "text/plain", |
467 | | FALSE }, |
468 | | |
469 | | { FALSE, FALSE, /* UTF-16LE BOM */ |
470 | | (const guchar *)"\xFF\xFF\x00\x00", |
471 | | (const guchar *)"\xFF\xFE\x00\x00", |
472 | | 4, |
473 | | "text/plain", |
474 | | FALSE }, |
475 | | |
476 | | { FALSE, FALSE, /* UTF-8 BOM */ |
477 | | (const guchar *)"\xFF\xFF\xFF\x00", |
478 | | (const guchar *)"\xEF\xBB\xBF\x00", |
479 | | 4, |
480 | | "text/plain", |
481 | | FALSE }, |
482 | | }; |
483 | | |
484 | | /* Whether a given byte looks like it might be part of binary content. |
485 | | * Source: HTML5 spec; borrowed from the Chromium mime sniffer code, |
486 | | * which is BSD-licensed |
487 | | */ |
/* Lookup table indexed by byte value: 1 when the byte is treated as a
 * sign of binary content, 0 otherwise. Only control bytes below 0x20 are
 * flagged, with 0x09, 0x0A, 0x0C, 0x0D and 0x1B left unflagged.
 */
static char byte_looks_binary[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  /* 0x00 - 0x0F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  /* 0x10 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x20 - 0x2F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x30 - 0x3F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 - 0x4F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x50 - 0x5F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 - 0x6F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x70 - 0x7F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 - 0x8F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0 - 0xAF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0 - 0xBF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC0 - 0xCF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD0 - 0xDF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xE0 - 0xEF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF0 - 0xFF */
};
506 | | |
507 | | /* HTML5: 2.7.4 Content-Type sniffing: unknown type */ |
508 | | static char* |
509 | | sniff_unknown (SoupContentSniffer *sniffer, GBytes *buffer, |
510 | | gboolean sniff_scriptable) |
511 | 2.97k | { |
512 | 2.97k | char *sniffed_type = NULL; |
513 | 2.97k | gsize resource_length; |
514 | 2.97k | const guchar *resource = g_bytes_get_data (buffer, &resource_length); |
515 | 2.97k | resource_length = MIN (512, resource_length); |
516 | 2.97k | guint i; |
517 | | |
518 | 2.97k | if (resource_length == 0) |
519 | 0 | return g_strdup ("text/plain"); |
520 | | |
521 | 70.0k | for (i = 0; i < G_N_ELEMENTS (types_table); i++) { |
522 | 67.2k | SoupContentSnifferPattern *type_row = &(types_table[i]); |
523 | | |
524 | 67.2k | if (!sniff_scriptable && type_row->scriptable) |
525 | 0 | continue; |
526 | | |
527 | 67.2k | if (type_row->has_ws) { |
528 | 52.8k | guint index_stream = 0; |
529 | 52.8k | guint index_pattern = 0; |
530 | 52.8k | gboolean skip_row = FALSE; |
531 | | |
532 | 232k | while ((index_stream < resource_length - 1) && |
533 | 229k | (index_pattern <= type_row->pattern_length)) { |
534 | | /* Skip insignificant white space ("WS" in the spec) */ |
535 | 229k | if (type_row->pattern[index_pattern] == ' ') { |
536 | 142k | if (resource[index_stream] == '\x09' || |
537 | 130k | resource[index_stream] == '\x0a' || |
538 | 114k | resource[index_stream] == '\x0c' || |
539 | 104k | resource[index_stream] == '\x0d' || |
540 | 61.2k | resource[index_stream] == '\x20') |
541 | 91.9k | index_stream++; |
542 | 50.7k | else |
543 | 50.7k | index_pattern++; |
544 | 142k | } else { |
545 | 87.0k | if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) { |
546 | 50.1k | skip_row = TRUE; |
547 | 50.1k | break; |
548 | 50.1k | } |
549 | 36.8k | index_pattern++; |
550 | 36.8k | index_stream++; |
551 | 36.8k | } |
552 | 229k | } |
553 | | |
554 | 52.8k | if (skip_row) |
555 | 50.1k | continue; |
556 | | |
557 | 2.66k | if (index_pattern > type_row->pattern_length) { |
558 | 348 | if (type_row->has_tag_termination && |
559 | 333 | resource[index_stream] != '\x20' && |
560 | 304 | resource[index_stream] != '\x3E') |
561 | 262 | continue; |
562 | | |
563 | 86 | return g_strdup (type_row->sniffed_type); |
564 | 348 | } |
565 | 14.4k | } else { |
566 | 14.4k | guint j; |
567 | | |
568 | 14.4k | if (resource_length < type_row->pattern_length) |
569 | 3.07k | continue; |
570 | | |
571 | 11.5k | for (j = 0; j < type_row->pattern_length; j++) { |
572 | 11.5k | if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) |
573 | 11.3k | break; |
574 | 11.5k | } |
575 | | |
576 | | /* This means our comparison above matched completely */ |
577 | 11.3k | if (j == type_row->pattern_length) |
578 | 22 | return g_strdup (type_row->sniffed_type); |
579 | 11.3k | } |
580 | 67.2k | } |
581 | | |
582 | 2.86k | sniffed_type = sniff_images (sniffer, buffer); |
583 | | |
584 | 2.86k | if (sniffed_type != NULL) |
585 | 57 | return sniffed_type; |
586 | | |
587 | 2.81k | sniffed_type = sniff_audio_video (sniffer, buffer); |
588 | | |
589 | 2.81k | if (sniffed_type != NULL) |
590 | 11 | return sniffed_type; |
591 | | |
592 | 72.2k | for (i = 0; i < resource_length; i++) { |
593 | 71.0k | if (byte_looks_binary[resource[i]]) |
594 | 1.59k | return g_strdup ("application/octet-stream"); |
595 | 71.0k | } |
596 | | |
597 | 1.20k | return g_strdup ("text/plain"); |
598 | 2.79k | } |
599 | | |
600 | | /* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */ |
601 | | static char* |
602 | | sniff_text_or_binary (SoupContentSniffer *sniffer, GBytes *buffer) |
603 | 1.20k | { |
604 | 1.20k | gsize resource_length; |
605 | 1.20k | const guchar *resource = g_bytes_get_data (buffer, &resource_length); |
606 | 1.20k | resource_length = MIN (512, resource_length); |
607 | 1.20k | gboolean looks_binary = FALSE; |
608 | 1.20k | int i; |
609 | | |
610 | | /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */ |
611 | 1.20k | if (resource_length >= 2) { |
612 | 1.20k | if ((resource[0] == 0xFE && resource[1] == 0xFF) || |
613 | 1.19k | (resource[0] == 0xFF && resource[1] == 0xFE)) |
614 | 8 | return g_strdup ("text/plain"); |
615 | 1.20k | } |
616 | | |
617 | | /* 3. UTF-8 BOM. */ |
618 | 1.19k | if (resource_length >= 3) { |
619 | 1.13k | if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) |
620 | 5 | return g_strdup ("text/plain"); |
621 | 1.13k | } |
622 | | |
623 | | /* 4. Look to see if any of the first n bytes looks binary */ |
624 | 31.3k | for (i = 0; i < resource_length; i++) { |
625 | 30.7k | if (byte_looks_binary[resource[i]]) { |
626 | 565 | looks_binary = TRUE; |
627 | 565 | break; |
628 | 565 | } |
629 | 30.7k | } |
630 | | |
631 | 1.19k | if (!looks_binary) |
632 | 627 | return g_strdup ("text/plain"); |
633 | | |
634 | | /* 5. Execute 7.1 Identifying a resource with an unknown MIME type. |
635 | | * TODO: sniff-scriptable needs to be unset. |
636 | | */ |
637 | 565 | return sniff_unknown (sniffer, buffer, TRUE); |
638 | 1.19k | } |
639 | | |
640 | | static gboolean |
641 | | skip_insignificant_space (const char *resource, gsize *pos, gsize resource_length) |
642 | 2.94k | { |
643 | 2.94k | if (*pos >= resource_length) |
644 | 33 | return TRUE; |
645 | | |
646 | 7.96k | while ((resource[*pos] == '\x09') || |
647 | 6.64k | (resource[*pos] == '\x20') || |
648 | 5.62k | (resource[*pos] == '\x0A') || |
649 | 5.16k | (resource[*pos] == '\x0D')) { |
650 | 5.16k | *pos = *pos + 1; |
651 | | |
652 | 5.16k | if (*pos >= resource_length) |
653 | 113 | return TRUE; |
654 | 5.16k | } |
655 | | |
656 | 2.80k | return FALSE; |
657 | 2.91k | } |
658 | | |
/* Distinguishes an RSS/Atom feed from HTML for a resource labelled
 * text/html, scanning at most the first 512 bytes of @buffer.
 *
 * The scan skips an optional UTF-8 BOM, whitespace, comments ("<!--" up
 * to "-->"), "<!...>" declarations and "<?...?>" processing
 * instructions, then inspects the first remaining tag:
 *  - "<rss"     -> "application/rss+xml"
 *  - "<feed"    -> "application/atom+xml"
 *  - "<rdf:RDF" followed by both the RSS 1.0 and RDF namespace
 *    declarations, in either order -> "application/rss+xml"
 * Anything else, including running out of data at any point, yields
 * "text/html".
 *
 * Returns: a newly allocated MIME type string.
 */
static char*
sniff_feed_or_html (SoupContentSniffer *sniffer, GBytes *buffer)
{
        gsize resource_length;
        const char *resource = g_bytes_get_data (buffer, &resource_length);
        resource_length = MIN (512, resource_length);
        gsize pos = 0;

        if (resource_length < 3)
                goto text_html;

        /* Skip a leading UTF-8 BOM */
        if ((guchar)resource[0] == 0xEF && (guchar)resource[1] == 0xBB && (guchar)resource[2] == 0xBF)
                pos = 3;

 look_for_tag:
        /* Re-entered after each skipped comment, declaration or
         * processing instruction.
         */
        if (skip_insignificant_space (resource, &pos, resource_length))
                goto text_html;

        /* First significant byte is not the start of a tag. */
        if (resource[pos] != '<')
                return g_strdup ("text/html");

        pos++;

        if ((pos + 2) > resource_length)
                goto text_html;

        /* Skip comments. */
        if (data_has_prefix (resource + pos, "!--", resource_length - pos)) {
                pos = pos + 3;

                if ((pos + 2) > resource_length)
                        goto text_html;

                /* Advance one byte at a time until the closing "-->". */
                while (!data_has_prefix (resource + pos, "-->", resource_length - pos)) {
                        pos++;

                        if ((pos + 2) > resource_length)
                                goto text_html;
                }

                pos = pos + 3;

                goto look_for_tag;
        }

        if (pos > resource_length)
                goto text_html;

        if (resource[pos] == '!') {
                /* "<!...>" declaration: skip up to and including '>'. */
                do {
                        pos++;

                        if ((pos + 1) > resource_length)
                                goto text_html;
                } while (resource[pos] != '>');

                pos++;

                goto look_for_tag;
        } else if (resource[pos] == '?') {
                /* "<?...?>" processing instruction: skip past "?>". */
                do {
                        pos++;

                        if ((pos + 1) > resource_length)
                                goto text_html;
                } while (!data_has_prefix (resource + pos, "?>", resource_length - pos));

                pos = pos + 2;

                goto look_for_tag;
        }

        if ((pos + 3) > resource_length)
                goto text_html;

        if (data_has_prefix (resource + pos, "rss", resource_length - pos))
                return g_strdup ("application/rss+xml");

        if ((pos + 4) > resource_length)
                goto text_html;

        if (data_has_prefix (resource + pos, "feed", resource_length - pos))
                return g_strdup ("application/atom+xml");

        if ((pos + 7) > resource_length)
                goto text_html;

        if (data_has_prefix (resource + pos, "rdf:RDF", resource_length - pos)) {
                pos = pos + 7;

                if (skip_insignificant_space (resource, &pos, resource_length))
                        goto text_html;

                /* 32 is the length of the RSS 1.0 namespace attribute. */
                if ((pos + 32) > resource_length)
                        goto text_html;

                /* Order 1: RSS namespace first, RDF namespace second. */
                if (data_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"", resource_length - pos)) {
                        pos = pos + 32;

                        if (skip_insignificant_space (resource, &pos, resource_length))
                                goto text_html;

                        /* 55 is the length of the RDF namespace attribute. */
                        if ((pos + 55) > resource_length)
                                goto text_html;

                        if (data_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"", resource_length - pos))
                                return g_strdup ("application/rss+xml");
                }

                if ((pos + 55) > resource_length)
                        goto text_html;

                /* Order 2: RDF namespace first, RSS namespace second. */
                if (data_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"", resource_length - pos)) {
                        pos = pos + 55;

                        if (skip_insignificant_space (resource, &pos, resource_length))
                                goto text_html;

                        if ((pos + 32) > resource_length)
                                goto text_html;

                        if (data_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"", resource_length - pos))
                                return g_strdup ("application/rss+xml");
                }
        }

 text_html:
        return g_strdup ("text/html");
}
789 | | |
790 | | /** |
791 | | * soup_content_sniffer_sniff: |
792 | | * @sniffer: a #SoupContentSniffer |
793 | | * @msg: the message to sniff |
794 | | * @buffer: a buffer containing the start of @msg's response body |
795 | | * @params: (element-type utf8 utf8) (out) (transfer full) (nullable): return |
796 | | * location for Content-Type parameters (eg, "charset"), or %NULL |
797 | | * |
798 | | * Sniffs @buffer to determine its Content-Type. |
799 | | * |
800 | | * The result may also be influenced by the Content-Type declared in @msg's |
801 | | * response headers. |
802 | | * |
803 | | * Returns: the sniffed Content-Type of @buffer; this will never be %NULL, |
804 | | * but may be `application/octet-stream`. |
805 | | */ |
806 | | char * |
807 | | soup_content_sniffer_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, |
808 | | GBytes *buffer, GHashTable **params) |
809 | 8.43k | { |
810 | 8.43k | const char *content_type; |
811 | 8.43k | const char *x_content_type_options; |
812 | 8.43k | char *sniffed_type = NULL; |
813 | 8.43k | gboolean no_sniff = FALSE; |
814 | | |
815 | 8.43k | content_type = soup_message_headers_get_content_type (soup_message_get_response_headers (msg), params); |
816 | | |
817 | | /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ |
818 | | |
819 | 8.43k | x_content_type_options = soup_message_headers_get_one_common (soup_message_get_response_headers (msg), SOUP_HEADER_X_CONTENT_TYPE_OPTIONS); |
820 | 8.43k | if (!g_strcmp0 (x_content_type_options, "nosniff")) |
821 | 0 | no_sniff = TRUE; |
822 | | |
823 | | /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */ |
824 | 8.43k | if ((content_type == NULL) || |
825 | 7.23k | !g_ascii_strcasecmp (content_type, "unknown/unknown") || |
826 | 7.23k | !g_ascii_strcasecmp (content_type, "application/unknown") || |
827 | 6.02k | !g_ascii_strcasecmp (content_type, "*/*")) |
828 | 2.41k | return sniff_unknown (sniffer, buffer, !no_sniff); |
829 | | |
830 | | /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ |
831 | 6.02k | if (no_sniff) |
832 | 0 | return g_strdup (content_type); |
833 | | |
834 | | /* 3. check-for-apache-bug */ |
835 | 6.02k | if ((content_type != NULL) && |
836 | 6.02k | (g_str_equal (content_type, "text/plain") || |
837 | 4.82k | g_str_equal (content_type, "text/plain; charset=ISO-8859-1") || |
838 | 4.82k | g_str_equal (content_type, "text/plain; charset=iso-8859-1") || |
839 | 4.82k | g_str_equal (content_type, "text/plain; charset=UTF-8"))) |
840 | 1.20k | return sniff_text_or_binary (sniffer, buffer); |
841 | | |
842 | | /* 4. XML types sent by the server are always used. */ |
843 | 4.82k | if (g_str_has_suffix (content_type, "+xml") || |
844 | 4.82k | !g_ascii_strcasecmp (content_type, "text/xml") || |
845 | 3.61k | !g_ascii_strcasecmp (content_type, "application/xml")) |
846 | 1.20k | return g_strdup (content_type); |
847 | | |
848 | | /* 5. Distinguish feed from HTML. */ |
849 | 3.61k | if (!g_ascii_strcasecmp (content_type, "text/html")) |
850 | 1.20k | return sniff_feed_or_html (sniffer, buffer); |
851 | | |
852 | | /* 6. Image types. |
853 | | */ |
854 | 2.41k | if (!g_ascii_strncasecmp (content_type, "image/", 6)) { |
855 | 1.20k | sniffed_type = sniff_images (sniffer, buffer); |
856 | 1.20k | if (sniffed_type != NULL) |
857 | 20 | return sniffed_type; |
858 | 1.18k | return g_strdup (content_type); |
859 | 1.20k | } |
860 | | |
861 | | /* 7. Audio and video types. */ |
862 | 1.20k | if (!g_ascii_strncasecmp (content_type, "audio/", 6) || |
863 | 1.20k | !g_ascii_strncasecmp (content_type, "video/", 6) || |
864 | 1.20k | !g_ascii_strcasecmp (content_type, "application/ogg")) { |
865 | 1.20k | sniffed_type = sniff_audio_video (sniffer, buffer); |
866 | 1.20k | if (sniffed_type != NULL) |
867 | 9 | return sniffed_type; |
868 | 1.19k | return g_strdup (content_type); |
869 | 1.20k | } |
870 | | |
871 | | /* If we got text/plain, use text_or_binary */ |
872 | 0 | if (g_str_equal (content_type, "text/plain")) { |
873 | 0 | return sniff_text_or_binary (sniffer, buffer); |
874 | 0 | } |
875 | | |
876 | 0 | return g_strdup (content_type); |
877 | 0 | } |
878 | | |
879 | | static void |
880 | | soup_content_sniffer_request_queued (SoupSessionFeature *feature, |
881 | | SoupMessage *msg) |
882 | 0 | { |
883 | 0 | soup_message_set_content_sniffer (msg, SOUP_CONTENT_SNIFFER (feature)); |
884 | 0 | } |
885 | | |
/* Session-feature hook: clears the content sniffer set on @msg when the
 * message leaves the queue.
 */
static void
soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
                                       SoupMessage *msg)
{
        soup_message_set_content_sniffer (msg, NULL);
}
892 | | |
/* Class initializer: intentionally empty, no virtual methods or
 * properties are installed here.
 */
static void
soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
{
}
897 | | |
898 | | static void |
899 | | soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, |
900 | | gpointer interface_data) |
901 | 1 | { |
902 | 1 | feature_interface->request_queued = soup_content_sniffer_request_queued; |
903 | 1 | feature_interface->request_unqueued = soup_content_sniffer_request_unqueued; |
904 | 1 | } |
905 | | |
906 | | /** |
907 | | * soup_content_sniffer_new: |
908 | | * |
909 | | * Creates a new [class@ContentSniffer]. |
910 | | * |
911 | | * Returns: a new #SoupContentSniffer |
912 | | **/ |
913 | | SoupContentSniffer * |
914 | | soup_content_sniffer_new (void) |
915 | 1.20k | { |
916 | 1.20k | return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL); |
917 | 1.20k | } |