Coverage Report

Created: 2026-01-09 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libsoup/libsoup/content-sniffer/soup-content-sniffer.c
Line
Count
Source
1
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
2
/*
3
 * soup-content-sniffer.c
4
 *
5
 * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
6
 *
7
 * This code implements the following specification:
8
 *
9
 *  http://mimesniff.spec.whatwg.org/ as of 11 June 2013
10
 */
11
12
#ifdef HAVE_CONFIG_H
13
#include <config.h>
14
#endif
15
16
#include <string.h>
17
18
#include "soup-content-sniffer.h"
19
#include "soup-session-feature-private.h"
20
#include "soup-content-processor.h"
21
#include "soup-content-sniffer-stream.h"
22
#include "soup-message-private.h"
23
#include "soup-message-headers-private.h"
24
#include "soup-session-feature-private.h"
25
26
/**
27
 * SoupContentSniffer:
28
 *
29
 * Sniffs the mime type of messages.
30
 *
31
 * A [class@ContentSniffer] tries to detect the actual content type of
32
 * the files that are being downloaded by looking at some of the data
33
 * before the [class@Message] emits its [signal@Message::got-headers] signal.
34
 * [class@ContentSniffer] implements [iface@SessionFeature], so you can add
35
 * content sniffing to a session with [method@Session.add_feature] or
36
 * [method@Session.add_feature_by_type].
37
 **/
38
39
static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
40
41
static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
42
static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);
43
44
struct _SoupContentSniffer {
45
        GObject parent_instance;
46
};
47
48
1.20k
G_DEFINE_FINAL_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
49
1.20k
             G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
50
1.20k
                  soup_content_sniffer_session_feature_init)
51
1.20k
             G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
52
1.20k
                  soup_content_sniffer_content_processor_init))
53
1.20k
54
1.20k
55
1.20k
static GInputStream *
56
1.20k
soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
57
1.20k
               GInputStream *base_stream,
58
1.20k
               SoupMessage *msg,
59
1.20k
               GError **error)
60
1.20k
{
61
0
  return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
62
0
           "base-stream", base_stream,
63
0
           "message", msg,
64
0
           "sniffer", SOUP_CONTENT_SNIFFER (processor),
65
0
           NULL);
66
0
}
67
68
static void
69
soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
70
                                            gpointer interface_data)
71
1
{
72
1
  soup_content_sniffer_default_content_processor_interface =
73
1
    g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
74
75
1
  processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
76
1
  processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
77
1
}
78
79
static void
80
soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
81
1.20k
{
82
1.20k
}
83
84
typedef struct {
85
  const guchar *mask;
86
  const guchar *pattern;
87
  guint         pattern_length;
88
  const char   *sniffed_type;
89
} SoupContentSnifferMediaPattern;
90
91
static char*
92
sniff_media (SoupContentSniffer *sniffer,
93
       GBytes *buffer,
94
       SoupContentSnifferMediaPattern table[],
95
       int table_length)
96
8.08k
{
97
98
8.08k
        gsize resource_length;
99
8.08k
        const guchar *resource = g_bytes_get_data (buffer, &resource_length);
100
8.08k
        resource_length = MIN (512, resource_length);
101
8.08k
  int i;
102
103
72.1k
  for (i = 0; i < table_length; i++) {
104
64.1k
    SoupContentSnifferMediaPattern *type_row = &(table[i]);
105
64.1k
    guint j;
106
107
64.1k
    if (resource_length < type_row->pattern_length)
108
15.7k
      continue;
109
110
51.2k
    for (j = 0; j < type_row->pattern_length; j++) {
111
51.1k
      if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
112
48.3k
        break;
113
51.1k
    }
114
115
    /* This means our comparison above matched completely */
116
48.3k
    if (j == type_row->pattern_length)
117
84
      return g_strdup (type_row->sniffed_type);
118
48.3k
  }
119
120
8.00k
  return NULL;
121
8.08k
}
122
123
/* This table is based on the MIMESNIFF spec;
124
 * See 6.1 Matching an image type pattern
125
 */
126
static SoupContentSnifferMediaPattern image_types_table[] = {
127
128
  /* Windows icon signature. */
129
  { (const guchar *)"\xFF\xFF\xFF\xFF",
130
    (const guchar *)"\x00\x00\x01\x00",
131
    4,
132
    "image/x-icon" },
133
134
  /* Windows cursor signature. */
135
  { (const guchar *)"\xFF\xFF\xFF\xFF",
136
    (const guchar *)"\x00\x00\x02\x00",
137
    4,
138
    "image/x-icon" },
139
140
  /* BMP. */
141
  { (const guchar *)"\xFF\xFF",
142
    (const guchar *)"BM",
143
    2,
144
    "image/bmp" },
145
146
  /* GIFs. */
147
  { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
148
    (const guchar *)"GIF87a",
149
    6,
150
    "image/gif" },
151
152
  { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
153
    (const guchar *)"GIF89a",
154
    6,
155
    "image/gif" },
156
157
  /* WEBP. */
158
  { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
159
    (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
160
    14,
161
    "image/webp" },
162
163
  /* PNG. */
164
  { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
165
    (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
166
    8,
167
    "image/png" },
168
169
  /* JPEG. */
170
  { (const guchar *)"\xFF\xFF\xFF",
171
    (const guchar *)"\xFF\xD8\xFF",
172
    3,
173
    "image/jpeg" },
174
};
175
176
static char*
177
sniff_images (SoupContentSniffer *sniffer, GBytes *buffer)
178
4.07k
{
179
4.07k
  return sniff_media (sniffer,
180
4.07k
          buffer,
181
4.07k
          image_types_table,
182
4.07k
          G_N_ELEMENTS (image_types_table));
183
4.07k
}
184
185
/* This table is based on the MIMESNIFF spec;
186
 * See 6.2 Matching an audio or video type pattern
187
 */
188
static SoupContentSnifferMediaPattern audio_video_types_table[] = {
189
  { (const guchar *)"\xFF\xFF\xFF\xFF",
190
    (const guchar *)"\x1A\x45\xDF\xA3",
191
    4,
192
    "video/webm" },
193
194
  { (const guchar *)"\xFF\xFF\xFF\xFF",
195
    (const guchar *)".snd",
196
    4,
197
    "audio/basic" },
198
199
200
  { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
201
    (const guchar *)"FORM\0\0\0\0AIFF",
202
    12,
203
    "audio/aiff" },
204
205
  { (const guchar *)"\xFF\xFF\xFF",
206
    (const guchar *)"ID3",
207
    3,
208
    "audio/mpeg" },
209
210
  { (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
211
    (const guchar *)"OggS\0",
212
    5,
213
    "application/ogg" },
214
215
  { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
216
    (const guchar *)"MThd\x00\x00\x00\x06",
217
    8,
218
    "audio/midi" },
219
220
  { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
221
    (const guchar *)"RIFF\x00\x00\x00\x00AVI ",
222
    12,
223
    "video/avi" },
224
225
  { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
226
    (const guchar *)"RIFF\x00\x00\x00\x00WAVE",
227
    12,
228
    "audio/wave" },
229
};
230
231
static gboolean
232
data_has_prefix (const char *data, const char *prefix, gsize max_length)
233
16.9k
{
234
16.9k
        if (strlen (prefix) > max_length)
235
182
                return FALSE;
236
237
16.7k
        return memcmp (data, prefix, strlen (prefix)) == 0;
238
16.9k
}
239
240
static gboolean
241
sniff_mp4 (SoupContentSniffer *sniffer, GBytes *buffer)
242
4.00k
{
243
4.00k
  gsize resource_length;
244
4.00k
  const char *resource = g_bytes_get_data (buffer, &resource_length);
245
4.00k
  resource_length = MIN (512, resource_length);
246
4.00k
  guint32 box_size;
247
4.00k
  guint i;
248
249
4.00k
        if (resource_length < sizeof (guint32))
250
584
                return FALSE;
251
252
3.42k
  box_size = *((guint32*)resource);
253
254
3.42k
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
255
3.42k
  box_size = ((box_size >> 24) |
256
3.42k
        ((box_size << 8) & 0x00FF0000) |
257
3.42k
        ((box_size >> 8) & 0x0000FF00) |
258
3.42k
        (box_size << 24));
259
3.42k
#endif
260
261
3.42k
  if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0)
262
2.83k
    return FALSE;
263
264
590
  if (!data_has_prefix (resource + 4, "ftyp", resource_length - 4))
265
109
    return FALSE;
266
267
481
  if (!data_has_prefix (resource + 8, "mp4", resource_length - 8))
268
190
    return FALSE;
269
270
2.21k
  for (i = 16; i < box_size && i < resource_length; i = i + 4) {
271
1.93k
    if (data_has_prefix (resource + i, "mp4", resource_length - i))
272
13
      return TRUE;
273
1.93k
  }
274
275
278
  return FALSE;
276
291
}
277
278
static char*
279
sniff_audio_video (SoupContentSniffer *sniffer, GBytes *buffer)
280
4.01k
{
281
4.01k
  char *sniffed_type;
282
283
4.01k
  sniffed_type = sniff_media (sniffer,
284
4.01k
            buffer,
285
4.01k
            audio_video_types_table,
286
4.01k
            G_N_ELEMENTS (audio_video_types_table));
287
288
4.01k
  if (sniffed_type != NULL)
289
7
    return sniffed_type;
290
291
4.00k
  if (sniff_mp4 (sniffer, buffer))
292
13
    return g_strdup ("video/mp4");
293
294
3.99k
  return NULL;
295
4.00k
}
296
297
/* This table is based on the MIMESNIFF spec;
298
 * See 7.1 Identifying a resource with an unknown MIME type
299
 */
300
typedef struct {
301
  /* @has_ws is TRUE if @pattern contains "generic" whitespace */
302
  gboolean      has_ws;
303
  /* @has_tag_termination is TRUE if we should check for a tag-terminating
304
   * byte (0x20 " " or 0x3E ">") after the pattern match.
305
   */
306
  gboolean      has_tag_termination;
307
  const guchar *mask;
308
  const guchar *pattern;
309
  guint         pattern_length;
310
  const char   *sniffed_type;
311
  gboolean      scriptable;
312
} SoupContentSnifferPattern;
313
314
315
/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
316
 * is allowed. Those spaces are marked with \x00 on the mask.
317
 */
318
static SoupContentSnifferPattern types_table[] = {
319
  /* Scriptable types. */
320
321
  { TRUE, TRUE,
322
    (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
323
    (const guchar *)" <!DOCTYPE HTML",
324
    14,
325
    "text/html",
326
    TRUE },
327
328
  { TRUE, TRUE,
329
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
330
    (const guchar *)" <HTML",
331
    5,
332
    "text/html",
333
    TRUE },
334
335
  { TRUE, TRUE,
336
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
337
    (const guchar *)" <HEAD",
338
    5,
339
    "text/html",
340
    TRUE },
341
342
  { TRUE, TRUE,
343
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
344
    (const guchar *)" <SCRIPT",
345
    7,
346
    "text/html",
347
    TRUE },
348
349
  { TRUE, TRUE,
350
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
351
    (const guchar *)" <IFRAME",
352
    7,
353
    "text/html",
354
    TRUE },
355
356
  { TRUE, TRUE,
357
    (const guchar *)"\x00\xFF\xDF\xFF",
358
    (const guchar *)" <H1",
359
    3,
360
    "text/html",
361
    TRUE },
362
363
  { TRUE, TRUE,
364
    (const guchar *)"\x00\xFF\xDF\xDF\xDF",
365
    (const guchar *)" <DIV",
366
    4,
367
    "text/html",
368
    TRUE },
369
370
  { TRUE, TRUE,
371
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
372
    (const guchar *)" <FONT",
373
    5,
374
    "text/html",
375
    TRUE },
376
377
  { TRUE, TRUE,
378
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
379
    (const guchar *)" <TABLE",
380
    6,
381
    "text/html",
382
    TRUE },
383
384
  { TRUE, TRUE,
385
    (const guchar *)"\x00\xFF\xDF",
386
    (const guchar *)" <A",
387
    2,
388
    "text/html",
389
    TRUE },
390
391
  { TRUE, TRUE,
392
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
393
    (const guchar *)" <STYLE",
394
    6,
395
    "text/html",
396
    TRUE },
397
398
  { TRUE, TRUE,
399
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
400
    (const guchar *)" <TITLE",
401
    6,
402
    "text/html",
403
    TRUE },
404
405
  { TRUE, TRUE,
406
    (const guchar *)"\x00\xFF\xDF",
407
    (const guchar *)" <B",
408
    2,
409
    "text/html",
410
    TRUE },
411
412
  { TRUE, TRUE,
413
    (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
414
    (const guchar *)" <BODY",
415
    5,
416
    "text/html",
417
    TRUE },
418
419
  { TRUE, TRUE,
420
    (const guchar *)"\x00\xFF\xDF\xDF",
421
    (const guchar *)" <BR",
422
    3,
423
    "text/html",
424
    TRUE },
425
426
  { TRUE, TRUE,
427
    (const guchar *)"\x00\xFF\xDF",
428
    (const guchar *)" <P",
429
    2,
430
    "text/html",
431
    TRUE },
432
433
  { TRUE, TRUE,
434
    (const guchar *)"\x00\xFF\xFF\xFF\xFF",
435
    (const guchar *)" <!--",
436
    4,
437
    "text/html",
438
    TRUE },
439
440
  { TRUE, FALSE,
441
    (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
442
    (const guchar *)" <?xml",
443
    5,
444
    "text/xml",
445
    TRUE },
446
447
  { FALSE, FALSE,
448
    (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
449
    (const guchar *)"%PDF-",
450
    5,
451
    "application/pdf",
452
    TRUE },
453
454
  /* Non-scriptable types. */
455
  { FALSE, FALSE,
456
    (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
457
    (const guchar *)"%!PS-Adobe-",
458
    11,
459
    "application/postscript",
460
    FALSE },
461
462
  { FALSE, FALSE, /* UTF-16BE BOM */
463
    (const guchar *)"\xFF\xFF\x00\x00",
464
    (const guchar *)"\xFE\xFF\x00\x00",
465
    4,
466
    "text/plain",
467
    FALSE },
468
469
  { FALSE, FALSE, /* UTF-16LE BOM */
470
    (const guchar *)"\xFF\xFF\x00\x00",
471
    (const guchar *)"\xFF\xFE\x00\x00",
472
    4,
473
    "text/plain",
474
    FALSE },
475
476
  { FALSE, FALSE, /* UTF-8 BOM */
477
    (const guchar *)"\xFF\xFF\xFF\x00",
478
    (const guchar *)"\xEF\xBB\xBF\x00",
479
    4,
480
    "text/plain",
481
    FALSE },
482
};
483
484
/* Whether a given byte looks like it might be part of binary content.
485
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
486
 * which is BSD-licensed
487
 */
488
/* Lookup table indexed by byte value: 1 marks a byte that looks binary.
 * Only some control characters below 0x20 are flagged; entries 0x20-0xFF
 * are not listed and are therefore zero-initialized.
 */
static char byte_looks_binary[256] = {
        /* 0x00 - 0x07 */ 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x08 - 0x0F */ 1, 0, 0, 1, 0, 0, 1, 1,
        /* 0x10 - 0x17 */ 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x18 - 0x1F */ 1, 1, 1, 0, 1, 1, 1, 1,
};
506
507
/* HTML5: 2.7.4 Content-Type sniffing: unknown type */
508
static char*
509
sniff_unknown (SoupContentSniffer *sniffer, GBytes *buffer,
510
         gboolean sniff_scriptable)
511
2.97k
{
512
2.97k
  char *sniffed_type = NULL;
513
2.97k
  gsize resource_length;
514
2.97k
  const guchar *resource = g_bytes_get_data (buffer, &resource_length);
515
2.97k
  resource_length = MIN (512, resource_length);
516
2.97k
  guint i;
517
518
2.97k
        if (resource_length == 0)
519
0
                return g_strdup ("text/plain");
520
521
70.0k
  for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
522
67.2k
    SoupContentSnifferPattern *type_row = &(types_table[i]);
523
524
67.2k
    if (!sniff_scriptable && type_row->scriptable)
525
0
      continue;
526
527
67.2k
    if (type_row->has_ws) {
528
52.8k
      guint index_stream = 0;
529
52.8k
      guint index_pattern = 0;
530
52.8k
      gboolean skip_row = FALSE;
531
532
232k
      while ((index_stream < resource_length - 1) &&
533
229k
             (index_pattern <= type_row->pattern_length)) {
534
        /* Skip insignificant white space ("WS" in the spec) */
535
229k
        if (type_row->pattern[index_pattern] == ' ') {
536
142k
          if (resource[index_stream] == '\x09' ||
537
130k
              resource[index_stream] == '\x0a' ||
538
114k
              resource[index_stream] == '\x0c' ||
539
104k
              resource[index_stream] == '\x0d' ||
540
61.2k
              resource[index_stream] == '\x20')
541
91.9k
            index_stream++;
542
50.7k
          else
543
50.7k
            index_pattern++;
544
142k
        } else {
545
87.0k
          if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
546
50.1k
            skip_row = TRUE;
547
50.1k
            break;
548
50.1k
          }
549
36.8k
          index_pattern++;
550
36.8k
          index_stream++;
551
36.8k
        }
552
229k
      }
553
554
52.8k
      if (skip_row)
555
50.1k
        continue;
556
557
2.66k
      if (index_pattern > type_row->pattern_length) {
558
348
        if (type_row->has_tag_termination &&
559
333
            resource[index_stream] != '\x20' &&
560
304
            resource[index_stream] != '\x3E')
561
262
          continue;
562
563
86
        return g_strdup (type_row->sniffed_type);
564
348
      }
565
14.4k
    } else {
566
14.4k
      guint j;
567
568
14.4k
      if (resource_length < type_row->pattern_length)
569
3.07k
        continue;
570
571
11.5k
      for (j = 0; j < type_row->pattern_length; j++) {
572
11.5k
        if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
573
11.3k
          break;
574
11.5k
      }
575
576
      /* This means our comparison above matched completely */
577
11.3k
      if (j == type_row->pattern_length)
578
22
        return g_strdup (type_row->sniffed_type);
579
11.3k
    }
580
67.2k
  }
581
582
2.86k
  sniffed_type = sniff_images (sniffer, buffer);
583
584
2.86k
  if (sniffed_type != NULL)
585
57
    return sniffed_type;
586
587
2.81k
  sniffed_type = sniff_audio_video (sniffer, buffer);
588
589
2.81k
  if (sniffed_type != NULL)
590
11
    return sniffed_type;
591
592
72.2k
  for (i = 0; i < resource_length; i++) {
593
71.0k
    if (byte_looks_binary[resource[i]])
594
1.59k
      return g_strdup ("application/octet-stream");
595
71.0k
  }
596
597
1.20k
  return g_strdup ("text/plain");
598
2.79k
}
599
600
/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
601
static char*
602
sniff_text_or_binary (SoupContentSniffer *sniffer, GBytes *buffer)
603
1.20k
{
604
1.20k
  gsize resource_length;
605
1.20k
  const guchar *resource = g_bytes_get_data (buffer, &resource_length);
606
1.20k
  resource_length = MIN (512, resource_length);
607
1.20k
  gboolean looks_binary = FALSE;
608
1.20k
  int i;
609
610
  /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
611
1.20k
  if (resource_length >= 2) {
612
1.20k
    if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
613
1.19k
        (resource[0] == 0xFF && resource[1] == 0xFE))
614
8
      return g_strdup ("text/plain");
615
1.20k
  }
616
617
  /* 3. UTF-8 BOM. */
618
1.19k
  if (resource_length >= 3) {
619
1.13k
    if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
620
5
      return g_strdup ("text/plain");
621
1.13k
  }
622
623
  /* 4. Look to see if any of the first n bytes looks binary */
624
31.3k
  for (i = 0; i < resource_length; i++) {
625
30.7k
    if (byte_looks_binary[resource[i]]) {
626
565
      looks_binary = TRUE;
627
565
      break;
628
565
    }
629
30.7k
  }
630
631
1.19k
  if (!looks_binary)
632
627
    return g_strdup ("text/plain");
633
634
  /* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
635
   * TODO: sniff-scriptable needs to be unset.
636
   */
637
565
  return sniff_unknown (sniffer, buffer, TRUE);
638
1.19k
}
639
640
static gboolean
641
skip_insignificant_space (const char *resource, gsize *pos, gsize resource_length)
642
2.94k
{
643
2.94k
        if (*pos >= resource_length)
644
33
          return TRUE;
645
646
7.96k
  while ((resource[*pos] == '\x09') ||
647
6.64k
         (resource[*pos] == '\x20') ||
648
5.62k
         (resource[*pos] == '\x0A') ||
649
5.16k
         (resource[*pos] == '\x0D')) {
650
5.16k
    *pos = *pos + 1;
651
652
5.16k
    if (*pos >= resource_length)
653
113
      return TRUE;
654
5.16k
  }
655
656
2.80k
  return FALSE;
657
2.91k
}
658
659
static char*
660
sniff_feed_or_html (SoupContentSniffer *sniffer, GBytes *buffer)
661
1.20k
{
662
1.20k
  gsize resource_length;
663
1.20k
  const char *resource = g_bytes_get_data (buffer, &resource_length);
664
1.20k
  resource_length = MIN (512, resource_length);
665
1.20k
  gsize pos = 0;
666
667
1.20k
  if (resource_length < 3)
668
65
    goto text_html;
669
670
  /* Skip a leading UTF-8 BOM */
671
1.14k
  if ((guchar)resource[0] == 0xEF && (guchar)resource[1] == 0xBB && (guchar)resource[2] == 0xBF)
672
5
    pos = 3;
673
674
2.43k
 look_for_tag:
675
2.43k
  if (skip_insignificant_space (resource, &pos, resource_length))
676
53
    goto text_html;
677
678
2.38k
  if (resource[pos] != '<')
679
327
    return g_strdup ("text/html");
680
681
2.05k
  pos++;
682
683
2.05k
  if ((pos + 2) > resource_length)
684
15
    goto text_html;
685
686
  /* Skip comments. */
687
2.03k
  if (data_has_prefix (resource + pos, "!--", resource_length - pos)) {
688
230
    pos = pos + 3;
689
690
230
    if ((pos + 2) > resource_length)
691
10
      goto text_html;
692
693
5.22k
    while (!data_has_prefix (resource + pos, "-->", resource_length - pos)) {
694
5.06k
      pos++;
695
696
5.06k
      if ((pos + 2) > resource_length)
697
59
        goto text_html;
698
5.06k
    }
699
700
161
    pos = pos + 3;
701
702
161
    goto look_for_tag;
703
220
  }
704
705
1.80k
  if (pos > resource_length)
706
0
    goto text_html;
707
708
1.80k
  if (resource[pos] == '!') {
709
11.3k
    do {
710
11.3k
      pos++;
711
712
11.3k
      if ((pos + 1) > resource_length)
713
49
        goto text_html;
714
11.3k
    } while (resource[pos] != '>');
715
716
794
    pos++;
717
718
794
    goto look_for_tag;
719
965
  } else if (resource[pos] == '?') {
720
5.04k
    do {
721
5.04k
      pos++;
722
723
5.04k
      if ((pos + 1) > resource_length)
724
62
        goto text_html;
725
5.04k
    } while (!data_has_prefix (resource + pos, "?>", resource_length - pos));
726
727
338
    pos = pos + 2;
728
729
338
    goto look_for_tag;
730
400
  }
731
732
565
  if ((pos + 3) > resource_length)
733
51
    goto text_html;
734
735
514
  if (data_has_prefix (resource + pos, "rss", resource_length - pos))
736
2
    return g_strdup ("application/rss+xml");
737
738
512
  if ((pos + 4) > resource_length)
739
79
    goto text_html;
740
741
433
  if (data_has_prefix (resource + pos, "feed", resource_length - pos))
742
1
    return g_strdup ("application/atom+xml");
743
744
432
  if ((pos + 7) > resource_length)
745
37
    goto text_html;
746
747
395
  if (data_has_prefix (resource + pos, "rdf:RDF", resource_length - pos)) {
748
346
    pos = pos + 7;
749
750
346
    if (skip_insignificant_space (resource, &pos, resource_length))
751
32
      goto text_html;
752
753
314
    if ((pos + 32) > resource_length)
754
47
      goto text_html;
755
756
267
    if (data_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"", resource_length - pos)) {
757
82
      pos = pos + 32;
758
759
82
      if (skip_insignificant_space (resource, &pos, resource_length))
760
30
        goto text_html;
761
762
52
      if ((pos + 55) > resource_length)
763
50
        goto text_html;
764
765
2
      if (data_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"", resource_length - pos))
766
1
        return g_strdup ("application/rss+xml");
767
2
    }
768
769
186
    if ((pos + 55) > resource_length)
770
88
      goto text_html;
771
772
98
    if (data_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"", resource_length - pos)) {
773
86
      pos = pos + 55;
774
775
86
      if (skip_insignificant_space (resource, &pos, resource_length))
776
31
        goto text_html;
777
778
55
      if ((pos + 32) > resource_length)
779
48
        goto text_html;
780
781
7
      if (data_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"", resource_length - pos))
782
1
        return g_strdup ("application/rss+xml");
783
7
    }
784
98
  }
785
786
873
 text_html:
787
873
  return g_strdup ("text/html");
788
395
}
789
790
/**
791
 * soup_content_sniffer_sniff:
792
 * @sniffer: a #SoupContentSniffer
793
 * @msg: the message to sniff
794
 * @buffer: a buffer containing the start of @msg's response body
795
 * @params: (element-type utf8 utf8) (out) (transfer full) (nullable): return
796
 *   location for Content-Type parameters (eg, "charset"), or %NULL
797
 *
798
 * Sniffs @buffer to determine its Content-Type.
799
 *
800
 * The result may also be influenced by the Content-Type declared in @msg's
801
 * response headers.
802
 *
803
 * Returns: the sniffed Content-Type of @buffer; this will never be %NULL,
804
 *   but may be `application/octet-stream`.
805
 */
806
char *
807
soup_content_sniffer_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
808
          GBytes *buffer, GHashTable **params)
809
8.43k
{
810
8.43k
  const char *content_type;
811
8.43k
  const char *x_content_type_options;
812
8.43k
  char *sniffed_type = NULL;
813
8.43k
  gboolean no_sniff = FALSE;
814
815
8.43k
  content_type = soup_message_headers_get_content_type (soup_message_get_response_headers (msg), params);
816
817
  /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
818
819
8.43k
  x_content_type_options = soup_message_headers_get_one_common (soup_message_get_response_headers (msg), SOUP_HEADER_X_CONTENT_TYPE_OPTIONS);
820
8.43k
  if (!g_strcmp0 (x_content_type_options, "nosniff"))
821
0
    no_sniff = TRUE;
822
823
  /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */
824
8.43k
  if ((content_type == NULL) ||
825
7.23k
      !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
826
7.23k
      !g_ascii_strcasecmp (content_type, "application/unknown") ||
827
6.02k
      !g_ascii_strcasecmp (content_type, "*/*"))
828
2.41k
    return sniff_unknown (sniffer, buffer, !no_sniff);
829
830
  /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
831
6.02k
  if (no_sniff)
832
0
    return g_strdup (content_type);
833
834
  /* 3. check-for-apache-bug */
835
6.02k
  if ((content_type != NULL) &&
836
6.02k
      (g_str_equal (content_type, "text/plain") ||
837
4.82k
       g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
838
4.82k
       g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
839
4.82k
       g_str_equal (content_type, "text/plain; charset=UTF-8")))
840
1.20k
    return sniff_text_or_binary (sniffer, buffer);
841
842
  /* 4. XML types sent by the server are always used. */
843
4.82k
  if (g_str_has_suffix (content_type, "+xml") ||
844
4.82k
      !g_ascii_strcasecmp (content_type, "text/xml") ||
845
3.61k
      !g_ascii_strcasecmp (content_type, "application/xml"))
846
1.20k
    return g_strdup (content_type);
847
848
  /* 5. Distinguish feed from HTML. */
849
3.61k
  if (!g_ascii_strcasecmp (content_type, "text/html"))
850
1.20k
    return sniff_feed_or_html (sniffer, buffer);
851
852
  /* 6. Image types.
853
   */
854
2.41k
  if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
855
1.20k
    sniffed_type = sniff_images (sniffer, buffer);
856
1.20k
    if (sniffed_type != NULL)
857
20
      return sniffed_type;
858
1.18k
    return g_strdup (content_type);
859
1.20k
  }
860
861
  /* 7. Audio and video types. */
862
1.20k
  if (!g_ascii_strncasecmp (content_type, "audio/", 6) ||
863
1.20k
      !g_ascii_strncasecmp (content_type, "video/", 6) ||
864
1.20k
      !g_ascii_strcasecmp (content_type, "application/ogg")) {
865
1.20k
          sniffed_type = sniff_audio_video (sniffer, buffer);
866
1.20k
          if (sniffed_type != NULL)
867
9
            return sniffed_type;
868
1.19k
    return g_strdup (content_type);
869
1.20k
        }
870
871
  /* If we got text/plain, use text_or_binary */
872
0
  if (g_str_equal (content_type, "text/plain")) {
873
0
    return sniff_text_or_binary (sniffer, buffer);
874
0
  }
875
876
0
  return g_strdup (content_type);
877
0
}
878
879
static void
880
soup_content_sniffer_request_queued (SoupSessionFeature *feature,
881
             SoupMessage        *msg)
882
0
{
883
0
  soup_message_set_content_sniffer (msg, SOUP_CONTENT_SNIFFER (feature));
884
0
}
885
886
static void
887
soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
888
               SoupMessage        *msg)
889
0
{
890
0
  soup_message_set_content_sniffer (msg, NULL);
891
0
}
892
893
static void
894
soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
895
1
{
896
1
}
897
898
static void
899
soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
900
             gpointer interface_data)
901
1
{
902
1
  feature_interface->request_queued = soup_content_sniffer_request_queued;
903
1
  feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
904
1
}
905
906
/**
907
 * soup_content_sniffer_new:
908
 *
909
 * Creates a new [class@ContentSniffer].
910
 *
911
 * Returns: a new #SoupContentSniffer
912
 **/
913
SoupContentSniffer *
914
soup_content_sniffer_new (void)
915
1.20k
{
916
1.20k
  return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
917
1.20k
}