1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * soup-content-sniffer.c
4 *
5 * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
6 *
7 * This code implements the following specification:
8 *
9 * http://mimesniff.spec.whatwg.org/ as of 11 June 2013
10 */
11
12 #ifdef HAVE_CONFIG_H
13 #include <config.h>
14 #endif
15
16 #include <string.h>
17
18 #include "soup-content-sniffer.h"
19 #include "soup.h"
20 #include "soup-content-processor.h"
21 #include "soup-content-sniffer-stream.h"
22 #include "soup-message-private.h"
23
24 /**
25 * SECTION:soup-content-sniffer
26 * @short_description: Content sniffing for SoupSession
27 *
28 * A #SoupContentSniffer tries to detect the actual content type of
29 * the files that are being downloaded by looking at some of the data
30 * before the #SoupMessage emits its #SoupMessage::got-headers signal.
31 * #SoupContentSniffer implements #SoupSessionFeature, so you can add
32 * content sniffing to a session with soup_session_add_feature() or
33 * soup_session_add_feature_by_type().
34 *
35 * Since: 2.28
36 **/
37
38 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
39
40 static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
41 static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);
42
43
G_DEFINE_TYPE_WITH_CODE(SoupContentSniffer,soup_content_sniffer,G_TYPE_OBJECT,G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,soup_content_sniffer_session_feature_init)G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,soup_content_sniffer_content_processor_init))44 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
45 G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
46 soup_content_sniffer_session_feature_init)
47 G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
48 soup_content_sniffer_content_processor_init))
49
50
51 static GInputStream *
52 soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
53 GInputStream *base_stream,
54 SoupMessage *msg,
55 GError **error)
56 {
57 return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
58 "base-stream", base_stream,
59 "message", msg,
60 "sniffer", SOUP_CONTENT_SNIFFER (processor),
61 NULL);
62 }
63
64 static void
soup_content_sniffer_content_processor_init(SoupContentProcessorInterface * processor_interface,gpointer interface_data)65 soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
66 gpointer interface_data)
67 {
68 soup_content_sniffer_default_content_processor_interface =
69 g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
70
71 processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
72 processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
73 }
74
75 static void
soup_content_sniffer_init(SoupContentSniffer * content_sniffer)76 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
77 {
78 }
79
80 typedef struct {
81 const guchar *mask;
82 const guchar *pattern;
83 guint pattern_length;
84 const char *sniffed_type;
85 } SoupContentSnifferMediaPattern;
86
87 static char*
sniff_media(SoupContentSniffer * sniffer,SoupBuffer * buffer,SoupContentSnifferMediaPattern table[],int table_length)88 sniff_media (SoupContentSniffer *sniffer,
89 SoupBuffer *buffer,
90 SoupContentSnifferMediaPattern table[],
91 int table_length)
92 {
93 const guchar *resource = (const guchar *)buffer->data;
94 guint resource_length = MIN (512, buffer->length);
95 int i;
96
97 for (i = 0; i < table_length; i++) {
98 SoupContentSnifferMediaPattern *type_row = &(table[i]);
99 guint j;
100
101 if (resource_length < type_row->pattern_length)
102 continue;
103
104 for (j = 0; j < type_row->pattern_length; j++) {
105 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
106 break;
107 }
108
109 /* This means our comparison above matched completely */
110 if (j == type_row->pattern_length)
111 return g_strdup (type_row->sniffed_type);
112 }
113
114 return NULL;
115 }
116
117 /* This table is based on the MIMESNIFF spec;
118 * See 6.1 Matching an image type pattern
119 */
120 static SoupContentSnifferMediaPattern image_types_table[] = {
121
122 /* Windows icon signature. */
123 { (const guchar *)"\xFF\xFF\xFF\xFF",
124 (const guchar *)"\x00\x00\x01\x00",
125 4,
126 "image/x-icon" },
127
128 /* Windows cursor signature. */
129 { (const guchar *)"\xFF\xFF\xFF\xFF",
130 (const guchar *)"\x00\x00\x02\x00",
131 4,
132 "image/x-icon" },
133
134 /* BMP. */
135 { (const guchar *)"\xFF\xFF",
136 (const guchar *)"BM",
137 2,
138 "image/bmp" },
139
140 /* GIFs. */
141 { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
142 (const guchar *)"GIF87a",
143 6,
144 "image/gif" },
145
146 { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
147 (const guchar *)"GIF89a",
148 6,
149 "image/gif" },
150
151 /* WEBP. */
152 { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
153 (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
154 14,
155 "image/webp" },
156
157 /* PNG. */
158 { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
159 (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
160 8,
161 "image/png" },
162
163 /* JPEG. */
164 { (const guchar *)"\xFF\xFF\xFF",
165 (const guchar *)"\xFF\xD8\xFF",
166 3,
167 "image/jpeg" },
168 };
169
170 static char*
sniff_images(SoupContentSniffer * sniffer,SoupBuffer * buffer)171 sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer)
172 {
173 return sniff_media (sniffer,
174 buffer,
175 image_types_table,
176 G_N_ELEMENTS (image_types_table));
177 }
178
179 /* This table is based on the MIMESNIFF spec;
180 * See 6.2 Matching an audio or video type pattern
181 */
182 static SoupContentSnifferMediaPattern audio_video_types_table[] = {
183 { (const guchar *)"\xFF\xFF\xFF\xFF",
184 (const guchar *)"\x1A\x45\xDF\xA3",
185 4,
186 "video/webm" },
187
188 { (const guchar *)"\xFF\xFF\xFF\xFF",
189 (const guchar *)".snd",
190 4,
191 "audio/basic" },
192
193
194 { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
195 (const guchar *)"FORM\0\0\0\0AIFF",
196 12,
197 "audio/aiff" },
198
199 { (const guchar *)"\xFF\xFF\xFF",
200 (const guchar *)"ID3",
201 3,
202 "audio/mpeg" },
203
204 { (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
205 (const guchar *)"OggS\0",
206 5,
207 "application/ogg" },
208
209 { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
210 (const guchar *)"MThd\x00\x00\x00\x06",
211 8,
212 "audio/midi" },
213
214 { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
215 (const guchar *)"RIFF\x00\x00\x00\x00AVI ",
216 12,
217 "video/avi" },
218
219 { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
220 (const guchar *)"RIFF\x00\x00\x00\x00WAVE",
221 12,
222 "audio/wave" },
223 };
224
225 static gboolean
sniff_mp4(SoupContentSniffer * sniffer,SoupBuffer * buffer)226 sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer)
227 {
228 const char *resource = (const char *)buffer->data;
229 guint resource_length = MIN (512, buffer->length);
230 guint32 box_size = *((guint32*)resource);
231 guint i;
232
233 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
234 box_size = ((box_size >> 24) |
235 ((box_size << 8) & 0x00FF0000) |
236 ((box_size >> 8) & 0x0000FF00) |
237 (box_size << 24));
238 #endif
239
240 if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0)
241 return FALSE;
242
243 if (!g_str_has_prefix (resource + 4, "ftyp"))
244 return FALSE;
245
246 if (!g_str_has_prefix (resource + 8, "mp4"))
247 return FALSE;
248
249 for (i = 16; i < box_size && i < resource_length; i = i + 4) {
250 if (g_str_has_prefix (resource + i, "mp4"))
251 return TRUE;
252 }
253
254 return FALSE;
255 }
256
257 static char*
sniff_audio_video(SoupContentSniffer * sniffer,SoupBuffer * buffer)258 sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer)
259 {
260 char *sniffed_type;
261
262 sniffed_type = sniff_media (sniffer,
263 buffer,
264 audio_video_types_table,
265 G_N_ELEMENTS (audio_video_types_table));
266
267 if (sniffed_type != NULL)
268 return sniffed_type;
269
270 if (sniff_mp4 (sniffer, buffer))
271 return g_strdup ("video/mp4");
272
273 return NULL;
274 }
275
276 /* This table is based on the MIMESNIFF spec;
277 * See 7.1 Identifying a resource with an unknown MIME type
278 */
279 typedef struct {
280 /* @has_ws is TRUE if @pattern contains "generic" whitespace */
281 gboolean has_ws;
282 /* @has_tag_termination is TRUE if we should check for a tag-terminating
283 * byte (0x20 " " or 0x3E ">") after the pattern match.
284 */
285 gboolean has_tag_termination;
286 const guchar *mask;
287 const guchar *pattern;
288 guint pattern_length;
289 const char *sniffed_type;
290 gboolean scriptable;
291 } SoupContentSnifferPattern;
292
293
294 /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
295 * is allowed. Those spaces are marked with \x00 on the mask.
296 */
297 static SoupContentSnifferPattern types_table[] = {
298 /* Scriptable types. */
299
300 { TRUE, TRUE,
301 (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
302 (const guchar *)" <!DOCTYPE HTML",
303 14,
304 "text/html",
305 TRUE },
306
307 { TRUE, TRUE,
308 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
309 (const guchar *)" <HTML",
310 5,
311 "text/html",
312 TRUE },
313
314 { TRUE, TRUE,
315 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
316 (const guchar *)" <HEAD",
317 5,
318 "text/html",
319 TRUE },
320
321 { TRUE, TRUE,
322 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
323 (const guchar *)" <SCRIPT",
324 7,
325 "text/html",
326 TRUE },
327
328 { TRUE, TRUE,
329 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
330 (const guchar *)" <IFRAME",
331 7,
332 "text/html",
333 TRUE },
334
335 { TRUE, TRUE,
336 (const guchar *)"\x00\xFF\xDF\xFF",
337 (const guchar *)" <H1",
338 3,
339 "text/html",
340 TRUE },
341
342 { TRUE, TRUE,
343 (const guchar *)"\x00\xFF\xDF\xDF\xDF",
344 (const guchar *)" <DIV",
345 4,
346 "text/html",
347 TRUE },
348
349 { TRUE, TRUE,
350 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
351 (const guchar *)" <FONT",
352 5,
353 "text/html",
354 TRUE },
355
356 { TRUE, TRUE,
357 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
358 (const guchar *)" <TABLE",
359 6,
360 "text/html",
361 TRUE },
362
363 { TRUE, TRUE,
364 (const guchar *)"\x00\xFF\xDF",
365 (const guchar *)" <A",
366 2,
367 "text/html",
368 TRUE },
369
370 { TRUE, TRUE,
371 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
372 (const guchar *)" <STYLE",
373 6,
374 "text/html",
375 TRUE },
376
377 { TRUE, TRUE,
378 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
379 (const guchar *)" <TITLE",
380 6,
381 "text/html",
382 TRUE },
383
384 { TRUE, TRUE,
385 (const guchar *)"\x00\xFF\xDF",
386 (const guchar *)" <B",
387 2,
388 "text/html",
389 TRUE },
390
391 { TRUE, TRUE,
392 (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
393 (const guchar *)" <BODY",
394 5,
395 "text/html",
396 TRUE },
397
398 { TRUE, TRUE,
399 (const guchar *)"\x00\xFF\xDF\xDF",
400 (const guchar *)" <BR",
401 3,
402 "text/html",
403 TRUE },
404
405 { TRUE, TRUE,
406 (const guchar *)"\x00\xFF\xDF",
407 (const guchar *)" <P",
408 2,
409 "text/html",
410 TRUE },
411
412 { TRUE, TRUE,
413 (const guchar *)"\x00\xFF\xFF\xFF\xFF",
414 (const guchar *)" <!--",
415 4,
416 "text/html",
417 TRUE },
418
419 { TRUE, FALSE,
420 (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
421 (const guchar *)" <?xml",
422 5,
423 "text/xml",
424 TRUE },
425
426 { FALSE, FALSE,
427 (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
428 (const guchar *)"%PDF-",
429 5,
430 "application/pdf",
431 TRUE },
432
433 /* Non-scriptable types. */
434 { FALSE, FALSE,
435 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
436 (const guchar *)"%!PS-Adobe-",
437 11,
438 "application/postscript",
439 FALSE },
440
441 { FALSE, FALSE, /* UTF-16BE BOM */
442 (const guchar *)"\xFF\xFF\x00\x00",
443 (const guchar *)"\xFE\xFF\x00\x00",
444 4,
445 "text/plain",
446 FALSE },
447
448 { FALSE, FALSE, /* UTF-16LE BOM */
449 (const guchar *)"\xFF\xFF\x00\x00",
450 (const guchar *)"\xFF\xFE\x00\x00",
451 4,
452 "text/plain",
453 FALSE },
454
455 { FALSE, FALSE, /* UTF-8 BOM */
456 (const guchar *)"\xFF\xFF\xFF\x00",
457 (const guchar *)"\xEF\xBB\xBF\x00",
458 4,
459 "text/plain",
460 FALSE },
461 };
462
/* Lookup table, indexed by byte value, telling whether a byte looks like
 * it might be part of binary content (1) or not (0).
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed
 *
 * Only control characters are flagged; 0x09, 0x0A, 0x0C, 0x0D and 0x1B
 * are the control characters that are not.
 */
#define SIXTEEN_TEXT_BYTES 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

static char byte_looks_binary[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  /* 0x00 - 0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  /* 0x10 - 0x1F */
	SIXTEEN_TEXT_BYTES,                              /* 0x20 - 0x2F */
	SIXTEEN_TEXT_BYTES,                              /* 0x30 - 0x3F */
	SIXTEEN_TEXT_BYTES,                              /* 0x40 - 0x4F */
	SIXTEEN_TEXT_BYTES,                              /* 0x50 - 0x5F */
	SIXTEEN_TEXT_BYTES,                              /* 0x60 - 0x6F */
	SIXTEEN_TEXT_BYTES,                              /* 0x70 - 0x7F */
	SIXTEEN_TEXT_BYTES,                              /* 0x80 - 0x8F */
	SIXTEEN_TEXT_BYTES,                              /* 0x90 - 0x9F */
	SIXTEEN_TEXT_BYTES,                              /* 0xA0 - 0xAF */
	SIXTEEN_TEXT_BYTES,                              /* 0xB0 - 0xBF */
	SIXTEEN_TEXT_BYTES,                              /* 0xC0 - 0xCF */
	SIXTEEN_TEXT_BYTES,                              /* 0xD0 - 0xDF */
	SIXTEEN_TEXT_BYTES,                              /* 0xE0 - 0xEF */
	SIXTEEN_TEXT_BYTES,                              /* 0xF0 - 0xFF */
};

#undef SIXTEEN_TEXT_BYTES
485
486 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
487 static char*
sniff_unknown(SoupContentSniffer * sniffer,SoupBuffer * buffer,gboolean sniff_scriptable)488 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
489 gboolean sniff_scriptable)
490 {
491 char *sniffed_type = NULL;
492 const guchar *resource = (const guchar *)buffer->data;
493 guint resource_length = MIN (512, buffer->length);
494 guint i;
495
496 for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
497 SoupContentSnifferPattern *type_row = &(types_table[i]);
498
499 if (!sniff_scriptable && type_row->scriptable)
500 continue;
501
502 if (type_row->has_ws) {
503 guint index_stream = 0;
504 guint index_pattern = 0;
505 gboolean skip_row = FALSE;
506
507 while ((index_stream < resource_length) &&
508 (index_pattern <= type_row->pattern_length)) {
509 /* Skip insignificant white space ("WS" in the spec) */
510 if (type_row->pattern[index_pattern] == ' ') {
511 if (resource[index_stream] == '\x09' ||
512 resource[index_stream] == '\x0a' ||
513 resource[index_stream] == '\x0c' ||
514 resource[index_stream] == '\x0d' ||
515 resource[index_stream] == '\x20')
516 index_stream++;
517 else
518 index_pattern++;
519 } else {
520 if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
521 skip_row = TRUE;
522 break;
523 }
524 index_pattern++;
525 index_stream++;
526 }
527 }
528
529 if (skip_row)
530 continue;
531
532 if (index_pattern > type_row->pattern_length) {
533 if (type_row->has_tag_termination &&
534 resource[index_stream] != '\x20' &&
535 resource[index_stream] != '\x3E')
536 continue;
537
538 return g_strdup (type_row->sniffed_type);
539 }
540 } else {
541 guint j;
542
543 if (resource_length < type_row->pattern_length)
544 continue;
545
546 for (j = 0; j < type_row->pattern_length; j++) {
547 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
548 break;
549 }
550
551 /* This means our comparison above matched completely */
552 if (j == type_row->pattern_length)
553 return g_strdup (type_row->sniffed_type);
554 }
555 }
556
557 sniffed_type = sniff_images (sniffer, buffer);
558
559 if (sniffed_type != NULL)
560 return sniffed_type;
561
562 sniffed_type = sniff_audio_video (sniffer, buffer);
563
564 if (sniffed_type != NULL)
565 return sniffed_type;
566
567 for (i = 0; i < resource_length; i++) {
568 if (byte_looks_binary[resource[i]])
569 return g_strdup ("application/octet-stream");
570 }
571
572 return g_strdup ("text/plain");
573 }
574
575 /* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
576 static char*
sniff_text_or_binary(SoupContentSniffer * sniffer,SoupBuffer * buffer)577 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
578 {
579 const guchar *resource = (const guchar *)buffer->data;
580 int resource_length = MIN (512, buffer->length);
581 gboolean looks_binary = FALSE;
582 int i;
583
584 /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
585 if (resource_length >= 2) {
586 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
587 (resource[0] == 0xFF && resource[1] == 0xFE))
588 return g_strdup ("text/plain");
589 }
590
591 /* 3. UTF-8 BOM. */
592 if (resource_length >= 3) {
593 if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
594 return g_strdup ("text/plain");
595 }
596
597 /* 4. Look to see if any of the first n bytes looks binary */
598 for (i = 0; i < resource_length; i++) {
599 if (byte_looks_binary[resource[i]]) {
600 looks_binary = TRUE;
601 break;
602 }
603 }
604
605 if (!looks_binary)
606 return g_strdup ("text/plain");
607
608 /* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
609 * TODO: sniff-scriptable needs to be unset.
610 */
611 return sniff_unknown (sniffer, buffer, TRUE);
612 }
613
614 static gboolean
skip_insignificant_space(const char * resource,int * pos,int resource_length)615 skip_insignificant_space (const char *resource, int *pos, int resource_length)
616 {
617 while ((resource[*pos] == '\x09') ||
618 (resource[*pos] == '\x20') ||
619 (resource[*pos] == '\x0A') ||
620 (resource[*pos] == '\x0D')) {
621 *pos = *pos + 1;
622
623 if (*pos > resource_length)
624 return TRUE;
625 }
626
627 return FALSE;
628 }
629
630 static char*
sniff_feed_or_html(SoupContentSniffer * sniffer,SoupBuffer * buffer)631 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
632 {
633 const char *resource = (const char *)buffer->data;
634 int resource_length = MIN (512, buffer->length);
635 int pos = 0;
636
637 if (resource_length < 3)
638 goto text_html;
639
640 /* Skip a leading UTF-8 BOM */
641 if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
642 pos = 3;
643
644 look_for_tag:
645 if (pos > resource_length)
646 goto text_html;
647
648 if (skip_insignificant_space (resource, &pos, resource_length))
649 goto text_html;
650
651 if (resource[pos] != '<')
652 return g_strdup ("text/html");
653
654 pos++;
655
656 if ((pos + 2) > resource_length)
657 goto text_html;
658
659 /* Skip comments. */
660 if (g_str_has_prefix (resource + pos, "!--")) {
661 pos = pos + 3;
662
663 if ((pos + 2) > resource_length)
664 goto text_html;
665
666 while (!g_str_has_prefix (resource + pos, "-->")) {
667 pos++;
668
669 if ((pos + 2) > resource_length)
670 goto text_html;
671 }
672
673 pos = pos + 3;
674
675 goto look_for_tag;
676 }
677
678 if (pos > resource_length)
679 goto text_html;
680
681 if (resource[pos] == '!') {
682 do {
683 pos++;
684
685 if (pos > resource_length)
686 goto text_html;
687 } while (resource[pos] != '>');
688
689 pos++;
690
691 goto look_for_tag;
692 } else if (resource[pos] == '?') {
693 do {
694 pos++;
695
696 if ((pos + 1) > resource_length)
697 goto text_html;
698 } while (!g_str_has_prefix (resource + pos, "?>"));
699
700 pos = pos + 2;
701
702 goto look_for_tag;
703 }
704
705 if ((pos + 3) > resource_length)
706 goto text_html;
707
708 if (g_str_has_prefix (resource + pos, "rss"))
709 return g_strdup ("application/rss+xml");
710
711 if ((pos + 4) > resource_length)
712 goto text_html;
713
714 if (g_str_has_prefix (resource + pos, "feed"))
715 return g_strdup ("application/atom+xml");
716
717 if ((pos + 7) > resource_length)
718 goto text_html;
719
720 if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
721 pos = pos + 7;
722
723 if (skip_insignificant_space (resource, &pos, resource_length))
724 goto text_html;
725
726 if ((pos + 32) > resource_length)
727 goto text_html;
728
729 if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
730 pos = pos + 32;
731
732 if (skip_insignificant_space (resource, &pos, resource_length))
733 goto text_html;
734
735 if ((pos + 55) > resource_length)
736 goto text_html;
737
738 if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
739 return g_strdup ("application/rss+xml");
740 }
741
742 if ((pos + 55) > resource_length)
743 goto text_html;
744
745 if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
746 pos = pos + 55;
747
748 if (skip_insignificant_space (resource, &pos, resource_length))
749 goto text_html;
750
751 if ((pos + 32) > resource_length)
752 goto text_html;
753
754 if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
755 return g_strdup ("application/rss+xml");
756 }
757 }
758
759 text_html:
760 return g_strdup ("text/html");
761 }
762
763 static char *
soup_content_sniffer_real_sniff(SoupContentSniffer * sniffer,SoupMessage * msg,SoupBuffer * buffer,GHashTable ** params)764 soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
765 SoupBuffer *buffer, GHashTable **params)
766 {
767 const char *content_type;
768 const char *x_content_type_options;
769 char *sniffed_type = NULL;
770 gboolean no_sniff = FALSE;
771
772 content_type = soup_message_headers_get_content_type (msg->response_headers, params);
773
774 /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
775
776 x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
777 if (!g_strcmp0 (x_content_type_options, "nosniff"))
778 no_sniff = TRUE;
779
780 /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */
781 if ((content_type == NULL) ||
782 !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
783 !g_ascii_strcasecmp (content_type, "application/unknown") ||
784 !g_ascii_strcasecmp (content_type, "*/*"))
785 return sniff_unknown (sniffer, buffer, !no_sniff);
786
787 /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
788 if (no_sniff)
789 return g_strdup (content_type);
790
791 /* 3. check-for-apache-bug */
792 if ((content_type != NULL) &&
793 (g_str_equal (content_type, "text/plain") ||
794 g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
795 g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
796 g_str_equal (content_type, "text/plain; charset=UTF-8")))
797 return sniff_text_or_binary (sniffer, buffer);
798
799 /* 4. XML types sent by the server are always used. */
800 if (g_str_has_suffix (content_type, "+xml") ||
801 !g_ascii_strcasecmp (content_type, "text/xml") ||
802 !g_ascii_strcasecmp (content_type, "application/xml"))
803 return g_strdup (content_type);
804
805 /* 5. Distinguish feed from HTML. */
806 if (!g_ascii_strcasecmp (content_type, "text/html"))
807 return sniff_feed_or_html (sniffer, buffer);
808
809 /* 6. Image types.
810 */
811 if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
812 sniffed_type = sniff_images (sniffer, buffer);
813 if (sniffed_type != NULL)
814 return sniffed_type;
815 return g_strdup (content_type);
816 }
817
818 /* 7. Audio and video types. */
819 if (!g_ascii_strncasecmp (content_type, "audio/", 6) ||
820 !g_ascii_strncasecmp (content_type, "video/", 6) ||
821 !g_ascii_strcasecmp (content_type, "application/ogg")) {
822 sniffed_type = sniff_audio_video (sniffer, buffer);
823 if (sniffed_type != NULL)
824 return sniffed_type;
825 return g_strdup (content_type);
826 }
827
828 /* If we got text/plain, use text_or_binary */
829 if (g_str_equal (content_type, "text/plain")) {
830 return sniff_text_or_binary (sniffer, buffer);
831 }
832
833 return g_strdup (content_type);
834 }
835
836 static gsize
soup_content_sniffer_real_get_buffer_size(SoupContentSniffer * sniffer)837 soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer)
838 {
839 return 512;
840 }
841
842 static void
soup_content_sniffer_got_headers_cb(SoupMessage * msg,SoupContentSniffer * sniffer)843 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
844 {
845 soup_message_set_bytes_for_sniffing (msg, soup_content_sniffer_get_buffer_size (sniffer));
846 }
847
848 static void
soup_content_sniffer_request_queued(SoupSessionFeature * feature,SoupSession * session,SoupMessage * msg)849 soup_content_sniffer_request_queued (SoupSessionFeature *feature,
850 SoupSession *session,
851 SoupMessage *msg)
852 {
853 soup_message_set_content_sniffer (msg, SOUP_CONTENT_SNIFFER (feature));
854 g_signal_connect (msg, "got-headers",
855 G_CALLBACK (soup_content_sniffer_got_headers_cb),
856 feature);
857 }
858
859 static void
soup_content_sniffer_request_unqueued(SoupSessionFeature * feature,SoupSession * session,SoupMessage * msg)860 soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
861 SoupSession *session,
862 SoupMessage *msg)
863 {
864 soup_message_set_content_sniffer (msg, NULL);
865
866 g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
867 }
868
869 static void
soup_content_sniffer_class_init(SoupContentSnifferClass * content_sniffer_class)870 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
871 {
872 content_sniffer_class->sniff = soup_content_sniffer_real_sniff;
873 content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size;
874 }
875
876 static void
soup_content_sniffer_session_feature_init(SoupSessionFeatureInterface * feature_interface,gpointer interface_data)877 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
878 gpointer interface_data)
879 {
880 feature_interface->request_queued = soup_content_sniffer_request_queued;
881 feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
882 }
883
884 /**
885 * soup_content_sniffer_new:
886 *
887 * Creates a new #SoupContentSniffer.
888 *
889 * Returns: a new #SoupContentSniffer
890 *
891 * Since: 2.28
892 **/
893 SoupContentSniffer *
soup_content_sniffer_new(void)894 soup_content_sniffer_new (void)
895 {
896 return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
897 }
898
899 /**
900 * soup_content_sniffer_sniff:
901 * @sniffer: a #SoupContentSniffer
902 * @msg: the message to sniff
903 * @buffer: a buffer containing the start of @msg's response body
904 * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
905 * location for Content-Type parameters (eg, "charset"), or %NULL
906 *
907 * Sniffs @buffer to determine its Content-Type. The result may also
908 * be influenced by the Content-Type declared in @msg's response
909 * headers.
910 *
911 * Return value: the sniffed Content-Type of @buffer; this will never be %NULL,
912 * but may be "application/octet-stream".
913 *
914 * Since: 2.28
915 */
916 char *
soup_content_sniffer_sniff(SoupContentSniffer * sniffer,SoupMessage * msg,SoupBuffer * buffer,GHashTable ** params)917 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
918 SoupMessage *msg, SoupBuffer *buffer,
919 GHashTable **params)
920 {
921 g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
922 g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
923 g_return_val_if_fail (buffer != NULL, NULL);
924
925 return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
926 }
927
928 /**
929 * soup_content_sniffer_get_buffer_size:
930 * @sniffer: a #SoupContentSniffer
931 *
932 * Gets the number of bytes @sniffer needs in order to properly sniff
933 * a buffer.
934 *
935 * Return value: the number of bytes to sniff
936 *
937 * Since: 2.28
938 */
939 gsize
soup_content_sniffer_get_buffer_size(SoupContentSniffer * sniffer)940 soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer)
941 {
942 g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0);
943
944 return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);
945 }
946