• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * soup-content-sniffer.c
4  *
5  * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
6  *
7  * This code implements the following specification:
8  *
9  *  http://mimesniff.spec.whatwg.org/ as of 11 June 2013
10  */
11 
12 #ifdef HAVE_CONFIG_H
13 #include <config.h>
14 #endif
15 
16 #include <string.h>
17 
18 #include "soup-content-sniffer.h"
19 #include "soup.h"
20 #include "soup-content-processor.h"
21 #include "soup-content-sniffer-stream.h"
22 #include "soup-message-private.h"
23 
24 /**
25  * SECTION:soup-content-sniffer
26  * @short_description: Content sniffing for SoupSession
27  *
28  * A #SoupContentSniffer tries to detect the actual content type of
29  * the files that are being downloaded by looking at some of the data
30  * before the #SoupMessage emits its #SoupMessage::got-headers signal.
31  * #SoupContentSniffer implements #SoupSessionFeature, so you can add
32  * content sniffing to a session with soup_session_add_feature() or
33  * soup_session_add_feature_by_type().
34  *
35  * Since: 2.28
36  **/
37 
38 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
39 
40 static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
41 static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);
42 
43 
G_DEFINE_TYPE_WITH_CODE(SoupContentSniffer,soup_content_sniffer,G_TYPE_OBJECT,G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,soup_content_sniffer_session_feature_init)G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,soup_content_sniffer_content_processor_init))44 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
45 			 G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
46 						soup_content_sniffer_session_feature_init)
47 			 G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
48 						soup_content_sniffer_content_processor_init))
49 
50 
51 static GInputStream *
52 soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
53 						   GInputStream *base_stream,
54 						   SoupMessage *msg,
55 						   GError **error)
56 {
57 	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
58 			     "base-stream", base_stream,
59 			     "message", msg,
60 			     "sniffer", SOUP_CONTENT_SNIFFER (processor),
61 			     NULL);
62 }
63 
64 static void
soup_content_sniffer_content_processor_init(SoupContentProcessorInterface * processor_interface,gpointer interface_data)65 soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
66                                             gpointer interface_data)
67 {
68 	soup_content_sniffer_default_content_processor_interface =
69 		g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
70 
71 	processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
72 	processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
73 }
74 
75 static void
soup_content_sniffer_init(SoupContentSniffer * content_sniffer)76 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
77 {
78 }
79 
/* One row of a signature table scanned by sniff_media(): a resource
 * matches when, for every index below @pattern_length,
 * (mask[i] & data[i]) == pattern[i].
 */
typedef struct {
	/* Bytes ANDed with the resource data before comparing. */
	const guchar *mask;
	/* Expected bytes after masking. */
	const guchar *pattern;
	/* Number of bytes of @mask / @pattern that take part in the match. */
	guint         pattern_length;
	/* MIME type reported when the row matches. */
	const char   *sniffed_type;
} SoupContentSnifferMediaPattern;
86 
87 static char*
sniff_media(SoupContentSniffer * sniffer,SoupBuffer * buffer,SoupContentSnifferMediaPattern table[],int table_length)88 sniff_media (SoupContentSniffer *sniffer,
89 	     SoupBuffer *buffer,
90 	     SoupContentSnifferMediaPattern table[],
91 	     int table_length)
92 {
93 	const guchar *resource = (const guchar *)buffer->data;
94 	guint resource_length = MIN (512, buffer->length);
95 	int i;
96 
97 	for (i = 0; i < table_length; i++) {
98 		SoupContentSnifferMediaPattern *type_row = &(table[i]);
99 		guint j;
100 
101 		if (resource_length < type_row->pattern_length)
102 			continue;
103 
104 		for (j = 0; j < type_row->pattern_length; j++) {
105 			if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
106 				break;
107 		}
108 
109 		/* This means our comparison above matched completely */
110 		if (j == type_row->pattern_length)
111 			return g_strdup (type_row->sniffed_type);
112 	}
113 
114 	return NULL;
115 }
116 
117 /* This table is based on the MIMESNIFF spec;
118  * See 6.1 Matching an image type pattern
119  */
120 static SoupContentSnifferMediaPattern image_types_table[] = {
121 
122 	/* Windows icon signature. */
123 	{ (const guchar *)"\xFF\xFF\xFF\xFF",
124 	  (const guchar *)"\x00\x00\x01\x00",
125 	  4,
126 	  "image/x-icon" },
127 
128 	/* Windows cursor signature. */
129 	{ (const guchar *)"\xFF\xFF\xFF\xFF",
130 	  (const guchar *)"\x00\x00\x02\x00",
131 	  4,
132 	  "image/x-icon" },
133 
134 	/* BMP. */
135 	{ (const guchar *)"\xFF\xFF",
136 	  (const guchar *)"BM",
137 	  2,
138 	  "image/bmp" },
139 
140 	/* GIFs. */
141 	{ (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
142 	  (const guchar *)"GIF87a",
143 	  6,
144 	  "image/gif" },
145 
146 	{ (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
147 	  (const guchar *)"GIF89a",
148 	  6,
149 	  "image/gif" },
150 
151 	/* WEBP. */
152 	{ (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
153 	  (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
154 	  14,
155 	  "image/webp" },
156 
157 	/* PNG. */
158 	{ (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
159 	  (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
160 	  8,
161 	  "image/png" },
162 
163 	/* JPEG. */
164 	{ (const guchar *)"\xFF\xFF\xFF",
165 	  (const guchar *)"\xFF\xD8\xFF",
166 	  3,
167 	  "image/jpeg" },
168 };
169 
170 static char*
sniff_images(SoupContentSniffer * sniffer,SoupBuffer * buffer)171 sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer)
172 {
173 	return sniff_media (sniffer,
174 			    buffer,
175 			    image_types_table,
176 			    G_N_ELEMENTS (image_types_table));
177 }
178 
179 /* This table is based on the MIMESNIFF spec;
180  * See 6.2 Matching an audio or video type pattern
181  */
182 static SoupContentSnifferMediaPattern audio_video_types_table[] = {
183 	{ (const guchar *)"\xFF\xFF\xFF\xFF",
184 	  (const guchar *)"\x1A\x45\xDF\xA3",
185 	  4,
186 	  "video/webm" },
187 
188 	{ (const guchar *)"\xFF\xFF\xFF\xFF",
189 	  (const guchar *)".snd",
190 	  4,
191 	  "audio/basic" },
192 
193 
194 	{ (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
195 	  (const guchar *)"FORM\0\0\0\0AIFF",
196 	  12,
197 	  "audio/aiff" },
198 
199 	{ (const guchar *)"\xFF\xFF\xFF",
200 	  (const guchar *)"ID3",
201 	  3,
202 	  "audio/mpeg" },
203 
204 	{ (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
205 	  (const guchar *)"OggS\0",
206 	  5,
207 	  "application/ogg" },
208 
209 	{ (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
210 	  (const guchar *)"MThd\x00\x00\x00\x06",
211 	  8,
212 	  "audio/midi" },
213 
214 	{ (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
215 	  (const guchar *)"RIFF\x00\x00\x00\x00AVI ",
216 	  12,
217 	  "video/avi" },
218 
219 	{ (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
220 	  (const guchar *)"RIFF\x00\x00\x00\x00WAVE",
221 	  12,
222 	  "audio/wave" },
223 };
224 
225 static gboolean
sniff_mp4(SoupContentSniffer * sniffer,SoupBuffer * buffer)226 sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer)
227 {
228 	const char *resource = (const char *)buffer->data;
229 	guint resource_length = MIN (512, buffer->length);
230 	guint32 box_size = *((guint32*)resource);
231 	guint i;
232 
233 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
234 	box_size = ((box_size >> 24) |
235 		    ((box_size << 8) & 0x00FF0000) |
236 		    ((box_size >> 8) & 0x0000FF00) |
237 		    (box_size << 24));
238 #endif
239 
240 	if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0)
241 		return FALSE;
242 
243 	if (!g_str_has_prefix (resource + 4, "ftyp"))
244 		return FALSE;
245 
246 	if (!g_str_has_prefix (resource + 8, "mp4"))
247 		return FALSE;
248 
249 	for (i = 16; i < box_size && i < resource_length; i = i + 4) {
250 		if (g_str_has_prefix (resource + i, "mp4"))
251 			return TRUE;
252 	}
253 
254 	return FALSE;
255 }
256 
257 static char*
sniff_audio_video(SoupContentSniffer * sniffer,SoupBuffer * buffer)258 sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer)
259 {
260 	char *sniffed_type;
261 
262 	sniffed_type = sniff_media (sniffer,
263 				    buffer,
264 				    audio_video_types_table,
265 				    G_N_ELEMENTS (audio_video_types_table));
266 
267 	if (sniffed_type != NULL)
268 		return sniffed_type;
269 
270 	if (sniff_mp4 (sniffer, buffer))
271 		return g_strdup ("video/mp4");
272 
273 	return NULL;
274 }
275 
/* Row type for types_table, which is based on the MIMESNIFF spec;
 * See 7.1 Identifying a resource with an unknown MIME type
 */
typedef struct {
	/* @has_ws is TRUE if @pattern contains "generic" whitespace */
	gboolean      has_ws;
	/* @has_tag_termination is TRUE if we should check for a tag-terminating
	 * byte (0x20 " " or 0x3E ">") after the pattern match.
	 */
	gboolean      has_tag_termination;
	/* Bytes ANDed with the resource data before comparing. */
	const guchar *mask;
	/* Expected bytes after masking. */
	const guchar *pattern;
	/* For rows without @has_ws: number of bytes to compare. For rows
	 * with @has_ws, sniff_unknown() treats this as the index of the last
	 * pattern byte (it loops while index <= pattern_length).
	 */
	guint         pattern_length;
	/* MIME type reported when the row matches. */
	const char   *sniffed_type;
	/* Rows with @scriptable set are skipped when sniff_unknown() is
	 * called with sniff_scriptable == FALSE.
	 */
	gboolean      scriptable;
} SoupContentSnifferPattern;
292 
293 
294 /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
295  * is allowed. Those spaces are marked with \x00 on the mask.
296  */
297 static SoupContentSnifferPattern types_table[] = {
298 	/* Scriptable types. */
299 
300 	{ TRUE, TRUE,
301 	  (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
302 	  (const guchar *)" <!DOCTYPE HTML",
303 	  14,
304 	  "text/html",
305 	  TRUE },
306 
307 	{ TRUE, TRUE,
308 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
309 	  (const guchar *)" <HTML",
310 	  5,
311 	  "text/html",
312 	  TRUE },
313 
314 	{ TRUE, TRUE,
315 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
316 	  (const guchar *)" <HEAD",
317 	  5,
318 	  "text/html",
319 	  TRUE },
320 
321 	{ TRUE, TRUE,
322 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
323 	  (const guchar *)" <SCRIPT",
324 	  7,
325 	  "text/html",
326 	  TRUE },
327 
328 	{ TRUE, TRUE,
329 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
330 	  (const guchar *)" <IFRAME",
331 	  7,
332 	  "text/html",
333 	  TRUE },
334 
335 	{ TRUE, TRUE,
336 	  (const guchar *)"\x00\xFF\xDF\xFF",
337 	  (const guchar *)" <H1",
338 	  3,
339 	  "text/html",
340 	  TRUE },
341 
342 	{ TRUE, TRUE,
343 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF",
344 	  (const guchar *)" <DIV",
345 	  4,
346 	  "text/html",
347 	  TRUE },
348 
349 	{ TRUE, TRUE,
350 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
351 	  (const guchar *)" <FONT",
352 	  5,
353 	  "text/html",
354 	  TRUE },
355 
356 	{ TRUE, TRUE,
357 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
358 	  (const guchar *)" <TABLE",
359 	  6,
360 	  "text/html",
361 	  TRUE },
362 
363 	{ TRUE, TRUE,
364 	  (const guchar *)"\x00\xFF\xDF",
365 	  (const guchar *)" <A",
366 	  2,
367 	  "text/html",
368 	  TRUE },
369 
370 	{ TRUE, TRUE,
371 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
372 	  (const guchar *)" <STYLE",
373 	  6,
374 	  "text/html",
375 	  TRUE },
376 
377 	{ TRUE, TRUE,
378 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
379 	  (const guchar *)" <TITLE",
380 	  6,
381 	  "text/html",
382 	  TRUE },
383 
384 	{ TRUE, TRUE,
385 	  (const guchar *)"\x00\xFF\xDF",
386 	  (const guchar *)" <B",
387 	  2,
388 	  "text/html",
389 	  TRUE },
390 
391 	{ TRUE, TRUE,
392 	  (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
393 	  (const guchar *)" <BODY",
394 	  5,
395 	  "text/html",
396 	  TRUE },
397 
398 	{ TRUE, TRUE,
399 	  (const guchar *)"\x00\xFF\xDF\xDF",
400 	  (const guchar *)" <BR",
401 	  3,
402 	  "text/html",
403 	  TRUE },
404 
405 	{ TRUE, TRUE,
406 	  (const guchar *)"\x00\xFF\xDF",
407 	  (const guchar *)" <P",
408 	  2,
409 	  "text/html",
410 	  TRUE },
411 
412 	{ TRUE, TRUE,
413 	  (const guchar *)"\x00\xFF\xFF\xFF\xFF",
414 	  (const guchar *)" <!--",
415 	  4,
416 	  "text/html",
417 	  TRUE },
418 
419 	{ TRUE, FALSE,
420 	  (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
421 	  (const guchar *)" <?xml",
422 	  5,
423 	  "text/xml",
424 	  TRUE },
425 
426 	{ FALSE, FALSE,
427 	  (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
428 	  (const guchar *)"%PDF-",
429 	  5,
430 	  "application/pdf",
431 	  TRUE },
432 
433 	/* Non-scriptable types. */
434 	{ FALSE, FALSE,
435 	  (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
436 	  (const guchar *)"%!PS-Adobe-",
437 	  11,
438 	  "application/postscript",
439 	  FALSE },
440 
441 	{ FALSE, FALSE, /* UTF-16BE BOM */
442 	  (const guchar *)"\xFF\xFF\x00\x00",
443 	  (const guchar *)"\xFE\xFF\x00\x00",
444 	  4,
445 	  "text/plain",
446 	  FALSE },
447 
448 	{ FALSE, FALSE, /* UTF-16LE BOM */
449 	  (const guchar *)"\xFF\xFF\x00\x00",
450 	  (const guchar *)"\xFF\xFE\x00\x00",
451 	  4,
452 	  "text/plain",
453 	  FALSE },
454 
455 	{ FALSE, FALSE, /* UTF-8 BOM */
456 	  (const guchar *)"\xFF\xFF\xFF\x00",
457 	  (const guchar *)"\xEF\xBB\xBF\x00",
458 	  4,
459 	  "text/plain",
460 	  FALSE },
461 };
462 
/* Whether a given byte looks like it might be part of binary content.
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed
 *
 * Indexed by byte value (0x00-0xFF); a non-zero entry marks the byte as
 * binary. Only control bytes are flagged: 0x00-0x08, 0x0B, 0x0E-0x1A and
 * 0x1C-0x1F. Tab, LF, FF, CR, ESC and everything from 0x20 upwards are
 * treated as text.
 */
static char byte_looks_binary[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  /* 0x00 - 0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  /* 0x10 - 0x1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x20 - 0x2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x30 - 0x3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 - 0x4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x50 - 0x5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 - 0x6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x70 - 0x7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 - 0x8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 - 0x9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0 - 0xAF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC0 - 0xCF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD0 - 0xDF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xE0 - 0xEF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF0 - 0xFF */
};
485 
486 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
487 static char*
sniff_unknown(SoupContentSniffer * sniffer,SoupBuffer * buffer,gboolean sniff_scriptable)488 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
489 	       gboolean sniff_scriptable)
490 {
491 	char *sniffed_type = NULL;
492 	const guchar *resource = (const guchar *)buffer->data;
493 	guint resource_length = MIN (512, buffer->length);
494 	guint i;
495 
496 	for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
497 		SoupContentSnifferPattern *type_row = &(types_table[i]);
498 
499 		if (!sniff_scriptable && type_row->scriptable)
500 			continue;
501 
502 		if (type_row->has_ws) {
503 			guint index_stream = 0;
504 			guint index_pattern = 0;
505 			gboolean skip_row = FALSE;
506 
507 			while ((index_stream < resource_length) &&
508 			       (index_pattern <= type_row->pattern_length)) {
509 				/* Skip insignificant white space ("WS" in the spec) */
510 				if (type_row->pattern[index_pattern] == ' ') {
511 					if (resource[index_stream] == '\x09' ||
512 					    resource[index_stream] == '\x0a' ||
513 					    resource[index_stream] == '\x0c' ||
514 					    resource[index_stream] == '\x0d' ||
515 					    resource[index_stream] == '\x20')
516 						index_stream++;
517 					else
518 						index_pattern++;
519 				} else {
520 					if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
521 						skip_row = TRUE;
522 						break;
523 					}
524 					index_pattern++;
525 					index_stream++;
526 				}
527 			}
528 
529 			if (skip_row)
530 				continue;
531 
532 			if (index_pattern > type_row->pattern_length) {
533 				if (type_row->has_tag_termination &&
534 				    resource[index_stream] != '\x20' &&
535 				    resource[index_stream] != '\x3E')
536 					continue;
537 
538 				return g_strdup (type_row->sniffed_type);
539 			}
540 		} else {
541 			guint j;
542 
543 			if (resource_length < type_row->pattern_length)
544 				continue;
545 
546 			for (j = 0; j < type_row->pattern_length; j++) {
547 				if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
548 					break;
549 			}
550 
551 			/* This means our comparison above matched completely */
552 			if (j == type_row->pattern_length)
553 				return g_strdup (type_row->sniffed_type);
554 		}
555 	}
556 
557 	sniffed_type = sniff_images (sniffer, buffer);
558 
559 	if (sniffed_type != NULL)
560 		return sniffed_type;
561 
562 	sniffed_type = sniff_audio_video (sniffer, buffer);
563 
564 	if (sniffed_type != NULL)
565 		return sniffed_type;
566 
567 	for (i = 0; i < resource_length; i++) {
568 		if (byte_looks_binary[resource[i]])
569 			return g_strdup ("application/octet-stream");
570 	}
571 
572 	return g_strdup ("text/plain");
573 }
574 
575 /* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
576 static char*
sniff_text_or_binary(SoupContentSniffer * sniffer,SoupBuffer * buffer)577 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
578 {
579 	const guchar *resource = (const guchar *)buffer->data;
580 	int resource_length = MIN (512, buffer->length);
581 	gboolean looks_binary = FALSE;
582 	int i;
583 
584 	/* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
585 	if (resource_length >= 2) {
586 		if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
587 		    (resource[0] == 0xFF && resource[1] == 0xFE))
588 			return g_strdup ("text/plain");
589 	}
590 
591 	/* 3. UTF-8 BOM. */
592 	if (resource_length >= 3) {
593 		if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
594 			return g_strdup ("text/plain");
595 	}
596 
597 	/* 4. Look to see if any of the first n bytes looks binary */
598 	for (i = 0; i < resource_length; i++) {
599 		if (byte_looks_binary[resource[i]]) {
600 			looks_binary = TRUE;
601 			break;
602 		}
603 	}
604 
605 	if (!looks_binary)
606 		return g_strdup ("text/plain");
607 
608 	/* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
609 	 * TODO: sniff-scriptable needs to be unset.
610 	 */
611 	return sniff_unknown (sniffer, buffer, TRUE);
612 }
613 
/* Advances *@pos past any run of tab, space, LF or CR bytes in @resource.
 * No byte at or beyond @resource_length is ever read.
 *
 * Returns: TRUE when the end of the data was reached (nothing but
 * insignificant space remained, or *@pos already was at/after the end);
 * FALSE when *@pos now indexes a byte that is not insignificant space.
 */
static gboolean
skip_insignificant_space (const char *resource, int *pos, int resource_length)
{
	while (*pos < resource_length) {
		char c = resource[*pos];

		if (c != '\x09' && c != '\x20' && c != '\x0A' && c != '\x0D')
			return FALSE;

		*pos = *pos + 1;
	}

	return TRUE;
}
629 
630 static char*
sniff_feed_or_html(SoupContentSniffer * sniffer,SoupBuffer * buffer)631 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
632 {
633 	const char *resource = (const char *)buffer->data;
634 	int resource_length = MIN (512, buffer->length);
635 	int pos = 0;
636 
637 	if (resource_length < 3)
638 		goto text_html;
639 
640 	/* Skip a leading UTF-8 BOM */
641 	if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
642 		pos = 3;
643 
644  look_for_tag:
645 	if (pos > resource_length)
646 		goto text_html;
647 
648 	if (skip_insignificant_space (resource, &pos, resource_length))
649 		goto text_html;
650 
651 	if (resource[pos] != '<')
652 		return g_strdup ("text/html");
653 
654 	pos++;
655 
656 	if ((pos + 2) > resource_length)
657 		goto text_html;
658 
659 	/* Skip comments. */
660 	if (g_str_has_prefix (resource + pos, "!--")) {
661 		pos = pos + 3;
662 
663 		if ((pos + 2) > resource_length)
664 			goto text_html;
665 
666 		while (!g_str_has_prefix (resource + pos, "-->")) {
667 			pos++;
668 
669 			if ((pos + 2) > resource_length)
670 				goto text_html;
671 		}
672 
673 		pos = pos + 3;
674 
675 		goto look_for_tag;
676 	}
677 
678 	if (pos > resource_length)
679 		goto text_html;
680 
681 	if (resource[pos] == '!') {
682 		do {
683 			pos++;
684 
685 			if (pos > resource_length)
686 				goto text_html;
687 		} while (resource[pos] != '>');
688 
689 		pos++;
690 
691 		goto look_for_tag;
692 	} else if (resource[pos] == '?') {
693 		do {
694 			pos++;
695 
696 			if ((pos + 1) > resource_length)
697 				goto text_html;
698 		} while (!g_str_has_prefix (resource + pos, "?>"));
699 
700 		pos = pos + 2;
701 
702 		goto look_for_tag;
703 	}
704 
705 	if ((pos + 3) > resource_length)
706 		goto text_html;
707 
708 	if (g_str_has_prefix (resource + pos, "rss"))
709 		return g_strdup ("application/rss+xml");
710 
711 	if ((pos + 4) > resource_length)
712 		goto text_html;
713 
714 	if (g_str_has_prefix (resource + pos, "feed"))
715 		return g_strdup ("application/atom+xml");
716 
717 	if ((pos + 7) > resource_length)
718 		goto text_html;
719 
720 	if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
721 		pos = pos + 7;
722 
723 		if (skip_insignificant_space (resource, &pos, resource_length))
724 			goto text_html;
725 
726 		if ((pos + 32) > resource_length)
727 			goto text_html;
728 
729 		if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
730 			pos = pos + 32;
731 
732 			if (skip_insignificant_space (resource, &pos, resource_length))
733 				goto text_html;
734 
735 			if ((pos + 55) > resource_length)
736 				goto text_html;
737 
738 			if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
739 				return g_strdup ("application/rss+xml");
740 		}
741 
742 		if ((pos + 55) > resource_length)
743 			goto text_html;
744 
745 		if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
746 			pos = pos + 55;
747 
748 			if (skip_insignificant_space (resource, &pos, resource_length))
749 				goto text_html;
750 
751 			if ((pos + 32) > resource_length)
752 				goto text_html;
753 
754 			if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
755 				return g_strdup ("application/rss+xml");
756 		}
757 	}
758 
759  text_html:
760 	return g_strdup ("text/html");
761 }
762 
763 static char *
soup_content_sniffer_real_sniff(SoupContentSniffer * sniffer,SoupMessage * msg,SoupBuffer * buffer,GHashTable ** params)764 soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
765 				 SoupBuffer *buffer, GHashTable **params)
766 {
767 	const char *content_type;
768 	const char *x_content_type_options;
769 	char *sniffed_type = NULL;
770 	gboolean no_sniff = FALSE;
771 
772 	content_type = soup_message_headers_get_content_type (msg->response_headers, params);
773 
774 	/* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
775 
776 	x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
777 	if (!g_strcmp0 (x_content_type_options, "nosniff"))
778 		no_sniff = TRUE;
779 
780 	/* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */
781 	if ((content_type == NULL) ||
782 	    !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
783 	    !g_ascii_strcasecmp (content_type, "application/unknown") ||
784 	    !g_ascii_strcasecmp (content_type, "*/*"))
785 		return sniff_unknown (sniffer, buffer, !no_sniff);
786 
787 	/* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
788 	if (no_sniff)
789 		return g_strdup (content_type);
790 
791 	/* 3. check-for-apache-bug */
792 	if ((content_type != NULL) &&
793 	    (g_str_equal (content_type, "text/plain") ||
794 	     g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
795 	     g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
796 	     g_str_equal (content_type, "text/plain; charset=UTF-8")))
797 		return sniff_text_or_binary (sniffer, buffer);
798 
799 	/* 4. XML types sent by the server are always used. */
800 	if (g_str_has_suffix (content_type, "+xml") ||
801 	    !g_ascii_strcasecmp (content_type, "text/xml") ||
802 	    !g_ascii_strcasecmp (content_type, "application/xml"))
803 		return g_strdup (content_type);
804 
805 	/* 5. Distinguish feed from HTML. */
806 	if (!g_ascii_strcasecmp (content_type, "text/html"))
807 		return sniff_feed_or_html (sniffer, buffer);
808 
809 	/* 6. Image types.
810 	 */
811 	if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
812 		sniffed_type = sniff_images (sniffer, buffer);
813 		if (sniffed_type != NULL)
814 			return sniffed_type;
815 		return g_strdup (content_type);
816 	}
817 
818 	/* 7. Audio and video types. */
819 	if (!g_ascii_strncasecmp (content_type, "audio/", 6) ||
820 	    !g_ascii_strncasecmp (content_type, "video/", 6) ||
821 	    !g_ascii_strcasecmp (content_type, "application/ogg")) {
822 	        sniffed_type = sniff_audio_video (sniffer, buffer);
823 	        if (sniffed_type != NULL)
824 		        return sniffed_type;
825 		return g_strdup (content_type);
826         }
827 
828 	/* If we got text/plain, use text_or_binary */
829 	if (g_str_equal (content_type, "text/plain")) {
830 		return sniff_text_or_binary (sniffer, buffer);
831 	}
832 
833 	return g_strdup (content_type);
834 }
835 
836 static gsize
soup_content_sniffer_real_get_buffer_size(SoupContentSniffer * sniffer)837 soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer)
838 {
839 	return 512;
840 }
841 
842 static void
soup_content_sniffer_got_headers_cb(SoupMessage * msg,SoupContentSniffer * sniffer)843 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
844 {
845 	soup_message_set_bytes_for_sniffing (msg, soup_content_sniffer_get_buffer_size (sniffer));
846 }
847 
848 static void
soup_content_sniffer_request_queued(SoupSessionFeature * feature,SoupSession * session,SoupMessage * msg)849 soup_content_sniffer_request_queued (SoupSessionFeature *feature,
850 				     SoupSession *session,
851 				     SoupMessage *msg)
852 {
853 	soup_message_set_content_sniffer (msg, SOUP_CONTENT_SNIFFER (feature));
854 	g_signal_connect (msg, "got-headers",
855 			  G_CALLBACK (soup_content_sniffer_got_headers_cb),
856 			  feature);
857 }
858 
859 static void
soup_content_sniffer_request_unqueued(SoupSessionFeature * feature,SoupSession * session,SoupMessage * msg)860 soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
861 				       SoupSession *session,
862 				       SoupMessage *msg)
863 {
864 	soup_message_set_content_sniffer (msg, NULL);
865 
866 	g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
867 }
868 
869 static void
soup_content_sniffer_class_init(SoupContentSnifferClass * content_sniffer_class)870 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
871 {
872 	content_sniffer_class->sniff = soup_content_sniffer_real_sniff;
873 	content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size;
874 }
875 
876 static void
soup_content_sniffer_session_feature_init(SoupSessionFeatureInterface * feature_interface,gpointer interface_data)877 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
878 					   gpointer interface_data)
879 {
880 	feature_interface->request_queued = soup_content_sniffer_request_queued;
881 	feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
882 }
883 
884 /**
885  * soup_content_sniffer_new:
886  *
887  * Creates a new #SoupContentSniffer.
888  *
889  * Returns: a new #SoupContentSniffer
890  *
891  * Since: 2.28
892  **/
893 SoupContentSniffer *
soup_content_sniffer_new(void)894 soup_content_sniffer_new (void)
895 {
896 	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
897 }
898 
899 /**
900  * soup_content_sniffer_sniff:
901  * @sniffer: a #SoupContentSniffer
902  * @msg: the message to sniff
903  * @buffer: a buffer containing the start of @msg's response body
904  * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
905  *   location for Content-Type parameters (eg, "charset"), or %NULL
906  *
907  * Sniffs @buffer to determine its Content-Type. The result may also
908  * be influenced by the Content-Type declared in @msg's response
909  * headers.
910  *
911  * Return value: the sniffed Content-Type of @buffer; this will never be %NULL,
912  *   but may be "application/octet-stream".
913  *
914  * Since: 2.28
915  */
916 char *
soup_content_sniffer_sniff(SoupContentSniffer * sniffer,SoupMessage * msg,SoupBuffer * buffer,GHashTable ** params)917 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
918 			    SoupMessage *msg, SoupBuffer *buffer,
919 			    GHashTable **params)
920 {
921 	g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
922 	g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
923 	g_return_val_if_fail (buffer != NULL, NULL);
924 
925 	return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
926 }
927 
928 /**
929  * soup_content_sniffer_get_buffer_size:
930  * @sniffer: a #SoupContentSniffer
931  *
932  * Gets the number of bytes @sniffer needs in order to properly sniff
933  * a buffer.
934  *
935  * Return value: the number of bytes to sniff
936  *
937  * Since: 2.28
938  */
939 gsize
soup_content_sniffer_get_buffer_size(SoupContentSniffer * sniffer)940 soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer)
941 {
942 	g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0);
943 
944 	return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);
945 }
946