1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues. Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave.
8 //
9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML
11 // * Firefox 2: Render as HTML
12 // * Safari 3: Render as HTML
13 // * Opera 9: Render as HTML
14 //
15 // Here the choice seems clear:
16 // => Chrome: Render as HTML
17 //
18 // HTML payload, Content-Type: "text/plain":
19 // * IE 7: Render as HTML
20 // * Firefox 2: Render as text
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22 // has an HTML extension)
23 // * Opera 9: Render as text
24 //
25 // Here we choose to follow the majority (and break some compatibility with IE).
26 // Many folks dislike IE's behavior here.
27 // => Chrome: Render as text
28 // We generalize this as follows. If the Content-Type header is text/plain
29 // we won't detect dangerous mime types (those that can execute script).
30 //
31 // HTML payload, Content-Type: "application/octet-stream":
32 // * IE 7: Render as HTML
33 // * Firefox 2: Download as application/octet-stream
34 // * Safari 3: Render as HTML
35 // * Opera 9: Render as HTML
36 //
37 // We follow Firefox.
38 // => Chrome: Download as application/octet-stream
39 // One factor in this decision is that IIS 4 and 5 will send
40 // application/octet-stream for .xhtml files (because they don't recognize
41 // the extension). We did some experiments and it looks like this doesn't occur
42 // very often on the web. We choose the more secure option.
43 //
44 // GIF payload, no Content-Type header:
45 // * IE 7: Render as GIF
46 // * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
49 // * Opera 9: Render as GIF
50 //
51 // The choice is clear.
52 // => Chrome: Render as GIF
53 // Once we decide to render HTML without a Content-Type header, there isn't much
54 // reason not to render GIFs.
55 //
56 // GIF payload, Content-Type: "text/plain":
57 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
62 // * Opera 9: Render as GIF
63 //
64 // Displaying as text/plain makes little sense as the content will look like
65 // gibberish. Here, we could change our minds and download.
66 // => Chrome: Render as GIF
67 //
68 // GIF payload, Content-Type: "application/octet-stream":
69 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
74 // * Opera 9: Render as GIF
75 //
76 // We used to render as GIF here, but the problem is that some sites want to
77 // trigger downloads by sending application/octet-stream (even though they
78 // should be sending Content-Disposition: attachment). Although it is safe
79 // to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream
82 //
83 // Note that our definition of HTML payload is much stricter than IE's
84 // definition and roughly the same as Firefox's definition.
85
86 #include <stdint.h>
87 #include <string>
88
89 #include "net/base/mime_sniffer.h"
90
91 #include "base/check_op.h"
92 #include "base/containers/span.h"
93 #include "base/notreached.h"
94 #include "base/strings/string_util.h"
95 #include "build/build_config.h"
96 #include "url/gurl.h"
97
98 namespace net {
99
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
//
// SniffForMagicNumbers() and SniffCRX() truncate the content to this many
// bytes before matching, and MatchMagicNumber() DCHECKs that no entry it is
// asked to match is longer than this.
static const size_t kBytesRequiredForMagic = 42;
103
// One entry of a signature table: a byte pattern plus the value reported when
// the start of the content matches that pattern. Instances are built with the
// MAGIC_NUMBER / MAGIC_MASK / MAGIC_STRING macros, which rely on this exact
// field order.
struct MagicNumber {
  // Value assigned to the caller's result string when this entry matches.
  const char* const mime_type;
  // Pattern compared against the first |magic.length()| bytes of the content.
  // For non-string entries a '.' byte in the pattern matches any content byte.
  const std::string_view magic;
  // True if |magic| is matched as an ASCII case-insensitive prefix; false if
  // it is matched byte by byte (optionally through |mask|).
  bool is_string;
  const char* const mask;  // if set, must have same length as |magic|
};
110
// Builds a MagicNumber entry that is matched byte by byte, without a mask.
// |magic| must be a string literal: its length is computed with sizeof (minus
// the terminating NUL), so embedded '\0' bytes are kept as part of the
// pattern.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), std::string_view((magic), sizeof(magic) - 1), false, nullptr }
113
// Compile-time guard behind verified_sizeof(). Instantiating this template
// with two different sizes is a build error; when the sizes agree, SIZES is
// that common size.
template <int MagicSize, int MaskSize>
class VerifySizes {
 public:
  static constexpr int SIZES = MagicSize;

 private:
  static_assert(MagicSize == MaskSize, "sizes must be equal");
};
121
// Evaluates to sizeof(magic), and fails to compile (via VerifySizes) unless
// sizeof(magic) == sizeof(mask).
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
124
// Builds a MagicNumber entry that is matched through a bit mask: each content
// byte is ANDed with the corresponding |mask| byte before being compared with
// |magic|. |magic| and |mask| must be string literals of the same size, which
// verified_sizeof() enforces at compile time.
#define MAGIC_MASK(mime_type, magic, mask)                                    \
  {                                                                           \
    (mime_type), std::string_view((magic), verified_sizeof(magic, mask) - 1), \
        false, (mask)                                                         \
  }
130
// Magic strings are case insensitive and must not include '\0' characters
// (MatchMagicNumber() DCHECKs this). They are matched as an ASCII
// case-insensitive prefix of the content, so '.' has no wildcard meaning here.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), std::string_view((magic), sizeof(magic) - 1), true, nullptr }
134
// The main signature table, used by SniffForMagicNumbers() and as the fallback
// table of SniffMimeTypeFromLocalData(). Entries are tried in order and the
// first match wins. A '.' inside a pattern (e.g. "RIFF....WEBPVP") matches any
// single byte.
static const MagicNumber kMagicNumbers[] = {
    // Source: HTML 5 specification
    MAGIC_NUMBER("application/pdf", "%PDF-"),
    MAGIC_NUMBER("application/postscript", "%!PS-Adobe-"),
    MAGIC_NUMBER("image/gif", "GIF87a"),
    MAGIC_NUMBER("image/gif", "GIF89a"),
    MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A"),
    MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF"),
    MAGIC_NUMBER("image/bmp", "BM"),
    // Source: Mozilla
    MAGIC_NUMBER("text/plain", "#!"),  // Script
    MAGIC_NUMBER("text/plain", "%!"),  // Script, similar to PS
    MAGIC_NUMBER("text/plain", "From"),
    MAGIC_NUMBER("text/plain", ">From"),
    // Chrome specific
    MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08"),
    MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46"),
    MAGIC_NUMBER("video/x-ms-asf",
        "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"),
    MAGIC_NUMBER("image/tiff", "I I"),
    MAGIC_NUMBER("image/tiff", "II*"),
    MAGIC_NUMBER("image/tiff", "MM\x00*"),
    MAGIC_NUMBER("audio/mpeg", "ID3"),
    MAGIC_NUMBER("image/webp", "RIFF....WEBPVP"),
    MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3"),
    MAGIC_NUMBER("application/zip", "PK\x03\x04"),
    MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00"),
    MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A"),
    MAGIC_NUMBER("application/octet-stream", "MZ"),  // EXE
    // Sniffing for Flash:
    //
    //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
    //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV"),
    //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
    //
    // Including these magic number for Flash is a trade off.
    //
    // Pros:
    //   * Flash is an important and popular file format
    //
    // Cons:
    //   * These patterns are fairly weak
    //   * If we mistakenly decide something is Flash, we will execute it
    //     in the origin of an unsuspecting site.  This could be a security
    //     vulnerability if the site allows users to upload content.
    //
    // On balance, we do not include these patterns.
};
183
// The number of content bytes we need to use all our Microsoft Office magic
// numbers. SniffForOfficeDocs() and SniffForInvalidOfficeDocs() truncate the
// content to this length before consulting kOfficeMagicNumbers.
static const size_t kBytesRequiredForOfficeMagic = 8;
187
// Signatures of the two Office container formats. The |mime_type| field of
// these entries is not a MIME type: it is an internal tag ("CFB" or "OOXML")
// that SniffForOfficeDocs() combines with the URL extension to pick the final
// MIME type. The "OOXML" signature is the same byte sequence as the
// "application/zip" entry in kMagicNumbers.
static const MagicNumber kOfficeMagicNumbers[] = {
    MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"),
    MAGIC_NUMBER("OOXML", "PK\x03\x04"),
};
192
// Kind of Office document implied by a URL's file extension (see
// kOfficeExtensionTypes).
enum OfficeDocType {
  DOC_TYPE_WORD,
  DOC_TYPE_EXCEL,
  DOC_TYPE_POWERPOINT,
  // No known Office extension matched.
  DOC_TYPE_NONE
};
199
// Maps a file extension to the kind of Office document it denotes.
struct OfficeExtensionType {
  // Document kind reported when the URL path ends with |extension|.
  OfficeDocType doc_type;
  // Extension including the leading dot, e.g. ".doc".
  const std::string_view extension;
};
204
// Builds an OfficeExtensionType entry. |extension| must be a string literal,
// because its length is computed with sizeof (minus the terminating NUL).
#define OFFICE_EXTENSION(type, extension) \
  { (type), std::string_view((extension), sizeof(extension) - 1) }
207
// Office file extensions recognized by SniffForOfficeDocs(), which compares
// them against the end of the URL path ignoring ASCII case and uses the first
// entry that matches.
static const OfficeExtensionType kOfficeExtensionTypes[] = {
    OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc"),
    OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls"),
    OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt"),
    OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx"),
    OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx"),
    OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx"),
};
216
// Additional signatures that are consulted only by
// SniffMimeTypeFromLocalData(), before it falls back to kMagicNumbers. Note
// that this table does contain the Flash signatures that kMagicNumbers
// deliberately leaves out. As in kMagicNumbers, '.' matches any single byte
// and the first matching entry wins.
static const MagicNumber kExtraMagicNumbers[] = {
    MAGIC_NUMBER("image/x-xbitmap", "#define"),
    MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00"),
    MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt "),
    MAGIC_NUMBER("video/avi", "RIFF....AVI LIST"),
    MAGIC_NUMBER("audio/ogg", "OggS\0"),
    MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0"),
    MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0"),
    MAGIC_NUMBER("video/3gpp", "....ftyp3g"),
    MAGIC_NUMBER("video/3gpp", "....ftypavcl"),
    MAGIC_NUMBER("video/mp4", "....ftyp"),
    MAGIC_NUMBER("video/quicktime", "....moov"),
    MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
    MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
    MAGIC_NUMBER("video/x-flv", "FLV"),
    MAGIC_NUMBER("audio/x-flac", "fLaC"),
    // Per https://tools.ietf.org/html/rfc3267#section-8.1
    MAGIC_NUMBER("audio/amr", "#!AMR\n"),

    // RAW image types.
    MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR"),
    MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR"),
    MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM"),
    MAGIC_NUMBER("image/x-olympus-orf", "MMOR"),  // big-endian
    MAGIC_NUMBER("image/x-olympus-orf", "IIRO"),  // little-endian
    MAGIC_NUMBER("image/x-olympus-orf", "IIRS"),  // little-endian
    MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW "),
    MAGIC_NUMBER("image/x-panasonic-raw",
                 "IIU\x00\x08\x00\x00\x00"),  // Panasonic .raw
    MAGIC_NUMBER("image/x-panasonic-raw",
                 "IIU\x00\x18\x00\x00\x00"),  // Panasonic .rw2
    MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw"),
    MAGIC_NUMBER("image/x-x3f", "FOVb"),
};
251
// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" entry that matches content starting
// with '<' immediately followed by |tag|. |tag| must be a string literal,
// since it is joined to "<" by literal concatenation.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
258
// Prefixes that SniffForHTML() looks for after skipping leading ASCII
// whitespace. Matching is prefix-only: nothing is required after the tag name,
// so for example the "<a" entry also matches any content that merely starts
// with "<a". The first matching entry wins.
static const MagicNumber kSniffableTags[] = {
    // XML processing directive.  Although this is not an HTML mime type, we
    // sniff for this in the HTML phase because text/xml is just as powerful as
    // HTML and we want to leverage our white space skipping technology.
    MAGIC_NUMBER("text/xml", "<?xml"),  // Mozilla
    // DOCTYPEs
    MAGIC_HTML_TAG("!DOCTYPE html"),  // HTML5 spec
    // Sniffable tags, ordered by how often they occur in sniffable documents.
    MAGIC_HTML_TAG("script"),  // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("html"),    // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("!--"),
    MAGIC_HTML_TAG("head"),    // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("iframe"),  // Mozilla
    MAGIC_HTML_TAG("h1"),      // Mozilla
    MAGIC_HTML_TAG("div"),     // Mozilla
    MAGIC_HTML_TAG("font"),    // Mozilla
    MAGIC_HTML_TAG("table"),   // Mozilla
    MAGIC_HTML_TAG("a"),       // Mozilla
    MAGIC_HTML_TAG("style"),   // Mozilla
    MAGIC_HTML_TAG("title"),   // Mozilla
    MAGIC_HTML_TAG("b"),       // Mozilla
    MAGIC_HTML_TAG("body"),    // Mozilla
    MAGIC_HTML_TAG("br"),
    MAGIC_HTML_TAG("p"),       // Mozilla
};
284
285 // Compare content header to a magic number where magic_entry can contain '.'
286 // for single character of anything, allowing some bytes to be skipped.
MagicCmp(std::string_view content,std::string_view magic_entry)287 static bool MagicCmp(std::string_view content, std::string_view magic_entry) {
288 DCHECK_GE(content.length(), magic_entry.length());
289
290 for (size_t i = 0; i < magic_entry.length(); ++i) {
291 if (magic_entry[i] != '.' && magic_entry[i] != content[i])
292 return false;
293 }
294 return true;
295 }
296
297 // Like MagicCmp() except that it ANDs each byte with a mask before
298 // the comparison, because there are some bits we don't care about.
MagicMaskCmp(std::string_view content,std::string_view magic_entry,std::string_view magic_mask)299 static bool MagicMaskCmp(std::string_view content,
300 std::string_view magic_entry,
301 std::string_view magic_mask) {
302 DCHECK_GE(content.length(), magic_entry.length());
303
304 for (size_t i = 0; i < magic_entry.length(); ++i) {
305 if (magic_entry[i] != '.' && magic_entry[i] != (magic_mask[i] & content[i]))
306 return false;
307 }
308 return true;
309 }
310
MatchMagicNumber(std::string_view content,const MagicNumber & magic_entry,std::string * result)311 static bool MatchMagicNumber(std::string_view content,
312 const MagicNumber& magic_entry,
313 std::string* result) {
314 // Keep kBytesRequiredForMagic honest.
315 DCHECK_LE(magic_entry.magic.length(), kBytesRequiredForMagic);
316
317 bool match = false;
318 if (content.length() >= magic_entry.magic.length()) {
319 if (magic_entry.is_string) {
320 // Consistency check - string entries should have no embedded nulls.
321 DCHECK_EQ(std::string_view::npos, magic_entry.magic.find('\0'));
322
323 // Do a case-insensitive prefix comparison.
324 match = base::StartsWith(content, magic_entry.magic,
325 base::CompareCase::INSENSITIVE_ASCII);
326 } else if (!magic_entry.mask) {
327 match = MagicCmp(content, magic_entry.magic);
328 } else {
329 std::string_view magic_mask(magic_entry.mask, magic_entry.magic.length());
330 match = MagicMaskCmp(content, magic_entry.magic, magic_mask);
331 }
332 }
333
334 if (match) {
335 result->assign(magic_entry.mime_type);
336 return true;
337 }
338 return false;
339 }
340
CheckForMagicNumbers(std::string_view content,base::span<const MagicNumber> magic_numbers,std::string * result)341 static bool CheckForMagicNumbers(std::string_view content,
342 base::span<const MagicNumber> magic_numbers,
343 std::string* result) {
344 for (const MagicNumber& magic : magic_numbers) {
345 if (MatchMagicNumber(content, magic, result))
346 return true;
347 }
348 return false;
349 }
350
351 // Truncates |string_piece| to length |max_size| and returns true if
352 // |string_piece| is now exactly |max_size|.
TruncateStringPiece(const size_t max_size,std::string_view * string_piece)353 static bool TruncateStringPiece(const size_t max_size,
354 std::string_view* string_piece) {
355 // Keep kMaxBytesToSniff honest.
356 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
357
358 *string_piece = string_piece->substr(0, max_size);
359 return string_piece->length() == max_size;
360 }
361
362 // Returns true and sets result if the content appears to be HTML.
363 // Clears have_enough_content if more data could possibly change the result.
SniffForHTML(std::string_view content,bool * have_enough_content,std::string * result)364 static bool SniffForHTML(std::string_view content,
365 bool* have_enough_content,
366 std::string* result) {
367 // For HTML, we are willing to consider up to 512 bytes. This may be overly
368 // conservative as IE only considers 256.
369 *have_enough_content &= TruncateStringPiece(512, &content);
370
371 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
372 // but with some modifications to better match the HTML5 spec.
373 std::string_view trimmed =
374 base::TrimWhitespaceASCII(content, base::TRIM_LEADING);
375
376 // |trimmed| now starts at first non-whitespace character (or is empty).
377 return CheckForMagicNumbers(trimmed, kSniffableTags, result);
378 }
379
380 // Returns true and sets result if the content matches any of kMagicNumbers.
381 // Clears have_enough_content if more data could possibly change the result.
SniffForMagicNumbers(std::string_view content,bool * have_enough_content,std::string * result)382 static bool SniffForMagicNumbers(std::string_view content,
383 bool* have_enough_content,
384 std::string* result) {
385 *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
386
387 // Check our big table of Magic Numbers
388 return CheckForMagicNumbers(content, kMagicNumbers, result);
389 }
390
391 // Returns true and sets result if the content matches any of
392 // kOfficeMagicNumbers, and the URL has the proper extension.
393 // Clears |have_enough_content| if more data could possibly change the result.
SniffForOfficeDocs(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)394 static bool SniffForOfficeDocs(std::string_view content,
395 const GURL& url,
396 bool* have_enough_content,
397 std::string* result) {
398 *have_enough_content &=
399 TruncateStringPiece(kBytesRequiredForOfficeMagic, &content);
400
401 // Check our table of magic numbers for Office file types.
402 std::string office_version;
403 if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version))
404 return false;
405
406 OfficeDocType type = DOC_TYPE_NONE;
407 std::string_view url_path = url.path_piece();
408 for (const auto& office_extension : kOfficeExtensionTypes) {
409 if (base::EndsWith(url_path, office_extension.extension,
410 base::CompareCase::INSENSITIVE_ASCII)) {
411 type = office_extension.doc_type;
412 break;
413 }
414 }
415
416 if (type == DOC_TYPE_NONE)
417 return false;
418
419 if (office_version == "CFB") {
420 switch (type) {
421 case DOC_TYPE_WORD:
422 *result = "application/msword";
423 return true;
424 case DOC_TYPE_EXCEL:
425 *result = "application/vnd.ms-excel";
426 return true;
427 case DOC_TYPE_POWERPOINT:
428 *result = "application/vnd.ms-powerpoint";
429 return true;
430 case DOC_TYPE_NONE:
431 NOTREACHED();
432 }
433 } else if (office_version == "OOXML") {
434 switch (type) {
435 case DOC_TYPE_WORD:
436 *result = "application/vnd.openxmlformats-officedocument."
437 "wordprocessingml.document";
438 return true;
439 case DOC_TYPE_EXCEL:
440 *result = "application/vnd.openxmlformats-officedocument."
441 "spreadsheetml.sheet";
442 return true;
443 case DOC_TYPE_POWERPOINT:
444 *result = "application/vnd.openxmlformats-officedocument."
445 "presentationml.presentation";
446 return true;
447 case DOC_TYPE_NONE:
448 NOTREACHED();
449 }
450 }
451
452 NOTREACHED();
453 }
454
// Returns true if |type_hint| is exactly (case-sensitively) one of the
// Microsoft Office MIME types, including the common non-standard spellings.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
      "application/msword",
      "application/vnd.ms-excel",
      "application/vnd.ms-powerpoint",
      "application/vnd.openxmlformats-officedocument."
      "wordprocessingml.document",
      "application/vnd.openxmlformats-officedocument."
      "spreadsheetml.sheet",
      "application/vnd.openxmlformats-officedocument."
      "presentationml.presentation",
      "application/vnd.ms-excel.sheet.macroenabled.12",
      "application/vnd.ms-word.document.macroenabled.12",
      "application/vnd.ms-powerpoint.presentation."
      "macroenabled.12",
      "application/mspowerpoint",
      "application/msexcel",
      "application/vnd.ms-word",
      "application/vnd.ms-word.document.12",
      "application/vnd.msword",
  };
  for (const char* const office_mime_type : kOfficeMimeTypes) {
    if (type_hint == office_mime_type) {
      return true;
    }
  }
  return false;
}
475
476 // This function checks for files that have a Microsoft Office MIME type
477 // set, but are not actually Office files.
478 //
479 // If this is not actually an Office file, |*result| is set to
480 // "application/octet-stream", otherwise it is not modified.
481 //
482 // Returns false if additional data is required to determine the file type, or
483 // true if there is enough data to make a decision.
SniffForInvalidOfficeDocs(std::string_view content,const GURL & url,std::string * result)484 static bool SniffForInvalidOfficeDocs(std::string_view content,
485 const GURL& url,
486 std::string* result) {
487 if (!TruncateStringPiece(kBytesRequiredForOfficeMagic, &content))
488 return false;
489
490 // Check our table of magic numbers for Office file types. If it does not
491 // match one, the MIME type was invalid. Set it instead to a safe value.
492 std::string office_version;
493 if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version)) {
494 *result = "application/octet-stream";
495 }
496
497 // We have enough information to determine if this was a Microsoft Office
498 // document or not, so sniffing is completed.
499 return true;
500 }
501
// Tags that indicate the content is likely XML.
// SniffXML() matches these, as ASCII case-insensitive prefixes, against the
// first tag that is neither an "<?xml" nor a "<!DOCTYPE" declaration.
static const MagicNumber kMagicXML[] = {
    MAGIC_STRING("application/atom+xml", "<feed"),
    MAGIC_STRING("application/rss+xml", "<rss"),
};
507
508 // Returns true and sets result if the content appears to contain XHTML or a
509 // feed.
510 // Clears have_enough_content if more data could possibly change the result.
511 //
512 // TODO(evanm): this is similar but more conservative than what Safari does,
513 // while HTML5 has a different recommendation -- what should we do?
514 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
515 // of ASCII -- do we care?
SniffXML(std::string_view content,bool * have_enough_content,std::string * result)516 static bool SniffXML(std::string_view content,
517 bool* have_enough_content,
518 std::string* result) {
519 // We allow at most 300 bytes of content before we expect the opening tag.
520 *have_enough_content &= TruncateStringPiece(300, &content);
521
522 // This loop iterates through tag-looking offsets in the file.
523 // We want to skip XML processing instructions (of the form "<?xml ...")
524 // and stop at the first "plain" tag, then make a decision on the mime-type
525 // based on the name (or possibly attributes) of that tag.
526 const int kMaxTagIterations = 5;
527 size_t pos = 0;
528 for (size_t i = 0; i < kMaxTagIterations && pos < content.length(); ++i) {
529 pos = content.find('<', pos);
530 if (pos == std::string_view::npos) {
531 return false;
532 }
533
534 std::string_view current = content.substr(pos);
535
536 // Skip XML and DOCTYPE declarations.
537 static constexpr std::string_view kXmlPrefix("<?xml");
538 static constexpr std::string_view kDocTypePrefix("<!DOCTYPE");
539 if (base::StartsWith(current, kXmlPrefix,
540 base::CompareCase::INSENSITIVE_ASCII) ||
541 base::StartsWith(current, kDocTypePrefix,
542 base::CompareCase::INSENSITIVE_ASCII)) {
543 ++pos;
544 continue;
545 }
546
547 if (CheckForMagicNumbers(current, kMagicXML, result))
548 return true;
549
550 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
551 // to identify.
552
553 // If we get here, we've hit an initial tag that hasn't matched one of the
554 // above tests. Abort.
555 return true;
556 }
557
558 // We iterated too far without finding a start tag.
559 // If we have more content to look at, we aren't going to change our mind by
560 // seeing more bytes from the network.
561 return pos < content.length();
562 }
563
// Byte order marks. SniffBinary() treats content that starts with any of these
// as text without scanning it for binary-looking bytes; the |mime_type| values
// in this table are not used by that caller.
static const MagicNumber kByteOrderMark[] = {
    MAGIC_NUMBER("text/plain", "\xFE\xFF"),      // UTF-16BE
    MAGIC_NUMBER("text/plain", "\xFF\xFE"),      // UTF-16LE
    MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF"),  // UTF-8
};
570
571 // Returns true and sets result to "application/octet-stream" if the content
572 // appears to be binary data. Otherwise, returns false and sets "text/plain".
573 // Clears have_enough_content if more data could possibly change the result.
SniffBinary(std::string_view content,bool * have_enough_content,std::string * result)574 static bool SniffBinary(std::string_view content,
575 bool* have_enough_content,
576 std::string* result) {
577 // There is no consensus about exactly how to sniff for binary content.
578 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
579 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
580 // Here, we side with FF, but with a smaller buffer. This size was chosen
581 // because it is small enough to comfortably fit into a single packet (after
582 // allowing for headers) and yet large enough to account for binary formats
583 // that have a significant amount of ASCII at the beginning (crbug.com/15314).
584 const bool is_truncated = TruncateStringPiece(kMaxBytesToSniff, &content);
585
586 // First, we look for a BOM.
587 std::string unused;
588 if (CheckForMagicNumbers(content, kByteOrderMark, &unused)) {
589 // If there is BOM, we think the buffer is not binary.
590 result->assign("text/plain");
591 return false;
592 }
593
594 // Next we look to see if any of the bytes "look binary."
595 if (LooksLikeBinary(content)) {
596 result->assign("application/octet-stream");
597 return true;
598 }
599
600 // No evidence either way. Default to non-binary and, if truncated, clear
601 // have_enough_content because there could be a binary looking byte in the
602 // truncated data.
603 *have_enough_content &= is_truncated;
604 result->assign("text/plain");
605 return false;
606 }
607
// Returns true if |mime_type| carries no usable information: it has no '/'
// (which includes the empty string), or it is one of a few well-known
// placeholder values.
static bool IsUnknownMimeType(std::string_view mime_type) {
  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
  // If we do, please be careful not to alter the semantics at all.

  // Firefox rejects a mime type if it does not contain a slash. Empty mime
  // types, which are as unknown as they get, are covered by this check too.
  const bool has_slash = mime_type.find('/') != std::string_view::npos;
  if (!has_slash) {
    return true;
  }

  // "unknown/unknown" is popular and uninformative, "application/unknown" is
  // the second most popular unknown type, and Firefox rejects a mime type if
  // it is exactly "*/*".
  return mime_type == "unknown/unknown" ||
         mime_type == "application/unknown" || mime_type == "*/*";
}
631
632 // Returns true and sets result if the content appears to be a crx (Chrome
633 // extension) file.
634 // Clears have_enough_content if more data could possibly change the result.
SniffCRX(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)635 static bool SniffCRX(std::string_view content,
636 const GURL& url,
637 bool* have_enough_content,
638 std::string* result) {
639 // Technically, the crx magic number is just Cr24, but the bytes after that
640 // are a version number which changes infrequently. Including it in the
641 // sniffing gives us less room for error. If the version number ever changes,
642 // we can just add an entry to this list.
643 static const struct MagicNumber kCRXMagicNumbers[] = {
644 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00"),
645 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x03\x00\x00\x00")};
646
647 // Only consider files that have the extension ".crx".
648 if (!url.path_piece().ends_with(".crx")) {
649 return false;
650 }
651
652 *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
653 return CheckForMagicNumbers(content, kCRXMagicNumbers, result);
654 }
655
ShouldSniffMimeType(const GURL & url,std::string_view mime_type)656 bool ShouldSniffMimeType(const GURL& url, std::string_view mime_type) {
657 bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() ||
658 #if BUILDFLAG(IS_ANDROID)
659 url.SchemeIs("content") ||
660 #endif
661 url.SchemeIsFile() || url.SchemeIsFileSystem();
662 if (!sniffable_scheme)
663 return false;
664
665 static const char* const kSniffableTypes[] = {
666 // Many web servers are misconfigured to send text/plain for many
667 // different types of content.
668 "text/plain",
669 // We want to sniff application/octet-stream for
670 // application/x-chrome-extension, but nothing else.
671 "application/octet-stream",
672 // XHTML and Atom/RSS feeds are often served as plain xml instead of
673 // their more specific mime types.
674 "text/xml",
675 "application/xml",
676 // Check for false Microsoft Office MIME types.
677 "application/msword",
678 "application/vnd.ms-excel",
679 "application/vnd.ms-powerpoint",
680 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
681 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
682 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
683 "application/vnd.ms-excel.sheet.macroenabled.12",
684 "application/vnd.ms-word.document.macroenabled.12",
685 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
686 "application/mspowerpoint",
687 "application/msexcel",
688 "application/vnd.ms-word",
689 "application/vnd.ms-word.document.12",
690 "application/vnd.msword",
691 };
692 for (const char* const sniffable_type : kSniffableTypes) {
693 if (mime_type == sniffable_type)
694 return true;
695 }
696 if (IsUnknownMimeType(mime_type)) {
697 // The web server didn't specify a content type or specified a mime
698 // type that we ignore.
699 return true;
700 }
701 return false;
702 }
703
// Sniffs |content| and writes the best guess for its MIME type to |*result|.
//
// |content| holds the bytes received so far, |url| is where they came from
// (its scheme and path extension are consulted), and |type_hint| is the MIME
// type declared for the content. |*result| starts out as |type_hint| and is
// overwritten by whichever sniffer recognizes the content.
// |force_sniff_file_url_for_html| controls whether file: URLs are sniffed for
// HTML.
//
// Returns true if the decision is final, and false if fewer bytes were
// supplied than some sniffer wanted, so that more data could still change
// |*result|.
//
// The sniffers run in a fixed order, and several steps return early, so the
// order of the statements below is significant.
bool SniffMimeType(std::string_view content,
                   const GURL& url,
                   const std::string& type_hint,
                   ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,
                   std::string* result) {
  // Sanity check.
  DCHECK_LT(content.length(), 1000000U);
  DCHECK(result);

  // By default, we assume we have enough content.
  // Each sniff routine may unset this if it wasn't provided enough content.
  bool have_enough_content = true;

  // By default, we'll return the type hint.
  // Each sniff routine may modify this if it has a better guess.
  result->assign(type_hint);

  // If the file has a Microsoft Office MIME type, we should only check that it
  // is a valid Office file.  Because this is the only reason we sniff files
  // with a Microsoft Office MIME type, we can return early.
  if (IsOfficeType(type_hint))
    return SniffForInvalidOfficeDocs(content, url, result);

  // Cache information about the type_hint
  bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);

  // First check for HTML, unless it's a file URL and
  // |force_sniff_file_url_for_html| is not kEnabled.
  if (hint_is_unknown_mime_type &&
      (!url.SchemeIsFile() ||
       force_sniff_file_url_for_html == ForceSniffFileUrlsForHtml::kEnabled)) {
    // We're only willing to sniff HTML if the server has not supplied a mime
    // type, or if the type it did supply indicates that it doesn't know what
    // the type should be.
    if (SniffForHTML(content, &have_enough_content, result))
      return true;  // We succeeded in sniffing HTML.  No more content needed.
  }

  // We're only willing to sniff for binary in 3 cases:
  // 1. The server has not supplied a mime type.
  // 2. The type it did supply indicates that it doesn't know what the type
  //    should be.
  // 3. The type is "text/plain" which is the default on some web servers and
  //    could be indicative of a mis-configuration that we shield the user from.
  const bool hint_is_text_plain = (type_hint == "text/plain");
  if (hint_is_unknown_mime_type || hint_is_text_plain) {
    // Note that SniffBinary() always overwrites |*result|: with
    // "application/octet-stream" when it returns true, and with "text/plain"
    // otherwise. The later steps may overwrite it again.
    if (!SniffBinary(content, &have_enough_content, result)) {
      // If the server said the content was text/plain and it doesn't appear
      // to be binary, then we trust it.
      if (hint_is_text_plain) {
        return have_enough_content;
      }
    }
  }

  // If we have plain XML, sniff XML subtypes.
  if (type_hint == "text/xml" || type_hint == "application/xml") {
    // We're not interested in sniffing these types for images and the like.
    // Instead, we're looking explicitly for a feed.  If we don't find one
    // we're done and return early.
    if (SniffXML(content, &have_enough_content, result))
      return true;
    return have_enough_content;
  }

  // CRX files (Chrome extensions) have a special sniffing algorithm. It is
  // tighter than the others because we don't have to match legacy behavior.
  if (SniffCRX(content, url, &have_enough_content, result))
    return true;

  // Check the file extension and magic numbers to see if this is an Office
  // document.  This needs to be checked before the general magic numbers
  // because zip files and Office documents (OOXML) have the same magic number.
  if (SniffForOfficeDocs(content, url, &have_enough_content, result)) {
    return true;  // We've matched a magic number.  No more content needed.
  }

  // We're not interested in sniffing for magic numbers when the type_hint
  // is application/octet-stream.  Time to bail out.
  if (type_hint == "application/octet-stream")
    return have_enough_content;

  // Now we look in our large table of magic numbers to see if we can find
  // anything that matches the content.
  if (SniffForMagicNumbers(content, &have_enough_content, result))
    return true;  // We've matched a magic number.  No more content needed.

  return have_enough_content;
}
793
SniffMimeTypeFromLocalData(std::string_view content,std::string * result)794 bool SniffMimeTypeFromLocalData(std::string_view content, std::string* result) {
795 // First check the extra table.
796 if (CheckForMagicNumbers(content, kExtraMagicNumbers, result))
797 return true;
798 // Finally check the original table.
799 return CheckForMagicNumbers(content, kMagicNumbers, result);
800 }
801
// Returns true if |content| contains at least one "binary data byte" as
// defined by https://mimesniff.spec.whatwg.org/#binary-data-byte: a byte below
// 0x20 other than tab, line feed, form feed, carriage return and escape.
// Empty content is not binary.
bool LooksLikeBinary(std::string_view content) {
  // One bit per control character 0x00..0x1F (bit N stands for byte N). A set
  // bit marks a control character that is still considered text.
  constexpr uint32_t kTextControlBits = (1u << '\t') | (1u << '\n') |
                                        (1u << '\f') | (1u << '\r') |
                                        (1u << '\x1b');

  for (size_t index = 0; index < content.size(); ++index) {
    const uint8_t byte = static_cast<uint8_t>(content[index]);
    if (byte >= 0x20) {
      continue;  // Not a control character at all.
    }
    const bool is_text_control = ((kTextControlBits >> byte) & 1u) != 0;
    if (!is_text_control) {
      return true;
    }
  }
  return false;
}
819
820 } // namespace net
821