1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "platform/mhtml/MHTMLParser.h"
33
34 #include "platform/MIMETypeRegistry.h"
35 #include "platform/mhtml/ArchiveResource.h"
36 #include "platform/mhtml/MHTMLArchive.h"
37 #include "platform/network/ParsedContentType.h"
38 #include "platform/text/QuotedPrintable.h"
39 #include "wtf/HashMap.h"
40 #include "wtf/RefCounted.h"
41 #include "wtf/text/Base64.h"
42 #include "wtf/text/StringBuilder.h"
43 #include "wtf/text/StringConcatenate.h"
44 #include "wtf/text/StringHash.h"
45 #include "wtf/text/WTFString.h"
46
47 namespace blink {
48
49 // This class is a limited MIME parser used to parse the MIME headers of MHTML files.
50 class MIMEHeader : public RefCountedWillBeGarbageCollectedFinalized<MIMEHeader> {
51 public:
create()52 static PassRefPtrWillBeRawPtr<MIMEHeader> create()
53 {
54 return adoptRefWillBeNoop(new MIMEHeader());
55 }
56
57 enum Encoding {
58 QuotedPrintable,
59 Base64,
60 EightBit,
61 SevenBit,
62 Binary,
63 Unknown
64 };
65
66 static PassRefPtrWillBeRawPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader);
67
isMultipart() const68 bool isMultipart() const { return m_contentType.startsWith("multipart/"); }
69
contentType() const70 String contentType() const { return m_contentType; }
charset() const71 String charset() const { return m_charset; }
contentTransferEncoding() const72 Encoding contentTransferEncoding() const { return m_contentTransferEncoding; }
contentLocation() const73 String contentLocation() const { return m_contentLocation; }
74
75 // Multi-part type and boundaries are only valid for multipart MIME headers.
multiPartType() const76 String multiPartType() const { return m_multipartType; }
endOfPartBoundary() const77 String endOfPartBoundary() const { return m_endOfPartBoundary; }
endOfDocumentBoundary() const78 String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; }
79
trace(Visitor *)80 void trace(Visitor*) { }
81
82 private:
83 MIMEHeader();
84
85 static Encoding parseContentTransferEncoding(const String&);
86
87 String m_contentType;
88 String m_charset;
89 Encoding m_contentTransferEncoding;
90 String m_contentLocation;
91 String m_multipartType;
92 String m_endOfPartBoundary;
93 String m_endOfDocumentBoundary;
94 };
95
96 typedef HashMap<String, String> KeyValueMap;
97
retrieveKeyValuePairs(blink::SharedBufferChunkReader * buffer)98 static KeyValueMap retrieveKeyValuePairs(blink::SharedBufferChunkReader* buffer)
99 {
100 KeyValueMap keyValuePairs;
101 String line;
102 String key;
103 StringBuilder value;
104 while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
105 if (line.isEmpty())
106 break; // Empty line means end of key/value section.
107 if (line[0] == '\t') {
108 ASSERT(!key.isEmpty());
109 value.append(line.substring(1));
110 continue;
111 }
112 // New key/value, store the previous one if any.
113 if (!key.isEmpty()) {
114 if (keyValuePairs.find(key) != keyValuePairs.end())
115 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data());
116 keyValuePairs.add(key, value.toString().stripWhiteSpace());
117 key = String();
118 value.clear();
119 }
120 size_t semiColonIndex = line.find(':');
121 if (semiColonIndex == kNotFound) {
122 // This is not a key value pair, ignore.
123 continue;
124 }
125 key = line.substring(0, semiColonIndex).lower().stripWhiteSpace();
126 value.append(line.substring(semiColonIndex + 1));
127 }
128 // Store the last property if there is one.
129 if (!key.isEmpty())
130 keyValuePairs.set(key, value.toString().stripWhiteSpace());
131 return keyValuePairs;
132 }
133
parseHeader(SharedBufferChunkReader * buffer)134 PassRefPtrWillBeRawPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer)
135 {
136 RefPtrWillBeRawPtr<MIMEHeader> mimeHeader = MIMEHeader::create();
137 KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer);
138 KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type");
139 if (mimeParametersIterator != keyValuePairs.end()) {
140 ParsedContentType parsedContentType(mimeParametersIterator->value);
141 mimeHeader->m_contentType = parsedContentType.mimeType();
142 if (!mimeHeader->isMultipart()) {
143 mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace();
144 } else {
145 mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type");
146 mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary");
147 if (mimeHeader->m_endOfPartBoundary.isNull()) {
148 WTF_LOG_ERROR("No boundary found in multipart MIME header.");
149 return nullptr;
150 }
151 mimeHeader->m_endOfPartBoundary.insert("--", 0);
152 mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary;
153 mimeHeader->m_endOfDocumentBoundary.append("--");
154 }
155 }
156
157 mimeParametersIterator = keyValuePairs.find("content-transfer-encoding");
158 if (mimeParametersIterator != keyValuePairs.end())
159 mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value);
160
161 mimeParametersIterator = keyValuePairs.find("content-location");
162 if (mimeParametersIterator != keyValuePairs.end())
163 mimeHeader->m_contentLocation = mimeParametersIterator->value;
164
165 return mimeHeader.release();
166 }
167
parseContentTransferEncoding(const String & text)168 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text)
169 {
170 String encoding = text.stripWhiteSpace().lower();
171 if (encoding == "base64")
172 return Base64;
173 if (encoding == "quoted-printable")
174 return QuotedPrintable;
175 if (encoding == "8bit")
176 return EightBit;
177 if (encoding == "7bit")
178 return SevenBit;
179 if (encoding == "binary")
180 return Binary;
181 WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data());
182 return Unknown;
183 }
184
MIMEHeader()185 MIMEHeader::MIMEHeader()
186 : m_contentTransferEncoding(Unknown)
187 {
188 }
189
skipLinesUntilBoundaryFound(SharedBufferChunkReader & lineReader,const String & boundary)190 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
191 {
192 String line;
193 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
194 if (line == boundary)
195 return true;
196 }
197 return false;
198 }
199
MHTMLParser(SharedBuffer * data)200 MHTMLParser::MHTMLParser(SharedBuffer* data)
201 : m_lineReader(data, "\r\n")
202 {
203 }
204
parseArchive()205 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchive()
206 {
207 RefPtrWillBeRawPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
208 return parseArchiveWithHeader(header.get());
209 }
210
parseArchiveWithHeader(MIMEHeader * header)211 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
212 {
213 if (!header) {
214 WTF_LOG_ERROR("Failed to parse MHTML part: no header.");
215 return nullptr;
216 }
217
218 RefPtrWillBeRawPtr<MHTMLArchive> archive = MHTMLArchive::create();
219 if (!header->isMultipart()) {
220 // With IE a page with no resource is not multi-part.
221 bool endOfArchiveReached = false;
222 RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
223 if (!resource)
224 return nullptr;
225 archive->setMainResource(resource);
226 return archive;
227 }
228
229 // Skip the message content (it's a generic browser specific message).
230 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
231
232 bool endOfArchive = false;
233 while (!endOfArchive) {
234 RefPtrWillBeRawPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
235 if (!resourceHeader) {
236 WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
237 return nullptr;
238 }
239 if (resourceHeader->contentType() == "multipart/alternative") {
240 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
241 RefPtrWillBeRawPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
242 if (!subframeArchive) {
243 WTF_LOG_ERROR("Failed to parse MHTML subframe.");
244 return nullptr;
245 }
246 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
247 ASSERT_UNUSED(endOfPartReached, endOfPartReached);
248 // The top-frame is the first frame found, regardless of the nesting level.
249 if (subframeArchive->mainResource())
250 addResourceToArchive(subframeArchive->mainResource(), archive.get());
251 archive->addSubframeArchive(subframeArchive);
252 continue;
253 }
254
255 RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
256 if (!resource) {
257 WTF_LOG_ERROR("Failed to parse MHTML part.");
258 return nullptr;
259 }
260 addResourceToArchive(resource.get(), archive.get());
261 }
262
263 return archive.release();
264 }
265
addResourceToArchive(ArchiveResource * resource,MHTMLArchive * archive)266 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
267 {
268 const AtomicString& mimeType = resource->mimeType();
269 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
270 m_resources.append(resource);
271 return;
272 }
273
274 // The first document suitable resource is the main frame.
275 if (!archive->mainResource()) {
276 archive->setMainResource(resource);
277 m_frames.append(archive);
278 return;
279 }
280
281 RefPtrWillBeRawPtr<MHTMLArchive> subframe = MHTMLArchive::create();
282 subframe->setMainResource(resource);
283 m_frames.append(subframe);
284 }
285
parseNextPart(const MIMEHeader & mimeHeader,const String & endOfPartBoundary,const String & endOfDocumentBoundary,bool & endOfArchiveReached)286 PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
287 {
288 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
289
290 // If no content transfer encoding is specified, default to binary encoding.
291 MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding();
292 if (contentTransferEncoding == MIMEHeader::Unknown)
293 contentTransferEncoding = MIMEHeader::Binary;
294
295 RefPtr<SharedBuffer> content = SharedBuffer::create();
296 const bool checkBoundary = !endOfPartBoundary.isEmpty();
297 bool endOfPartReached = false;
298 if (contentTransferEncoding == MIMEHeader::Binary) {
299 if (!checkBoundary) {
300 WTF_LOG_ERROR("Binary contents requires end of part");
301 return nullptr;
302 }
303 m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
304 Vector<char> part;
305 if (!m_lineReader.nextChunk(part)) {
306 WTF_LOG_ERROR("Binary contents requires end of part");
307 return nullptr;
308 }
309 content->append(part);
310 m_lineReader.setSeparator("\r\n");
311 Vector<char> nextChars;
312 if (m_lineReader.peek(nextChars, 2) != 2) {
313 WTF_LOG_ERROR("Invalid seperator.");
314 return nullptr;
315 }
316 endOfPartReached = true;
317 ASSERT(nextChars.size() == 2);
318 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
319 if (!endOfArchiveReached) {
320 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
321 if (!line.isEmpty()) {
322 WTF_LOG_ERROR("No CRLF at end of binary section.");
323 return nullptr;
324 }
325 }
326 } else {
327 String line;
328 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
329 endOfArchiveReached = (line == endOfDocumentBoundary);
330 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
331 endOfPartReached = true;
332 break;
333 }
334 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
335 content->append(line.utf8().data(), line.length());
336 if (contentTransferEncoding == MIMEHeader::QuotedPrintable) {
337 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
338 content->append("\r\n", 2);
339 }
340 }
341 }
342 if (!endOfPartReached && checkBoundary) {
343 WTF_LOG_ERROR("No bounday found for MHTML part.");
344 return nullptr;
345 }
346
347 Vector<char> data;
348 switch (contentTransferEncoding) {
349 case MIMEHeader::Base64:
350 if (!base64Decode(content->data(), content->size(), data)) {
351 WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
352 return nullptr;
353 }
354 break;
355 case MIMEHeader::QuotedPrintable:
356 quotedPrintableDecode(content->data(), content->size(), data);
357 break;
358 case MIMEHeader::EightBit:
359 case MIMEHeader::SevenBit:
360 case MIMEHeader::Binary:
361 data.append(content->data(), content->size());
362 break;
363 default:
364 WTF_LOG_ERROR("Invalid encoding for MHTML part.");
365 return nullptr;
366 }
367 RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
368 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
369 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
370 // IE and Firefox (UNMht) seem to generate only absolute URLs.
371 KURL location = KURL(KURL(), mimeHeader.contentLocation());
372 return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String());
373 }
374
frameCount() const375 size_t MHTMLParser::frameCount() const
376 {
377 return m_frames.size();
378 }
379
frameAt(size_t index) const380 MHTMLArchive* MHTMLParser::frameAt(size_t index) const
381 {
382 return m_frames[index].get();
383 }
384
subResourceCount() const385 size_t MHTMLParser::subResourceCount() const
386 {
387 return m_resources.size();
388 }
389
subResourceAt(size_t index) const390 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
391 {
392 return m_resources[index].get();
393 }
394
395 }
396