• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "config.h"
32 #include "platform/mhtml/MHTMLParser.h"
33 
34 #include "platform/MIMETypeRegistry.h"
35 #include "platform/mhtml/ArchiveResource.h"
36 #include "platform/mhtml/MHTMLArchive.h"
37 #include "platform/network/ParsedContentType.h"
38 #include "platform/text/QuotedPrintable.h"
39 #include "wtf/HashMap.h"
40 #include "wtf/RefCounted.h"
41 #include "wtf/text/Base64.h"
42 #include "wtf/text/StringBuilder.h"
43 #include "wtf/text/StringConcatenate.h"
44 #include "wtf/text/StringHash.h"
45 #include "wtf/text/WTFString.h"
46 
47 namespace blink {
48 
49 // This class is a limited MIME parser used to parse the MIME headers of MHTML files.
50 class MIMEHeader : public RefCountedWillBeGarbageCollectedFinalized<MIMEHeader> {
51 public:
create()52     static PassRefPtrWillBeRawPtr<MIMEHeader> create()
53     {
54         return adoptRefWillBeNoop(new MIMEHeader());
55     }
56 
57     enum Encoding {
58         QuotedPrintable,
59         Base64,
60         EightBit,
61         SevenBit,
62         Binary,
63         Unknown
64     };
65 
66     static PassRefPtrWillBeRawPtr<MIMEHeader> parseHeader(SharedBufferChunkReader* crLFLineReader);
67 
isMultipart() const68     bool isMultipart() const { return m_contentType.startsWith("multipart/"); }
69 
contentType() const70     String contentType() const { return m_contentType; }
charset() const71     String charset() const { return m_charset; }
contentTransferEncoding() const72     Encoding contentTransferEncoding() const { return m_contentTransferEncoding; }
contentLocation() const73     String contentLocation() const { return m_contentLocation; }
74 
75     // Multi-part type and boundaries are only valid for multipart MIME headers.
multiPartType() const76     String multiPartType() const { return m_multipartType; }
endOfPartBoundary() const77     String endOfPartBoundary() const { return m_endOfPartBoundary; }
endOfDocumentBoundary() const78     String endOfDocumentBoundary() const { return m_endOfDocumentBoundary; }
79 
trace(Visitor *)80     void trace(Visitor*) { }
81 
82 private:
83     MIMEHeader();
84 
85     static Encoding parseContentTransferEncoding(const String&);
86 
87     String m_contentType;
88     String m_charset;
89     Encoding m_contentTransferEncoding;
90     String m_contentLocation;
91     String m_multipartType;
92     String m_endOfPartBoundary;
93     String m_endOfDocumentBoundary;
94 };
95 
96 typedef HashMap<String, String> KeyValueMap;
97 
retrieveKeyValuePairs(blink::SharedBufferChunkReader * buffer)98 static KeyValueMap retrieveKeyValuePairs(blink::SharedBufferChunkReader* buffer)
99 {
100     KeyValueMap keyValuePairs;
101     String line;
102     String key;
103     StringBuilder value;
104     while (!(line = buffer->nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
105         if (line.isEmpty())
106             break; // Empty line means end of key/value section.
107         if (line[0] == '\t') {
108             ASSERT(!key.isEmpty());
109             value.append(line.substring(1));
110             continue;
111         }
112         // New key/value, store the previous one if any.
113         if (!key.isEmpty()) {
114             if (keyValuePairs.find(key) != keyValuePairs.end())
115                 WTF_LOG_ERROR("Key duplicate found in MIME header. Key is '%s', previous value replaced.", key.ascii().data());
116             keyValuePairs.add(key, value.toString().stripWhiteSpace());
117             key = String();
118             value.clear();
119         }
120         size_t semiColonIndex = line.find(':');
121         if (semiColonIndex == kNotFound) {
122             // This is not a key value pair, ignore.
123             continue;
124         }
125         key = line.substring(0, semiColonIndex).lower().stripWhiteSpace();
126         value.append(line.substring(semiColonIndex + 1));
127     }
128     // Store the last property if there is one.
129     if (!key.isEmpty())
130         keyValuePairs.set(key, value.toString().stripWhiteSpace());
131     return keyValuePairs;
132 }
133 
parseHeader(SharedBufferChunkReader * buffer)134 PassRefPtrWillBeRawPtr<MIMEHeader> MIMEHeader::parseHeader(SharedBufferChunkReader* buffer)
135 {
136     RefPtrWillBeRawPtr<MIMEHeader> mimeHeader = MIMEHeader::create();
137     KeyValueMap keyValuePairs = retrieveKeyValuePairs(buffer);
138     KeyValueMap::iterator mimeParametersIterator = keyValuePairs.find("content-type");
139     if (mimeParametersIterator != keyValuePairs.end()) {
140         ParsedContentType parsedContentType(mimeParametersIterator->value);
141         mimeHeader->m_contentType = parsedContentType.mimeType();
142         if (!mimeHeader->isMultipart()) {
143             mimeHeader->m_charset = parsedContentType.charset().stripWhiteSpace();
144         } else {
145             mimeHeader->m_multipartType = parsedContentType.parameterValueForName("type");
146             mimeHeader->m_endOfPartBoundary = parsedContentType.parameterValueForName("boundary");
147             if (mimeHeader->m_endOfPartBoundary.isNull()) {
148                 WTF_LOG_ERROR("No boundary found in multipart MIME header.");
149                 return nullptr;
150             }
151             mimeHeader->m_endOfPartBoundary.insert("--", 0);
152             mimeHeader->m_endOfDocumentBoundary = mimeHeader->m_endOfPartBoundary;
153             mimeHeader->m_endOfDocumentBoundary.append("--");
154         }
155     }
156 
157     mimeParametersIterator = keyValuePairs.find("content-transfer-encoding");
158     if (mimeParametersIterator != keyValuePairs.end())
159         mimeHeader->m_contentTransferEncoding = parseContentTransferEncoding(mimeParametersIterator->value);
160 
161     mimeParametersIterator = keyValuePairs.find("content-location");
162     if (mimeParametersIterator != keyValuePairs.end())
163         mimeHeader->m_contentLocation = mimeParametersIterator->value;
164 
165     return mimeHeader.release();
166 }
167 
parseContentTransferEncoding(const String & text)168 MIMEHeader::Encoding MIMEHeader::parseContentTransferEncoding(const String& text)
169 {
170     String encoding = text.stripWhiteSpace().lower();
171     if (encoding == "base64")
172         return Base64;
173     if (encoding == "quoted-printable")
174         return QuotedPrintable;
175     if (encoding == "8bit")
176         return EightBit;
177     if (encoding == "7bit")
178         return SevenBit;
179     if (encoding == "binary")
180         return Binary;
181     WTF_LOG_ERROR("Unknown encoding '%s' found in MIME header.", text.ascii().data());
182     return Unknown;
183 }
184 
MIMEHeader()185 MIMEHeader::MIMEHeader()
186     : m_contentTransferEncoding(Unknown)
187 {
188 }
189 
skipLinesUntilBoundaryFound(SharedBufferChunkReader & lineReader,const String & boundary)190 static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
191 {
192     String line;
193     while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
194         if (line == boundary)
195             return true;
196     }
197     return false;
198 }
199 
MHTMLParser(SharedBuffer * data)200 MHTMLParser::MHTMLParser(SharedBuffer* data)
201     : m_lineReader(data, "\r\n")
202 {
203 }
204 
parseArchive()205 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchive()
206 {
207     RefPtrWillBeRawPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
208     return parseArchiveWithHeader(header.get());
209 }
210 
parseArchiveWithHeader(MIMEHeader * header)211 PassRefPtrWillBeRawPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
212 {
213     if (!header) {
214         WTF_LOG_ERROR("Failed to parse MHTML part: no header.");
215         return nullptr;
216     }
217 
218     RefPtrWillBeRawPtr<MHTMLArchive> archive = MHTMLArchive::create();
219     if (!header->isMultipart()) {
220         // With IE a page with no resource is not multi-part.
221         bool endOfArchiveReached = false;
222         RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
223         if (!resource)
224             return nullptr;
225         archive->setMainResource(resource);
226         return archive;
227     }
228 
229     // Skip the message content (it's a generic browser specific message).
230     skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
231 
232     bool endOfArchive = false;
233     while (!endOfArchive) {
234         RefPtrWillBeRawPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
235         if (!resourceHeader) {
236             WTF_LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
237             return nullptr;
238         }
239         if (resourceHeader->contentType() == "multipart/alternative") {
240             // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
241             RefPtrWillBeRawPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
242             if (!subframeArchive) {
243                 WTF_LOG_ERROR("Failed to parse MHTML subframe.");
244                 return nullptr;
245             }
246             bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
247             ASSERT_UNUSED(endOfPartReached, endOfPartReached);
248             // The top-frame is the first frame found, regardless of the nesting level.
249             if (subframeArchive->mainResource())
250                 addResourceToArchive(subframeArchive->mainResource(), archive.get());
251             archive->addSubframeArchive(subframeArchive);
252             continue;
253         }
254 
255         RefPtrWillBeRawPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
256         if (!resource) {
257             WTF_LOG_ERROR("Failed to parse MHTML part.");
258             return nullptr;
259         }
260         addResourceToArchive(resource.get(), archive.get());
261     }
262 
263     return archive.release();
264 }
265 
addResourceToArchive(ArchiveResource * resource,MHTMLArchive * archive)266 void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
267 {
268     const AtomicString& mimeType = resource->mimeType();
269     if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
270         m_resources.append(resource);
271         return;
272     }
273 
274     // The first document suitable resource is the main frame.
275     if (!archive->mainResource()) {
276         archive->setMainResource(resource);
277         m_frames.append(archive);
278         return;
279     }
280 
281     RefPtrWillBeRawPtr<MHTMLArchive> subframe = MHTMLArchive::create();
282     subframe->setMainResource(resource);
283     m_frames.append(subframe);
284 }
285 
parseNextPart(const MIMEHeader & mimeHeader,const String & endOfPartBoundary,const String & endOfDocumentBoundary,bool & endOfArchiveReached)286 PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
287 {
288     ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
289 
290     // If no content transfer encoding is specified, default to binary encoding.
291     MIMEHeader::Encoding contentTransferEncoding = mimeHeader.contentTransferEncoding();
292     if (contentTransferEncoding == MIMEHeader::Unknown)
293         contentTransferEncoding = MIMEHeader::Binary;
294 
295     RefPtr<SharedBuffer> content = SharedBuffer::create();
296     const bool checkBoundary = !endOfPartBoundary.isEmpty();
297     bool endOfPartReached = false;
298     if (contentTransferEncoding == MIMEHeader::Binary) {
299         if (!checkBoundary) {
300             WTF_LOG_ERROR("Binary contents requires end of part");
301             return nullptr;
302         }
303         m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
304         Vector<char> part;
305         if (!m_lineReader.nextChunk(part)) {
306             WTF_LOG_ERROR("Binary contents requires end of part");
307             return nullptr;
308         }
309         content->append(part);
310         m_lineReader.setSeparator("\r\n");
311         Vector<char> nextChars;
312         if (m_lineReader.peek(nextChars, 2) != 2) {
313             WTF_LOG_ERROR("Invalid seperator.");
314             return nullptr;
315         }
316         endOfPartReached = true;
317         ASSERT(nextChars.size() == 2);
318         endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
319         if (!endOfArchiveReached) {
320             String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
321             if (!line.isEmpty()) {
322                 WTF_LOG_ERROR("No CRLF at end of binary section.");
323                 return nullptr;
324             }
325         }
326     } else {
327         String line;
328         while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
329             endOfArchiveReached = (line == endOfDocumentBoundary);
330             if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
331                 endOfPartReached = true;
332                 break;
333             }
334             // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
335             content->append(line.utf8().data(), line.length());
336             if (contentTransferEncoding == MIMEHeader::QuotedPrintable) {
337                 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
338                 content->append("\r\n", 2);
339             }
340         }
341     }
342     if (!endOfPartReached && checkBoundary) {
343         WTF_LOG_ERROR("No bounday found for MHTML part.");
344         return nullptr;
345     }
346 
347     Vector<char> data;
348     switch (contentTransferEncoding) {
349     case MIMEHeader::Base64:
350         if (!base64Decode(content->data(), content->size(), data)) {
351             WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
352             return nullptr;
353         }
354         break;
355     case MIMEHeader::QuotedPrintable:
356         quotedPrintableDecode(content->data(), content->size(), data);
357         break;
358     case MIMEHeader::EightBit:
359     case MIMEHeader::SevenBit:
360     case MIMEHeader::Binary:
361         data.append(content->data(), content->size());
362         break;
363     default:
364         WTF_LOG_ERROR("Invalid encoding for MHTML part.");
365         return nullptr;
366     }
367     RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
368     // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
369     // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
370     // IE and Firefox (UNMht) seem to generate only absolute URLs.
371     KURL location = KURL(KURL(), mimeHeader.contentLocation());
372     return ArchiveResource::create(contentBuffer, location, AtomicString(mimeHeader.contentType()), AtomicString(mimeHeader.charset()), String());
373 }
374 
frameCount() const375 size_t MHTMLParser::frameCount() const
376 {
377     return m_frames.size();
378 }
379 
frameAt(size_t index) const380 MHTMLArchive* MHTMLParser::frameAt(size_t index) const
381 {
382     return m_frames[index].get();
383 }
384 
subResourceCount() const385 size_t MHTMLParser::subResourceCount() const
386 {
387     return m_resources.size();
388 }
389 
subResourceAt(size_t index) const390 ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
391 {
392     return m_resources[index].get();
393 }
394 
395 }
396