1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "platform/mhtml/MHTMLArchive.h"
33
34 #include "platform/DateComponents.h"
35 #include "platform/MIMETypeRegistry.h"
36 #include "platform/SerializedResource.h"
37 #include "platform/SharedBuffer.h"
38 #include "platform/mhtml/MHTMLParser.h"
39 #include "platform/text/QuotedPrintable.h"
40 #include "platform/weborigin/SchemeRegistry.h"
41 #include "wtf/CryptographicallyRandomNumber.h"
42 #include "wtf/DateMath.h"
43 #include "wtf/text/Base64.h"
44 #include "wtf/text/StringBuilder.h"
45
46 namespace WebCore {
47
// Names written into the Content-Transfer-Encoding header of each MHTML part.
// generateMHTMLData() selects exactly one of them per serialized resource.
const char* const binary = "binary";
const char* const base64 = "base64";
const char* const quotedPrintable = "quoted-printable";
51
generateRandomBoundary()52 static String generateRandomBoundary()
53 {
54 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
55 const size_t randomValuesLength = 10;
56 char randomValues[randomValuesLength];
57 cryptographicallyRandomValues(&randomValues, randomValuesLength);
58 StringBuilder stringBuilder;
59 stringBuilder.append("----=_NextPart_000_");
60 for (size_t i = 0; i < randomValuesLength; ++i) {
61 if (i == 2)
62 stringBuilder.append('_');
63 else if (i == 6)
64 stringBuilder.append('.');
65 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
66 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
67 }
68 return stringBuilder.toString();
69 }
70
// Returns a copy of |text| in which every character that isASCIIPrintable()
// rejects has been replaced by '?'. The result has the same length as |text|.
static String replaceNonPrintableCharacters(const String& text)
{
    StringBuilder sanitized;
    for (size_t position = 0; position < text.length(); ++position) {
        if (!isASCIIPrintable(text[position])) {
            sanitized.append('?');
            continue;
        }
        sanitized.append(text[position]);
    }
    return sanitized.toString();
}
82
MHTMLArchive()83 MHTMLArchive::MHTMLArchive()
84 {
85 }
86
~MHTMLArchive()87 MHTMLArchive::~MHTMLArchive()
88 {
89 // Because all frames know about each other we need to perform a deep clearing of the archives graph.
90 clearAllSubframeArchives();
91 }
92
create()93 PassRefPtr<MHTMLArchive> MHTMLArchive::create()
94 {
95 return adoptRef(new MHTMLArchive);
96 }
97
create(const KURL & url,SharedBuffer * data)98 PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data)
99 {
100 // For security reasons we only load MHTML pages from local URLs.
101 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol()))
102 return nullptr;
103
104 MHTMLParser parser(data);
105 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive();
106 if (!mainArchive)
107 return nullptr; // Invalid MHTML file.
108
109 // Since MHTML is a flat format, we need to make all frames aware of all resources.
110 for (size_t i = 0; i < parser.frameCount(); ++i) {
111 RefPtr<MHTMLArchive> archive = parser.frameAt(i);
112 for (size_t j = 1; j < parser.frameCount(); ++j) {
113 if (i != j)
114 archive->addSubframeArchive(parser.frameAt(j));
115 }
116 for (size_t j = 0; j < parser.subResourceCount(); ++j)
117 archive->addSubresource(parser.subResourceAt(j));
118 }
119 return mainArchive.release();
120 }
121
generateMHTMLData(const Vector<SerializedResource> & resources,EncodingPolicy encodingPolicy,const String & title,const String & mimeType)122 PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(const Vector<SerializedResource>& resources, EncodingPolicy encodingPolicy, const String& title, const String& mimeType)
123 {
124 String boundary = generateRandomBoundary();
125 String endOfResourceBoundary = "--" + boundary + "\r\n";
126
127 DateComponents now;
128 now.setMillisecondsSinceEpochForDateTime(currentTimeMS());
129 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.fullYear(), now.hour(), now.minute(), now.second(), 0);
130
131 StringBuilder stringBuilder;
132 stringBuilder.append("From: <Saved by WebKit>\r\n");
133 stringBuilder.append("Subject: ");
134 // We replace non ASCII characters with '?' characters to match IE's behavior.
135 stringBuilder.append(replaceNonPrintableCharacters(title));
136 stringBuilder.append("\r\nDate: ");
137 stringBuilder.append(dateString);
138 stringBuilder.append("\r\nMIME-Version: 1.0\r\n");
139 stringBuilder.append("Content-Type: multipart/related;\r\n");
140 stringBuilder.append("\ttype=\"");
141 stringBuilder.append(mimeType);
142 stringBuilder.append("\";\r\n");
143 stringBuilder.append("\tboundary=\"");
144 stringBuilder.append(boundary);
145 stringBuilder.append("\"\r\n\r\n");
146
147 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it).
148 ASSERT(stringBuilder.toString().containsOnlyASCII());
149 CString asciiString = stringBuilder.toString().utf8();
150 RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create();
151 mhtmlData->append(asciiString.data(), asciiString.length());
152
153 for (size_t i = 0; i < resources.size(); ++i) {
154 const SerializedResource& resource = resources[i];
155
156 stringBuilder.clear();
157 stringBuilder.append(endOfResourceBoundary);
158 stringBuilder.append("Content-Type: ");
159 stringBuilder.append(resource.mimeType);
160
161 const char* contentEncoding = 0;
162 if (encodingPolicy == UseBinaryEncoding)
163 contentEncoding = binary;
164 else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
165 contentEncoding = quotedPrintable;
166 else
167 contentEncoding = base64;
168
169 stringBuilder.append("\r\nContent-Transfer-Encoding: ");
170 stringBuilder.append(contentEncoding);
171 stringBuilder.append("\r\nContent-Location: ");
172 stringBuilder.append(resource.url);
173 stringBuilder.append("\r\n\r\n");
174
175 asciiString = stringBuilder.toString().utf8();
176 mhtmlData->append(asciiString.data(), asciiString.length());
177
178 if (!strcmp(contentEncoding, binary)) {
179 const char* data;
180 size_t position = 0;
181 while (size_t length = resource.data->getSomeData(data, position)) {
182 mhtmlData->append(data, length);
183 position += length;
184 }
185 } else {
186 // FIXME: ideally we would encode the content as a stream without having to fetch it all.
187 const char* data = resource.data->data();
188 size_t dataLength = resource.data->size();
189 Vector<char> encodedData;
190 if (!strcmp(contentEncoding, quotedPrintable)) {
191 quotedPrintableEncode(data, dataLength, encodedData);
192 mhtmlData->append(encodedData.data(), encodedData.size());
193 mhtmlData->append("\r\n", 2);
194 } else {
195 ASSERT(!strcmp(contentEncoding, base64));
196 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
197 base64Encode(data, dataLength, encodedData);
198 const size_t maximumLineLength = 76;
199 size_t index = 0;
200 size_t encodedDataLength = encodedData.size();
201 do {
202 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
203 mhtmlData->append(encodedData.data() + index, lineLength);
204 mhtmlData->append("\r\n", 2);
205 index += maximumLineLength;
206 } while (index < encodedDataLength);
207 }
208 }
209 }
210
211 asciiString = String("--" + boundary + "--\r\n").utf8();
212 mhtmlData->append(asciiString.data(), asciiString.length());
213
214 return mhtmlData.release();
215 }
216
clearAllSubframeArchives()217 void MHTMLArchive::clearAllSubframeArchives()
218 {
219 Vector<RefPtr<MHTMLArchive> > clearedArchives;
220 clearAllSubframeArchivesImpl(&clearedArchives);
221 }
222
clearAllSubframeArchivesImpl(Vector<RefPtr<MHTMLArchive>> * clearedArchives)223 void MHTMLArchive::clearAllSubframeArchivesImpl(Vector<RefPtr<MHTMLArchive> >* clearedArchives)
224 {
225 for (Vector<RefPtr<MHTMLArchive> >::iterator it = m_subframeArchives.begin(); it != m_subframeArchives.end(); ++it) {
226 if (!clearedArchives->contains(*it)) {
227 clearedArchives->append(*it);
228 (*it)->clearAllSubframeArchivesImpl(clearedArchives);
229 }
230 }
231 m_subframeArchives.clear();
232 }
233
234 }
235