1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2009, 2010 Google Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
18 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
20 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "MarkupAccumulator.h"
29
30 #include "CDATASection.h"
31 #include "Comment.h"
32 #include "DocumentFragment.h"
33 #include "DocumentType.h"
34 #include "Editor.h"
35 #include "HTMLElement.h"
36 #include "HTMLNames.h"
37 #include "KURL.h"
38 #include "ProcessingInstruction.h"
39 #include "XMLNSNames.h"
40 #include <wtf/unicode/CharacterNames.h>
41
42 namespace WebCore {
43
44 using namespace HTMLNames;
45
appendCharactersReplacingEntities(Vector<UChar> & out,const UChar * content,size_t length,EntityMask entityMask)46 void appendCharactersReplacingEntities(Vector<UChar>& out, const UChar* content, size_t length, EntityMask entityMask)
47 {
48 DEFINE_STATIC_LOCAL(const String, ampReference, ("&"));
49 DEFINE_STATIC_LOCAL(const String, ltReference, ("<"));
50 DEFINE_STATIC_LOCAL(const String, gtReference, (">"));
51 DEFINE_STATIC_LOCAL(const String, quotReference, ("""));
52 DEFINE_STATIC_LOCAL(const String, nbspReference, (" "));
53
54 static const EntityDescription entityMaps[] = {
55 { '&', ampReference, EntityAmp },
56 { '<', ltReference, EntityLt },
57 { '>', gtReference, EntityGt },
58 { '"', quotReference, EntityQuot },
59 { noBreakSpace, nbspReference, EntityNbsp },
60 };
61
62 size_t positionAfterLastEntity = 0;
63 for (size_t i = 0; i < length; ++i) {
64 for (size_t m = 0; m < WTF_ARRAY_LENGTH(entityMaps); ++m) {
65 if (content[i] == entityMaps[m].entity && entityMaps[m].mask & entityMask) {
66 out.append(content + positionAfterLastEntity, i - positionAfterLastEntity);
67 append(out, entityMaps[m].reference);
68 positionAfterLastEntity = i + 1;
69 break;
70 }
71 }
72 }
73 out.append(content + positionAfterLastEntity, length - positionAfterLastEntity);
74 }
75
MarkupAccumulator(Vector<Node * > * nodes,EAbsoluteURLs shouldResolveURLs,const Range * range)76 MarkupAccumulator::MarkupAccumulator(Vector<Node*>* nodes, EAbsoluteURLs shouldResolveURLs, const Range* range)
77 : m_nodes(nodes)
78 , m_range(range)
79 , m_shouldResolveURLs(shouldResolveURLs)
80 {
81 }
82
~MarkupAccumulator()83 MarkupAccumulator::~MarkupAccumulator()
84 {
85 }
86
serializeNodes(Node * node,Node * nodeToSkip,EChildrenOnly childrenOnly)87 String MarkupAccumulator::serializeNodes(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly)
88 {
89 Vector<UChar> out;
90 serializeNodesWithNamespaces(node, nodeToSkip, childrenOnly, 0);
91 out.reserveInitialCapacity(length());
92 concatenateMarkup(out);
93 return String::adopt(out);
94 }
95
serializeNodesWithNamespaces(Node * node,Node * nodeToSkip,EChildrenOnly childrenOnly,const Namespaces * namespaces)96 void MarkupAccumulator::serializeNodesWithNamespaces(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly, const Namespaces* namespaces)
97 {
98 if (node == nodeToSkip)
99 return;
100
101 Namespaces namespaceHash;
102 if (namespaces)
103 namespaceHash = *namespaces;
104
105 if (!childrenOnly)
106 appendStartTag(node, &namespaceHash);
107
108 if (!(node->document()->isHTMLDocument() && elementCannotHaveEndTag(node))) {
109 for (Node* current = node->firstChild(); current; current = current->nextSibling())
110 serializeNodesWithNamespaces(current, nodeToSkip, IncludeNode, &namespaceHash);
111 }
112
113 if (!childrenOnly)
114 appendEndTag(node);
115 }
116
appendString(const String & string)117 void MarkupAccumulator::appendString(const String& string)
118 {
119 m_succeedingMarkup.append(string);
120 }
121
appendStartTag(Node * node,Namespaces * namespaces)122 void MarkupAccumulator::appendStartTag(Node* node, Namespaces* namespaces)
123 {
124 Vector<UChar> markup;
125 appendStartMarkup(markup, node, namespaces);
126 appendString(String::adopt(markup));
127 if (m_nodes)
128 m_nodes->append(node);
129 }
130
appendEndTag(Node * node)131 void MarkupAccumulator::appendEndTag(Node* node)
132 {
133 Vector<UChar> markup;
134 appendEndMarkup(markup, node);
135 appendString(String::adopt(markup));
136 }
137
totalLength(const Vector<String> & strings)138 size_t MarkupAccumulator::totalLength(const Vector<String>& strings)
139 {
140 size_t length = 0;
141 for (size_t i = 0; i < strings.size(); ++i)
142 length += strings[i].length();
143 return length;
144 }
145
146 // FIXME: This is a very inefficient way of accumulating the markup.
147 // We're converting results of appendStartMarkup and appendEndMarkup from Vector<UChar> to String
148 // and then back to Vector<UChar> and again to String here.
concatenateMarkup(Vector<UChar> & out)149 void MarkupAccumulator::concatenateMarkup(Vector<UChar>& out)
150 {
151 for (size_t i = 0; i < m_succeedingMarkup.size(); ++i)
152 append(out, m_succeedingMarkup[i]);
153 }
154
appendAttributeValue(Vector<UChar> & result,const String & attribute,bool documentIsHTML)155 void MarkupAccumulator::appendAttributeValue(Vector<UChar>& result, const String& attribute, bool documentIsHTML)
156 {
157 appendCharactersReplacingEntities(result, attribute.characters(), attribute.length(),
158 documentIsHTML ? EntityMaskInHTMLAttributeValue : EntityMaskInAttributeValue);
159 }
160
appendQuotedURLAttributeValue(Vector<UChar> & result,const String & urlString)161 void MarkupAccumulator::appendQuotedURLAttributeValue(Vector<UChar>& result, const String& urlString)
162 {
163 UChar quoteChar = '\"';
164 String strippedURLString = urlString.stripWhiteSpace();
165 if (protocolIsJavaScript(strippedURLString)) {
166 // minimal escaping for javascript urls
167 if (strippedURLString.contains('"')) {
168 if (strippedURLString.contains('\''))
169 strippedURLString.replace('\"', """);
170 else
171 quoteChar = '\'';
172 }
173 result.append(quoteChar);
174 append(result, strippedURLString);
175 result.append(quoteChar);
176 return;
177 }
178
179 // FIXME: This does not fully match other browsers. Firefox percent-escapes non-ASCII characters for innerHTML.
180 result.append(quoteChar);
181 appendAttributeValue(result, urlString, false);
182 result.append(quoteChar);
183 }
184
appendNodeValue(Vector<UChar> & out,const Node * node,const Range * range,EntityMask entityMask)185 void MarkupAccumulator::appendNodeValue(Vector<UChar>& out, const Node* node, const Range* range, EntityMask entityMask)
186 {
187 String str = node->nodeValue();
188 const UChar* characters = str.characters();
189 size_t length = str.length();
190
191 if (range) {
192 ExceptionCode ec;
193 if (node == range->endContainer(ec))
194 length = range->endOffset(ec);
195 if (node == range->startContainer(ec)) {
196 size_t start = range->startOffset(ec);
197 characters += start;
198 length -= start;
199 }
200 }
201
202 appendCharactersReplacingEntities(out, characters, length, entityMask);
203 }
204
shouldAddNamespaceElement(const Element * element)205 bool MarkupAccumulator::shouldAddNamespaceElement(const Element* element)
206 {
207 // Don't add namespace attribute if it is already defined for this elem.
208 const AtomicString& prefix = element->prefix();
209 AtomicString attr = !prefix.isEmpty() ? "xmlns:" + prefix : "xmlns";
210 return !element->hasAttribute(attr);
211 }
212
shouldAddNamespaceAttribute(const Attribute & attribute,Namespaces & namespaces)213 bool MarkupAccumulator::shouldAddNamespaceAttribute(const Attribute& attribute, Namespaces& namespaces)
214 {
215 namespaces.checkConsistency();
216
217 // Don't add namespace attributes twice
218 if (attribute.name() == XMLNSNames::xmlnsAttr) {
219 namespaces.set(emptyAtom.impl(), attribute.value().impl());
220 return false;
221 }
222
223 QualifiedName xmlnsPrefixAttr(xmlnsAtom, attribute.localName(), XMLNSNames::xmlnsNamespaceURI);
224 if (attribute.name() == xmlnsPrefixAttr) {
225 namespaces.set(attribute.localName().impl(), attribute.value().impl());
226 return false;
227 }
228
229 return true;
230 }
231
appendNamespace(Vector<UChar> & result,const AtomicString & prefix,const AtomicString & namespaceURI,Namespaces & namespaces)232 void MarkupAccumulator::appendNamespace(Vector<UChar>& result, const AtomicString& prefix, const AtomicString& namespaceURI, Namespaces& namespaces)
233 {
234 namespaces.checkConsistency();
235 if (namespaceURI.isEmpty())
236 return;
237
238 // Use emptyAtoms's impl() for both null and empty strings since the HashMap can't handle 0 as a key
239 AtomicStringImpl* pre = prefix.isEmpty() ? emptyAtom.impl() : prefix.impl();
240 AtomicStringImpl* foundNS = namespaces.get(pre);
241 if (foundNS != namespaceURI.impl()) {
242 namespaces.set(pre, namespaceURI.impl());
243 result.append(' ');
244 append(result, xmlnsAtom.string());
245 if (!prefix.isEmpty()) {
246 result.append(':');
247 append(result, prefix);
248 }
249
250 result.append('=');
251 result.append('"');
252 appendAttributeValue(result, namespaceURI, false);
253 result.append('"');
254 }
255 }
256
entityMaskForText(Text * text) const257 EntityMask MarkupAccumulator::entityMaskForText(Text* text) const
258 {
259 const QualifiedName* parentName = 0;
260 if (text->parentElement())
261 parentName = &static_cast<Element*>(text->parentElement())->tagQName();
262
263 if (parentName && (*parentName == scriptTag || *parentName == styleTag || *parentName == xmpTag))
264 return EntityMaskInCDATA;
265
266 return text->document()->isHTMLDocument() ? EntityMaskInHTMLPCDATA : EntityMaskInPCDATA;
267 }
268
appendText(Vector<UChar> & out,Text * text)269 void MarkupAccumulator::appendText(Vector<UChar>& out, Text* text)
270 {
271 appendNodeValue(out, text, m_range, entityMaskForText(text));
272 }
273
appendComment(Vector<UChar> & out,const String & comment)274 void MarkupAccumulator::appendComment(Vector<UChar>& out, const String& comment)
275 {
276 // FIXME: Comment content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "-->".
277 append(out, "<!--");
278 append(out, comment);
279 append(out, "-->");
280 }
281
appendDocumentType(Vector<UChar> & result,const DocumentType * n)282 void MarkupAccumulator::appendDocumentType(Vector<UChar>& result, const DocumentType* n)
283 {
284 if (n->name().isEmpty())
285 return;
286
287 append(result, "<!DOCTYPE ");
288 append(result, n->name());
289 if (!n->publicId().isEmpty()) {
290 append(result, " PUBLIC \"");
291 append(result, n->publicId());
292 append(result, "\"");
293 if (!n->systemId().isEmpty()) {
294 append(result, " \"");
295 append(result, n->systemId());
296 append(result, "\"");
297 }
298 } else if (!n->systemId().isEmpty()) {
299 append(result, " SYSTEM \"");
300 append(result, n->systemId());
301 append(result, "\"");
302 }
303 if (!n->internalSubset().isEmpty()) {
304 append(result, " [");
305 append(result, n->internalSubset());
306 append(result, "]");
307 }
308 append(result, ">");
309 }
310
appendProcessingInstruction(Vector<UChar> & out,const String & target,const String & data)311 void MarkupAccumulator::appendProcessingInstruction(Vector<UChar>& out, const String& target, const String& data)
312 {
313 // FIXME: PI data is not escaped, but XMLSerializer (and possibly other callers) this should raise an exception if it includes "?>".
314 append(out, "<?");
315 append(out, target);
316 append(out, " ");
317 append(out, data);
318 append(out, "?>");
319 }
320
appendElement(Vector<UChar> & out,Element * element,Namespaces * namespaces)321 void MarkupAccumulator::appendElement(Vector<UChar>& out, Element* element, Namespaces* namespaces)
322 {
323 appendOpenTag(out, element, namespaces);
324
325 NamedNodeMap* attributes = element->attributes();
326 unsigned length = attributes->length();
327 for (unsigned int i = 0; i < length; i++)
328 appendAttribute(out, element, *attributes->attributeItem(i), namespaces);
329
330 appendCloseTag(out, element);
331 }
332
appendOpenTag(Vector<UChar> & out,Element * element,Namespaces * namespaces)333 void MarkupAccumulator::appendOpenTag(Vector<UChar>& out, Element* element, Namespaces* namespaces)
334 {
335 out.append('<');
336 append(out, element->nodeNamePreservingCase());
337 if (!element->document()->isHTMLDocument() && namespaces && shouldAddNamespaceElement(element))
338 appendNamespace(out, element->prefix(), element->namespaceURI(), *namespaces);
339 }
340
appendCloseTag(Vector<UChar> & out,Element * element)341 void MarkupAccumulator::appendCloseTag(Vector<UChar>& out, Element* element)
342 {
343 if (shouldSelfClose(element)) {
344 if (element->isHTMLElement())
345 out.append(' '); // XHTML 1.0 <-> HTML compatibility.
346 out.append('/');
347 }
348 out.append('>');
349 }
350
appendAttribute(Vector<UChar> & out,Element * element,const Attribute & attribute,Namespaces * namespaces)351 void MarkupAccumulator::appendAttribute(Vector<UChar>& out, Element* element, const Attribute& attribute, Namespaces* namespaces)
352 {
353 bool documentIsHTML = element->document()->isHTMLDocument();
354
355 out.append(' ');
356
357 if (documentIsHTML)
358 append(out, attribute.name().localName());
359 else
360 append(out, attribute.name().toString());
361
362 out.append('=');
363
364 if (element->isURLAttribute(const_cast<Attribute*>(&attribute))) {
365 // We don't want to complete file:/// URLs because it may contain sensitive information
366 // about the user's system.
367 if (shouldResolveURLs() && !element->document()->url().isLocalFile())
368 appendQuotedURLAttributeValue(out, element->document()->completeURL(attribute.value()).string());
369 else
370 appendQuotedURLAttributeValue(out, attribute.value());
371 } else {
372 out.append('\"');
373 appendAttributeValue(out, attribute.value(), documentIsHTML);
374 out.append('\"');
375 }
376
377 if (!documentIsHTML && namespaces && shouldAddNamespaceAttribute(attribute, *namespaces))
378 appendNamespace(out, attribute.prefix(), attribute.namespaceURI(), *namespaces);
379 }
380
appendCDATASection(Vector<UChar> & out,const String & section)381 void MarkupAccumulator::appendCDATASection(Vector<UChar>& out, const String& section)
382 {
383 // FIXME: CDATA content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "]]>".
384 append(out, "<![CDATA[");
385 append(out, section);
386 append(out, "]]>");
387 }
388
appendStartMarkup(Vector<UChar> & result,const Node * node,Namespaces * namespaces)389 void MarkupAccumulator::appendStartMarkup(Vector<UChar>& result, const Node* node, Namespaces* namespaces)
390 {
391 if (namespaces)
392 namespaces->checkConsistency();
393
394 switch (node->nodeType()) {
395 case Node::TEXT_NODE:
396 appendText(result, static_cast<Text*>(const_cast<Node*>(node)));
397 break;
398 case Node::COMMENT_NODE:
399 appendComment(result, static_cast<const Comment*>(node)->data());
400 break;
401 case Node::DOCUMENT_NODE:
402 case Node::DOCUMENT_FRAGMENT_NODE:
403 break;
404 case Node::DOCUMENT_TYPE_NODE:
405 appendDocumentType(result, static_cast<const DocumentType*>(node));
406 break;
407 case Node::PROCESSING_INSTRUCTION_NODE:
408 appendProcessingInstruction(result, static_cast<const ProcessingInstruction*>(node)->target(), static_cast<const ProcessingInstruction*>(node)->data());
409 break;
410 case Node::ELEMENT_NODE:
411 appendElement(result, static_cast<Element*>(const_cast<Node*>(node)), namespaces);
412 break;
413 case Node::CDATA_SECTION_NODE:
414 appendCDATASection(result, static_cast<const CDATASection*>(node)->data());
415 break;
416 case Node::ATTRIBUTE_NODE:
417 case Node::ENTITY_NODE:
418 case Node::ENTITY_REFERENCE_NODE:
419 case Node::NOTATION_NODE:
420 case Node::XPATH_NAMESPACE_NODE:
421 ASSERT_NOT_REACHED();
422 break;
423 }
424 }
425
426 // Rules of self-closure
427 // 1. No elements in HTML documents use the self-closing syntax.
428 // 2. Elements w/ children never self-close because they use a separate end tag.
429 // 3. HTML elements which do not have a "forbidden" end tag will close with a separate end tag.
430 // 4. Other elements self-close.
shouldSelfClose(const Node * node)431 bool MarkupAccumulator::shouldSelfClose(const Node* node)
432 {
433 if (node->document()->isHTMLDocument())
434 return false;
435 if (node->hasChildNodes())
436 return false;
437 if (node->isHTMLElement() && !elementCannotHaveEndTag(node))
438 return false;
439 return true;
440 }
441
elementCannotHaveEndTag(const Node * node)442 bool MarkupAccumulator::elementCannotHaveEndTag(const Node* node)
443 {
444 if (!node->isHTMLElement())
445 return false;
446
447 // FIXME: ieForbidsInsertHTML may not be the right function to call here
448 // ieForbidsInsertHTML is used to disallow setting innerHTML/outerHTML
449 // or createContextualFragment. It does not necessarily align with
450 // which elements should be serialized w/o end tags.
451 return static_cast<const HTMLElement*>(node)->ieForbidsInsertHTML();
452 }
453
appendEndMarkup(Vector<UChar> & result,const Node * node)454 void MarkupAccumulator::appendEndMarkup(Vector<UChar>& result, const Node* node)
455 {
456 if (!node->isElementNode() || shouldSelfClose(node) || (!node->hasChildNodes() && elementCannotHaveEndTag(node)))
457 return;
458
459 result.append('<');
460 result.append('/');
461 append(result, static_cast<const Element*>(node)->nodeNamePreservingCase());
462 result.append('>');
463 }
464
465 }
466