1 /*
2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1999,2001 Lars Knoll (knoll@kde.org)
5 (C) 2000,2001 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
7 Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8
9 This library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Library General Public
11 License as published by the Free Software Foundation; either
12 version 2 of the License, or (at your option) any later version.
13
14 This library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Library General Public License for more details.
18
19 You should have received a copy of the GNU Library General Public License
20 along with this library; see the file COPYING.LIB. If not, write to
21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 Boston, MA 02110-1301, USA.
23 */
24
25 #include "config.h"
26 #include "HTMLParser.h"
27
28 #include "CharacterNames.h"
29 #include "CSSPropertyNames.h"
30 #include "CSSValueKeywords.h"
31 #include "ChromeClient.h"
32 #include "Comment.h"
33 #include "Console.h"
34 #include "DOMWindow.h"
35 #include "DocumentFragment.h"
36 #include "DocumentType.h"
37 #include "Frame.h"
38 #include "HTMLBodyElement.h"
39 #include "HTMLDocument.h"
40 #include "HTMLDivElement.h"
41 #include "HTMLDListElement.h"
42 #include "HTMLElementFactory.h"
43 #include "HTMLFormElement.h"
44 #include "HTMLHeadElement.h"
45 #include "HTMLHRElement.h"
46 #include "HTMLHtmlElement.h"
47 #include "HTMLIsIndexElement.h"
48 #include "HTMLMapElement.h"
49 #include "HTMLNames.h"
50 #include "HTMLParserQuirks.h"
51 #include "HTMLTableCellElement.h"
52 #include "HTMLTableRowElement.h"
53 #include "HTMLTableSectionElement.h"
54 #include "HTMLTokenizer.h"
55 #include "LocalizedStrings.h"
56 #include "Page.h"
57 #include "Settings.h"
58 #include "Text.h"
59 #include <wtf/StdLibExtras.h>
60
61 namespace WebCore {
62
63 using namespace HTMLNames;
64
// Depth limits used by the parser. Neither constant is referenced in the part
// of the file reviewed here; judging by the names they bound redundant-tag
// nesting and residual-style handling -- TODO confirm against their users.
static const unsigned cMaxRedundantTagDepth = 20;
static const unsigned cResidualStyleMaxDepth = 200;

// Elements whose tagPriority() is at least this value are subject to the
// cMaxBlockDepth cap in insertNode().
static const int minBlockLevelTagPriority = 3;

// A cap on the number of tags with priority minBlockLevelTagPriority or higher
// allowed in m_blockStack. The cap is enforced by adding such new elements as
// siblings instead of children once it is reached.
static const size_t cMaxBlockDepth = 4096;
74
75 struct HTMLStackElem : Noncopyable {
HTMLStackElemWebCore::HTMLStackElem76 HTMLStackElem(const AtomicString& t, int lvl, Node* n, bool r, HTMLStackElem* nx)
77 : tagName(t)
78 , level(lvl)
79 , strayTableContent(false)
80 , node(n)
81 , didRefNode(r)
82 , next(nx)
83 {
84 }
85
derefNodeWebCore::HTMLStackElem86 void derefNode()
87 {
88 if (didRefNode)
89 node->deref();
90 }
91
92 AtomicString tagName;
93 int level;
94 bool strayTableContent;
95 Node* node;
96 bool didRefNode;
97 HTMLStackElem* next;
98 };
99
100 /**
101 * The parser parses tokenized input into the document, building up the
102 * document tree. If the document is well-formed, parsing it is straightforward.
103 *
104 * Unfortunately, we have to handle many HTML documents that are not well-formed,
105 * so the parser has to be tolerant about errors.
106 *
107 * We have to take care of at least the following error conditions:
108 *
* 1. The element being added is explicitly forbidden inside some outer tag.
*    In this case we should close all tags up to the one that forbids
*    the element, and add it afterwards.
112 *
113 * 2. We are not allowed to add the element directly. It could be that
114 * the person writing the document forgot some tag in between (or that the
115 * tag in between is optional). This could be the case with the following
116 * tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?).
117 *
* 3. We want to add a block element inside an inline element. Close all
*    inline elements up to the next higher block element.
120 *
121 * 4. If this doesn't help, close elements until we are allowed to add the
122 * element or ignore the tag.
123 *
124 */
125
// Creates a parser that builds a whole document.
//   doc          - the document that receives the parsed tree; it is also the
//                  initial current node. No reference is taken on it
//                  (m_didRefCurrent starts out false).
//   reportErrors - stored in m_reportErrors.
// Parser quirks are requested from the page's chrome client when the document
// has a page; otherwise the parser runs without a quirks object.
HTMLParser::HTMLParser(HTMLDocument* doc, bool reportErrors)
    : m_document(doc)
    , m_current(doc)
    , m_didRefCurrent(false)
    , m_blockStack(0)
    , m_blocksInStack(0)
    , m_hasPElementInScope(NotInScope)
    , m_inBody(false)
    , m_haveContent(false)
    , m_haveFrameSet(false)
    , m_isParsingFragment(false)
    , m_reportErrors(reportErrors)
    , m_handlingResidualStyleAcrossBlocks(false)
    , m_inStrayTableContent(0)
    , m_parserQuirks(m_document->page() ? m_document->page()->chrome()->client()->createHTMLParserQuirks() : 0)
{
}
143
HTMLParser(DocumentFragment * frag)144 HTMLParser::HTMLParser(DocumentFragment* frag)
145 : m_document(frag->document())
146 , m_current(frag)
147 , m_didRefCurrent(true)
148 , m_blockStack(0)
149 , m_blocksInStack(0)
150 , m_hasPElementInScope(NotInScope)
151 , m_inBody(true)
152 , m_haveContent(false)
153 , m_haveFrameSet(false)
154 , m_isParsingFragment(true)
155 , m_reportErrors(false)
156 , m_handlingResidualStyleAcrossBlocks(false)
157 , m_inStrayTableContent(0)
158 , m_parserQuirks(m_document->page() ? m_document->page()->chrome()->client()->createHTMLParserQuirks() : 0)
159 {
160 if (frag)
161 frag->ref();
162 }
163
~HTMLParser()164 HTMLParser::~HTMLParser()
165 {
166 freeBlock();
167 if (m_didRefCurrent)
168 m_current->deref();
169 }
170
reset()171 void HTMLParser::reset()
172 {
173 ASSERT(!m_isParsingFragment);
174
175 setCurrent(m_document);
176
177 freeBlock();
178
179 m_inBody = false;
180 m_haveFrameSet = false;
181 m_haveContent = false;
182 m_inStrayTableContent = 0;
183
184 m_currentFormElement = 0;
185 m_currentMapElement = 0;
186 m_head = 0;
187 m_isindexElement = 0;
188
189 m_skipModeTag = nullAtom;
190
191 if (m_parserQuirks)
192 m_parserQuirks->reset();
193 }
194
setCurrent(Node * newCurrent)195 void HTMLParser::setCurrent(Node* newCurrent)
196 {
197 bool didRefNewCurrent = newCurrent && newCurrent != m_document;
198 if (didRefNewCurrent)
199 newCurrent->ref();
200 if (m_didRefCurrent)
201 m_current->deref();
202 m_current = newCurrent;
203 m_didRefCurrent = didRefNewCurrent;
204 }
205
parseToken(Token * t)206 PassRefPtr<Node> HTMLParser::parseToken(Token* t)
207 {
208 if (!m_skipModeTag.isNull()) {
209 if (!t->beginTag && t->tagName == m_skipModeTag)
210 // Found the end tag for the current skip mode, so we're done skipping.
211 m_skipModeTag = nullAtom;
212 else if (m_current->localName() == t->tagName)
213 // Do not skip </iframe>.
214 // FIXME: What does that comment mean? How can it be right to parse a token without clearing m_skipModeTag?
215 ;
216 else
217 return 0;
218 }
219
220 // Apparently some sites use </br> instead of <br>. Be compatible with IE and Firefox and treat this like <br>.
221 if (t->isCloseTag(brTag) && m_document->inCompatMode()) {
222 reportError(MalformedBRError);
223 t->beginTag = true;
224 }
225
226 if (!t->beginTag) {
227 processCloseTag(t);
228 return 0;
229 }
230
231 // Ignore spaces, if we're not inside a paragraph or other inline code.
232 // Do not alter the text if it is part of a scriptTag.
233 if (t->tagName == textAtom && t->text && m_current->localName() != scriptTag) {
234 if (m_inBody && !skipMode() && m_current->localName() != styleTag &&
235 m_current->localName() != titleTag && !t->text->containsOnlyWhitespace())
236 m_haveContent = true;
237
238 RefPtr<Node> n;
239 String text = t->text.get();
240 unsigned charsLeft = text.length();
241 while (charsLeft) {
242 // split large blocks of text to nodes of manageable size
243 n = Text::createWithLengthLimit(m_document, text, charsLeft);
244 if (!insertNode(n.get(), t->selfClosingTag))
245 return 0;
246 }
247 return n;
248 }
249
250 RefPtr<Node> n = getNode(t);
251 // just to be sure, and to catch currently unimplemented stuff
252 if (!n)
253 return 0;
254
255 // set attributes
256 if (n->isHTMLElement()) {
257 HTMLElement* e = static_cast<HTMLElement*>(n.get());
258 e->setAttributeMap(t->attrs.get());
259
260 // take care of optional close tags
261 if (e->endTagRequirement() == TagStatusOptional)
262 popBlock(t->tagName);
263
264 // If the node does not have a forbidden end tag requirement, and if the broken XML self-closing
265 // syntax was used, report an error.
266 if (t->brokenXMLStyle && e->endTagRequirement() != TagStatusForbidden) {
267 if (t->tagName == scriptTag)
268 reportError(IncorrectXMLCloseScriptWarning);
269 else
270 reportError(IncorrectXMLSelfCloseError, &t->tagName);
271 }
272 }
273
274 if (!insertNode(n.get(), t->selfClosingTag)) {
275 // we couldn't insert the node
276
277 if (n->isElementNode()) {
278 Element* e = static_cast<Element*>(n.get());
279 e->setAttributeMap(0);
280 }
281
282 if (m_currentMapElement == n)
283 m_currentMapElement = 0;
284
285 if (m_currentFormElement == n)
286 m_currentFormElement = 0;
287
288 if (m_head == n)
289 m_head = 0;
290
291 return 0;
292 }
293 return n;
294 }
295
parseDoctypeToken(DoctypeToken * t)296 void HTMLParser::parseDoctypeToken(DoctypeToken* t)
297 {
298 // Ignore any doctype after the first. Ignore doctypes in fragments.
299 if (m_document->doctype() || m_isParsingFragment || m_current != m_document)
300 return;
301
302 // Make a new doctype node and set it as our doctype.
303 m_document->addChild(DocumentType::create(m_document, String::adopt(t->m_name), String::adopt(t->m_publicID), String::adopt(t->m_systemID)));
304 }
305
isTableSection(const Node * n)306 static bool isTableSection(const Node* n)
307 {
308 return n->hasTagName(tbodyTag) || n->hasTagName(tfootTag) || n->hasTagName(theadTag);
309 }
310
isTablePart(const Node * n)311 static bool isTablePart(const Node* n)
312 {
313 return n->hasTagName(trTag) || n->hasTagName(tdTag) || n->hasTagName(thTag) ||
314 isTableSection(n);
315 }
316
isTableRelated(const Node * n)317 static bool isTableRelated(const Node* n)
318 {
319 return n->hasTagName(tableTag) || isTablePart(n);
320 }
321
isScopingTag(const AtomicString & tagName)322 static bool isScopingTag(const AtomicString& tagName)
323 {
324 return tagName == appletTag || tagName == captionTag || tagName == tdTag || tagName == thTag || tagName == buttonTag || tagName == marqueeTag || tagName == objectTag || tagName == tableTag || tagName == htmlTag;
325 }
326
insertNode(Node * n,bool flat)327 bool HTMLParser::insertNode(Node* n, bool flat)
328 {
329 RefPtr<Node> protectNode(n);
330
331 const AtomicString& localName = n->localName();
332 int tagPriority = n->isHTMLElement() ? static_cast<HTMLElement*>(n)->tagPriority() : 0;
333
334 // <table> is never allowed inside stray table content. Always pop out of the stray table content
335 // and close up the first table, and then start the second table as a sibling.
336 if (m_inStrayTableContent && localName == tableTag)
337 popBlock(tableTag);
338
339 if (tagPriority >= minBlockLevelTagPriority) {
340 while (m_blocksInStack >= cMaxBlockDepth)
341 popBlock(m_blockStack->tagName);
342 }
343
344 if (m_parserQuirks && !m_parserQuirks->shouldInsertNode(m_current, n))
345 return false;
346
347 // let's be stupid and just try to insert it.
348 // this should work if the document is well-formed
349 Node* newNode = m_current->addChild(n);
350 if (!newNode)
351 return handleError(n, flat, localName, tagPriority); // Try to handle the error.
352
353 // don't push elements without end tags (e.g., <img>) on the stack
354 bool parentAttached = m_current->attached();
355 if (tagPriority > 0 && !flat) {
356 if (newNode == m_current) {
357 // This case should only be hit when a demoted <form> is placed inside a table.
358 ASSERT(localName == formTag);
359 reportError(FormInsideTablePartError, &m_current->localName());
360 HTMLFormElement* form = static_cast<HTMLFormElement*>(n);
361 form->setDemoted(true);
362 } else {
363 // The pushBlock function transfers ownership of current to the block stack
364 // so we're guaranteed that m_didRefCurrent is false. The code below is an
365 // optimized version of setCurrent that takes advantage of that fact and also
366 // assumes that newNode is neither 0 nor a pointer to the document.
367 pushBlock(localName, tagPriority);
368 newNode->beginParsingChildren();
369 ASSERT(!m_didRefCurrent);
370 newNode->ref();
371 m_current = newNode;
372 m_didRefCurrent = true;
373 }
374 if (parentAttached && !n->attached() && !m_isParsingFragment)
375 n->attach();
376 } else {
377 if (parentAttached && !n->attached() && !m_isParsingFragment)
378 n->attach();
379 n->finishParsingChildren();
380 }
381
382 if (localName == htmlTag && m_document->frame())
383 m_document->frame()->loader()->dispatchDocumentElementAvailable();
384
385 return true;
386 }
387
handleError(Node * n,bool flat,const AtomicString & localName,int tagPriority)388 bool HTMLParser::handleError(Node* n, bool flat, const AtomicString& localName, int tagPriority)
389 {
390 // Error handling code. This is just ad hoc handling of specific parent/child combinations.
391 HTMLElement* e;
392 bool handled = false;
393
394 // 1. Check out the element's tag name to decide how to deal with errors.
395 if (n->isHTMLElement()) {
396 HTMLElement* h = static_cast<HTMLElement*>(n);
397 if (h->hasLocalName(trTag) || h->hasLocalName(thTag) || h->hasLocalName(tdTag)) {
398 if (m_inStrayTableContent && !isTableRelated(m_current)) {
399 reportError(MisplacedTablePartError, &localName, &m_current->localName());
400 // pop out to the nearest enclosing table-related tag.
401 while (m_blockStack && !isTableRelated(m_current))
402 popOneBlock();
403 return insertNode(n);
404 }
405 } else if (h->hasLocalName(headTag)) {
406 if (!m_current->isDocumentNode() && !m_current->hasTagName(htmlTag)) {
407 reportError(MisplacedHeadError);
408 return false;
409 }
410 } else if (h->hasLocalName(metaTag) || h->hasLocalName(linkTag) || h->hasLocalName(baseTag)) {
411 bool createdHead = false;
412 if (!m_head) {
413 createHead();
414 createdHead = true;
415 }
416 if (m_head) {
417 if (!createdHead)
418 reportError(MisplacedHeadContentError, &localName, &m_current->localName());
419 if (m_head->addChild(n)) {
420 if (!n->attached() && !m_isParsingFragment)
421 n->attach();
422 return true;
423 } else
424 return false;
425 }
426 } else if (h->hasLocalName(htmlTag)) {
427 if (!m_current->isDocumentNode() ) {
428 if (m_document->documentElement() && m_document->documentElement()->hasTagName(htmlTag)) {
429 reportError(RedundantHTMLBodyError, &localName);
430 // we have another <HTML> element.... apply attributes to existing one
431 // make sure we don't overwrite already existing attributes
432 NamedNodeMap* map = static_cast<Element*>(n)->attributes(true);
433 Element* existingHTML = static_cast<Element*>(m_document->documentElement());
434 NamedNodeMap* bmap = existingHTML->attributes(false);
435 for (unsigned l = 0; map && l < map->length(); ++l) {
436 Attribute* it = map->attributeItem(l);
437 if (!bmap->getAttributeItem(it->name()))
438 existingHTML->setAttribute(it->name(), it->value());
439 }
440 }
441 return false;
442 }
443 } else if (h->hasLocalName(titleTag) || h->hasLocalName(styleTag) || h->hasLocalName(scriptTag)) {
444 bool createdHead = false;
445 if (!m_head) {
446 createHead();
447 createdHead = true;
448 }
449 if (m_head) {
450 Node* newNode = m_head->addChild(n);
451 if (!newNode) {
452 setSkipMode(h->tagQName());
453 return false;
454 }
455
456 if (!createdHead)
457 reportError(MisplacedHeadContentError, &localName, &m_current->localName());
458
459 pushBlock(localName, tagPriority);
460 newNode->beginParsingChildren();
461 setCurrent(newNode);
462 if (!n->attached() && !m_isParsingFragment)
463 n->attach();
464 return true;
465 }
466 if (m_inBody) {
467 setSkipMode(h->tagQName());
468 return false;
469 }
470 } else if (h->hasLocalName(bodyTag)) {
471 if (m_inBody && m_document->body()) {
472 // we have another <BODY> element.... apply attributes to existing one
473 // make sure we don't overwrite already existing attributes
474 // some sites use <body bgcolor=rightcolor>...<body bgcolor=wrongcolor>
475 reportError(RedundantHTMLBodyError, &localName);
476 NamedNodeMap* map = static_cast<Element*>(n)->attributes(true);
477 Element* existingBody = m_document->body();
478 NamedNodeMap* bmap = existingBody->attributes(false);
479 for (unsigned l = 0; map && l < map->length(); ++l) {
480 Attribute* it = map->attributeItem(l);
481 if (!bmap->getAttributeItem(it->name()))
482 existingBody->setAttribute(it->name(), it->value());
483 }
484 return false;
485 }
486 else if (!m_current->isDocumentNode())
487 return false;
488 } else if (h->hasLocalName(areaTag)) {
489 if (m_currentMapElement) {
490 reportError(MisplacedAreaError, &m_current->localName());
491 m_currentMapElement->addChild(n);
492 if (!n->attached() && !m_isParsingFragment)
493 n->attach();
494 handled = true;
495 return true;
496 }
497 return false;
498 } else if (h->hasLocalName(colgroupTag) || h->hasLocalName(captionTag)) {
499 if (isTableRelated(m_current)) {
500 while (m_blockStack && isTablePart(m_current))
501 popOneBlock();
502 return insertNode(n);
503 }
504 }
505 } else if (n->isCommentNode() && !m_head)
506 return false;
507
508 // 2. Next we examine our currently active element to do some further error handling.
509 if (m_current->isHTMLElement()) {
510 HTMLElement* h = static_cast<HTMLElement*>(m_current);
511 const AtomicString& currentTagName = h->localName();
512 if (h->hasLocalName(htmlTag)) {
513 HTMLElement* elt = n->isHTMLElement() ? static_cast<HTMLElement*>(n) : 0;
514 if (elt && (elt->hasLocalName(scriptTag) || elt->hasLocalName(styleTag) ||
515 elt->hasLocalName(metaTag) || elt->hasLocalName(linkTag) ||
516 elt->hasLocalName(objectTag) || elt->hasLocalName(embedTag) ||
517 elt->hasLocalName(titleTag) || elt->hasLocalName(isindexTag) ||
518 elt->hasLocalName(baseTag))) {
519 if (!m_head) {
520 m_head = new HTMLHeadElement(headTag, m_document);
521 insertNode(m_head.get());
522 handled = true;
523 }
524 } else {
525 if (n->isTextNode()) {
526 Text* t = static_cast<Text*>(n);
527 if (t->containsOnlyWhitespace())
528 return false;
529 }
530 if (!m_haveFrameSet) {
531 // Ensure that head exists.
532 // But not for older versions of Mail, where the implicit <head> isn't expected - <rdar://problem/6863795>
533 if (shouldCreateImplicitHead(m_document))
534 createHead();
535
536 popBlock(headTag);
537 e = new HTMLBodyElement(bodyTag, m_document);
538 startBody();
539 insertNode(e);
540 handled = true;
541 } else
542 reportError(MisplacedFramesetContentError, &localName);
543 }
544 } else if (h->hasLocalName(headTag)) {
545 if (n->hasTagName(htmlTag))
546 return false;
547 else {
548 // This means the body starts here...
549 if (!m_haveFrameSet) {
550 ASSERT(currentTagName == headTag);
551 popBlock(currentTagName);
552 e = new HTMLBodyElement(bodyTag, m_document);
553 startBody();
554 insertNode(e);
555 handled = true;
556 } else
557 reportError(MisplacedFramesetContentError, &localName);
558 }
559 } else if (h->hasLocalName(addressTag) || h->hasLocalName(fontTag)
560 || h->hasLocalName(styleTag) || h->hasLocalName(titleTag)) {
561 reportError(MisplacedContentRetryError, &localName, ¤tTagName);
562 popBlock(currentTagName);
563 handled = true;
564 } else if (h->hasLocalName(captionTag)) {
565 // Illegal content in a caption. Close the caption and try again.
566 reportError(MisplacedCaptionContentError, &localName);
567 popBlock(currentTagName);
568 if (isTablePart(n))
569 return insertNode(n, flat);
570 } else if (h->hasLocalName(tableTag) || h->hasLocalName(trTag) || isTableSection(h)) {
571 if (n->hasTagName(tableTag)) {
572 reportError(MisplacedTableError, ¤tTagName);
573 if (m_isParsingFragment && !h->hasLocalName(tableTag))
574 // fragment may contain table parts without <table> ancestor, pop them one by one
575 popBlock(h->localName());
576 popBlock(localName); // end the table
577 handled = true; // ...and start a new one
578 } else {
579 ExceptionCode ec = 0;
580 Node* node = m_current;
581 Node* parent = node->parentNode();
582 // A script may have removed the current node's parent from the DOM
583 // http://bugs.webkit.org/show_bug.cgi?id=7137
584 // FIXME: we should do real recovery here and re-parent with the correct node.
585 if (!parent)
586 return false;
587 Node* grandparent = parent->parentNode();
588
589 if (n->isTextNode() ||
590 (h->hasLocalName(trTag) &&
591 isTableSection(parent) && grandparent && grandparent->hasTagName(tableTag)) ||
592 ((!n->hasTagName(tdTag) && !n->hasTagName(thTag) &&
593 !n->hasTagName(formTag) && !n->hasTagName(scriptTag)) && isTableSection(node) &&
594 parent->hasTagName(tableTag))) {
595 node = (node->hasTagName(tableTag)) ? node :
596 ((node->hasTagName(trTag)) ? grandparent : parent);
597 // This can happen with fragments
598 if (!node)
599 return false;
600 Node* parent = node->parentNode();
601 if (!parent)
602 return false;
603 parent->insertBefore(n, node, ec);
604 if (!ec) {
605 reportError(StrayTableContentError, &localName, ¤tTagName);
606 if (n->isHTMLElement() && tagPriority > 0 &&
607 !flat && static_cast<HTMLElement*>(n)->endTagRequirement() != TagStatusForbidden)
608 {
609 pushBlock(localName, tagPriority);
610 n->beginParsingChildren();
611 setCurrent(n);
612 m_inStrayTableContent++;
613 m_blockStack->strayTableContent = true;
614 }
615 return true;
616 }
617 }
618
619 if (!ec) {
620 if (m_current->hasTagName(trTag)) {
621 reportError(TablePartRequiredError, &localName, &tdTag.localName());
622 e = new HTMLTableCellElement(tdTag, m_document);
623 } else if (m_current->hasTagName(tableTag)) {
624 // Don't report an error in this case, since making a <tbody> happens all the time when you have <table><tr>,
625 // and it isn't really a parse error per se.
626 e = new HTMLTableSectionElement(tbodyTag, m_document);
627 } else {
628 reportError(TablePartRequiredError, &localName, &trTag.localName());
629 e = new HTMLTableRowElement(trTag, m_document);
630 }
631
632 insertNode(e);
633 handled = true;
634 }
635 }
636 } else if (h->hasLocalName(objectTag)) {
637 reportError(MisplacedContentRetryError, &localName, ¤tTagName);
638 popBlock(objectTag);
639 handled = true;
640 } else if (h->hasLocalName(pTag) || isHeaderTag(currentTagName)) {
641 if (!isInline(n)) {
642 popBlock(currentTagName);
643 handled = true;
644 }
645 } else if (h->hasLocalName(optionTag) || h->hasLocalName(optgroupTag)) {
646 if (localName == optgroupTag) {
647 popBlock(currentTagName);
648 handled = true;
649 } else if (localName == selectTag) {
650 // IE treats a nested select as </select>. Let's do the same
651 popBlock(localName);
652 }
653 } else if (h->hasLocalName(selectTag)) {
654 if (localName == inputTag || localName == textareaTag) {
655 reportError(MisplacedContentRetryError, &localName, ¤tTagName);
656 popBlock(currentTagName);
657 handled = true;
658 }
659 } else if (h->hasLocalName(colgroupTag)) {
660 popBlock(currentTagName);
661 handled = true;
662 } else if (!h->hasLocalName(bodyTag)) {
663 if (isInline(m_current)) {
664 popInlineBlocks();
665 handled = true;
666 }
667 }
668 } else if (m_current->isDocumentNode()) {
669 if (n->isTextNode()) {
670 Text* t = static_cast<Text*>(n);
671 if (t->containsOnlyWhitespace())
672 return false;
673 }
674
675 if (!m_document->documentElement()) {
676 e = new HTMLHtmlElement(htmlTag, m_document);
677 insertNode(e);
678 handled = true;
679 }
680 }
681
682 // 3. If we couldn't handle the error, just return false and attempt to error-correct again.
683 if (!handled) {
684 reportError(IgnoredContentError, &localName, &m_current->localName());
685 return false;
686 }
687 return insertNode(n);
688 }
689
// Signature shared by the HTMLParser::*CreateErrorCheck member functions
// defined below, and the map type getNode() uses to look one up by the tag
// name's AtomicStringImpl.
typedef bool (HTMLParser::*CreateErrorCheckFunc)(Token* t, RefPtr<Node>&);
typedef HashMap<AtomicStringImpl*, CreateErrorCheckFunc> FunctionMap;
692
textCreateErrorCheck(Token * t,RefPtr<Node> & result)693 bool HTMLParser::textCreateErrorCheck(Token* t, RefPtr<Node>& result)
694 {
695 result = new Text(m_document, t->text.get());
696 return false;
697 }
698
commentCreateErrorCheck(Token * t,RefPtr<Node> & result)699 bool HTMLParser::commentCreateErrorCheck(Token* t, RefPtr<Node>& result)
700 {
701 result = new Comment(m_document, t->text.get());
702 return false;
703 }
704
headCreateErrorCheck(Token *,RefPtr<Node> & result)705 bool HTMLParser::headCreateErrorCheck(Token*, RefPtr<Node>& result)
706 {
707 if (!m_head || m_current->localName() == htmlTag) {
708 m_head = new HTMLHeadElement(headTag, m_document);
709 result = m_head;
710 } else
711 reportError(MisplacedHeadError);
712 return false;
713 }
714
bodyCreateErrorCheck(Token *,RefPtr<Node> &)715 bool HTMLParser::bodyCreateErrorCheck(Token*, RefPtr<Node>&)
716 {
717 // body no longer allowed if we have a frameset
718 if (m_haveFrameSet)
719 return false;
720
721 // Ensure that head exists (unless parsing a fragment).
722 // But not for older versions of Mail, where the implicit <head> isn't expected - <rdar://problem/6863795>
723 if (!m_isParsingFragment && shouldCreateImplicitHead(m_document))
724 createHead();
725
726 popBlock(headTag);
727 startBody();
728 return true;
729 }
730
framesetCreateErrorCheck(Token *,RefPtr<Node> &)731 bool HTMLParser::framesetCreateErrorCheck(Token*, RefPtr<Node>&)
732 {
733 popBlock(headTag);
734 if (m_inBody && !m_haveFrameSet && !m_haveContent) {
735 popBlock(bodyTag);
736 // ### actually for IE document.body returns the now hidden "body" element
737 // we can't implement that behaviour now because it could cause too many
738 // regressions and the headaches are not worth the work as long as there is
739 // no site actually relying on that detail (Dirk)
740 if (m_document->body())
741 m_document->body()->setAttribute(styleAttr, "display:none");
742 m_inBody = false;
743 }
744 if ((m_haveContent || m_haveFrameSet) && m_current->localName() == htmlTag)
745 return false;
746 m_haveFrameSet = true;
747 startBody();
748 return true;
749 }
750
formCreateErrorCheck(Token * t,RefPtr<Node> & result)751 bool HTMLParser::formCreateErrorCheck(Token* t, RefPtr<Node>& result)
752 {
753 // Only create a new form if we're not already inside one.
754 // This is consistent with other browsers' behavior.
755 if (!m_currentFormElement) {
756 m_currentFormElement = new HTMLFormElement(formTag, m_document);
757 result = m_currentFormElement;
758 pCloserCreateErrorCheck(t, result);
759 }
760 return false;
761 }
762
isindexCreateErrorCheck(Token * t,RefPtr<Node> & result)763 bool HTMLParser::isindexCreateErrorCheck(Token* t, RefPtr<Node>& result)
764 {
765 RefPtr<Node> n = handleIsindex(t);
766 if (!m_inBody)
767 m_isindexElement = n.release();
768 else {
769 t->selfClosingTag = true;
770 result = n.release();
771 }
772 return false;
773 }
774
selectCreateErrorCheck(Token *,RefPtr<Node> &)775 bool HTMLParser::selectCreateErrorCheck(Token*, RefPtr<Node>&)
776 {
777 return true;
778 }
779
ddCreateErrorCheck(Token * t,RefPtr<Node> & result)780 bool HTMLParser::ddCreateErrorCheck(Token* t, RefPtr<Node>& result)
781 {
782 pCloserCreateErrorCheck(t, result);
783 popBlock(dtTag);
784 popBlock(ddTag);
785 return true;
786 }
787
dtCreateErrorCheck(Token * t,RefPtr<Node> & result)788 bool HTMLParser::dtCreateErrorCheck(Token* t, RefPtr<Node>& result)
789 {
790 pCloserCreateErrorCheck(t, result);
791 popBlock(ddTag);
792 popBlock(dtTag);
793 return true;
794 }
795
rpCreateErrorCheck(Token *,RefPtr<Node> &)796 bool HTMLParser::rpCreateErrorCheck(Token*, RefPtr<Node>&)
797 {
798 popBlock(rpTag);
799 popBlock(rtTag);
800 return true;
801 }
802
rtCreateErrorCheck(Token *,RefPtr<Node> &)803 bool HTMLParser::rtCreateErrorCheck(Token*, RefPtr<Node>&)
804 {
805 popBlock(rpTag);
806 popBlock(rtTag);
807 return true;
808 }
809
nestedCreateErrorCheck(Token * t,RefPtr<Node> &)810 bool HTMLParser::nestedCreateErrorCheck(Token* t, RefPtr<Node>&)
811 {
812 popBlock(t->tagName);
813 return true;
814 }
815
nestedPCloserCreateErrorCheck(Token * t,RefPtr<Node> & result)816 bool HTMLParser::nestedPCloserCreateErrorCheck(Token* t, RefPtr<Node>& result)
817 {
818 pCloserCreateErrorCheck(t, result);
819 popBlock(t->tagName);
820 return true;
821 }
822
nestedStyleCreateErrorCheck(Token * t,RefPtr<Node> &)823 bool HTMLParser::nestedStyleCreateErrorCheck(Token* t, RefPtr<Node>&)
824 {
825 return allowNestedRedundantTag(t->tagName);
826 }
827
tableCellCreateErrorCheck(Token *,RefPtr<Node> &)828 bool HTMLParser::tableCellCreateErrorCheck(Token*, RefPtr<Node>&)
829 {
830 popBlock(tdTag);
831 popBlock(thTag);
832 return true;
833 }
834
tableSectionCreateErrorCheck(Token *,RefPtr<Node> &)835 bool HTMLParser::tableSectionCreateErrorCheck(Token*, RefPtr<Node>&)
836 {
837 popBlock(theadTag);
838 popBlock(tbodyTag);
839 popBlock(tfootTag);
840 return true;
841 }
842
noembedCreateErrorCheck(Token *,RefPtr<Node> &)843 bool HTMLParser::noembedCreateErrorCheck(Token*, RefPtr<Node>&)
844 {
845 setSkipMode(noembedTag);
846 return true;
847 }
848
noframesCreateErrorCheck(Token *,RefPtr<Node> &)849 bool HTMLParser::noframesCreateErrorCheck(Token*, RefPtr<Node>&)
850 {
851 setSkipMode(noframesTag);
852 return true;
853 }
854
noscriptCreateErrorCheck(Token *,RefPtr<Node> &)855 bool HTMLParser::noscriptCreateErrorCheck(Token*, RefPtr<Node>&)
856 {
857 if (!m_isParsingFragment) {
858 Settings* settings = m_document->settings();
859 if (settings && settings->isJavaScriptEnabled())
860 setSkipMode(noscriptTag);
861 }
862 return true;
863 }
864
pCloserCreateErrorCheck(Token *,RefPtr<Node> &)865 bool HTMLParser::pCloserCreateErrorCheck(Token*, RefPtr<Node>&)
866 {
867 if (hasPElementInScope())
868 popBlock(pTag);
869 return true;
870 }
871
pCloserStrictCreateErrorCheck(Token *,RefPtr<Node> &)872 bool HTMLParser::pCloserStrictCreateErrorCheck(Token*, RefPtr<Node>&)
873 {
874 if (m_document->inCompatMode())
875 return true;
876 if (hasPElementInScope())
877 popBlock(pTag);
878 return true;
879 }
880
mapCreateErrorCheck(Token *,RefPtr<Node> & result)881 bool HTMLParser::mapCreateErrorCheck(Token*, RefPtr<Node>& result)
882 {
883 m_currentMapElement = new HTMLMapElement(mapTag, m_document);
884 result = m_currentMapElement;
885 return false;
886 }
887
getNode(Token * t)888 PassRefPtr<Node> HTMLParser::getNode(Token* t)
889 {
890 // Init our error handling table.
891 DEFINE_STATIC_LOCAL(FunctionMap, gFunctionMap, ());
892 if (gFunctionMap.isEmpty()) {
893 gFunctionMap.set(aTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
894 gFunctionMap.set(addressTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
895 gFunctionMap.set(bTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
896 gFunctionMap.set(bigTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
897 gFunctionMap.set(blockquoteTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
898 gFunctionMap.set(bodyTag.localName().impl(), &HTMLParser::bodyCreateErrorCheck);
899 gFunctionMap.set(buttonTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
900 gFunctionMap.set(centerTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
901 gFunctionMap.set(commentAtom.impl(), &HTMLParser::commentCreateErrorCheck);
902 gFunctionMap.set(ddTag.localName().impl(), &HTMLParser::ddCreateErrorCheck);
903 gFunctionMap.set(dirTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
904 gFunctionMap.set(divTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
905 gFunctionMap.set(dlTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
906 gFunctionMap.set(dtTag.localName().impl(), &HTMLParser::dtCreateErrorCheck);
907 gFunctionMap.set(formTag.localName().impl(), &HTMLParser::formCreateErrorCheck);
908 gFunctionMap.set(fieldsetTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
909 gFunctionMap.set(framesetTag.localName().impl(), &HTMLParser::framesetCreateErrorCheck);
910 gFunctionMap.set(h1Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
911 gFunctionMap.set(h2Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
912 gFunctionMap.set(h3Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
913 gFunctionMap.set(h4Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
914 gFunctionMap.set(h5Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
915 gFunctionMap.set(h6Tag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
916 gFunctionMap.set(headTag.localName().impl(), &HTMLParser::headCreateErrorCheck);
917 gFunctionMap.set(hrTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
918 gFunctionMap.set(iTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
919 gFunctionMap.set(isindexTag.localName().impl(), &HTMLParser::isindexCreateErrorCheck);
920 gFunctionMap.set(liTag.localName().impl(), &HTMLParser::nestedPCloserCreateErrorCheck);
921 gFunctionMap.set(listingTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
922 gFunctionMap.set(mapTag.localName().impl(), &HTMLParser::mapCreateErrorCheck);
923 gFunctionMap.set(menuTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
924 gFunctionMap.set(nobrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
925 gFunctionMap.set(noembedTag.localName().impl(), &HTMLParser::noembedCreateErrorCheck);
926 gFunctionMap.set(noframesTag.localName().impl(), &HTMLParser::noframesCreateErrorCheck);
927 #if !ENABLE(XHTMLMP)
928 gFunctionMap.set(noscriptTag.localName().impl(), &HTMLParser::noscriptCreateErrorCheck);
929 #endif
930 gFunctionMap.set(olTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
931 gFunctionMap.set(pTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
932 gFunctionMap.set(plaintextTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
933 gFunctionMap.set(preTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
934 gFunctionMap.set(rpTag.localName().impl(), &HTMLParser::rpCreateErrorCheck);
935 gFunctionMap.set(rtTag.localName().impl(), &HTMLParser::rtCreateErrorCheck);
936 gFunctionMap.set(sTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
937 gFunctionMap.set(selectTag.localName().impl(), &HTMLParser::selectCreateErrorCheck);
938 gFunctionMap.set(smallTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
939 gFunctionMap.set(strikeTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
940 gFunctionMap.set(tableTag.localName().impl(), &HTMLParser::pCloserStrictCreateErrorCheck);
941 gFunctionMap.set(tbodyTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
942 gFunctionMap.set(tdTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
943 gFunctionMap.set(textAtom.impl(), &HTMLParser::textCreateErrorCheck);
944 gFunctionMap.set(tfootTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
945 gFunctionMap.set(thTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
946 gFunctionMap.set(theadTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
947 gFunctionMap.set(trTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
948 gFunctionMap.set(ttTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
949 gFunctionMap.set(uTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
950 gFunctionMap.set(ulTag.localName().impl(), &HTMLParser::pCloserCreateErrorCheck);
951 }
952
953 bool proceed = true;
954 RefPtr<Node> result;
955 if (CreateErrorCheckFunc errorCheckFunc = gFunctionMap.get(t->tagName.impl()))
956 proceed = (this->*errorCheckFunc)(t, result);
957 if (proceed)
958 result = HTMLElementFactory::createHTMLElement(QualifiedName(nullAtom, t->tagName, xhtmlNamespaceURI), m_document, m_currentFormElement.get());
959 return result.release();
960 }
961
allowNestedRedundantTag(const AtomicString & tagName)962 bool HTMLParser::allowNestedRedundantTag(const AtomicString& tagName)
963 {
964 // www.liceo.edu.mx is an example of a site that achieves a level of nesting of
965 // about 1500 tags, all from a bunch of <b>s. We will only allow at most 20
966 // nested tags of the same type before just ignoring them all together.
967 unsigned i = 0;
968 for (HTMLStackElem* curr = m_blockStack;
969 i < cMaxRedundantTagDepth && curr && curr->tagName == tagName;
970 curr = curr->next, i++) { }
971 return i != cMaxRedundantTagDepth;
972 }
973
processCloseTag(Token * t)974 void HTMLParser::processCloseTag(Token* t)
975 {
976 // Support for really broken html.
977 // we never close the body tag, since some stupid web pages close it before the actual end of the doc.
978 // let's rely on the end() call to close things.
979 if (t->tagName == htmlTag || t->tagName == bodyTag || t->tagName == commentAtom)
980 return;
981
982 bool checkForCloseTagErrors = true;
983 if (t->tagName == formTag && m_currentFormElement) {
984 m_currentFormElement = 0;
985 checkForCloseTagErrors = false;
986 } else if (t->tagName == mapTag)
987 m_currentMapElement = 0;
988 else if (t->tagName == pTag)
989 checkForCloseTagErrors = false;
990
991 HTMLStackElem* oldElem = m_blockStack;
992 popBlock(t->tagName, checkForCloseTagErrors);
993 if (oldElem == m_blockStack && t->tagName == pTag) {
994 // We encountered a stray </p>. Amazingly Gecko, WinIE, and MacIE all treat
995 // this as a valid break, i.e., <p></p>. So go ahead and make the empty
996 // paragraph.
997 t->beginTag = true;
998 parseToken(t);
999 popBlock(t->tagName);
1000 reportError(StrayParagraphCloseError);
1001 }
1002 }
1003
isHeaderTag(const AtomicString & tagName)1004 bool HTMLParser::isHeaderTag(const AtomicString& tagName)
1005 {
1006 DEFINE_STATIC_LOCAL(HashSet<AtomicStringImpl*>, headerTags, ());
1007 if (headerTags.isEmpty()) {
1008 headerTags.add(h1Tag.localName().impl());
1009 headerTags.add(h2Tag.localName().impl());
1010 headerTags.add(h3Tag.localName().impl());
1011 headerTags.add(h4Tag.localName().impl());
1012 headerTags.add(h5Tag.localName().impl());
1013 headerTags.add(h6Tag.localName().impl());
1014 }
1015
1016 return headerTags.contains(tagName.impl());
1017 }
1018
isInline(Node * node) const1019 bool HTMLParser::isInline(Node* node) const
1020 {
1021 if (node->isTextNode())
1022 return true;
1023
1024 if (node->isHTMLElement()) {
1025 HTMLElement* e = static_cast<HTMLElement*>(node);
1026 if (e->hasLocalName(aTag) || e->hasLocalName(fontTag) || e->hasLocalName(ttTag) ||
1027 e->hasLocalName(uTag) || e->hasLocalName(bTag) || e->hasLocalName(iTag) ||
1028 e->hasLocalName(sTag) || e->hasLocalName(strikeTag) || e->hasLocalName(bigTag) ||
1029 e->hasLocalName(smallTag) || e->hasLocalName(emTag) || e->hasLocalName(strongTag) ||
1030 e->hasLocalName(dfnTag) || e->hasLocalName(codeTag) || e->hasLocalName(sampTag) ||
1031 e->hasLocalName(kbdTag) || e->hasLocalName(varTag) || e->hasLocalName(citeTag) ||
1032 e->hasLocalName(abbrTag) || e->hasLocalName(acronymTag) || e->hasLocalName(subTag) ||
1033 e->hasLocalName(supTag) || e->hasLocalName(spanTag) || e->hasLocalName(nobrTag) ||
1034 e->hasLocalName(noframesTag) || e->hasLocalName(nolayerTag) ||
1035 e->hasLocalName(noembedTag))
1036 return true;
1037 #if !ENABLE(XHTMLMP)
1038 if (e->hasLocalName(noscriptTag) && !m_isParsingFragment) {
1039 Settings* settings = m_document->settings();
1040 if (settings && settings->isJavaScriptEnabled())
1041 return true;
1042 }
1043 #endif
1044 }
1045
1046 return false;
1047 }
1048
isResidualStyleTag(const AtomicString & tagName)1049 bool HTMLParser::isResidualStyleTag(const AtomicString& tagName)
1050 {
1051 DEFINE_STATIC_LOCAL(HashSet<AtomicStringImpl*>, residualStyleTags, ());
1052 if (residualStyleTags.isEmpty()) {
1053 residualStyleTags.add(aTag.localName().impl());
1054 residualStyleTags.add(fontTag.localName().impl());
1055 residualStyleTags.add(ttTag.localName().impl());
1056 residualStyleTags.add(uTag.localName().impl());
1057 residualStyleTags.add(bTag.localName().impl());
1058 residualStyleTags.add(iTag.localName().impl());
1059 residualStyleTags.add(sTag.localName().impl());
1060 residualStyleTags.add(strikeTag.localName().impl());
1061 residualStyleTags.add(bigTag.localName().impl());
1062 residualStyleTags.add(smallTag.localName().impl());
1063 residualStyleTags.add(emTag.localName().impl());
1064 residualStyleTags.add(strongTag.localName().impl());
1065 residualStyleTags.add(dfnTag.localName().impl());
1066 residualStyleTags.add(codeTag.localName().impl());
1067 residualStyleTags.add(sampTag.localName().impl());
1068 residualStyleTags.add(kbdTag.localName().impl());
1069 residualStyleTags.add(varTag.localName().impl());
1070 residualStyleTags.add(nobrTag.localName().impl());
1071 }
1072
1073 return residualStyleTags.contains(tagName.impl());
1074 }
1075
isAffectedByResidualStyle(const AtomicString & tagName)1076 bool HTMLParser::isAffectedByResidualStyle(const AtomicString& tagName)
1077 {
1078 DEFINE_STATIC_LOCAL(HashSet<AtomicStringImpl*>, unaffectedTags, ());
1079 if (unaffectedTags.isEmpty()) {
1080 unaffectedTags.add(bodyTag.localName().impl());
1081 unaffectedTags.add(tableTag.localName().impl());
1082 unaffectedTags.add(theadTag.localName().impl());
1083 unaffectedTags.add(tbodyTag.localName().impl());
1084 unaffectedTags.add(tfootTag.localName().impl());
1085 unaffectedTags.add(trTag.localName().impl());
1086 unaffectedTags.add(thTag.localName().impl());
1087 unaffectedTags.add(tdTag.localName().impl());
1088 unaffectedTags.add(captionTag.localName().impl());
1089 unaffectedTags.add(colgroupTag.localName().impl());
1090 unaffectedTags.add(colTag.localName().impl());
1091 unaffectedTags.add(optionTag.localName().impl());
1092 unaffectedTags.add(optgroupTag.localName().impl());
1093 unaffectedTags.add(selectTag.localName().impl());
1094 unaffectedTags.add(objectTag.localName().impl());
1095 unaffectedTags.add(datagridTag.localName().impl());
1096 }
1097
1098 return !unaffectedTags.contains(tagName.impl());
1099 }
1100
handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem * elem)1101 void HTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem* elem)
1102 {
1103 HTMLStackElem* maxElem = 0;
1104 bool finished = false;
1105 bool strayTableContent = elem->strayTableContent;
1106
1107 m_handlingResidualStyleAcrossBlocks = true;
1108 while (!finished) {
1109 // Find the outermost element that crosses over to a higher level. If there exists another higher-level
1110 // element, we will do another pass, until we have corrected the innermost one.
1111 ExceptionCode ec = 0;
1112 HTMLStackElem* curr = m_blockStack;
1113 HTMLStackElem* prev = 0;
1114 HTMLStackElem* prevMaxElem = 0;
1115 maxElem = 0;
1116 finished = true;
1117 while (curr && curr != elem) {
1118 if (curr->level > elem->level) {
1119 if (!isAffectedByResidualStyle(curr->tagName))
1120 return;
1121 if (maxElem)
1122 // We will need another pass.
1123 finished = false;
1124 maxElem = curr;
1125 prevMaxElem = prev;
1126 }
1127
1128 prev = curr;
1129 curr = curr->next;
1130 }
1131
1132 if (!curr || !maxElem)
1133 return;
1134
1135 Node* residualElem = prev->node;
1136 Node* blockElem = prevMaxElem ? prevMaxElem->node : m_current;
1137 Node* parentElem = elem->node;
1138
1139 // Check to see if the reparenting that is going to occur is allowed according to the DOM.
1140 // FIXME: We should either always allow it or perform an additional fixup instead of
1141 // just bailing here.
1142 // Example: <p><font><center>blah</font></center></p> isn't doing a fixup right now.
1143 if (!parentElem->childAllowed(blockElem))
1144 return;
1145
1146 m_hasPElementInScope = Unknown;
1147
1148 if (maxElem->node->parentNode() != elem->node) {
1149 // Walk the stack and remove any elements that aren't residual style tags. These
1150 // are basically just being closed up. Example:
1151 // <font><span>Moo<p>Goo</font></p>.
1152 // In the above example, the <span> doesn't need to be reopened. It can just close.
1153 HTMLStackElem* currElem = maxElem->next;
1154 HTMLStackElem* prevElem = maxElem;
1155 while (currElem != elem) {
1156 HTMLStackElem* nextElem = currElem->next;
1157 if (!isResidualStyleTag(currElem->tagName)) {
1158 prevElem->next = nextElem;
1159 prevElem->derefNode();
1160 prevElem->node = currElem->node;
1161 prevElem->didRefNode = currElem->didRefNode;
1162 delete currElem;
1163 }
1164 else
1165 prevElem = currElem;
1166 currElem = nextElem;
1167 }
1168
1169 // We have to reopen residual tags in between maxElem and elem. An example of this case is:
1170 // <font><i>Moo<p>Foo</font>.
1171 // In this case, we need to transform the part before the <p> into:
1172 // <font><i>Moo</i></font><i>
1173 // so that the <i> will remain open. This involves the modification of elements
1174 // in the block stack.
1175 // This will also affect how we ultimately reparent the block, since we want it to end up
1176 // under the reopened residual tags (e.g., the <i> in the above example.)
1177 RefPtr<Node> prevNode = 0;
1178 currElem = maxElem;
1179 while (currElem->node != residualElem) {
1180 if (isResidualStyleTag(currElem->node->localName())) {
1181 // Create a clone of this element.
1182 // We call releaseRef to get a raw pointer since we plan to hand over ownership to currElem.
1183 Node* currNode = currElem->node->cloneNode(false).releaseRef();
1184 reportError(ResidualStyleError, &currNode->localName());
1185
1186 // Change the stack element's node to point to the clone.
1187 // The stack element adopts the reference we obtained above by calling release().
1188 currElem->derefNode();
1189 currElem->node = currNode;
1190 currElem->didRefNode = true;
1191
1192 // Attach the previous node as a child of this new node.
1193 if (prevNode)
1194 currNode->appendChild(prevNode, ec);
1195 else // The new parent for the block element is going to be the innermost clone.
1196 parentElem = currNode; // FIXME: We shifted parentElem to be a residual inline. We never checked to see if blockElem could be legally placed inside the inline though.
1197
1198 prevNode = currNode;
1199 }
1200
1201 currElem = currElem->next;
1202 }
1203
1204 // Now append the chain of new residual style elements if one exists.
1205 if (prevNode)
1206 elem->node->appendChild(prevNode, ec); // FIXME: This append can result in weird stuff happening, like an inline chain being put into a table section.
1207 }
1208
1209 // Check if the block is still in the tree. If it isn't, then we don't
1210 // want to remove it from its parent (that would crash) or insert it into
1211 // a new parent later. See http://bugs.webkit.org/show_bug.cgi?id=6778
1212 bool isBlockStillInTree = blockElem->parentNode();
1213
1214 // We need to make a clone of |residualElem| and place it just inside |blockElem|.
1215 // All content of |blockElem| is reparented to be under this clone. We then
1216 // reparent |blockElem| using real DOM calls so that attachment/detachment will
1217 // be performed to fix up the rendering tree.
1218 // So for this example: <b>...<p>Foo</b>Goo</p>
1219 // The end result will be: <b>...</b><p><b>Foo</b>Goo</p>
1220 //
1221 // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids.
1222 if (isBlockStillInTree)
1223 blockElem->parentNode()->removeChild(blockElem, ec);
1224
1225 Node* newNodePtr = 0;
1226 if (blockElem->firstChild()) {
1227 // Step 2: Clone |residualElem|.
1228 RefPtr<Node> newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids.
1229 newNodePtr = newNode.get();
1230 reportError(ResidualStyleError, &newNode->localName());
1231
1232 // Step 3: Place |blockElem|'s children under |newNode|. Remove all of the children of |blockElem|
1233 // before we've put |newElem| into the document. That way we'll only do one attachment of all
1234 // the new content (instead of a bunch of individual attachments).
1235 Node* currNode = blockElem->firstChild();
1236 while (currNode) {
1237 Node* nextNode = currNode->nextSibling();
1238 newNode->appendChild(currNode, ec);
1239 currNode = nextNode;
1240 }
1241
1242 // Step 4: Place |newNode| under |blockElem|. |blockElem| is still out of the document, so no
1243 // attachment can occur yet.
1244 blockElem->appendChild(newNode.release(), ec);
1245 } else
1246 finished = true;
1247
1248 // Step 5: Reparent |blockElem|. Now the full attachment of the fixed up tree takes place.
1249 if (isBlockStillInTree)
1250 parentElem->appendChild(blockElem, ec);
1251
1252 // Step 6: Pull |elem| out of the stack, since it is no longer enclosing us. Also update
1253 // the node associated with the previous stack element so that when it gets popped,
1254 // it doesn't make the residual element the next current node.
1255 HTMLStackElem* currElem = maxElem;
1256 HTMLStackElem* prevElem = 0;
1257 while (currElem != elem) {
1258 prevElem = currElem;
1259 currElem = currElem->next;
1260 }
1261 prevElem->next = elem->next;
1262 prevElem->derefNode();
1263 prevElem->node = elem->node;
1264 prevElem->didRefNode = elem->didRefNode;
1265 if (!finished) {
1266 // Repurpose |elem| to represent |newNode| and insert it at the appropriate position
1267 // in the stack. We do not do this for the innermost block, because in that case the new
1268 // node is effectively no longer open.
1269 elem->next = maxElem;
1270 elem->node = prevMaxElem->node;
1271 elem->didRefNode = prevMaxElem->didRefNode;
1272 elem->strayTableContent = false;
1273 prevMaxElem->next = elem;
1274 ASSERT(newNodePtr);
1275 prevMaxElem->node = newNodePtr;
1276 prevMaxElem->didRefNode = false;
1277 } else
1278 delete elem;
1279 }
1280
1281 // FIXME: If we ever make a case like this work:
1282 // <table><b><i><form></b></form></i></table>
1283 // Then this check will be too simplistic. Right now the <i><form> chain will end up inside the <tbody>, which is pretty crazy.
1284 if (strayTableContent)
1285 m_inStrayTableContent--;
1286
1287 // Step 7: Reopen intermediate inlines, e.g., <b><p><i>Foo</b>Goo</p>.
1288 // In the above example, Goo should stay italic.
1289 // We cap the number of tags we're willing to reopen based off cResidualStyleMaxDepth.
1290
1291 HTMLStackElem* curr = m_blockStack;
1292 HTMLStackElem* residualStyleStack = 0;
1293 unsigned stackDepth = 1;
1294 unsigned redundantStyleCount = 0;
1295 while (curr && curr != maxElem) {
1296 // We will actually schedule this tag for reopening
1297 // after we complete the close of this entire block.
1298 if (isResidualStyleTag(curr->tagName) && stackDepth++ < cResidualStyleMaxDepth) {
1299 // We've overloaded the use of stack elements and are just reusing the
1300 // struct with a slightly different meaning to the variables. Instead of chaining
1301 // from innermost to outermost, we build up a list of all the tags we need to reopen
1302 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1303 // to the outermost tag we need to reopen.
1304 // We also set curr->node to be the actual element that corresponds to the ID stored in
1305 // curr->id rather than the node that you should pop to when the element gets pulled off
1306 // the stack.
1307 if (residualStyleStack && curr->tagName == residualStyleStack->tagName && curr->node->attributes()->mapsEquivalent(residualStyleStack->node->attributes()))
1308 redundantStyleCount++;
1309 else
1310 redundantStyleCount = 0;
1311
1312 if (redundantStyleCount < cMaxRedundantTagDepth)
1313 moveOneBlockToStack(residualStyleStack);
1314 else
1315 popOneBlock();
1316 } else
1317 popOneBlock();
1318
1319 curr = m_blockStack;
1320 }
1321
1322 reopenResidualStyleTags(residualStyleStack, 0); // Stray table content can't be an issue here, since some element above will always become the root of new stray table content.
1323
1324 m_handlingResidualStyleAcrossBlocks = false;
1325 }
1326
reopenResidualStyleTags(HTMLStackElem * elem,Node * malformedTableParent)1327 void HTMLParser::reopenResidualStyleTags(HTMLStackElem* elem, Node* malformedTableParent)
1328 {
1329 // Loop for each tag that needs to be reopened.
1330 while (elem) {
1331 // Create a shallow clone of the DOM node for this element.
1332 RefPtr<Node> newNode = elem->node->cloneNode(false);
1333 reportError(ResidualStyleError, &newNode->localName());
1334
1335 // Append the new node. In the malformed table case, we need to insert before the table,
1336 // which will be the last child.
1337 ExceptionCode ec = 0;
1338 if (malformedTableParent)
1339 malformedTableParent->insertBefore(newNode, malformedTableParent->lastChild(), ec);
1340 else
1341 m_current->appendChild(newNode, ec);
1342 // FIXME: Is it really OK to ignore the exceptions here?
1343
1344 // Now push a new stack element for this node we just created.
1345 pushBlock(elem->tagName, elem->level);
1346 newNode->beginParsingChildren();
1347
1348 // Set our strayTableContent boolean if needed, so that the reopened tag also knows
1349 // that it is inside a malformed table.
1350 m_blockStack->strayTableContent = malformedTableParent != 0;
1351 if (m_blockStack->strayTableContent)
1352 m_inStrayTableContent++;
1353
1354 // Clear our malformed table parent variable.
1355 malformedTableParent = 0;
1356
1357 // Update |current| manually to point to the new node.
1358 setCurrent(newNode.get());
1359
1360 // Advance to the next tag that needs to be reopened.
1361 HTMLStackElem* next = elem->next;
1362 elem->derefNode();
1363 delete elem;
1364 elem = next;
1365 }
1366 }
1367
pushBlock(const AtomicString & tagName,int level)1368 void HTMLParser::pushBlock(const AtomicString& tagName, int level)
1369 {
1370 m_blockStack = new HTMLStackElem(tagName, level, m_current, m_didRefCurrent, m_blockStack);
1371 if (level >= minBlockLevelTagPriority)
1372 m_blocksInStack++;
1373 m_didRefCurrent = false;
1374 if (tagName == pTag)
1375 m_hasPElementInScope = InScope;
1376 else if (isScopingTag(tagName))
1377 m_hasPElementInScope = NotInScope;
1378 }
1379
popBlock(const AtomicString & tagName,bool reportErrors)1380 void HTMLParser::popBlock(const AtomicString& tagName, bool reportErrors)
1381 {
1382 HTMLStackElem* elem = m_blockStack;
1383
1384 if (m_parserQuirks && elem && !m_parserQuirks->shouldPopBlock(elem->tagName, tagName))
1385 return;
1386
1387 int maxLevel = 0;
1388
1389 while (elem && (elem->tagName != tagName)) {
1390 if (maxLevel < elem->level)
1391 maxLevel = elem->level;
1392 elem = elem->next;
1393 }
1394
1395 if (!elem) {
1396 if (reportErrors)
1397 reportError(StrayCloseTagError, &tagName, 0, true);
1398 return;
1399 }
1400
1401 if (maxLevel > elem->level) {
1402 // We didn't match because the tag is in a different scope, e.g.,
1403 // <b><p>Foo</b>. Try to correct the problem.
1404 if (!isResidualStyleTag(tagName))
1405 return;
1406 return handleResidualStyleCloseTagAcrossBlocks(elem);
1407 }
1408
1409 bool isAffectedByStyle = isAffectedByResidualStyle(elem->tagName);
1410 HTMLStackElem* residualStyleStack = 0;
1411 Node* malformedTableParent = 0;
1412
1413 elem = m_blockStack;
1414 unsigned stackDepth = 1;
1415 unsigned redundantStyleCount = 0;
1416 while (elem) {
1417 if (elem->tagName == tagName) {
1418 int strayTable = m_inStrayTableContent;
1419 popOneBlock();
1420 elem = 0;
1421
1422 // This element was the root of some malformed content just inside an implicit or
1423 // explicit <tbody> or <tr>.
1424 // If we end up needing to reopen residual style tags, the root of the reopened chain
1425 // must also know that it is the root of malformed content inside a <tbody>/<tr>.
1426 if (strayTable && (m_inStrayTableContent < strayTable) && residualStyleStack) {
1427 Node* curr = m_current;
1428 while (curr && !curr->hasTagName(tableTag))
1429 curr = curr->parentNode();
1430 malformedTableParent = curr ? curr->parentNode() : 0;
1431 }
1432 }
1433 else {
1434 if (m_currentFormElement && elem->tagName == formTag)
1435 // A <form> is being closed prematurely (and this is
1436 // malformed HTML). Set an attribute on the form to clear out its
1437 // bottom margin.
1438 m_currentFormElement->setMalformed(true);
1439
1440 // Schedule this tag for reopening
1441 // after we complete the close of this entire block.
1442 if (isAffectedByStyle && isResidualStyleTag(elem->tagName) && stackDepth++ < cResidualStyleMaxDepth) {
1443 // We've overloaded the use of stack elements and are just reusing the
1444 // struct with a slightly different meaning to the variables. Instead of chaining
1445 // from innermost to outermost, we build up a list of all the tags we need to reopen
1446 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1447 // to the outermost tag we need to reopen.
1448 // We also set elem->node to be the actual element that corresponds to the ID stored in
1449 // elem->id rather than the node that you should pop to when the element gets pulled off
1450 // the stack.
1451 if (residualStyleStack && elem->tagName == residualStyleStack->tagName && elem->node->attributes()->mapsEquivalent(residualStyleStack->node->attributes()))
1452 redundantStyleCount++;
1453 else
1454 redundantStyleCount = 0;
1455
1456 if (redundantStyleCount < cMaxRedundantTagDepth)
1457 moveOneBlockToStack(residualStyleStack);
1458 else
1459 popOneBlock();
1460 } else
1461 popOneBlock();
1462 elem = m_blockStack;
1463 }
1464 }
1465
1466 reopenResidualStyleTags(residualStyleStack, malformedTableParent);
1467 }
1468
popOneBlockCommon()1469 inline HTMLStackElem* HTMLParser::popOneBlockCommon()
1470 {
1471 HTMLStackElem* elem = m_blockStack;
1472
1473 // Form elements restore their state during the parsing process.
1474 // Also, a few elements (<applet>, <object>) need to know when all child elements (<param>s) are available.
1475 if (m_current && elem->node != m_current)
1476 m_current->finishParsingChildren();
1477
1478 if (m_blockStack->level >= minBlockLevelTagPriority) {
1479 ASSERT(m_blocksInStack > 0);
1480 m_blocksInStack--;
1481 }
1482 m_blockStack = elem->next;
1483 m_current = elem->node;
1484 m_didRefCurrent = elem->didRefNode;
1485
1486 if (elem->strayTableContent)
1487 m_inStrayTableContent--;
1488
1489 if (elem->tagName == pTag)
1490 m_hasPElementInScope = NotInScope;
1491 else if (isScopingTag(elem->tagName))
1492 m_hasPElementInScope = Unknown;
1493
1494 return elem;
1495 }
1496
popOneBlock()1497 void HTMLParser::popOneBlock()
1498 {
1499 // Store the current node before popOneBlockCommon overwrites it.
1500 Node* lastCurrent = m_current;
1501 bool didRefLastCurrent = m_didRefCurrent;
1502
1503 delete popOneBlockCommon();
1504
1505 if (didRefLastCurrent)
1506 lastCurrent->deref();
1507 }
1508
moveOneBlockToStack(HTMLStackElem * & head)1509 void HTMLParser::moveOneBlockToStack(HTMLStackElem*& head)
1510 {
1511 // We'll be using the stack element we're popping, but for the current node.
1512 // See the two callers for details.
1513
1514 // Store the current node before popOneBlockCommon overwrites it.
1515 Node* lastCurrent = m_current;
1516 bool didRefLastCurrent = m_didRefCurrent;
1517
1518 // Pop the block, but don't deref the current node as popOneBlock does because
1519 // we'll be using the pointer in the new stack element.
1520 HTMLStackElem* elem = popOneBlockCommon();
1521
1522 // Transfer the current node into the stack element.
1523 // No need to deref the old elem->node because popOneBlockCommon transferred
1524 // it into the m_current/m_didRefCurrent fields.
1525 elem->node = lastCurrent;
1526 elem->didRefNode = didRefLastCurrent;
1527 elem->next = head;
1528 head = elem;
1529 }
1530
checkIfHasPElementInScope()1531 void HTMLParser::checkIfHasPElementInScope()
1532 {
1533 m_hasPElementInScope = NotInScope;
1534 HTMLStackElem* elem = m_blockStack;
1535 while (elem) {
1536 const AtomicString& tagName = elem->tagName;
1537 if (tagName == pTag) {
1538 m_hasPElementInScope = InScope;
1539 return;
1540 } else if (isScopingTag(tagName))
1541 return;
1542 elem = elem->next;
1543 }
1544 }
1545
popInlineBlocks()1546 void HTMLParser::popInlineBlocks()
1547 {
1548 while (m_blockStack && isInline(m_current))
1549 popOneBlock();
1550 }
1551
freeBlock()1552 void HTMLParser::freeBlock()
1553 {
1554 while (m_blockStack)
1555 popOneBlock();
1556 ASSERT(!m_blocksInStack);
1557 }
1558
createHead()1559 void HTMLParser::createHead()
1560 {
1561 if (m_head)
1562 return;
1563
1564 if (!m_document->documentElement()) {
1565 insertNode(new HTMLHtmlElement(htmlTag, m_document));
1566 ASSERT(m_document->documentElement());
1567 }
1568
1569 m_head = new HTMLHeadElement(headTag, m_document);
1570 HTMLElement* body = m_document->body();
1571 ExceptionCode ec = 0;
1572 m_document->documentElement()->insertBefore(m_head.get(), body, ec);
1573 if (ec)
1574 m_head = 0;
1575
1576 // If the body does not exist yet, then the <head> should be pushed as the current block.
1577 if (m_head && !body) {
1578 pushBlock(m_head->localName(), m_head->tagPriority());
1579 setCurrent(m_head.get());
1580 }
1581 }
1582
handleIsindex(Token * t)1583 PassRefPtr<Node> HTMLParser::handleIsindex(Token* t)
1584 {
1585 RefPtr<Node> n = new HTMLDivElement(divTag, m_document);
1586
1587 NamedMappedAttrMap* attrs = t->attrs.get();
1588
1589 RefPtr<HTMLIsIndexElement> isIndex = new HTMLIsIndexElement(isindexTag, m_document, m_currentFormElement.get());
1590 isIndex->setAttributeMap(attrs);
1591 isIndex->setAttribute(typeAttr, "khtml_isindex");
1592
1593 String text = searchableIndexIntroduction();
1594 if (attrs) {
1595 if (Attribute* a = attrs->getAttributeItem(promptAttr))
1596 text = a->value().string() + " ";
1597 t->attrs = 0;
1598 }
1599
1600 n->addChild(new HTMLHRElement(hrTag, m_document));
1601 n->addChild(new Text(m_document, text));
1602 n->addChild(isIndex.release());
1603 n->addChild(new HTMLHRElement(hrTag, m_document));
1604
1605 return n.release();
1606 }
1607
startBody()1608 void HTMLParser::startBody()
1609 {
1610 if (m_inBody)
1611 return;
1612
1613 m_inBody = true;
1614
1615 if (m_isindexElement) {
1616 insertNode(m_isindexElement.get(), true /* don't descend into this node */);
1617 m_isindexElement = 0;
1618 }
1619 }
1620
finished()1621 void HTMLParser::finished()
1622 {
1623 // In the case of a completely empty document, here's the place to create the HTML element.
1624 if (m_current && m_current->isDocumentNode() && !m_document->documentElement())
1625 insertNode(new HTMLHtmlElement(htmlTag, m_document));
1626
1627 // This ensures that "current" is not left pointing to a node when the document is destroyed.
1628 freeBlock();
1629 setCurrent(0);
1630
1631 // Warning, this may delete the tokenizer and parser, so don't try to do anything else after this.
1632 if (!m_isParsingFragment)
1633 m_document->finishedParsing();
1634 }
1635
reportErrorToConsole(HTMLParserErrorCode errorCode,const AtomicString * tagName1,const AtomicString * tagName2,bool closeTags)1636 void HTMLParser::reportErrorToConsole(HTMLParserErrorCode errorCode, const AtomicString* tagName1, const AtomicString* tagName2, bool closeTags)
1637 {
1638 Frame* frame = m_document->frame();
1639 if (!frame)
1640 return;
1641
1642 HTMLTokenizer* htmlTokenizer = static_cast<HTMLTokenizer*>(m_document->tokenizer());
1643 int lineNumber = htmlTokenizer->lineNumber() + 1;
1644
1645 AtomicString tag1;
1646 AtomicString tag2;
1647 if (tagName1) {
1648 if (*tagName1 == "#text")
1649 tag1 = "Text";
1650 else if (*tagName1 == "#comment")
1651 tag1 = "<!-- comment -->";
1652 else
1653 tag1 = (closeTags ? "</" : "<") + *tagName1 + ">";
1654 }
1655 if (tagName2) {
1656 if (*tagName2 == "#text")
1657 tag2 = "Text";
1658 else if (*tagName2 == "#comment")
1659 tag2 = "<!-- comment -->";
1660 else
1661 tag2 = (closeTags ? "</" : "<") + *tagName2 + ">";
1662 }
1663
1664 const char* errorMsg = htmlParserErrorMessageTemplate(errorCode);
1665 if (!errorMsg)
1666 return;
1667
1668 String message;
1669 if (htmlTokenizer->processingContentWrittenByScript())
1670 message += htmlParserDocumentWriteMessage();
1671 message += errorMsg;
1672 message.replace("%tag1", tag1);
1673 message.replace("%tag2", tag2);
1674
1675 frame->domWindow()->console()->addMessage(HTMLMessageSource, LogMessageType,
1676 isWarning(errorCode) ? WarningMessageLevel : ErrorMessageLevel,
1677 message, lineNumber, m_document->url().string());
1678 }
1679
#if defined(BUILDING_ON_LEOPARD) || defined(BUILDING_ON_TIGER)
// Only built on Leopard and Tiger. Returns false when the page's settings ask
// for the Mail quirks of the respective OS version, and true otherwise
// (including when the document has no page, and therefore no settings).
bool shouldCreateImplicitHead(Document* document)
{
    ASSERT(document);

    Settings* settings = document->page() ? document->page()->settings() : 0;
    if (!settings)
        return true;

#ifdef BUILDING_ON_LEOPARD
    return !settings->needsLeopardMailQuirks();
#else
    return !settings->needsTigerMailQuirks();
#endif
}
#endif
1697
1698 }
1699