1 /*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "HTMLDocumentParser.h"
28
29 #include "ContentSecurityPolicy.h"
30 #include "DocumentFragment.h"
31 #include "Element.h"
32 #include "Frame.h"
33 #include "HTMLNames.h"
34 #include "HTMLParserScheduler.h"
35 #include "HTMLTokenizer.h"
36 #include "HTMLPreloadScanner.h"
37 #include "HTMLScriptRunner.h"
38 #include "HTMLTreeBuilder.h"
39 #include "HTMLDocument.h"
40 #include "InspectorInstrumentation.h"
41 #include "NestingLevelIncrementer.h"
42 #include "Settings.h"
43
44 namespace WebCore {
45
46 using namespace HTMLNames;
47
48 namespace {
49
50 // This is a direct transcription of step 4 from:
51 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case
tokenizerStateForContextElement(Element * contextElement,bool reportErrors)52 HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors)
53 {
54 if (!contextElement)
55 return HTMLTokenizer::DataState;
56
57 const QualifiedName& contextTag = contextElement->tagQName();
58
59 if (contextTag.matches(titleTag) || contextTag.matches(textareaTag))
60 return HTMLTokenizer::RCDATAState;
61 if (contextTag.matches(styleTag)
62 || contextTag.matches(xmpTag)
63 || contextTag.matches(iframeTag)
64 || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame()))
65 || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame()))
66 || contextTag.matches(noframesTag))
67 return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState;
68 if (contextTag.matches(scriptTag))
69 return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState;
70 if (contextTag.matches(plaintextTag))
71 return HTMLTokenizer::PLAINTEXTState;
72 return HTMLTokenizer::DataState;
73 }
74
75 } // namespace
76
HTMLDocumentParser(HTMLDocument * document,bool reportErrors)77 HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors)
78 : ScriptableDocumentParser(document)
79 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document)))
80 , m_scriptRunner(HTMLScriptRunner::create(document, this))
81 , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document)))
82 , m_parserScheduler(HTMLParserScheduler::create(this))
83 , m_xssFilter(this)
84 , m_endWasDelayed(false)
85 , m_pumpSessionNestingLevel(0)
86 {
87 }
88
89 // FIXME: Member variables should be grouped into self-initializing structs to
90 // minimize code duplication between these constructors.
HTMLDocumentParser(DocumentFragment * fragment,Element * contextElement,FragmentScriptingPermission scriptingPermission)91 HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
92 : ScriptableDocumentParser(fragment->document())
93 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document())))
94 , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document())))
95 , m_xssFilter(this)
96 , m_endWasDelayed(false)
97 , m_pumpSessionNestingLevel(0)
98 {
99 bool reportErrors = false; // For now document fragment parsing never reports errors.
100 m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors));
101 }
102
~HTMLDocumentParser()103 HTMLDocumentParser::~HTMLDocumentParser()
104 {
105 ASSERT(!m_parserScheduler);
106 ASSERT(!m_pumpSessionNestingLevel);
107 ASSERT(!m_preloadScanner);
108 }
109
detach()110 void HTMLDocumentParser::detach()
111 {
112 DocumentParser::detach();
113 if (m_scriptRunner)
114 m_scriptRunner->detach();
115 m_treeBuilder->detach();
116 // FIXME: It seems wrong that we would have a preload scanner here.
117 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
118 m_preloadScanner.clear();
119 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
120 }
121
stopParsing()122 void HTMLDocumentParser::stopParsing()
123 {
124 DocumentParser::stopParsing();
125 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
126 }
127
128 // This kicks off "Once the user agent stops parsing" as described by:
129 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end
prepareToStopParsing()130 void HTMLDocumentParser::prepareToStopParsing()
131 {
132 ASSERT(!hasInsertionPoint());
133
134 // pumpTokenizer can cause this parser to be detached from the Document,
135 // but we need to ensure it isn't deleted yet.
136 RefPtr<HTMLDocumentParser> protect(this);
137
138 // NOTE: This pump should only ever emit buffered character tokens,
139 // so ForceSynchronous vs. AllowYield should be meaningless.
140 pumpTokenizerIfPossible(ForceSynchronous);
141
142 if (isStopped())
143 return;
144
145 DocumentParser::prepareToStopParsing();
146
147 // We will not have a scriptRunner when parsing a DocumentFragment.
148 if (m_scriptRunner)
149 document()->setReadyState(Document::Interactive);
150
151 attemptToRunDeferredScriptsAndEnd();
152 }
153
isParsingFragment() const154 bool HTMLDocumentParser::isParsingFragment() const
155 {
156 return m_treeBuilder->isParsingFragment();
157 }
158
processingData() const159 bool HTMLDocumentParser::processingData() const
160 {
161 return isScheduledForResume() || inPumpSession();
162 }
163
pumpTokenizerIfPossible(SynchronousMode mode)164 void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
165 {
166 if (isStopped() || m_treeBuilder->isPaused())
167 return;
168
169 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
170 if (isScheduledForResume()) {
171 ASSERT(mode == AllowYield);
172 return;
173 }
174
175 pumpTokenizer(mode);
176 }
177
isScheduledForResume() const178 bool HTMLDocumentParser::isScheduledForResume() const
179 {
180 return m_parserScheduler && m_parserScheduler->isScheduledForResume();
181 }
182
183 // Used by HTMLParserScheduler
resumeParsingAfterYield()184 void HTMLDocumentParser::resumeParsingAfterYield()
185 {
186 // pumpTokenizer can cause this parser to be detached from the Document,
187 // but we need to ensure it isn't deleted yet.
188 RefPtr<HTMLDocumentParser> protect(this);
189
190 // We should never be here unless we can pump immediately. Call pumpTokenizer()
191 // directly so that ASSERTS will fire if we're wrong.
192 pumpTokenizer(AllowYield);
193 endIfDelayed();
194 }
195
runScriptsForPausedTreeBuilder()196 bool HTMLDocumentParser::runScriptsForPausedTreeBuilder()
197 {
198 ASSERT(m_treeBuilder->isPaused());
199
200 TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition();
201 RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition);
202 // We will not have a scriptRunner when parsing a DocumentFragment.
203 if (!m_scriptRunner)
204 return true;
205 return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition);
206 }
207
canTakeNextToken(SynchronousMode mode,PumpSession & session)208 bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session)
209 {
210 if (isStopped())
211 return false;
212
213 // The parser will pause itself when waiting on a script to load or run.
214 if (m_treeBuilder->isPaused()) {
215 if (mode == AllowYield)
216 m_parserScheduler->checkForYieldBeforeScript(session);
217
218 // If we don't run the script, we cannot allow the next token to be taken.
219 if (session.needsYield)
220 return false;
221
222 // If we're paused waiting for a script, we try to execute scripts before continuing.
223 bool shouldContinueParsing = runScriptsForPausedTreeBuilder();
224 m_treeBuilder->setPaused(!shouldContinueParsing);
225 if (!shouldContinueParsing || isStopped())
226 return false;
227 }
228
229 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the
230 // Frame, but this approach is how the old parser handled
231 // stopping when the page assigns window.location. What really
232 // should happen is that assigning window.location causes the
233 // parser to stop parsing cleanly. The problem is we're not
234 // perpared to do that at every point where we run JavaScript.
235 if (!isParsingFragment()
236 && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending())
237 return false;
238
239 if (mode == AllowYield)
240 m_parserScheduler->checkForYieldBeforeToken(session);
241
242 return true;
243 }
244
pumpTokenizer(SynchronousMode mode)245 void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
246 {
247 ASSERT(!isStopped());
248 ASSERT(!isScheduledForResume());
249 // ASSERT that this object is both attached to the Document and protected.
250 ASSERT(refCount() >= 2);
251
252 PumpSession session(m_pumpSessionNestingLevel);
253
254 // We tell the InspectorInstrumentation about every pump, even if we
255 // end up pumping nothing. It can filter out empty pumps itself.
256 // FIXME: m_input.current().length() is only accurate if we
257 // end up parsing the whole buffer in this pump. We should pass how
258 // much we parsed as part of didWriteHTML instead of willWriteHTML.
259 InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber());
260
261 while (canTakeNextToken(mode, session) && !session.needsYield) {
262 if (!isParsingFragment())
263 m_sourceTracker.start(m_input, m_token);
264
265 if (!m_tokenizer->nextToken(m_input.current(), m_token))
266 break;
267
268 if (!isParsingFragment()) {
269 m_sourceTracker.end(m_input, m_token);
270
271 // We do not XSS filter innerHTML, which means we (intentionally) fail
272 // http/tests/security/xssAuditor/dom-write-innerHTML.html
273 m_xssFilter.filterToken(m_token);
274 }
275
276 m_treeBuilder->constructTreeFromToken(m_token);
277 ASSERT(m_token.isUninitialized());
278 }
279
280 // Ensure we haven't been totally deref'ed after pumping. Any caller of this
281 // function should be holding a RefPtr to this to ensure we weren't deleted.
282 ASSERT(refCount() >= 1);
283
284 if (isStopped())
285 return;
286
287 if (session.needsYield)
288 m_parserScheduler->scheduleForResume();
289
290 if (isWaitingForScripts()) {
291 ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
292 if (!m_preloadScanner) {
293 m_preloadScanner.set(new HTMLPreloadScanner(document()));
294 m_preloadScanner->appendToEnd(m_input.current());
295 }
296 m_preloadScanner->scan();
297 }
298
299 InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber());
300 }
301
hasInsertionPoint()302 bool HTMLDocumentParser::hasInsertionPoint()
303 {
304 // FIXME: The wasCreatedByScript() branch here might not be fully correct.
305 // Our model of the EOF character differs slightly from the one in
306 // the spec because our treatment is uniform between network-sourced
307 // and script-sourced input streams whereas the spec treats them
308 // differently.
309 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
310 }
311
insert(const SegmentedString & source)312 void HTMLDocumentParser::insert(const SegmentedString& source)
313 {
314 if (isStopped())
315 return;
316
317 // pumpTokenizer can cause this parser to be detached from the Document,
318 // but we need to ensure it isn't deleted yet.
319 RefPtr<HTMLDocumentParser> protect(this);
320
321 SegmentedString excludedLineNumberSource(source);
322 excludedLineNumberSource.setExcludeLineNumbers();
323 m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource);
324 pumpTokenizerIfPossible(ForceSynchronous);
325
326 if (isWaitingForScripts()) {
327 // Check the document.write() output with a separate preload scanner as
328 // the main scanner can't deal with insertions.
329 HTMLPreloadScanner preloadScanner(document());
330 preloadScanner.appendToEnd(source);
331 preloadScanner.scan();
332 }
333
334 endIfDelayed();
335 }
336
append(const SegmentedString & source)337 void HTMLDocumentParser::append(const SegmentedString& source)
338 {
339 if (isStopped())
340 return;
341
342 // pumpTokenizer can cause this parser to be detached from the Document,
343 // but we need to ensure it isn't deleted yet.
344 RefPtr<HTMLDocumentParser> protect(this);
345
346 if (m_preloadScanner) {
347 if (m_input.current().isEmpty() && !isWaitingForScripts()) {
348 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner.
349 // Clear the scanner so we know to scan starting from the current input point if we block again.
350 m_preloadScanner.clear();
351 } else {
352 m_preloadScanner->appendToEnd(source);
353 if (isWaitingForScripts())
354 m_preloadScanner->scan();
355 }
356 }
357
358 m_input.appendToEnd(source);
359
360 if (inPumpSession()) {
361 // We've gotten data off the network in a nested write.
362 // We don't want to consume any more of the input stream now. Do
363 // not worry. We'll consume this data in a less-nested write().
364 return;
365 }
366
367 pumpTokenizerIfPossible(AllowYield);
368
369 endIfDelayed();
370 }
371
end()372 void HTMLDocumentParser::end()
373 {
374 ASSERT(!isDetached());
375 ASSERT(!isScheduledForResume());
376
377 // Informs the the rest of WebCore that parsing is really finished (and deletes this).
378 m_treeBuilder->finished();
379 }
380
attemptToRunDeferredScriptsAndEnd()381 void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
382 {
383 ASSERT(isStopping());
384 ASSERT(!hasInsertionPoint());
385 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
386 return;
387 end();
388 }
389
attemptToEnd()390 void HTMLDocumentParser::attemptToEnd()
391 {
392 // finish() indicates we will not receive any more data. If we are waiting on
393 // an external script to load, we can't finish parsing quite yet.
394
395 if (shouldDelayEnd()) {
396 m_endWasDelayed = true;
397 return;
398 }
399 prepareToStopParsing();
400 }
401
endIfDelayed()402 void HTMLDocumentParser::endIfDelayed()
403 {
404 // If we've already been detached, don't bother ending.
405 if (isDetached())
406 return;
407
408 if (!m_endWasDelayed || shouldDelayEnd())
409 return;
410
411 m_endWasDelayed = false;
412 prepareToStopParsing();
413 }
414
finish()415 void HTMLDocumentParser::finish()
416 {
417 // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
418 // makes sense to call any methods on DocumentParser once it's been stopped.
419 // However, FrameLoader::stop calls Document::finishParsing unconditionally
420 // which in turn calls m_parser->finish().
421
422 // We're not going to get any more data off the network, so we tell the
423 // input stream we've reached the end of file. finish() can be called more
424 // than once, if the first time does not call end().
425 if (!m_input.haveSeenEndOfFile())
426 m_input.markEndOfFile();
427 attemptToEnd();
428 }
429
finishWasCalled()430 bool HTMLDocumentParser::finishWasCalled()
431 {
432 return m_input.haveSeenEndOfFile();
433 }
434
435 // This function is virtual and just for the DocumentParser interface.
isExecutingScript() const436 bool HTMLDocumentParser::isExecutingScript() const
437 {
438 return inScriptExecution();
439 }
440
441 // This function is non-virtual and used throughout the implementation.
inScriptExecution() const442 bool HTMLDocumentParser::inScriptExecution() const
443 {
444 if (!m_scriptRunner)
445 return false;
446 return m_scriptRunner->isExecutingScript();
447 }
448
sourceForToken(const HTMLToken & token)449 String HTMLDocumentParser::sourceForToken(const HTMLToken& token)
450 {
451 return m_sourceTracker.sourceForToken(token);
452 }
453
lineNumber() const454 int HTMLDocumentParser::lineNumber() const
455 {
456 return m_tokenizer->lineNumber();
457 }
458
textPosition() const459 TextPosition0 HTMLDocumentParser::textPosition() const
460 {
461 const SegmentedString& currentString = m_input.current();
462 WTF::ZeroBasedNumber line = currentString.currentLine();
463 WTF::ZeroBasedNumber column = currentString.currentColumn();
464 ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt());
465
466 return TextPosition0(line, column);
467 }
468
isWaitingForScripts() const469 bool HTMLDocumentParser::isWaitingForScripts() const
470 {
471 return m_treeBuilder->isPaused();
472 }
473
resumeParsingAfterScriptExecution()474 void HTMLDocumentParser::resumeParsingAfterScriptExecution()
475 {
476 ASSERT(!inScriptExecution());
477 ASSERT(!m_treeBuilder->isPaused());
478
479 pumpTokenizerIfPossible(AllowYield);
480 endIfDelayed();
481 }
482
watchForLoad(CachedResource * cachedScript)483 void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript)
484 {
485 ASSERT(!cachedScript->isLoaded());
486 // addClient would call notifyFinished if the load were complete.
487 // Callers do not expect to be re-entered from this call, so they should
488 // not an already-loaded CachedResource.
489 cachedScript->addClient(this);
490 }
491
stopWatchingForLoad(CachedResource * cachedScript)492 void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript)
493 {
494 cachedScript->removeClient(this);
495 }
496
appendCurrentInputStreamToPreloadScannerAndScan()497 void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan()
498 {
499 ASSERT(m_preloadScanner);
500 m_preloadScanner->appendToEnd(m_input.current());
501 m_preloadScanner->scan();
502 }
503
notifyFinished(CachedResource * cachedResource)504 void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource)
505 {
506 // pumpTokenizer can cause this parser to be detached from the Document,
507 // but we need to ensure it isn't deleted yet.
508 RefPtr<HTMLDocumentParser> protect(this);
509
510 ASSERT(m_scriptRunner);
511 ASSERT(!inScriptExecution());
512 if (isStopping()) {
513 attemptToRunDeferredScriptsAndEnd();
514 return;
515 }
516
517 ASSERT(m_treeBuilder->isPaused());
518 // Note: We only ever wait on one script at a time, so we always know this
519 // is the one we were waiting on and can un-pause the tree builder.
520 m_treeBuilder->setPaused(false);
521 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource);
522 m_treeBuilder->setPaused(!shouldContinueParsing);
523 if (shouldContinueParsing)
524 resumeParsingAfterScriptExecution();
525 }
526
executeScriptsWaitingForStylesheets()527 void HTMLDocumentParser::executeScriptsWaitingForStylesheets()
528 {
529 // Document only calls this when the Document owns the DocumentParser
530 // so this will not be called in the DocumentFragment case.
531 ASSERT(m_scriptRunner);
532 // Ignore calls unless we have a script blocking the parser waiting on a
533 // stylesheet load. Otherwise we are currently parsing and this
534 // is a re-entrant call from encountering a </ style> tag.
535 if (!m_scriptRunner->hasScriptsWaitingForStylesheets())
536 return;
537
538 // pumpTokenizer can cause this parser to be detached from the Document,
539 // but we need to ensure it isn't deleted yet.
540 RefPtr<HTMLDocumentParser> protect(this);
541
542 ASSERT(!m_scriptRunner->isExecutingScript());
543 ASSERT(m_treeBuilder->isPaused());
544 // Note: We only ever wait on one script at a time, so we always know this
545 // is the one we were waiting on and can un-pause the tree builder.
546 m_treeBuilder->setPaused(false);
547 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets();
548 m_treeBuilder->setPaused(!shouldContinueParsing);
549 if (shouldContinueParsing)
550 resumeParsingAfterScriptExecution();
551 }
552
script() const553 ScriptController* HTMLDocumentParser::script() const
554 {
555 return document()->frame() ? document()->frame()->script() : 0;
556 }
557
parseDocumentFragment(const String & source,DocumentFragment * fragment,Element * contextElement,FragmentScriptingPermission scriptingPermission)558 void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission)
559 {
560 RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission);
561 parser->insert(source); // Use insert() so that the parser will not yield.
562 parser->finish();
563 ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151>
564 parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction.
565 }
566
usePreHTML5ParserQuirks(Document * document)567 bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document)
568 {
569 ASSERT(document);
570 return document->settings() && document->settings()->usePreHTML5ParserQuirks();
571 }
572
suspendScheduledTasks()573 void HTMLDocumentParser::suspendScheduledTasks()
574 {
575 if (m_parserScheduler)
576 m_parserScheduler->suspend();
577 }
578
resumeScheduledTasks()579 void HTMLDocumentParser::resumeScheduledTasks()
580 {
581 if (m_parserScheduler)
582 m_parserScheduler->resume();
583 }
584
585 }
586