1 /*
2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 1999 Lars Knoll (knoll@kde.org)
6 (C) 1999 Antti Koivisto (koivisto@kde.org)
7 (C) 2001 Dirk Mueller (mueller@kde.org)
8 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
9 Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10 Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
11
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Library General Public
14 License as published by the Free Software Foundation; either
15 version 2 of the License, or (at your option) any later version.
16
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Library General Public License for more details.
21
22 You should have received a copy of the GNU Library General Public License
23 along with this library; see the file COPYING.LIB. If not, write to
24 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25 Boston, MA 02110-1301, USA.
26 */
27
28 #include "config.h"
29 #include "HTMLTokenizer.h"
30
31 #include "CSSHelper.h"
32 #include "Cache.h"
33 #include "CachedScript.h"
34 #include "DocLoader.h"
35 #include "DocumentFragment.h"
36 #include "Event.h"
37 #include "EventNames.h"
38 #include "Frame.h"
39 #include "FrameLoader.h"
40 #include "FrameView.h"
41 #include "HTMLElement.h"
42 #include "HTMLNames.h"
43 #include "HTMLParser.h"
44 #include "HTMLScriptElement.h"
45 #include "HTMLViewSourceDocument.h"
46 #include "ImageLoader.h"
47 #include "InspectorTimelineAgent.h"
48 #include "MappedAttribute.h"
49 #include "Page.h"
50 #include "PreloadScanner.h"
51 #include "ScriptController.h"
52 #include "ScriptSourceCode.h"
53 #include "ScriptValue.h"
54 #include "XSSAuditor.h"
55 #include <wtf/ASCIICType.h>
56 #include <wtf/CurrentTime.h>
57
58 #include "HTMLEntityNames.c"
59
60 #ifdef ANDROID_INSTRUMENT
61 #include "TimeCounter.h"
62 #endif
63
64 #define PRELOAD_SCANNER_ENABLED 1
65 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
66
67 using namespace WTF;
68 using namespace std;
69
70 namespace WebCore {
71
72 using namespace HTMLNames;
73
// Default yield parameters for the tokenizer. Either may be overridden per Page
// through the custom HTML tokenizer settings.
#if MOBILE
// The mobile device needs to be responsive, so the tokenizer chunk size is reduced.
// This value defines how many characters the tokenizer will process before
// yielding control.
static const int defaultTokenizerChunkSize = 256;
// As the chunks are smaller (above), the tokenizer should not yield for as long a period,
// otherwise it will take way too long to load a page.
static const double defaultTokenizerTimeDelay = 0.300;
#else
static const int defaultTokenizerChunkSize = 4096;
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
static const double defaultTokenizerTimeDelay = 0.500;
#endif
93
// Literal markup prefixes the tokenizer scans for. The end-tag strings are the
// stoppers for elements whose content is collected as raw text.
static const char commentStart[] = "<!--";
static const char doctypeStart[] = "<!doctype";
static const char publicStart[] = "public";
static const char systemStart[] = "system";
static const char scriptEnd[] = "</script";
static const char xmpEnd[] = "</xmp";
static const char styleEnd[] = "</style";
static const char textareaEnd[] = "</textarea";
static const char titleEnd[] = "</title";
static const char iframeEnd[] = "</iframe";
104
105 // Full support for MS Windows extensions to Latin-1.
106 // Technically these extensions should only be activated for pages
107 // marked "windows-1252" or "cp1252", but
108 // in the standard Microsoft way, these extensions infect hundreds of thousands
109 // of web pages. Note that people with non-latin-1 Microsoft extensions
110 // are SOL.
111 //
112 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
113 // http://www.bbsinc.com/iso8859.html
114 // http://www.obviously.com/
115 //
116 // There may be better equivalents
117
118 // We only need this for entities. For non-entity text, we handle this in the text encoding.
119
120 static const UChar windowsLatin1ExtensionArray[32] = {
121 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
122 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
123 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
124 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
125 };
126
fixUpChar(UChar c)127 static inline UChar fixUpChar(UChar c)
128 {
129 if ((c & ~0x1F) != 0x0080)
130 return c;
131 return windowsLatin1ExtensionArray[c - 0x80];
132 }
133
tagMatch(const char * s1,const UChar * s2,unsigned length)134 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
135 {
136 for (unsigned i = 0; i != length; ++i) {
137 unsigned char c1 = s1[i];
138 unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
139 UChar c2 = s2[i];
140 if (c1 != c2 && uc1 != c2)
141 return false;
142 }
143 return true;
144 }
145
addAttribute(AtomicString & attrName,const AtomicString & attributeValue,bool viewSourceMode)146 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)
147 {
148 if (!attrName.isEmpty()) {
149 ASSERT(!attrName.contains('/'));
150 RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);
151 if (!attrs) {
152 attrs = NamedMappedAttrMap::create();
153 attrs->reserveInitialCapacity(10);
154 }
155 attrs->insertAttribute(a.release(), viewSourceMode);
156 }
157
158 attrName = emptyAtom;
159 }
160
161 // ----------------------------------------------------------------------------
162
// Constructs a tokenizer for an HTML document. An HTMLParser is created for |doc| with
// the given |reportErrors| flag, the tokenizer is not in fragment mode, and begin() is
// called to put the tokenizer into its initial state.
HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
    : Tokenizer()
    , m_buffer(0)
    , m_scriptCode(0)
    , m_scriptCodeSize(0)
    , m_scriptCodeCapacity(0)
    , m_scriptCodeResync(0)
    , m_executingScript(0)
    , m_requestingScript(false)
    , m_hasScriptsWaitingForStylesheets(false)
    , m_timer(this, &HTMLTokenizer::timerFired)
    , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    , m_doc(doc)
    , m_parser(new HTMLParser(doc, reportErrors))
    , m_inWrite(false)
    , m_fragment(false)
    , m_scriptingPermission(FragmentScriptingAllowed)
{
    // begin() calls reset(), which frees m_buffer and m_scriptCode; both were initialized to 0 above.
    begin();
}
183
// Constructs a tokenizer for a view-source document. The Tokenizer base is constructed
// with |true| (presumably the view-source-mode flag; confirm against Tokenizer.h) and no
// HTMLParser is created (m_parser is 0).
HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
    : Tokenizer(true)
    , m_buffer(0)
    , m_scriptCode(0)
    , m_scriptCodeSize(0)
    , m_scriptCodeCapacity(0)
    , m_scriptCodeResync(0)
    , m_executingScript(0)
    , m_requestingScript(false)
    , m_hasScriptsWaitingForStylesheets(false)
    , m_timer(this, &HTMLTokenizer::timerFired)
    , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    , m_doc(doc)
    , m_parser(0)
    , m_inWrite(false)
    , m_fragment(false)
    , m_scriptingPermission(FragmentScriptingAllowed)
{
    // begin() calls reset(), which frees m_buffer and m_scriptCode; both were initialized to 0 above.
    begin();
}
204
// Constructs a tokenizer that parses into a DocumentFragment. The owning document is
// taken from the fragment, m_fragment is set, and |scriptingPermission| is both stored
// and forwarded to the HTMLParser created for the fragment. The Tokenizer base is
// default-constructed.
HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag, FragmentScriptingPermission scriptingPermission)
    : m_buffer(0)
    , m_scriptCode(0)
    , m_scriptCodeSize(0)
    , m_scriptCodeCapacity(0)
    , m_scriptCodeResync(0)
    , m_executingScript(0)
    , m_requestingScript(false)
    , m_hasScriptsWaitingForStylesheets(false)
    , m_timer(this, &HTMLTokenizer::timerFired)
    , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    , m_doc(frag->document())
    , m_parser(new HTMLParser(frag, scriptingPermission))
    , m_inWrite(false)
    , m_fragment(true)
    , m_scriptingPermission(scriptingPermission)
{
    // begin() calls reset(), which frees m_buffer and m_scriptCode; both were initialized to 0 above.
    begin();
}
224
reset()225 void HTMLTokenizer::reset()
226 {
227 ASSERT(m_executingScript == 0);
228
229 while (!m_pendingScripts.isEmpty()) {
230 CachedScript* cs = m_pendingScripts.first().get();
231 m_pendingScripts.removeFirst();
232 ASSERT(cache()->disabled() || cs->accessCount() > 0);
233 cs->removeClient(this);
234 }
235
236 fastFree(m_buffer);
237 m_buffer = m_dest = 0;
238 m_bufferSize = 0;
239
240 fastFree(m_scriptCode);
241 m_scriptCode = 0;
242 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
243
244 m_timer.stop();
245 m_externalScriptsTimer.stop();
246
247 m_state.setAllowYield(false);
248 m_state.setForceSynchronous(false);
249
250 m_currentToken.reset();
251 m_doctypeToken.reset();
252 m_doctypeSearchCount = 0;
253 m_doctypeSecondarySearchCount = 0;
254 m_hasScriptsWaitingForStylesheets = false;
255 }
256
begin()257 void HTMLTokenizer::begin()
258 {
259 m_executingScript = 0;
260 m_requestingScript = false;
261 m_hasScriptsWaitingForStylesheets = false;
262 m_state.setLoadingExtScript(false);
263 reset();
264 m_bufferSize = 254;
265 m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
266 m_dest = m_buffer;
267 tquote = NoQuote;
268 searchCount = 0;
269 m_state.setEntityState(NoEntity);
270 m_scriptTagSrcAttrValue = String();
271 m_pendingSrc.clear();
272 m_currentPrependingSrc = 0;
273 m_noMoreData = false;
274 m_brokenComments = false;
275 m_brokenServer = false;
276 m_lineNumber = 0;
277 m_currentScriptTagStartLineNumber = 0;
278 m_currentTagStartLineNumber = 0;
279 m_state.setForceSynchronous(false);
280
281 Page* page = m_doc->page();
282 if (page && page->hasCustomHTMLTokenizerTimeDelay())
283 m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();
284 else
285 m_tokenizerTimeDelay = defaultTokenizerTimeDelay;
286
287 if (page && page->hasCustomHTMLTokenizerChunkSize())
288 m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();
289 else
290 m_tokenizerChunkSize = defaultTokenizerChunkSize;
291 }
292
// Sets the force-synchronous flag on the tokenizer state. Note that reset() and begin()
// both clear this flag again.
void HTMLTokenizer::setForceSynchronous(bool force)
{
    m_state.setForceSynchronous(force);
}
297
processListing(SegmentedString list,State state)298 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
299 {
300 // This function adds the listing 'list' as
301 // preformatted text-tokens to the token-collection
302 while (!list.isEmpty()) {
303 if (state.skipLF()) {
304 state.setSkipLF(false);
305 if (*list == '\n') {
306 list.advance();
307 continue;
308 }
309 }
310
311 checkBuffer();
312
313 if (*list == '\n' || *list == '\r') {
314 if (state.discardLF())
315 // Ignore this LF
316 state.setDiscardLF(false); // We have discarded 1 LF
317 else
318 *m_dest++ = '\n';
319
320 /* Check for MS-DOS CRLF sequence */
321 if (*list == '\r')
322 state.setSkipLF(true);
323
324 list.advance();
325 } else {
326 state.setDiscardLF(false);
327 *m_dest++ = *list;
328 list.advance();
329 }
330 }
331
332 return state;
333 }
334
// Collects the content of an element whose body is not tokenized as HTML (exactly one of
// xmp, textarea, title, style, script or iframe must be active in |state|) into
// m_scriptCode until the matching end tag, given by m_searchStopper, has been seen.
// When the end tag's closing '>' is reached, script content is handed to scriptHandler();
// for the other elements the collected text and the end-tag token are emitted via
// processToken(). Returns the updated state; if |src| runs out first, the partial text
// stays in m_scriptCode and the returned state still has the element flag set.
HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state)
{
    ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
    ASSERT(!state.hasTagState());
    ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1);
    // Record the line on which the script began, unless it is already recorded.
    if (state.inScript() && !m_currentScriptTagStartLineNumber)
        m_currentScriptTagStartLineNumber = m_lineNumber;

    // Continue a comment that was left unfinished by an earlier call.
    if (state.inComment())
        state = parseComment(src, state);

    // Value of m_scriptCodeSize at the last point where parseEntity() left the buffer
    // untouched (see below); -1 when there has been none during this call.
    int lastDecodedEntityPosition = -1;
    while (!src.isEmpty()) {
        checkScriptBuffer();
        UChar ch = *src;

        // A '-' arriving right after "<!-" in the collected text completes "<!--": switch to
        // comment parsing. Not done inside <xmp>, when m_brokenComments is set, or when the
        // "<!-" does not lie entirely after lastDecodedEntityPosition.
        if (!m_scriptCodeResync && !m_brokenComments &&
            !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&
            m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&
            (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
            state.setInComment(true);
            state = parseComment(src, state);
            continue;
        }
        // m_scriptCodeResync is non-zero once the end tag name has been matched (set further
        // down). An unquoted '>' then finishes the end tag.
        if (m_scriptCodeResync && !tquote && ch == '>') {
            src.advancePastNonNewline();
            // Truncate the buffer to the text preceding the end tag and zero-terminate it.
            m_scriptCodeSize = m_scriptCodeResync - 1;
            m_scriptCodeResync = 0;
            m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
            if (state.inScript())
                state = scriptHandler(state);
            else {
                // Emit the collected text, then build and emit the matching end-tag token.
                state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
                processToken();
                if (state.inStyle()) {
                    m_currentToken.tagName = styleTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inTextArea()) {
                    m_currentToken.tagName = textareaTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inTitle()) {
                    m_currentToken.tagName = titleTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inXmp()) {
                    m_currentToken.tagName = xmpTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inIFrame()) {
                    m_currentToken.tagName = iframeTag.localName();
                    m_currentToken.beginTag = false;
                }
                processToken();
                // Leave the non-HTML text mode entirely.
                state.setInStyle(false);
                state.setInScript(false);
                state.setInTextArea(false);
                state.setInTitle(false);
                state.setInXmp(false);
                state.setInIFrame(false);
                tquote = NoQuote;
                m_scriptCodeSize = m_scriptCodeResync = 0;
            }
            return state;
        }
        // possible end of tagname, lets check.
        // The current character must be one that can follow a tag name ('>', '/' or
        // whitespace) and the tail of the buffer must match m_searchStopper (tagMatch accepts
        // the stopper's characters or their upper-case forms). The current character is not
        // consumed here; the loop re-examines it with m_scriptCodeResync set.
        if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
            m_scriptCodeSize >= m_searchStopperLength &&
            tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
            (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
            // Stored as (start index of the end tag + 1) so that a match at index 0 is still non-zero.
            m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
            tquote = NoQuote;
            continue;
        }
        // While inside the end tag, track quoting so that a '>' within quotes does not close
        // it. A line break resets the quote state.
        if (m_scriptCodeResync && !state.escaped()) {
            if (ch == '\"')
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
            else if (ch == '\'')
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                tquote = NoQuote;
        }
        // A backslash marks the next character as escaped, unless it is itself escaped.
        state.setEscaped(!state.escaped() && ch == '\\');
        if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
            // Character references are processed in textarea, title and iframe content only.
            UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
            src.advancePastNonNewline();
            state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
            // parseEntity() either advances scriptCodeDest (it wrote literal text into the
            // buffer) or leaves it untouched (e.g. it pushed a decoded character back onto src).
            if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
                lastDecodedEntityPosition = m_scriptCodeSize;
            else
                m_scriptCodeSize = scriptCodeDest - m_scriptCode;
        } else {
            m_scriptCode[m_scriptCodeSize++] = ch;
            src.advance(m_lineNumber);
        }
    }

    return state;
}
431
// Called by parseNonHTMLText() once the end of a <script> element has been reached, with
// the script text in m_scriptCode. Emits the script text and the </script> end-tag token,
// then either requests the external script named by the src attribute or executes the
// inline script, taking care to keep the not-yet-tokenized input (m_src) in the right
// order relative to anything the script writes. Returns the updated state.
HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
{
    // We are inside a <script>
    bool doScriptExec = false;
    int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenizer line numbers are 0 based

    // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
    m_currentScriptTagStartLineNumber = 0;

    // (Bugzilla 3837) Scripts following a frameset element should not execute or,
    // in the case of extern scripts, even load.
    bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));

    // Set only when an external script was successfully requested below.
    CachedScript* cs = 0;
    // don't load external scripts for standalone documents (for now)
    if (!inViewSourceMode()) {
        if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
            // forget what we just got; load from src url instead
            if (!m_parser->skipMode() && !followingFrameset) {
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
                if (!m_doc->ownerElement())
                    printf("Requesting script at time %d\n", m_doc->elapsedTime());
#endif
                // The parser might have been stopped by for example a window.close call in an earlier script.
                // If so, we don't want to load scripts.
                if (!m_parserStopped && m_scriptNode->dispatchBeforeLoadEvent(m_scriptTagSrcAttrValue) &&
                    (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
                    m_pendingScripts.append(cs);
                else
                    m_scriptNode = 0;
            } else
                m_scriptNode = 0;
            m_scriptTagSrcAttrValue = String();
        } else {
            // Parse m_scriptCode containing <script> info
            doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
#if ENABLE(XHTMLMP)
            if (!doScriptExec)
                m_doc->setShouldProcessNoscriptElement(true);
#endif
            m_scriptNode = 0;
        }
    }

    // Emit the script text as a token; |node| is whatever processToken() returned for it.
    state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
    RefPtr<Node> node = processToken();

    // When fragment scripting is not allowed, remove the node just created for the script text.
    if (node && m_scriptingPermission == FragmentScriptingNotAllowed) {
        ExceptionCode ec;
        node->remove(ec);
        node = 0;
    }

    // Capture the script source before emitting the </script> end tag.
    String scriptString = node ? node->textContent() : "";
    m_currentToken.tagName = scriptTag.localName();
    m_currentToken.beginTag = false;
    processToken();

    state.setInScript(false);
    m_scriptCodeSize = m_scriptCodeResync = 0;

    // FIXME: The script should be syntax highlighted.
    if (inViewSourceMode())
        return state;

    // Install a local prepending source for the duration of this call; the previous one is
    // restored before returning.
    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
    m_currentPrependingSrc = &prependingSrc;

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif

    if (!m_parser->skipMode() && !followingFrameset) {
        if (cs) {
            // External script: set the remaining input aside until the script has loaded.
            if (savedPrependingSrc)
                savedPrependingSrc->append(m_src);
            else
                m_pendingSrc.prepend(m_src);
            setSrc(SegmentedString());

            // the addClient() call below may call notifyFinished if the script is already in cache,
            // and that mucks with the state directly, so we must write it back to the object.
            m_state = state;
            bool savedRequestingScript = m_requestingScript;
            m_requestingScript = true;
            cs->addClient(this);
            m_requestingScript = savedRequestingScript;
            state = m_state;
            // m_pendingScripts will be empty if the script was already loaded and addClient() executed it
            if (!m_pendingScripts.isEmpty())
                state.setLoadingExtScript(true);
        } else if (!m_fragment && doScriptExec) {
            // Inline script: set the remaining input aside, then run the script now.
            if (!m_executingScript)
                m_pendingSrc.prepend(m_src);
            else
                prependingSrc = m_src;
            setSrc(SegmentedString());
            state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
        }
    }

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

    if (!m_executingScript && !state.loadingExtScript()) {
        // Nothing is blocking the tokenizer: put the pending input back into m_src.
        m_src.append(m_pendingSrc);
        m_pendingSrc.clear();
    } else if (!prependingSrc.isEmpty()) {
        // restore first so that the write appends in the right place
        // (does not hurt to do it again below)
        m_currentPrependingSrc = savedPrependingSrc;

        // we need to do this slightly modified bit of one of the write() cases
        // because we want to prepend to m_pendingSrc rather than appending
        // if there's no previous prependingSrc
        if (!m_pendingScripts.isEmpty()) {
            if (m_currentPrependingSrc)
                m_currentPrependingSrc->append(prependingSrc);
            else
                m_pendingSrc.prepend(prependingSrc);
        } else {
            m_state = state;
            write(prependingSrc, false);
            state = m_state;
        }
    }

#if PRELOAD_SCANNER_ENABLED
    // While external scripts are pending, feed the pending input to the preload scanner
    // (created on first use) unless a scan is already in progress.
    if (!m_pendingScripts.isEmpty() && !m_executingScript) {
        if (!m_preloadScanner)
            m_preloadScanner.set(new PreloadScanner(m_doc));
        if (!m_preloadScanner->inProgress()) {
            m_preloadScanner->begin();
            m_preloadScanner->write(m_pendingSrc);
        }
    }
#endif
    m_currentPrependingSrc = savedPrependingSrc;

    return state;
}
575
// Executes |sourceCode| through the frame's script controller and then reconciles the
// tokenizer's input with anything collected in the local prepending source while the
// script ran. Does nothing for fragments or when the document has no frame. Returns the
// updated state, with allowYield set once a script has been run.
HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
{
    if (m_fragment || !m_doc->frame())
        return state;
    // m_executingScript counts nested executions; it is decremented again after the script returns.
    m_executingScript++;

    // Install a local prepending source for the duration of the script; the previous one
    // is restored before returning.
    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
    m_currentPrependingSrc = &prependingSrc;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("beginning script execution at %d\n", m_doc->elapsedTime());
#endif

    // Publish the local state to m_state before running the script and read it back
    // afterwards, since it may have been modified in the meantime.
    m_state = state;
    m_doc->frame()->script()->executeScript(sourceCode);
    state = m_state;

    state.setAllowYield(true);

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("ending script execution at %d\n", m_doc->elapsedTime());
#endif

    m_executingScript--;

    if (!m_executingScript && !state.loadingExtScript()) {
        // Outermost script finished and no external script is loading: the prepended text
        // goes first, followed by the input that was pending, all back into m_src.
        m_pendingSrc.prepend(prependingSrc);
        m_src.append(m_pendingSrc);
        m_pendingSrc.clear();
    } else if (!prependingSrc.isEmpty()) {
        // restore first so that the write appends in the right place
        // (does not hurt to do it again below)
        m_currentPrependingSrc = savedPrependingSrc;

        // we need to do this slightly modified bit of one of the write() cases
        // because we want to prepend to m_pendingSrc rather than appending
        // if there's no previous prependingSrc
        if (!m_pendingScripts.isEmpty()) {
            if (m_currentPrependingSrc)
                m_currentPrependingSrc->append(prependingSrc);
            else
                m_pendingSrc.prepend(prependingSrc);

#if PRELOAD_SCANNER_ENABLED
            // We are stuck waiting for another script. Lets check the source that
            // was just document.write()n for anything to load.
            PreloadScanner documentWritePreloadScanner(m_doc);
            documentWritePreloadScanner.begin();
            documentWritePreloadScanner.write(prependingSrc);
            documentWritePreloadScanner.end();
#endif
        } else {
            m_state = state;
            write(prependingSrc, false);
            state = m_state;
        }
    }

    m_currentPrependingSrc = savedPrependingSrc;

    return state;
}
641
parseComment(SegmentedString & src,State state)642 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
643 {
644 // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
645 checkScriptBuffer(src.length());
646 while (!src.isEmpty()) {
647 UChar ch = *src;
648 m_scriptCode[m_scriptCodeSize++] = ch;
649 if (ch == '>') {
650 bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());
651 int endCharsCount = 1; // start off with one for the '>' character
652 if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {
653 endCharsCount = 3;
654 } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&
655 m_scriptCode[m_scriptCodeSize-2] == '!') {
656 // Other browsers will accept --!> as a close comment, even though it's
657 // not technically valid.
658 endCharsCount = 4;
659 }
660 if (handleBrokenComments || endCharsCount > 1) {
661 src.advancePastNonNewline();
662 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
663 checkScriptBuffer();
664 m_scriptCode[m_scriptCodeSize] = 0;
665 m_scriptCode[m_scriptCodeSize + 1] = 0;
666 m_currentToken.tagName = commentAtom;
667 m_currentToken.beginTag = true;
668 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
669 processToken();
670 m_currentToken.tagName = commentAtom;
671 m_currentToken.beginTag = false;
672 processToken();
673 m_scriptCodeSize = 0;
674 }
675 state.setInComment(false);
676 return state; // Finished parsing comment
677 }
678 }
679 src.advance(m_lineNumber);
680 }
681
682 return state;
683 }
684
parseServer(SegmentedString & src,State state)685 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
686 {
687 checkScriptBuffer(src.length());
688 while (!src.isEmpty()) {
689 UChar ch = *src;
690 m_scriptCode[m_scriptCodeSize++] = ch;
691 if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {
692 src.advancePastNonNewline();
693 state.setInServer(false);
694 m_scriptCodeSize = 0;
695 return state; // Finished parsing server include
696 }
697 src.advance(m_lineNumber);
698 }
699 return state;
700 }
701
parseProcessingInstruction(SegmentedString & src,State state)702 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)
703 {
704 UChar oldchar = 0;
705 while (!src.isEmpty()) {
706 UChar chbegin = *src;
707 if (chbegin == '\'')
708 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
709 else if (chbegin == '\"')
710 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
711 // Look for '?>'
712 // Some crappy sites omit the "?" before it, so
713 // we look for an unquoted '>' instead. (IE compatible)
714 else if (chbegin == '>' && (!tquote || oldchar == '?')) {
715 // We got a '?>' sequence
716 state.setInProcessingInstruction(false);
717 src.advancePastNonNewline();
718 state.setDiscardLF(true);
719 return state; // Finished parsing comment!
720 }
721 src.advance(m_lineNumber);
722 oldchar = chbegin;
723 }
724
725 return state;
726 }
727
parseText(SegmentedString & src,State state)728 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
729 {
730 while (!src.isEmpty()) {
731 UChar cc = *src;
732
733 if (state.skipLF()) {
734 state.setSkipLF(false);
735 if (cc == '\n') {
736 src.advancePastNewline(m_lineNumber);
737 continue;
738 }
739 }
740
741 // do we need to enlarge the buffer?
742 checkBuffer();
743
744 if (cc == '\r') {
745 state.setSkipLF(true);
746 *m_dest++ = '\n';
747 } else
748 *m_dest++ = cc;
749 src.advance(m_lineNumber);
750 }
751
752 return state;
753 }
754
755
// Parses a character reference. The visible caller (parseNonHTMLText) consumes the
// leading '&' before calling, and this function re-emits '&' itself when the reference is
// not decoded.
//
// The parse is resumable: if |src| runs out mid-reference, the returned state keeps its
// entity state and the caller calls again with |start| false once more input is available.
//
//   src        - input; a successfully decoded character is pushed back onto it.
//   dest       - output position for references that are passed through as literal text;
//                advanced past whatever is written.
//   cBufferPos - number of characters of the reference collected in m_cBuffer so far.
//   start      - true to begin a new reference (resets cBufferPos, the entity state and
//                EntityUnicodeValue).
//   parsingTag - true when parsing inside a tag; enables the IE-compatible rule below.
HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
{
    if (start) {
        cBufferPos = 0;
        state.setEntityState(SearchEntity);
        EntityUnicodeValue = 0;
    }

    while (!src.isEmpty()) {
        UChar cc = *src;
        switch (state.entityState()) {
        case NoEntity:
            ASSERT(state.entityState() != NoEntity);
            return state;

        case SearchEntity:
            // '#' introduces a numeric reference; anything else is treated as a named one.
            if (cc == '#') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(NumericSearch);
            } else
                state.setEntityState(EntityName);
            break;

        case NumericSearch:
            // After '#': 'x'/'X' selects hexadecimal, a digit selects decimal, and anything
            // else ends the reference.
            if (cc == 'x' || cc == 'X') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(Hexadecimal);
            } else if (cc >= '0' && cc <= '9')
                state.setEntityState(Decimal);
            else
                state.setEntityState(SearchSemicolon);
            break;

        case Hexadecimal: {
            // Collect hex digits until m_cBuffer holds 10 characters (including "#x").
            int ll = min(src.length(), 10 - cBufferPos);
            while (ll--) {
                cc = *src;
                if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                int digit;
                if (cc < 'A')
                    digit = cc - '0';
                else
                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal:
        {
            // Collect decimal digits until m_cBuffer holds 9 characters (including '#').
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;

                if (!(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName:
        {
            // Collect up to 9 ASCII alphanumeric characters of the entity name.
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;

                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            if (state.entityState() == SearchSemicolon) {
                if (cBufferPos > 1) {
                    // Since the maximum length of entity name is 9,
                    // so a single char array which is allocated on
                    // the stack, its length is 10, should be OK.
                    // Also if we have an illegal character, we treat it
                    // as illegal entity name.
                    unsigned testedEntityNameLen = 0;
                    char tmpEntityNameBuffer[10];

                    ASSERT(cBufferPos < 10);
                    for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
                        if (m_cBuffer[testedEntityNameLen] > 0x7e)
                            break;
                        tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
                    }

                    const Entity *e;

                    if (testedEntityNameLen == cBufferPos)
                        e = findEntity(tmpEntityNameBuffer, cBufferPos);
                    else
                        e = 0;

                    if (e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible
                    // Inside a tag, a named entity above 255 that is not followed by ';' is not decoded.
                    if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
            // The name is complete: fall through to SearchSemicolon.
        }
        case SearchSemicolon:
            // Don't allow values that are more than 21 bits.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (!inViewSourceMode()) {
                    // Decoded successfully: consume an optional ';' and push the character
                    // back onto src.
                    if (*src == ';')
                        src.advancePastNonNewline();
                    if (EntityUnicodeValue <= 0xFFFF) {
                        checkBuffer();
                        src.push(fixUpChar(EntityUnicodeValue));
                    } else {
                        // Convert to UTF-16, using surrogate code points.
                        checkBuffer(2);
                        src.push(U16_LEAD(EntityUnicodeValue));
                        src.push(U16_TRAIL(EntityUnicodeValue));
                    }
                } else {
                    // FIXME: We should eventually colorize entities by sending them as a special token.
                    // 12 bytes required: up to 10 bytes in m_cBuffer plus the
                    // leading '&' and trailing ';'
                    checkBuffer(12);
                    *dest++ = '&';
                    for (unsigned i = 0; i < cBufferPos; i++)
                        dest[i] = m_cBuffer[i];
                    dest += cBufferPos;
                    if (*src == ';') {
                        *dest++ = ';';
                        src.advancePastNonNewline();
                    }
                }
            } else {
                // 11 bytes required: up to 10 bytes in m_cBuffer plus the
                // leading '&'
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for (unsigned i = 0; i < cBufferPos; i++)
                    dest[i] = m_cBuffer[i];
                dest += cBufferPos;
            }

            state.setEntityState(NoEntity);
            return state;
        }
    }

    // Ran out of input mid-reference; the entity state is kept in the returned state.
    return state;
}
929
parseDoctype(SegmentedString & src,State state)930 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
931 {
932 ASSERT(state.inDoctype());
933 while (!src.isEmpty() && state.inDoctype()) {
934 UChar c = *src;
935 bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
936 switch (m_doctypeToken.state()) {
937 case DoctypeBegin: {
938 m_doctypeToken.setState(DoctypeBeforeName);
939 if (isWhitespace) {
940 src.advance(m_lineNumber);
941 if (inViewSourceMode())
942 m_doctypeToken.m_source.append(c);
943 }
944 break;
945 }
946 case DoctypeBeforeName: {
947 if (c == '>') {
948 // Malformed. Just exit.
949 src.advancePastNonNewline();
950 state.setInDoctype(false);
951 if (inViewSourceMode())
952 processDoctypeToken();
953 } else if (isWhitespace) {
954 src.advance(m_lineNumber);
955 if (inViewSourceMode())
956 m_doctypeToken.m_source.append(c);
957 } else
958 m_doctypeToken.setState(DoctypeName);
959 break;
960 }
961 case DoctypeName: {
962 if (c == '>') {
963 // Valid doctype. Emit it.
964 src.advancePastNonNewline();
965 state.setInDoctype(false);
966 processDoctypeToken();
967 } else if (isWhitespace) {
968 m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
969 m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
970 m_doctypeToken.setState(DoctypeAfterName);
971 src.advance(m_lineNumber);
972 if (inViewSourceMode())
973 m_doctypeToken.m_source.append(c);
974 } else {
975 src.advancePastNonNewline();
976 m_doctypeToken.m_name.append(c);
977 if (inViewSourceMode())
978 m_doctypeToken.m_source.append(c);
979 }
980 break;
981 }
982 case DoctypeAfterName: {
983 if (c == '>') {
984 // Valid doctype. Emit it.
985 src.advancePastNonNewline();
986 state.setInDoctype(false);
987 processDoctypeToken();
988 } else if (!isWhitespace) {
989 src.advancePastNonNewline();
990 if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
991 m_doctypeSearchCount++;
992 if (m_doctypeSearchCount == 6)
993 // Found 'PUBLIC' sequence
994 m_doctypeToken.setState(DoctypeBeforePublicID);
995 } else if (m_doctypeSearchCount > 0) {
996 m_doctypeSearchCount = 0;
997 m_doctypeToken.setState(DoctypeBogus);
998 } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
999 m_doctypeSecondarySearchCount++;
1000 if (m_doctypeSecondarySearchCount == 6)
1001 // Found 'SYSTEM' sequence
1002 m_doctypeToken.setState(DoctypeBeforeSystemID);
1003 } else {
1004 m_doctypeSecondarySearchCount = 0;
1005 m_doctypeToken.setState(DoctypeBogus);
1006 }
1007 if (inViewSourceMode())
1008 m_doctypeToken.m_source.append(c);
1009 } else {
1010 src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
1011 if (inViewSourceMode())
1012 m_doctypeToken.m_source.append(c);
1013 }
1014 break;
1015 }
1016 case DoctypeBeforePublicID: {
1017 if (c == '\"' || c == '\'') {
1018 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1019 m_doctypeToken.setState(DoctypePublicID);
1020 src.advancePastNonNewline();
1021 if (inViewSourceMode())
1022 m_doctypeToken.m_source.append(c);
1023 } else if (c == '>') {
1024 // Considered bogus. Don't process the doctype.
1025 src.advancePastNonNewline();
1026 state.setInDoctype(false);
1027 if (inViewSourceMode())
1028 processDoctypeToken();
1029 } else if (isWhitespace) {
1030 src.advance(m_lineNumber);
1031 if (inViewSourceMode())
1032 m_doctypeToken.m_source.append(c);
1033 } else
1034 m_doctypeToken.setState(DoctypeBogus);
1035 break;
1036 }
1037 case DoctypePublicID: {
1038 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1039 src.advancePastNonNewline();
1040 m_doctypeToken.setState(DoctypeAfterPublicID);
1041 if (inViewSourceMode())
1042 m_doctypeToken.m_source.append(c);
1043 } else if (c == '>') {
1044 // Considered bogus. Don't process the doctype.
1045 src.advancePastNonNewline();
1046 state.setInDoctype(false);
1047 if (inViewSourceMode())
1048 processDoctypeToken();
1049 } else {
1050 m_doctypeToken.m_publicID.append(c);
1051 src.advance(m_lineNumber);
1052 if (inViewSourceMode())
1053 m_doctypeToken.m_source.append(c);
1054 }
1055 break;
1056 }
1057 case DoctypeAfterPublicID:
1058 if (c == '\"' || c == '\'') {
1059 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1060 m_doctypeToken.setState(DoctypeSystemID);
1061 src.advancePastNonNewline();
1062 if (inViewSourceMode())
1063 m_doctypeToken.m_source.append(c);
1064 } else if (c == '>') {
1065 // Valid doctype. Emit it now.
1066 src.advancePastNonNewline();
1067 state.setInDoctype(false);
1068 processDoctypeToken();
1069 } else if (isWhitespace) {
1070 src.advance(m_lineNumber);
1071 if (inViewSourceMode())
1072 m_doctypeToken.m_source.append(c);
1073 } else
1074 m_doctypeToken.setState(DoctypeBogus);
1075 break;
1076 case DoctypeBeforeSystemID:
1077 if (c == '\"' || c == '\'') {
1078 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1079 m_doctypeToken.setState(DoctypeSystemID);
1080 src.advancePastNonNewline();
1081 if (inViewSourceMode())
1082 m_doctypeToken.m_source.append(c);
1083 } else if (c == '>') {
1084 // Considered bogus. Don't process the doctype.
1085 src.advancePastNonNewline();
1086 state.setInDoctype(false);
1087 } else if (isWhitespace) {
1088 src.advance(m_lineNumber);
1089 if (inViewSourceMode())
1090 m_doctypeToken.m_source.append(c);
1091 } else
1092 m_doctypeToken.setState(DoctypeBogus);
1093 break;
1094 case DoctypeSystemID:
1095 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1096 src.advancePastNonNewline();
1097 m_doctypeToken.setState(DoctypeAfterSystemID);
1098 if (inViewSourceMode())
1099 m_doctypeToken.m_source.append(c);
1100 } else if (c == '>') {
1101 // Considered bogus. Don't process the doctype.
1102 src.advancePastNonNewline();
1103 state.setInDoctype(false);
1104 if (inViewSourceMode())
1105 processDoctypeToken();
1106 } else {
1107 m_doctypeToken.m_systemID.append(c);
1108 src.advance(m_lineNumber);
1109 if (inViewSourceMode())
1110 m_doctypeToken.m_source.append(c);
1111 }
1112 break;
1113 case DoctypeAfterSystemID:
1114 if (c == '>') {
1115 // Valid doctype. Emit it now.
1116 src.advancePastNonNewline();
1117 state.setInDoctype(false);
1118 processDoctypeToken();
1119 } else if (isWhitespace) {
1120 src.advance(m_lineNumber);
1121 if (inViewSourceMode())
1122 m_doctypeToken.m_source.append(c);
1123 } else
1124 m_doctypeToken.setState(DoctypeBogus);
1125 break;
1126 case DoctypeBogus:
1127 if (c == '>') {
1128 // Done with the bogus doctype.
1129 src.advancePastNonNewline();
1130 state.setInDoctype(false);
1131 if (inViewSourceMode())
1132 processDoctypeToken();
1133 } else {
1134 src.advance(m_lineNumber); // Just keep scanning for '>'
1135 if (inViewSourceMode())
1136 m_doctypeToken.m_source.append(c);
1137 }
1138 break;
1139 default:
1140 break;
1141 }
1142 }
1143 return state;
1144 }
1145
// Tag-parsing state machine.
//
// Consumes characters from src while a tag is in progress and dispatches on state.tagState():
// TagName -> SearchAttribute -> AttributeName -> SearchEqual -> SearchValue ->
// QuotedValue / Value -> (back to SearchAttribute) ... -> SearchEnd.
//
// Tag and attribute names are collected in m_cBuffer (indexed by the local cBufferPos); attribute
// values are collected in m_buffer through m_dest. If src is exhausted in the middle of a tag the
// current tag state stays in the returned State and cBufferPos is saved in m_cBufferPos, so the
// next call resumes at the same place.
//
// src   - input stream, consumed as it is recognized.
// state - tokenizer state; must not be in the middle of an entity (see the ASSERT).
//
// Returns the updated tokenizer state.
parseTag(SegmentedString & src,State state)1146 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
1147 {
1148 ASSERT(!state.hasEntityState());
1149
// Work on a local copy of the scratch-buffer position; it is written back to m_cBufferPos
// before every return below.
1150 unsigned cBufferPos = m_cBufferPos;
1151
// Only meaningful inside SearchEqual: true when the last skipped character there was '/'.
1152 bool lastIsSlash = false;
1153
1154 while (!src.isEmpty()) {
1155 checkBuffer();
1156 switch (state.tagState()) {
1157 case NoTag:
1158 {
// No tag in progress: nothing to do, hand control back to the caller.
1159 m_cBufferPos = cBufferPos;
1160 return state;
1161 }
1162 case TagName:
1163 {
// searchCount > 0 means we are still matching the '<!--' prefix against commentStart
// (write() sets it to 1 after seeing "<!").
1164 if (searchCount > 0) {
1165 if (*src == commentStart[searchCount]) {
1166 searchCount++;
1167 if (searchCount == 2)
1168 m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
1169 else
1170 m_doctypeSearchCount = 0;
1171 if (searchCount == 4) {
1172 // Found '<!--' sequence
1173 src.advancePastNonNewline();
1174 m_dest = m_buffer; // ignore the previous part of this tag
1175 state.setInComment(true);
1176 state.setTagState(NoTag);
1177
1178 // Fix bug 34302 at kde.bugs.org. Go ahead and treat
1179 // <!--> as a valid comment, since both mozilla and IE on windows
1180 // can handle this case. Only do this in quirks mode. -dwh
1181 if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
1182 state.setInComment(false);
1183 src.advancePastNonNewline();
1184 if (!src.isEmpty())
1185 m_cBuffer[cBufferPos++] = *src;
1186 } else
1187 state = parseComment(src, state);
1188
1189 m_cBufferPos = cBufferPos;
1190 return state; // Finished parsing tag!
1191 }
// Still a partial match: keep the character in the scratch buffer in case this turns out
// not to be a comment after all.
1192 m_cBuffer[cBufferPos++] = *src;
1193 src.advancePastNonNewline();
1194 break;
1195 } else
1196 searchCount = 0; // Stop looking for '<!--' sequence
1197 }
1198
// m_doctypeSearchCount > 0 means we are still matching '<!DOCTYPE' (case-insensitively)
// against doctypeStart; 9 matched characters complete the prefix.
1199 if (m_doctypeSearchCount > 0) {
1200 if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
1201 m_doctypeSearchCount++;
1202 m_cBuffer[cBufferPos++] = *src;
1203 src.advancePastNonNewline();
1204 if (m_doctypeSearchCount == 9) {
1205 // Found '<!DOCTYPE' sequence
1206 state.setInDoctype(true);
1207 state.setTagState(NoTag);
1208 m_doctypeToken.reset();
1209 if (inViewSourceMode())
1210 m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
1211 state = parseDoctype(src, state);
1212 m_cBufferPos = cBufferPos;
1213 return state;
1214 }
1215 break;
1216 } else
1217 m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1218 }
1219
// Ordinary tag name: copy at most as many characters as are available in src and still fit
// into the remaining space of m_cBuffer.
1220 bool finish = false;
1221 unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
1222 while (ll--) {
1223 UChar curchar = *src;
// Whitespace, '>' or '<' terminates the name; the terminator itself is not consumed here.
1224 if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
1225 finish = true;
1226 break;
1227 }
1228
1229 // tolower() shows up on profiles. This is faster!
1230 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1231 m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1232 else
1233 m_cBuffer[cBufferPos++] = curchar;
1234 src.advancePastNonNewline();
1235 }
1236
1237 // Disadvantage: we add the possible rest of the tag
1238 // as attribute names. ### judge if this causes problems
// The name is complete when a terminator was seen or the scratch buffer is full. Otherwise
// src ran out and we stay in TagName until more input arrives.
1239 if (finish || CBUFLEN == cBufferPos) {
1240 bool beginTag;
1241 UChar* ptr = m_cBuffer;
1242 unsigned int len = cBufferPos;
1243 m_cBuffer[cBufferPos] = '\0';
1244 if ((cBufferPos > 0) && (*ptr == '/')) {
1245 // End Tag
1246 beginTag = false;
1247 ptr++;
1248 len--;
1249 }
1250 else
1251 // Start Tag
1252 beginTag = true;
1253
1254 // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".
1255 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
1256 ptr[--len] = '\0';
1257
1258 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
1259 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
1260 if (ptr[0] != '!' || inViewSourceMode()) {
1261 m_currentToken.tagName = AtomicString(ptr);
1262 m_currentToken.beginTag = beginTag;
1263 }
1264 m_dest = m_buffer;
1265 state.setTagState(SearchAttribute);
1266 cBufferPos = 0;
1267 }
1268 break;
1269 }
1270 case SearchAttribute:
// Skip whitespace and stray quotes until something that starts an attribute name or ends
// the tag. The deciding character is not consumed.
1271 while (!src.isEmpty()) {
1272 UChar curchar = *src;
1273 // In this mode just ignore any quotes we encounter and treat them like spaces.
1274 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
1275 if (curchar == '<' || curchar == '>')
1276 state.setTagState(SearchEnd);
1277 else
1278 state.setTagState(AttributeName);
1279
1280 cBufferPos = 0;
1281 break;
1282 }
1283 if (inViewSourceMode())
1284 m_currentToken.addViewSourceChar(curchar);
1285 src.advance(m_lineNumber);
1286 }
1287 break;
1288 case AttributeName:
1289 {
// NOTE(review): m_rawAttributeBeforeValue is cleared on every entry to this state, including
// a re-entry after src ran dry in the middle of a name — confirm that is intended.
1290 m_rawAttributeBeforeValue.clear();
1291 int ll = min(src.length(), CBUFLEN - cBufferPos);
1292 while (ll--) {
1293 UChar curchar = *src;
1294 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the
1295 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
// The test below matches '<', '=', '>' (the three codes from '<' up to '>'), ASCII
// whitespace and '/'.
1296 if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
1297 m_cBuffer[cBufferPos] = '\0';
1298 m_attrName = AtomicString(m_cBuffer);
// Reset the value buffer and write a leading 0 placeholder; attribute values are later
// read starting at m_buffer + 1.
1299 m_dest = m_buffer;
1300 *m_dest++ = 0;
1301 state.setTagState(SearchEqual);
// NOTE(review): 'a' looks like a marker for "attribute name" in the view-source
// stream — confirm against the consumers of addViewSourceChar().
1302 if (inViewSourceMode())
1303 m_currentToken.addViewSourceChar('a');
1304 break;
1305 }
1306
1307 // tolower() shows up on profiles. This is faster!
1308 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1309 m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1310 else
1311 m_cBuffer[cBufferPos++] = curchar;
1312
1313 m_rawAttributeBeforeValue.append(curchar);
1314 src.advance(m_lineNumber);
1315 }
// Scratch buffer full: cut the name off here and proceed as if a delimiter had been seen.
1316 if (cBufferPos == CBUFLEN) {
1317 m_cBuffer[cBufferPos] = '\0';
1318 m_attrName = AtomicString(m_cBuffer);
1319 m_dest = m_buffer;
1320 *m_dest++ = 0;
1321 state.setTagState(SearchEqual);
1322 if (inViewSourceMode())
1323 m_currentToken.addViewSourceChar('a');
1324 }
1325 break;
1326 }
1327 case SearchEqual:
// Look for '=' after an attribute name. Whitespace, quotes and '/' are skipped; anything
// else means the attribute has no value.
1328 while (!src.isEmpty()) {
1329 UChar curchar = *src;
1330
1331 if (lastIsSlash && curchar == '>') {
1332 // This is a quirk (with a long sad history). We have to do this
1333 // since widgets do <script src="foo.js"/> and expect the tag to close.
1334 if (m_currentToken.tagName == scriptTag)
1335 m_currentToken.selfClosingTag = true;
1336 m_currentToken.brokenXMLStyle = true;
1337 }
1338
1339 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1340 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
1341 if (curchar == '=') {
1342 state.setTagState(SearchValue);
1343 if (inViewSourceMode())
1344 m_currentToken.addViewSourceChar(curchar);
1345 m_rawAttributeBeforeValue.append(curchar);
1346 src.advancePastNonNewline();
1347 } else {
// Attribute without a value: record it with an empty value and go look for the next one.
// The current character is not consumed.
1348 m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
1349 m_dest = m_buffer;
1350 state.setTagState(SearchAttribute);
1351 lastIsSlash = false;
1352 }
1353 break;
1354 }
1355
1356 lastIsSlash = curchar == '/';
1357
1358 if (inViewSourceMode())
1359 m_currentToken.addViewSourceChar(curchar);
1360 m_rawAttributeBeforeValue.append(curchar);
1361 src.advance(m_lineNumber);
1362 }
1363 break;
1364 case SearchValue:
// Skip whitespace after '='. An opening quote selects QuotedValue (and is consumed); any
// other character starts an unquoted Value and is left in src.
1365 while (!src.isEmpty()) {
1366 UChar curchar = *src;
1367 if (!isASCIISpace(curchar)) {
1368 if (curchar == '\'' || curchar == '\"') {
1369 tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1370 state.setTagState(QuotedValue);
1371 if (inViewSourceMode())
1372 m_currentToken.addViewSourceChar(curchar);
1373 m_rawAttributeBeforeValue.append(curchar);
1374 src.advancePastNonNewline();
1375 } else
1376 state.setTagState(Value);
1377
1378 break;
1379 }
1380 if (inViewSourceMode())
1381 m_currentToken.addViewSourceChar(curchar);
1382 m_rawAttributeBeforeValue.append(curchar);
1383 src.advance(m_lineNumber);
1384 }
1385 break;
1386 case QuotedValue:
1387 while (!src.isEmpty()) {
1388 checkBuffer();
1389
1390 UChar curchar = *src;
// Cheap pre-filter: every character handled specially below ('>', '&', '\'', '"') has a
// code no greater than '>'.
1391 if (curchar <= '>' && !src.escaped()) {
1392 if (curchar == '>' && m_attrName.isEmpty()) {
1393 // Handle a case like <img '>. Just go ahead and be willing
1394 // to close the whole tag. Don't consume the character and
1395 // just go back into SearchEnd while ignoring the whole
1396 // value.
1397 // FIXME: Note that this is actually not a very good solution.
1398 // It doesn't handle the general case of
1399 // unmatched quotes among attributes that have names. -dwh
1400 while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1401 m_dest--; // remove trailing newlines
// The value starts at m_buffer + 1 because AttributeName stored a 0 placeholder at m_buffer[0].
1402 AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1403 if (!attributeValue.contains('/'))
1404 m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1405 m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1406 if (inViewSourceMode())
1407 m_currentToken.addViewSourceChar('x');
1408 state.setTagState(SearchAttribute);
1409 m_dest = m_buffer;
1410 tquote = NoQuote;
1411 break;
1412 }
1413
1414 if (curchar == '&') {
// Entity inside the value: parseEntity() writes its output through m_dest. We break out of
// the inner loop afterwards so the outer switch re-dispatches on the returned state.
1415 src.advancePastNonNewline();
1416 state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1417 break;
1418 }
1419
// Only the quote character that opened the value closes it.
1420 if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
1421 // some <input type=hidden> rely on trailing spaces. argh
1422 while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1423 m_dest--; // remove trailing newlines
1424 AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1425 if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
1426 m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1427 if (inViewSourceMode())
1428 m_currentToken.addViewSourceChar('x');
1429 } else if (inViewSourceMode())
1430 m_currentToken.addViewSourceChar('v');
1431
// For <script src=...> in normal parsing mode, let the XSS auditor (if there is one) veto
// the URL; a rejected URL is replaced by blankURL().
1432 if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1433 String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1434 if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1435 attributeValue = blankURL().string();
1436 }
1437
1438 m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1439 m_dest = m_buffer;
1440 state.setTagState(SearchAttribute);
1441 tquote = NoQuote;
1442 if (inViewSourceMode())
1443 m_currentToken.addViewSourceChar(curchar);
1444 src.advancePastNonNewline();
1445 break;
1446 }
1447 }
1448
// Ordinary value character: append it to the value buffer.
1449 *m_dest++ = curchar;
1450 src.advance(m_lineNumber);
1451 }
1452 break;
1453 case Value:
1454 while (!src.isEmpty()) {
1455 checkBuffer();
1456 UChar curchar = *src;
// Same pre-filter as in QuotedValue: '&', '>' and ASCII whitespace all have codes <= '>'.
1457 if (curchar <= '>' && !src.escaped()) {
1458 // parse Entities
1459 if (curchar == '&') {
1460 src.advancePastNonNewline();
1461 state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1462 break;
1463 }
1464 // no quotes. Every space means end of value
1465 // '/' does not delimit in IE!
1466 if (isASCIISpace(curchar) || curchar == '>') {
1467 AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1468
// Same XSS-auditor veto for <script src=...> as in the QuotedValue case.
1469 if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1470 String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1471 if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1472 attributeValue = blankURL().string();
1473 }
1474
1475 m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1476 if (inViewSourceMode())
1477 m_currentToken.addViewSourceChar('v');
1478 m_dest = m_buffer;
1479 state.setTagState(SearchAttribute);
1480 break;
1481 }
1482 }
1483
1484 *m_dest++ = curchar;
1485 src.advance(m_lineNumber);
1486 }
1487 break;
1488 case SearchEnd:
1489 {
// Skip forward to the '>' that closes the tag, or to a '<' that starts the next one. A '/'
// seen on the way marks the tag as self-closing.
1490 while (!src.isEmpty()) {
1491 UChar ch = *src;
1492 if (ch == '>' || ch == '<')
1493 break;
1494 if (ch == '/')
1495 m_currentToken.selfClosingTag = true;
1496 if (inViewSourceMode())
1497 m_currentToken.addViewSourceChar(ch);
1498 src.advance(m_lineNumber);
1499 }
// Ran out of input before the end of the tag: stay in SearchEnd and wait for more data.
1500 if (src.isEmpty())
1501 break;
1502
1503 searchCount = 0; // Stop looking for '<!--' sequence
1504 state.setTagState(NoTag);
1505 tquote = NoQuote;
1506
// Consume the closing '>'. A '<' is left in src so it starts the next tag.
1507 if (*src != '<')
1508 src.advance(m_lineNumber);
1509
1510 if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
1511 m_cBufferPos = cBufferPos;
1512 return state;
1513 }
1514
// Copy what is needed from m_currentToken before processToken() is called below.
// NOTE(review): presumably processToken() resets m_currentToken — confirm.
1515 AtomicString tagName = m_currentToken.tagName;
1516
1517 // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1518 // compatibility.
1519 bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
1520 bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
// For an opening <script> in normal parsing mode, resolve and remember the src URL. This
// happens only for non-fragment parsing when the frame can execute scripts.
1521 if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
1522 Attribute* a = 0;
1523 m_scriptTagSrcAttrValue = String();
1524 m_scriptTagCharsetAttrValue = String();
1525 if (m_currentToken.attrs && !m_fragment) {
1526 if (m_doc->frame() && m_doc->frame()->script()->canExecuteScripts()) {
1527 if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
1528 m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
1529 }
1530 }
1531 }
1532
1533 RefPtr<Node> n = processToken();
1534 m_cBufferPos = cBufferPos;
1535 if (n || inViewSourceMode()) {
// Snapshot the tokenizer position so the special-content handling below can be undone
// (see the retokenize block near the end of this branch).
1536 State savedState = state;
1537 SegmentedString savedSrc = src;
1538 long savedLineno = m_lineNumber;
1539 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1540 if (beginTag)
1541 state.setDiscardLF(true); // Discard the first LF after we open a pre.
1542 } else if (tagName == scriptTag) {
1543 ASSERT(!m_scriptNode);
1544 m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
1545 if (m_scriptNode)
1546 m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
// For script, style, textarea, title, xmp and iframe the element content is not tokenized as
// HTML: parseNonHTMLText() reads it up to m_searchStopper. NOTE(review): m_searchStopperLength
// is presumably the length of the stopper string (e.g. 8 for "</script") — the stopper
// constants are defined outside this chunk, confirm.
1547 if (beginTag) {
1548 m_searchStopper = scriptEnd;
1549 m_searchStopperLength = 8;
1550 state.setInScript(true);
1551 state = parseNonHTMLText(src, state);
1552 } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1553 state.setInScript(true);
1554 state = scriptHandler(state);
1555 }
1556 } else if (tagName == styleTag) {
1557 if (beginTag) {
1558 m_searchStopper = styleEnd;
1559 m_searchStopperLength = 7;
1560 state.setInStyle(true);
1561 state = parseNonHTMLText(src, state);
1562 }
1563 } else if (tagName == textareaTag) {
1564 if (beginTag) {
1565 m_searchStopper = textareaEnd;
1566 m_searchStopperLength = 10;
1567 state.setInTextArea(true);
1568 state = parseNonHTMLText(src, state);
1569 }
1570 } else if (tagName == titleTag) {
1571 if (beginTag) {
1572 m_searchStopper = titleEnd;
1573 m_searchStopperLength = 7;
1574 state.setInTitle(true);
1575 state = parseNonHTMLText(src, state);
1576 }
1577 } else if (tagName == xmpTag) {
1578 if (beginTag) {
1579 m_searchStopper = xmpEnd;
1580 m_searchStopperLength = 5;
1581 state.setInXmp(true);
1582 state = parseNonHTMLText(src, state);
1583 }
1584 } else if (tagName == iframeTag) {
1585 if (beginTag) {
1586 m_searchStopper = iframeEnd;
1587 m_searchStopperLength = 8;
1588 state.setInIFrame(true);
1589 state = parseNonHTMLText(src, state);
1590 }
1591 }
1592 if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
1593 // We just ate the rest of the document as the #text node under the special tag!
1594 // Reset the state then retokenize without special handling.
1595 // Let the parser clean up the missing close tag.
1596 // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1597 // at the end of the document unless m_noMoreData is also true. We need
1598 // to detect this case elsewhere, and save the state somewhere other
1599 // than a local variable.
1600 state = savedState;
1601 src = savedSrc;
1602 m_lineNumber = savedLineno;
1603 m_scriptCodeSize = 0;
1604 }
1605 }
// <plaintext> switches plain-text mode on for an opening tag and off otherwise.
1606 if (tagName == plaintextTag)
1607 state.setInPlainText(beginTag);
1608 return state; // Finished parsing tag!
1609 }
1610 } // end switch
1611 }
// Input exhausted in the middle of a tag: save the scratch position so the next call can resume.
1612 m_cBufferPos = cBufferPos;
1613 return state;
1614 }
1615
continueProcessing(int & processedCount,double startTime,State & state)1616 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1617 {
1618 // We don't want to be checking elapsed time with every character, so we only check after we've
1619 // processed a certain number of characters.
1620 bool allowedYield = state.allowYield();
1621 state.setAllowYield(false);
1622 if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
1623 processedCount = 0;
1624 if (currentTime() - startTime > m_tokenizerTimeDelay) {
1625 /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1626 load, but this hurts overall performance on slower machines. For now turn this
1627 off.
1628 || (!m_doc->haveStylesheetsLoaded() &&
1629 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1630 // Schedule the timer to keep processing as soon as possible.
1631 m_timer.startOneShot(0);
1632 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1633 if (currentTime() - startTime > m_tokenizerTimeDelay)
1634 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1635 #endif
1636 return false;
1637 }
1638 }
1639
1640 processedCount++;
1641 return true;
1642 }
1643
// Feeds str to the tokenizer.
//
// If a script is executing (and appendData is set) or external scripts are still pending, the
// data is only queued — in m_currentPrependingSrc if set, otherwise in m_pendingSrc — and is
// parsed later. Otherwise it is appended to m_src and the main tokenizing loop runs until m_src
// is empty, a location change is pending on the frame, or continueProcessing() asks to yield.
//
// str        - the new input.
// appendData - affects queueing while a script executes and whether the preload scanner is fed
//              or ended. NOTE(review): presumably true for data from the network and false for
//              script-written data — confirm against callers.
//
// May call end() at the bottom, which according to the comment there deletes this object.
write(const SegmentedString & str,bool appendData)1644 void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1645 {
// Without a buffer nothing can be tokenized (end() frees m_buffer and sets it to 0).
1646 if (!m_buffer)
1647 return;
1648
1649 if (m_parserStopped)
1650 return;
1651
1652 SegmentedString source(str);
// While a script is executing, the written text is marked so that it does not count towards
// line numbers (see the excludeLineNumbers() check in the newline branch below).
1653 if (m_executingScript)
1654 source.setExcludeLineNumbers();
1655
1656 if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
1657 // don't parse; we will do this later
1658 if (m_currentPrependingSrc)
1659 m_currentPrependingSrc->append(source);
1660 else {
1661 m_pendingSrc.append(source);
1662 #if PRELOAD_SCANNER_ENABLED
// Parsing is blocked, but a preload scanner that is already running still gets the data.
1663 if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1664 m_preloadScanner->write(source);
1665 #endif
1666 }
1667 return;
1668 }
1669
1670 #if PRELOAD_SCANNER_ENABLED
// Real parsing continues from here on, so a preload scanner still in progress is ended.
1671 if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1672 m_preloadScanner->end();
1673 #endif
1674
1675 if (!m_src.isEmpty())
1676 m_src.append(source);
1677 else
1678 setSrc(source);
1679
1680 // Once a timer is set, it has control of when the tokenizer continues.
1681 if (m_timer.isActive())
1682 return;
1683
// Remember the previous value of m_inWrite; it is restored on exit instead of being
// unconditionally cleared.
1684 bool wasInWrite = m_inWrite;
1685 m_inWrite = true;
1686
1687 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1688 if (!m_doc->ownerElement())
1689 printf("Beginning write at time %d\n", m_doc->elapsedTime());
1690 #endif
1691
1692 int processedCount = 0;
1693 double startTime = currentTime();
1694 #ifdef ANDROID_INSTRUMENT
1695 android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
1696 #endif
1697
1698 #if ENABLE(INSPECTOR)
1699 if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
1700 timelineAgent->willWriteHTML(source.length(), m_lineNumber);
1701 #endif
1702
1703 Frame* frame = m_doc->frame();
1704
// Work on a local copy of the tokenizer state; it is stored back into m_state after the loop.
1705 State state = m_state;
1706
1707 while (!m_src.isEmpty() && (!frame || !frame->redirectScheduler()->locationChangePending())) {
1708 if (!continueProcessing(processedCount, startTime, state))
1709 break;
1710
1711 // do we need to enlarge the buffer?
1712 checkBuffer();
1713
1714 UChar cc = *m_src;
1715
// skipLF is set after a '\r' (see the newline branch below); it makes the '\n' of a CRLF pair
// disappear. The flag only applies to the character directly following the '\r'.
1716 bool wasSkipLF = state.skipLF();
1717 if (wasSkipLF)
1718 state.setSkipLF(false);
1719
1720 if (wasSkipLF && (cc == '\n'))
1721 m_src.advance();
1722 else if (state.needsSpecialWriteHandling()) {
1723 // it's important to keep needsSpecialWriteHandling with the flags this block tests
// Resume whichever sub-parser was active when the input last ran out.
1724 if (state.hasEntityState())
1725 state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
1726 else if (state.inPlainText())
1727 state = parseText(m_src, state);
1728 else if (state.inAnyNonHTMLText())
1729 state = parseNonHTMLText(m_src, state);
1730 else if (state.inComment())
1731 state = parseComment(m_src, state);
1732 else if (state.inDoctype())
1733 state = parseDoctype(m_src, state);
1734 else if (state.inServer())
1735 state = parseServer(m_src, state);
1736 else if (state.inProcessingInstruction())
1737 state = parseProcessingInstruction(m_src, state);
1738 else if (state.hasTagState())
1739 state = parseTag(m_src, state);
1740 else if (state.startTag()) {
// A previous iteration consumed '<' and set startTag; cc is the character following it.
1741 state.setStartTag(false);
1742
1743 switch (cc) {
1744 case '/':
1745 break;
1746 case '!': {
1747 // <!-- comment --> or <!DOCTYPE ...>
1748 searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
1749 m_doctypeSearchCount = 1;
1750 break;
1751 }
1752 case '?': {
1753 // xml processing instruction
1754 state.setInProcessingInstruction(true);
1755 tquote = NoQuote;
1756 state = parseProcessingInstruction(m_src, state);
1757 continue;
1758
// NOTE: this break is unreachable because of the continue above.
1759 break;
1760 }
1761 case '%':
1762 if (!m_brokenServer) {
1763 // <% server stuff, handle as comment %>
1764 state.setInServer(true);
1765 tquote = NoQuote;
1766 state = parseServer(m_src, state);
1767 continue;
1768 }
1769 // else fall through
1770 default: {
1771 if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1772 // Start of a Start-Tag
1773 } else {
1774 // Invalid tag
1775 // Add as is
// The '<' was not the start of a tag: emit it as text. cc is not consumed, so the next
// iteration handles it as ordinary input (startTag has already been cleared).
1776 *m_dest = '<';
1777 m_dest++;
1778 continue;
1779 }
1780 }
1781 }; // end case
1782
// Flush whatever is pending as a token before the tag is parsed.
1783 processToken();
1784
1785 m_cBufferPos = 0;
1786 state.setTagState(TagName);
1787 state = parseTag(m_src, state);
1788 }
1789 } else if (cc == '&' && !m_src.escaped()) {
1790 m_src.advancePastNonNewline();
1791 state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
1792 } else if (cc == '<' && !m_src.escaped()) {
// Possible start of a tag: remember the line it starts on and look at the next character
// in the following iteration.
1793 m_currentTagStartLineNumber = m_lineNumber;
1794 m_src.advancePastNonNewline();
1795 state.setStartTag(true);
1796 state.setDiscardLF(false);
1797 } else if (cc == '\n' || cc == '\r') {
1798 if (state.discardLF())
1799 // Ignore this LF
1800 state.setDiscardLF(false); // We have discarded 1 LF
1801 else {
1802 // Process this LF
// Both '\r' and '\n' are written to the output as '\n'. The line count for a '\r' is bumped
// here; NOTE(review): presumably advance(m_lineNumber) below does the counting for '\n' —
// confirm against SegmentedString::advance().
1803 *m_dest++ = '\n';
1804 if (cc == '\r' && !m_src.excludeLineNumbers())
1805 m_lineNumber++;
1806 }
1807
1808 /* Check for MS-DOS CRLF sequence */
1809 if (cc == '\r')
1810 state.setSkipLF(true);
1811 m_src.advance(m_lineNumber);
1812 } else {
// Ordinary text character: copy it to the output buffer.
1813 state.setDiscardLF(false);
1814 *m_dest++ = cc;
1815 m_src.advancePastNonNewline();
1816 }
1817 }
1818
1819 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1820 if (!m_doc->ownerElement())
1821 printf("Ending write at time %d\n", m_doc->elapsedTime());
1822 #endif
1823
1824 #if ENABLE(INSPECTOR)
1825 if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
1826 timelineAgent->didWriteHTML(m_lineNumber);
1827 #endif
1828
1829 m_inWrite = wasInWrite;
1830
1831 m_state = state;
1832
1833 #ifdef ANDROID_INSTRUMENT
1834 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
1835 #endif
1836
// All data has arrived and nothing (outer write, external script, executing script, timer) is
// still pending: finish up. No member of this object is touched after this call.
1837 if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1838 end(); // this actually causes us to be deleted
1839
1840 // After parsing, go ahead and dispatch image beforeload events.
1841 ImageLoader::dispatchPendingBeforeLoadEvents();
1842 }
1843
stopParsing()1844 void HTMLTokenizer::stopParsing()
1845 {
1846 Tokenizer::stopParsing();
1847 m_timer.stop();
1848
1849 // The part needs to know that the tokenizer has finished with its data,
1850 // regardless of whether it happened naturally or due to manual intervention.
1851 if (!m_fragment && m_doc->frame())
1852 m_doc->frame()->loader()->tokenizerProcessedData();
1853 }
1854
processingData() const1855 bool HTMLTokenizer::processingData() const
1856 {
1857 return m_timer.isActive() || m_inWrite;
1858 }
1859
timerFired(Timer<HTMLTokenizer> *)1860 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1861 {
1862 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1863 if (!m_doc->ownerElement())
1864 printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1865 #endif
1866
1867 if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1868 // Restart the timer and let layout win. This is basically a way of ensuring that the layout
1869 // timer has higher priority than our timer.
1870 m_timer.startOneShot(0);
1871 return;
1872 }
1873
1874 // Invoke write() as though more data came in. This might cause us to get deleted.
1875 write(SegmentedString(), true);
1876 }
1877
end()1878 void HTMLTokenizer::end()
1879 {
1880 ASSERT(!m_timer.isActive());
1881 m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1882
1883 if (m_buffer) {
1884 // parseTag is using the buffer for different matters
1885 if (!m_state.hasTagState())
1886 processToken();
1887
1888 fastFree(m_scriptCode);
1889 m_scriptCode = 0;
1890 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1891
1892 fastFree(m_buffer);
1893 m_buffer = 0;
1894 }
1895
1896 if (!inViewSourceMode())
1897 m_parser->finished();
1898 else
1899 m_doc->finishedParsing();
1900 }
1901
finish()1902 void HTMLTokenizer::finish()
1903 {
1904 // do this as long as we don't find matching comment ends
1905 while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
1906 // we've found an unmatched comment start
1907 if (m_state.inComment())
1908 m_brokenComments = true;
1909 else
1910 m_brokenServer = true;
1911 checkScriptBuffer();
1912 m_scriptCode[m_scriptCodeSize] = 0;
1913 m_scriptCode[m_scriptCodeSize + 1] = 0;
1914 int pos;
1915 String food;
1916 if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
1917 food = String(m_scriptCode, m_scriptCodeSize);
1918 else if (m_state.inServer()) {
1919 food = "<";
1920 food.append(m_scriptCode, m_scriptCodeSize);
1921 } else {
1922 pos = find(m_scriptCode, m_scriptCodeSize, '>');
1923 food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
1924 }
1925 fastFree(m_scriptCode);
1926 m_scriptCode = 0;
1927 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1928 m_state.setInComment(false);
1929 m_state.setInServer(false);
1930 if (!food.isEmpty())
1931 write(food, true);
1932 }
1933 // this indicates we will not receive any more data... but if we are waiting on
1934 // an external script to load, we can't finish parsing until that is done
1935 m_noMoreData = true;
1936 if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1937 end(); // this actually causes us to be deleted
1938 }
1939
processToken()1940 PassRefPtr<Node> HTMLTokenizer::processToken()
1941 {
1942 ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
1943 if (scriptController && scriptController->canExecuteScripts())
1944 // FIXME: Why isn't this m_currentScriptTagStartLineNumber? I suspect this is wrong.
1945 scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
1946 if (m_dest > m_buffer) {
1947 m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
1948 if (m_currentToken.tagName != commentAtom)
1949 m_currentToken.tagName = textAtom;
1950 } else if (m_currentToken.tagName == nullAtom) {
1951 m_currentToken.reset();
1952 if (scriptController)
1953 scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
1954 return 0;
1955 }
1956
1957 m_dest = m_buffer;
1958
1959 RefPtr<Node> n;
1960
1961 if (!m_parserStopped) {
1962 if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
1963 map->shrinkToLength();
1964 if (inViewSourceMode())
1965 static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
1966 else
1967 // pass the token over to the parser, the parser DOES NOT delete the token
1968 n = m_parser->parseToken(&m_currentToken);
1969 }
1970 m_currentToken.reset();
1971 if (scriptController)
1972 scriptController->setEventHandlerLineNumber(0);
1973
1974 return n.release();
1975 }
1976
processDoctypeToken()1977 void HTMLTokenizer::processDoctypeToken()
1978 {
1979 if (inViewSourceMode())
1980 static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
1981 else
1982 m_parser->parseDoctypeToken(&m_doctypeToken);
1983 }
1984
~HTMLTokenizer()1985 HTMLTokenizer::~HTMLTokenizer()
1986 {
1987 ASSERT(!m_inWrite);
1988 reset();
1989 }
1990
1991
enlargeBuffer(int len)1992 void HTMLTokenizer::enlargeBuffer(int len)
1993 {
1994 // Resize policy: Always at least double the size of the buffer each time.
1995 int delta = max(len, m_bufferSize);
1996
1997 // Check for overflow.
1998 // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1999 static const int maxSize = INT_MAX / sizeof(UChar);
2000 if (delta > maxSize - m_bufferSize)
2001 CRASH();
2002
2003 int newSize = m_bufferSize + delta;
2004 int oldOffset = m_dest - m_buffer;
2005 m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
2006 m_dest = m_buffer + oldOffset;
2007 m_bufferSize = newSize;
2008 }
2009
enlargeScriptBuffer(int len)2010 void HTMLTokenizer::enlargeScriptBuffer(int len)
2011 {
2012 // Resize policy: Always at least double the size of the buffer each time.
2013 int delta = max(len, m_scriptCodeCapacity);
2014
2015 // Check for overflow.
2016 // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
2017 static const int maxSize = INT_MAX / sizeof(UChar);
2018 if (delta > maxSize - m_scriptCodeCapacity)
2019 CRASH();
2020
2021 int newSize = m_scriptCodeCapacity + delta;
2022 // If we allow fastRealloc(ptr, 0), it will call CRASH(). We run into this
2023 // case if the HTML being parsed begins with "<!--" and there's more data
2024 // coming.
2025 if (!newSize) {
2026 ASSERT(!m_scriptCode);
2027 return;
2028 }
2029
2030 m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
2031 m_scriptCodeCapacity = newSize;
2032 }
2033
executeScriptsWaitingForStylesheets()2034 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2035 {
2036 ASSERT(m_doc->haveStylesheetsLoaded());
2037
2038 if (m_hasScriptsWaitingForStylesheets)
2039 notifyFinished(0);
2040 }
2041
notifyFinished(CachedResource *)2042 void HTMLTokenizer::notifyFinished(CachedResource*)
2043 {
2044 executeExternalScriptsIfReady();
2045 }
2046
executeExternalScriptsIfReady()2047 void HTMLTokenizer::executeExternalScriptsIfReady()
2048 {
2049 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2050 if (!m_doc->ownerElement())
2051 printf("script loaded at %d\n", m_doc->elapsedTime());
2052 #endif
2053
2054 ASSERT(!m_pendingScripts.isEmpty());
2055
2056 // Make external scripts wait for external stylesheets.
2057 // FIXME: This needs to be done for inline scripts too.
2058 m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
2059 if (m_hasScriptsWaitingForStylesheets)
2060 return;
2061
2062 bool finished = false;
2063
2064 double startTime = currentTime();
2065 while (!finished && m_pendingScripts.first()->isLoaded()) {
2066 if (!continueExecutingExternalScripts(startTime))
2067 break;
2068
2069 CachedScript* cs = m_pendingScripts.first().get();
2070 m_pendingScripts.removeFirst();
2071 ASSERT(cache()->disabled() || cs->accessCount() > 0);
2072
2073 setSrc(SegmentedString());
2074
2075 // make sure we forget about the script before we execute the new one
2076 // infinite recursion might happen otherwise
2077 ScriptSourceCode sourceCode(cs);
2078 bool errorOccurred = cs->errorOccurred();
2079 cs->removeClient(this);
2080
2081 RefPtr<Node> n = m_scriptNode.release();
2082
2083 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2084 if (!m_doc->ownerElement())
2085 printf("external script beginning execution at %d\n", m_doc->elapsedTime());
2086 #endif
2087
2088 if (errorOccurred)
2089 n->dispatchEvent(Event::create(eventNames().errorEvent, true, false));
2090 else {
2091 if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
2092 m_state = scriptExecution(sourceCode, m_state);
2093 #if ENABLE(XHTMLMP)
2094 else
2095 m_doc->setShouldProcessNoscriptElement(true);
2096 #endif
2097 n->dispatchEvent(Event::create(eventNames().loadEvent, false, false));
2098 }
2099
2100 // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
2101 // call above, so test afterwards.
2102 finished = m_pendingScripts.isEmpty();
2103 if (finished) {
2104 ASSERT(!m_hasScriptsWaitingForStylesheets);
2105 m_state.setLoadingExtScript(false);
2106 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2107 if (!m_doc->ownerElement())
2108 printf("external script finished execution at %d\n", m_doc->elapsedTime());
2109 #endif
2110 } else if (m_hasScriptsWaitingForStylesheets) {
2111 // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
2112 // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
2113 finished = true;
2114 }
2115
2116 // 'm_requestingScript' is true when we are called synchronously from
2117 // scriptHandler(). In that case scriptHandler() will take care
2118 // of m_pendingSrc.
2119 if (!m_requestingScript) {
2120 SegmentedString rest = m_pendingSrc;
2121 m_pendingSrc.clear();
2122 write(rest, false);
2123 // we might be deleted at this point, do not access any members.
2124 }
2125 }
2126 }
2127
executeExternalScriptsTimerFired(Timer<HTMLTokenizer> *)2128 void HTMLTokenizer::executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*)
2129 {
2130 if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
2131 // Restart the timer and do layout first.
2132 m_externalScriptsTimer.startOneShot(0);
2133 return;
2134 }
2135
2136 // Continue executing external scripts.
2137 executeExternalScriptsIfReady();
2138 }
2139
continueExecutingExternalScripts(double startTime)2140 bool HTMLTokenizer::continueExecutingExternalScripts(double startTime)
2141 {
2142 if (m_externalScriptsTimer.isActive())
2143 return false;
2144
2145 if (currentTime() - startTime > m_tokenizerTimeDelay) {
2146 // Schedule the timer to keep processing as soon as possible.
2147 m_externalScriptsTimer.startOneShot(0);
2148 return false;
2149 }
2150 return true;
2151 }
2152
isWaitingForScripts() const2153 bool HTMLTokenizer::isWaitingForScripts() const
2154 {
2155 return m_state.loadingExtScript();
2156 }
2157
setSrc(const SegmentedString & source)2158 void HTMLTokenizer::setSrc(const SegmentedString& source)
2159 {
2160 m_src = source;
2161 }
2162
parseHTMLDocumentFragment(const String & source,DocumentFragment * fragment,FragmentScriptingPermission scriptingPermission)2163 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission)
2164 {
2165 HTMLTokenizer tok(fragment, scriptingPermission);
2166 tok.setForceSynchronous(true);
2167 tok.write(source, true);
2168 tok.finish();
2169 ASSERT(!tok.processingData()); // make sure we're done (see 3963151)
2170 }
2171
decodeNamedEntity(const char * name)2172 UChar decodeNamedEntity(const char* name)
2173 {
2174 const Entity* e = findEntity(name, strlen(name));
2175 return e ? e->code : 0;
2176 }
2177
2178 }
2179