• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     Copyright (C) 1997 Martin Jones (mjones@kde.org)
3               (C) 1997 Torben Weis (weis@kde.org)
4               (C) 1998 Waldo Bastian (bastian@kde.org)
5               (C) 1999 Lars Knoll (knoll@kde.org)
6               (C) 1999 Antti Koivisto (koivisto@kde.org)
7               (C) 2001 Dirk Mueller (mueller@kde.org)
8     Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
9     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10     Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
11 
12     This library is free software; you can redistribute it and/or
13     modify it under the terms of the GNU Library General Public
14     License as published by the Free Software Foundation; either
15     version 2 of the License, or (at your option) any later version.
16 
17     This library is distributed in the hope that it will be useful,
18     but WITHOUT ANY WARRANTY; without even the implied warranty of
19     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20     Library General Public License for more details.
21 
22     You should have received a copy of the GNU Library General Public License
23     along with this library; see the file COPYING.LIB.  If not, write to
24     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25     Boston, MA 02110-1301, USA.
26 */
27 
28 #include "config.h"
29 #include "HTMLTokenizer.h"
30 
31 #include "CSSHelper.h"
32 #include "Cache.h"
33 #include "CachedScript.h"
34 #include "DocLoader.h"
35 #include "DocumentFragment.h"
36 #include "Event.h"
37 #include "EventNames.h"
38 #include "Frame.h"
39 #include "FrameLoader.h"
40 #include "FrameView.h"
41 #include "HTMLElement.h"
42 #include "HTMLNames.h"
43 #include "HTMLParser.h"
44 #include "HTMLScriptElement.h"
45 #include "HTMLViewSourceDocument.h"
46 #include "ImageLoader.h"
47 #include "InspectorTimelineAgent.h"
48 #include "MappedAttribute.h"
49 #include "Page.h"
50 #include "PreloadScanner.h"
51 #include "ScriptController.h"
52 #include "ScriptSourceCode.h"
53 #include "ScriptValue.h"
54 #include "XSSAuditor.h"
55 #include <wtf/ASCIICType.h>
56 #include <wtf/CurrentTime.h>
57 
58 #include "HTMLEntityNames.c"
59 
60 #ifdef ANDROID_INSTRUMENT
61 #include "TimeCounter.h"
62 #endif
63 
// Compile-time switch for the PreloadScanner code paths that this file wraps in
// "#if PRELOAD_SCANNER_ENABLED" (see scriptHandler() and scriptExecution()).
64 #define PRELOAD_SCANNER_ENABLED 1
// Debug aid, disabled by default: when defined, the "#ifdef INSTRUMENT_LAYOUT_SCHEDULING"
// sections in this file printf() the document's elapsed time around script requests/execution.
65 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
66 
67 using namespace WTF;
68 using namespace std;
69 
70 namespace WebCore {
71 
72 using namespace HTMLNames;
73 
// Default tokenizer pacing. begin() uses these values unless the Page supplies custom ones.
//
// defaultTokenizerChunkSize: how many characters the tokenizer processes before yielding control.
// defaultTokenizerTimeDelay: tokenizer time delay, in seconds.
#if MOBILE
// A mobile device needs to stay responsive, so the chunk size is reduced. Because the chunks
// are smaller, the tokenizer should not yield for as long a period either; otherwise it would
// take way too long to load a page.
static const int defaultTokenizerChunkSize = 256;
static const double defaultTokenizerTimeDelay = 0.300;
#else
static const int defaultTokenizerChunkSize = 4096;
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
static const double defaultTokenizerTimeDelay = 0.500;
#endif
93 
// Lower-case markup prefixes the tokenizer searches for.
static const char commentStart[] = "<!--";
static const char doctypeStart[] = "<!doctype";
static const char publicStart[] = "public";
static const char systemStart[] = "system";

// Closing-tag prefixes (without the trailing '>') of the elements whose content is raw text.
static const char scriptEnd[] = "</script";
static const char xmpEnd[] = "</xmp";
static const char styleEnd[] = "</style";
static const char textareaEnd[] = "</textarea";
static const char titleEnd[] = "</title";
static const char iframeEnd[] = "</iframe";
104 
105 // Full support for MS Windows extensions to Latin-1.
106 // Technically these extensions should only be activated for pages
107 // marked "windows-1252" or "cp1252", but
108 // in the standard Microsoft way, these extensions infect hundreds of thousands
109 // of web pages.  Note that people with non-latin-1 Microsoft extensions
110 // are SOL.
111 //
112 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
113 //      http://www.bbsinc.com/iso8859.html
114 //      http://www.obviously.com/
115 //
116 // There may be better equivalents
117 
118 // We only need this for entities. For non-entity text, we handle this in the text encoding.
119 
120 static const UChar windowsLatin1ExtensionArray[32] = {
121     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
122     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
123     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
124     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
125 };
126 
fixUpChar(UChar c)127 static inline UChar fixUpChar(UChar c)
128 {
129     if ((c & ~0x1F) != 0x0080)
130         return c;
131     return windowsLatin1ExtensionArray[c - 0x80];
132 }
133 
tagMatch(const char * s1,const UChar * s2,unsigned length)134 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
135 {
136     for (unsigned i = 0; i != length; ++i) {
137         unsigned char c1 = s1[i];
138         unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
139         UChar c2 = s2[i];
140         if (c1 != c2 && uc1 != c2)
141             return false;
142     }
143     return true;
144 }
145 
addAttribute(AtomicString & attrName,const AtomicString & attributeValue,bool viewSourceMode)146 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)
147 {
148     if (!attrName.isEmpty()) {
149         ASSERT(!attrName.contains('/'));
150         RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);
151         if (!attrs) {
152             attrs = NamedMappedAttrMap::create();
153             attrs->reserveInitialCapacity(10);
154         }
155         attrs->insertAttribute(a.release(), viewSourceMode);
156     }
157 
158     attrName = emptyAtom;
159 }
160 
161 // ----------------------------------------------------------------------------
162 
// Constructs a tokenizer for an HTML document.
// Creates an HTMLParser for |doc| (forwarding |reportErrors| to it), starts with no buffers
// allocated, leaves fragment mode off, and finishes by calling begin().
HTMLTokenizer(HTMLDocument * doc,bool reportErrors)163 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
164     : Tokenizer()
165     , m_buffer(0)
166     , m_scriptCode(0)
167     , m_scriptCodeSize(0)
168     , m_scriptCodeCapacity(0)
169     , m_scriptCodeResync(0)
170     , m_executingScript(0)
171     , m_requestingScript(false)
172     , m_hasScriptsWaitingForStylesheets(false)
173     , m_timer(this, &HTMLTokenizer::timerFired)
174     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
175     , m_doc(doc)
176     , m_parser(new HTMLParser(doc, reportErrors))
177     , m_inWrite(false)
178     , m_fragment(false)
179     , m_scriptingPermission(FragmentScriptingAllowed)
180 {
        // begin() calls reset() and then allocates the initial output buffer.
181     begin();
182 }
183 
// Constructs a tokenizer for a view-source document.
// Unlike the other constructors, no HTMLParser is created (m_parser stays 0) and the
// Tokenizer base is constructed with 'true'.
// NOTE(review): 'true' presumably selects view-source mode in Tokenizer - confirm against Tokenizer.h.
HTMLTokenizer(HTMLViewSourceDocument * doc)184 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
185     : Tokenizer(true)
186     , m_buffer(0)
187     , m_scriptCode(0)
188     , m_scriptCodeSize(0)
189     , m_scriptCodeCapacity(0)
190     , m_scriptCodeResync(0)
191     , m_executingScript(0)
192     , m_requestingScript(false)
193     , m_hasScriptsWaitingForStylesheets(false)
194     , m_timer(this, &HTMLTokenizer::timerFired)
195     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
196     , m_doc(doc)
197     , m_parser(0)
198     , m_inWrite(false)
199     , m_fragment(false)
200     , m_scriptingPermission(FragmentScriptingAllowed)
201 {
        // begin() calls reset() and then allocates the initial output buffer.
202     begin();
203 }
204 
// Constructs a tokenizer that parses into a DocumentFragment.
// The owning document is taken from |frag|, the HTMLParser is created for the fragment with
// the given |scriptingPermission|, and m_fragment is set so that scriptHandler() and
// scriptExecution() never execute scripts for this tokenizer.
HTMLTokenizer(DocumentFragment * frag,FragmentScriptingPermission scriptingPermission)205 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag, FragmentScriptingPermission scriptingPermission)
206     : m_buffer(0)
207     , m_scriptCode(0)
208     , m_scriptCodeSize(0)
209     , m_scriptCodeCapacity(0)
210     , m_scriptCodeResync(0)
211     , m_executingScript(0)
212     , m_requestingScript(false)
213     , m_hasScriptsWaitingForStylesheets(false)
214     , m_timer(this, &HTMLTokenizer::timerFired)
215     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
216     , m_doc(frag->document())
217     , m_parser(new HTMLParser(frag, scriptingPermission))
218     , m_inWrite(false)
219     , m_fragment(true)
220     , m_scriptingPermission(scriptingPermission)
221 {
        // begin() calls reset() and then allocates the initial output buffer.
222     begin();
223 }
224 
reset()225 void HTMLTokenizer::reset()
226 {
227     ASSERT(m_executingScript == 0);
228 
229     while (!m_pendingScripts.isEmpty()) {
230         CachedScript* cs = m_pendingScripts.first().get();
231         m_pendingScripts.removeFirst();
232         ASSERT(cache()->disabled() || cs->accessCount() > 0);
233         cs->removeClient(this);
234     }
235 
236     fastFree(m_buffer);
237     m_buffer = m_dest = 0;
238     m_bufferSize = 0;
239 
240     fastFree(m_scriptCode);
241     m_scriptCode = 0;
242     m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
243 
244     m_timer.stop();
245     m_externalScriptsTimer.stop();
246 
247     m_state.setAllowYield(false);
248     m_state.setForceSynchronous(false);
249 
250     m_currentToken.reset();
251     m_doctypeToken.reset();
252     m_doctypeSearchCount = 0;
253     m_doctypeSecondarySearchCount = 0;
254     m_hasScriptsWaitingForStylesheets = false;
255 }
256 
begin()257 void HTMLTokenizer::begin()
258 {
259     m_executingScript = 0;
260     m_requestingScript = false;
261     m_hasScriptsWaitingForStylesheets = false;
262     m_state.setLoadingExtScript(false);
263     reset();
264     m_bufferSize = 254;
265     m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
266     m_dest = m_buffer;
267     tquote = NoQuote;
268     searchCount = 0;
269     m_state.setEntityState(NoEntity);
270     m_scriptTagSrcAttrValue = String();
271     m_pendingSrc.clear();
272     m_currentPrependingSrc = 0;
273     m_noMoreData = false;
274     m_brokenComments = false;
275     m_brokenServer = false;
276     m_lineNumber = 0;
277     m_currentScriptTagStartLineNumber = 0;
278     m_currentTagStartLineNumber = 0;
279     m_state.setForceSynchronous(false);
280 
281     Page* page = m_doc->page();
282     if (page && page->hasCustomHTMLTokenizerTimeDelay())
283         m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();
284     else
285         m_tokenizerTimeDelay = defaultTokenizerTimeDelay;
286 
287     if (page && page->hasCustomHTMLTokenizerChunkSize())
288         m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();
289     else
290         m_tokenizerChunkSize = defaultTokenizerChunkSize;
291 }
292 
// Stores |force| as the force-synchronous flag of the tokenizer state (m_state).
// Both reset() and begin() clear this flag again.
setForceSynchronous(bool force)293 void HTMLTokenizer::setForceSynchronous(bool force)
294 {
295     m_state.setForceSynchronous(force);
296 }
297 
processListing(SegmentedString list,State state)298 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
299 {
300     // This function adds the listing 'list' as
301     // preformatted text-tokens to the token-collection
302     while (!list.isEmpty()) {
303         if (state.skipLF()) {
304             state.setSkipLF(false);
305             if (*list == '\n') {
306                 list.advance();
307                 continue;
308             }
309         }
310 
311         checkBuffer();
312 
313         if (*list == '\n' || *list == '\r') {
314             if (state.discardLF())
315                 // Ignore this LF
316                 state.setDiscardLF(false); // We have discarded 1 LF
317             else
318                 *m_dest++ = '\n';
319 
320             /* Check for MS-DOS CRLF sequence */
321             if (*list == '\r')
322                 state.setSkipLF(true);
323 
324             list.advance();
325         } else {
326             state.setDiscardLF(false);
327             *m_dest++ = *list;
328             list.advance();
329         }
330     }
331 
332     return state;
333 }
334 
// Tokenizes the raw-text content of a <script>, <style>, <textarea>, <title>, <xmp> or <iframe>
// element (exactly one of those state flags is set, see the third ASSERT).
//
// Characters from |src| are accumulated in m_scriptCode until the text stored in
// m_searchStopper (compared with tagMatch()) is found. From then on m_scriptCodeResync is
// non-zero and the function only waits for the unquoted '>' that finishes the closing tag.
// At that point the accumulated text is passed to scriptHandler() (scripts) or emitted via
// processListing()/processToken() followed by the matching end-tag token (everything else).
//
// Returns the updated state. Returns early once the closing tag has been handled; otherwise
// runs until |src| is exhausted and keeps its progress in member variables.
parseNonHTMLText(SegmentedString & src,State state)335 HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state)
336 {
337     ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
338     ASSERT(!state.hasTagState());
339     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1);
        // Remember the line on which this script's content starts; only set while the value is still 0.
340     if (state.inScript() && !m_currentScriptTagStartLineNumber)
341         m_currentScriptTagStartLineNumber = m_lineNumber;
342 
        // A comment was already open when we got here: continue parsing it first.
343     if (state.inComment())
344         state = parseComment(src, state);
345 
        // Offset into m_scriptCode recorded after a parseEntity() call that left the destination
        // pointer unchanged (see below). The comment-start and end-tag checks only fire when
        // their match begins after this offset.
346     int lastDecodedEntityPosition = -1;
347     while (!src.isEmpty()) {
348         checkScriptBuffer();
349         UChar ch = *src;
350 
            // "<!--" detection: the buffer ends in "<!-" and the current character is '-'.
            // Not done inside <xmp>, when comments are flagged as broken, or after the end tag matched.
351         if (!m_scriptCodeResync && !m_brokenComments &&
352             !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&
353             m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&
354             (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
355             state.setInComment(true);
356             state = parseComment(src, state);
357             continue;
358         }
            // The closing tag was matched earlier (m_scriptCodeResync != 0) and this is its
            // unquoted terminating '>': finish the element.
359         if (m_scriptCodeResync && !tquote && ch == '>') {
360             src.advancePastNonNewline();
                // m_scriptCodeResync holds (offset of the closing tag) + 1, so this truncates the
                // buffer to the text that preceded the closing tag.
361             m_scriptCodeSize = m_scriptCodeResync - 1;
362             m_scriptCodeResync = 0;
363             m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
364             if (state.inScript())
365                 state = scriptHandler(state);
366             else {
                    // Emit the text content, then build and emit the end-tag token for whichever element we are in.
367                 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
368                 processToken();
369                 if (state.inStyle()) {
370                     m_currentToken.tagName = styleTag.localName();
371                     m_currentToken.beginTag = false;
372                 } else if (state.inTextArea()) {
373                     m_currentToken.tagName = textareaTag.localName();
374                     m_currentToken.beginTag = false;
375                 } else if (state.inTitle()) {
376                     m_currentToken.tagName = titleTag.localName();
377                     m_currentToken.beginTag = false;
378                 } else if (state.inXmp()) {
379                     m_currentToken.tagName = xmpTag.localName();
380                     m_currentToken.beginTag = false;
381                 } else if (state.inIFrame()) {
382                     m_currentToken.tagName = iframeTag.localName();
383                     m_currentToken.beginTag = false;
384                 }
385                 processToken();
                    // Leave raw-text mode entirely and reset the scan state.
386                 state.setInStyle(false);
387                 state.setInScript(false);
388                 state.setInTextArea(false);
389                 state.setInTitle(false);
390                 state.setInXmp(false);
391                 state.setInIFrame(false);
392                 tquote = NoQuote;
393                 m_scriptCodeSize = m_scriptCodeResync = 0;
394             }
395             return state;
396         }
397         // possible end of tagname, lets check.
398         if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
399              m_scriptCodeSize >= m_searchStopperLength &&
400              tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
401              (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
                // Store (offset of the matched closing tag) + 1 so that 0 keeps meaning "no match yet".
                // The current character is not consumed; the next iteration handles it in resync mode.
402             m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
403             tquote = NoQuote;
404             continue;
405         }
            // While scanning the remainder of the closing tag, track quoting so that a quoted '>'
            // does not end the tag. A line break inside a quote cancels the quote.
406         if (m_scriptCodeResync && !state.escaped()) {
407             if (ch == '\"')
408                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
409             else if (ch == '\'')
410                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
411             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
412                 tquote = NoQuote;
413         }
            // A backslash escapes the next character unless it is itself escaped.
414         state.setEscaped(!state.escaped() && ch == '\\');
            // Entities are only decoded inside <textarea>, <title> and <iframe>.
415         if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
416             UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
417             src.advancePastNonNewline();
418             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
                // parseEntity() advances scriptCodeDest past whatever it wrote. If it wrote nothing,
                // record the current offset; otherwise adopt the new buffer length.
419             if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
420                 lastDecodedEntityPosition = m_scriptCodeSize;
421             else
422                 m_scriptCodeSize = scriptCodeDest - m_scriptCode;
423         } else {
424             m_scriptCode[m_scriptCodeSize++] = ch;
425             src.advance(m_lineNumber);
426         }
427     }
428 
429     return state;
430 }
431 
// Finishes a <script> element whose closing tag has just been consumed by parseNonHTMLText().
//
// Steps, in order:
//  1. Outside view-source mode, either request the external script named by the src attribute
//     (queued in m_pendingScripts) or decide whether the inline text should be executed.
//  2. Emit the buffered script text and the </script> end-tag token.
//  3. For an external script, park the remaining input and register as a client of the
//     CachedScript; for an executable inline script, park the input and run it via scriptExecution().
//  4. Put parked / prepended input back in the right place and, while still waiting for a
//     script, feed the parked input to the preload scanner.
//
// m_state is written back before, and re-read after, each call that may re-enter the tokenizer.
// Returns the updated state.
scriptHandler(State state)432 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
433 {
434     // We are inside a <script>
435     bool doScriptExec = false;
436     int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenzier line numbers are 0 based
437 
438     // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
439     m_currentScriptTagStartLineNumber = 0;
440 
441     // (Bugzilla 3837) Scripts following a frameset element should not execute or,
442     // in the case of extern scripts, even load.
443     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
444 
        // Non-null only when an external script was successfully requested below.
445     CachedScript* cs = 0;
446     // don't load external scripts for standalone documents (for now)
447     if (!inViewSourceMode()) {
448         if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
449             // forget what we just got; load from src url instead
450             if (!m_parser->skipMode() && !followingFrameset) {
451 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
452                 if (!m_doc->ownerElement())
453                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
454 #endif
455                 // The parser might have been stopped by for example a window.close call in an earlier script.
456                 // If so, we don't want to load scripts.
                    // The request is only made when the beforeload dispatch returns true; if any step
                    // fails, the script node reference is dropped instead.
457                 if (!m_parserStopped && m_scriptNode->dispatchBeforeLoadEvent(m_scriptTagSrcAttrValue) &&
458                     (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
459                     m_pendingScripts.append(cs);
460                 else
461                     m_scriptNode = 0;
462             } else
463                 m_scriptNode = 0;
464             m_scriptTagSrcAttrValue = String();
465         } else {
466             // Parse m_scriptCode containing <script> info
467             doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
468 #if ENABLE(XHTMLMP)
469             if (!doScriptExec)
470                 m_doc->setShouldProcessNoscriptElement(true);
471 #endif
472             m_scriptNode = 0;
473         }
474     }
475 
        // Emit the buffered script text as a token; |node| is what processToken() returned for it.
476     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
477     RefPtr<Node> node = processToken();
478 
        // Fragment parsing with scripting disallowed: remove the node that was just created.
479     if (node && m_scriptingPermission == FragmentScriptingNotAllowed) {
480         ExceptionCode ec;
481         node->remove(ec);
482         node = 0;
483     }
484 
        // The inline source to execute is the text content of that node (empty if there is none).
485     String scriptString = node ? node->textContent() : "";
        // Emit the </script> end-tag token.
486     m_currentToken.tagName = scriptTag.localName();
487     m_currentToken.beginTag = false;
488     processToken();
489 
490     state.setInScript(false);
491     m_scriptCodeSize = m_scriptCodeResync = 0;
492 
493     // FIXME: The script should be syntax highlighted.
494     if (inViewSourceMode())
495         return state;
496 
        // Point m_currentPrependingSrc at a local for the rest of this call; the previous value is
        // restored before returning.
        // NOTE(review): presumably write() appends to m_currentPrependingSrc during script execution - confirm.
497     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
498     SegmentedString prependingSrc;
499     m_currentPrependingSrc = &prependingSrc;
500 
501 #ifdef ANDROID_INSTRUMENT
502     android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
503 #endif
504 
505     if (!m_parser->skipMode() && !followingFrameset) {
506         if (cs) {
                // External script: park the rest of the input behind whatever is already pending.
507             if (savedPrependingSrc)
508                 savedPrependingSrc->append(m_src);
509             else
510                 m_pendingSrc.prepend(m_src);
511             setSrc(SegmentedString());
512 
513             // the ref() call below may call notifyFinished if the script is already in cache,
514             // and that mucks with the state directly, so we must write it back to the object.
515             m_state = state;
516             bool savedRequestingScript = m_requestingScript;
517             m_requestingScript = true;
518             cs->addClient(this);
519             m_requestingScript = savedRequestingScript;
520             state = m_state;
521             // will be 0 if script was already loaded and ref() executed it
522             if (!m_pendingScripts.isEmpty())
523                 state.setLoadingExtScript(true);
524         } else if (!m_fragment && doScriptExec) {
                // Inline script: park the rest of the input, then execute the script text now.
525             if (!m_executingScript)
526                 m_pendingSrc.prepend(m_src);
527             else
528                 prependingSrc = m_src;
529             setSrc(SegmentedString());
530             state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
531         }
532     }
533 
534 #ifdef ANDROID_INSTRUMENT
535     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
536 #endif
537 
        // Nothing is blocking any more: move the parked input back into m_src.
538     if (!m_executingScript && !state.loadingExtScript()) {
539         m_src.append(m_pendingSrc);
540         m_pendingSrc.clear();
541     } else if (!prependingSrc.isEmpty()) {
542         // restore first so that the write appends in the right place
543         // (does not hurt to do it again below)
544         m_currentPrependingSrc = savedPrependingSrc;
545 
546         // we need to do this slightly modified bit of one of the write() cases
547         // because we want to prepend to m_pendingSrc rather than appending
548         // if there's no previous prependingSrc
549         if (!m_pendingScripts.isEmpty()) {
550             if (m_currentPrependingSrc)
551                 m_currentPrependingSrc->append(prependingSrc);
552             else
553                 m_pendingSrc.prepend(prependingSrc);
554         } else {
555             m_state = state;
556             write(prependingSrc, false);
557             state = m_state;
558         }
559     }
560 
561 #if PRELOAD_SCANNER_ENABLED
        // Still waiting for an external script and not inside a running script: create the preload
        // scanner on first use and, unless a scan is already in progress, hand it the parked input.
562     if (!m_pendingScripts.isEmpty() && !m_executingScript) {
563         if (!m_preloadScanner)
564             m_preloadScanner.set(new PreloadScanner(m_doc));
565         if (!m_preloadScanner->inProgress()) {
566             m_preloadScanner->begin();
567             m_preloadScanner->write(m_pendingSrc);
568         }
569     }
570 #endif
571     m_currentPrependingSrc = savedPrependingSrc;
572 
573     return state;
574 }
575 
// Executes |sourceCode| through the ScriptController of the document's frame and then puts any
// source collected in the local prependingSrc during the run back into the input stream.
//
// Does nothing (returns |state| unchanged) for fragment tokenizers and for documents without a
// frame. m_executingScript is incremented for the duration of the call, so it counts nested
// executions. m_state is written back before, and re-read after, the calls that may re-enter
// the tokenizer. The returned state always has allowYield set.
scriptExecution(const ScriptSourceCode & sourceCode,State state)576 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
577 {
578     if (m_fragment || !m_doc->frame())
579         return state;
580     m_executingScript++;
581 
        // Point m_currentPrependingSrc at a local for the duration of the script; the previous value
        // is restored before returning.
582     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
583     SegmentedString prependingSrc;
584     m_currentPrependingSrc = &prependingSrc;
585 
586 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
587     if (!m_doc->ownerElement())
588         printf("beginning script execution at %d\n", m_doc->elapsedTime());
589 #endif
590 
        // Publish the local state before running the script and pick up any changes afterwards.
591     m_state = state;
592     m_doc->frame()->script()->executeScript(sourceCode);
593     state = m_state;
594 
595     state.setAllowYield(true);
596 
597 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
598     if (!m_doc->ownerElement())
599         printf("ending script execution at %d\n", m_doc->elapsedTime());
600 #endif
601 
602     m_executingScript--;
603 
        // Outermost script finished and no external script is loading: the collected source goes
        // in front of the parked input, and everything is appended back onto m_src.
604     if (!m_executingScript && !state.loadingExtScript()) {
605         m_pendingSrc.prepend(prependingSrc);
606         m_src.append(m_pendingSrc);
607         m_pendingSrc.clear();
608     } else if (!prependingSrc.isEmpty()) {
609         // restore first so that the write appends in the right place
610         // (does not hurt to do it again below)
611         m_currentPrependingSrc = savedPrependingSrc;
612 
613         // we need to do this slightly modified bit of one of the write() cases
614         // because we want to prepend to m_pendingSrc rather than appending
615         // if there's no previous prependingSrc
616         if (!m_pendingScripts.isEmpty()) {
617             if (m_currentPrependingSrc)
618                 m_currentPrependingSrc->append(prependingSrc);
619             else
620                 m_pendingSrc.prepend(prependingSrc);
621 
622 #if PRELOAD_SCANNER_ENABLED
623             // We are stuck waiting for another script. Lets check the source that
624             // was just document.write()n for anything to load.
625             PreloadScanner documentWritePreloadScanner(m_doc);
626             documentWritePreloadScanner.begin();
627             documentWritePreloadScanner.write(prependingSrc);
628             documentWritePreloadScanner.end();
629 #endif
630         } else {
                // No script is pending: tokenize the collected source right away.
631             m_state = state;
632             write(prependingSrc, false);
633             state = m_state;
634         }
635     }
636 
637     m_currentPrependingSrc = savedPrependingSrc;
638 
639     return state;
640 }
641 
parseComment(SegmentedString & src,State state)642 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
643 {
644     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
645     checkScriptBuffer(src.length());
646     while (!src.isEmpty()) {
647         UChar ch = *src;
648         m_scriptCode[m_scriptCodeSize++] = ch;
649         if (ch == '>') {
650             bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());
651             int endCharsCount = 1; // start off with one for the '>' character
652             if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {
653                 endCharsCount = 3;
654             } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&
655                 m_scriptCode[m_scriptCodeSize-2] == '!') {
656                 // Other browsers will accept --!> as a close comment, even though it's
657                 // not technically valid.
658                 endCharsCount = 4;
659             }
660             if (handleBrokenComments || endCharsCount > 1) {
661                 src.advancePastNonNewline();
662                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
663                     checkScriptBuffer();
664                     m_scriptCode[m_scriptCodeSize] = 0;
665                     m_scriptCode[m_scriptCodeSize + 1] = 0;
666                     m_currentToken.tagName = commentAtom;
667                     m_currentToken.beginTag = true;
668                     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
669                     processToken();
670                     m_currentToken.tagName = commentAtom;
671                     m_currentToken.beginTag = false;
672                     processToken();
673                     m_scriptCodeSize = 0;
674                 }
675                 state.setInComment(false);
676                 return state; // Finished parsing comment
677             }
678         }
679         src.advance(m_lineNumber);
680     }
681 
682     return state;
683 }
684 
parseServer(SegmentedString & src,State state)685 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
686 {
687     checkScriptBuffer(src.length());
688     while (!src.isEmpty()) {
689         UChar ch = *src;
690         m_scriptCode[m_scriptCodeSize++] = ch;
691         if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {
692             src.advancePastNonNewline();
693             state.setInServer(false);
694             m_scriptCodeSize = 0;
695             return state; // Finished parsing server include
696         }
697         src.advance(m_lineNumber);
698     }
699     return state;
700 }
701 
parseProcessingInstruction(SegmentedString & src,State state)702 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)
703 {
704     UChar oldchar = 0;
705     while (!src.isEmpty()) {
706         UChar chbegin = *src;
707         if (chbegin == '\'')
708             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
709         else if (chbegin == '\"')
710             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
711         // Look for '?>'
712         // Some crappy sites omit the "?" before it, so
713         // we look for an unquoted '>' instead. (IE compatible)
714         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
715             // We got a '?>' sequence
716             state.setInProcessingInstruction(false);
717             src.advancePastNonNewline();
718             state.setDiscardLF(true);
719             return state; // Finished parsing comment!
720         }
721         src.advance(m_lineNumber);
722         oldchar = chbegin;
723     }
724 
725     return state;
726 }
727 
parseText(SegmentedString & src,State state)728 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
729 {
730     while (!src.isEmpty()) {
731         UChar cc = *src;
732 
733         if (state.skipLF()) {
734             state.setSkipLF(false);
735             if (cc == '\n') {
736                 src.advancePastNewline(m_lineNumber);
737                 continue;
738             }
739         }
740 
741         // do we need to enlarge the buffer?
742         checkBuffer();
743 
744         if (cc == '\r') {
745             state.setSkipLF(true);
746             *m_dest++ = '\n';
747         } else
748             *m_dest++ = cc;
749         src.advance(m_lineNumber);
750     }
751 
752     return state;
753 }
754 
755 
parseEntity(SegmentedString & src,UChar * & dest,State state,unsigned & cBufferPos,bool start,bool parsingTag)756 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
757 {
758     if (start) {
759         cBufferPos = 0;
760         state.setEntityState(SearchEntity);
761         EntityUnicodeValue = 0;
762     }
763 
764     while (!src.isEmpty()) {
765         UChar cc = *src;
766         switch (state.entityState()) {
767         case NoEntity:
768             ASSERT(state.entityState() != NoEntity);
769             return state;
770 
771         case SearchEntity:
772             if (cc == '#') {
773                 m_cBuffer[cBufferPos++] = cc;
774                 src.advancePastNonNewline();
775                 state.setEntityState(NumericSearch);
776             } else
777                 state.setEntityState(EntityName);
778             break;
779 
780         case NumericSearch:
781             if (cc == 'x' || cc == 'X') {
782                 m_cBuffer[cBufferPos++] = cc;
783                 src.advancePastNonNewline();
784                 state.setEntityState(Hexadecimal);
785             } else if (cc >= '0' && cc <= '9')
786                 state.setEntityState(Decimal);
787             else
788                 state.setEntityState(SearchSemicolon);
789             break;
790 
791         case Hexadecimal: {
792             int ll = min(src.length(), 10 - cBufferPos);
793             while (ll--) {
794                 cc = *src;
795                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
796                     state.setEntityState(SearchSemicolon);
797                     break;
798                 }
799                 int digit;
800                 if (cc < 'A')
801                     digit = cc - '0';
802                 else
803                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
804                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
805                 m_cBuffer[cBufferPos++] = cc;
806                 src.advancePastNonNewline();
807             }
808             if (cBufferPos == 10)
809                 state.setEntityState(SearchSemicolon);
810             break;
811         }
812         case Decimal:
813         {
814             int ll = min(src.length(), 9-cBufferPos);
815             while (ll--) {
816                 cc = *src;
817 
818                 if (!(cc >= '0' && cc <= '9')) {
819                     state.setEntityState(SearchSemicolon);
820                     break;
821                 }
822 
823                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
824                 m_cBuffer[cBufferPos++] = cc;
825                 src.advancePastNonNewline();
826             }
827             if (cBufferPos == 9)
828                 state.setEntityState(SearchSemicolon);
829             break;
830         }
831         case EntityName:
832         {
833             int ll = min(src.length(), 9-cBufferPos);
834             while (ll--) {
835                 cc = *src;
836 
837                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
838                     state.setEntityState(SearchSemicolon);
839                     break;
840                 }
841 
842                 m_cBuffer[cBufferPos++] = cc;
843                 src.advancePastNonNewline();
844             }
845             if (cBufferPos == 9)
846                 state.setEntityState(SearchSemicolon);
847             if (state.entityState() == SearchSemicolon) {
848                 if (cBufferPos > 1) {
849                     // Since the maximum length of entity name is 9,
850                     // so a single char array which is allocated on
851                     // the stack, its length is 10, should be OK.
852                     // Also if we have an illegal character, we treat it
853                     // as illegal entity name.
854                     unsigned testedEntityNameLen = 0;
855                     char tmpEntityNameBuffer[10];
856 
857                     ASSERT(cBufferPos < 10);
858                     for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
859                         if (m_cBuffer[testedEntityNameLen] > 0x7e)
860                             break;
861                         tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
862                     }
863 
864                     const Entity *e;
865 
866                     if (testedEntityNameLen == cBufferPos)
867                         e = findEntity(tmpEntityNameBuffer, cBufferPos);
868                     else
869                         e = 0;
870 
871                     if (e)
872                         EntityUnicodeValue = e->code;
873 
874                     // be IE compatible
875                     if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
876                         EntityUnicodeValue = 0;
877                 }
878             }
879             else
880                 break;
881         }
882         case SearchSemicolon:
883             // Don't allow values that are more than 21 bits.
884             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
885                 if (!inViewSourceMode()) {
886                     if (*src == ';')
887                         src.advancePastNonNewline();
888                     if (EntityUnicodeValue <= 0xFFFF) {
889                         checkBuffer();
890                         src.push(fixUpChar(EntityUnicodeValue));
891                     } else {
892                         // Convert to UTF-16, using surrogate code points.
893                         checkBuffer(2);
894                         src.push(U16_LEAD(EntityUnicodeValue));
895                         src.push(U16_TRAIL(EntityUnicodeValue));
896                     }
897                 } else {
898                     // FIXME: We should eventually colorize entities by sending them as a special token.
899                     // 12 bytes required: up to 10 bytes in m_cBuffer plus the
900                     // leading '&' and trailing ';'
901                     checkBuffer(12);
902                     *dest++ = '&';
903                     for (unsigned i = 0; i < cBufferPos; i++)
904                         dest[i] = m_cBuffer[i];
905                     dest += cBufferPos;
906                     if (*src == ';') {
907                         *dest++ = ';';
908                         src.advancePastNonNewline();
909                     }
910                 }
911             } else {
912                 // 11 bytes required: up to 10 bytes in m_cBuffer plus the
913                 // leading '&'
914                 checkBuffer(11);
915                 // ignore the sequence, add it to the buffer as plaintext
916                 *dest++ = '&';
917                 for (unsigned i = 0; i < cBufferPos; i++)
918                     dest[i] = m_cBuffer[i];
919                 dest += cBufferPos;
920             }
921 
922             state.setEntityState(NoEntity);
923             return state;
924         }
925     }
926 
927     return state;
928 }
929 
parseDoctype(SegmentedString & src,State state)930 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
931 {
932     ASSERT(state.inDoctype());
933     while (!src.isEmpty() && state.inDoctype()) {
934         UChar c = *src;
935         bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
936         switch (m_doctypeToken.state()) {
937             case DoctypeBegin: {
938                 m_doctypeToken.setState(DoctypeBeforeName);
939                 if (isWhitespace) {
940                     src.advance(m_lineNumber);
941                     if (inViewSourceMode())
942                         m_doctypeToken.m_source.append(c);
943                 }
944                 break;
945             }
946             case DoctypeBeforeName: {
947                 if (c == '>') {
948                     // Malformed.  Just exit.
949                     src.advancePastNonNewline();
950                     state.setInDoctype(false);
951                     if (inViewSourceMode())
952                         processDoctypeToken();
953                 } else if (isWhitespace) {
954                     src.advance(m_lineNumber);
955                     if (inViewSourceMode())
956                         m_doctypeToken.m_source.append(c);
957                 } else
958                     m_doctypeToken.setState(DoctypeName);
959                 break;
960             }
961             case DoctypeName: {
962                 if (c == '>') {
963                     // Valid doctype. Emit it.
964                     src.advancePastNonNewline();
965                     state.setInDoctype(false);
966                     processDoctypeToken();
967                 } else if (isWhitespace) {
968                     m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
969                     m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
970                     m_doctypeToken.setState(DoctypeAfterName);
971                     src.advance(m_lineNumber);
972                     if (inViewSourceMode())
973                         m_doctypeToken.m_source.append(c);
974                 } else {
975                     src.advancePastNonNewline();
976                     m_doctypeToken.m_name.append(c);
977                     if (inViewSourceMode())
978                         m_doctypeToken.m_source.append(c);
979                 }
980                 break;
981             }
982             case DoctypeAfterName: {
983                 if (c == '>') {
984                     // Valid doctype. Emit it.
985                     src.advancePastNonNewline();
986                     state.setInDoctype(false);
987                     processDoctypeToken();
988                 } else if (!isWhitespace) {
989                     src.advancePastNonNewline();
990                     if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
991                         m_doctypeSearchCount++;
992                         if (m_doctypeSearchCount == 6)
993                             // Found 'PUBLIC' sequence
994                             m_doctypeToken.setState(DoctypeBeforePublicID);
995                     } else if (m_doctypeSearchCount > 0) {
996                         m_doctypeSearchCount = 0;
997                         m_doctypeToken.setState(DoctypeBogus);
998                     } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
999                         m_doctypeSecondarySearchCount++;
1000                         if (m_doctypeSecondarySearchCount == 6)
1001                             // Found 'SYSTEM' sequence
1002                             m_doctypeToken.setState(DoctypeBeforeSystemID);
1003                     } else {
1004                         m_doctypeSecondarySearchCount = 0;
1005                         m_doctypeToken.setState(DoctypeBogus);
1006                     }
1007                     if (inViewSourceMode())
1008                         m_doctypeToken.m_source.append(c);
1009                 } else {
1010                     src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
1011                     if (inViewSourceMode())
1012                         m_doctypeToken.m_source.append(c);
1013                 }
1014                 break;
1015             }
1016             case DoctypeBeforePublicID: {
1017                 if (c == '\"' || c == '\'') {
1018                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1019                     m_doctypeToken.setState(DoctypePublicID);
1020                     src.advancePastNonNewline();
1021                     if (inViewSourceMode())
1022                         m_doctypeToken.m_source.append(c);
1023                 } else if (c == '>') {
1024                     // Considered bogus.  Don't process the doctype.
1025                     src.advancePastNonNewline();
1026                     state.setInDoctype(false);
1027                     if (inViewSourceMode())
1028                         processDoctypeToken();
1029                 } else if (isWhitespace) {
1030                     src.advance(m_lineNumber);
1031                     if (inViewSourceMode())
1032                         m_doctypeToken.m_source.append(c);
1033                 } else
1034                     m_doctypeToken.setState(DoctypeBogus);
1035                 break;
1036             }
1037             case DoctypePublicID: {
1038                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1039                     src.advancePastNonNewline();
1040                     m_doctypeToken.setState(DoctypeAfterPublicID);
1041                     if (inViewSourceMode())
1042                         m_doctypeToken.m_source.append(c);
1043                 } else if (c == '>') {
1044                      // Considered bogus.  Don't process the doctype.
1045                     src.advancePastNonNewline();
1046                     state.setInDoctype(false);
1047                     if (inViewSourceMode())
1048                         processDoctypeToken();
1049                 } else {
1050                     m_doctypeToken.m_publicID.append(c);
1051                     src.advance(m_lineNumber);
1052                     if (inViewSourceMode())
1053                         m_doctypeToken.m_source.append(c);
1054                 }
1055                 break;
1056             }
1057             case DoctypeAfterPublicID:
1058                 if (c == '\"' || c == '\'') {
1059                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1060                     m_doctypeToken.setState(DoctypeSystemID);
1061                     src.advancePastNonNewline();
1062                     if (inViewSourceMode())
1063                         m_doctypeToken.m_source.append(c);
1064                 } else if (c == '>') {
1065                     // Valid doctype. Emit it now.
1066                     src.advancePastNonNewline();
1067                     state.setInDoctype(false);
1068                     processDoctypeToken();
1069                 } else if (isWhitespace) {
1070                     src.advance(m_lineNumber);
1071                     if (inViewSourceMode())
1072                         m_doctypeToken.m_source.append(c);
1073                 } else
1074                     m_doctypeToken.setState(DoctypeBogus);
1075                 break;
1076             case DoctypeBeforeSystemID:
1077                 if (c == '\"' || c == '\'') {
1078                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1079                     m_doctypeToken.setState(DoctypeSystemID);
1080                     src.advancePastNonNewline();
1081                     if (inViewSourceMode())
1082                         m_doctypeToken.m_source.append(c);
1083                 } else if (c == '>') {
1084                     // Considered bogus.  Don't process the doctype.
1085                     src.advancePastNonNewline();
1086                     state.setInDoctype(false);
1087                 } else if (isWhitespace) {
1088                     src.advance(m_lineNumber);
1089                     if (inViewSourceMode())
1090                         m_doctypeToken.m_source.append(c);
1091                 } else
1092                     m_doctypeToken.setState(DoctypeBogus);
1093                 break;
1094             case DoctypeSystemID:
1095                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1096                     src.advancePastNonNewline();
1097                     m_doctypeToken.setState(DoctypeAfterSystemID);
1098                     if (inViewSourceMode())
1099                         m_doctypeToken.m_source.append(c);
1100                 } else if (c == '>') {
1101                      // Considered bogus.  Don't process the doctype.
1102                     src.advancePastNonNewline();
1103                     state.setInDoctype(false);
1104                     if (inViewSourceMode())
1105                         processDoctypeToken();
1106                 } else {
1107                     m_doctypeToken.m_systemID.append(c);
1108                     src.advance(m_lineNumber);
1109                     if (inViewSourceMode())
1110                         m_doctypeToken.m_source.append(c);
1111                 }
1112                 break;
1113             case DoctypeAfterSystemID:
1114                 if (c == '>') {
1115                     // Valid doctype. Emit it now.
1116                     src.advancePastNonNewline();
1117                     state.setInDoctype(false);
1118                     processDoctypeToken();
1119                 } else if (isWhitespace) {
1120                     src.advance(m_lineNumber);
1121                     if (inViewSourceMode())
1122                         m_doctypeToken.m_source.append(c);
1123                 } else
1124                     m_doctypeToken.setState(DoctypeBogus);
1125                 break;
1126             case DoctypeBogus:
1127                 if (c == '>') {
1128                     // Done with the bogus doctype.
1129                     src.advancePastNonNewline();
1130                     state.setInDoctype(false);
1131                     if (inViewSourceMode())
1132                        processDoctypeToken();
1133                 } else {
1134                     src.advance(m_lineNumber); // Just keep scanning for '>'
1135                     if (inViewSourceMode())
1136                         m_doctypeToken.m_source.append(c);
1137                 }
1138                 break;
1139             default:
1140                 break;
1141         }
1142     }
1143     return state;
1144 }
1145 
parseTag(SegmentedString & src,State state)1146 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
1147 {
1148     ASSERT(!state.hasEntityState());
1149 
1150     unsigned cBufferPos = m_cBufferPos;
1151 
1152     bool lastIsSlash = false;
1153 
1154     while (!src.isEmpty()) {
1155         checkBuffer();
1156         switch (state.tagState()) {
1157         case NoTag:
1158         {
1159             m_cBufferPos = cBufferPos;
1160             return state;
1161         }
1162         case TagName:
1163         {
1164             if (searchCount > 0) {
1165                 if (*src == commentStart[searchCount]) {
1166                     searchCount++;
1167                     if (searchCount == 2)
1168                         m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
1169                     else
1170                         m_doctypeSearchCount = 0;
1171                     if (searchCount == 4) {
1172                         // Found '<!--' sequence
1173                         src.advancePastNonNewline();
1174                         m_dest = m_buffer; // ignore the previous part of this tag
1175                         state.setInComment(true);
1176                         state.setTagState(NoTag);
1177 
1178                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
1179                         // <!--> as a valid comment, since both mozilla and IE on windows
1180                         // can handle this case.  Only do this in quirks mode. -dwh
1181                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
1182                             state.setInComment(false);
1183                             src.advancePastNonNewline();
1184                             if (!src.isEmpty())
1185                                 m_cBuffer[cBufferPos++] = *src;
1186                         } else
1187                           state = parseComment(src, state);
1188 
1189                         m_cBufferPos = cBufferPos;
1190                         return state; // Finished parsing tag!
1191                     }
1192                     m_cBuffer[cBufferPos++] = *src;
1193                     src.advancePastNonNewline();
1194                     break;
1195                 } else
1196                     searchCount = 0; // Stop looking for '<!--' sequence
1197             }
1198 
1199             if (m_doctypeSearchCount > 0) {
1200                 if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
1201                     m_doctypeSearchCount++;
1202                     m_cBuffer[cBufferPos++] = *src;
1203                     src.advancePastNonNewline();
1204                     if (m_doctypeSearchCount == 9) {
1205                         // Found '<!DOCTYPE' sequence
1206                         state.setInDoctype(true);
1207                         state.setTagState(NoTag);
1208                         m_doctypeToken.reset();
1209                         if (inViewSourceMode())
1210                             m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
1211                         state = parseDoctype(src, state);
1212                         m_cBufferPos = cBufferPos;
1213                         return state;
1214                     }
1215                     break;
1216                 } else
1217                     m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1218             }
1219 
1220             bool finish = false;
1221             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
1222             while (ll--) {
1223                 UChar curchar = *src;
1224                 if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
1225                     finish = true;
1226                     break;
1227                 }
1228 
1229                 // tolower() shows up on profiles. This is faster!
1230                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1231                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1232                 else
1233                     m_cBuffer[cBufferPos++] = curchar;
1234                 src.advancePastNonNewline();
1235             }
1236 
1237             // Disadvantage: we add the possible rest of the tag
1238             // as attribute names. ### judge if this causes problems
1239             if (finish || CBUFLEN == cBufferPos) {
1240                 bool beginTag;
1241                 UChar* ptr = m_cBuffer;
1242                 unsigned int len = cBufferPos;
1243                 m_cBuffer[cBufferPos] = '\0';
1244                 if ((cBufferPos > 0) && (*ptr == '/')) {
1245                     // End Tag
1246                     beginTag = false;
1247                     ptr++;
1248                     len--;
1249                 }
1250                 else
1251                     // Start Tag
1252                     beginTag = true;
1253 
1254                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
1255                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
1256                     ptr[--len] = '\0';
1257 
1258                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
1259                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
1260                 if (ptr[0] != '!' || inViewSourceMode()) {
1261                     m_currentToken.tagName = AtomicString(ptr);
1262                     m_currentToken.beginTag = beginTag;
1263                 }
1264                 m_dest = m_buffer;
1265                 state.setTagState(SearchAttribute);
1266                 cBufferPos = 0;
1267             }
1268             break;
1269         }
1270         case SearchAttribute:
1271             while (!src.isEmpty()) {
1272                 UChar curchar = *src;
1273                 // In this mode just ignore any quotes we encounter and treat them like spaces.
1274                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
1275                     if (curchar == '<' || curchar == '>')
1276                         state.setTagState(SearchEnd);
1277                     else
1278                         state.setTagState(AttributeName);
1279 
1280                     cBufferPos = 0;
1281                     break;
1282                 }
1283                 if (inViewSourceMode())
1284                     m_currentToken.addViewSourceChar(curchar);
1285                 src.advance(m_lineNumber);
1286             }
1287             break;
1288         case AttributeName:
1289         {
1290             m_rawAttributeBeforeValue.clear();
1291             int ll = min(src.length(), CBUFLEN - cBufferPos);
1292             while (ll--) {
1293                 UChar curchar = *src;
1294                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the
1295                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
1296                 if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
1297                     m_cBuffer[cBufferPos] = '\0';
1298                     m_attrName = AtomicString(m_cBuffer);
1299                     m_dest = m_buffer;
1300                     *m_dest++ = 0;
1301                     state.setTagState(SearchEqual);
1302                     if (inViewSourceMode())
1303                         m_currentToken.addViewSourceChar('a');
1304                     break;
1305                 }
1306 
1307                 // tolower() shows up on profiles. This is faster!
1308                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1309                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1310                 else
1311                     m_cBuffer[cBufferPos++] = curchar;
1312 
1313                 m_rawAttributeBeforeValue.append(curchar);
1314                 src.advance(m_lineNumber);
1315             }
1316             if (cBufferPos == CBUFLEN) {
1317                 m_cBuffer[cBufferPos] = '\0';
1318                 m_attrName = AtomicString(m_cBuffer);
1319                 m_dest = m_buffer;
1320                 *m_dest++ = 0;
1321                 state.setTagState(SearchEqual);
1322                 if (inViewSourceMode())
1323                     m_currentToken.addViewSourceChar('a');
1324             }
1325             break;
1326         }
1327         case SearchEqual:
1328             while (!src.isEmpty()) {
1329                 UChar curchar = *src;
1330 
1331                 if (lastIsSlash && curchar == '>') {
1332                     // This is a quirk (with a long sad history).  We have to do this
1333                     // since widgets do <script src="foo.js"/> and expect the tag to close.
1334                     if (m_currentToken.tagName == scriptTag)
1335                         m_currentToken.selfClosingTag = true;
1336                     m_currentToken.brokenXMLStyle = true;
1337                 }
1338 
1339                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1340                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
1341                     if (curchar == '=') {
1342                         state.setTagState(SearchValue);
1343                         if (inViewSourceMode())
1344                             m_currentToken.addViewSourceChar(curchar);
1345                         m_rawAttributeBeforeValue.append(curchar);
1346                         src.advancePastNonNewline();
1347                     } else {
1348                         m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
1349                         m_dest = m_buffer;
1350                         state.setTagState(SearchAttribute);
1351                         lastIsSlash = false;
1352                     }
1353                     break;
1354                 }
1355 
1356                 lastIsSlash = curchar == '/';
1357 
1358                 if (inViewSourceMode())
1359                     m_currentToken.addViewSourceChar(curchar);
1360                 m_rawAttributeBeforeValue.append(curchar);
1361                 src.advance(m_lineNumber);
1362             }
1363             break;
1364         case SearchValue:
1365             while (!src.isEmpty()) {
1366                 UChar curchar = *src;
1367                 if (!isASCIISpace(curchar)) {
1368                     if (curchar == '\'' || curchar == '\"') {
1369                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1370                         state.setTagState(QuotedValue);
1371                         if (inViewSourceMode())
1372                             m_currentToken.addViewSourceChar(curchar);
1373                         m_rawAttributeBeforeValue.append(curchar);
1374                         src.advancePastNonNewline();
1375                     } else
1376                         state.setTagState(Value);
1377 
1378                     break;
1379                 }
1380                 if (inViewSourceMode())
1381                     m_currentToken.addViewSourceChar(curchar);
1382                 m_rawAttributeBeforeValue.append(curchar);
1383                 src.advance(m_lineNumber);
1384             }
1385             break;
1386         case QuotedValue:
1387             while (!src.isEmpty()) {
1388                 checkBuffer();
1389 
1390                 UChar curchar = *src;
1391                 if (curchar <= '>' && !src.escaped()) {
1392                     if (curchar == '>' && m_attrName.isEmpty()) {
1393                         // Handle a case like <img '>.  Just go ahead and be willing
1394                         // to close the whole tag.  Don't consume the character and
1395                         // just go back into SearchEnd while ignoring the whole
1396                         // value.
1397                         // FIXME: Note that this is actually not a very good solution.
1398                         // It doesn't handle the general case of
1399                         // unmatched quotes among attributes that have names. -dwh
1400                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1401                             m_dest--; // remove trailing newlines
1402                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1403                         if (!attributeValue.contains('/'))
1404                             m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1405                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1406                         if (inViewSourceMode())
1407                             m_currentToken.addViewSourceChar('x');
1408                         state.setTagState(SearchAttribute);
1409                         m_dest = m_buffer;
1410                         tquote = NoQuote;
1411                         break;
1412                     }
1413 
1414                     if (curchar == '&') {
1415                         src.advancePastNonNewline();
1416                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1417                         break;
1418                     }
1419 
1420                     if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
1421                         // some <input type=hidden> rely on trailing spaces. argh
1422                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1423                             m_dest--; // remove trailing newlines
1424                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1425                         if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
1426                             m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1427                             if (inViewSourceMode())
1428                                 m_currentToken.addViewSourceChar('x');
1429                         } else if (inViewSourceMode())
1430                             m_currentToken.addViewSourceChar('v');
1431 
1432                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1433                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1434                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1435                                 attributeValue = blankURL().string();
1436                         }
1437 
1438                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1439                         m_dest = m_buffer;
1440                         state.setTagState(SearchAttribute);
1441                         tquote = NoQuote;
1442                         if (inViewSourceMode())
1443                             m_currentToken.addViewSourceChar(curchar);
1444                         src.advancePastNonNewline();
1445                         break;
1446                     }
1447                 }
1448 
1449                 *m_dest++ = curchar;
1450                 src.advance(m_lineNumber);
1451             }
1452             break;
1453         case Value:
1454             while (!src.isEmpty()) {
1455                 checkBuffer();
1456                 UChar curchar = *src;
1457                 if (curchar <= '>' && !src.escaped()) {
1458                     // parse Entities
1459                     if (curchar == '&') {
1460                         src.advancePastNonNewline();
1461                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1462                         break;
1463                     }
1464                     // no quotes. Every space means end of value
1465                     // '/' does not delimit in IE!
1466                     if (isASCIISpace(curchar) || curchar == '>') {
1467                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1468 
1469                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1470                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1471                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1472                                 attributeValue = blankURL().string();
1473                         }
1474 
1475                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1476                         if (inViewSourceMode())
1477                             m_currentToken.addViewSourceChar('v');
1478                         m_dest = m_buffer;
1479                         state.setTagState(SearchAttribute);
1480                         break;
1481                     }
1482                 }
1483 
1484                 *m_dest++ = curchar;
1485                 src.advance(m_lineNumber);
1486             }
1487             break;
1488         case SearchEnd:
1489         {
1490             while (!src.isEmpty()) {
1491                 UChar ch = *src;
1492                 if (ch == '>' || ch == '<')
1493                     break;
1494                 if (ch == '/')
1495                     m_currentToken.selfClosingTag = true;
1496                 if (inViewSourceMode())
1497                     m_currentToken.addViewSourceChar(ch);
1498                 src.advance(m_lineNumber);
1499             }
1500             if (src.isEmpty())
1501                 break;
1502 
1503             searchCount = 0; // Stop looking for '<!--' sequence
1504             state.setTagState(NoTag);
1505             tquote = NoQuote;
1506 
1507             if (*src != '<')
1508                 src.advance(m_lineNumber);
1509 
1510             if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
1511                 m_cBufferPos = cBufferPos;
1512                 return state;
1513             }
1514 
1515             AtomicString tagName = m_currentToken.tagName;
1516 
1517             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1518             // compatibility.
1519             bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
1520             bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
1521             if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
1522                 Attribute* a = 0;
1523                 m_scriptTagSrcAttrValue = String();
1524                 m_scriptTagCharsetAttrValue = String();
1525                 if (m_currentToken.attrs && !m_fragment) {
1526                     if (m_doc->frame() && m_doc->frame()->script()->canExecuteScripts()) {
1527                         if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
1528                             m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
1529                     }
1530                 }
1531             }
1532 
1533             RefPtr<Node> n = processToken();
1534             m_cBufferPos = cBufferPos;
1535             if (n || inViewSourceMode()) {
1536                 State savedState = state;
1537                 SegmentedString savedSrc = src;
1538                 long savedLineno = m_lineNumber;
1539                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1540                     if (beginTag)
1541                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
1542                 } else if (tagName == scriptTag) {
1543                     ASSERT(!m_scriptNode);
1544                     m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
1545                     if (m_scriptNode)
1546                         m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
1547                     if (beginTag) {
1548                         m_searchStopper = scriptEnd;
1549                         m_searchStopperLength = 8;
1550                         state.setInScript(true);
1551                         state = parseNonHTMLText(src, state);
1552                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1553                         state.setInScript(true);
1554                         state = scriptHandler(state);
1555                     }
1556                 } else if (tagName == styleTag) {
1557                     if (beginTag) {
1558                         m_searchStopper = styleEnd;
1559                         m_searchStopperLength = 7;
1560                         state.setInStyle(true);
1561                         state = parseNonHTMLText(src, state);
1562                     }
1563                 } else if (tagName == textareaTag) {
1564                     if (beginTag) {
1565                         m_searchStopper = textareaEnd;
1566                         m_searchStopperLength = 10;
1567                         state.setInTextArea(true);
1568                         state = parseNonHTMLText(src, state);
1569                     }
1570                 } else if (tagName == titleTag) {
1571                     if (beginTag) {
1572                         m_searchStopper = titleEnd;
1573                         m_searchStopperLength = 7;
1574                         state.setInTitle(true);
1575                         state = parseNonHTMLText(src, state);
1576                     }
1577                 } else if (tagName == xmpTag) {
1578                     if (beginTag) {
1579                         m_searchStopper = xmpEnd;
1580                         m_searchStopperLength = 5;
1581                         state.setInXmp(true);
1582                         state = parseNonHTMLText(src, state);
1583                     }
1584                 } else if (tagName == iframeTag) {
1585                     if (beginTag) {
1586                         m_searchStopper = iframeEnd;
1587                         m_searchStopperLength = 8;
1588                         state.setInIFrame(true);
1589                         state = parseNonHTMLText(src, state);
1590                     }
1591                 }
1592                 if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
1593                     // We just ate the rest of the document as the #text node under the special tag!
1594                     // Reset the state then retokenize without special handling.
1595                     // Let the parser clean up the missing close tag.
1596                     // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1597                     // at the end of the document unless m_noMoreData is also true. We need
1598                     // to detect this case elsewhere, and save the state somewhere other
1599                     // than a local variable.
1600                     state = savedState;
1601                     src = savedSrc;
1602                     m_lineNumber = savedLineno;
1603                     m_scriptCodeSize = 0;
1604                 }
1605             }
1606             if (tagName == plaintextTag)
1607                 state.setInPlainText(beginTag);
1608             return state; // Finished parsing tag!
1609         }
1610         } // end switch
1611     }
1612     m_cBufferPos = cBufferPos;
1613     return state;
1614 }
1615 
continueProcessing(int & processedCount,double startTime,State & state)1616 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1617 {
1618     // We don't want to be checking elapsed time with every character, so we only check after we've
1619     // processed a certain number of characters.
1620     bool allowedYield = state.allowYield();
1621     state.setAllowYield(false);
1622     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
1623         processedCount = 0;
1624         if (currentTime() - startTime > m_tokenizerTimeDelay) {
1625             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1626                load, but this hurts overall performance on slower machines.  For now turn this
1627                off.
1628             || (!m_doc->haveStylesheetsLoaded() &&
1629                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1630             // Schedule the timer to keep processing as soon as possible.
1631             m_timer.startOneShot(0);
1632 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1633             if (currentTime() - startTime > m_tokenizerTimeDelay)
1634                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1635 #endif
1636             return false;
1637         }
1638     }
1639 
1640     processedCount++;
1641     return true;
1642 }
1643 
write(const SegmentedString & str,bool appendData)1644 void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1645 {
1646     if (!m_buffer)
1647         return;
1648 
1649     if (m_parserStopped)
1650         return;
1651 
1652     SegmentedString source(str);
1653     if (m_executingScript)
1654         source.setExcludeLineNumbers();
1655 
1656     if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
1657         // don't parse; we will do this later
1658         if (m_currentPrependingSrc)
1659             m_currentPrependingSrc->append(source);
1660         else {
1661             m_pendingSrc.append(source);
1662 #if PRELOAD_SCANNER_ENABLED
1663             if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1664                 m_preloadScanner->write(source);
1665 #endif
1666         }
1667         return;
1668     }
1669 
1670 #if PRELOAD_SCANNER_ENABLED
1671     if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1672         m_preloadScanner->end();
1673 #endif
1674 
1675     if (!m_src.isEmpty())
1676         m_src.append(source);
1677     else
1678         setSrc(source);
1679 
1680     // Once a timer is set, it has control of when the tokenizer continues.
1681     if (m_timer.isActive())
1682         return;
1683 
1684     bool wasInWrite = m_inWrite;
1685     m_inWrite = true;
1686 
1687 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1688     if (!m_doc->ownerElement())
1689         printf("Beginning write at time %d\n", m_doc->elapsedTime());
1690 #endif
1691 
1692     int processedCount = 0;
1693     double startTime = currentTime();
1694 #ifdef ANDROID_INSTRUMENT
1695     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
1696 #endif
1697 
1698 #if ENABLE(INSPECTOR)
1699     if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
1700         timelineAgent->willWriteHTML(source.length(), m_lineNumber);
1701 #endif
1702 
1703     Frame* frame = m_doc->frame();
1704 
1705     State state = m_state;
1706 
1707     while (!m_src.isEmpty() && (!frame || !frame->redirectScheduler()->locationChangePending())) {
1708         if (!continueProcessing(processedCount, startTime, state))
1709             break;
1710 
1711         // do we need to enlarge the buffer?
1712         checkBuffer();
1713 
1714         UChar cc = *m_src;
1715 
1716         bool wasSkipLF = state.skipLF();
1717         if (wasSkipLF)
1718             state.setSkipLF(false);
1719 
1720         if (wasSkipLF && (cc == '\n'))
1721             m_src.advance();
1722         else if (state.needsSpecialWriteHandling()) {
1723             // it's important to keep needsSpecialWriteHandling with the flags this block tests
1724             if (state.hasEntityState())
1725                 state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
1726             else if (state.inPlainText())
1727                 state = parseText(m_src, state);
1728             else if (state.inAnyNonHTMLText())
1729                 state = parseNonHTMLText(m_src, state);
1730             else if (state.inComment())
1731                 state = parseComment(m_src, state);
1732             else if (state.inDoctype())
1733                 state = parseDoctype(m_src, state);
1734             else if (state.inServer())
1735                 state = parseServer(m_src, state);
1736             else if (state.inProcessingInstruction())
1737                 state = parseProcessingInstruction(m_src, state);
1738             else if (state.hasTagState())
1739                 state = parseTag(m_src, state);
1740             else if (state.startTag()) {
1741                 state.setStartTag(false);
1742 
1743                 switch (cc) {
1744                 case '/':
1745                     break;
1746                 case '!': {
1747                     // <!-- comment --> or <!DOCTYPE ...>
1748                     searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
1749                     m_doctypeSearchCount = 1;
1750                     break;
1751                 }
1752                 case '?': {
1753                     // xml processing instruction
1754                     state.setInProcessingInstruction(true);
1755                     tquote = NoQuote;
1756                     state = parseProcessingInstruction(m_src, state);
1757                     continue;
1758 
1759                     break;
1760                 }
1761                 case '%':
1762                     if (!m_brokenServer) {
1763                         // <% server stuff, handle as comment %>
1764                         state.setInServer(true);
1765                         tquote = NoQuote;
1766                         state = parseServer(m_src, state);
1767                         continue;
1768                     }
1769                     // else fall through
1770                 default: {
1771                     if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1772                         // Start of a Start-Tag
1773                     } else {
1774                         // Invalid tag
1775                         // Add as is
1776                         *m_dest = '<';
1777                         m_dest++;
1778                         continue;
1779                     }
1780                 }
1781                 }; // end case
1782 
1783                 processToken();
1784 
1785                 m_cBufferPos = 0;
1786                 state.setTagState(TagName);
1787                 state = parseTag(m_src, state);
1788             }
1789         } else if (cc == '&' && !m_src.escaped()) {
1790             m_src.advancePastNonNewline();
1791             state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
1792         } else if (cc == '<' && !m_src.escaped()) {
1793             m_currentTagStartLineNumber = m_lineNumber;
1794             m_src.advancePastNonNewline();
1795             state.setStartTag(true);
1796             state.setDiscardLF(false);
1797         } else if (cc == '\n' || cc == '\r') {
1798             if (state.discardLF())
1799                 // Ignore this LF
1800                 state.setDiscardLF(false); // We have discarded 1 LF
1801             else {
1802                 // Process this LF
1803                 *m_dest++ = '\n';
1804                 if (cc == '\r' && !m_src.excludeLineNumbers())
1805                     m_lineNumber++;
1806             }
1807 
1808             /* Check for MS-DOS CRLF sequence */
1809             if (cc == '\r')
1810                 state.setSkipLF(true);
1811             m_src.advance(m_lineNumber);
1812         } else {
1813             state.setDiscardLF(false);
1814             *m_dest++ = cc;
1815             m_src.advancePastNonNewline();
1816         }
1817     }
1818 
1819 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1820     if (!m_doc->ownerElement())
1821         printf("Ending write at time %d\n", m_doc->elapsedTime());
1822 #endif
1823 
1824 #if ENABLE(INSPECTOR)
1825     if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
1826         timelineAgent->didWriteHTML(m_lineNumber);
1827 #endif
1828 
1829     m_inWrite = wasInWrite;
1830 
1831     m_state = state;
1832 
1833 #ifdef ANDROID_INSTRUMENT
1834     android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
1835 #endif
1836 
1837     if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1838         end(); // this actually causes us to be deleted
1839 
1840     // After parsing, go ahead and dispatch image beforeload events.
1841     ImageLoader::dispatchPendingBeforeLoadEvents();
1842 }
1843 
stopParsing()1844 void HTMLTokenizer::stopParsing()
1845 {
1846     Tokenizer::stopParsing();
1847     m_timer.stop();
1848 
1849     // The part needs to know that the tokenizer has finished with its data,
1850     // regardless of whether it happened naturally or due to manual intervention.
1851     if (!m_fragment && m_doc->frame())
1852         m_doc->frame()->loader()->tokenizerProcessedData();
1853 }
1854 
processingData() const1855 bool HTMLTokenizer::processingData() const
1856 {
1857     return m_timer.isActive() || m_inWrite;
1858 }
1859 
timerFired(Timer<HTMLTokenizer> *)1860 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1861 {
1862 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1863     if (!m_doc->ownerElement())
1864         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1865 #endif
1866 
1867     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1868         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
1869         // timer has higher priority than our timer.
1870         m_timer.startOneShot(0);
1871         return;
1872     }
1873 
1874     // Invoke write() as though more data came in. This might cause us to get deleted.
1875     write(SegmentedString(), true);
1876 }
1877 
end()1878 void HTMLTokenizer::end()
1879 {
1880     ASSERT(!m_timer.isActive());
1881     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1882 
1883     if (m_buffer) {
1884         // parseTag is using the buffer for different matters
1885         if (!m_state.hasTagState())
1886             processToken();
1887 
1888         fastFree(m_scriptCode);
1889         m_scriptCode = 0;
1890         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1891 
1892         fastFree(m_buffer);
1893         m_buffer = 0;
1894     }
1895 
1896     if (!inViewSourceMode())
1897         m_parser->finished();
1898     else
1899         m_doc->finishedParsing();
1900 }
1901 
finish()1902 void HTMLTokenizer::finish()
1903 {
1904     // do this as long as we don't find matching comment ends
1905     while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
1906         // we've found an unmatched comment start
1907         if (m_state.inComment())
1908             m_brokenComments = true;
1909         else
1910             m_brokenServer = true;
1911         checkScriptBuffer();
1912         m_scriptCode[m_scriptCodeSize] = 0;
1913         m_scriptCode[m_scriptCodeSize + 1] = 0;
1914         int pos;
1915         String food;
1916         if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
1917             food = String(m_scriptCode, m_scriptCodeSize);
1918         else if (m_state.inServer()) {
1919             food = "<";
1920             food.append(m_scriptCode, m_scriptCodeSize);
1921         } else {
1922             pos = find(m_scriptCode, m_scriptCodeSize, '>');
1923             food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
1924         }
1925         fastFree(m_scriptCode);
1926         m_scriptCode = 0;
1927         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1928         m_state.setInComment(false);
1929         m_state.setInServer(false);
1930         if (!food.isEmpty())
1931             write(food, true);
1932     }
1933     // this indicates we will not receive any more data... but if we are waiting on
1934     // an external script to load, we can't finish parsing until that is done
1935     m_noMoreData = true;
1936     if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1937         end(); // this actually causes us to be deleted
1938 }
1939 
processToken()1940 PassRefPtr<Node> HTMLTokenizer::processToken()
1941 {
1942     ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
1943     if (scriptController && scriptController->canExecuteScripts())
1944         // FIXME: Why isn't this m_currentScriptTagStartLineNumber?  I suspect this is wrong.
1945         scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
1946     if (m_dest > m_buffer) {
1947         m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
1948         if (m_currentToken.tagName != commentAtom)
1949             m_currentToken.tagName = textAtom;
1950     } else if (m_currentToken.tagName == nullAtom) {
1951         m_currentToken.reset();
1952         if (scriptController)
1953             scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
1954         return 0;
1955     }
1956 
1957     m_dest = m_buffer;
1958 
1959     RefPtr<Node> n;
1960 
1961     if (!m_parserStopped) {
1962         if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
1963             map->shrinkToLength();
1964         if (inViewSourceMode())
1965             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
1966         else
1967             // pass the token over to the parser, the parser DOES NOT delete the token
1968             n = m_parser->parseToken(&m_currentToken);
1969     }
1970     m_currentToken.reset();
1971     if (scriptController)
1972         scriptController->setEventHandlerLineNumber(0);
1973 
1974     return n.release();
1975 }
1976 
processDoctypeToken()1977 void HTMLTokenizer::processDoctypeToken()
1978 {
1979     if (inViewSourceMode())
1980         static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
1981     else
1982         m_parser->parseDoctypeToken(&m_doctypeToken);
1983 }
1984 
~HTMLTokenizer()1985 HTMLTokenizer::~HTMLTokenizer()
1986 {
1987     ASSERT(!m_inWrite);
1988     reset();
1989 }
1990 
1991 
enlargeBuffer(int len)1992 void HTMLTokenizer::enlargeBuffer(int len)
1993 {
1994     // Resize policy: Always at least double the size of the buffer each time.
1995     int delta = max(len, m_bufferSize);
1996 
1997     // Check for overflow.
1998     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1999     static const int maxSize = INT_MAX / sizeof(UChar);
2000     if (delta > maxSize - m_bufferSize)
2001         CRASH();
2002 
2003     int newSize = m_bufferSize + delta;
2004     int oldOffset = m_dest - m_buffer;
2005     m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
2006     m_dest = m_buffer + oldOffset;
2007     m_bufferSize = newSize;
2008 }
2009 
enlargeScriptBuffer(int len)2010 void HTMLTokenizer::enlargeScriptBuffer(int len)
2011 {
2012     // Resize policy: Always at least double the size of the buffer each time.
2013     int delta = max(len, m_scriptCodeCapacity);
2014 
2015     // Check for overflow.
2016     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
2017     static const int maxSize = INT_MAX / sizeof(UChar);
2018     if (delta > maxSize - m_scriptCodeCapacity)
2019         CRASH();
2020 
2021     int newSize = m_scriptCodeCapacity + delta;
2022     // If we allow fastRealloc(ptr, 0), it will call CRASH(). We run into this
2023     // case if the HTML being parsed begins with "<!--" and there's more data
2024     // coming.
2025     if (!newSize) {
2026         ASSERT(!m_scriptCode);
2027         return;
2028     }
2029 
2030     m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
2031     m_scriptCodeCapacity = newSize;
2032 }
2033 
executeScriptsWaitingForStylesheets()2034 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2035 {
2036     ASSERT(m_doc->haveStylesheetsLoaded());
2037 
2038     if (m_hasScriptsWaitingForStylesheets)
2039         notifyFinished(0);
2040 }
2041 
notifyFinished(CachedResource *)2042 void HTMLTokenizer::notifyFinished(CachedResource*)
2043 {
2044     executeExternalScriptsIfReady();
2045 }
2046 
executeExternalScriptsIfReady()2047 void HTMLTokenizer::executeExternalScriptsIfReady()
2048 {
2049 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2050     if (!m_doc->ownerElement())
2051         printf("script loaded at %d\n", m_doc->elapsedTime());
2052 #endif
2053 
2054     ASSERT(!m_pendingScripts.isEmpty());
2055 
2056     // Make external scripts wait for external stylesheets.
2057     // FIXME: This needs to be done for inline scripts too.
2058     m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
2059     if (m_hasScriptsWaitingForStylesheets)
2060         return;
2061 
2062     bool finished = false;
2063 
2064     double startTime = currentTime();
2065     while (!finished && m_pendingScripts.first()->isLoaded()) {
2066         if (!continueExecutingExternalScripts(startTime))
2067             break;
2068 
2069         CachedScript* cs = m_pendingScripts.first().get();
2070         m_pendingScripts.removeFirst();
2071         ASSERT(cache()->disabled() || cs->accessCount() > 0);
2072 
2073         setSrc(SegmentedString());
2074 
2075         // make sure we forget about the script before we execute the new one
2076         // infinite recursion might happen otherwise
2077         ScriptSourceCode sourceCode(cs);
2078         bool errorOccurred = cs->errorOccurred();
2079         cs->removeClient(this);
2080 
2081         RefPtr<Node> n = m_scriptNode.release();
2082 
2083 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2084         if (!m_doc->ownerElement())
2085             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
2086 #endif
2087 
2088         if (errorOccurred)
2089             n->dispatchEvent(Event::create(eventNames().errorEvent, true, false));
2090         else {
2091             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
2092                 m_state = scriptExecution(sourceCode, m_state);
2093 #if ENABLE(XHTMLMP)
2094             else
2095                 m_doc->setShouldProcessNoscriptElement(true);
2096 #endif
2097             n->dispatchEvent(Event::create(eventNames().loadEvent, false, false));
2098         }
2099 
2100         // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
2101         // call above, so test afterwards.
2102         finished = m_pendingScripts.isEmpty();
2103         if (finished) {
2104             ASSERT(!m_hasScriptsWaitingForStylesheets);
2105             m_state.setLoadingExtScript(false);
2106 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2107             if (!m_doc->ownerElement())
2108                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
2109 #endif
2110         } else if (m_hasScriptsWaitingForStylesheets) {
2111             // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
2112             // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
2113             finished = true;
2114         }
2115 
2116         // 'm_requestingScript' is true when we are called synchronously from
2117         // scriptHandler(). In that case scriptHandler() will take care
2118         // of m_pendingSrc.
2119         if (!m_requestingScript) {
2120             SegmentedString rest = m_pendingSrc;
2121             m_pendingSrc.clear();
2122             write(rest, false);
2123             // we might be deleted at this point, do not access any members.
2124         }
2125     }
2126 }
2127 
executeExternalScriptsTimerFired(Timer<HTMLTokenizer> *)2128 void HTMLTokenizer::executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*)
2129 {
2130     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
2131         // Restart the timer and do layout first.
2132         m_externalScriptsTimer.startOneShot(0);
2133         return;
2134     }
2135 
2136     // Continue executing external scripts.
2137     executeExternalScriptsIfReady();
2138 }
2139 
continueExecutingExternalScripts(double startTime)2140 bool HTMLTokenizer::continueExecutingExternalScripts(double startTime)
2141 {
2142     if (m_externalScriptsTimer.isActive())
2143         return false;
2144 
2145     if (currentTime() - startTime > m_tokenizerTimeDelay) {
2146         // Schedule the timer to keep processing as soon as possible.
2147         m_externalScriptsTimer.startOneShot(0);
2148         return false;
2149     }
2150     return true;
2151 }
2152 
isWaitingForScripts() const2153 bool HTMLTokenizer::isWaitingForScripts() const
2154 {
2155     return m_state.loadingExtScript();
2156 }
2157 
setSrc(const SegmentedString & source)2158 void HTMLTokenizer::setSrc(const SegmentedString& source)
2159 {
2160     m_src = source;
2161 }
2162 
parseHTMLDocumentFragment(const String & source,DocumentFragment * fragment,FragmentScriptingPermission scriptingPermission)2163 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission)
2164 {
2165     HTMLTokenizer tok(fragment, scriptingPermission);
2166     tok.setForceSynchronous(true);
2167     tok.write(source, true);
2168     tok.finish();
2169     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
2170 }
2171 
decodeNamedEntity(const char * name)2172 UChar decodeNamedEntity(const char* name)
2173 {
2174     const Entity* e = findEntity(name, strlen(name));
2175     return e ? e->code : 0;
2176 }
2177 
2178 }
2179