• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     Copyright (C) 1997 Martin Jones (mjones@kde.org)
3               (C) 1997 Torben Weis (weis@kde.org)
4               (C) 1998 Waldo Bastian (bastian@kde.org)
5               (C) 1999 Lars Knoll (knoll@kde.org)
6               (C) 1999 Antti Koivisto (koivisto@kde.org)
7               (C) 2001 Dirk Mueller (mueller@kde.org)
8     Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
9     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10     Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
11 
12     This library is free software; you can redistribute it and/or
13     modify it under the terms of the GNU Library General Public
14     License as published by the Free Software Foundation; either
15     version 2 of the License, or (at your option) any later version.
16 
17     This library is distributed in the hope that it will be useful,
18     but WITHOUT ANY WARRANTY; without even the implied warranty of
19     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20     Library General Public License for more details.
21 
22     You should have received a copy of the GNU Library General Public License
23     along with this library; see the file COPYING.LIB.  If not, write to
24     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25     Boston, MA 02110-1301, USA.
26 */
27 #include "config.h"
28 #include "HTMLTokenizer.h"
29 
30 #include "CSSHelper.h"
31 #include "Cache.h"
32 #include "CachedScript.h"
33 #include "DocLoader.h"
34 #include "DocumentFragment.h"
35 #include "EventNames.h"
36 #include "Frame.h"
37 #include "FrameLoader.h"
38 #include "FrameView.h"
39 #include "HTMLElement.h"
40 #include "HTMLNames.h"
41 #include "HTMLParser.h"
42 #include "HTMLScriptElement.h"
43 #include "HTMLViewSourceDocument.h"
44 #include "MappedAttribute.h"
45 #include "Page.h"
46 #include "PreloadScanner.h"
47 #include "ScriptController.h"
48 #include "ScriptSourceCode.h"
49 #include "ScriptValue.h"
50 #include "XSSAuditor.h"
51 #include <wtf/ASCIICType.h>
52 #include <wtf/CurrentTime.h>
53 
54 #include "HTMLEntityNames.c"
55 
56 #ifdef ANDROID_INSTRUMENT
57 #include "TimeCounter.h"
58 #endif
59 
60 #define PRELOAD_SCANNER_ENABLED 1
61 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
62 
63 using namespace WTF;
64 using namespace std;
65 
66 namespace WebCore {
67 
68 using namespace HTMLNames;
69 
// Default tokenizer pacing constants. HTMLTokenizer::begin() uses these unless the
// Page supplies custom values (hasCustomHTMLTokenizerChunkSize / hasCustomHTMLTokenizerTimeDelay).
70 #if MOBILE
71 // The mobile device needs to be responsive, so the tokenizer chunk size is reduced.
72 // This value defines how many characters the tokenizer will process before
73 // yielding control.
74 static const int defaultTokenizerChunkSize = 256;
75 #else
76 static const int defaultTokenizerChunkSize = 4096;
77 #endif
78 
79 #if MOBILE
80 // As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
81 // it will take way too long to load a page.
82 static const double defaultTokenizerTimeDelay = 0.300;
83 #else
84 // FIXME: We would like this constant to be 200ms.
85 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
86 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
// NOTE(review): the comments speak in milliseconds while the values are fractional,
// so the unit is presumably seconds (0.500 == 500ms) - confirm against the timer code.
87 static const double defaultTokenizerTimeDelay = 0.500;
88 #endif
89 
// Literal markup prefixes, all spelled in lowercase.
// NOTE(review): tagMatch() in this file compares a lowercase pattern against the
// input case-insensitively, so these are presumably its patterns (e.g. as
// m_searchStopper) - the assignments are outside this chunk, confirm there.
90 static const char commentStart [] = "<!--";
91 static const char doctypeStart [] = "<!doctype";
92 static const char publicStart [] = "public";
93 static const char systemStart [] = "system";
94 static const char scriptEnd [] = "</script";
95 static const char xmpEnd [] = "</xmp";
96 static const char styleEnd [] =  "</style";
97 static const char textareaEnd [] = "</textarea";
98 static const char titleEnd [] = "</title";
99 static const char iframeEnd [] = "</iframe";
100 
101 // Full support for MS Windows extensions to Latin-1.
102 // Technically these extensions should only be activated for pages
103 // marked "windows-1252" or "cp1252", but
104 // in the standard Microsoft way, these extensions infect hundreds of thousands
105 // of web pages.  Note that people with non-latin-1 Microsoft extensions
106 // are SOL.
107 //
108 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
109 //      http://www.bbsinc.com/iso8859.html
110 //      http://www.obviously.com/
111 //
112 // There may be better equivalents
113 
114 // We only need this for entities. For non-entity text, we handle this in the text encoding.
115 
// Replacement table for the code units 0x80-0x9F: entry i is the value substituted
// for code unit 0x80 + i (fixUpChar() indexes it with c - 0x80).
// The entries for 0x81, 0x8D, 0x8F, 0x90 and 0x9D map to themselves.
116 static const UChar windowsLatin1ExtensionArray[32] = {
117     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
118     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
119     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
120     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
121 };
122 
fixUpChar(UChar c)123 static inline UChar fixUpChar(UChar c)
124 {
125     if ((c & ~0x1F) != 0x0080)
126         return c;
127     return windowsLatin1ExtensionArray[c - 0x80];
128 }
129 
tagMatch(const char * s1,const UChar * s2,unsigned length)130 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
131 {
132     for (unsigned i = 0; i != length; ++i) {
133         unsigned char c1 = s1[i];
134         unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
135         UChar c2 = s2[i];
136         if (c1 != c2 && uc1 != c2)
137             return false;
138     }
139     return true;
140 }
141 
addAttribute(AtomicString & attrName,const AtomicString & attributeValue,bool viewSourceMode)142 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)
143 {
144     if (!attrName.isEmpty()) {
145         ASSERT(!attrName.contains('/'));
146         RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);
147         if (!attrs) {
148             attrs = NamedMappedAttrMap::create();
149             attrs->reserveInitialCapacity(10);
150         }
151         attrs->insertAttribute(a.release(), viewSourceMode);
152     }
153 
154     attrName = emptyAtom;
155 }
156 
157 // ----------------------------------------------------------------------------
158 
// Constructs a tokenizer for an HTML document: creates an HTMLParser for |doc|
// (passing |reportErrors| through), marks the tokenizer as non-fragment and
// finishes initialization via begin().
HTMLTokenizer(HTMLDocument * doc,bool reportErrors)159 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
160     : Tokenizer()
161     , m_buffer(0)
162     , m_scriptCode(0)
163     , m_scriptCodeSize(0)
164     , m_scriptCodeCapacity(0)
165     , m_scriptCodeResync(0)
166     , m_executingScript(0)
167     , m_requestingScript(false)
168     , m_hasScriptsWaitingForStylesheets(false)
169     , m_timer(this, &HTMLTokenizer::timerFired)
170     , m_doc(doc)
171     , m_parser(new HTMLParser(doc, reportErrors))
172     , m_inWrite(false)
173     , m_fragment(false)
174 {
        // begin() calls reset(), which frees m_buffer / m_scriptCode; both are null
        // here, and begin() then allocates the initial output buffer.
175     begin();
176 }
177 
// Constructs a tokenizer for a view-source document. No HTMLParser is created
// (m_parser is null), and the Tokenizer base is constructed with `true`.
// NOTE(review): that base-class argument presumably enables view-source mode
// (see inViewSourceMode()) - confirm against Tokenizer.h.
HTMLTokenizer(HTMLViewSourceDocument * doc)178 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
179     : Tokenizer(true)
180     , m_buffer(0)
181     , m_scriptCode(0)
182     , m_scriptCodeSize(0)
183     , m_scriptCodeCapacity(0)
184     , m_scriptCodeResync(0)
185     , m_executingScript(0)
186     , m_requestingScript(false)
187     , m_hasScriptsWaitingForStylesheets(false)
188     , m_timer(this, &HTMLTokenizer::timerFired)
189     , m_doc(doc)
190     , m_parser(0)
191     , m_inWrite(false)
192     , m_fragment(false)
193 {
194     begin();
195 }
196 
// Constructs a tokenizer that parses into a DocumentFragment: the owning document
// is taken from frag->document(), the HTMLParser is created for the fragment, and
// m_fragment is set (scriptHandler() and scriptExecution() check that flag before
// running inline script). The Tokenizer base is default-constructed (no explicit
// initializer).
HTMLTokenizer(DocumentFragment * frag)197 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
198     : m_buffer(0)
199     , m_scriptCode(0)
200     , m_scriptCodeSize(0)
201     , m_scriptCodeCapacity(0)
202     , m_scriptCodeResync(0)
203     , m_executingScript(0)
204     , m_requestingScript(false)
205     , m_hasScriptsWaitingForStylesheets(false)
206     , m_timer(this, &HTMLTokenizer::timerFired)
207     , m_doc(frag->document())
208     , m_parser(new HTMLParser(frag))
209     , m_inWrite(false)
210     , m_fragment(true)
211 {
212     begin();
213 }
214 
reset()215 void HTMLTokenizer::reset()
216 {
217     ASSERT(m_executingScript == 0);
218 
219     while (!m_pendingScripts.isEmpty()) {
220         CachedScript* cs = m_pendingScripts.first().get();
221         m_pendingScripts.removeFirst();
222         ASSERT(cache()->disabled() || cs->accessCount() > 0);
223         cs->removeClient(this);
224     }
225 
226     fastFree(m_buffer);
227     m_buffer = m_dest = 0;
228     m_bufferSize = 0;
229 
230     fastFree(m_scriptCode);
231     m_scriptCode = 0;
232     m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
233 
234     m_timer.stop();
235     m_state.setAllowYield(false);
236     m_state.setForceSynchronous(false);
237 
238     m_currentToken.reset();
239     m_doctypeToken.reset();
240     m_doctypeSearchCount = 0;
241     m_doctypeSecondarySearchCount = 0;
242     m_hasScriptsWaitingForStylesheets = false;
243 }
244 
begin()245 void HTMLTokenizer::begin()
246 {
247     m_executingScript = 0;
248     m_requestingScript = false;
249     m_hasScriptsWaitingForStylesheets = false;
250     m_state.setLoadingExtScript(false);
251     reset();
252     m_bufferSize = 254;
253     m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
254     m_dest = m_buffer;
255     tquote = NoQuote;
256     searchCount = 0;
257     m_state.setEntityState(NoEntity);
258     m_scriptTagSrcAttrValue = String();
259     m_pendingSrc.clear();
260     m_currentPrependingSrc = 0;
261     m_noMoreData = false;
262     m_brokenComments = false;
263     m_brokenServer = false;
264     m_lineNumber = 0;
265     m_currentScriptTagStartLineNumber = 0;
266     m_currentTagStartLineNumber = 0;
267     m_state.setForceSynchronous(false);
268 
269     Page* page = m_doc->page();
270     if (page && page->hasCustomHTMLTokenizerTimeDelay())
271         m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();
272     else
273         m_tokenizerTimeDelay = defaultTokenizerTimeDelay;
274 
275     if (page && page->hasCustomHTMLTokenizerChunkSize())
276         m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();
277     else
278         m_tokenizerChunkSize = defaultTokenizerChunkSize;
279 }
280 
setForceSynchronous(bool force)281 void HTMLTokenizer::setForceSynchronous(bool force)
282 {
283     m_state.setForceSynchronous(force);
284 }
285 
processListing(SegmentedString list,State state)286 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
287 {
288     // This function adds the listing 'list' as
289     // preformatted text-tokens to the token-collection
290     while (!list.isEmpty()) {
291         if (state.skipLF()) {
292             state.setSkipLF(false);
293             if (*list == '\n') {
294                 list.advance();
295                 continue;
296             }
297         }
298 
299         checkBuffer();
300 
301         if (*list == '\n' || *list == '\r') {
302             if (state.discardLF())
303                 // Ignore this LF
304                 state.setDiscardLF(false); // We have discarded 1 LF
305             else
306                 *m_dest++ = '\n';
307 
308             /* Check for MS-DOS CRLF sequence */
309             if (*list == '\r')
310                 state.setSkipLF(true);
311 
312             list.advance();
313         } else {
314             state.setDiscardLF(false);
315             *m_dest++ = *list;
316             list.advance();
317         }
318     }
319 
320     return state;
321 }
322 
// Consumes the raw text content of exactly one of <xmp>, <textarea>, <title>,
// <style>, <script> or <iframe> (see the third ASSERT) from |src|, accumulating it
// in m_scriptCode until the element's end tag has been seen.
//
// The end tag is recognized in two steps:
//   1. when the buffer ends with m_searchStopper (compared via tagMatch()) and the
//      next character is '>', '/' or whitespace, m_scriptCodeResync records where
//      the end tag starts;
//   2. while m_scriptCodeResync is set, the first '>' outside quotes finishes the
//      element: the buffer is cut back to the text before the end tag and handed
//      to scriptHandler() (for scripts) or emitted as text plus closing-tag tokens.
//
// Returns the updated state; if |src| runs out first, the partial text stays in
// m_scriptCode and the function can be re-entered with more input.
parseNonHTMLText(SegmentedString & src,State state)323 HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state)
324 {
325     ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
326     ASSERT(!state.hasTagState());
327     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1);
        // Remember the line the script starts on, but only the first time we are
        // entered for this script (the field is reset to 0 by scriptHandler()).
328     if (state.inScript() && !m_currentScriptTagStartLineNumber)
329         m_currentScriptTagStartLineNumber = m_lineNumber;
330 
        // Resume a comment that was left unfinished by a previous call.
331     if (state.inComment())
332         state = parseComment(src, state);
333 
        // Set below to the current m_scriptCodeSize when parseEntity() returns without
        // having moved the destination pointer. The "<!--" and end-tag checks only
        // fire when the matched text starts after this position.
334     int lastDecodedEntityPosition = -1;
335     while (!src.isEmpty()) {
336         checkScriptBuffer();
337         UChar ch = *src;
338 
            // The buffer ends with "<!-" and the next character is '-': a comment starts
            // here. Not done inside <xmp>, in broken-comments mode, or while resyncing.
339         if (!m_scriptCodeResync && !m_brokenComments &&
340             !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&
341             m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&
342             (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
343             state.setInComment(true);
344             state = parseComment(src, state);
345             continue;
346         }
            // Step 2: the end tag name was already matched and this unquoted '>' closes it.
347         if (m_scriptCodeResync && !tquote && ch == '>') {
348             src.advancePastNonNewline();
                // m_scriptCodeResync is the index just past the '<' of the end tag, so
                // this drops the end tag (including its '<') from the buffer.
349             m_scriptCodeSize = m_scriptCodeResync - 1;
350             m_scriptCodeResync = 0;
351             m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
352             if (state.inScript())
353                 state = scriptHandler(state);
354             else {
                    // Emit the accumulated text, then a closing-tag token for whichever
                    // element we were inside.
355                 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
356                 processToken();
357                 if (state.inStyle()) {
358                     m_currentToken.tagName = styleTag.localName();
359                     m_currentToken.beginTag = false;
360                 } else if (state.inTextArea()) {
361                     m_currentToken.tagName = textareaTag.localName();
362                     m_currentToken.beginTag = false;
363                 } else if (state.inTitle()) {
364                     m_currentToken.tagName = titleTag.localName();
365                     m_currentToken.beginTag = false;
366                 } else if (state.inXmp()) {
367                     m_currentToken.tagName = xmpTag.localName();
368                     m_currentToken.beginTag = false;
369                 } else if (state.inIFrame()) {
370                     m_currentToken.tagName = iframeTag.localName();
371                     m_currentToken.beginTag = false;
372                 }
373                 processToken();
374                 state.setInStyle(false);
375                 state.setInScript(false);
376                 state.setInTextArea(false);
377                 state.setInTitle(false);
378                 state.setInXmp(false);
379                 state.setInIFrame(false);
380                 tquote = NoQuote;
381                 m_scriptCodeSize = m_scriptCodeResync = 0;
382             }
383             return state;
384         }
385         // possible end of tagname, lets check.
            // Step 1: note that |ch| is not consumed here; the `continue` re-examines it
            // with m_scriptCodeResync set.
386         if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
387              m_scriptCodeSize >= m_searchStopperLength &&
388              tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
389              (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
390             m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
391             tquote = NoQuote;
392             continue;
393         }
            // Inside the end tag: track quoting so that a '>' within a quoted attribute
            // value does not close the tag. A line break cancels an open quote.
394         if (m_scriptCodeResync && !state.escaped()) {
395             if (ch == '\"')
396                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
397             else if (ch == '\'')
398                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
399             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
400                 tquote = NoQuote;
401         }
            // A backslash escapes the next character; an escaped backslash does not.
402         state.setEscaped(!state.escaped() && ch == '\\');
            // Entities are only parsed inside <textarea>, <title> and <iframe>.
403         if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
404             UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
405             src.advancePastNonNewline();
406             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
407             if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
408                 lastDecodedEntityPosition = m_scriptCodeSize;
409             else
410                 m_scriptCodeSize = scriptCodeDest - m_scriptCode;
411         } else {
412             m_scriptCode[m_scriptCodeSize++] = ch;
413             src.advance(m_lineNumber);
414         }
415     }
416 
417     return state;
418 }
419 
// Handles the end of a <script> element whose text is in m_scriptCode (called from
// parseNonHTMLText() once the closing tag has been recognized).
//
// In order, it:
//   - decides whether an external script must be requested (src attribute present)
//     or whether the inline text should be executed;
//   - emits the script text and the closing </script> as tokens;
//   - unless in view-source mode, either registers as a client of the external
//     script or runs the inline script through scriptExecution();
//   - restores or redistributes the source that was pending while that happened.
//
// Returns the updated state. m_state is written back before, and re-read after,
// every call that can re-enter the tokenizer.
scriptHandler(State state)420 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
421 {
422     // We are inside a <script>
423     bool doScriptExec = false;
424     int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenzier line numbers are 0 based
425 
426     // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
427     m_currentScriptTagStartLineNumber = 0;
428 
429     // (Bugzilla 3837) Scripts following a frameset element should not execute or,
430     // in the case of extern scripts, even load.
431     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
432 
        // Non-null only when an external script was successfully requested below.
433     CachedScript* cs = 0;
434     // don't load external scripts for standalone documents (for now)
435     if (!inViewSourceMode()) {
436         if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
437             // forget what we just got; load from src url instead
438             if (!m_parser->skipMode() && !followingFrameset) {
439 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
440                 if (!m_doc->ownerElement())
441                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
442 #endif
443                 // The parser might have been stopped by for example a window.close call in an earlier script.
444                 // If so, we don't want to load scripts.
445                 if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
446                     m_pendingScripts.append(cs);
447                 else
448                     m_scriptNode = 0;
449             } else
450                 m_scriptNode = 0;
451             m_scriptTagSrcAttrValue = String();
452         } else {
453             // Parse m_scriptCode containing <script> info
                // NOTE(review): m_scriptNode is dereferenced without a null check here;
                // it is presumably always set for an inline script - confirm where it
                // is assigned (outside this chunk).
454             doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
455 #if ENABLE(XHTMLMP)
456             if (!doScriptExec)
457                 m_doc->setShouldProcessNoscriptElement(true);
458 #endif
459             m_scriptNode = 0;
460         }
461     }
462 
        // Emit the script text as a token, keep its text for execution, then emit the
        // closing </script> token.
463     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
464     RefPtr<Node> node = processToken();
465     String scriptString = node ? node->textContent() : "";
466     m_currentToken.tagName = scriptTag.localName();
467     m_currentToken.beginTag = false;
468     processToken();
469 
470     state.setInScript(false);
471     m_scriptCodeSize = m_scriptCodeResync = 0;
472 
473     // FIXME: The script should be syntax highlighted.
474     if (inViewSourceMode())
475         return state;
476 
        // Install a local SegmentedString as the current "prepending source" for the
        // duration of this call; the previous one is restored before returning.
        // NOTE(review): presumably this collects text written by document.write()
        // while the script runs - write() is outside this chunk, confirm there.
477     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
478     SegmentedString prependingSrc;
479     m_currentPrependingSrc = &prependingSrc;
480 
481 #ifdef ANDROID_INSTRUMENT
482     android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
483 #endif
484 
485     if (!m_parser->skipMode() && !followingFrameset) {
486         if (cs) {
                // External script: park the not-yet-tokenized remainder of the input so
                // it is processed after the script.
487             if (savedPrependingSrc)
488                 savedPrependingSrc->append(m_src);
489             else
490                 m_pendingSrc.prepend(m_src);
491             setSrc(SegmentedString());
492 
493             // the ref() call below may call notifyFinished if the script is already in cache,
494             // and that mucks with the state directly, so we must write it back to the object.
495             m_state = state;
496             bool savedRequestingScript = m_requestingScript;
497             m_requestingScript = true;
498             cs->addClient(this);
499             m_requestingScript = savedRequestingScript;
500             state = m_state;
501             // will be 0 if script was already loaded and ref() executed it
502             if (!m_pendingScripts.isEmpty())
503                 state.setLoadingExtScript(true);
504         } else if (!m_fragment && doScriptExec) {
                // Inline script: park the remaining input, then execute immediately.
505             if (!m_executingScript)
506                 m_pendingSrc.prepend(m_src);
507             else
508                 prependingSrc = m_src;
509             setSrc(SegmentedString());
510             state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
511         }
512     }
513 
514 #ifdef ANDROID_INSTRUMENT
515     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
516 #endif
517 
        // Nothing is blocking the tokenizer any more: resume with the parked input.
518     if (!m_executingScript && !state.loadingExtScript()) {
519         m_src.append(m_pendingSrc);
520         m_pendingSrc.clear();
521     } else if (!prependingSrc.isEmpty()) {
522         // restore first so that the write appends in the right place
523         // (does not hurt to do it again below)
524         m_currentPrependingSrc = savedPrependingSrc;
525 
526         // we need to do this slightly modified bit of one of the write() cases
527         // because we want to prepend to m_pendingSrc rather than appending
528         // if there's no previous prependingSrc
529         if (!m_pendingScripts.isEmpty()) {
530             if (m_currentPrependingSrc)
531                 m_currentPrependingSrc->append(prependingSrc);
532             else
533                 m_pendingSrc.prepend(prependingSrc);
534         } else {
535             m_state = state;
536             write(prependingSrc, false);
537             state = m_state;
538         }
539     }
540 
541 
542 #if PRELOAD_SCANNER_ENABLED
        // Still waiting on an external script: start the preload scanner (created on
        // first use) on the parked input, unless a scan is already in progress.
543     if (!m_pendingScripts.isEmpty() && !m_executingScript) {
544         if (!m_preloadScanner)
545             m_preloadScanner.set(new PreloadScanner(m_doc));
546         if (!m_preloadScanner->inProgress()) {
547             m_preloadScanner->begin();
548             m_preloadScanner->write(m_pendingSrc);
549         }
550     }
551 #endif
552     m_currentPrependingSrc = savedPrependingSrc;
553 
554     return state;
555 }
556 
// Executes |sourceCode| through the frame's loader and then deals with any source
// collected in the local prepending buffer during execution.
//
// Does nothing (returns |state| unchanged) for fragment tokenizers or when the
// document has no frame. m_executingScript counts the nesting depth of executions;
// m_state is synchronized around every call that can re-enter the tokenizer.
// Returns the updated state, with allowYield set.
scriptExecution(const ScriptSourceCode & sourceCode,State state)557 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
558 {
559     if (m_fragment || !m_doc->frame())
560         return state;
561     m_executingScript++;
562 
        // Install a local buffer as the current prepending source for this execution;
        // the previous one is restored before returning.
563     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
564     SegmentedString prependingSrc;
565     m_currentPrependingSrc = &prependingSrc;
566 
567 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
568     if (!m_doc->ownerElement())
569         printf("beginning script execution at %d\n", m_doc->elapsedTime());
570 #endif
571 
        // The script may touch m_state directly, so publish the local copy first and
        // read it back afterwards.
572     m_state = state;
573     m_doc->frame()->loader()->executeScript(sourceCode);
574     state = m_state;
575 
576     state.setAllowYield(true);
577 
578 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
579     if (!m_doc->ownerElement())
580         printf("ending script execution at %d\n", m_doc->elapsedTime());
581 #endif
582 
583     m_executingScript--;
584 
        // Outermost execution finished and no external script is loading: whatever was
        // collected goes in front of the parked input, and all of it becomes the
        // tokenizer's source again.
585     if (!m_executingScript && !state.loadingExtScript()) {
586         m_pendingSrc.prepend(prependingSrc);
587         m_src.append(m_pendingSrc);
588         m_pendingSrc.clear();
589     } else if (!prependingSrc.isEmpty()) {
590         // restore first so that the write appends in the right place
591         // (does not hurt to do it again below)
592         m_currentPrependingSrc = savedPrependingSrc;
593 
594         // we need to do this slightly modified bit of one of the write() cases
595         // because we want to prepend to m_pendingSrc rather than appending
596         // if there's no previous prependingSrc
597         if (!m_pendingScripts.isEmpty()) {
598             if (m_currentPrependingSrc)
599                 m_currentPrependingSrc->append(prependingSrc);
600             else
601                 m_pendingSrc.prepend(prependingSrc);
602 
603 #if PRELOAD_SCANNER_ENABLED
604             // We are stuck waiting for another script. Lets check the source that
605             // was just document.write()n for anything to load.
606             PreloadScanner documentWritePreloadScanner(m_doc);
607             documentWritePreloadScanner.begin();
608             documentWritePreloadScanner.write(prependingSrc);
609             documentWritePreloadScanner.end();
610 #endif
611         } else {
                // No external script is pending: tokenize the collected source right away.
612             m_state = state;
613             write(prependingSrc, false);
614             state = m_state;
615         }
616     }
617 
618     m_currentPrependingSrc = savedPrependingSrc;
619 
620     return state;
621 }
622 
parseComment(SegmentedString & src,State state)623 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
624 {
625     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
626     checkScriptBuffer(src.length());
627     while (!src.isEmpty()) {
628         UChar ch = *src;
629         m_scriptCode[m_scriptCodeSize++] = ch;
630         if (ch == '>') {
631             bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());
632             int endCharsCount = 1; // start off with one for the '>' character
633             if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {
634                 endCharsCount = 3;
635             } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&
636                 m_scriptCode[m_scriptCodeSize-2] == '!') {
637                 // Other browsers will accept --!> as a close comment, even though it's
638                 // not technically valid.
639                 endCharsCount = 4;
640             }
641             if (handleBrokenComments || endCharsCount > 1) {
642                 src.advancePastNonNewline();
643                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
644                     checkScriptBuffer();
645                     m_scriptCode[m_scriptCodeSize] = 0;
646                     m_scriptCode[m_scriptCodeSize + 1] = 0;
647                     m_currentToken.tagName = commentAtom;
648                     m_currentToken.beginTag = true;
649                     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
650                     processToken();
651                     m_currentToken.tagName = commentAtom;
652                     m_currentToken.beginTag = false;
653                     processToken();
654                     m_scriptCodeSize = 0;
655                 }
656                 state.setInComment(false);
657                 return state; // Finished parsing comment
658             }
659         }
660         src.advance(m_lineNumber);
661     }
662 
663     return state;
664 }
665 
parseServer(SegmentedString & src,State state)666 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
667 {
668     checkScriptBuffer(src.length());
669     while (!src.isEmpty()) {
670         UChar ch = *src;
671         m_scriptCode[m_scriptCodeSize++] = ch;
672         if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {
673             src.advancePastNonNewline();
674             state.setInServer(false);
675             m_scriptCodeSize = 0;
676             return state; // Finished parsing server include
677         }
678         src.advance(m_lineNumber);
679     }
680     return state;
681 }
682 
parseProcessingInstruction(SegmentedString & src,State state)683 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)
684 {
685     UChar oldchar = 0;
686     while (!src.isEmpty()) {
687         UChar chbegin = *src;
688         if (chbegin == '\'')
689             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
690         else if (chbegin == '\"')
691             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
692         // Look for '?>'
693         // Some crappy sites omit the "?" before it, so
694         // we look for an unquoted '>' instead. (IE compatible)
695         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
696             // We got a '?>' sequence
697             state.setInProcessingInstruction(false);
698             src.advancePastNonNewline();
699             state.setDiscardLF(true);
700             return state; // Finished parsing comment!
701         }
702         src.advance(m_lineNumber);
703         oldchar = chbegin;
704     }
705 
706     return state;
707 }
708 
parseText(SegmentedString & src,State state)709 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
710 {
711     while (!src.isEmpty()) {
712         UChar cc = *src;
713 
714         if (state.skipLF()) {
715             state.setSkipLF(false);
716             if (cc == '\n') {
717                 src.advancePastNewline(m_lineNumber);
718                 continue;
719             }
720         }
721 
722         // do we need to enlarge the buffer?
723         checkBuffer();
724 
725         if (cc == '\r') {
726             state.setSkipLF(true);
727             *m_dest++ = '\n';
728         } else
729             *m_dest++ = cc;
730         src.advance(m_lineNumber);
731     }
732 
733     return state;
734 }
735 
736 
// Decodes one HTML character reference ("entity") from src. The leading '&' is
// expected to have been consumed already: the callers in parseTag() advance past it
// first, and this function re-emits '&' itself whenever it writes the sequence out
// as literal text.
//
// The scan is a state machine keyed on state.entityState():
//   SearchEntity    -> '#' selects NumericSearch, anything else selects EntityName.
//   NumericSearch   -> 'x'/'X' selects Hexadecimal, a digit selects Decimal,
//                      anything else goes straight to SearchSemicolon.
//   Hexadecimal /
//   Decimal         -> accumulate digits into EntityUnicodeValue.
//   EntityName      -> buffer up to 9 ASCII alphanumerics, look the name up with
//                      findEntity(), then fall through into SearchSemicolon.
//   SearchSemicolon -> emit the result and set the entity state to NoEntity.
//
// Parameters:
//   src         Input stream. Consumed characters are removed from it; outside
//               view-source mode the decoded character(s) are handed to src.push().
//   dest        Output cursor; advanced past anything written as literal text.
//   state       Tokenizer state, taken by value; the updated copy is returned.
//   cBufferPos  Number of characters of this entity currently held in m_cBuffer.
//   start       True to begin a new entity (resets cBufferPos and EntityUnicodeValue).
//   parsingTag  Enables the "be IE compatible" rule applied in the EntityName case.
//
// Returns the updated state. If src becomes empty before the entity is resolved the
// loop exits with an intermediate entity state (not NoEntity) still set.
// NOTE(review): presumably a later call with start == false resumes from that state;
// the resuming caller is not visible in this part of the file.
parseEntity(SegmentedString & src,UChar * & dest,State state,unsigned & cBufferPos,bool start,bool parsingTag)737 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
738 {
        // Starting a new entity: discard anything buffered and clear the accumulated value.
739     if (start) {
740         cBufferPos = 0;
741         state.setEntityState(SearchEntity);
742         EntityUnicodeValue = 0;
743     }
744 
745     while (!src.isEmpty()) {
746         UChar cc = *src;
747         switch (state.entityState()) {
748         case NoEntity:
                // Being here at all is a caller error: the assertion below can never hold
                // inside this case, so it always fires in builds where ASSERT is active.
749             ASSERT(state.entityState() != NoEntity);
750             return state;
751 
752         case SearchEntity:
                // '#' introduces a numeric reference and is kept in the buffer so the raw
                // text can be reproduced later. Any other character is left unconsumed
                // for the EntityName state to examine.
753             if (cc == '#') {
754                 m_cBuffer[cBufferPos++] = cc;
755                 src.advancePastNonNewline();
756                 state.setEntityState(NumericSearch);
757             } else
758                 state.setEntityState(EntityName);
759             break;
760 
761         case NumericSearch:
                // Only the 'x'/'X' marker is consumed here; a decimal digit (or any other
                // character) is left in src for the next state.
762             if (cc == 'x' || cc == 'X') {
763                 m_cBuffer[cBufferPos++] = cc;
764                 src.advancePastNonNewline();
765                 state.setEntityState(Hexadecimal);
766             } else if (cc >= '0' && cc <= '9')
767                 state.setEntityState(Decimal);
768             else
769                 state.setEntityState(SearchSemicolon);
770             break;
771 
772         case Hexadecimal: {
                // The buffered text is capped at 10 characters in this state. ll is the
                // number of characters that may be examined in this pass: whatever is
                // available in src, limited by the remaining room under that cap.
773             int ll = min(src.length(), 10 - cBufferPos);
774             while (ll--) {
775                 cc = *src;
776                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
                        // First non-hex character ends the digits; it is not consumed.
                        // This break leaves only the inner while loop.
777                     state.setEntityState(SearchSemicolon);
778                     break;
779                 }
780                 int digit;
781                 if (cc < 'A')
782                     digit = cc - '0';
783                 else
784                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
785                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
786                 m_cBuffer[cBufferPos++] = cc;
787                 src.advancePastNonNewline();
788             }
                // Cap reached: stop collecting digits even if more follow.
789             if (cBufferPos == 10)
790                 state.setEntityState(SearchSemicolon);
791             break;
792         }
793         case Decimal:
794         {
                // Same scheme as Hexadecimal, with the buffered text capped at 9 characters.
795             int ll = min(src.length(), 9-cBufferPos);
796             while (ll--) {
797                 cc = *src;
798 
799                 if (!(cc >= '0' && cc <= '9')) {
800                     state.setEntityState(SearchSemicolon);
801                     break;
802                 }
803 
804                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
805                 m_cBuffer[cBufferPos++] = cc;
806                 src.advancePastNonNewline();
807             }
808             if (cBufferPos == 9)
809                 state.setEntityState(SearchSemicolon);
810             break;
811         }
812         case EntityName:
813         {
                // Collect the name: ASCII letters and digits only, at most 9 characters
                // in total. The first other character ends the name and is not consumed.
814             int ll = min(src.length(), 9-cBufferPos);
815             while (ll--) {
816                 cc = *src;
817 
818                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
819                     state.setEntityState(SearchSemicolon);
820                     break;
821                 }
822 
823                 m_cBuffer[cBufferPos++] = cc;
824                 src.advancePastNonNewline();
825             }
826             if (cBufferPos == 9)
827                 state.setEntityState(SearchSemicolon);
                // The name is complete only once the state has become SearchSemicolon.
                // Otherwise src ran out mid-name and the else branch further down leaves
                // the switch without resolving anything.
828             if (state.entityState() == SearchSemicolon) {
                    // Names of fewer than two characters are never looked up.
829                 if (cBufferPos > 1) {
830                     // Since the maximum length of entity name is 9,
831                     // so a single char array which is allocated on
832                     // the stack, its length is 10, should be OK.
833                     // Also if we have an illegal character, we treat it
834                     // as illegal entity name.
835                     unsigned testedEntityNameLen = 0;
836                     char tmpEntityNameBuffer[10];
837 
838                     ASSERT(cBufferPos < 10);
                        // Narrow the UChar buffer to char, stopping early at any code
                        // unit above 0x7e; stopping early means no lookup is attempted.
839                     for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
840                         if (m_cBuffer[testedEntityNameLen] > 0x7e)
841                             break;
842                         tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
843                     }
844 
845                     const Entity *e;
846 
                        // findEntity() is defined elsewhere; it is passed the name and its
                        // length (the buffer is not NUL-terminated here) and its result is
                        // treated as null when nothing matched.
847                     if (testedEntityNameLen == cBufferPos)
848                         e = findEntity(tmpEntityNameBuffer, cBufferPos);
849                     else
850                         e = 0;
851 
852                     if (e)
853                         EntityUnicodeValue = e->code;
854 
                        // Inside a tag, a named entity whose value is above 255 is accepted
                        // only when it is immediately followed by ';'.
                        // NOTE(review): *src is read here although the loop above may have
                        // consumed the last available character (the cBufferPos == 9 path);
                        // confirm that dereferencing an empty SegmentedString is safe.
855                     // be IE compatible
856                     if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
857                         EntityUnicodeValue = 0;
858                 }
859             }
860             else
861                 break;
                // No break before the next case label: a completed name deliberately
                // falls through into SearchSemicolon.
862         }
863         case SearchSemicolon:
864             // Don't allow values that are more than 21 bits.
865             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
866                 if (!inViewSourceMode()) {
                        // The terminating ';' is optional; consume it when present.
                        // NOTE(review): as above, src may be empty when this is reached
                        // through the EntityName fallthrough; confirm *src is safe then.
867                     if (*src == ';')
868                         src.advancePastNonNewline();
                        // The decoded value is handed back to src rather than written to
                        // dest. SegmentedString::push() is not visible here; presumably the
                        // caller then reads the character as ordinary input (TODO confirm).
869                     if (EntityUnicodeValue <= 0xFFFF) {
870                         checkBuffer();
871                         src.push(fixUpChar(EntityUnicodeValue));
872                     } else {
873                         // Convert to UTF-16, using surrogate code points.
874                         checkBuffer(2);
875                         src.push(U16_LEAD(EntityUnicodeValue));
876                         src.push(U16_TRAIL(EntityUnicodeValue));
877                     }
878                 } else {
                        // View-source mode: reproduce the entity text verbatim instead of
                        // decoding it.
879                     // FIXME: We should eventually colorize entities by sending them as a special token.
880                     // 12 bytes required: up to 10 bytes in m_cBuffer plus the
881                     // leading '&' and trailing ';'
882                     checkBuffer(12);
883                     *dest++ = '&';
884                     for (unsigned i = 0; i < cBufferPos; i++)
885                         dest[i] = m_cBuffer[i];
886                     dest += cBufferPos;
887                     if (*src == ';') {
888                         *dest++ = ';';
889                         src.advancePastNonNewline();
890                     }
891                 }
892             } else {
                    // Unknown name, zero, or out-of-range value: not an entity after all.
                    // A following ';' is left in src untouched.
893                 // 11 bytes required: up to 10 bytes in m_cBuffer plus the
894                 // leading '&'
895                 checkBuffer(11);
896                 // ignore the sequence, add it to the buffer as plaintext
897                 *dest++ = '&';
898                 for (unsigned i = 0; i < cBufferPos; i++)
899                     dest[i] = m_cBuffer[i];
900                 dest += cBufferPos;
901             }
902 
                // The entity has been fully handled, one way or another.
903             state.setEntityState(NoEntity);
904             return state;
905         }
906     }
907 
        // src is exhausted; whatever entity state is current is returned to the caller.
908     return state;
909 }
910 
// Consumes the remainder of a "<!DOCTYPE" declaration from src, examining one
// character per loop iteration, until either the closing '>' has been consumed
// (state.setInDoctype(false)) or src is empty.
//
// The position within the declaration is kept in m_doctypeToken.state() rather than
// in a local, and the pieces recognised so far are collected in m_doctypeToken
// (m_name, m_publicID, m_systemID). In view-source mode every consumed character
// except the terminating '>' is also appended to m_doctypeToken.m_source.
//
// processDoctypeToken() (defined elsewhere) is called when a well-formed doctype
// ends. For malformed or bogus doctypes it is called only in view-source mode, with
// the one exception flagged in the DoctypeBeforeSystemID case below.
//
// Two advance flavours are used throughout: src.advancePastNonNewline() where the
// current character is known not to be a newline, and src.advance(m_lineNumber)
// where it may be one (presumably so the line counter stays correct; TODO confirm
// against SegmentedString).
//
// Parameters:
//   src    Input stream positioned just after the "<!DOCTYPE" keyword.
//   state  Tokenizer state, taken by value; must have inDoctype() set.
//
// Returns the updated state; inDoctype() is cleared once '>' has been consumed.
parseDoctype(SegmentedString & src,State state)911 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
912 {
913     ASSERT(state.inDoctype());
914     while (!src.isEmpty() && state.inDoctype()) {
915         UChar c = *src;
916         bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
917         switch (m_doctypeToken.state()) {
918             case DoctypeBegin: {
                    // Always move on to DoctypeBeforeName. A whitespace character is
                    // consumed here; anything else is left for the next state.
919                 m_doctypeToken.setState(DoctypeBeforeName);
920                 if (isWhitespace) {
921                     src.advance(m_lineNumber);
922                     if (inViewSourceMode())
923                         m_doctypeToken.m_source.append(c);
924                 }
925                 break;
926             }
927             case DoctypeBeforeName: {
                    // Skip whitespace before the name. The first other character starts
                    // the name and is not consumed here.
928                 if (c == '>') {
929                     // Malformed.  Just exit.
930                     src.advancePastNonNewline();
931                     state.setInDoctype(false);
932                     if (inViewSourceMode())
933                         processDoctypeToken();
934                 } else if (isWhitespace) {
935                     src.advance(m_lineNumber);
936                     if (inViewSourceMode())
937                         m_doctypeToken.m_source.append(c);
938                 } else
939                     m_doctypeToken.setState(DoctypeName);
940                 break;
941             }
942             case DoctypeName: {
                    // Accumulate the name until whitespace or '>'.
943                 if (c == '>') {
944                     // Valid doctype. Emit it.
945                     src.advancePastNonNewline();
946                     state.setInDoctype(false);
947                     processDoctypeToken();
948                 } else if (isWhitespace) {
                        // End of the name: reset both keyword-match counters before
                        // looking for PUBLIC or SYSTEM.
949                     m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
950                     m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
951                     m_doctypeToken.setState(DoctypeAfterName);
952                     src.advance(m_lineNumber);
953                     if (inViewSourceMode())
954                         m_doctypeToken.m_source.append(c);
955                 } else {
956                     src.advancePastNonNewline();
957                     m_doctypeToken.m_name.append(c);
958                     if (inViewSourceMode())
959                         m_doctypeToken.m_source.append(c);
960                 }
961                 break;
962             }
963             case DoctypeAfterName: {
                    // Match the keyword one character at a time, case-insensitively,
                    // against publicStart and systemStart (both defined elsewhere). The
                    // counters hold how many characters of each keyword have matched.
964                 if (c == '>') {
965                     // Valid doctype. Emit it.
966                     src.advancePastNonNewline();
967                     state.setInDoctype(false);
968                     processDoctypeToken();
969                 } else if (!isWhitespace) {
970                     src.advancePastNonNewline();
971                     if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
972                         m_doctypeSearchCount++;
973                         if (m_doctypeSearchCount == 6)
974                             // Found 'PUBLIC' sequence
975                             m_doctypeToken.setState(DoctypeBeforePublicID);
976                     } else if (m_doctypeSearchCount > 0) {
                            // A partially matched PUBLIC was broken off: bogus doctype.
977                         m_doctypeSearchCount = 0;
978                         m_doctypeToken.setState(DoctypeBogus);
979                     } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
980                         m_doctypeSecondarySearchCount++;
981                         if (m_doctypeSecondarySearchCount == 6)
982                             // Found 'SYSTEM' sequence
983                             m_doctypeToken.setState(DoctypeBeforeSystemID);
984                     } else {
                            // Matches neither keyword at the current position: bogus doctype.
985                         m_doctypeSecondarySearchCount = 0;
986                         m_doctypeToken.setState(DoctypeBogus);
987                     }
988                     if (inViewSourceMode())
989                         m_doctypeToken.m_source.append(c);
990                 } else {
991                     src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
992                     if (inViewSourceMode())
993                         m_doctypeToken.m_source.append(c);
994                 }
995                 break;
996             }
997             case DoctypeBeforePublicID: {
                    // Expect the opening quote of the public identifier. tquote records
                    // which quote opened it so that only the same quote can close it.
998                 if (c == '\"' || c == '\'') {
999                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1000                     m_doctypeToken.setState(DoctypePublicID);
1001                     src.advancePastNonNewline();
1002                     if (inViewSourceMode())
1003                         m_doctypeToken.m_source.append(c);
1004                 } else if (c == '>') {
1005                     // Considered bogus.  Don't process the doctype.
1006                     src.advancePastNonNewline();
1007                     state.setInDoctype(false);
1008                     if (inViewSourceMode())
1009                         processDoctypeToken();
1010                 } else if (isWhitespace) {
1011                     src.advance(m_lineNumber);
1012                     if (inViewSourceMode())
1013                         m_doctypeToken.m_source.append(c);
1014                 } else
                         // Unexpected character; it is left in src for DoctypeBogus to consume.
1015                     m_doctypeToken.setState(DoctypeBogus);
1016                 break;
1017             }
1018             case DoctypePublicID: {
                     // Inside the quoted public identifier: the matching quote ends it,
                     // '>' aborts the doctype, and every other character is appended to
                     // m_publicID.
1019                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1020                     src.advancePastNonNewline();
1021                     m_doctypeToken.setState(DoctypeAfterPublicID);
1022                     if (inViewSourceMode())
1023                         m_doctypeToken.m_source.append(c);
1024                 } else if (c == '>') {
1025                      // Considered bogus.  Don't process the doctype.
1026                     src.advancePastNonNewline();
1027                     state.setInDoctype(false);
1028                     if (inViewSourceMode())
1029                         processDoctypeToken();
1030                 } else {
1031                     m_doctypeToken.m_publicID.append(c);
1032                     src.advance(m_lineNumber);
1033                     if (inViewSourceMode())
1034                         m_doctypeToken.m_source.append(c);
1035                 }
1036                 break;
1037             }
1038             case DoctypeAfterPublicID:
                     // After the public identifier an optional quoted system identifier
                     // may follow; '>' ends a valid doctype.
1039                 if (c == '\"' || c == '\'') {
1040                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1041                     m_doctypeToken.setState(DoctypeSystemID);
1042                     src.advancePastNonNewline();
1043                     if (inViewSourceMode())
1044                         m_doctypeToken.m_source.append(c);
1045                 } else if (c == '>') {
1046                     // Valid doctype. Emit it now.
1047                     src.advancePastNonNewline();
1048                     state.setInDoctype(false);
1049                     processDoctypeToken();
1050                 } else if (isWhitespace) {
1051                     src.advance(m_lineNumber);
1052                     if (inViewSourceMode())
1053                         m_doctypeToken.m_source.append(c);
1054                 } else
1055                     m_doctypeToken.setState(DoctypeBogus);
1056                 break;
1057             case DoctypeBeforeSystemID:
                     // Reached after the SYSTEM keyword: expect the opening quote of the
                     // system identifier.
1058                 if (c == '\"' || c == '\'') {
1059                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
1060                     m_doctypeToken.setState(DoctypeSystemID);
1061                     src.advancePastNonNewline();
1062                     if (inViewSourceMode())
1063                         m_doctypeToken.m_source.append(c);
1064                 } else if (c == '>') {
                         // NOTE(review): every other malformed/bogus '>' exit in this
                         // function calls processDoctypeToken() when inViewSourceMode();
                         // this branch never does. Confirm whether that is intentional.
1065                     // Considered bogus.  Don't process the doctype.
1066                     src.advancePastNonNewline();
1067                     state.setInDoctype(false);
1068                 } else if (isWhitespace) {
1069                     src.advance(m_lineNumber);
1070                     if (inViewSourceMode())
1071                         m_doctypeToken.m_source.append(c);
1072                 } else
1073                     m_doctypeToken.setState(DoctypeBogus);
1074                 break;
1075             case DoctypeSystemID:
                     // Inside the quoted system identifier; same scheme as DoctypePublicID,
                     // collecting into m_systemID.
1076                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1077                     src.advancePastNonNewline();
1078                     m_doctypeToken.setState(DoctypeAfterSystemID);
1079                     if (inViewSourceMode())
1080                         m_doctypeToken.m_source.append(c);
1081                 } else if (c == '>') {
1082                      // Considered bogus.  Don't process the doctype.
1083                     src.advancePastNonNewline();
1084                     state.setInDoctype(false);
1085                     if (inViewSourceMode())
1086                         processDoctypeToken();
1087                 } else {
1088                     m_doctypeToken.m_systemID.append(c);
1089                     src.advance(m_lineNumber);
1090                     if (inViewSourceMode())
1091                         m_doctypeToken.m_source.append(c);
1092                 }
1093                 break;
1094             case DoctypeAfterSystemID:
                     // Only whitespace or the closing '>' may follow the system identifier.
1095                 if (c == '>') {
1096                     // Valid doctype. Emit it now.
1097                     src.advancePastNonNewline();
1098                     state.setInDoctype(false);
1099                     processDoctypeToken();
1100                 } else if (isWhitespace) {
1101                     src.advance(m_lineNumber);
1102                     if (inViewSourceMode())
1103                         m_doctypeToken.m_source.append(c);
1104                 } else
1105                     m_doctypeToken.setState(DoctypeBogus);
1106                 break;
1107             case DoctypeBogus:
                     // Error recovery: discard everything up to and including the next '>'.
1108                 if (c == '>') {
1109                     // Done with the bogus doctype.
1110                     src.advancePastNonNewline();
1111                     state.setInDoctype(false);
1112                     if (inViewSourceMode())
1113                        processDoctypeToken();
1114                 } else {
1115                     src.advance(m_lineNumber); // Just keep scanning for '>'
1116                     if (inViewSourceMode())
1117                         m_doctypeToken.m_source.append(c);
1118                 }
1119                 break;
1120             default:
1121                 break;
1122         }
1123     }
1124     return state;
1125 }
1126 
parseTag(SegmentedString & src,State state)1127 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
1128 {
1129     ASSERT(!state.hasEntityState());
1130 
1131     unsigned cBufferPos = m_cBufferPos;
1132 
1133     bool lastIsSlash = false;
1134 
1135     while (!src.isEmpty()) {
1136         checkBuffer();
1137         switch (state.tagState()) {
1138         case NoTag:
1139         {
1140             m_cBufferPos = cBufferPos;
1141             return state;
1142         }
1143         case TagName:
1144         {
1145             if (searchCount > 0) {
1146                 if (*src == commentStart[searchCount]) {
1147                     searchCount++;
1148                     if (searchCount == 2)
1149                         m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
1150                     else
1151                         m_doctypeSearchCount = 0;
1152                     if (searchCount == 4) {
1153                         // Found '<!--' sequence
1154                         src.advancePastNonNewline();
1155                         m_dest = m_buffer; // ignore the previous part of this tag
1156                         state.setInComment(true);
1157                         state.setTagState(NoTag);
1158 
1159                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
1160                         // <!--> as a valid comment, since both mozilla and IE on windows
1161                         // can handle this case.  Only do this in quirks mode. -dwh
1162                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
1163                             state.setInComment(false);
1164                             src.advancePastNonNewline();
1165                             if (!src.isEmpty())
1166                                 m_cBuffer[cBufferPos++] = *src;
1167                         } else
1168                           state = parseComment(src, state);
1169 
1170                         m_cBufferPos = cBufferPos;
1171                         return state; // Finished parsing tag!
1172                     }
1173                     m_cBuffer[cBufferPos++] = *src;
1174                     src.advancePastNonNewline();
1175                     break;
1176                 } else
1177                     searchCount = 0; // Stop looking for '<!--' sequence
1178             }
1179 
1180             if (m_doctypeSearchCount > 0) {
1181                 if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
1182                     m_doctypeSearchCount++;
1183                     m_cBuffer[cBufferPos++] = *src;
1184                     src.advancePastNonNewline();
1185                     if (m_doctypeSearchCount == 9) {
1186                         // Found '<!DOCTYPE' sequence
1187                         state.setInDoctype(true);
1188                         state.setTagState(NoTag);
1189                         m_doctypeToken.reset();
1190                         if (inViewSourceMode())
1191                             m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
1192                         state = parseDoctype(src, state);
1193                         m_cBufferPos = cBufferPos;
1194                         return state;
1195                     }
1196                     break;
1197                 } else
1198                     m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1199             }
1200 
1201             bool finish = false;
1202             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
1203             while (ll--) {
1204                 UChar curchar = *src;
1205                 if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
1206                     finish = true;
1207                     break;
1208                 }
1209 
1210                 // tolower() shows up on profiles. This is faster!
1211                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1212                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1213                 else
1214                     m_cBuffer[cBufferPos++] = curchar;
1215                 src.advancePastNonNewline();
1216             }
1217 
1218             // Disadvantage: we add the possible rest of the tag
1219             // as attribute names. ### judge if this causes problems
1220             if (finish || CBUFLEN == cBufferPos) {
1221                 bool beginTag;
1222                 UChar* ptr = m_cBuffer;
1223                 unsigned int len = cBufferPos;
1224                 m_cBuffer[cBufferPos] = '\0';
1225                 if ((cBufferPos > 0) && (*ptr == '/')) {
1226                     // End Tag
1227                     beginTag = false;
1228                     ptr++;
1229                     len--;
1230                 }
1231                 else
1232                     // Start Tag
1233                     beginTag = true;
1234 
1235                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
1236                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
1237                     ptr[--len] = '\0';
1238 
1239                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
1240                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
1241                 if (ptr[0] != '!' || inViewSourceMode()) {
1242                     m_currentToken.tagName = AtomicString(ptr);
1243                     m_currentToken.beginTag = beginTag;
1244                 }
1245                 m_dest = m_buffer;
1246                 state.setTagState(SearchAttribute);
1247                 cBufferPos = 0;
1248             }
1249             break;
1250         }
1251         case SearchAttribute:
1252             while (!src.isEmpty()) {
1253                 UChar curchar = *src;
1254                 // In this mode just ignore any quotes we encounter and treat them like spaces.
1255                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
1256                     if (curchar == '<' || curchar == '>')
1257                         state.setTagState(SearchEnd);
1258                     else
1259                         state.setTagState(AttributeName);
1260 
1261                     cBufferPos = 0;
1262                     break;
1263                 }
1264                 if (inViewSourceMode())
1265                     m_currentToken.addViewSourceChar(curchar);
1266                 src.advance(m_lineNumber);
1267             }
1268             break;
1269         case AttributeName:
1270         {
1271             m_rawAttributeBeforeValue.clear();
1272             int ll = min(src.length(), CBUFLEN - cBufferPos);
1273             while (ll--) {
1274                 UChar curchar = *src;
1275                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the
1276                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
1277                 if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
1278                     m_cBuffer[cBufferPos] = '\0';
1279                     m_attrName = AtomicString(m_cBuffer);
1280                     m_dest = m_buffer;
1281                     *m_dest++ = 0;
1282                     state.setTagState(SearchEqual);
1283                     if (inViewSourceMode())
1284                         m_currentToken.addViewSourceChar('a');
1285                     break;
1286                 }
1287 
1288                 // tolower() shows up on profiles. This is faster!
1289                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
1290                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
1291                 else
1292                     m_cBuffer[cBufferPos++] = curchar;
1293 
1294                 m_rawAttributeBeforeValue.append(curchar);
1295                 src.advance(m_lineNumber);
1296             }
1297             if (cBufferPos == CBUFLEN) {
1298                 m_cBuffer[cBufferPos] = '\0';
1299                 m_attrName = AtomicString(m_cBuffer);
1300                 m_dest = m_buffer;
1301                 *m_dest++ = 0;
1302                 state.setTagState(SearchEqual);
1303                 if (inViewSourceMode())
1304                     m_currentToken.addViewSourceChar('a');
1305             }
1306             break;
1307         }
1308         case SearchEqual:
1309             while (!src.isEmpty()) {
1310                 UChar curchar = *src;
1311 
1312                 if (lastIsSlash && curchar == '>') {
1313                     // This is a quirk (with a long sad history).  We have to do this
1314                     // since widgets do <script src="foo.js"/> and expect the tag to close.
1315                     if (m_currentToken.tagName == scriptTag)
1316                         m_currentToken.selfClosingTag = true;
1317                     m_currentToken.brokenXMLStyle = true;
1318                 }
1319 
1320                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
1321                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
1322                     if (curchar == '=') {
1323                         state.setTagState(SearchValue);
1324                         if (inViewSourceMode())
1325                             m_currentToken.addViewSourceChar(curchar);
1326                         m_rawAttributeBeforeValue.append(curchar);
1327                         src.advancePastNonNewline();
1328                     } else {
1329                         m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
1330                         m_dest = m_buffer;
1331                         state.setTagState(SearchAttribute);
1332                         lastIsSlash = false;
1333                     }
1334                     break;
1335                 }
1336 
1337                 lastIsSlash = curchar == '/';
1338 
1339                 if (inViewSourceMode())
1340                     m_currentToken.addViewSourceChar(curchar);
1341                 m_rawAttributeBeforeValue.append(curchar);
1342                 src.advance(m_lineNumber);
1343             }
1344             break;
1345         case SearchValue:
1346             while (!src.isEmpty()) {
1347                 UChar curchar = *src;
1348                 if (!isASCIISpace(curchar)) {
1349                     if (curchar == '\'' || curchar == '\"') {
1350                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1351                         state.setTagState(QuotedValue);
1352                         if (inViewSourceMode())
1353                             m_currentToken.addViewSourceChar(curchar);
1354                         m_rawAttributeBeforeValue.append(curchar);
1355                         src.advancePastNonNewline();
1356                     } else
1357                         state.setTagState(Value);
1358 
1359                     break;
1360                 }
1361                 if (inViewSourceMode())
1362                     m_currentToken.addViewSourceChar(curchar);
1363                 m_rawAttributeBeforeValue.append(curchar);
1364                 src.advance(m_lineNumber);
1365             }
1366             break;
1367         case QuotedValue:
1368             while (!src.isEmpty()) {
1369                 checkBuffer();
1370 
1371                 UChar curchar = *src;
1372                 if (curchar <= '>' && !src.escaped()) {
1373                     if (curchar == '>' && m_attrName.isEmpty()) {
1374                         // Handle a case like <img '>.  Just go ahead and be willing
1375                         // to close the whole tag.  Don't consume the character and
1376                         // just go back into SearchEnd while ignoring the whole
1377                         // value.
1378                         // FIXME: Note that this is actually not a very good solution.
1379                         // It doesn't handle the general case of
1380                         // unmatched quotes among attributes that have names. -dwh
1381                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1382                             m_dest--; // remove trailing newlines
1383                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1384                         if (!attributeValue.contains('/'))
1385                             m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
1386                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1387                         if (inViewSourceMode())
1388                             m_currentToken.addViewSourceChar('x');
1389                         state.setTagState(SearchAttribute);
1390                         m_dest = m_buffer;
1391                         tquote = NoQuote;
1392                         break;
1393                     }
1394 
1395                     if (curchar == '&') {
1396                         src.advancePastNonNewline();
1397                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1398                         break;
1399                     }
1400 
1401                     if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
1402                         // some <input type=hidden> rely on trailing spaces. argh
1403                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
1404                             m_dest--; // remove trailing newlines
1405                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1406                         if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
1407                             m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
1408                             if (inViewSourceMode())
1409                                 m_currentToken.addViewSourceChar('x');
1410                         } else if (inViewSourceMode())
1411                             m_currentToken.addViewSourceChar('v');
1412 
1413                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1414                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1415                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1416                                 attributeValue = blankURL().string();
1417                         }
1418 
1419                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1420                         m_dest = m_buffer;
1421                         state.setTagState(SearchAttribute);
1422                         tquote = NoQuote;
1423                         if (inViewSourceMode())
1424                             m_currentToken.addViewSourceChar(curchar);
1425                         src.advancePastNonNewline();
1426                         break;
1427                     }
1428                 }
1429 
1430                 *m_dest++ = curchar;
1431                 src.advance(m_lineNumber);
1432             }
1433             break;
1434         case Value:
1435             while (!src.isEmpty()) {
1436                 checkBuffer();
1437                 UChar curchar = *src;
1438                 if (curchar <= '>' && !src.escaped()) {
1439                     // parse Entities
1440                     if (curchar == '&') {
1441                         src.advancePastNonNewline();
1442                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
1443                         break;
1444                     }
1445                     // no quotes. Every space means end of value
1446                     // '/' does not delimit in IE!
1447                     if (isASCIISpace(curchar) || curchar == '>') {
1448                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
1449 
1450                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
1451                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
1452                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
1453                                 attributeValue = blankURL().string();
1454                         }
1455 
1456                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
1457                         if (inViewSourceMode())
1458                             m_currentToken.addViewSourceChar('v');
1459                         m_dest = m_buffer;
1460                         state.setTagState(SearchAttribute);
1461                         break;
1462                     }
1463                 }
1464 
1465                 *m_dest++ = curchar;
1466                 src.advance(m_lineNumber);
1467             }
1468             break;
1469         case SearchEnd:
1470         {
1471             while (!src.isEmpty()) {
1472                 UChar ch = *src;
1473                 if (ch == '>' || ch == '<')
1474                     break;
1475                 if (ch == '/')
1476                     m_currentToken.selfClosingTag = true;
1477                 if (inViewSourceMode())
1478                     m_currentToken.addViewSourceChar(ch);
1479                 src.advance(m_lineNumber);
1480             }
1481             if (src.isEmpty())
1482                 break;
1483 
1484             searchCount = 0; // Stop looking for '<!--' sequence
1485             state.setTagState(NoTag);
1486             tquote = NoQuote;
1487 
1488             if (*src != '<')
1489                 src.advance(m_lineNumber);
1490 
1491             if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
1492                 m_cBufferPos = cBufferPos;
1493                 return state;
1494             }
1495 
1496             AtomicString tagName = m_currentToken.tagName;
1497 
1498             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
1499             // compatibility.
1500             bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
1501             bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
1502             if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
1503                 Attribute* a = 0;
1504                 m_scriptTagSrcAttrValue = String();
1505                 m_scriptTagCharsetAttrValue = String();
1506                 if (m_currentToken.attrs && !m_fragment) {
1507                     if (m_doc->frame() && m_doc->frame()->script()->isEnabled()) {
1508                         if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
1509                             m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
1510                     }
1511                 }
1512             }
1513 
1514             RefPtr<Node> n = processToken();
1515             m_cBufferPos = cBufferPos;
1516             if (n || inViewSourceMode()) {
1517                 State savedState = state;
1518                 SegmentedString savedSrc = src;
1519                 long savedLineno = m_lineNumber;
1520                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
1521                     if (beginTag)
1522                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
1523                 } else if (tagName == scriptTag) {
1524                     ASSERT(!m_scriptNode);
1525                     m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
1526                     if (m_scriptNode)
1527                         m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
1528                     if (beginTag) {
1529                         m_searchStopper = scriptEnd;
1530                         m_searchStopperLength = 8;
1531                         state.setInScript(true);
1532                         state = parseNonHTMLText(src, state);
1533                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
1534                         state.setInScript(true);
1535                         state = scriptHandler(state);
1536                     }
1537                 } else if (tagName == styleTag) {
1538                     if (beginTag) {
1539                         m_searchStopper = styleEnd;
1540                         m_searchStopperLength = 7;
1541                         state.setInStyle(true);
1542                         state = parseNonHTMLText(src, state);
1543                     }
1544                 } else if (tagName == textareaTag) {
1545                     if (beginTag) {
1546                         m_searchStopper = textareaEnd;
1547                         m_searchStopperLength = 10;
1548                         state.setInTextArea(true);
1549                         state = parseNonHTMLText(src, state);
1550                     }
1551                 } else if (tagName == titleTag) {
1552                     if (beginTag) {
1553                         m_searchStopper = titleEnd;
1554                         m_searchStopperLength = 7;
1555                         state.setInTitle(true);
1556                         state = parseNonHTMLText(src, state);
1557                     }
1558                 } else if (tagName == xmpTag) {
1559                     if (beginTag) {
1560                         m_searchStopper = xmpEnd;
1561                         m_searchStopperLength = 5;
1562                         state.setInXmp(true);
1563                         state = parseNonHTMLText(src, state);
1564                     }
1565                 } else if (tagName == iframeTag) {
1566                     if (beginTag) {
1567                         m_searchStopper = iframeEnd;
1568                         m_searchStopperLength = 8;
1569                         state.setInIFrame(true);
1570                         state = parseNonHTMLText(src, state);
1571                     }
1572                 }
1573                 if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
1574                     // We just ate the rest of the document as the #text node under the special tag!
1575                     // Reset the state then retokenize without special handling.
1576                     // Let the parser clean up the missing close tag.
1577                     // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
1578                     // at the end of the document unless m_noMoreData is also true. We need
1579                     // to detect this case elsewhere, and save the state somewhere other
1580                     // than a local variable.
1581                     state = savedState;
1582                     src = savedSrc;
1583                     m_lineNumber = savedLineno;
1584                     m_scriptCodeSize = 0;
1585                 }
1586             }
1587             if (tagName == plaintextTag)
1588                 state.setInPlainText(beginTag);
1589             return state; // Finished parsing tag!
1590         }
1591         } // end switch
1592     }
1593     m_cBufferPos = cBufferPos;
1594     return state;
1595 }
1596 
continueProcessing(int & processedCount,double startTime,State & state)1597 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1598 {
1599     // We don't want to be checking elapsed time with every character, so we only check after we've
1600     // processed a certain number of characters.
1601     bool allowedYield = state.allowYield();
1602     state.setAllowYield(false);
1603     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
1604         processedCount = 0;
1605         if (currentTime() - startTime > m_tokenizerTimeDelay) {
1606             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1607                load, but this hurts overall performance on slower machines.  For now turn this
1608                off.
1609             || (!m_doc->haveStylesheetsLoaded() &&
1610                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1611             // Schedule the timer to keep processing as soon as possible.
1612             m_timer.startOneShot(0);
1613 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1614             if (currentTime() - startTime > m_tokenizerTimeDelay)
1615                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1616 #endif
1617             return false;
1618         }
1619     }
1620 
1621     processedCount++;
1622     return true;
1623 }
1624 
write(const SegmentedString & str,bool appendData)1625 void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1626 {
1627     if (!m_buffer)
1628         return;
1629 
1630     if (m_parserStopped)
1631         return;
1632 
1633     SegmentedString source(str);
1634     if (m_executingScript)
1635         source.setExcludeLineNumbers();
1636 
1637     if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
1638         // don't parse; we will do this later
1639         if (m_currentPrependingSrc)
1640             m_currentPrependingSrc->append(source);
1641         else {
1642             m_pendingSrc.append(source);
1643 #if PRELOAD_SCANNER_ENABLED
1644             if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1645                 m_preloadScanner->write(source);
1646 #endif
1647         }
1648         return;
1649     }
1650 
1651 
1652 #if PRELOAD_SCANNER_ENABLED
1653     if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1654         m_preloadScanner->end();
1655 #endif
1656 
1657     if (!m_src.isEmpty())
1658         m_src.append(source);
1659     else
1660         setSrc(source);
1661 
1662     // Once a timer is set, it has control of when the tokenizer continues.
1663     if (m_timer.isActive())
1664         return;
1665 
1666     bool wasInWrite = m_inWrite;
1667     m_inWrite = true;
1668 
1669 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1670     if (!m_doc->ownerElement())
1671         printf("Beginning write at time %d\n", m_doc->elapsedTime());
1672 #endif
1673 
1674     int processedCount = 0;
1675     double startTime = currentTime();
1676 #ifdef ANDROID_INSTRUMENT
1677     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
1678 #endif
1679 
1680     Frame* frame = m_doc->frame();
1681 
1682     State state = m_state;
1683 
1684     while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
1685         if (!continueProcessing(processedCount, startTime, state))
1686             break;
1687 
1688         // do we need to enlarge the buffer?
1689         checkBuffer();
1690 
1691         UChar cc = *m_src;
1692 
1693         bool wasSkipLF = state.skipLF();
1694         if (wasSkipLF)
1695             state.setSkipLF(false);
1696 
1697         if (wasSkipLF && (cc == '\n'))
1698             m_src.advance();
1699         else if (state.needsSpecialWriteHandling()) {
1700             // it's important to keep needsSpecialWriteHandling with the flags this block tests
1701             if (state.hasEntityState())
1702                 state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
1703             else if (state.inPlainText())
1704                 state = parseText(m_src, state);
1705             else if (state.inAnyNonHTMLText())
1706                 state = parseNonHTMLText(m_src, state);
1707             else if (state.inComment())
1708                 state = parseComment(m_src, state);
1709             else if (state.inDoctype())
1710                 state = parseDoctype(m_src, state);
1711             else if (state.inServer())
1712                 state = parseServer(m_src, state);
1713             else if (state.inProcessingInstruction())
1714                 state = parseProcessingInstruction(m_src, state);
1715             else if (state.hasTagState())
1716                 state = parseTag(m_src, state);
1717             else if (state.startTag()) {
1718                 state.setStartTag(false);
1719 
1720                 switch (cc) {
1721                 case '/':
1722                     break;
1723                 case '!': {
1724                     // <!-- comment --> or <!DOCTYPE ...>
1725                     searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
1726                     m_doctypeSearchCount = 1;
1727                     break;
1728                 }
1729                 case '?': {
1730                     // xml processing instruction
1731                     state.setInProcessingInstruction(true);
1732                     tquote = NoQuote;
1733                     state = parseProcessingInstruction(m_src, state);
1734                     continue;
1735 
1736                     break;
1737                 }
1738                 case '%':
1739                     if (!m_brokenServer) {
1740                         // <% server stuff, handle as comment %>
1741                         state.setInServer(true);
1742                         tquote = NoQuote;
1743                         state = parseServer(m_src, state);
1744                         continue;
1745                     }
1746                     // else fall through
1747                 default: {
1748                     if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1749                         // Start of a Start-Tag
1750                     } else {
1751                         // Invalid tag
1752                         // Add as is
1753                         *m_dest = '<';
1754                         m_dest++;
1755                         continue;
1756                     }
1757                 }
1758                 }; // end case
1759 
1760                 processToken();
1761 
1762                 m_cBufferPos = 0;
1763                 state.setTagState(TagName);
1764                 state = parseTag(m_src, state);
1765             }
1766         } else if (cc == '&' && !m_src.escaped()) {
1767             m_src.advancePastNonNewline();
1768             state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
1769         } else if (cc == '<' && !m_src.escaped()) {
1770             m_currentTagStartLineNumber = m_lineNumber;
1771             m_src.advancePastNonNewline();
1772             state.setStartTag(true);
1773             state.setDiscardLF(false);
1774         } else if (cc == '\n' || cc == '\r') {
1775             if (state.discardLF())
1776                 // Ignore this LF
1777                 state.setDiscardLF(false); // We have discarded 1 LF
1778             else {
1779                 // Process this LF
1780                 *m_dest++ = '\n';
1781                 if (cc == '\r' && !m_src.excludeLineNumbers())
1782                     m_lineNumber++;
1783             }
1784 
1785             /* Check for MS-DOS CRLF sequence */
1786             if (cc == '\r')
1787                 state.setSkipLF(true);
1788             m_src.advance(m_lineNumber);
1789         } else {
1790             state.setDiscardLF(false);
1791             *m_dest++ = cc;
1792             m_src.advancePastNonNewline();
1793         }
1794     }
1795 
1796 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1797     if (!m_doc->ownerElement())
1798         printf("Ending write at time %d\n", m_doc->elapsedTime());
1799 #endif
1800 
1801     m_inWrite = wasInWrite;
1802 
1803     m_state = state;
1804 
1805 #ifdef ANDROID_INSTRUMENT
1806     android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
1807 #endif
1808 
1809     if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1810         end(); // this actually causes us to be deleted
1811 }
1812 
stopParsing()1813 void HTMLTokenizer::stopParsing()
1814 {
1815     Tokenizer::stopParsing();
1816     m_timer.stop();
1817 
1818     // The part needs to know that the tokenizer has finished with its data,
1819     // regardless of whether it happened naturally or due to manual intervention.
1820     if (!m_fragment && m_doc->frame())
1821         m_doc->frame()->loader()->tokenizerProcessedData();
1822 }
1823 
processingData() const1824 bool HTMLTokenizer::processingData() const
1825 {
1826     return m_timer.isActive() || m_inWrite;
1827 }
1828 
timerFired(Timer<HTMLTokenizer> *)1829 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1830 {
1831 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1832     if (!m_doc->ownerElement())
1833         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1834 #endif
1835 
1836 #ifdef ANDROID_MOBILE
1837     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay() && !m_doc->extraLayoutDelay()) {
1838 #else
1839     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1840 #endif
1841         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
1842         // timer has higher priority than our timer.
1843         m_timer.startOneShot(0);
1844         return;
1845     }
1846 
1847     // Invoke write() as though more data came in. This might cause us to get deleted.
1848     write(SegmentedString(), true);
1849 }
1850 
1851 void HTMLTokenizer::end()
1852 {
1853     ASSERT(!m_timer.isActive());
1854     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1855 
1856     if (m_buffer) {
1857         // parseTag is using the buffer for different matters
1858         if (!m_state.hasTagState())
1859             processToken();
1860 
1861         fastFree(m_scriptCode);
1862         m_scriptCode = 0;
1863         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1864 
1865         fastFree(m_buffer);
1866         m_buffer = 0;
1867     }
1868 
1869     if (!inViewSourceMode())
1870         m_parser->finished();
1871     else
1872         m_doc->finishedParsing();
1873 }
1874 
1875 void HTMLTokenizer::finish()
1876 {
1877     // do this as long as we don't find matching comment ends
1878     while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
1879         // we've found an unmatched comment start
1880         if (m_state.inComment())
1881             m_brokenComments = true;
1882         else
1883             m_brokenServer = true;
1884         checkScriptBuffer();
1885         m_scriptCode[m_scriptCodeSize] = 0;
1886         m_scriptCode[m_scriptCodeSize + 1] = 0;
1887         int pos;
1888         String food;
1889         if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
1890             food = String(m_scriptCode, m_scriptCodeSize);
1891         else if (m_state.inServer()) {
1892             food = "<";
1893             food.append(m_scriptCode, m_scriptCodeSize);
1894         } else {
1895             pos = find(m_scriptCode, m_scriptCodeSize, '>');
1896             food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
1897         }
1898         fastFree(m_scriptCode);
1899         m_scriptCode = 0;
1900         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1901         m_state.setInComment(false);
1902         m_state.setInServer(false);
1903         if (!food.isEmpty())
1904             write(food, true);
1905     }
1906     // this indicates we will not receive any more data... but if we are waiting on
1907     // an external script to load, we can't finish parsing until that is done
1908     m_noMoreData = true;
1909     if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1910         end(); // this actually causes us to be deleted
1911 }
1912 
1913 PassRefPtr<Node> HTMLTokenizer::processToken()
1914 {
1915     ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
1916     if (scriptController && scriptController->isEnabled())
1917         // FIXME: Why isn't this m_currentScriptTagStartLineNumber?  I suspect this is wrong.
1918         scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
1919     if (m_dest > m_buffer) {
1920         m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
1921         if (m_currentToken.tagName != commentAtom)
1922             m_currentToken.tagName = textAtom;
1923     } else if (m_currentToken.tagName == nullAtom) {
1924         m_currentToken.reset();
1925         if (scriptController)
1926             scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
1927         return 0;
1928     }
1929 
1930     m_dest = m_buffer;
1931 
1932     RefPtr<Node> n;
1933 
1934     if (!m_parserStopped) {
1935         if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
1936             map->shrinkToLength();
1937         if (inViewSourceMode())
1938             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
1939         else
1940             // pass the token over to the parser, the parser DOES NOT delete the token
1941             n = m_parser->parseToken(&m_currentToken);
1942     }
1943     m_currentToken.reset();
1944     if (scriptController)
1945         scriptController->setEventHandlerLineNumber(0);
1946 
1947     return n.release();
1948 }
1949 
1950 void HTMLTokenizer::processDoctypeToken()
1951 {
1952     if (inViewSourceMode())
1953         static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
1954     else
1955         m_parser->parseDoctypeToken(&m_doctypeToken);
1956 }
1957 
// Destructor. Asserts that the tokenizer is not destroyed while m_inWrite is set (write()
// sets it for the duration of its tokenizing loop), then calls reset().
// NOTE(review): reset() is defined outside this section; presumably it releases the
// tokenizer's buffers and pending state - confirm against its definition.
1958 HTMLTokenizer::~HTMLTokenizer()
1959 {
1960     ASSERT(!m_inWrite);
1961     reset();
1962 }
1963 
1964 
1965 void HTMLTokenizer::enlargeBuffer(int len)
1966 {
1967     // Resize policy: Always at least double the size of the buffer each time.
1968     int delta = max(len, m_bufferSize);
1969 
1970     // Check for overflow.
1971     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1972     static const int maxSize = INT_MAX / sizeof(UChar);
1973     if (delta > maxSize - m_bufferSize)
1974         CRASH();
1975 
1976     int newSize = m_bufferSize + delta;
1977     int oldOffset = m_dest - m_buffer;
1978     m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
1979     m_dest = m_buffer + oldOffset;
1980     m_bufferSize = newSize;
1981 }
1982 
1983 void HTMLTokenizer::enlargeScriptBuffer(int len)
1984 {
1985     // Resize policy: Always at least double the size of the buffer each time.
1986     int delta = max(len, m_scriptCodeCapacity);
1987 
1988     // Check for overflow.
1989     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1990     static const int maxSize = INT_MAX / sizeof(UChar);
1991     if (delta > maxSize - m_scriptCodeCapacity)
1992         CRASH();
1993 
1994     int newSize = m_scriptCodeCapacity + delta;
1995     m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
1996     m_scriptCodeCapacity = newSize;
1997 }
1998 
1999 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2000 {
2001     ASSERT(m_doc->haveStylesheetsLoaded());
2002 
2003     if (m_hasScriptsWaitingForStylesheets)
2004         notifyFinished(0);
2005 }
2006 
2007 void HTMLTokenizer::notifyFinished(CachedResource*)
2008 {
2009 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2010     if (!m_doc->ownerElement())
2011         printf("script loaded at %d\n", m_doc->elapsedTime());
2012 #endif
2013 
2014     ASSERT(!m_pendingScripts.isEmpty());
2015 
2016     // Make external scripts wait for external stylesheets.
2017     // FIXME: This needs to be done for inline scripts too.
2018     m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
2019     if (m_hasScriptsWaitingForStylesheets)
2020         return;
2021 
2022     bool finished = false;
2023     while (!finished && m_pendingScripts.first()->isLoaded()) {
2024         CachedScript* cs = m_pendingScripts.first().get();
2025         m_pendingScripts.removeFirst();
2026         ASSERT(cache()->disabled() || cs->accessCount() > 0);
2027 
2028         setSrc(SegmentedString());
2029 
2030         // make sure we forget about the script before we execute the new one
2031         // infinite recursion might happen otherwise
2032         ScriptSourceCode sourceCode(cs);
2033         bool errorOccurred = cs->errorOccurred();
2034         cs->removeClient(this);
2035 
2036         RefPtr<Node> n = m_scriptNode.release();
2037 
2038 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2039         if (!m_doc->ownerElement())
2040             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
2041 #endif
2042 
2043         if (errorOccurred)
2044             n->dispatchEvent(eventNames().errorEvent, true, false);
2045         else {
2046             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
2047                 m_state = scriptExecution(sourceCode, m_state);
2048 #if ENABLE(XHTMLMP)
2049             else
2050                 m_doc->setShouldProcessNoscriptElement(true);
2051 #endif
2052             n->dispatchEvent(eventNames().loadEvent, false, false);
2053         }
2054 
2055         // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
2056         // call above, so test afterwards.
2057         finished = m_pendingScripts.isEmpty();
2058         if (finished) {
2059             ASSERT(!m_hasScriptsWaitingForStylesheets);
2060             m_state.setLoadingExtScript(false);
2061 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
2062             if (!m_doc->ownerElement())
2063                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
2064 #endif
2065         } else if (m_hasScriptsWaitingForStylesheets) {
2066             // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
2067             // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
2068             finished = true;
2069         }
2070 
2071         // 'm_requestingScript' is true when we are called synchronously from
2072         // scriptHandler(). In that case scriptHandler() will take care
2073         // of m_pendingSrc.
2074         if (!m_requestingScript) {
2075             SegmentedString rest = m_pendingSrc;
2076             m_pendingSrc.clear();
2077             write(rest, false);
2078             // we might be deleted at this point, do not access any members.
2079         }
2080     }
2081 }
2082 
// Returns true while the tokenizer state has its loading-external-script flag set.
// notifyFinished() clears that flag once the pending script queue is empty.
2083 bool HTMLTokenizer::isWaitingForScripts() const
2084 {
2085     return m_state.loadingExtScript();
2086 }
2087 
// Replaces the unparsed source held in m_src with `source`; whatever m_src held before
// is overwritten rather than appended to.
2088 void HTMLTokenizer::setSrc(const SegmentedString& source)
2089 {
2090     m_src = source;
2091 }
2092 
2093 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
2094 {
2095     HTMLTokenizer tok(fragment);
2096     tok.setForceSynchronous(true);
2097     tok.write(source, true);
2098     tok.finish();
2099     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
2100 }
2101 
2102 UChar decodeNamedEntity(const char* name)
2103 {
2104     const Entity* e = findEntity(name, strlen(name));
2105     return e ? e->code : 0;
2106 }
2107 
2108 }
2109