1 /*
2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 1999 Lars Knoll (knoll@kde.org)
6 (C) 1999 Antti Koivisto (koivisto@kde.org)
7 (C) 2001 Dirk Mueller (mueller@kde.org)
8 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
9 Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
10 Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
11
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Library General Public
14 License as published by the Free Software Foundation; either
15 version 2 of the License, or (at your option) any later version.
16
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Library General Public License for more details.
21
22 You should have received a copy of the GNU Library General Public License
23 along with this library; see the file COPYING.LIB. If not, write to
24 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25 Boston, MA 02110-1301, USA.
26 */
27 #include "config.h"
28 #include "HTMLTokenizer.h"
29
30 #include "CSSHelper.h"
31 #include "Cache.h"
32 #include "CachedScript.h"
33 #include "DocLoader.h"
34 #include "DocumentFragment.h"
35 #include "EventNames.h"
36 #include "Frame.h"
37 #include "FrameLoader.h"
38 #include "FrameView.h"
39 #include "HTMLElement.h"
40 #include "HTMLNames.h"
41 #include "HTMLParser.h"
42 #include "HTMLScriptElement.h"
43 #include "HTMLViewSourceDocument.h"
44 #include "MappedAttribute.h"
45 #include "Page.h"
46 #include "PreloadScanner.h"
47 #include "ScriptController.h"
48 #include "ScriptSourceCode.h"
49 #include "ScriptValue.h"
50 #include "XSSAuditor.h"
51 #include <wtf/ASCIICType.h>
52 #include <wtf/CurrentTime.h>
53
54 #include "HTMLEntityNames.c"
55
56 #ifdef ANDROID_INSTRUMENT
57 #include "TimeCounter.h"
58 #endif
59
60 #define PRELOAD_SCANNER_ENABLED 1
61 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
62
63 using namespace WTF;
64 using namespace std;
65
66 namespace WebCore {
67
68 using namespace HTMLNames;
69
// Default pacing of the tokenizer: the chunk size is the number of characters the
// tokenizer will process before yielding control, and the time delay bounds how long
// it keeps going before yielding.
#if MOBILE
// The mobile device needs to be responsive, so the tokenizer chunk size is reduced.
static const int defaultTokenizerChunkSize = 256;
// As the chunks are smaller, the tokenizer should not yield for as long a period;
// otherwise it would take way too long to load a page.
static const double defaultTokenizerTimeDelay = 0.300;
#else
static const int defaultTokenizerChunkSize = 4096;
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
static const double defaultTokenizerTimeDelay = 0.500;
#endif
89
// Lower-case marker strings the tokenizer scans for. The "...End" entries are the
// close-tag prefixes (without the trailing '>') of elements whose content is not
// parsed as ordinary HTML.
static const char commentStart[] = "<!--";
static const char doctypeStart[] = "<!doctype";
static const char publicStart[] = "public";
static const char systemStart[] = "system";
static const char scriptEnd[] = "</script";
static const char xmpEnd[] = "</xmp";
static const char styleEnd[] = "</style";
static const char textareaEnd[] = "</textarea";
static const char titleEnd[] = "</title";
static const char iframeEnd[] = "</iframe";
100
101 // Full support for MS Windows extensions to Latin-1.
102 // Technically these extensions should only be activated for pages
103 // marked "windows-1252" or "cp1252", but
104 // in the standard Microsoft way, these extensions infect hundreds of thousands
105 // of web pages. Note that people with non-latin-1 Microsoft extensions
106 // are SOL.
107 //
108 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
109 // http://www.bbsinc.com/iso8859.html
110 // http://www.obviously.com/
111 //
112 // There may be better equivalents
113
114 // We only need this for entities. For non-entity text, we handle this in the text encoding.
115
116 static const UChar windowsLatin1ExtensionArray[32] = {
117 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
118 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
119 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
120 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
121 };
122
fixUpChar(UChar c)123 static inline UChar fixUpChar(UChar c)
124 {
125 if ((c & ~0x1F) != 0x0080)
126 return c;
127 return windowsLatin1ExtensionArray[c - 0x80];
128 }
129
tagMatch(const char * s1,const UChar * s2,unsigned length)130 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
131 {
132 for (unsigned i = 0; i != length; ++i) {
133 unsigned char c1 = s1[i];
134 unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
135 UChar c2 = s2[i];
136 if (c1 != c2 && uc1 != c2)
137 return false;
138 }
139 return true;
140 }
141
addAttribute(AtomicString & attrName,const AtomicString & attributeValue,bool viewSourceMode)142 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)
143 {
144 if (!attrName.isEmpty()) {
145 ASSERT(!attrName.contains('/'));
146 RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);
147 if (!attrs) {
148 attrs = NamedMappedAttrMap::create();
149 attrs->reserveInitialCapacity(10);
150 }
151 attrs->insertAttribute(a.release(), viewSourceMode);
152 }
153
154 attrName = emptyAtom;
155 }
156
157 // ----------------------------------------------------------------------------
158
HTMLTokenizer(HTMLDocument * doc,bool reportErrors)159 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
160 : Tokenizer()
161 , m_buffer(0)
162 , m_scriptCode(0)
163 , m_scriptCodeSize(0)
164 , m_scriptCodeCapacity(0)
165 , m_scriptCodeResync(0)
166 , m_executingScript(0)
167 , m_requestingScript(false)
168 , m_hasScriptsWaitingForStylesheets(false)
169 , m_timer(this, &HTMLTokenizer::timerFired)
170 , m_doc(doc)
171 , m_parser(new HTMLParser(doc, reportErrors))
172 , m_inWrite(false)
173 , m_fragment(false)
174 {
175 begin();
176 }
177
HTMLTokenizer(HTMLViewSourceDocument * doc)178 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
179 : Tokenizer(true)
180 , m_buffer(0)
181 , m_scriptCode(0)
182 , m_scriptCodeSize(0)
183 , m_scriptCodeCapacity(0)
184 , m_scriptCodeResync(0)
185 , m_executingScript(0)
186 , m_requestingScript(false)
187 , m_hasScriptsWaitingForStylesheets(false)
188 , m_timer(this, &HTMLTokenizer::timerFired)
189 , m_doc(doc)
190 , m_parser(0)
191 , m_inWrite(false)
192 , m_fragment(false)
193 {
194 begin();
195 }
196
HTMLTokenizer(DocumentFragment * frag)197 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
198 : m_buffer(0)
199 , m_scriptCode(0)
200 , m_scriptCodeSize(0)
201 , m_scriptCodeCapacity(0)
202 , m_scriptCodeResync(0)
203 , m_executingScript(0)
204 , m_requestingScript(false)
205 , m_hasScriptsWaitingForStylesheets(false)
206 , m_timer(this, &HTMLTokenizer::timerFired)
207 , m_doc(frag->document())
208 , m_parser(new HTMLParser(frag))
209 , m_inWrite(false)
210 , m_fragment(true)
211 {
212 begin();
213 }
214
reset()215 void HTMLTokenizer::reset()
216 {
217 ASSERT(m_executingScript == 0);
218
219 while (!m_pendingScripts.isEmpty()) {
220 CachedScript* cs = m_pendingScripts.first().get();
221 m_pendingScripts.removeFirst();
222 ASSERT(cache()->disabled() || cs->accessCount() > 0);
223 cs->removeClient(this);
224 }
225
226 fastFree(m_buffer);
227 m_buffer = m_dest = 0;
228 m_bufferSize = 0;
229
230 fastFree(m_scriptCode);
231 m_scriptCode = 0;
232 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
233
234 m_timer.stop();
235 m_state.setAllowYield(false);
236 m_state.setForceSynchronous(false);
237
238 m_currentToken.reset();
239 m_doctypeToken.reset();
240 m_doctypeSearchCount = 0;
241 m_doctypeSecondarySearchCount = 0;
242 m_hasScriptsWaitingForStylesheets = false;
243 }
244
begin()245 void HTMLTokenizer::begin()
246 {
247 m_executingScript = 0;
248 m_requestingScript = false;
249 m_hasScriptsWaitingForStylesheets = false;
250 m_state.setLoadingExtScript(false);
251 reset();
252 m_bufferSize = 254;
253 m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
254 m_dest = m_buffer;
255 tquote = NoQuote;
256 searchCount = 0;
257 m_state.setEntityState(NoEntity);
258 m_scriptTagSrcAttrValue = String();
259 m_pendingSrc.clear();
260 m_currentPrependingSrc = 0;
261 m_noMoreData = false;
262 m_brokenComments = false;
263 m_brokenServer = false;
264 m_lineNumber = 0;
265 m_currentScriptTagStartLineNumber = 0;
266 m_currentTagStartLineNumber = 0;
267 m_state.setForceSynchronous(false);
268
269 Page* page = m_doc->page();
270 if (page && page->hasCustomHTMLTokenizerTimeDelay())
271 m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();
272 else
273 m_tokenizerTimeDelay = defaultTokenizerTimeDelay;
274
275 if (page && page->hasCustomHTMLTokenizerChunkSize())
276 m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();
277 else
278 m_tokenizerChunkSize = defaultTokenizerChunkSize;
279 }
280
setForceSynchronous(bool force)281 void HTMLTokenizer::setForceSynchronous(bool force)
282 {
283 m_state.setForceSynchronous(force);
284 }
285
processListing(SegmentedString list,State state)286 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
287 {
288 // This function adds the listing 'list' as
289 // preformatted text-tokens to the token-collection
290 while (!list.isEmpty()) {
291 if (state.skipLF()) {
292 state.setSkipLF(false);
293 if (*list == '\n') {
294 list.advance();
295 continue;
296 }
297 }
298
299 checkBuffer();
300
301 if (*list == '\n' || *list == '\r') {
302 if (state.discardLF())
303 // Ignore this LF
304 state.setDiscardLF(false); // We have discarded 1 LF
305 else
306 *m_dest++ = '\n';
307
308 /* Check for MS-DOS CRLF sequence */
309 if (*list == '\r')
310 state.setSkipLF(true);
311
312 list.advance();
313 } else {
314 state.setDiscardLF(false);
315 *m_dest++ = *list;
316 list.advance();
317 }
318 }
319
320 return state;
321 }
322
parseNonHTMLText(SegmentedString & src,State state)323 HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state)
324 {
325 ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
326 ASSERT(!state.hasTagState());
327 ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1);
328 if (state.inScript() && !m_currentScriptTagStartLineNumber)
329 m_currentScriptTagStartLineNumber = m_lineNumber;
330
331 if (state.inComment())
332 state = parseComment(src, state);
333
334 int lastDecodedEntityPosition = -1;
335 while (!src.isEmpty()) {
336 checkScriptBuffer();
337 UChar ch = *src;
338
339 if (!m_scriptCodeResync && !m_brokenComments &&
340 !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&
341 m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&
342 (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
343 state.setInComment(true);
344 state = parseComment(src, state);
345 continue;
346 }
347 if (m_scriptCodeResync && !tquote && ch == '>') {
348 src.advancePastNonNewline();
349 m_scriptCodeSize = m_scriptCodeResync - 1;
350 m_scriptCodeResync = 0;
351 m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
352 if (state.inScript())
353 state = scriptHandler(state);
354 else {
355 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
356 processToken();
357 if (state.inStyle()) {
358 m_currentToken.tagName = styleTag.localName();
359 m_currentToken.beginTag = false;
360 } else if (state.inTextArea()) {
361 m_currentToken.tagName = textareaTag.localName();
362 m_currentToken.beginTag = false;
363 } else if (state.inTitle()) {
364 m_currentToken.tagName = titleTag.localName();
365 m_currentToken.beginTag = false;
366 } else if (state.inXmp()) {
367 m_currentToken.tagName = xmpTag.localName();
368 m_currentToken.beginTag = false;
369 } else if (state.inIFrame()) {
370 m_currentToken.tagName = iframeTag.localName();
371 m_currentToken.beginTag = false;
372 }
373 processToken();
374 state.setInStyle(false);
375 state.setInScript(false);
376 state.setInTextArea(false);
377 state.setInTitle(false);
378 state.setInXmp(false);
379 state.setInIFrame(false);
380 tquote = NoQuote;
381 m_scriptCodeSize = m_scriptCodeResync = 0;
382 }
383 return state;
384 }
385 // possible end of tagname, lets check.
386 if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
387 m_scriptCodeSize >= m_searchStopperLength &&
388 tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
389 (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
390 m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
391 tquote = NoQuote;
392 continue;
393 }
394 if (m_scriptCodeResync && !state.escaped()) {
395 if (ch == '\"')
396 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
397 else if (ch == '\'')
398 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
399 else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
400 tquote = NoQuote;
401 }
402 state.setEscaped(!state.escaped() && ch == '\\');
403 if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
404 UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
405 src.advancePastNonNewline();
406 state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
407 if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
408 lastDecodedEntityPosition = m_scriptCodeSize;
409 else
410 m_scriptCodeSize = scriptCodeDest - m_scriptCode;
411 } else {
412 m_scriptCode[m_scriptCodeSize++] = ch;
413 src.advance(m_lineNumber);
414 }
415 }
416
417 return state;
418 }
419
scriptHandler(State state)420 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
421 {
422 // We are inside a <script>
423 bool doScriptExec = false;
424 int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenzier line numbers are 0 based
425
426 // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
427 m_currentScriptTagStartLineNumber = 0;
428
429 // (Bugzilla 3837) Scripts following a frameset element should not execute or,
430 // in the case of extern scripts, even load.
431 bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
432
433 CachedScript* cs = 0;
434 // don't load external scripts for standalone documents (for now)
435 if (!inViewSourceMode()) {
436 if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
437 // forget what we just got; load from src url instead
438 if (!m_parser->skipMode() && !followingFrameset) {
439 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
440 if (!m_doc->ownerElement())
441 printf("Requesting script at time %d\n", m_doc->elapsedTime());
442 #endif
443 // The parser might have been stopped by for example a window.close call in an earlier script.
444 // If so, we don't want to load scripts.
445 if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
446 m_pendingScripts.append(cs);
447 else
448 m_scriptNode = 0;
449 } else
450 m_scriptNode = 0;
451 m_scriptTagSrcAttrValue = String();
452 } else {
453 // Parse m_scriptCode containing <script> info
454 doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
455 #if ENABLE(XHTMLMP)
456 if (!doScriptExec)
457 m_doc->setShouldProcessNoscriptElement(true);
458 #endif
459 m_scriptNode = 0;
460 }
461 }
462
463 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
464 RefPtr<Node> node = processToken();
465 String scriptString = node ? node->textContent() : "";
466 m_currentToken.tagName = scriptTag.localName();
467 m_currentToken.beginTag = false;
468 processToken();
469
470 state.setInScript(false);
471 m_scriptCodeSize = m_scriptCodeResync = 0;
472
473 // FIXME: The script should be syntax highlighted.
474 if (inViewSourceMode())
475 return state;
476
477 SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
478 SegmentedString prependingSrc;
479 m_currentPrependingSrc = &prependingSrc;
480
481 #ifdef ANDROID_INSTRUMENT
482 android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
483 #endif
484
485 if (!m_parser->skipMode() && !followingFrameset) {
486 if (cs) {
487 if (savedPrependingSrc)
488 savedPrependingSrc->append(m_src);
489 else
490 m_pendingSrc.prepend(m_src);
491 setSrc(SegmentedString());
492
493 // the ref() call below may call notifyFinished if the script is already in cache,
494 // and that mucks with the state directly, so we must write it back to the object.
495 m_state = state;
496 bool savedRequestingScript = m_requestingScript;
497 m_requestingScript = true;
498 cs->addClient(this);
499 m_requestingScript = savedRequestingScript;
500 state = m_state;
501 // will be 0 if script was already loaded and ref() executed it
502 if (!m_pendingScripts.isEmpty())
503 state.setLoadingExtScript(true);
504 } else if (!m_fragment && doScriptExec) {
505 if (!m_executingScript)
506 m_pendingSrc.prepend(m_src);
507 else
508 prependingSrc = m_src;
509 setSrc(SegmentedString());
510 state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
511 }
512 }
513
514 #ifdef ANDROID_INSTRUMENT
515 android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
516 #endif
517
518 if (!m_executingScript && !state.loadingExtScript()) {
519 m_src.append(m_pendingSrc);
520 m_pendingSrc.clear();
521 } else if (!prependingSrc.isEmpty()) {
522 // restore first so that the write appends in the right place
523 // (does not hurt to do it again below)
524 m_currentPrependingSrc = savedPrependingSrc;
525
526 // we need to do this slightly modified bit of one of the write() cases
527 // because we want to prepend to m_pendingSrc rather than appending
528 // if there's no previous prependingSrc
529 if (!m_pendingScripts.isEmpty()) {
530 if (m_currentPrependingSrc)
531 m_currentPrependingSrc->append(prependingSrc);
532 else
533 m_pendingSrc.prepend(prependingSrc);
534 } else {
535 m_state = state;
536 write(prependingSrc, false);
537 state = m_state;
538 }
539 }
540
541
542 #if PRELOAD_SCANNER_ENABLED
543 if (!m_pendingScripts.isEmpty() && !m_executingScript) {
544 if (!m_preloadScanner)
545 m_preloadScanner.set(new PreloadScanner(m_doc));
546 if (!m_preloadScanner->inProgress()) {
547 m_preloadScanner->begin();
548 m_preloadScanner->write(m_pendingSrc);
549 }
550 }
551 #endif
552 m_currentPrependingSrc = savedPrependingSrc;
553
554 return state;
555 }
556
scriptExecution(const ScriptSourceCode & sourceCode,State state)557 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
558 {
559 if (m_fragment || !m_doc->frame())
560 return state;
561 m_executingScript++;
562
563 SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
564 SegmentedString prependingSrc;
565 m_currentPrependingSrc = &prependingSrc;
566
567 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
568 if (!m_doc->ownerElement())
569 printf("beginning script execution at %d\n", m_doc->elapsedTime());
570 #endif
571
572 m_state = state;
573 m_doc->frame()->loader()->executeScript(sourceCode);
574 state = m_state;
575
576 state.setAllowYield(true);
577
578 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
579 if (!m_doc->ownerElement())
580 printf("ending script execution at %d\n", m_doc->elapsedTime());
581 #endif
582
583 m_executingScript--;
584
585 if (!m_executingScript && !state.loadingExtScript()) {
586 m_pendingSrc.prepend(prependingSrc);
587 m_src.append(m_pendingSrc);
588 m_pendingSrc.clear();
589 } else if (!prependingSrc.isEmpty()) {
590 // restore first so that the write appends in the right place
591 // (does not hurt to do it again below)
592 m_currentPrependingSrc = savedPrependingSrc;
593
594 // we need to do this slightly modified bit of one of the write() cases
595 // because we want to prepend to m_pendingSrc rather than appending
596 // if there's no previous prependingSrc
597 if (!m_pendingScripts.isEmpty()) {
598 if (m_currentPrependingSrc)
599 m_currentPrependingSrc->append(prependingSrc);
600 else
601 m_pendingSrc.prepend(prependingSrc);
602
603 #if PRELOAD_SCANNER_ENABLED
604 // We are stuck waiting for another script. Lets check the source that
605 // was just document.write()n for anything to load.
606 PreloadScanner documentWritePreloadScanner(m_doc);
607 documentWritePreloadScanner.begin();
608 documentWritePreloadScanner.write(prependingSrc);
609 documentWritePreloadScanner.end();
610 #endif
611 } else {
612 m_state = state;
613 write(prependingSrc, false);
614 state = m_state;
615 }
616 }
617
618 m_currentPrependingSrc = savedPrependingSrc;
619
620 return state;
621 }
622
parseComment(SegmentedString & src,State state)623 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
624 {
625 // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
626 checkScriptBuffer(src.length());
627 while (!src.isEmpty()) {
628 UChar ch = *src;
629 m_scriptCode[m_scriptCodeSize++] = ch;
630 if (ch == '>') {
631 bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());
632 int endCharsCount = 1; // start off with one for the '>' character
633 if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {
634 endCharsCount = 3;
635 } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&
636 m_scriptCode[m_scriptCodeSize-2] == '!') {
637 // Other browsers will accept --!> as a close comment, even though it's
638 // not technically valid.
639 endCharsCount = 4;
640 }
641 if (handleBrokenComments || endCharsCount > 1) {
642 src.advancePastNonNewline();
643 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
644 checkScriptBuffer();
645 m_scriptCode[m_scriptCodeSize] = 0;
646 m_scriptCode[m_scriptCodeSize + 1] = 0;
647 m_currentToken.tagName = commentAtom;
648 m_currentToken.beginTag = true;
649 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
650 processToken();
651 m_currentToken.tagName = commentAtom;
652 m_currentToken.beginTag = false;
653 processToken();
654 m_scriptCodeSize = 0;
655 }
656 state.setInComment(false);
657 return state; // Finished parsing comment
658 }
659 }
660 src.advance(m_lineNumber);
661 }
662
663 return state;
664 }
665
parseServer(SegmentedString & src,State state)666 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
667 {
668 checkScriptBuffer(src.length());
669 while (!src.isEmpty()) {
670 UChar ch = *src;
671 m_scriptCode[m_scriptCodeSize++] = ch;
672 if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {
673 src.advancePastNonNewline();
674 state.setInServer(false);
675 m_scriptCodeSize = 0;
676 return state; // Finished parsing server include
677 }
678 src.advance(m_lineNumber);
679 }
680 return state;
681 }
682
parseProcessingInstruction(SegmentedString & src,State state)683 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)
684 {
685 UChar oldchar = 0;
686 while (!src.isEmpty()) {
687 UChar chbegin = *src;
688 if (chbegin == '\'')
689 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
690 else if (chbegin == '\"')
691 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
692 // Look for '?>'
693 // Some crappy sites omit the "?" before it, so
694 // we look for an unquoted '>' instead. (IE compatible)
695 else if (chbegin == '>' && (!tquote || oldchar == '?')) {
696 // We got a '?>' sequence
697 state.setInProcessingInstruction(false);
698 src.advancePastNonNewline();
699 state.setDiscardLF(true);
700 return state; // Finished parsing comment!
701 }
702 src.advance(m_lineNumber);
703 oldchar = chbegin;
704 }
705
706 return state;
707 }
708
parseText(SegmentedString & src,State state)709 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
710 {
711 while (!src.isEmpty()) {
712 UChar cc = *src;
713
714 if (state.skipLF()) {
715 state.setSkipLF(false);
716 if (cc == '\n') {
717 src.advancePastNewline(m_lineNumber);
718 continue;
719 }
720 }
721
722 // do we need to enlarge the buffer?
723 checkBuffer();
724
725 if (cc == '\r') {
726 state.setSkipLF(true);
727 *m_dest++ = '\n';
728 } else
729 *m_dest++ = cc;
730 src.advance(m_lineNumber);
731 }
732
733 return state;
734 }
735
736
parseEntity(SegmentedString & src,UChar * & dest,State state,unsigned & cBufferPos,bool start,bool parsingTag)737 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
738 {
739 if (start) {
740 cBufferPos = 0;
741 state.setEntityState(SearchEntity);
742 EntityUnicodeValue = 0;
743 }
744
745 while (!src.isEmpty()) {
746 UChar cc = *src;
747 switch (state.entityState()) {
748 case NoEntity:
749 ASSERT(state.entityState() != NoEntity);
750 return state;
751
752 case SearchEntity:
753 if (cc == '#') {
754 m_cBuffer[cBufferPos++] = cc;
755 src.advancePastNonNewline();
756 state.setEntityState(NumericSearch);
757 } else
758 state.setEntityState(EntityName);
759 break;
760
761 case NumericSearch:
762 if (cc == 'x' || cc == 'X') {
763 m_cBuffer[cBufferPos++] = cc;
764 src.advancePastNonNewline();
765 state.setEntityState(Hexadecimal);
766 } else if (cc >= '0' && cc <= '9')
767 state.setEntityState(Decimal);
768 else
769 state.setEntityState(SearchSemicolon);
770 break;
771
772 case Hexadecimal: {
773 int ll = min(src.length(), 10 - cBufferPos);
774 while (ll--) {
775 cc = *src;
776 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
777 state.setEntityState(SearchSemicolon);
778 break;
779 }
780 int digit;
781 if (cc < 'A')
782 digit = cc - '0';
783 else
784 digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
785 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
786 m_cBuffer[cBufferPos++] = cc;
787 src.advancePastNonNewline();
788 }
789 if (cBufferPos == 10)
790 state.setEntityState(SearchSemicolon);
791 break;
792 }
793 case Decimal:
794 {
795 int ll = min(src.length(), 9-cBufferPos);
796 while (ll--) {
797 cc = *src;
798
799 if (!(cc >= '0' && cc <= '9')) {
800 state.setEntityState(SearchSemicolon);
801 break;
802 }
803
804 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
805 m_cBuffer[cBufferPos++] = cc;
806 src.advancePastNonNewline();
807 }
808 if (cBufferPos == 9)
809 state.setEntityState(SearchSemicolon);
810 break;
811 }
812 case EntityName:
813 {
814 int ll = min(src.length(), 9-cBufferPos);
815 while (ll--) {
816 cc = *src;
817
818 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
819 state.setEntityState(SearchSemicolon);
820 break;
821 }
822
823 m_cBuffer[cBufferPos++] = cc;
824 src.advancePastNonNewline();
825 }
826 if (cBufferPos == 9)
827 state.setEntityState(SearchSemicolon);
828 if (state.entityState() == SearchSemicolon) {
829 if (cBufferPos > 1) {
830 // Since the maximum length of entity name is 9,
831 // so a single char array which is allocated on
832 // the stack, its length is 10, should be OK.
833 // Also if we have an illegal character, we treat it
834 // as illegal entity name.
835 unsigned testedEntityNameLen = 0;
836 char tmpEntityNameBuffer[10];
837
838 ASSERT(cBufferPos < 10);
839 for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
840 if (m_cBuffer[testedEntityNameLen] > 0x7e)
841 break;
842 tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
843 }
844
845 const Entity *e;
846
847 if (testedEntityNameLen == cBufferPos)
848 e = findEntity(tmpEntityNameBuffer, cBufferPos);
849 else
850 e = 0;
851
852 if (e)
853 EntityUnicodeValue = e->code;
854
855 // be IE compatible
856 if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
857 EntityUnicodeValue = 0;
858 }
859 }
860 else
861 break;
862 }
863 case SearchSemicolon:
864 // Don't allow values that are more than 21 bits.
865 if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
866 if (!inViewSourceMode()) {
867 if (*src == ';')
868 src.advancePastNonNewline();
869 if (EntityUnicodeValue <= 0xFFFF) {
870 checkBuffer();
871 src.push(fixUpChar(EntityUnicodeValue));
872 } else {
873 // Convert to UTF-16, using surrogate code points.
874 checkBuffer(2);
875 src.push(U16_LEAD(EntityUnicodeValue));
876 src.push(U16_TRAIL(EntityUnicodeValue));
877 }
878 } else {
879 // FIXME: We should eventually colorize entities by sending them as a special token.
880 // 12 bytes required: up to 10 bytes in m_cBuffer plus the
881 // leading '&' and trailing ';'
882 checkBuffer(12);
883 *dest++ = '&';
884 for (unsigned i = 0; i < cBufferPos; i++)
885 dest[i] = m_cBuffer[i];
886 dest += cBufferPos;
887 if (*src == ';') {
888 *dest++ = ';';
889 src.advancePastNonNewline();
890 }
891 }
892 } else {
893 // 11 bytes required: up to 10 bytes in m_cBuffer plus the
894 // leading '&'
895 checkBuffer(11);
896 // ignore the sequence, add it to the buffer as plaintext
897 *dest++ = '&';
898 for (unsigned i = 0; i < cBufferPos; i++)
899 dest[i] = m_cBuffer[i];
900 dest += cBufferPos;
901 }
902
903 state.setEntityState(NoEntity);
904 return state;
905 }
906 }
907
908 return state;
909 }
910
parseDoctype(SegmentedString & src,State state)911 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
912 {
913 ASSERT(state.inDoctype());
914 while (!src.isEmpty() && state.inDoctype()) {
915 UChar c = *src;
916 bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
917 switch (m_doctypeToken.state()) {
918 case DoctypeBegin: {
919 m_doctypeToken.setState(DoctypeBeforeName);
920 if (isWhitespace) {
921 src.advance(m_lineNumber);
922 if (inViewSourceMode())
923 m_doctypeToken.m_source.append(c);
924 }
925 break;
926 }
927 case DoctypeBeforeName: {
928 if (c == '>') {
929 // Malformed. Just exit.
930 src.advancePastNonNewline();
931 state.setInDoctype(false);
932 if (inViewSourceMode())
933 processDoctypeToken();
934 } else if (isWhitespace) {
935 src.advance(m_lineNumber);
936 if (inViewSourceMode())
937 m_doctypeToken.m_source.append(c);
938 } else
939 m_doctypeToken.setState(DoctypeName);
940 break;
941 }
942 case DoctypeName: {
943 if (c == '>') {
944 // Valid doctype. Emit it.
945 src.advancePastNonNewline();
946 state.setInDoctype(false);
947 processDoctypeToken();
948 } else if (isWhitespace) {
949 m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
950 m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
951 m_doctypeToken.setState(DoctypeAfterName);
952 src.advance(m_lineNumber);
953 if (inViewSourceMode())
954 m_doctypeToken.m_source.append(c);
955 } else {
956 src.advancePastNonNewline();
957 m_doctypeToken.m_name.append(c);
958 if (inViewSourceMode())
959 m_doctypeToken.m_source.append(c);
960 }
961 break;
962 }
963 case DoctypeAfterName: {
964 if (c == '>') {
965 // Valid doctype. Emit it.
966 src.advancePastNonNewline();
967 state.setInDoctype(false);
968 processDoctypeToken();
969 } else if (!isWhitespace) {
970 src.advancePastNonNewline();
971 if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
972 m_doctypeSearchCount++;
973 if (m_doctypeSearchCount == 6)
974 // Found 'PUBLIC' sequence
975 m_doctypeToken.setState(DoctypeBeforePublicID);
976 } else if (m_doctypeSearchCount > 0) {
977 m_doctypeSearchCount = 0;
978 m_doctypeToken.setState(DoctypeBogus);
979 } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
980 m_doctypeSecondarySearchCount++;
981 if (m_doctypeSecondarySearchCount == 6)
982 // Found 'SYSTEM' sequence
983 m_doctypeToken.setState(DoctypeBeforeSystemID);
984 } else {
985 m_doctypeSecondarySearchCount = 0;
986 m_doctypeToken.setState(DoctypeBogus);
987 }
988 if (inViewSourceMode())
989 m_doctypeToken.m_source.append(c);
990 } else {
991 src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
992 if (inViewSourceMode())
993 m_doctypeToken.m_source.append(c);
994 }
995 break;
996 }
997 case DoctypeBeforePublicID: {
998 if (c == '\"' || c == '\'') {
999 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1000 m_doctypeToken.setState(DoctypePublicID);
1001 src.advancePastNonNewline();
1002 if (inViewSourceMode())
1003 m_doctypeToken.m_source.append(c);
1004 } else if (c == '>') {
1005 // Considered bogus. Don't process the doctype.
1006 src.advancePastNonNewline();
1007 state.setInDoctype(false);
1008 if (inViewSourceMode())
1009 processDoctypeToken();
1010 } else if (isWhitespace) {
1011 src.advance(m_lineNumber);
1012 if (inViewSourceMode())
1013 m_doctypeToken.m_source.append(c);
1014 } else
1015 m_doctypeToken.setState(DoctypeBogus);
1016 break;
1017 }
1018 case DoctypePublicID: {
1019 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1020 src.advancePastNonNewline();
1021 m_doctypeToken.setState(DoctypeAfterPublicID);
1022 if (inViewSourceMode())
1023 m_doctypeToken.m_source.append(c);
1024 } else if (c == '>') {
1025 // Considered bogus. Don't process the doctype.
1026 src.advancePastNonNewline();
1027 state.setInDoctype(false);
1028 if (inViewSourceMode())
1029 processDoctypeToken();
1030 } else {
1031 m_doctypeToken.m_publicID.append(c);
1032 src.advance(m_lineNumber);
1033 if (inViewSourceMode())
1034 m_doctypeToken.m_source.append(c);
1035 }
1036 break;
1037 }
1038 case DoctypeAfterPublicID:
1039 if (c == '\"' || c == '\'') {
1040 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1041 m_doctypeToken.setState(DoctypeSystemID);
1042 src.advancePastNonNewline();
1043 if (inViewSourceMode())
1044 m_doctypeToken.m_source.append(c);
1045 } else if (c == '>') {
1046 // Valid doctype. Emit it now.
1047 src.advancePastNonNewline();
1048 state.setInDoctype(false);
1049 processDoctypeToken();
1050 } else if (isWhitespace) {
1051 src.advance(m_lineNumber);
1052 if (inViewSourceMode())
1053 m_doctypeToken.m_source.append(c);
1054 } else
1055 m_doctypeToken.setState(DoctypeBogus);
1056 break;
1057 case DoctypeBeforeSystemID:
1058 if (c == '\"' || c == '\'') {
1059 tquote = c == '\"' ? DoubleQuote : SingleQuote;
1060 m_doctypeToken.setState(DoctypeSystemID);
1061 src.advancePastNonNewline();
1062 if (inViewSourceMode())
1063 m_doctypeToken.m_source.append(c);
1064 } else if (c == '>') {
1065 // Considered bogus. Don't process the doctype.
1066 src.advancePastNonNewline();
1067 state.setInDoctype(false);
1068 } else if (isWhitespace) {
1069 src.advance(m_lineNumber);
1070 if (inViewSourceMode())
1071 m_doctypeToken.m_source.append(c);
1072 } else
1073 m_doctypeToken.setState(DoctypeBogus);
1074 break;
1075 case DoctypeSystemID:
1076 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
1077 src.advancePastNonNewline();
1078 m_doctypeToken.setState(DoctypeAfterSystemID);
1079 if (inViewSourceMode())
1080 m_doctypeToken.m_source.append(c);
1081 } else if (c == '>') {
1082 // Considered bogus. Don't process the doctype.
1083 src.advancePastNonNewline();
1084 state.setInDoctype(false);
1085 if (inViewSourceMode())
1086 processDoctypeToken();
1087 } else {
1088 m_doctypeToken.m_systemID.append(c);
1089 src.advance(m_lineNumber);
1090 if (inViewSourceMode())
1091 m_doctypeToken.m_source.append(c);
1092 }
1093 break;
1094 case DoctypeAfterSystemID:
1095 if (c == '>') {
1096 // Valid doctype. Emit it now.
1097 src.advancePastNonNewline();
1098 state.setInDoctype(false);
1099 processDoctypeToken();
1100 } else if (isWhitespace) {
1101 src.advance(m_lineNumber);
1102 if (inViewSourceMode())
1103 m_doctypeToken.m_source.append(c);
1104 } else
1105 m_doctypeToken.setState(DoctypeBogus);
1106 break;
1107 case DoctypeBogus:
1108 if (c == '>') {
1109 // Done with the bogus doctype.
1110 src.advancePastNonNewline();
1111 state.setInDoctype(false);
1112 if (inViewSourceMode())
1113 processDoctypeToken();
1114 } else {
1115 src.advance(m_lineNumber); // Just keep scanning for '>'
1116 if (inViewSourceMode())
1117 m_doctypeToken.m_source.append(c);
1118 }
1119 break;
1120 default:
1121 break;
1122 }
1123 }
1124 return state;
1125 }
1126
// Tokenizes everything between '<' and '>' of a tag: the tag name, the attributes and the
// closing delimiter. It also detects the "<!--" and "<!DOCTYPE" prefixes and hands those off
// to parseComment() / parseDoctype().
//
// The function is a resumable state machine driven by state.tagState():
//   TagName -> SearchAttribute -> AttributeName -> SearchEqual -> SearchValue
//           -> QuotedValue | Value -> SearchAttribute ... -> SearchEnd -> NoTag
// When src runs dry it returns with the current tag state still set; write() calls it again
// (state.hasTagState()) once more input is available.
//
// Tag and attribute names are accumulated in m_cBuffer (the local cBufferPos is written back to
// m_cBufferPos on every return path); attribute values are accumulated in m_buffer via m_dest.
//
// src   - input stream, consumed as far as the tag could be parsed.
// state - tokenizer state on entry; must not be in the middle of an entity.
//
// Returns the updated tokenizer state.
HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
{
    ASSERT(!state.hasEntityState());

    unsigned cBufferPos = m_cBufferPos;

    bool lastIsSlash = false;

    while (!src.isEmpty()) {
        checkBuffer();
        switch (state.tagState()) {
        case NoTag:
        {
            m_cBufferPos = cBufferPos;
            return state;
        }
        case TagName:
        {
            // write() sets searchCount and m_doctypeSearchCount to 1 after seeing "<!", so both
            // the "<!--" and the "<!DOCTYPE" prefixes are matched here one character per pass.
            if (searchCount > 0) {
                if (*src == commentStart[searchCount]) {
                    searchCount++;
                    if (searchCount == 2)
                        m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
                    else
                        m_doctypeSearchCount = 0;
                    if (searchCount == 4) {
                        // Found '<!--' sequence
                        src.advancePastNonNewline();
                        m_dest = m_buffer; // ignore the previous part of this tag
                        state.setInComment(true);
                        state.setTagState(NoTag);

                        // Fix bug 34302 at kde.bugs.org. Go ahead and treat
                        // <!--> as a valid comment, since both mozilla and IE on windows
                        // can handle this case. Only do this in quirks mode. -dwh
                        if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
                            state.setInComment(false);
                            src.advancePastNonNewline();
                            if (!src.isEmpty())
                                m_cBuffer[cBufferPos++] = *src;
                        } else
                            state = parseComment(src, state);

                        m_cBufferPos = cBufferPos;
                        return state; // Finished parsing tag!
                    }
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    break;
                } else
                    searchCount = 0; // Stop looking for '<!--' sequence
            }

            if (m_doctypeSearchCount > 0) {
                if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
                    m_doctypeSearchCount++;
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    if (m_doctypeSearchCount == 9) {
                        // Found '<!DOCTYPE' sequence
                        state.setInDoctype(true);
                        state.setTagState(NoTag);
                        m_doctypeToken.reset();
                        if (inViewSourceMode())
                            m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
                        state = parseDoctype(src, state);
                        m_cBufferPos = cBufferPos;
                        return state;
                    }
                    break;
                } else
                    m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
            }

            // Copy tag-name characters into m_cBuffer until a delimiter is seen, the input runs
            // out, or the buffer holds CBUFLEN characters.
            bool finish = false;
            unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
                    finish = true;
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;
                src.advancePastNonNewline();
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if (finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                UChar* ptr = m_cBuffer;
                unsigned int len = cBufferPos;
                m_cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/')) {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                }
                else
                    // Start Tag
                    beginTag = true;

                // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".
                if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
                    ptr[--len] = '\0';

                // Now that we've shaved off any invalid / that might have followed the name, make the tag.
                // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
                if (ptr[0] != '!' || inViewSourceMode()) {
                    m_currentToken.tagName = AtomicString(ptr);
                    m_currentToken.beginTag = beginTag;
                }
                m_dest = m_buffer;
                state.setTagState(SearchAttribute);
                cBufferPos = 0;
            }
            break;
        }
        case SearchAttribute:
            // Skip whitespace and stray quotes until an attribute name or the end of the tag starts.
            while (!src.isEmpty()) {
                UChar curchar = *src;
                // In this mode just ignore any quotes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
                    if (curchar == '<' || curchar == '>')
                        state.setTagState(SearchEnd);
                    else
                        state.setTagState(AttributeName);

                    cBufferPos = 0;
                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case AttributeName:
        {
            m_rawAttributeBeforeValue.clear();
            int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the
                // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
                if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
                    m_cBuffer[cBufferPos] = '\0';
                    m_attrName = AtomicString(m_cBuffer);
                    // Start the value buffer with one 0 entry; the value states build the
                    // attribute value from m_buffer + 1 to skip it.
                    m_dest = m_buffer;
                    *m_dest++ = 0;
                    state.setTagState(SearchEqual);
                    if (inViewSourceMode())
                        m_currentToken.addViewSourceChar('a');
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;

                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            // The name filled the whole buffer: cut it off here and carry on as if a delimiter was seen.
            if (cBufferPos == CBUFLEN) {
                m_cBuffer[cBufferPos] = '\0';
                m_attrName = AtomicString(m_cBuffer);
                m_dest = m_buffer;
                *m_dest++ = 0;
                state.setTagState(SearchEqual);
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar('a');
            }
            break;
        }
        case SearchEqual:
            while (!src.isEmpty()) {
                UChar curchar = *src;

                if (lastIsSlash && curchar == '>') {
                    // This is a quirk (with a long sad history). We have to do this
                    // since widgets do <script src="foo.js"/> and expect the tag to close.
                    if (m_currentToken.tagName == scriptTag)
                        m_currentToken.selfClosingTag = true;
                    m_currentToken.brokenXMLStyle = true;
                }

                // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
                    if (curchar == '=') {
                        state.setTagState(SearchValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        m_rawAttributeBeforeValue.append(curchar);
                        src.advancePastNonNewline();
                    } else {
                        // No '=' follows the name: record the attribute with an empty value.
                        m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        lastIsSlash = false;
                    }
                    break;
                }

                lastIsSlash = curchar == '/';

                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case SearchValue:
            // Skip whitespace after '=' and decide between a quoted and an unquoted value.
            while (!src.isEmpty()) {
                UChar curchar = *src;
                if (!isASCIISpace(curchar)) {
                    if (curchar == '\'' || curchar == '\"') {
                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                        state.setTagState(QuotedValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        m_rawAttributeBeforeValue.append(curchar);
                        src.advancePastNonNewline();
                    } else
                        state.setTagState(Value);

                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case QuotedValue:
            while (!src.isEmpty()) {
                checkBuffer();

                UChar curchar = *src;
                // Every character that needs special treatment here ('&', '\'', '"', '>') is <= '>'.
                if (curchar <= '>' && !src.escaped()) {
                    if (curchar == '>' && m_attrName.isEmpty()) {
                        // Handle a case like <img '>. Just go ahead and be willing
                        // to close the whole tag. Don't consume the character and
                        // just go back into SearchEnd while ignoring the whole
                        // value.
                        // FIXME: Note that this is actually not a very good solution.
                        // It doesn't handle the general case of
                        // unmatched quotes among attributes that have names. -dwh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (!attributeValue.contains('/'))
                            m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('x');
                        state.setTagState(SearchAttribute);
                        m_dest = m_buffer;
                        tquote = NoQuote;
                        break;
                    }

                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }

                    if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
                        // some <input type=hidden> rely on trailing spaces. argh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
                            m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
                            if (inViewSourceMode())
                                m_currentToken.addViewSourceChar('x');
                        } else if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');

                        // Let the XSS auditor veto the src of a <script> start tag; a rejected
                        // URL is replaced by the blank URL.
                        if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
                            String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
                            if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
                                attributeValue = blankURL().string();
                        }

                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        tquote = NoQuote;
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        src.advancePastNonNewline();
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case Value:
            while (!src.isEmpty()) {
                checkBuffer();
                UChar curchar = *src;
                if (curchar <= '>' && !src.escaped()) {
                    // parse Entities
                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }
                    // no quotes. Every space means end of value
                    // '/' does not delimit in IE!
                    if (isASCIISpace(curchar) || curchar == '>') {
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

                        // Same XSS auditor check as for quoted values.
                        if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
                            String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
                            if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
                                attributeValue = blankURL().string();
                        }

                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case SearchEnd:
        {
            // Skip ahead to the '>' that closes this tag (or a '<' that starts the next one),
            // noting any '/' as a self-closing marker.
            while (!src.isEmpty()) {
                UChar ch = *src;
                if (ch == '>' || ch == '<')
                    break;
                if (ch == '/')
                    m_currentToken.selfClosingTag = true;
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(ch);
                src.advance(m_lineNumber);
            }
            if (src.isEmpty())
                break;

            searchCount = 0; // Stop looking for '<!--' sequence
            state.setTagState(NoTag);
            tquote = NoQuote;

            // Consume the closing '>'; a '<' is left in place so it starts the next tag.
            if (*src != '<')
                src.advance(m_lineNumber);

            if (m_currentToken.tagName == nullAtom) { // stop if tag is unknown
                m_cBufferPos = cBufferPos;
                return state;
            }

            // Copy what is needed from the token now; it is not used after processToken() below.
            AtomicString tagName = m_currentToken.tagName;

            // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
            // compatibility.
            bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
            bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
            if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
                Attribute* a = 0;
                m_scriptTagSrcAttrValue = String();
                m_scriptTagCharsetAttrValue = String();
                if (m_currentToken.attrs && !m_fragment) {
                    if (m_doc->frame() && m_doc->frame()->script()->isEnabled()) {
                        if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
                            m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
                    }
                }
            }

            RefPtr<Node> n = processToken();
            m_cBufferPos = cBufferPos;
            if (n || inViewSourceMode()) {
                // Snapshot the position so the special-tag content can be retokenized as plain
                // HTML if it turns out to swallow the rest of the input (see below).
                State savedState = state;
                SegmentedString savedSrc = src;
                long savedLineno = m_lineNumber;
                if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
                    if (beginTag)
                        state.setDiscardLF(true); // Discard the first LF after we open a pre.
                } else if (tagName == scriptTag) {
                    ASSERT(!m_scriptNode);
                    m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
                    if (m_scriptNode)
                        m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
                    if (beginTag) {
                        m_searchStopper = scriptEnd;
                        m_searchStopperLength = 8;
                        state.setInScript(true);
                        state = parseNonHTMLText(src, state);
                    } else if (isSelfClosingScript) { // Handle <script src="foo"/>
                        state.setInScript(true);
                        state = scriptHandler(state);
                    }
                } else if (tagName == styleTag) {
                    if (beginTag) {
                        m_searchStopper = styleEnd;
                        m_searchStopperLength = 7;
                        state.setInStyle(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == textareaTag) {
                    if (beginTag) {
                        m_searchStopper = textareaEnd;
                        m_searchStopperLength = 10;
                        state.setInTextArea(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == titleTag) {
                    if (beginTag) {
                        m_searchStopper = titleEnd;
                        m_searchStopperLength = 7;
                        state.setInTitle(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == xmpTag) {
                    if (beginTag) {
                        m_searchStopper = xmpEnd;
                        m_searchStopperLength = 5;
                        state.setInXmp(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == iframeTag) {
                    if (beginTag) {
                        m_searchStopper = iframeEnd;
                        m_searchStopperLength = 8;
                        state.setInIFrame(true);
                        state = parseNonHTMLText(src, state);
                    }
                }
                if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
                    // We just ate the rest of the document as the #text node under the special tag!
                    // Reset the state then retokenize without special handling.
                    // Let the parser clean up the missing close tag.
                    // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
                    // at the end of the document unless m_noMoreData is also true. We need
                    // to detect this case elsewhere, and save the state somewhere other
                    // than a local variable.
                    state = savedState;
                    src = savedSrc;
                    m_lineNumber = savedLineno;
                    m_scriptCodeSize = 0;
                }
            }
            if (tagName == plaintextTag)
                state.setInPlainText(beginTag);
            return state; // Finished parsing tag!
        }
        } // end switch
    }
    m_cBufferPos = cBufferPos;
    return state;
}
1596
continueProcessing(int & processedCount,double startTime,State & state)1597 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
1598 {
1599 // We don't want to be checking elapsed time with every character, so we only check after we've
1600 // processed a certain number of characters.
1601 bool allowedYield = state.allowYield();
1602 state.setAllowYield(false);
1603 if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
1604 processedCount = 0;
1605 if (currentTime() - startTime > m_tokenizerTimeDelay) {
1606 /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
1607 load, but this hurts overall performance on slower machines. For now turn this
1608 off.
1609 || (!m_doc->haveStylesheetsLoaded() &&
1610 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
1611 // Schedule the timer to keep processing as soon as possible.
1612 m_timer.startOneShot(0);
1613 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1614 if (currentTime() - startTime > m_tokenizerTimeDelay)
1615 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
1616 #endif
1617 return false;
1618 }
1619 }
1620
1621 processedCount++;
1622 return true;
1623 }
1624
write(const SegmentedString & str,bool appendData)1625 void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
1626 {
1627 if (!m_buffer)
1628 return;
1629
1630 if (m_parserStopped)
1631 return;
1632
1633 SegmentedString source(str);
1634 if (m_executingScript)
1635 source.setExcludeLineNumbers();
1636
1637 if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
1638 // don't parse; we will do this later
1639 if (m_currentPrependingSrc)
1640 m_currentPrependingSrc->append(source);
1641 else {
1642 m_pendingSrc.append(source);
1643 #if PRELOAD_SCANNER_ENABLED
1644 if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1645 m_preloadScanner->write(source);
1646 #endif
1647 }
1648 return;
1649 }
1650
1651
1652 #if PRELOAD_SCANNER_ENABLED
1653 if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
1654 m_preloadScanner->end();
1655 #endif
1656
1657 if (!m_src.isEmpty())
1658 m_src.append(source);
1659 else
1660 setSrc(source);
1661
1662 // Once a timer is set, it has control of when the tokenizer continues.
1663 if (m_timer.isActive())
1664 return;
1665
1666 bool wasInWrite = m_inWrite;
1667 m_inWrite = true;
1668
1669 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1670 if (!m_doc->ownerElement())
1671 printf("Beginning write at time %d\n", m_doc->elapsedTime());
1672 #endif
1673
1674 int processedCount = 0;
1675 double startTime = currentTime();
1676 #ifdef ANDROID_INSTRUMENT
1677 android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
1678 #endif
1679
1680 Frame* frame = m_doc->frame();
1681
1682 State state = m_state;
1683
1684 while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
1685 if (!continueProcessing(processedCount, startTime, state))
1686 break;
1687
1688 // do we need to enlarge the buffer?
1689 checkBuffer();
1690
1691 UChar cc = *m_src;
1692
1693 bool wasSkipLF = state.skipLF();
1694 if (wasSkipLF)
1695 state.setSkipLF(false);
1696
1697 if (wasSkipLF && (cc == '\n'))
1698 m_src.advance();
1699 else if (state.needsSpecialWriteHandling()) {
1700 // it's important to keep needsSpecialWriteHandling with the flags this block tests
1701 if (state.hasEntityState())
1702 state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
1703 else if (state.inPlainText())
1704 state = parseText(m_src, state);
1705 else if (state.inAnyNonHTMLText())
1706 state = parseNonHTMLText(m_src, state);
1707 else if (state.inComment())
1708 state = parseComment(m_src, state);
1709 else if (state.inDoctype())
1710 state = parseDoctype(m_src, state);
1711 else if (state.inServer())
1712 state = parseServer(m_src, state);
1713 else if (state.inProcessingInstruction())
1714 state = parseProcessingInstruction(m_src, state);
1715 else if (state.hasTagState())
1716 state = parseTag(m_src, state);
1717 else if (state.startTag()) {
1718 state.setStartTag(false);
1719
1720 switch (cc) {
1721 case '/':
1722 break;
1723 case '!': {
1724 // <!-- comment --> or <!DOCTYPE ...>
1725 searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
1726 m_doctypeSearchCount = 1;
1727 break;
1728 }
1729 case '?': {
1730 // xml processing instruction
1731 state.setInProcessingInstruction(true);
1732 tquote = NoQuote;
1733 state = parseProcessingInstruction(m_src, state);
1734 continue;
1735
1736 break;
1737 }
1738 case '%':
1739 if (!m_brokenServer) {
1740 // <% server stuff, handle as comment %>
1741 state.setInServer(true);
1742 tquote = NoQuote;
1743 state = parseServer(m_src, state);
1744 continue;
1745 }
1746 // else fall through
1747 default: {
1748 if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1749 // Start of a Start-Tag
1750 } else {
1751 // Invalid tag
1752 // Add as is
1753 *m_dest = '<';
1754 m_dest++;
1755 continue;
1756 }
1757 }
1758 }; // end case
1759
1760 processToken();
1761
1762 m_cBufferPos = 0;
1763 state.setTagState(TagName);
1764 state = parseTag(m_src, state);
1765 }
1766 } else if (cc == '&' && !m_src.escaped()) {
1767 m_src.advancePastNonNewline();
1768 state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
1769 } else if (cc == '<' && !m_src.escaped()) {
1770 m_currentTagStartLineNumber = m_lineNumber;
1771 m_src.advancePastNonNewline();
1772 state.setStartTag(true);
1773 state.setDiscardLF(false);
1774 } else if (cc == '\n' || cc == '\r') {
1775 if (state.discardLF())
1776 // Ignore this LF
1777 state.setDiscardLF(false); // We have discarded 1 LF
1778 else {
1779 // Process this LF
1780 *m_dest++ = '\n';
1781 if (cc == '\r' && !m_src.excludeLineNumbers())
1782 m_lineNumber++;
1783 }
1784
1785 /* Check for MS-DOS CRLF sequence */
1786 if (cc == '\r')
1787 state.setSkipLF(true);
1788 m_src.advance(m_lineNumber);
1789 } else {
1790 state.setDiscardLF(false);
1791 *m_dest++ = cc;
1792 m_src.advancePastNonNewline();
1793 }
1794 }
1795
1796 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1797 if (!m_doc->ownerElement())
1798 printf("Ending write at time %d\n", m_doc->elapsedTime());
1799 #endif
1800
1801 m_inWrite = wasInWrite;
1802
1803 m_state = state;
1804
1805 #ifdef ANDROID_INSTRUMENT
1806 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
1807 #endif
1808
1809 if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1810 end(); // this actually causes us to be deleted
1811 }
1812
stopParsing()1813 void HTMLTokenizer::stopParsing()
1814 {
1815 Tokenizer::stopParsing();
1816 m_timer.stop();
1817
1818 // The part needs to know that the tokenizer has finished with its data,
1819 // regardless of whether it happened naturally or due to manual intervention.
1820 if (!m_fragment && m_doc->frame())
1821 m_doc->frame()->loader()->tokenizerProcessedData();
1822 }
1823
processingData() const1824 bool HTMLTokenizer::processingData() const
1825 {
1826 return m_timer.isActive() || m_inWrite;
1827 }
1828
timerFired(Timer<HTMLTokenizer> *)1829 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
1830 {
1831 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
1832 if (!m_doc->ownerElement())
1833 printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
1834 #endif
1835
1836 #ifdef ANDROID_MOBILE
1837 if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay() && !m_doc->extraLayoutDelay()) {
1838 #else
1839 if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
1840 #endif
1841 // Restart the timer and let layout win. This is basically a way of ensuring that the layout
1842 // timer has higher priority than our timer.
1843 m_timer.startOneShot(0);
1844 return;
1845 }
1846
1847 // Invoke write() as though more data came in. This might cause us to get deleted.
1848 write(SegmentedString(), true);
1849 }
1850
1851 void HTMLTokenizer::end()
1852 {
1853 ASSERT(!m_timer.isActive());
1854 m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
1855
1856 if (m_buffer) {
1857 // parseTag is using the buffer for different matters
1858 if (!m_state.hasTagState())
1859 processToken();
1860
1861 fastFree(m_scriptCode);
1862 m_scriptCode = 0;
1863 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1864
1865 fastFree(m_buffer);
1866 m_buffer = 0;
1867 }
1868
1869 if (!inViewSourceMode())
1870 m_parser->finished();
1871 else
1872 m_doc->finishedParsing();
1873 }
1874
1875 void HTMLTokenizer::finish()
1876 {
1877 // do this as long as we don't find matching comment ends
1878 while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
1879 // we've found an unmatched comment start
1880 if (m_state.inComment())
1881 m_brokenComments = true;
1882 else
1883 m_brokenServer = true;
1884 checkScriptBuffer();
1885 m_scriptCode[m_scriptCodeSize] = 0;
1886 m_scriptCode[m_scriptCodeSize + 1] = 0;
1887 int pos;
1888 String food;
1889 if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
1890 food = String(m_scriptCode, m_scriptCodeSize);
1891 else if (m_state.inServer()) {
1892 food = "<";
1893 food.append(m_scriptCode, m_scriptCodeSize);
1894 } else {
1895 pos = find(m_scriptCode, m_scriptCodeSize, '>');
1896 food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
1897 }
1898 fastFree(m_scriptCode);
1899 m_scriptCode = 0;
1900 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
1901 m_state.setInComment(false);
1902 m_state.setInServer(false);
1903 if (!food.isEmpty())
1904 write(food, true);
1905 }
1906 // this indicates we will not receive any more data... but if we are waiting on
1907 // an external script to load, we can't finish parsing until that is done
1908 m_noMoreData = true;
1909 if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
1910 end(); // this actually causes us to be deleted
1911 }
1912
1913 PassRefPtr<Node> HTMLTokenizer::processToken()
1914 {
1915 ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
1916 if (scriptController && scriptController->isEnabled())
1917 // FIXME: Why isn't this m_currentScriptTagStartLineNumber? I suspect this is wrong.
1918 scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
1919 if (m_dest > m_buffer) {
1920 m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
1921 if (m_currentToken.tagName != commentAtom)
1922 m_currentToken.tagName = textAtom;
1923 } else if (m_currentToken.tagName == nullAtom) {
1924 m_currentToken.reset();
1925 if (scriptController)
1926 scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
1927 return 0;
1928 }
1929
1930 m_dest = m_buffer;
1931
1932 RefPtr<Node> n;
1933
1934 if (!m_parserStopped) {
1935 if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
1936 map->shrinkToLength();
1937 if (inViewSourceMode())
1938 static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
1939 else
1940 // pass the token over to the parser, the parser DOES NOT delete the token
1941 n = m_parser->parseToken(&m_currentToken);
1942 }
1943 m_currentToken.reset();
1944 if (scriptController)
1945 scriptController->setEventHandlerLineNumber(0);
1946
1947 return n.release();
1948 }
1949
1950 void HTMLTokenizer::processDoctypeToken()
1951 {
1952 if (inViewSourceMode())
1953 static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
1954 else
1955 m_parser->parseDoctypeToken(&m_doctypeToken);
1956 }
1957
1958 HTMLTokenizer::~HTMLTokenizer()
1959 {
1960 ASSERT(!m_inWrite);
1961 reset();
1962 }
1963
1964
1965 void HTMLTokenizer::enlargeBuffer(int len)
1966 {
1967 // Resize policy: Always at least double the size of the buffer each time.
1968 int delta = max(len, m_bufferSize);
1969
1970 // Check for overflow.
1971 // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1972 static const int maxSize = INT_MAX / sizeof(UChar);
1973 if (delta > maxSize - m_bufferSize)
1974 CRASH();
1975
1976 int newSize = m_bufferSize + delta;
1977 int oldOffset = m_dest - m_buffer;
1978 m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
1979 m_dest = m_buffer + oldOffset;
1980 m_bufferSize = newSize;
1981 }
1982
1983 void HTMLTokenizer::enlargeScriptBuffer(int len)
1984 {
1985 // Resize policy: Always at least double the size of the buffer each time.
1986 int delta = max(len, m_scriptCodeCapacity);
1987
1988 // Check for overflow.
1989 // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
1990 static const int maxSize = INT_MAX / sizeof(UChar);
1991 if (delta > maxSize - m_scriptCodeCapacity)
1992 CRASH();
1993
1994 int newSize = m_scriptCodeCapacity + delta;
1995 m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
1996 m_scriptCodeCapacity = newSize;
1997 }
1998
1999 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2000 {
2001 ASSERT(m_doc->haveStylesheetsLoaded());
2002
2003 if (m_hasScriptsWaitingForStylesheets)
2004 notifyFinished(0);
2005 }
2006
// Runs the pending external scripts that have finished loading, in queue order.
//
// The CachedResource argument is unused; executeScriptsWaitingForStylesheets()
// calls this with 0. NOTE(review): presumably this is also the load-completion
// callback for the CachedScripts this tokenizer is a client of -- confirm
// against the class declaration.
//
// If the document's stylesheets are not loaded yet, the function only records
// that scripts are waiting (m_hasScriptsWaitingForStylesheets) and returns.
// Otherwise, for each loaded script at the head of m_pendingScripts it either
// dispatches an error event or executes the script and dispatches a load event,
// and then -- unless called synchronously from scriptHandler() -- feeds the
// pending source back through write().
void HTMLTokenizer::notifyFinished(CachedResource*)
{
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("script loaded at %d\n", m_doc->elapsedTime());
#endif

    ASSERT(!m_pendingScripts.isEmpty());

    // Make external scripts wait for external stylesheets.
    // FIXME: This needs to be done for inline scripts too.
    m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
    if (m_hasScriptsWaitingForStylesheets)
        return;

    // 'finished' becomes true once the queue is drained or once script execution
    // has left us blocked on stylesheets again.
    bool finished = false;
    while (!finished && m_pendingScripts.first()->isLoaded()) {
        // Dequeue the script before doing anything else with it.
        CachedScript* cs = m_pendingScripts.first().get();
        m_pendingScripts.removeFirst();
        ASSERT(cache()->disabled() || cs->accessCount() > 0);

        // Replace the current source with an empty one while the script runs.
        setSrc(SegmentedString());

        // make sure we forget about the script before we execute the new one
        // infinite recursion might happen otherwise
        // (the source code and error status are captured before this tokenizer
        // is removed as a client of the cached script)
        ScriptSourceCode sourceCode(cs);
        bool errorOccurred = cs->errorOccurred();
        cs->removeClient(this);

        // Take over the reference to the script node; m_scriptNode is released here.
        RefPtr<Node> n = m_scriptNode.release();

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
        if (!m_doc->ownerElement())
            printf("external script beginning execution at %d\n", m_doc->elapsedTime());
#endif

        if (errorOccurred)
            n->dispatchEvent(eventNames().errorEvent, true, false);
        else {
            // Only execute when the script element says it should run as JavaScript;
            // the load event is dispatched in either case.
            if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
                m_state = scriptExecution(sourceCode, m_state);
#if ENABLE(XHTMLMP)
            else
                m_doc->setShouldProcessNoscriptElement(true);
#endif
            n->dispatchEvent(eventNames().loadEvent, false, false);
        }

        // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
        // call above, so test afterwards.
        finished = m_pendingScripts.isEmpty();
        if (finished) {
            ASSERT(!m_hasScriptsWaitingForStylesheets);
            m_state.setLoadingExtScript(false);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
            if (!m_doc->ownerElement())
                printf("external script finished execution at %d\n", m_doc->elapsedTime());
#endif
        } else if (m_hasScriptsWaitingForStylesheets) {
            // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
            // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
            finished = true;
        }

        // 'm_requestingScript' is true when we are called synchronously from
        // scriptHandler(). In that case scriptHandler() will take care
        // of m_pendingSrc.
        if (!m_requestingScript) {
            // Move the pending source into a local and clear the member before
            // writing, so write() starts with an empty m_pendingSrc.
            SegmentedString rest = m_pendingSrc;
            m_pendingSrc.clear();
            write(rest, false);
            // we might be deleted at this point, do not access any members.
        }
    }
}
2082
2083 bool HTMLTokenizer::isWaitingForScripts() const
2084 {
2085 return m_state.loadingExtScript();
2086 }
2087
2088 void HTMLTokenizer::setSrc(const SegmentedString& source)
2089 {
2090 m_src = source;
2091 }
2092
2093 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
2094 {
2095 HTMLTokenizer tok(fragment);
2096 tok.setForceSynchronous(true);
2097 tok.write(source, true);
2098 tok.finish();
2099 ASSERT(!tok.processingData()); // make sure we're done (see 3963151)
2100 }
2101
2102 UChar decodeNamedEntity(const char* name)
2103 {
2104 const Entity* e = findEntity(name, strlen(name));
2105 return e ? e->code : 0;
2106 }
2107
2108 }
2109