1 /* 2 Copyright (C) 1997 Martin Jones (mjones@kde.org) 3 (C) 1997 Torben Weis (weis@kde.org) 4 (C) 1998 Waldo Bastian (bastian@kde.org) 5 (C) 2001 Dirk Mueller (mueller@kde.org) 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 7 8 This library is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Library General Public 10 License as published by the Free Software Foundation; either 11 version 2 of the License, or (at your option) any later version. 12 13 This library is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Library General Public License for more details. 17 18 You should have received a copy of the GNU Library General Public License 19 along with this library; see the file COPYING.LIB. If not, write to 20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 Boston, MA 02110-1301, USA. 22 */ 23 24 #ifndef HTMLTokenizer_h 25 #define HTMLTokenizer_h 26 27 #include "CachedResourceClient.h" 28 #include "CachedResourceHandle.h" 29 #include "NamedMappedAttrMap.h" 30 #include "MappedAttributeEntry.h" 31 #include "SegmentedString.h" 32 #include "Timer.h" 33 #include "Tokenizer.h" 34 #include <wtf/Deque.h> 35 #include <wtf/OwnPtr.h> 36 #include <wtf/Vector.h> 37 38 namespace WebCore { 39 40 class CachedScript; 41 class DocumentFragment; 42 class Document; 43 class HTMLDocument; 44 class HTMLScriptElement; 45 class HTMLViewSourceDocument; 46 class FrameView; 47 class HTMLParser; 48 class Node; 49 class PreloadScanner; 50 class ScriptSourceCode; 51 52 /** 53 * @internal 54 * represents one HTML tag. Consists of a numerical id, and the list 55 * of attributes. Can also represent text. In this case the id = 0 and 56 * text contains the text. 57 */ 58 struct Token { TokenToken59 Token() 60 : beginTag(true) 61 , selfClosingTag(false) 62 , brokenXMLStyle(false) 63 , m_sourceInfo(0) 64 { } ~TokenToken65 ~Token() { } 66 67 void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); 68 isOpenTagToken69 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } isCloseTagToken70 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } 71 resetToken72 void reset() 73 { 74 attrs = 0; 75 text = 0; 76 tagName = nullAtom; 77 beginTag = true; 78 selfClosingTag = false; 79 brokenXMLStyle = false; 80 if (m_sourceInfo) 81 m_sourceInfo->clear(); 82 } 83 addViewSourceCharToken84 void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); } 85 86 RefPtr<NamedMappedAttrMap> attrs; 87 RefPtr<StringImpl> text; 88 AtomicString tagName; 89 bool beginTag; 90 bool selfClosingTag; 91 bool brokenXMLStyle; 92 OwnPtr<Vector<UChar> > m_sourceInfo; 93 }; 94 95 enum DoctypeState { 96 DoctypeBegin, 97 DoctypeBeforeName, 98 DoctypeName, 99 DoctypeAfterName, 100 DoctypeBeforePublicID, 101 DoctypePublicID, 102 DoctypeAfterPublicID, 103 DoctypeBeforeSystemID, 104 DoctypeSystemID, 105 DoctypeAfterSystemID, 106 DoctypeBogus 107 }; 108 109 class DoctypeToken { 110 public: DoctypeToken()111 DoctypeToken() {} 112 reset()113 void reset() 114 { 115 m_name.clear(); 116 m_publicID.clear(); 117 m_systemID.clear(); 118 m_state = DoctypeBegin; 119 m_source.clear(); 120 } 121 state()122 DoctypeState state() { return m_state; } setState(DoctypeState s)123 void setState(DoctypeState s) { m_state = s; } 124 125 Vector<UChar> m_name; 126 Vector<UChar> m_publicID; 127 Vector<UChar> m_systemID; 128 DoctypeState m_state; 129 130 Vector<UChar> m_source; 131 }; 132 133 //----------------------------------------------------------------------------- 134 135 class HTMLTokenizer : public Tokenizer, public CachedResourceClient { 136 public: 137 HTMLTokenizer(HTMLDocument*, bool reportErrors); 138 HTMLTokenizer(HTMLViewSourceDocument*); 139 HTMLTokenizer(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); 140 virtual ~HTMLTokenizer(); 141 142 virtual void write(const SegmentedString&, bool appendData); 143 virtual void finish(); 144 virtual void setForceSynchronous(bool force); 145 virtual bool isWaitingForScripts() const; 146 virtual void stopParsing(); 147 virtual bool processingData() const; executingScript()148 virtual int executingScript() const { return m_executingScript; } 149 lineNumber()150 virtual int lineNumber() const { return m_lineNumber; } columnNumber()151 virtual int columnNumber() const { return 1; } 152 processingContentWrittenByScript()153 bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } 154 155 virtual void executeScriptsWaitingForStylesheets(); 156 isHTMLTokenizer()157 virtual bool isHTMLTokenizer() const { return true; } htmlParser()158 HTMLParser* htmlParser() const { return m_parser.get(); } 159 160 private: 161 class State; 162 163 // Where we are in parsing a tag 164 void begin(); 165 void end(); 166 167 void reset(); 168 169 PassRefPtr<Node> processToken(); 170 void processDoctypeToken(); 171 172 State processListing(SegmentedString, State); 173 State parseComment(SegmentedString&, State); 174 State parseDoctype(SegmentedString&, State); 175 State parseServer(SegmentedString&, State); 176 State parseText(SegmentedString&, State); 177 State parseNonHTMLText(SegmentedString&, State); 178 State parseTag(SegmentedString&, State); 179 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); 180 State parseProcessingInstruction(SegmentedString&, State); 181 State scriptHandler(State); 182 State scriptExecution(const ScriptSourceCode&, State); 183 void setSrc(const SegmentedString&); 184 185 // check if we have enough space in the buffer. 186 // if not enlarge it 187 inline void checkBuffer(int len = 10) 188 { 189 if ((m_dest - m_buffer) > m_bufferSize - len) 190 enlargeBuffer(len); 191 } 192 193 inline void checkScriptBuffer(int len = 10) 194 { 195 if (m_scriptCodeSize + len >= m_scriptCodeCapacity) 196 enlargeScriptBuffer(len); 197 } 198 199 void enlargeBuffer(int len); 200 void enlargeScriptBuffer(int len); 201 202 bool continueProcessing(int& processedCount, double startTime, State&); 203 void timerFired(Timer<HTMLTokenizer>*); 204 void allDataProcessed(); 205 206 // from CachedResourceClient 207 void notifyFinished(CachedResource*); 208 209 void executeExternalScriptsIfReady(); 210 void executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*); 211 bool continueExecutingExternalScripts(double startTime); 212 213 // Internal buffers 214 /////////////////// 215 UChar* m_buffer; 216 int m_bufferSize; 217 UChar* m_dest; 218 219 Token m_currentToken; 220 221 // This buffer holds the raw characters we've seen between the beginning of 222 // the attribute name and the first character of the attribute value. 223 Vector<UChar, 32> m_rawAttributeBeforeValue; 224 225 // Tokenizer flags 226 ////////////////// 227 // are we in quotes within a html tag 228 enum { NoQuote, SingleQuote, DoubleQuote } tquote; 229 230 // Are we in a &... character entity description? 231 enum EntityState { 232 NoEntity = 0, 233 SearchEntity = 1, 234 NumericSearch = 2, 235 Hexadecimal = 3, 236 Decimal = 4, 237 EntityName = 5, 238 SearchSemicolon = 6 239 }; 240 unsigned EntityUnicodeValue; 241 242 enum TagState { 243 NoTag = 0, 244 TagName = 1, 245 SearchAttribute = 2, 246 AttributeName = 3, 247 SearchEqual = 4, 248 SearchValue = 5, 249 QuotedValue = 6, 250 Value = 7, 251 SearchEnd = 8 252 }; 253 254 class State { 255 public: State()256 State() : m_bits(0) { } 257 tagState()258 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); } setTagState(TagState t)259 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } entityState()260 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); } setEntityState(EntityState e)261 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } 262 inScript()263 bool inScript() const { return testBit(InScript); } setInScript(bool v)264 void setInScript(bool v) { setBit(InScript, v); } inStyle()265 bool inStyle() const { return testBit(InStyle); } setInStyle(bool v)266 void setInStyle(bool v) { setBit(InStyle, v); } inXmp()267 bool inXmp() const { return testBit(InXmp); } setInXmp(bool v)268 void setInXmp(bool v) { setBit(InXmp, v); } inTitle()269 bool inTitle() const { return testBit(InTitle); } setInTitle(bool v)270 void setInTitle(bool v) { setBit(InTitle, v); } inIFrame()271 bool inIFrame() const { return testBit(InIFrame); } setInIFrame(bool v)272 void setInIFrame(bool v) { setBit(InIFrame, v); } inPlainText()273 bool inPlainText() const { return testBit(InPlainText); } setInPlainText(bool v)274 void setInPlainText(bool v) { setBit(InPlainText, v); } inProcessingInstruction()275 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } setInProcessingInstruction(bool v)276 void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } inComment()277 bool inComment() const { return testBit(InComment); } setInComment(bool v)278 void setInComment(bool v) { setBit(InComment, v); } inDoctype()279 bool inDoctype() const { return testBit(InDoctype); } setInDoctype(bool v)280 void setInDoctype(bool v) { setBit(InDoctype, v); } inTextArea()281 bool inTextArea() const { return testBit(InTextArea); } setInTextArea(bool v)282 void setInTextArea(bool v) { setBit(InTextArea, v); } escaped()283 bool escaped() const { return testBit(Escaped); } setEscaped(bool v)284 void setEscaped(bool v) { setBit(Escaped, v); } inServer()285 bool inServer() const { return testBit(InServer); } setInServer(bool v)286 void setInServer(bool v) { setBit(InServer, v); } skipLF()287 bool skipLF() const { return testBit(SkipLF); } setSkipLF(bool v)288 void setSkipLF(bool v) { setBit(SkipLF, v); } startTag()289 bool startTag() const { return testBit(StartTag); } setStartTag(bool v)290 void setStartTag(bool v) { setBit(StartTag, v); } discardLF()291 bool discardLF() const { return testBit(DiscardLF); } setDiscardLF(bool v)292 void setDiscardLF(bool v) { setBit(DiscardLF, v); } allowYield()293 bool allowYield() const { return testBit(AllowYield); } setAllowYield(bool v)294 void setAllowYield(bool v) { setBit(AllowYield, v); } loadingExtScript()295 bool loadingExtScript() const { return testBit(LoadingExtScript); } setLoadingExtScript(bool v)296 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } forceSynchronous()297 bool forceSynchronous() const { return testBit(ForceSynchronous); } setForceSynchronous(bool v)298 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } 299 inAnyNonHTMLText()300 bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } hasTagState()301 bool hasTagState() const { return m_bits & TagMask; } hasEntityState()302 bool hasEntityState() const { return m_bits & EntityMask; } 303 needsSpecialWriteHandling()304 bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } 305 306 private: 307 static const int EntityShift = 4; 308 enum StateBits { 309 TagMask = (1 << 4) - 1, 310 EntityMask = (1 << 7) - (1 << 4), 311 InScript = 1 << 7, 312 InStyle = 1 << 8, 313 // Bit 9 unused 314 InXmp = 1 << 10, 315 InTitle = 1 << 11, 316 InPlainText = 1 << 12, 317 InProcessingInstruction = 1 << 13, 318 InComment = 1 << 14, 319 InTextArea = 1 << 15, 320 Escaped = 1 << 16, 321 InServer = 1 << 17, 322 SkipLF = 1 << 18, 323 StartTag = 1 << 19, 324 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard 325 AllowYield = 1 << 21, 326 LoadingExtScript = 1 << 22, 327 ForceSynchronous = 1 << 23, 328 InIFrame = 1 << 24, 329 InDoctype = 1 << 25 330 }; 331 setBit(StateBits bit,bool value)332 void setBit(StateBits bit, bool value) 333 { 334 if (value) 335 m_bits |= bit; 336 else 337 m_bits &= ~bit; 338 } testBit(StateBits bit)339 bool testBit(StateBits bit) const { return m_bits & bit; } 340 341 unsigned m_bits; 342 }; 343 344 State m_state; 345 346 DoctypeToken m_doctypeToken; 347 int m_doctypeSearchCount; 348 int m_doctypeSecondarySearchCount; 349 350 bool m_brokenServer; 351 352 // Name of an attribute that we just scanned. 353 AtomicString m_attrName; 354 355 // Used to store the code of a scripting sequence 356 UChar* m_scriptCode; 357 // Size of the script sequenze stored in @ref #scriptCode 358 int m_scriptCodeSize; 359 // Maximal size that can be stored in @ref #scriptCode 360 int m_scriptCodeCapacity; 361 // resync point of script code size 362 int m_scriptCodeResync; 363 364 // Stores characters if we are scanning for a string like "</script>" 365 UChar searchBuffer[10]; 366 367 // Counts where we are in the string we are scanning for 368 int searchCount; 369 // the stopper string 370 const char* m_searchStopper; 371 int m_searchStopperLength; 372 373 // if no more data is coming, just parse what we have (including ext scripts that 374 // may be still downloading) and finish 375 bool m_noMoreData; 376 // URL to get source code of script from 377 String m_scriptTagSrcAttrValue; 378 String m_scriptTagCharsetAttrValue; 379 // the HTML code we will parse after the external script we are waiting for has loaded 380 SegmentedString m_pendingSrc; 381 382 // the HTML code we will parse after this particular script has 383 // loaded, but before all pending HTML 384 SegmentedString* m_currentPrependingSrc; 385 386 // true if we are executing a script while parsing a document. This causes the parsing of 387 // the output of the script to be postponed until after the script has finished executing 388 int m_executingScript; 389 Deque<CachedResourceHandle<CachedScript> > m_pendingScripts; 390 RefPtr<HTMLScriptElement> m_scriptNode; 391 392 bool m_requestingScript; 393 bool m_hasScriptsWaitingForStylesheets; 394 395 // if we found one broken comment, there are most likely others as well 396 // store a flag to get rid of the O(n^2) behaviour in such a case. 397 bool m_brokenComments; 398 // current line number 399 int m_lineNumber; 400 int m_currentScriptTagStartLineNumber; 401 int m_currentTagStartLineNumber; 402 403 double m_tokenizerTimeDelay; 404 int m_tokenizerChunkSize; 405 406 // The timer for continued processing. 407 Timer<HTMLTokenizer> m_timer; 408 409 // The timer for continued executing external scripts. 410 Timer<HTMLTokenizer> m_externalScriptsTimer; 411 412 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. 413 // So any fixed number might be too small, but rather than rewriting all usage of this buffer 414 // we'll just make it large enough to handle all imaginable cases. 415 #define CBUFLEN 1024 416 UChar m_cBuffer[CBUFLEN + 2]; 417 unsigned int m_cBufferPos; 418 419 SegmentedString m_src; 420 Document* m_doc; 421 OwnPtr<HTMLParser> m_parser; 422 bool m_inWrite; 423 bool m_fragment; 424 FragmentScriptingPermission m_scriptingPermission; 425 426 OwnPtr<PreloadScanner> m_preloadScanner; 427 }; 428 429 void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed); 430 431 UChar decodeNamedEntity(const char*); 432 433 } // namespace WebCore 434 435 #endif // HTMLTokenizer_h 436