1 /* 2 Copyright (C) 1997 Martin Jones (mjones@kde.org) 3 (C) 1997 Torben Weis (weis@kde.org) 4 (C) 1998 Waldo Bastian (bastian@kde.org) 5 (C) 2001 Dirk Mueller (mueller@kde.org) 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 7 8 This library is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Library General Public 10 License as published by the Free Software Foundation; either 11 version 2 of the License, or (at your option) any later version. 12 13 This library is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Library General Public License for more details. 17 18 You should have received a copy of the GNU Library General Public License 19 along with this library; see the file COPYING.LIB. If not, write to 20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 Boston, MA 02110-1301, USA. 22 */ 23 24 #ifndef HTMLTokenizer_h 25 #define HTMLTokenizer_h 26 27 #include "CachedResourceClient.h" 28 #include "CachedResourceHandle.h" 29 #include "NamedMappedAttrMap.h" 30 #include "SegmentedString.h" 31 #include "Timer.h" 32 #include "Tokenizer.h" 33 #include <wtf/Deque.h> 34 #include <wtf/OwnPtr.h> 35 #include <wtf/Vector.h> 36 37 namespace WebCore { 38 39 class CachedScript; 40 class DocumentFragment; 41 class Document; 42 class HTMLDocument; 43 class HTMLScriptElement; 44 class HTMLViewSourceDocument; 45 class FrameView; 46 class HTMLParser; 47 class Node; 48 class PreloadScanner; 49 class ScriptSourceCode; 50 51 /** 52 * @internal 53 * represents one HTML tag. Consists of a numerical id, and the list 54 * of attributes. Can also represent text. In this case the id = 0 and 55 * text contains the text. 56 */ 57 struct Token { TokenToken58 Token() 59 : beginTag(true) 60 , selfClosingTag(false) 61 , brokenXMLStyle(false) 62 , m_sourceInfo(0) 63 { } ~TokenToken64 ~Token() { } 65 66 void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode); 67 isOpenTagToken68 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; } isCloseTagToken69 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; } 70 resetToken71 void reset() 72 { 73 attrs = 0; 74 text = 0; 75 tagName = nullAtom; 76 beginTag = true; 77 selfClosingTag = false; 78 brokenXMLStyle = false; 79 if (m_sourceInfo) 80 m_sourceInfo->clear(); 81 } 82 addViewSourceCharToken83 void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); } 84 85 RefPtr<NamedMappedAttrMap> attrs; 86 RefPtr<StringImpl> text; 87 AtomicString tagName; 88 bool beginTag; 89 bool selfClosingTag; 90 bool brokenXMLStyle; 91 OwnPtr<Vector<UChar> > m_sourceInfo; 92 }; 93 94 enum DoctypeState { 95 DoctypeBegin, 96 DoctypeBeforeName, 97 DoctypeName, 98 DoctypeAfterName, 99 DoctypeBeforePublicID, 100 DoctypePublicID, 101 DoctypeAfterPublicID, 102 DoctypeBeforeSystemID, 103 DoctypeSystemID, 104 DoctypeAfterSystemID, 105 DoctypeBogus 106 }; 107 108 class DoctypeToken { 109 public: DoctypeToken()110 DoctypeToken() {} 111 reset()112 void reset() 113 { 114 m_name.clear(); 115 m_publicID.clear(); 116 m_systemID.clear(); 117 m_state = DoctypeBegin; 118 m_source.clear(); 119 } 120 state()121 DoctypeState state() { return m_state; } setState(DoctypeState s)122 void setState(DoctypeState s) { m_state = s; } 123 124 Vector<UChar> m_name; 125 Vector<UChar> m_publicID; 126 Vector<UChar> m_systemID; 127 DoctypeState m_state; 128 129 Vector<UChar> m_source; 130 }; 131 132 //----------------------------------------------------------------------------- 133 134 class HTMLTokenizer : public Tokenizer, public CachedResourceClient { 135 public: 136 HTMLTokenizer(HTMLDocument*, bool reportErrors); 137 HTMLTokenizer(HTMLViewSourceDocument*); 138 HTMLTokenizer(DocumentFragment*); 139 virtual ~HTMLTokenizer(); 140 141 virtual void write(const SegmentedString&, bool appendData); 142 virtual void finish(); 143 virtual void setForceSynchronous(bool force); 144 virtual bool isWaitingForScripts() const; 145 virtual void stopParsing(); 146 virtual bool processingData() const; executingScript()147 virtual int executingScript() const { return m_executingScript; } 148 lineNumber()149 virtual int lineNumber() const { return m_lineNumber; } columnNumber()150 virtual int columnNumber() const { return 1; } 151 processingContentWrittenByScript()152 bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); } 153 154 virtual void executeScriptsWaitingForStylesheets(); 155 isHTMLTokenizer()156 virtual bool isHTMLTokenizer() const { return true; } htmlParser()157 HTMLParser* htmlParser() const { return m_parser.get(); } 158 159 private: 160 class State; 161 162 // Where we are in parsing a tag 163 void begin(); 164 void end(); 165 166 void reset(); 167 168 PassRefPtr<Node> processToken(); 169 void processDoctypeToken(); 170 171 State processListing(SegmentedString, State); 172 State parseComment(SegmentedString&, State); 173 State parseDoctype(SegmentedString&, State); 174 State parseServer(SegmentedString&, State); 175 State parseText(SegmentedString&, State); 176 State parseNonHTMLText(SegmentedString&, State); 177 State parseTag(SegmentedString&, State); 178 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag); 179 State parseProcessingInstruction(SegmentedString&, State); 180 State scriptHandler(State); 181 State scriptExecution(const ScriptSourceCode&, State); 182 void setSrc(const SegmentedString&); 183 184 // check if we have enough space in the buffer. 185 // if not enlarge it 186 inline void checkBuffer(int len = 10) 187 { 188 if ((m_dest - m_buffer) > m_bufferSize - len) 189 enlargeBuffer(len); 190 } 191 192 inline void checkScriptBuffer(int len = 10) 193 { 194 if (m_scriptCodeSize + len >= m_scriptCodeCapacity) 195 enlargeScriptBuffer(len); 196 } 197 198 void enlargeBuffer(int len); 199 void enlargeScriptBuffer(int len); 200 201 bool continueProcessing(int& processedCount, double startTime, State&); 202 void timerFired(Timer<HTMLTokenizer>*); 203 void allDataProcessed(); 204 205 // from CachedResourceClient 206 void notifyFinished(CachedResource*); 207 208 // Internal buffers 209 /////////////////// 210 UChar* m_buffer; 211 int m_bufferSize; 212 UChar* m_dest; 213 214 Token m_currentToken; 215 216 // This buffer holds the raw characters we've seen between the beginning of 217 // the attribute name and the first character of the attribute value. 218 Vector<UChar, 32> m_rawAttributeBeforeValue; 219 220 // Tokenizer flags 221 ////////////////// 222 // are we in quotes within a html tag 223 enum { NoQuote, SingleQuote, DoubleQuote } tquote; 224 225 // Are we in a &... character entity description? 226 enum EntityState { 227 NoEntity = 0, 228 SearchEntity = 1, 229 NumericSearch = 2, 230 Hexadecimal = 3, 231 Decimal = 4, 232 EntityName = 5, 233 SearchSemicolon = 6 234 }; 235 unsigned EntityUnicodeValue; 236 237 enum TagState { 238 NoTag = 0, 239 TagName = 1, 240 SearchAttribute = 2, 241 AttributeName = 3, 242 SearchEqual = 4, 243 SearchValue = 5, 244 QuotedValue = 6, 245 Value = 7, 246 SearchEnd = 8 247 }; 248 249 class State { 250 public: State()251 State() : m_bits(0) { } 252 tagState()253 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); } setTagState(TagState t)254 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; } entityState()255 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); } setEntityState(EntityState e)256 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); } 257 inScript()258 bool inScript() const { return testBit(InScript); } setInScript(bool v)259 void setInScript(bool v) { setBit(InScript, v); } inStyle()260 bool inStyle() const { return testBit(InStyle); } setInStyle(bool v)261 void setInStyle(bool v) { setBit(InStyle, v); } inXmp()262 bool inXmp() const { return testBit(InXmp); } setInXmp(bool v)263 void setInXmp(bool v) { setBit(InXmp, v); } inTitle()264 bool inTitle() const { return testBit(InTitle); } setInTitle(bool v)265 void setInTitle(bool v) { setBit(InTitle, v); } inIFrame()266 bool inIFrame() const { return testBit(InIFrame); } setInIFrame(bool v)267 void setInIFrame(bool v) { setBit(InIFrame, v); } inPlainText()268 bool inPlainText() const { return testBit(InPlainText); } setInPlainText(bool v)269 void setInPlainText(bool v) { setBit(InPlainText, v); } inProcessingInstruction()270 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); } setInProcessingInstruction(bool v)271 void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); } inComment()272 bool inComment() const { return testBit(InComment); } setInComment(bool v)273 void setInComment(bool v) { setBit(InComment, v); } inDoctype()274 bool inDoctype() const { return testBit(InDoctype); } setInDoctype(bool v)275 void setInDoctype(bool v) { setBit(InDoctype, v); } inTextArea()276 bool inTextArea() const { return testBit(InTextArea); } setInTextArea(bool v)277 void setInTextArea(bool v) { setBit(InTextArea, v); } escaped()278 bool escaped() const { return testBit(Escaped); } setEscaped(bool v)279 void setEscaped(bool v) { setBit(Escaped, v); } inServer()280 bool inServer() const { return testBit(InServer); } setInServer(bool v)281 void setInServer(bool v) { setBit(InServer, v); } skipLF()282 bool skipLF() const { return testBit(SkipLF); } setSkipLF(bool v)283 void setSkipLF(bool v) { setBit(SkipLF, v); } startTag()284 bool startTag() const { return testBit(StartTag); } setStartTag(bool v)285 void setStartTag(bool v) { setBit(StartTag, v); } discardLF()286 bool discardLF() const { return testBit(DiscardLF); } setDiscardLF(bool v)287 void setDiscardLF(bool v) { setBit(DiscardLF, v); } allowYield()288 bool allowYield() const { return testBit(AllowYield); } setAllowYield(bool v)289 void setAllowYield(bool v) { setBit(AllowYield, v); } loadingExtScript()290 bool loadingExtScript() const { return testBit(LoadingExtScript); } setLoadingExtScript(bool v)291 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); } forceSynchronous()292 bool forceSynchronous() const { return testBit(ForceSynchronous); } setForceSynchronous(bool v)293 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); } 294 inAnyNonHTMLText()295 bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); } hasTagState()296 bool hasTagState() const { return m_bits & TagMask; } hasEntityState()297 bool hasEntityState() const { return m_bits & EntityMask; } 298 needsSpecialWriteHandling()299 bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); } 300 301 private: 302 static const int EntityShift = 4; 303 enum StateBits { 304 TagMask = (1 << 4) - 1, 305 EntityMask = (1 << 7) - (1 << 4), 306 InScript = 1 << 7, 307 InStyle = 1 << 8, 308 // Bit 9 unused 309 InXmp = 1 << 10, 310 InTitle = 1 << 11, 311 InPlainText = 1 << 12, 312 InProcessingInstruction = 1 << 13, 313 InComment = 1 << 14, 314 InTextArea = 1 << 15, 315 Escaped = 1 << 16, 316 InServer = 1 << 17, 317 SkipLF = 1 << 18, 318 StartTag = 1 << 19, 319 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard 320 AllowYield = 1 << 21, 321 LoadingExtScript = 1 << 22, 322 ForceSynchronous = 1 << 23, 323 InIFrame = 1 << 24, 324 InDoctype = 1 << 25 325 }; 326 setBit(StateBits bit,bool value)327 void setBit(StateBits bit, bool value) 328 { 329 if (value) 330 m_bits |= bit; 331 else 332 m_bits &= ~bit; 333 } testBit(StateBits bit)334 bool testBit(StateBits bit) const { return m_bits & bit; } 335 336 unsigned m_bits; 337 }; 338 339 State m_state; 340 341 DoctypeToken m_doctypeToken; 342 int m_doctypeSearchCount; 343 int m_doctypeSecondarySearchCount; 344 345 bool m_brokenServer; 346 347 // Name of an attribute that we just scanned. 348 AtomicString m_attrName; 349 350 // Used to store the code of a scripting sequence 351 UChar* m_scriptCode; 352 // Size of the script sequenze stored in @ref #scriptCode 353 int m_scriptCodeSize; 354 // Maximal size that can be stored in @ref #scriptCode 355 int m_scriptCodeCapacity; 356 // resync point of script code size 357 int m_scriptCodeResync; 358 359 // Stores characters if we are scanning for a string like "</script>" 360 UChar searchBuffer[10]; 361 362 // Counts where we are in the string we are scanning for 363 int searchCount; 364 // the stopper string 365 const char* m_searchStopper; 366 int m_searchStopperLength; 367 368 // if no more data is coming, just parse what we have (including ext scripts that 369 // may be still downloading) and finish 370 bool m_noMoreData; 371 // URL to get source code of script from 372 String m_scriptTagSrcAttrValue; 373 String m_scriptTagCharsetAttrValue; 374 // the HTML code we will parse after the external script we are waiting for has loaded 375 SegmentedString m_pendingSrc; 376 377 // the HTML code we will parse after this particular script has 378 // loaded, but before all pending HTML 379 SegmentedString* m_currentPrependingSrc; 380 381 // true if we are executing a script while parsing a document. This causes the parsing of 382 // the output of the script to be postponed until after the script has finished executing 383 int m_executingScript; 384 Deque<CachedResourceHandle<CachedScript> > m_pendingScripts; 385 RefPtr<HTMLScriptElement> m_scriptNode; 386 387 bool m_requestingScript; 388 bool m_hasScriptsWaitingForStylesheets; 389 390 // if we found one broken comment, there are most likely others as well 391 // store a flag to get rid of the O(n^2) behaviour in such a case. 392 bool m_brokenComments; 393 // current line number 394 int m_lineNumber; 395 int m_currentScriptTagStartLineNumber; 396 int m_currentTagStartLineNumber; 397 398 double m_tokenizerTimeDelay; 399 int m_tokenizerChunkSize; 400 401 // The timer for continued processing. 402 Timer<HTMLTokenizer> m_timer; 403 404 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags. 405 // So any fixed number might be too small, but rather than rewriting all usage of this buffer 406 // we'll just make it large enough to handle all imaginable cases. 407 #define CBUFLEN 1024 408 UChar m_cBuffer[CBUFLEN + 2]; 409 unsigned int m_cBufferPos; 410 411 SegmentedString m_src; 412 Document* m_doc; 413 OwnPtr<HTMLParser> m_parser; 414 bool m_inWrite; 415 bool m_fragment; 416 417 OwnPtr<PreloadScanner> m_preloadScanner; 418 }; 419 420 void parseHTMLDocumentFragment(const String&, DocumentFragment*); 421 422 UChar decodeNamedEntity(const char*); 423 424 } // namespace WebCore 425 426 #endif // HTMLTokenizer_h 427