• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     Copyright (C) 1997 Martin Jones (mjones@kde.org)
3               (C) 1997 Torben Weis (weis@kde.org)
4               (C) 1998 Waldo Bastian (bastian@kde.org)
5               (C) 2001 Dirk Mueller (mueller@kde.org)
6     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
7 
8     This library is free software; you can redistribute it and/or
9     modify it under the terms of the GNU Library General Public
10     License as published by the Free Software Foundation; either
11     version 2 of the License, or (at your option) any later version.
12 
13     This library is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16     Library General Public License for more details.
17 
18     You should have received a copy of the GNU Library General Public License
19     along with this library; see the file COPYING.LIB.  If not, write to
20     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21     Boston, MA 02110-1301, USA.
22 */
23 
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
26 
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "SegmentedString.h"
31 #include "Timer.h"
32 #include "Tokenizer.h"
33 #include <wtf/Deque.h>
34 #include <wtf/OwnPtr.h>
35 #include <wtf/Vector.h>
36 
37 namespace WebCore {
38 
39 class CachedScript;
40 class DocumentFragment;
41 class Document;
42 class HTMLDocument;
43 class HTMLScriptElement;
44 class HTMLViewSourceDocument;
45 class FrameView;
46 class HTMLParser;
47 class Node;
48 class PreloadScanner;
49 class ScriptSourceCode;
50 
51 /**
52  * @internal
53  * represents one HTML tag. Consists of a numerical id, and the list
54  * of attributes. Can also represent text. In this case the id = 0 and
55  * text contains the text.
56  */
57 struct Token {
TokenToken58     Token()
59         : beginTag(true)
60         , selfClosingTag(false)
61         , brokenXMLStyle(false)
62         , m_sourceInfo(0)
63     { }
~TokenToken64     ~Token() { }
65 
66     void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
67 
isOpenTagToken68     bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
isCloseTagToken69     bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
70 
resetToken71     void reset()
72     {
73         attrs = 0;
74         text = 0;
75         tagName = nullAtom;
76         beginTag = true;
77         selfClosingTag = false;
78         brokenXMLStyle = false;
79         if (m_sourceInfo)
80             m_sourceInfo->clear();
81     }
82 
addViewSourceCharToken83     void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
84 
85     RefPtr<NamedMappedAttrMap> attrs;
86     RefPtr<StringImpl> text;
87     AtomicString tagName;
88     bool beginTag;
89     bool selfClosingTag;
90     bool brokenXMLStyle;
91     OwnPtr<Vector<UChar> > m_sourceInfo;
92 };
93 
// States recorded in DoctypeToken::m_state. The values are consecutive,
// starting at 0 for DoctypeBegin.
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName = 1,
    DoctypeName = 2,
    DoctypeAfterName = 3,
    DoctypeBeforePublicID = 4,
    DoctypePublicID = 5,
    DoctypeAfterPublicID = 6,
    DoctypeBeforeSystemID = 7,
    DoctypeSystemID = 8,
    DoctypeAfterSystemID = 9,
    DoctypeBogus = 10
};
107 
108 class DoctypeToken {
109 public:
DoctypeToken()110     DoctypeToken() {}
111 
reset()112     void reset()
113     {
114         m_name.clear();
115         m_publicID.clear();
116         m_systemID.clear();
117         m_state = DoctypeBegin;
118         m_source.clear();
119     }
120 
state()121     DoctypeState state() { return m_state; }
setState(DoctypeState s)122     void setState(DoctypeState s) { m_state = s; }
123 
124     Vector<UChar> m_name;
125     Vector<UChar> m_publicID;
126     Vector<UChar> m_systemID;
127     DoctypeState m_state;
128 
129     Vector<UChar> m_source;
130 };
131 
132 //-----------------------------------------------------------------------------
133 
134 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
135 public:
136     HTMLTokenizer(HTMLDocument*, bool reportErrors);
137     HTMLTokenizer(HTMLViewSourceDocument*);
138     HTMLTokenizer(DocumentFragment*);
139     virtual ~HTMLTokenizer();
140 
141     virtual void write(const SegmentedString&, bool appendData);
142     virtual void finish();
143     virtual void setForceSynchronous(bool force);
144     virtual bool isWaitingForScripts() const;
145     virtual void stopParsing();
146     virtual bool processingData() const;
executingScript()147     virtual int executingScript() const { return m_executingScript; }
148 
lineNumber()149     virtual int lineNumber() const { return m_lineNumber; }
columnNumber()150     virtual int columnNumber() const { return 1; }
151 
processingContentWrittenByScript()152     bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
153 
154     virtual void executeScriptsWaitingForStylesheets();
155 
isHTMLTokenizer()156     virtual bool isHTMLTokenizer() const { return true; }
htmlParser()157     HTMLParser* htmlParser() const { return m_parser.get(); }
158 
159 private:
160     class State;
161 
162     // Where we are in parsing a tag
163     void begin();
164     void end();
165 
166     void reset();
167 
168     PassRefPtr<Node> processToken();
169     void processDoctypeToken();
170 
171     State processListing(SegmentedString, State);
172     State parseComment(SegmentedString&, State);
173     State parseDoctype(SegmentedString&, State);
174     State parseServer(SegmentedString&, State);
175     State parseText(SegmentedString&, State);
176     State parseNonHTMLText(SegmentedString&, State);
177     State parseTag(SegmentedString&, State);
178     State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
179     State parseProcessingInstruction(SegmentedString&, State);
180     State scriptHandler(State);
181     State scriptExecution(const ScriptSourceCode&, State);
182     void setSrc(const SegmentedString&);
183 
184     // check if we have enough space in the buffer.
185     // if not enlarge it
186     inline void checkBuffer(int len = 10)
187     {
188         if ((m_dest - m_buffer) > m_bufferSize - len)
189             enlargeBuffer(len);
190     }
191 
192     inline void checkScriptBuffer(int len = 10)
193     {
194         if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
195             enlargeScriptBuffer(len);
196     }
197 
198     void enlargeBuffer(int len);
199     void enlargeScriptBuffer(int len);
200 
201     bool continueProcessing(int& processedCount, double startTime, State&);
202     void timerFired(Timer<HTMLTokenizer>*);
203     void allDataProcessed();
204 
205     // from CachedResourceClient
206     void notifyFinished(CachedResource*);
207 
208     // Internal buffers
209     ///////////////////
210     UChar* m_buffer;
211     int m_bufferSize;
212     UChar* m_dest;
213 
214     Token m_currentToken;
215 
216     // This buffer holds the raw characters we've seen between the beginning of
217     // the attribute name and the first character of the attribute value.
218     Vector<UChar, 32> m_rawAttributeBeforeValue;
219 
220     // Tokenizer flags
221     //////////////////
222     // are we in quotes within a html tag
223     enum { NoQuote, SingleQuote, DoubleQuote } tquote;
224 
225     // Are we in a &... character entity description?
226     enum EntityState {
227         NoEntity = 0,
228         SearchEntity = 1,
229         NumericSearch = 2,
230         Hexadecimal = 3,
231         Decimal = 4,
232         EntityName = 5,
233         SearchSemicolon = 6
234     };
235     unsigned EntityUnicodeValue;
236 
237     enum TagState {
238         NoTag = 0,
239         TagName = 1,
240         SearchAttribute = 2,
241         AttributeName = 3,
242         SearchEqual = 4,
243         SearchValue = 5,
244         QuotedValue = 6,
245         Value = 7,
246         SearchEnd = 8
247     };
248 
249     class State {
250     public:
State()251         State() : m_bits(0) { }
252 
tagState()253         TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
setTagState(TagState t)254         void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
entityState()255         EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
setEntityState(EntityState e)256         void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
257 
inScript()258         bool inScript() const { return testBit(InScript); }
setInScript(bool v)259         void setInScript(bool v) { setBit(InScript, v); }
inStyle()260         bool inStyle() const { return testBit(InStyle); }
setInStyle(bool v)261         void setInStyle(bool v) { setBit(InStyle, v); }
inXmp()262         bool inXmp() const { return testBit(InXmp); }
setInXmp(bool v)263         void setInXmp(bool v) { setBit(InXmp, v); }
inTitle()264         bool inTitle() const { return testBit(InTitle); }
setInTitle(bool v)265         void setInTitle(bool v) { setBit(InTitle, v); }
inIFrame()266         bool inIFrame() const { return testBit(InIFrame); }
setInIFrame(bool v)267         void setInIFrame(bool v) { setBit(InIFrame, v); }
inPlainText()268         bool inPlainText() const { return testBit(InPlainText); }
setInPlainText(bool v)269         void setInPlainText(bool v) { setBit(InPlainText, v); }
inProcessingInstruction()270         bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
setInProcessingInstruction(bool v)271         void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
inComment()272         bool inComment() const { return testBit(InComment); }
setInComment(bool v)273         void setInComment(bool v) { setBit(InComment, v); }
inDoctype()274         bool inDoctype() const { return testBit(InDoctype); }
setInDoctype(bool v)275         void setInDoctype(bool v) { setBit(InDoctype, v); }
inTextArea()276         bool inTextArea() const { return testBit(InTextArea); }
setInTextArea(bool v)277         void setInTextArea(bool v) { setBit(InTextArea, v); }
escaped()278         bool escaped() const { return testBit(Escaped); }
setEscaped(bool v)279         void setEscaped(bool v) { setBit(Escaped, v); }
inServer()280         bool inServer() const { return testBit(InServer); }
setInServer(bool v)281         void setInServer(bool v) { setBit(InServer, v); }
skipLF()282         bool skipLF() const { return testBit(SkipLF); }
setSkipLF(bool v)283         void setSkipLF(bool v) { setBit(SkipLF, v); }
startTag()284         bool startTag() const { return testBit(StartTag); }
setStartTag(bool v)285         void setStartTag(bool v) { setBit(StartTag, v); }
discardLF()286         bool discardLF() const { return testBit(DiscardLF); }
setDiscardLF(bool v)287         void setDiscardLF(bool v) { setBit(DiscardLF, v); }
allowYield()288         bool allowYield() const { return testBit(AllowYield); }
setAllowYield(bool v)289         void setAllowYield(bool v) { setBit(AllowYield, v); }
loadingExtScript()290         bool loadingExtScript() const { return testBit(LoadingExtScript); }
setLoadingExtScript(bool v)291         void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
forceSynchronous()292         bool forceSynchronous() const { return testBit(ForceSynchronous); }
setForceSynchronous(bool v)293         void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
294 
inAnyNonHTMLText()295         bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
hasTagState()296         bool hasTagState() const { return m_bits & TagMask; }
hasEntityState()297         bool hasEntityState() const { return m_bits & EntityMask; }
298 
needsSpecialWriteHandling()299         bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
300 
301     private:
302         static const int EntityShift = 4;
303         enum StateBits {
304             TagMask = (1 << 4) - 1,
305             EntityMask = (1 << 7) - (1 << 4),
306             InScript = 1 << 7,
307             InStyle = 1 << 8,
308             // Bit 9 unused
309             InXmp = 1 << 10,
310             InTitle = 1 << 11,
311             InPlainText = 1 << 12,
312             InProcessingInstruction = 1 << 13,
313             InComment = 1 << 14,
314             InTextArea = 1 << 15,
315             Escaped = 1 << 16,
316             InServer = 1 << 17,
317             SkipLF = 1 << 18,
318             StartTag = 1 << 19,
319             DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
320             AllowYield = 1 << 21,
321             LoadingExtScript = 1 << 22,
322             ForceSynchronous = 1 << 23,
323             InIFrame = 1 << 24,
324             InDoctype = 1 << 25
325         };
326 
setBit(StateBits bit,bool value)327         void setBit(StateBits bit, bool value)
328         {
329             if (value)
330                 m_bits |= bit;
331             else
332                 m_bits &= ~bit;
333         }
testBit(StateBits bit)334         bool testBit(StateBits bit) const { return m_bits & bit; }
335 
336         unsigned m_bits;
337     };
338 
339     State m_state;
340 
341     DoctypeToken m_doctypeToken;
342     int m_doctypeSearchCount;
343     int m_doctypeSecondarySearchCount;
344 
345     bool m_brokenServer;
346 
347     // Name of an attribute that we just scanned.
348     AtomicString m_attrName;
349 
350     // Used to store the code of a scripting sequence
351     UChar* m_scriptCode;
352     // Size of the script sequenze stored in @ref #scriptCode
353     int m_scriptCodeSize;
354     // Maximal size that can be stored in @ref #scriptCode
355     int m_scriptCodeCapacity;
356     // resync point of script code size
357     int m_scriptCodeResync;
358 
359     // Stores characters if we are scanning for a string like "</script>"
360     UChar searchBuffer[10];
361 
362     // Counts where we are in the string we are scanning for
363     int searchCount;
364     // the stopper string
365     const char* m_searchStopper;
366     int m_searchStopperLength;
367 
368     // if no more data is coming, just parse what we have (including ext scripts that
369     // may be still downloading) and finish
370     bool m_noMoreData;
371     // URL to get source code of script from
372     String m_scriptTagSrcAttrValue;
373     String m_scriptTagCharsetAttrValue;
374     // the HTML code we will parse after the external script we are waiting for has loaded
375     SegmentedString m_pendingSrc;
376 
377     // the HTML code we will parse after this particular script has
378     // loaded, but before all pending HTML
379     SegmentedString* m_currentPrependingSrc;
380 
381     // true if we are executing a script while parsing a document. This causes the parsing of
382     // the output of the script to be postponed until after the script has finished executing
383     int m_executingScript;
384     Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
385     RefPtr<HTMLScriptElement> m_scriptNode;
386 
387     bool m_requestingScript;
388     bool m_hasScriptsWaitingForStylesheets;
389 
390     // if we found one broken comment, there are most likely others as well
391     // store a flag to get rid of the O(n^2) behaviour in such a case.
392     bool m_brokenComments;
393     // current line number
394     int m_lineNumber;
395     int m_currentScriptTagStartLineNumber;
396     int m_currentTagStartLineNumber;
397 
398     double m_tokenizerTimeDelay;
399     int m_tokenizerChunkSize;
400 
401     // The timer for continued processing.
402     Timer<HTMLTokenizer> m_timer;
403 
404 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
405 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
406 // we'll just make it large enough to handle all imaginable cases.
407 #define CBUFLEN 1024
408     UChar m_cBuffer[CBUFLEN + 2];
409     unsigned int m_cBufferPos;
410 
411     SegmentedString m_src;
412     Document* m_doc;
413     OwnPtr<HTMLParser> m_parser;
414     bool m_inWrite;
415     bool m_fragment;
416 
417     OwnPtr<PreloadScanner> m_preloadScanner;
418 };
419 
420 void parseHTMLDocumentFragment(const String&, DocumentFragment*);
421 
422 UChar decodeNamedEntity(const char*);
423 
424 } // namespace WebCore
425 
426 #endif // HTMLTokenizer_h
427