• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
23 
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
26 
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "MappedAttributeEntry.h"
31 #include "SegmentedString.h"
32 #include "Timer.h"
33 #include "Tokenizer.h"
34 #include <wtf/Deque.h>
35 #include <wtf/OwnPtr.h>
36 #include <wtf/Vector.h>
37 
38 namespace WebCore {
39 
40 class CachedScript;
41 class DocumentFragment;
42 class Document;
43 class HTMLDocument;
44 class HTMLScriptElement;
45 class HTMLViewSourceDocument;
46 class FrameView;
47 class HTMLParser;
48 class Node;
49 class PreloadScanner;
50 class ScriptSourceCode;
51 
52 /**
53  * @internal
54  * represents one HTML tag. Consists of a numerical id, and the list
55  * of attributes. Can also represent text. In this case the id = 0 and
56  * text contains the text.
57  */
58 struct Token {
TokenToken59     Token()
60         : beginTag(true)
61         , selfClosingTag(false)
62         , brokenXMLStyle(false)
63         , m_sourceInfo(0)
64     { }
~TokenToken65     ~Token() { }
66 
67     void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
68 
isOpenTagToken69     bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
isCloseTagToken70     bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
71 
resetToken72     void reset()
73     {
74         attrs = 0;
75         text = 0;
76         tagName = nullAtom;
77         beginTag = true;
78         selfClosingTag = false;
79         brokenXMLStyle = false;
80         if (m_sourceInfo)
81             m_sourceInfo->clear();
82     }
83 
addViewSourceCharToken84     void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
85 
86     RefPtr<NamedMappedAttrMap> attrs;
87     RefPtr<StringImpl> text;
88     AtomicString tagName;
89     bool beginTag;
90     bool selfClosingTag;
91     bool brokenXMLStyle;
92     OwnPtr<Vector<UChar> > m_sourceInfo;
93 };
94 
// Values stored in DoctypeToken::m_state; DoctypeToken::reset() returns the
// state to DoctypeBegin. The explicit numbers are the same ones the compiler
// would assign implicitly.
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName = 1,
    DoctypeName = 2,
    DoctypeAfterName = 3,
    DoctypeBeforePublicID = 4,
    DoctypePublicID = 5,
    DoctypeAfterPublicID = 6,
    DoctypeBeforeSystemID = 7,
    DoctypeSystemID = 8,
    DoctypeAfterSystemID = 9,
    DoctypeBogus = 10
};
108 
109 class DoctypeToken {
110 public:
DoctypeToken()111     DoctypeToken() {}
112 
reset()113     void reset()
114     {
115         m_name.clear();
116         m_publicID.clear();
117         m_systemID.clear();
118         m_state = DoctypeBegin;
119         m_source.clear();
120     }
121 
state()122     DoctypeState state() { return m_state; }
setState(DoctypeState s)123     void setState(DoctypeState s) { m_state = s; }
124 
125     Vector<UChar> m_name;
126     Vector<UChar> m_publicID;
127     Vector<UChar> m_systemID;
128     DoctypeState m_state;
129 
130     Vector<UChar> m_source;
131 };
132 
133 //-----------------------------------------------------------------------------
134 
135 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
136 public:
137     HTMLTokenizer(HTMLDocument*, bool reportErrors);
138     HTMLTokenizer(HTMLViewSourceDocument*);
139     HTMLTokenizer(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
140     virtual ~HTMLTokenizer();
141 
142     virtual void write(const SegmentedString&, bool appendData);
143     virtual void finish();
144     virtual void setForceSynchronous(bool force);
145     virtual bool isWaitingForScripts() const;
146     virtual void stopParsing();
147     virtual bool processingData() const;
executingScript()148     virtual int executingScript() const { return m_executingScript; }
149 
lineNumber()150     virtual int lineNumber() const { return m_lineNumber; }
columnNumber()151     virtual int columnNumber() const { return 1; }
152 
processingContentWrittenByScript()153     bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
154 
155     virtual void executeScriptsWaitingForStylesheets();
156 
isHTMLTokenizer()157     virtual bool isHTMLTokenizer() const { return true; }
htmlParser()158     HTMLParser* htmlParser() const { return m_parser.get(); }
159 
160 private:
161     class State;
162 
163     // Where we are in parsing a tag
164     void begin();
165     void end();
166 
167     void reset();
168 
169     PassRefPtr<Node> processToken();
170     void processDoctypeToken();
171 
172     State processListing(SegmentedString, State);
173     State parseComment(SegmentedString&, State);
174     State parseDoctype(SegmentedString&, State);
175     State parseServer(SegmentedString&, State);
176     State parseText(SegmentedString&, State);
177     State parseNonHTMLText(SegmentedString&, State);
178     State parseTag(SegmentedString&, State);
179     State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
180     State parseProcessingInstruction(SegmentedString&, State);
181     State scriptHandler(State);
182     State scriptExecution(const ScriptSourceCode&, State);
183     void setSrc(const SegmentedString&);
184 
185     // check if we have enough space in the buffer.
186     // if not enlarge it
187     inline void checkBuffer(int len = 10)
188     {
189         if ((m_dest - m_buffer) > m_bufferSize - len)
190             enlargeBuffer(len);
191     }
192 
193     inline void checkScriptBuffer(int len = 10)
194     {
195         if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
196             enlargeScriptBuffer(len);
197     }
198 
199     void enlargeBuffer(int len);
200     void enlargeScriptBuffer(int len);
201 
202     bool continueProcessing(int& processedCount, double startTime, State&);
203     void timerFired(Timer<HTMLTokenizer>*);
204     void allDataProcessed();
205 
206     // from CachedResourceClient
207     void notifyFinished(CachedResource*);
208 
209     void executeExternalScriptsIfReady();
210     void executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*);
211     bool continueExecutingExternalScripts(double startTime);
212 
213     // Internal buffers
214     ///////////////////
215     UChar* m_buffer;
216     int m_bufferSize;
217     UChar* m_dest;
218 
219     Token m_currentToken;
220 
221     // This buffer holds the raw characters we've seen between the beginning of
222     // the attribute name and the first character of the attribute value.
223     Vector<UChar, 32> m_rawAttributeBeforeValue;
224 
225     // Tokenizer flags
226     //////////////////
227     // are we in quotes within a html tag
228     enum { NoQuote, SingleQuote, DoubleQuote } tquote;
229 
230     // Are we in a &... character entity description?
231     enum EntityState {
232         NoEntity = 0,
233         SearchEntity = 1,
234         NumericSearch = 2,
235         Hexadecimal = 3,
236         Decimal = 4,
237         EntityName = 5,
238         SearchSemicolon = 6
239     };
240     unsigned EntityUnicodeValue;
241 
242     enum TagState {
243         NoTag = 0,
244         TagName = 1,
245         SearchAttribute = 2,
246         AttributeName = 3,
247         SearchEqual = 4,
248         SearchValue = 5,
249         QuotedValue = 6,
250         Value = 7,
251         SearchEnd = 8
252     };
253 
254     class State {
255     public:
State()256         State() : m_bits(0) { }
257 
tagState()258         TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
setTagState(TagState t)259         void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
entityState()260         EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
setEntityState(EntityState e)261         void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
262 
inScript()263         bool inScript() const { return testBit(InScript); }
setInScript(bool v)264         void setInScript(bool v) { setBit(InScript, v); }
inStyle()265         bool inStyle() const { return testBit(InStyle); }
setInStyle(bool v)266         void setInStyle(bool v) { setBit(InStyle, v); }
inXmp()267         bool inXmp() const { return testBit(InXmp); }
setInXmp(bool v)268         void setInXmp(bool v) { setBit(InXmp, v); }
inTitle()269         bool inTitle() const { return testBit(InTitle); }
setInTitle(bool v)270         void setInTitle(bool v) { setBit(InTitle, v); }
inIFrame()271         bool inIFrame() const { return testBit(InIFrame); }
setInIFrame(bool v)272         void setInIFrame(bool v) { setBit(InIFrame, v); }
inPlainText()273         bool inPlainText() const { return testBit(InPlainText); }
setInPlainText(bool v)274         void setInPlainText(bool v) { setBit(InPlainText, v); }
inProcessingInstruction()275         bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
setInProcessingInstruction(bool v)276         void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
inComment()277         bool inComment() const { return testBit(InComment); }
setInComment(bool v)278         void setInComment(bool v) { setBit(InComment, v); }
inDoctype()279         bool inDoctype() const { return testBit(InDoctype); }
setInDoctype(bool v)280         void setInDoctype(bool v) { setBit(InDoctype, v); }
inTextArea()281         bool inTextArea() const { return testBit(InTextArea); }
setInTextArea(bool v)282         void setInTextArea(bool v) { setBit(InTextArea, v); }
escaped()283         bool escaped() const { return testBit(Escaped); }
setEscaped(bool v)284         void setEscaped(bool v) { setBit(Escaped, v); }
inServer()285         bool inServer() const { return testBit(InServer); }
setInServer(bool v)286         void setInServer(bool v) { setBit(InServer, v); }
skipLF()287         bool skipLF() const { return testBit(SkipLF); }
setSkipLF(bool v)288         void setSkipLF(bool v) { setBit(SkipLF, v); }
startTag()289         bool startTag() const { return testBit(StartTag); }
setStartTag(bool v)290         void setStartTag(bool v) { setBit(StartTag, v); }
discardLF()291         bool discardLF() const { return testBit(DiscardLF); }
setDiscardLF(bool v)292         void setDiscardLF(bool v) { setBit(DiscardLF, v); }
allowYield()293         bool allowYield() const { return testBit(AllowYield); }
setAllowYield(bool v)294         void setAllowYield(bool v) { setBit(AllowYield, v); }
loadingExtScript()295         bool loadingExtScript() const { return testBit(LoadingExtScript); }
setLoadingExtScript(bool v)296         void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
forceSynchronous()297         bool forceSynchronous() const { return testBit(ForceSynchronous); }
setForceSynchronous(bool v)298         void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
299 
inAnyNonHTMLText()300         bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
hasTagState()301         bool hasTagState() const { return m_bits & TagMask; }
hasEntityState()302         bool hasEntityState() const { return m_bits & EntityMask; }
303 
needsSpecialWriteHandling()304         bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
305 
306     private:
307         static const int EntityShift = 4;
308         enum StateBits {
309             TagMask = (1 << 4) - 1,
310             EntityMask = (1 << 7) - (1 << 4),
311             InScript = 1 << 7,
312             InStyle = 1 << 8,
313             // Bit 9 unused
314             InXmp = 1 << 10,
315             InTitle = 1 << 11,
316             InPlainText = 1 << 12,
317             InProcessingInstruction = 1 << 13,
318             InComment = 1 << 14,
319             InTextArea = 1 << 15,
320             Escaped = 1 << 16,
321             InServer = 1 << 17,
322             SkipLF = 1 << 18,
323             StartTag = 1 << 19,
324             DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
325             AllowYield = 1 << 21,
326             LoadingExtScript = 1 << 22,
327             ForceSynchronous = 1 << 23,
328             InIFrame = 1 << 24,
329             InDoctype = 1 << 25
330         };
331 
setBit(StateBits bit,bool value)332         void setBit(StateBits bit, bool value)
333         {
334             if (value)
335                 m_bits |= bit;
336             else
337                 m_bits &= ~bit;
338         }
testBit(StateBits bit)339         bool testBit(StateBits bit) const { return m_bits & bit; }
340 
341         unsigned m_bits;
342     };
343 
344     State m_state;
345 
346     DoctypeToken m_doctypeToken;
347     int m_doctypeSearchCount;
348     int m_doctypeSecondarySearchCount;
349 
350     bool m_brokenServer;
351 
352     // Name of an attribute that we just scanned.
353     AtomicString m_attrName;
354 
355     // Used to store the code of a scripting sequence
356     UChar* m_scriptCode;
357     // Size of the script sequenze stored in @ref #scriptCode
358     int m_scriptCodeSize;
359     // Maximal size that can be stored in @ref #scriptCode
360     int m_scriptCodeCapacity;
361     // resync point of script code size
362     int m_scriptCodeResync;
363 
364     // Stores characters if we are scanning for a string like "</script>"
365     UChar searchBuffer[10];
366 
367     // Counts where we are in the string we are scanning for
368     int searchCount;
369     // the stopper string
370     const char* m_searchStopper;
371     int m_searchStopperLength;
372 
373     // if no more data is coming, just parse what we have (including ext scripts that
374     // may be still downloading) and finish
375     bool m_noMoreData;
376     // URL to get source code of script from
377     String m_scriptTagSrcAttrValue;
378     String m_scriptTagCharsetAttrValue;
379     // the HTML code we will parse after the external script we are waiting for has loaded
380     SegmentedString m_pendingSrc;
381 
382     // the HTML code we will parse after this particular script has
383     // loaded, but before all pending HTML
384     SegmentedString* m_currentPrependingSrc;
385 
386     // true if we are executing a script while parsing a document. This causes the parsing of
387     // the output of the script to be postponed until after the script has finished executing
388     int m_executingScript;
389     Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
390     RefPtr<HTMLScriptElement> m_scriptNode;
391 
392     bool m_requestingScript;
393     bool m_hasScriptsWaitingForStylesheets;
394 
395     // if we found one broken comment, there are most likely others as well
396     // store a flag to get rid of the O(n^2) behaviour in such a case.
397     bool m_brokenComments;
398     // current line number
399     int m_lineNumber;
400     int m_currentScriptTagStartLineNumber;
401     int m_currentTagStartLineNumber;
402 
403     double m_tokenizerTimeDelay;
404     int m_tokenizerChunkSize;
405 
406     // The timer for continued processing.
407     Timer<HTMLTokenizer> m_timer;
408 
409     // The timer for continued executing external scripts.
410     Timer<HTMLTokenizer> m_externalScriptsTimer;
411 
412 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
413 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
414 // we'll just make it large enough to handle all imaginable cases.
415 #define CBUFLEN 1024
416     UChar m_cBuffer[CBUFLEN + 2];
417     unsigned int m_cBufferPos;
418 
419     SegmentedString m_src;
420     Document* m_doc;
421     OwnPtr<HTMLParser> m_parser;
422     bool m_inWrite;
423     bool m_fragment;
424     FragmentScriptingPermission m_scriptingPermission;
425 
426     OwnPtr<PreloadScanner> m_preloadScanner;
427 };
428 
429 void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
430 
431 UChar decodeNamedEntity(const char*);
432 
433 } // namespace WebCore
434 
435 #endif // HTMLTokenizer_h
436