• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28 
29 #include "NamedNodeMap.h"
30 #include <wtf/PassOwnPtr.h>
31 #include <wtf/Vector.h>
32 
33 namespace WebCore {
34 
35 class HTMLToken {
36     WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37 public:
38     enum Type {
39         Uninitialized,
40         DOCTYPE,
41         StartTag,
42         EndTag,
43         Comment,
44         Character,
45         EndOfFile,
46     };
47 
48     class Range {
49     public:
50         int m_start;
51         int m_end;
52     };
53 
54     class Attribute {
55     public:
56         Range m_nameRange;
57         Range m_valueRange;
58         WTF::Vector<UChar, 32> m_name;
59         WTF::Vector<UChar, 32> m_value;
60     };
61 
62     typedef WTF::Vector<Attribute, 10> AttributeList;
63     typedef WTF::Vector<UChar, 1024> DataVector;
64 
HTMLToken()65     HTMLToken() { clear(); }
66 
clear()67     void clear()
68     {
69         m_type = Uninitialized;
70         m_range.m_start = 0;
71         m_range.m_end = 0;
72         m_baseOffset = 0;
73         m_data.clear();
74     }
75 
isUninitialized()76     bool isUninitialized() { return m_type == Uninitialized; }
77 
startIndex()78     int startIndex() const { return m_range.m_start; }
endIndex()79     int endIndex() const { return m_range.m_end; }
80 
setBaseOffset(int offset)81     void setBaseOffset(int offset)
82     {
83         m_baseOffset = offset;
84     }
85 
end(int endOffset)86     void end(int endOffset)
87     {
88         m_range.m_end = endOffset - m_baseOffset;
89     }
90 
makeEndOfFile()91     void makeEndOfFile()
92     {
93         ASSERT(m_type == Uninitialized);
94         m_type = EndOfFile;
95     }
96 
beginStartTag(UChar character)97     void beginStartTag(UChar character)
98     {
99         ASSERT(character);
100         ASSERT(m_type == Uninitialized);
101         m_type = StartTag;
102         m_selfClosing = false;
103         m_currentAttribute = 0;
104         m_attributes.clear();
105 
106         m_data.append(character);
107     }
108 
109     template<typename T>
beginEndTag(T characters)110     void beginEndTag(T characters)
111     {
112         ASSERT(m_type == Uninitialized);
113         m_type = EndTag;
114         m_selfClosing = false;
115         m_currentAttribute = 0;
116         m_attributes.clear();
117 
118         m_data.append(characters);
119     }
120 
121     // Starting a character token works slightly differently than starting
122     // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()123     void ensureIsCharacterToken()
124     {
125         ASSERT(m_type == Uninitialized || m_type == Character);
126         m_type = Character;
127     }
128 
beginComment()129     void beginComment()
130     {
131         ASSERT(m_type == Uninitialized);
132         m_type = Comment;
133     }
134 
beginDOCTYPE()135     void beginDOCTYPE()
136     {
137         ASSERT(m_type == Uninitialized);
138         m_type = DOCTYPE;
139         m_doctypeData = adoptPtr(new DoctypeData());
140     }
141 
beginDOCTYPE(UChar character)142     void beginDOCTYPE(UChar character)
143     {
144         ASSERT(character);
145         beginDOCTYPE();
146         m_data.append(character);
147     }
148 
appendToName(UChar character)149     void appendToName(UChar character)
150     {
151         ASSERT(character);
152         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
153         m_data.append(character);
154     }
155 
156     template<typename T>
appendToCharacter(T characters)157     void appendToCharacter(T characters)
158     {
159         ASSERT(m_type == Character);
160         m_data.append(characters);
161     }
162 
appendToComment(UChar character)163     void appendToComment(UChar character)
164     {
165         ASSERT(character);
166         ASSERT(m_type == Comment);
167         m_data.append(character);
168     }
169 
addNewAttribute()170     void addNewAttribute()
171     {
172         ASSERT(m_type == StartTag || m_type == EndTag);
173         m_attributes.grow(m_attributes.size() + 1);
174         m_currentAttribute = &m_attributes.last();
175 #ifndef NDEBUG
176         m_currentAttribute->m_nameRange.m_start = 0;
177         m_currentAttribute->m_nameRange.m_end = 0;
178         m_currentAttribute->m_valueRange.m_start = 0;
179         m_currentAttribute->m_valueRange.m_end = 0;
180 #endif
181     }
182 
beginAttributeName(int offset)183     void beginAttributeName(int offset)
184     {
185         m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
186     }
187 
endAttributeName(int offset)188     void endAttributeName(int offset)
189     {
190         int index = offset - m_baseOffset;
191         m_currentAttribute->m_nameRange.m_end = index;
192         m_currentAttribute->m_valueRange.m_start = index;
193         m_currentAttribute->m_valueRange.m_end = index;
194     }
195 
beginAttributeValue(int offset)196     void beginAttributeValue(int offset)
197     {
198         m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
199 #ifndef NDEBUG
200         m_currentAttribute->m_valueRange.m_end = 0;
201 #endif
202     }
203 
endAttributeValue(int offset)204     void endAttributeValue(int offset)
205     {
206         m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
207     }
208 
appendToAttributeName(UChar character)209     void appendToAttributeName(UChar character)
210     {
211         ASSERT(character);
212         ASSERT(m_type == StartTag || m_type == EndTag);
213         ASSERT(m_currentAttribute->m_nameRange.m_start);
214         m_currentAttribute->m_name.append(character);
215     }
216 
appendToAttributeValue(UChar character)217     void appendToAttributeValue(UChar character)
218     {
219         ASSERT(character);
220         ASSERT(m_type == StartTag || m_type == EndTag);
221         ASSERT(m_currentAttribute->m_valueRange.m_start);
222         m_currentAttribute->m_value.append(character);
223     }
224 
appendToAttributeValue(size_t i,const String & value)225     void appendToAttributeValue(size_t i, const String& value)
226     {
227         ASSERT(!value.isEmpty());
228         ASSERT(m_type == StartTag || m_type == EndTag);
229         m_attributes[i].m_value.append(value.characters(), value.length());
230     }
231 
type()232     Type type() const { return m_type; }
233 
selfClosing()234     bool selfClosing() const
235     {
236         ASSERT(m_type == StartTag || m_type == EndTag);
237         return m_selfClosing;
238     }
239 
setSelfClosing()240     void setSelfClosing()
241     {
242         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
243         m_selfClosing = true;
244     }
245 
attributes()246     const AttributeList& attributes() const
247     {
248         ASSERT(m_type == StartTag || m_type == EndTag);
249         return m_attributes;
250     }
251 
name()252     const DataVector& name() const
253     {
254         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
255         return m_data;
256     }
257 
eraseCharacters()258     void eraseCharacters()
259     {
260         ASSERT(m_type == Character);
261         m_data.clear();
262     }
263 
eraseValueOfAttribute(size_t i)264     void eraseValueOfAttribute(size_t i)
265     {
266         ASSERT(m_type == StartTag || m_type == EndTag);
267         m_attributes[i].m_value.clear();
268     }
269 
characters()270     const DataVector& characters() const
271     {
272         ASSERT(m_type == Character);
273         return m_data;
274     }
275 
comment()276     const DataVector& comment() const
277     {
278         ASSERT(m_type == Comment);
279         return m_data;
280     }
281 
282     // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()283     const WTF::Vector<UChar>& publicIdentifier() const
284     {
285         ASSERT(m_type == DOCTYPE);
286         return m_doctypeData->m_publicIdentifier;
287     }
288 
289     // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()290     const WTF::Vector<UChar>& systemIdentifier() const
291     {
292         ASSERT(m_type == DOCTYPE);
293         return m_doctypeData->m_systemIdentifier;
294     }
295 
setPublicIdentifierToEmptyString()296     void setPublicIdentifierToEmptyString()
297     {
298         ASSERT(m_type == DOCTYPE);
299         m_doctypeData->m_hasPublicIdentifier = true;
300         m_doctypeData->m_publicIdentifier.clear();
301     }
302 
setSystemIdentifierToEmptyString()303     void setSystemIdentifierToEmptyString()
304     {
305         ASSERT(m_type == DOCTYPE);
306         m_doctypeData->m_hasSystemIdentifier = true;
307         m_doctypeData->m_systemIdentifier.clear();
308     }
309 
forceQuirks()310     bool forceQuirks() const
311     {
312         ASSERT(m_type == DOCTYPE);
313         return m_doctypeData->m_forceQuirks;
314     }
315 
setForceQuirks()316     void setForceQuirks()
317     {
318         ASSERT(m_type == DOCTYPE);
319         m_doctypeData->m_forceQuirks = true;
320     }
321 
appendToPublicIdentifier(UChar character)322     void appendToPublicIdentifier(UChar character)
323     {
324         ASSERT(character);
325         ASSERT(m_type == DOCTYPE);
326         ASSERT(m_doctypeData->m_hasPublicIdentifier);
327         m_doctypeData->m_publicIdentifier.append(character);
328     }
329 
appendToSystemIdentifier(UChar character)330     void appendToSystemIdentifier(UChar character)
331     {
332         ASSERT(character);
333         ASSERT(m_type == DOCTYPE);
334         ASSERT(m_doctypeData->m_hasSystemIdentifier);
335         m_doctypeData->m_systemIdentifier.append(character);
336     }
337 
338 private:
339     // FIXME: I'm not sure what the final relationship between HTMLToken and
340     // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
341     // want to end up with a cleaner interface between the two classes.
342     friend class AtomicHTMLToken;
343 
344     class DoctypeData {
345         WTF_MAKE_NONCOPYABLE(DoctypeData);
346     public:
DoctypeData()347         DoctypeData()
348             : m_hasPublicIdentifier(false)
349             , m_hasSystemIdentifier(false)
350             , m_forceQuirks(false)
351         {
352         }
353 
354         bool m_hasPublicIdentifier;
355         bool m_hasSystemIdentifier;
356         bool m_forceQuirks;
357         WTF::Vector<UChar> m_publicIdentifier;
358         WTF::Vector<UChar> m_systemIdentifier;
359     };
360 
361     Type m_type;
362     Range m_range; // Always starts at zero.
363     int m_baseOffset;
364 
365     // "name" for DOCTYPE, StartTag, and EndTag
366     // "characters" for Character
367     // "data" for Comment
368     DataVector m_data;
369 
370     // For DOCTYPE
371     OwnPtr<DoctypeData> m_doctypeData;
372 
373     // For StartTag and EndTag
374     bool m_selfClosing;
375     AttributeList m_attributes;
376 
377     // A pointer into m_attributes used during lexing.
378     Attribute* m_currentAttribute;
379 };
380 
381 // FIXME: This class should eventually be named HTMLToken once we move the
382 // existing HTMLToken to be internal to the HTMLTokenizer.
383 class AtomicHTMLToken {
384     WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
385 public:
AtomicHTMLToken(HTMLToken & token)386     AtomicHTMLToken(HTMLToken& token)
387         : m_type(token.type())
388     {
389         switch (m_type) {
390         case HTMLToken::Uninitialized:
391             ASSERT_NOT_REACHED();
392             break;
393         case HTMLToken::DOCTYPE:
394             m_name = AtomicString(token.name().data(), token.name().size());
395             m_doctypeData = token.m_doctypeData.release();
396             break;
397         case HTMLToken::EndOfFile:
398             break;
399         case HTMLToken::StartTag:
400         case HTMLToken::EndTag: {
401             m_selfClosing = token.selfClosing();
402             m_name = AtomicString(token.name().data(), token.name().size());
403             initializeAttributes(token.attributes());
404             break;
405         }
406         case HTMLToken::Comment:
407             m_data = String(token.comment().data(), token.comment().size());
408             break;
409         case HTMLToken::Character:
410             m_externalCharacters = &token.characters();
411             break;
412         }
413     }
414 
415     AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
m_type(type)416         : m_type(type)
417         , m_name(name)
418         , m_attributes(attributes)
419     {
420         ASSERT(usesName());
421     }
422 
type()423     HTMLToken::Type type() const { return m_type; }
424 
name()425     const AtomicString& name() const
426     {
427         ASSERT(usesName());
428         return m_name;
429     }
430 
setName(const AtomicString & name)431     void setName(const AtomicString& name)
432     {
433         ASSERT(usesName());
434         m_name = name;
435     }
436 
selfClosing()437     bool selfClosing() const
438     {
439         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
440         return m_selfClosing;
441     }
442 
getAttributeItem(const QualifiedName & attributeName)443     Attribute* getAttributeItem(const QualifiedName& attributeName)
444     {
445         ASSERT(usesAttributes());
446         if (!m_attributes)
447             return 0;
448         return m_attributes->getAttributeItem(attributeName);
449     }
450 
attributes()451     NamedNodeMap* attributes() const
452     {
453         ASSERT(usesAttributes());
454         return m_attributes.get();
455     }
456 
takeAtributes()457     PassRefPtr<NamedNodeMap> takeAtributes()
458     {
459         ASSERT(usesAttributes());
460         return m_attributes.release();
461     }
462 
characters()463     const HTMLToken::DataVector& characters() const
464     {
465         ASSERT(m_type == HTMLToken::Character);
466         return *m_externalCharacters;
467     }
468 
comment()469     const String& comment() const
470     {
471         ASSERT(m_type == HTMLToken::Comment);
472         return m_data;
473     }
474 
475     // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()476     WTF::Vector<UChar>& publicIdentifier() const
477     {
478         ASSERT(m_type == HTMLToken::DOCTYPE);
479         return m_doctypeData->m_publicIdentifier;
480     }
481 
482     // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()483     WTF::Vector<UChar>& systemIdentifier() const
484     {
485         ASSERT(m_type == HTMLToken::DOCTYPE);
486         return m_doctypeData->m_systemIdentifier;
487     }
488 
forceQuirks()489     bool forceQuirks() const
490     {
491         ASSERT(m_type == HTMLToken::DOCTYPE);
492         return m_doctypeData->m_forceQuirks;
493     }
494 
495 private:
496     HTMLToken::Type m_type;
497 
498     void initializeAttributes(const HTMLToken::AttributeList& attributes);
499 
usesName()500     bool usesName() const
501     {
502         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
503     }
504 
usesAttributes()505     bool usesAttributes() const
506     {
507         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
508     }
509 
510     // "name" for DOCTYPE, StartTag, and EndTag
511     AtomicString m_name;
512 
513     // "data" for Comment
514     String m_data;
515 
516     // "characters" for Character
517     //
518     // We don't want to copy the the characters out of the HTMLToken, so we
519     // keep a pointer to its buffer instead.  This buffer is owned by the
520     // HTMLToken and causes a lifetime dependence between these objects.
521     //
522     // FIXME: Add a mechanism for "internalizing" the characters when the
523     //        HTMLToken is destructed.
524     const HTMLToken::DataVector* m_externalCharacters;
525 
526     // For DOCTYPE
527     OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
528 
529     // For StartTag and EndTag
530     bool m_selfClosing;
531 
532     RefPtr<NamedNodeMap> m_attributes;
533 };
534 
initializeAttributes(const HTMLToken::AttributeList & attributes)535 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes)
536 {
537     size_t size = attributes.size();
538     if (!size)
539         return;
540 
541     m_attributes = NamedNodeMap::create();
542     m_attributes->reserveInitialCapacity(size);
543     for (size_t i = 0; i < size; ++i) {
544         const HTMLToken::Attribute& attribute = attributes[i];
545         if (attribute.m_name.isEmpty())
546             continue;
547 
548         ASSERT(attribute.m_nameRange.m_start);
549         ASSERT(attribute.m_nameRange.m_end);
550         ASSERT(attribute.m_valueRange.m_start);
551         ASSERT(attribute.m_valueRange.m_end);
552 
553         String name(attribute.m_name.data(), attribute.m_name.size());
554         String value(attribute.m_value.data(), attribute.m_value.size());
555         m_attributes->insertAttribute(Attribute::createMapped(name, value), false);
556     }
557 }
558 
559 }
560 
561 #endif
562