1 /*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28
29 #include "NamedNodeMap.h"
30 #include <wtf/PassOwnPtr.h>
31 #include <wtf/Vector.h>
32
33 namespace WebCore {
34
35 class HTMLToken {
36 WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37 public:
38 enum Type {
39 Uninitialized,
40 DOCTYPE,
41 StartTag,
42 EndTag,
43 Comment,
44 Character,
45 EndOfFile,
46 };
47
48 class Range {
49 public:
50 int m_start;
51 int m_end;
52 };
53
54 class Attribute {
55 public:
56 Range m_nameRange;
57 Range m_valueRange;
58 WTF::Vector<UChar, 32> m_name;
59 WTF::Vector<UChar, 32> m_value;
60 };
61
62 typedef WTF::Vector<Attribute, 10> AttributeList;
63 typedef WTF::Vector<UChar, 1024> DataVector;
64
HTMLToken()65 HTMLToken() { clear(); }
66
clear()67 void clear()
68 {
69 m_type = Uninitialized;
70 m_range.m_start = 0;
71 m_range.m_end = 0;
72 m_baseOffset = 0;
73 m_data.clear();
74 }
75
isUninitialized()76 bool isUninitialized() { return m_type == Uninitialized; }
77
startIndex()78 int startIndex() const { return m_range.m_start; }
endIndex()79 int endIndex() const { return m_range.m_end; }
80
setBaseOffset(int offset)81 void setBaseOffset(int offset)
82 {
83 m_baseOffset = offset;
84 }
85
end(int endOffset)86 void end(int endOffset)
87 {
88 m_range.m_end = endOffset - m_baseOffset;
89 }
90
makeEndOfFile()91 void makeEndOfFile()
92 {
93 ASSERT(m_type == Uninitialized);
94 m_type = EndOfFile;
95 }
96
beginStartTag(UChar character)97 void beginStartTag(UChar character)
98 {
99 ASSERT(character);
100 ASSERT(m_type == Uninitialized);
101 m_type = StartTag;
102 m_selfClosing = false;
103 m_currentAttribute = 0;
104 m_attributes.clear();
105
106 m_data.append(character);
107 }
108
109 template<typename T>
beginEndTag(T characters)110 void beginEndTag(T characters)
111 {
112 ASSERT(m_type == Uninitialized);
113 m_type = EndTag;
114 m_selfClosing = false;
115 m_currentAttribute = 0;
116 m_attributes.clear();
117
118 m_data.append(characters);
119 }
120
121 // Starting a character token works slightly differently than starting
122 // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()123 void ensureIsCharacterToken()
124 {
125 ASSERT(m_type == Uninitialized || m_type == Character);
126 m_type = Character;
127 }
128
beginComment()129 void beginComment()
130 {
131 ASSERT(m_type == Uninitialized);
132 m_type = Comment;
133 }
134
beginDOCTYPE()135 void beginDOCTYPE()
136 {
137 ASSERT(m_type == Uninitialized);
138 m_type = DOCTYPE;
139 m_doctypeData = adoptPtr(new DoctypeData());
140 }
141
beginDOCTYPE(UChar character)142 void beginDOCTYPE(UChar character)
143 {
144 ASSERT(character);
145 beginDOCTYPE();
146 m_data.append(character);
147 }
148
appendToName(UChar character)149 void appendToName(UChar character)
150 {
151 ASSERT(character);
152 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
153 m_data.append(character);
154 }
155
156 template<typename T>
appendToCharacter(T characters)157 void appendToCharacter(T characters)
158 {
159 ASSERT(m_type == Character);
160 m_data.append(characters);
161 }
162
appendToComment(UChar character)163 void appendToComment(UChar character)
164 {
165 ASSERT(character);
166 ASSERT(m_type == Comment);
167 m_data.append(character);
168 }
169
addNewAttribute()170 void addNewAttribute()
171 {
172 ASSERT(m_type == StartTag || m_type == EndTag);
173 m_attributes.grow(m_attributes.size() + 1);
174 m_currentAttribute = &m_attributes.last();
175 #ifndef NDEBUG
176 m_currentAttribute->m_nameRange.m_start = 0;
177 m_currentAttribute->m_nameRange.m_end = 0;
178 m_currentAttribute->m_valueRange.m_start = 0;
179 m_currentAttribute->m_valueRange.m_end = 0;
180 #endif
181 }
182
beginAttributeName(int offset)183 void beginAttributeName(int offset)
184 {
185 m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
186 }
187
endAttributeName(int offset)188 void endAttributeName(int offset)
189 {
190 int index = offset - m_baseOffset;
191 m_currentAttribute->m_nameRange.m_end = index;
192 m_currentAttribute->m_valueRange.m_start = index;
193 m_currentAttribute->m_valueRange.m_end = index;
194 }
195
beginAttributeValue(int offset)196 void beginAttributeValue(int offset)
197 {
198 m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
199 #ifndef NDEBUG
200 m_currentAttribute->m_valueRange.m_end = 0;
201 #endif
202 }
203
endAttributeValue(int offset)204 void endAttributeValue(int offset)
205 {
206 m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
207 }
208
appendToAttributeName(UChar character)209 void appendToAttributeName(UChar character)
210 {
211 ASSERT(character);
212 ASSERT(m_type == StartTag || m_type == EndTag);
213 ASSERT(m_currentAttribute->m_nameRange.m_start);
214 m_currentAttribute->m_name.append(character);
215 }
216
appendToAttributeValue(UChar character)217 void appendToAttributeValue(UChar character)
218 {
219 ASSERT(character);
220 ASSERT(m_type == StartTag || m_type == EndTag);
221 ASSERT(m_currentAttribute->m_valueRange.m_start);
222 m_currentAttribute->m_value.append(character);
223 }
224
appendToAttributeValue(size_t i,const String & value)225 void appendToAttributeValue(size_t i, const String& value)
226 {
227 ASSERT(!value.isEmpty());
228 ASSERT(m_type == StartTag || m_type == EndTag);
229 m_attributes[i].m_value.append(value.characters(), value.length());
230 }
231
type()232 Type type() const { return m_type; }
233
selfClosing()234 bool selfClosing() const
235 {
236 ASSERT(m_type == StartTag || m_type == EndTag);
237 return m_selfClosing;
238 }
239
setSelfClosing()240 void setSelfClosing()
241 {
242 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
243 m_selfClosing = true;
244 }
245
attributes()246 const AttributeList& attributes() const
247 {
248 ASSERT(m_type == StartTag || m_type == EndTag);
249 return m_attributes;
250 }
251
name()252 const DataVector& name() const
253 {
254 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
255 return m_data;
256 }
257
eraseCharacters()258 void eraseCharacters()
259 {
260 ASSERT(m_type == Character);
261 m_data.clear();
262 }
263
eraseValueOfAttribute(size_t i)264 void eraseValueOfAttribute(size_t i)
265 {
266 ASSERT(m_type == StartTag || m_type == EndTag);
267 m_attributes[i].m_value.clear();
268 }
269
characters()270 const DataVector& characters() const
271 {
272 ASSERT(m_type == Character);
273 return m_data;
274 }
275
comment()276 const DataVector& comment() const
277 {
278 ASSERT(m_type == Comment);
279 return m_data;
280 }
281
282 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()283 const WTF::Vector<UChar>& publicIdentifier() const
284 {
285 ASSERT(m_type == DOCTYPE);
286 return m_doctypeData->m_publicIdentifier;
287 }
288
289 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()290 const WTF::Vector<UChar>& systemIdentifier() const
291 {
292 ASSERT(m_type == DOCTYPE);
293 return m_doctypeData->m_systemIdentifier;
294 }
295
setPublicIdentifierToEmptyString()296 void setPublicIdentifierToEmptyString()
297 {
298 ASSERT(m_type == DOCTYPE);
299 m_doctypeData->m_hasPublicIdentifier = true;
300 m_doctypeData->m_publicIdentifier.clear();
301 }
302
setSystemIdentifierToEmptyString()303 void setSystemIdentifierToEmptyString()
304 {
305 ASSERT(m_type == DOCTYPE);
306 m_doctypeData->m_hasSystemIdentifier = true;
307 m_doctypeData->m_systemIdentifier.clear();
308 }
309
forceQuirks()310 bool forceQuirks() const
311 {
312 ASSERT(m_type == DOCTYPE);
313 return m_doctypeData->m_forceQuirks;
314 }
315
setForceQuirks()316 void setForceQuirks()
317 {
318 ASSERT(m_type == DOCTYPE);
319 m_doctypeData->m_forceQuirks = true;
320 }
321
appendToPublicIdentifier(UChar character)322 void appendToPublicIdentifier(UChar character)
323 {
324 ASSERT(character);
325 ASSERT(m_type == DOCTYPE);
326 ASSERT(m_doctypeData->m_hasPublicIdentifier);
327 m_doctypeData->m_publicIdentifier.append(character);
328 }
329
appendToSystemIdentifier(UChar character)330 void appendToSystemIdentifier(UChar character)
331 {
332 ASSERT(character);
333 ASSERT(m_type == DOCTYPE);
334 ASSERT(m_doctypeData->m_hasSystemIdentifier);
335 m_doctypeData->m_systemIdentifier.append(character);
336 }
337
338 private:
339 // FIXME: I'm not sure what the final relationship between HTMLToken and
340 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll
341 // want to end up with a cleaner interface between the two classes.
342 friend class AtomicHTMLToken;
343
344 class DoctypeData {
345 WTF_MAKE_NONCOPYABLE(DoctypeData);
346 public:
DoctypeData()347 DoctypeData()
348 : m_hasPublicIdentifier(false)
349 , m_hasSystemIdentifier(false)
350 , m_forceQuirks(false)
351 {
352 }
353
354 bool m_hasPublicIdentifier;
355 bool m_hasSystemIdentifier;
356 bool m_forceQuirks;
357 WTF::Vector<UChar> m_publicIdentifier;
358 WTF::Vector<UChar> m_systemIdentifier;
359 };
360
361 Type m_type;
362 Range m_range; // Always starts at zero.
363 int m_baseOffset;
364
365 // "name" for DOCTYPE, StartTag, and EndTag
366 // "characters" for Character
367 // "data" for Comment
368 DataVector m_data;
369
370 // For DOCTYPE
371 OwnPtr<DoctypeData> m_doctypeData;
372
373 // For StartTag and EndTag
374 bool m_selfClosing;
375 AttributeList m_attributes;
376
377 // A pointer into m_attributes used during lexing.
378 Attribute* m_currentAttribute;
379 };
380
// FIXME: This class should eventually be named HTMLToken once we move the
// existing HTMLToken to be internal to the HTMLTokenizer.
383 class AtomicHTMLToken {
384 WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
385 public:
AtomicHTMLToken(HTMLToken & token)386 AtomicHTMLToken(HTMLToken& token)
387 : m_type(token.type())
388 {
389 switch (m_type) {
390 case HTMLToken::Uninitialized:
391 ASSERT_NOT_REACHED();
392 break;
393 case HTMLToken::DOCTYPE:
394 m_name = AtomicString(token.name().data(), token.name().size());
395 m_doctypeData = token.m_doctypeData.release();
396 break;
397 case HTMLToken::EndOfFile:
398 break;
399 case HTMLToken::StartTag:
400 case HTMLToken::EndTag: {
401 m_selfClosing = token.selfClosing();
402 m_name = AtomicString(token.name().data(), token.name().size());
403 initializeAttributes(token.attributes());
404 break;
405 }
406 case HTMLToken::Comment:
407 m_data = String(token.comment().data(), token.comment().size());
408 break;
409 case HTMLToken::Character:
410 m_externalCharacters = &token.characters();
411 break;
412 }
413 }
414
415 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
m_type(type)416 : m_type(type)
417 , m_name(name)
418 , m_attributes(attributes)
419 {
420 ASSERT(usesName());
421 }
422
type()423 HTMLToken::Type type() const { return m_type; }
424
name()425 const AtomicString& name() const
426 {
427 ASSERT(usesName());
428 return m_name;
429 }
430
setName(const AtomicString & name)431 void setName(const AtomicString& name)
432 {
433 ASSERT(usesName());
434 m_name = name;
435 }
436
selfClosing()437 bool selfClosing() const
438 {
439 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
440 return m_selfClosing;
441 }
442
getAttributeItem(const QualifiedName & attributeName)443 Attribute* getAttributeItem(const QualifiedName& attributeName)
444 {
445 ASSERT(usesAttributes());
446 if (!m_attributes)
447 return 0;
448 return m_attributes->getAttributeItem(attributeName);
449 }
450
attributes()451 NamedNodeMap* attributes() const
452 {
453 ASSERT(usesAttributes());
454 return m_attributes.get();
455 }
456
takeAtributes()457 PassRefPtr<NamedNodeMap> takeAtributes()
458 {
459 ASSERT(usesAttributes());
460 return m_attributes.release();
461 }
462
characters()463 const HTMLToken::DataVector& characters() const
464 {
465 ASSERT(m_type == HTMLToken::Character);
466 return *m_externalCharacters;
467 }
468
comment()469 const String& comment() const
470 {
471 ASSERT(m_type == HTMLToken::Comment);
472 return m_data;
473 }
474
475 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()476 WTF::Vector<UChar>& publicIdentifier() const
477 {
478 ASSERT(m_type == HTMLToken::DOCTYPE);
479 return m_doctypeData->m_publicIdentifier;
480 }
481
482 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()483 WTF::Vector<UChar>& systemIdentifier() const
484 {
485 ASSERT(m_type == HTMLToken::DOCTYPE);
486 return m_doctypeData->m_systemIdentifier;
487 }
488
forceQuirks()489 bool forceQuirks() const
490 {
491 ASSERT(m_type == HTMLToken::DOCTYPE);
492 return m_doctypeData->m_forceQuirks;
493 }
494
495 private:
496 HTMLToken::Type m_type;
497
498 void initializeAttributes(const HTMLToken::AttributeList& attributes);
499
usesName()500 bool usesName() const
501 {
502 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
503 }
504
usesAttributes()505 bool usesAttributes() const
506 {
507 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
508 }
509
510 // "name" for DOCTYPE, StartTag, and EndTag
511 AtomicString m_name;
512
513 // "data" for Comment
514 String m_data;
515
516 // "characters" for Character
517 //
518 // We don't want to copy the the characters out of the HTMLToken, so we
519 // keep a pointer to its buffer instead. This buffer is owned by the
520 // HTMLToken and causes a lifetime dependence between these objects.
521 //
522 // FIXME: Add a mechanism for "internalizing" the characters when the
523 // HTMLToken is destructed.
524 const HTMLToken::DataVector* m_externalCharacters;
525
526 // For DOCTYPE
527 OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
528
529 // For StartTag and EndTag
530 bool m_selfClosing;
531
532 RefPtr<NamedNodeMap> m_attributes;
533 };
534
initializeAttributes(const HTMLToken::AttributeList & attributes)535 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes)
536 {
537 size_t size = attributes.size();
538 if (!size)
539 return;
540
541 m_attributes = NamedNodeMap::create();
542 m_attributes->reserveInitialCapacity(size);
543 for (size_t i = 0; i < size; ++i) {
544 const HTMLToken::Attribute& attribute = attributes[i];
545 if (attribute.m_name.isEmpty())
546 continue;
547
548 ASSERT(attribute.m_nameRange.m_start);
549 ASSERT(attribute.m_nameRange.m_end);
550 ASSERT(attribute.m_valueRange.m_start);
551 ASSERT(attribute.m_valueRange.m_end);
552
553 String name(attribute.m_name.data(), attribute.m_name.size());
554 String value(attribute.m_value.data(), attribute.m_value.size());
555 m_attributes->insertAttribute(Attribute::createMapped(name, value), false);
556 }
557 }
558
559 }
560
561 #endif
562