1 /*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28
29 #include "core/dom/Attribute.h"
30 #include "wtf/PassOwnPtr.h"
31 #include "wtf/RefCounted.h"
32 #include "wtf/RefPtr.h"
33
34 namespace WebCore {
35
36 class DoctypeData {
37 WTF_MAKE_NONCOPYABLE(DoctypeData);
38 public:
DoctypeData()39 DoctypeData()
40 : m_hasPublicIdentifier(false)
41 , m_hasSystemIdentifier(false)
42 , m_forceQuirks(false)
43 {
44 }
45
46 // FIXME: This should use String instead of Vector<UChar>.
47 bool m_hasPublicIdentifier;
48 bool m_hasSystemIdentifier;
49 WTF::Vector<UChar> m_publicIdentifier;
50 WTF::Vector<UChar> m_systemIdentifier;
51 bool m_forceQuirks;
52 };
53
findAttributeInVector(Vector<Attribute> & attributes,const QualifiedName & name)54 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
55 {
56 for (unsigned i = 0; i < attributes.size(); ++i) {
57 if (attributes.at(i).name().matches(name))
58 return &attributes.at(i);
59 }
60 return 0;
61 }
62
63 class HTMLToken {
64 WTF_MAKE_NONCOPYABLE(HTMLToken);
65 WTF_MAKE_FAST_ALLOCATED;
66 public:
67 enum Type {
68 Uninitialized,
69 DOCTYPE,
70 StartTag,
71 EndTag,
72 Comment,
73 Character,
74 EndOfFile,
75 };
76
77 class Attribute {
78 public:
79 class Range {
80 public:
81 int start;
82 int end;
83 };
84
85 Range nameRange;
86 Range valueRange;
87 Vector<UChar, 32> name;
88 Vector<UChar, 32> value;
89 };
90
91 typedef Vector<Attribute, 10> AttributeList;
92
93 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
94 // approximately 99% of the time based on a non-scientific browse around a number of
95 // popular web sites on 23 May 2013.
96 typedef Vector<UChar, 256> DataVector;
97
HTMLToken()98 HTMLToken() { clear(); }
99
clear()100 void clear()
101 {
102 m_type = Uninitialized;
103 m_range.start = 0;
104 m_range.end = 0;
105 m_baseOffset = 0;
106 // Don't call Vector::clear() as that would destroy the
107 // alloced VectorBuffer. If the innerHTML'd content has
108 // two 257 character text nodes in a row, we'll needlessly
109 // thrash malloc. When we finally finish the parse the
110 // HTMLToken will be destroyed and the VectorBuffer released.
111 m_data.shrink(0);
112 m_orAllData = 0;
113 }
114
isUninitialized()115 bool isUninitialized() { return m_type == Uninitialized; }
type()116 Type type() const { return m_type; }
117
makeEndOfFile()118 void makeEndOfFile()
119 {
120 ASSERT(m_type == Uninitialized);
121 m_type = EndOfFile;
122 }
123
124 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
startIndex()125 int startIndex() const { return m_range.start; }
endIndex()126 int endIndex() const { return m_range.end; }
127
setBaseOffset(int offset)128 void setBaseOffset(int offset)
129 {
130 m_baseOffset = offset;
131 }
132
end(int endOffset)133 void end(int endOffset)
134 {
135 m_range.end = endOffset - m_baseOffset;
136 }
137
data()138 const DataVector& data() const
139 {
140 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
141 return m_data;
142 }
143
isAll8BitData()144 bool isAll8BitData() const
145 {
146 return (m_orAllData <= 0xff);
147 }
148
name()149 const DataVector& name() const
150 {
151 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
152 return m_data;
153 }
154
appendToName(UChar character)155 void appendToName(UChar character)
156 {
157 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
158 ASSERT(character);
159 m_data.append(character);
160 m_orAllData |= character;
161 }
162
163 /* DOCTYPE Tokens */
164
forceQuirks()165 bool forceQuirks() const
166 {
167 ASSERT(m_type == DOCTYPE);
168 return m_doctypeData->m_forceQuirks;
169 }
170
setForceQuirks()171 void setForceQuirks()
172 {
173 ASSERT(m_type == DOCTYPE);
174 m_doctypeData->m_forceQuirks = true;
175 }
176
beginDOCTYPE()177 void beginDOCTYPE()
178 {
179 ASSERT(m_type == Uninitialized);
180 m_type = DOCTYPE;
181 m_doctypeData = adoptPtr(new DoctypeData);
182 }
183
beginDOCTYPE(UChar character)184 void beginDOCTYPE(UChar character)
185 {
186 ASSERT(character);
187 beginDOCTYPE();
188 m_data.append(character);
189 m_orAllData |= character;
190 }
191
192 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()193 const WTF::Vector<UChar>& publicIdentifier() const
194 {
195 ASSERT(m_type == DOCTYPE);
196 return m_doctypeData->m_publicIdentifier;
197 }
198
199 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()200 const WTF::Vector<UChar>& systemIdentifier() const
201 {
202 ASSERT(m_type == DOCTYPE);
203 return m_doctypeData->m_systemIdentifier;
204 }
205
setPublicIdentifierToEmptyString()206 void setPublicIdentifierToEmptyString()
207 {
208 ASSERT(m_type == DOCTYPE);
209 m_doctypeData->m_hasPublicIdentifier = true;
210 m_doctypeData->m_publicIdentifier.clear();
211 }
212
setSystemIdentifierToEmptyString()213 void setSystemIdentifierToEmptyString()
214 {
215 ASSERT(m_type == DOCTYPE);
216 m_doctypeData->m_hasSystemIdentifier = true;
217 m_doctypeData->m_systemIdentifier.clear();
218 }
219
appendToPublicIdentifier(UChar character)220 void appendToPublicIdentifier(UChar character)
221 {
222 ASSERT(character);
223 ASSERT(m_type == DOCTYPE);
224 ASSERT(m_doctypeData->m_hasPublicIdentifier);
225 m_doctypeData->m_publicIdentifier.append(character);
226 }
227
appendToSystemIdentifier(UChar character)228 void appendToSystemIdentifier(UChar character)
229 {
230 ASSERT(character);
231 ASSERT(m_type == DOCTYPE);
232 ASSERT(m_doctypeData->m_hasSystemIdentifier);
233 m_doctypeData->m_systemIdentifier.append(character);
234 }
235
releaseDoctypeData()236 PassOwnPtr<DoctypeData> releaseDoctypeData()
237 {
238 return m_doctypeData.release();
239 }
240
241 /* Start/End Tag Tokens */
242
selfClosing()243 bool selfClosing() const
244 {
245 ASSERT(m_type == StartTag || m_type == EndTag);
246 return m_selfClosing;
247 }
248
setSelfClosing()249 void setSelfClosing()
250 {
251 ASSERT(m_type == StartTag || m_type == EndTag);
252 m_selfClosing = true;
253 }
254
beginStartTag(UChar character)255 void beginStartTag(UChar character)
256 {
257 ASSERT(character);
258 ASSERT(m_type == Uninitialized);
259 m_type = StartTag;
260 m_selfClosing = false;
261 m_currentAttribute = 0;
262 m_attributes.clear();
263
264 m_data.append(character);
265 m_orAllData |= character;
266 }
267
beginEndTag(LChar character)268 void beginEndTag(LChar character)
269 {
270 ASSERT(m_type == Uninitialized);
271 m_type = EndTag;
272 m_selfClosing = false;
273 m_currentAttribute = 0;
274 m_attributes.clear();
275
276 m_data.append(character);
277 }
278
beginEndTag(const Vector<LChar,32> & characters)279 void beginEndTag(const Vector<LChar, 32>& characters)
280 {
281 ASSERT(m_type == Uninitialized);
282 m_type = EndTag;
283 m_selfClosing = false;
284 m_currentAttribute = 0;
285 m_attributes.clear();
286
287 m_data.appendVector(characters);
288 }
289
addNewAttribute()290 void addNewAttribute()
291 {
292 ASSERT(m_type == StartTag || m_type == EndTag);
293 m_attributes.grow(m_attributes.size() + 1);
294 m_currentAttribute = &m_attributes.last();
295 #ifndef NDEBUG
296 m_currentAttribute->nameRange.start = 0;
297 m_currentAttribute->nameRange.end = 0;
298 m_currentAttribute->valueRange.start = 0;
299 m_currentAttribute->valueRange.end = 0;
300 #endif
301 }
302
beginAttributeName(int offset)303 void beginAttributeName(int offset)
304 {
305 m_currentAttribute->nameRange.start = offset - m_baseOffset;
306 }
307
endAttributeName(int offset)308 void endAttributeName(int offset)
309 {
310 int index = offset - m_baseOffset;
311 m_currentAttribute->nameRange.end = index;
312 m_currentAttribute->valueRange.start = index;
313 m_currentAttribute->valueRange.end = index;
314 }
315
beginAttributeValue(int offset)316 void beginAttributeValue(int offset)
317 {
318 m_currentAttribute->valueRange.start = offset - m_baseOffset;
319 #ifndef NDEBUG
320 m_currentAttribute->valueRange.end = 0;
321 #endif
322 }
323
endAttributeValue(int offset)324 void endAttributeValue(int offset)
325 {
326 m_currentAttribute->valueRange.end = offset - m_baseOffset;
327 }
328
appendToAttributeName(UChar character)329 void appendToAttributeName(UChar character)
330 {
331 ASSERT(character);
332 ASSERT(m_type == StartTag || m_type == EndTag);
333 ASSERT(m_currentAttribute->nameRange.start);
334 m_currentAttribute->name.append(character);
335 }
336
appendToAttributeValue(UChar character)337 void appendToAttributeValue(UChar character)
338 {
339 ASSERT(character);
340 ASSERT(m_type == StartTag || m_type == EndTag);
341 ASSERT(m_currentAttribute->valueRange.start);
342 m_currentAttribute->value.append(character);
343 }
344
appendToAttributeValue(size_t i,const String & value)345 void appendToAttributeValue(size_t i, const String& value)
346 {
347 ASSERT(!value.isEmpty());
348 ASSERT(m_type == StartTag || m_type == EndTag);
349 append(m_attributes[i].value, value);
350 }
351
attributes()352 const AttributeList& attributes() const
353 {
354 ASSERT(m_type == StartTag || m_type == EndTag);
355 return m_attributes;
356 }
357
getAttributeItem(const QualifiedName & name)358 const Attribute* getAttributeItem(const QualifiedName& name) const
359 {
360 for (unsigned i = 0; i < m_attributes.size(); ++i) {
361 if (AtomicString(m_attributes.at(i).name) == name.localName())
362 return &m_attributes.at(i);
363 }
364 return 0;
365 }
366
367 // Used by the XSSAuditor to nuke XSS-laden attributes.
eraseValueOfAttribute(size_t i)368 void eraseValueOfAttribute(size_t i)
369 {
370 ASSERT(m_type == StartTag || m_type == EndTag);
371 m_attributes[i].value.clear();
372 }
373
374 /* Character Tokens */
375
376 // Starting a character token works slightly differently than starting
377 // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()378 void ensureIsCharacterToken()
379 {
380 ASSERT(m_type == Uninitialized || m_type == Character);
381 m_type = Character;
382 }
383
characters()384 const DataVector& characters() const
385 {
386 ASSERT(m_type == Character);
387 return m_data;
388 }
389
appendToCharacter(char character)390 void appendToCharacter(char character)
391 {
392 ASSERT(m_type == Character);
393 m_data.append(character);
394 }
395
appendToCharacter(UChar character)396 void appendToCharacter(UChar character)
397 {
398 ASSERT(m_type == Character);
399 m_data.append(character);
400 m_orAllData |= character;
401 }
402
appendToCharacter(const Vector<LChar,32> & characters)403 void appendToCharacter(const Vector<LChar, 32>& characters)
404 {
405 ASSERT(m_type == Character);
406 m_data.appendVector(characters);
407 }
408
409 /* Comment Tokens */
410
comment()411 const DataVector& comment() const
412 {
413 ASSERT(m_type == Comment);
414 return m_data;
415 }
416
beginComment()417 void beginComment()
418 {
419 ASSERT(m_type == Uninitialized);
420 m_type = Comment;
421 }
422
appendToComment(UChar character)423 void appendToComment(UChar character)
424 {
425 ASSERT(character);
426 ASSERT(m_type == Comment);
427 m_data.append(character);
428 m_orAllData |= character;
429 }
430
431 // Only for XSSAuditor
eraseCharacters()432 void eraseCharacters()
433 {
434 ASSERT(m_type == Character);
435 m_data.clear();
436 m_orAllData = 0;
437 }
438
439 private:
440 Type m_type;
441 Attribute::Range m_range; // Always starts at zero.
442 int m_baseOffset;
443 DataVector m_data;
444 UChar m_orAllData;
445
446 // For StartTag and EndTag
447 bool m_selfClosing;
448 AttributeList m_attributes;
449
450 // A pointer into m_attributes used during lexing.
451 Attribute* m_currentAttribute;
452
453 // For DOCTYPE
454 OwnPtr<DoctypeData> m_doctypeData;
455 };
456
457 }
458
459 #endif
460