1 /*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28
29 #include "core/dom/Attribute.h"
30 #include "wtf/PassOwnPtr.h"
31 #include "wtf/RefCounted.h"
32 #include "wtf/RefPtr.h"
33
34 namespace blink {
35
36 class DoctypeData {
37 WTF_MAKE_NONCOPYABLE(DoctypeData);
38 public:
DoctypeData()39 DoctypeData()
40 : m_hasPublicIdentifier(false)
41 , m_hasSystemIdentifier(false)
42 , m_forceQuirks(false)
43 {
44 }
45
46 bool m_hasPublicIdentifier;
47 bool m_hasSystemIdentifier;
48 WTF::Vector<UChar> m_publicIdentifier;
49 WTF::Vector<UChar> m_systemIdentifier;
50 bool m_forceQuirks;
51 };
52
findAttributeInVector(Vector<Attribute> & attributes,const QualifiedName & name)53 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
54 {
55 for (unsigned i = 0; i < attributes.size(); ++i) {
56 if (attributes.at(i).name().matches(name))
57 return &attributes.at(i);
58 }
59 return 0;
60 }
61
62 class HTMLToken {
63 WTF_MAKE_NONCOPYABLE(HTMLToken);
64 WTF_MAKE_FAST_ALLOCATED;
65 public:
66 enum Type {
67 Uninitialized,
68 DOCTYPE,
69 StartTag,
70 EndTag,
71 Comment,
72 Character,
73 EndOfFile,
74 };
75
76 class Attribute {
77 public:
78 class Range {
79 public:
80 int start;
81 int end;
82 };
83
84 Range nameRange;
85 Range valueRange;
86 Vector<UChar, 32> name;
87 Vector<UChar, 32> value;
88 };
89
90 typedef Vector<Attribute, 10> AttributeList;
91
92 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
93 // approximately 99% of the time based on a non-scientific browse around a number of
94 // popular web sites on 23 May 2013.
95 typedef Vector<UChar, 256> DataVector;
96
HTMLToken()97 HTMLToken() { clear(); }
98
clear()99 void clear()
100 {
101 m_type = Uninitialized;
102 m_range.start = 0;
103 m_range.end = 0;
104 m_baseOffset = 0;
105 // Don't call Vector::clear() as that would destroy the
106 // alloced VectorBuffer. If the innerHTML'd content has
107 // two 257 character text nodes in a row, we'll needlessly
108 // thrash malloc. When we finally finish the parse the
109 // HTMLToken will be destroyed and the VectorBuffer released.
110 m_data.shrink(0);
111 m_orAllData = 0;
112 }
113
isUninitialized()114 bool isUninitialized() { return m_type == Uninitialized; }
type()115 Type type() const { return m_type; }
116
makeEndOfFile()117 void makeEndOfFile()
118 {
119 ASSERT(m_type == Uninitialized);
120 m_type = EndOfFile;
121 }
122
123 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
startIndex()124 int startIndex() const { return m_range.start; }
endIndex()125 int endIndex() const { return m_range.end; }
126
setBaseOffset(int offset)127 void setBaseOffset(int offset)
128 {
129 m_baseOffset = offset;
130 }
131
end(int endOffset)132 void end(int endOffset)
133 {
134 m_range.end = endOffset - m_baseOffset;
135 }
136
data()137 const DataVector& data() const
138 {
139 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
140 return m_data;
141 }
142
isAll8BitData()143 bool isAll8BitData() const
144 {
145 return (m_orAllData <= 0xff);
146 }
147
name()148 const DataVector& name() const
149 {
150 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
151 return m_data;
152 }
153
appendToName(UChar character)154 void appendToName(UChar character)
155 {
156 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
157 ASSERT(character);
158 m_data.append(character);
159 m_orAllData |= character;
160 }
161
162 /* DOCTYPE Tokens */
163
forceQuirks()164 bool forceQuirks() const
165 {
166 ASSERT(m_type == DOCTYPE);
167 return m_doctypeData->m_forceQuirks;
168 }
169
setForceQuirks()170 void setForceQuirks()
171 {
172 ASSERT(m_type == DOCTYPE);
173 m_doctypeData->m_forceQuirks = true;
174 }
175
beginDOCTYPE()176 void beginDOCTYPE()
177 {
178 ASSERT(m_type == Uninitialized);
179 m_type = DOCTYPE;
180 m_doctypeData = adoptPtr(new DoctypeData);
181 }
182
beginDOCTYPE(UChar character)183 void beginDOCTYPE(UChar character)
184 {
185 ASSERT(character);
186 beginDOCTYPE();
187 m_data.append(character);
188 m_orAllData |= character;
189 }
190
191 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()192 const WTF::Vector<UChar>& publicIdentifier() const
193 {
194 ASSERT(m_type == DOCTYPE);
195 return m_doctypeData->m_publicIdentifier;
196 }
197
198 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()199 const WTF::Vector<UChar>& systemIdentifier() const
200 {
201 ASSERT(m_type == DOCTYPE);
202 return m_doctypeData->m_systemIdentifier;
203 }
204
setPublicIdentifierToEmptyString()205 void setPublicIdentifierToEmptyString()
206 {
207 ASSERT(m_type == DOCTYPE);
208 m_doctypeData->m_hasPublicIdentifier = true;
209 m_doctypeData->m_publicIdentifier.clear();
210 }
211
setSystemIdentifierToEmptyString()212 void setSystemIdentifierToEmptyString()
213 {
214 ASSERT(m_type == DOCTYPE);
215 m_doctypeData->m_hasSystemIdentifier = true;
216 m_doctypeData->m_systemIdentifier.clear();
217 }
218
appendToPublicIdentifier(UChar character)219 void appendToPublicIdentifier(UChar character)
220 {
221 ASSERT(character);
222 ASSERT(m_type == DOCTYPE);
223 ASSERT(m_doctypeData->m_hasPublicIdentifier);
224 m_doctypeData->m_publicIdentifier.append(character);
225 }
226
appendToSystemIdentifier(UChar character)227 void appendToSystemIdentifier(UChar character)
228 {
229 ASSERT(character);
230 ASSERT(m_type == DOCTYPE);
231 ASSERT(m_doctypeData->m_hasSystemIdentifier);
232 m_doctypeData->m_systemIdentifier.append(character);
233 }
234
releaseDoctypeData()235 PassOwnPtr<DoctypeData> releaseDoctypeData()
236 {
237 return m_doctypeData.release();
238 }
239
240 /* Start/End Tag Tokens */
241
selfClosing()242 bool selfClosing() const
243 {
244 ASSERT(m_type == StartTag || m_type == EndTag);
245 return m_selfClosing;
246 }
247
setSelfClosing()248 void setSelfClosing()
249 {
250 ASSERT(m_type == StartTag || m_type == EndTag);
251 m_selfClosing = true;
252 }
253
beginStartTag(UChar character)254 void beginStartTag(UChar character)
255 {
256 ASSERT(character);
257 ASSERT(m_type == Uninitialized);
258 m_type = StartTag;
259 m_selfClosing = false;
260 m_currentAttribute = 0;
261 m_attributes.clear();
262
263 m_data.append(character);
264 m_orAllData |= character;
265 }
266
beginEndTag(LChar character)267 void beginEndTag(LChar character)
268 {
269 ASSERT(m_type == Uninitialized);
270 m_type = EndTag;
271 m_selfClosing = false;
272 m_currentAttribute = 0;
273 m_attributes.clear();
274
275 m_data.append(character);
276 }
277
beginEndTag(const Vector<LChar,32> & characters)278 void beginEndTag(const Vector<LChar, 32>& characters)
279 {
280 ASSERT(m_type == Uninitialized);
281 m_type = EndTag;
282 m_selfClosing = false;
283 m_currentAttribute = 0;
284 m_attributes.clear();
285
286 m_data.appendVector(characters);
287 }
288
addNewAttribute()289 void addNewAttribute()
290 {
291 ASSERT(m_type == StartTag || m_type == EndTag);
292 m_attributes.grow(m_attributes.size() + 1);
293 m_currentAttribute = &m_attributes.last();
294 #if ENABLE(ASSERT)
295 m_currentAttribute->nameRange.start = 0;
296 m_currentAttribute->nameRange.end = 0;
297 m_currentAttribute->valueRange.start = 0;
298 m_currentAttribute->valueRange.end = 0;
299 #endif
300 }
301
beginAttributeName(int offset)302 void beginAttributeName(int offset)
303 {
304 m_currentAttribute->nameRange.start = offset - m_baseOffset;
305 }
306
endAttributeName(int offset)307 void endAttributeName(int offset)
308 {
309 int index = offset - m_baseOffset;
310 m_currentAttribute->nameRange.end = index;
311 m_currentAttribute->valueRange.start = index;
312 m_currentAttribute->valueRange.end = index;
313 }
314
beginAttributeValue(int offset)315 void beginAttributeValue(int offset)
316 {
317 m_currentAttribute->valueRange.start = offset - m_baseOffset;
318 #if ENABLE(ASSERT)
319 m_currentAttribute->valueRange.end = 0;
320 #endif
321 }
322
endAttributeValue(int offset)323 void endAttributeValue(int offset)
324 {
325 m_currentAttribute->valueRange.end = offset - m_baseOffset;
326 }
327
appendToAttributeName(UChar character)328 void appendToAttributeName(UChar character)
329 {
330 ASSERT(character);
331 ASSERT(m_type == StartTag || m_type == EndTag);
332 ASSERT(m_currentAttribute->nameRange.start);
333 m_currentAttribute->name.append(character);
334 }
335
appendToAttributeValue(UChar character)336 void appendToAttributeValue(UChar character)
337 {
338 ASSERT(character);
339 ASSERT(m_type == StartTag || m_type == EndTag);
340 ASSERT(m_currentAttribute->valueRange.start);
341 m_currentAttribute->value.append(character);
342 }
343
appendToAttributeValue(size_t i,const String & value)344 void appendToAttributeValue(size_t i, const String& value)
345 {
346 ASSERT(!value.isEmpty());
347 ASSERT(m_type == StartTag || m_type == EndTag);
348 append(m_attributes[i].value, value);
349 }
350
attributes()351 const AttributeList& attributes() const
352 {
353 ASSERT(m_type == StartTag || m_type == EndTag);
354 return m_attributes;
355 }
356
getAttributeItem(const QualifiedName & name)357 const Attribute* getAttributeItem(const QualifiedName& name) const
358 {
359 for (unsigned i = 0; i < m_attributes.size(); ++i) {
360 if (AtomicString(m_attributes.at(i).name) == name.localName())
361 return &m_attributes.at(i);
362 }
363 return 0;
364 }
365
366 // Used by the XSSAuditor to nuke XSS-laden attributes.
eraseValueOfAttribute(size_t i)367 void eraseValueOfAttribute(size_t i)
368 {
369 ASSERT(m_type == StartTag || m_type == EndTag);
370 m_attributes[i].value.clear();
371 }
372
373 /* Character Tokens */
374
375 // Starting a character token works slightly differently than starting
376 // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()377 void ensureIsCharacterToken()
378 {
379 ASSERT(m_type == Uninitialized || m_type == Character);
380 m_type = Character;
381 }
382
characters()383 const DataVector& characters() const
384 {
385 ASSERT(m_type == Character);
386 return m_data;
387 }
388
appendToCharacter(char character)389 void appendToCharacter(char character)
390 {
391 ASSERT(m_type == Character);
392 m_data.append(character);
393 }
394
appendToCharacter(UChar character)395 void appendToCharacter(UChar character)
396 {
397 ASSERT(m_type == Character);
398 m_data.append(character);
399 m_orAllData |= character;
400 }
401
appendToCharacter(const Vector<LChar,32> & characters)402 void appendToCharacter(const Vector<LChar, 32>& characters)
403 {
404 ASSERT(m_type == Character);
405 m_data.appendVector(characters);
406 }
407
408 /* Comment Tokens */
409
comment()410 const DataVector& comment() const
411 {
412 ASSERT(m_type == Comment);
413 return m_data;
414 }
415
beginComment()416 void beginComment()
417 {
418 ASSERT(m_type == Uninitialized);
419 m_type = Comment;
420 }
421
appendToComment(UChar character)422 void appendToComment(UChar character)
423 {
424 ASSERT(character);
425 ASSERT(m_type == Comment);
426 m_data.append(character);
427 m_orAllData |= character;
428 }
429
430 // Only for XSSAuditor
eraseCharacters()431 void eraseCharacters()
432 {
433 ASSERT(m_type == Character);
434 m_data.clear();
435 m_orAllData = 0;
436 }
437
438 private:
439 Type m_type;
440 Attribute::Range m_range; // Always starts at zero.
441 int m_baseOffset;
442 DataVector m_data;
443 UChar m_orAllData;
444
445 // For StartTag and EndTag
446 bool m_selfClosing;
447 AttributeList m_attributes;
448
449 // A pointer into m_attributes used during lexing.
450 Attribute* m_currentAttribute;
451
452 // For DOCTYPE
453 OwnPtr<DoctypeData> m_doctypeData;
454 };
455
456 }
457
458 #endif
459