1 /*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28
29 #include "core/dom/Attribute.h"
30 #include "core/html/parser/HTMLToken.h"
31 #include "wtf/PassOwnPtr.h"
32 #include "wtf/RefCounted.h"
33 #include "wtf/RefPtr.h"
34
35 namespace WebCore {
36
37 class DoctypeData {
38 WTF_MAKE_NONCOPYABLE(DoctypeData);
39 public:
DoctypeData()40 DoctypeData()
41 : m_hasPublicIdentifier(false)
42 , m_hasSystemIdentifier(false)
43 , m_forceQuirks(false)
44 {
45 }
46
47 // FIXME: This should use String instead of Vector<UChar>.
48 bool m_hasPublicIdentifier;
49 bool m_hasSystemIdentifier;
50 WTF::Vector<UChar> m_publicIdentifier;
51 WTF::Vector<UChar> m_systemIdentifier;
52 bool m_forceQuirks;
53 };
54
findAttributeInVector(Vector<Attribute> & attributes,const QualifiedName & name)55 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
56 {
57 for (unsigned i = 0; i < attributes.size(); ++i) {
58 if (attributes.at(i).name().matches(name))
59 return &attributes.at(i);
60 }
61 return 0;
62 }
63
64 class HTMLToken {
65 WTF_MAKE_NONCOPYABLE(HTMLToken);
66 WTF_MAKE_FAST_ALLOCATED;
67 public:
68 enum Type {
69 Uninitialized,
70 DOCTYPE,
71 StartTag,
72 EndTag,
73 Comment,
74 Character,
75 EndOfFile,
76 };
77
78 class Attribute {
79 public:
80 class Range {
81 public:
82 int start;
83 int end;
84 };
85
86 Range nameRange;
87 Range valueRange;
88 Vector<UChar, 32> name;
89 Vector<UChar, 32> value;
90 };
91
92 typedef Vector<Attribute, 10> AttributeList;
93
94 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
95 // approximately 99% of the time based on a non-scientific browse around a number of
96 // popular web sites on 23 May 2013.
97 typedef Vector<UChar, 256> DataVector;
98
HTMLToken()99 HTMLToken() { clear(); }
100
clear()101 void clear()
102 {
103 m_type = Uninitialized;
104 m_range.start = 0;
105 m_range.end = 0;
106 m_baseOffset = 0;
107 m_data.clear();
108 m_orAllData = 0;
109 }
110
isUninitialized()111 bool isUninitialized() { return m_type == Uninitialized; }
type()112 Type type() const { return m_type; }
113
makeEndOfFile()114 void makeEndOfFile()
115 {
116 ASSERT(m_type == Uninitialized);
117 m_type = EndOfFile;
118 }
119
120 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
startIndex()121 int startIndex() const { return m_range.start; }
endIndex()122 int endIndex() const { return m_range.end; }
123
setBaseOffset(int offset)124 void setBaseOffset(int offset)
125 {
126 m_baseOffset = offset;
127 }
128
end(int endOffset)129 void end(int endOffset)
130 {
131 m_range.end = endOffset - m_baseOffset;
132 }
133
data()134 const DataVector& data() const
135 {
136 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
137 return m_data;
138 }
139
isAll8BitData()140 bool isAll8BitData() const
141 {
142 return (m_orAllData <= 0xff);
143 }
144
name()145 const DataVector& name() const
146 {
147 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
148 return m_data;
149 }
150
appendToName(UChar character)151 void appendToName(UChar character)
152 {
153 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
154 ASSERT(character);
155 m_data.append(character);
156 m_orAllData |= character;
157 }
158
159 /* DOCTYPE Tokens */
160
forceQuirks()161 bool forceQuirks() const
162 {
163 ASSERT(m_type == DOCTYPE);
164 return m_doctypeData->m_forceQuirks;
165 }
166
setForceQuirks()167 void setForceQuirks()
168 {
169 ASSERT(m_type == DOCTYPE);
170 m_doctypeData->m_forceQuirks = true;
171 }
172
beginDOCTYPE()173 void beginDOCTYPE()
174 {
175 ASSERT(m_type == Uninitialized);
176 m_type = DOCTYPE;
177 m_doctypeData = adoptPtr(new DoctypeData);
178 }
179
beginDOCTYPE(UChar character)180 void beginDOCTYPE(UChar character)
181 {
182 ASSERT(character);
183 beginDOCTYPE();
184 m_data.append(character);
185 m_orAllData |= character;
186 }
187
188 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()189 const WTF::Vector<UChar>& publicIdentifier() const
190 {
191 ASSERT(m_type == DOCTYPE);
192 return m_doctypeData->m_publicIdentifier;
193 }
194
195 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()196 const WTF::Vector<UChar>& systemIdentifier() const
197 {
198 ASSERT(m_type == DOCTYPE);
199 return m_doctypeData->m_systemIdentifier;
200 }
201
setPublicIdentifierToEmptyString()202 void setPublicIdentifierToEmptyString()
203 {
204 ASSERT(m_type == DOCTYPE);
205 m_doctypeData->m_hasPublicIdentifier = true;
206 m_doctypeData->m_publicIdentifier.clear();
207 }
208
setSystemIdentifierToEmptyString()209 void setSystemIdentifierToEmptyString()
210 {
211 ASSERT(m_type == DOCTYPE);
212 m_doctypeData->m_hasSystemIdentifier = true;
213 m_doctypeData->m_systemIdentifier.clear();
214 }
215
appendToPublicIdentifier(UChar character)216 void appendToPublicIdentifier(UChar character)
217 {
218 ASSERT(character);
219 ASSERT(m_type == DOCTYPE);
220 ASSERT(m_doctypeData->m_hasPublicIdentifier);
221 m_doctypeData->m_publicIdentifier.append(character);
222 }
223
appendToSystemIdentifier(UChar character)224 void appendToSystemIdentifier(UChar character)
225 {
226 ASSERT(character);
227 ASSERT(m_type == DOCTYPE);
228 ASSERT(m_doctypeData->m_hasSystemIdentifier);
229 m_doctypeData->m_systemIdentifier.append(character);
230 }
231
releaseDoctypeData()232 PassOwnPtr<DoctypeData> releaseDoctypeData()
233 {
234 return m_doctypeData.release();
235 }
236
237 /* Start/End Tag Tokens */
238
selfClosing()239 bool selfClosing() const
240 {
241 ASSERT(m_type == StartTag || m_type == EndTag);
242 return m_selfClosing;
243 }
244
setSelfClosing()245 void setSelfClosing()
246 {
247 ASSERT(m_type == StartTag || m_type == EndTag);
248 m_selfClosing = true;
249 }
250
beginStartTag(UChar character)251 void beginStartTag(UChar character)
252 {
253 ASSERT(character);
254 ASSERT(m_type == Uninitialized);
255 m_type = StartTag;
256 m_selfClosing = false;
257 m_currentAttribute = 0;
258 m_attributes.clear();
259
260 m_data.append(character);
261 m_orAllData |= character;
262 }
263
beginEndTag(LChar character)264 void beginEndTag(LChar character)
265 {
266 ASSERT(m_type == Uninitialized);
267 m_type = EndTag;
268 m_selfClosing = false;
269 m_currentAttribute = 0;
270 m_attributes.clear();
271
272 m_data.append(character);
273 }
274
beginEndTag(const Vector<LChar,32> & characters)275 void beginEndTag(const Vector<LChar, 32>& characters)
276 {
277 ASSERT(m_type == Uninitialized);
278 m_type = EndTag;
279 m_selfClosing = false;
280 m_currentAttribute = 0;
281 m_attributes.clear();
282
283 m_data.appendVector(characters);
284 }
285
addNewAttribute()286 void addNewAttribute()
287 {
288 ASSERT(m_type == StartTag || m_type == EndTag);
289 m_attributes.grow(m_attributes.size() + 1);
290 m_currentAttribute = &m_attributes.last();
291 #ifndef NDEBUG
292 m_currentAttribute->nameRange.start = 0;
293 m_currentAttribute->nameRange.end = 0;
294 m_currentAttribute->valueRange.start = 0;
295 m_currentAttribute->valueRange.end = 0;
296 #endif
297 }
298
beginAttributeName(int offset)299 void beginAttributeName(int offset)
300 {
301 m_currentAttribute->nameRange.start = offset - m_baseOffset;
302 }
303
endAttributeName(int offset)304 void endAttributeName(int offset)
305 {
306 int index = offset - m_baseOffset;
307 m_currentAttribute->nameRange.end = index;
308 m_currentAttribute->valueRange.start = index;
309 m_currentAttribute->valueRange.end = index;
310 }
311
beginAttributeValue(int offset)312 void beginAttributeValue(int offset)
313 {
314 m_currentAttribute->valueRange.start = offset - m_baseOffset;
315 #ifndef NDEBUG
316 m_currentAttribute->valueRange.end = 0;
317 #endif
318 }
319
endAttributeValue(int offset)320 void endAttributeValue(int offset)
321 {
322 m_currentAttribute->valueRange.end = offset - m_baseOffset;
323 }
324
appendToAttributeName(UChar character)325 void appendToAttributeName(UChar character)
326 {
327 ASSERT(character);
328 ASSERT(m_type == StartTag || m_type == EndTag);
329 // FIXME: We should be able to add the following ASSERT once we fix
330 // https://bugs.webkit.org/show_bug.cgi?id=62971
331 // ASSERT(m_currentAttribute->nameRange.start);
332 m_currentAttribute->name.append(character);
333 }
334
appendToAttributeValue(UChar character)335 void appendToAttributeValue(UChar character)
336 {
337 ASSERT(character);
338 ASSERT(m_type == StartTag || m_type == EndTag);
339 ASSERT(m_currentAttribute->valueRange.start);
340 m_currentAttribute->value.append(character);
341 }
342
appendToAttributeValue(size_t i,const String & value)343 void appendToAttributeValue(size_t i, const String& value)
344 {
345 ASSERT(!value.isEmpty());
346 ASSERT(m_type == StartTag || m_type == EndTag);
347 append(m_attributes[i].value, value);
348 }
349
attributes()350 const AttributeList& attributes() const
351 {
352 ASSERT(m_type == StartTag || m_type == EndTag);
353 return m_attributes;
354 }
355
getAttributeItem(const QualifiedName & name)356 const Attribute* getAttributeItem(const QualifiedName& name) const
357 {
358 for (unsigned i = 0; i < m_attributes.size(); ++i) {
359 if (AtomicString(m_attributes.at(i).name) == name.localName())
360 return &m_attributes.at(i);
361 }
362 return 0;
363 }
364
365 // Used by the XSSAuditor to nuke XSS-laden attributes.
eraseValueOfAttribute(size_t i)366 void eraseValueOfAttribute(size_t i)
367 {
368 ASSERT(m_type == StartTag || m_type == EndTag);
369 m_attributes[i].value.clear();
370 }
371
372 /* Character Tokens */
373
374 // Starting a character token works slightly differently than starting
375 // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()376 void ensureIsCharacterToken()
377 {
378 ASSERT(m_type == Uninitialized || m_type == Character);
379 m_type = Character;
380 }
381
characters()382 const DataVector& characters() const
383 {
384 ASSERT(m_type == Character);
385 return m_data;
386 }
387
appendToCharacter(char character)388 void appendToCharacter(char character)
389 {
390 ASSERT(m_type == Character);
391 m_data.append(character);
392 }
393
appendToCharacter(UChar character)394 void appendToCharacter(UChar character)
395 {
396 ASSERT(m_type == Character);
397 m_data.append(character);
398 m_orAllData |= character;
399 }
400
appendToCharacter(const Vector<LChar,32> & characters)401 void appendToCharacter(const Vector<LChar, 32>& characters)
402 {
403 ASSERT(m_type == Character);
404 m_data.appendVector(characters);
405 }
406
407 /* Comment Tokens */
408
comment()409 const DataVector& comment() const
410 {
411 ASSERT(m_type == Comment);
412 return m_data;
413 }
414
beginComment()415 void beginComment()
416 {
417 ASSERT(m_type == Uninitialized);
418 m_type = Comment;
419 }
420
appendToComment(UChar character)421 void appendToComment(UChar character)
422 {
423 ASSERT(character);
424 ASSERT(m_type == Comment);
425 m_data.append(character);
426 m_orAllData |= character;
427 }
428
eraseCharacters()429 void eraseCharacters()
430 {
431 ASSERT(m_type == Character);
432 m_data.clear();
433 m_orAllData = 0;
434 }
435
436 private:
437 Type m_type;
438 Attribute::Range m_range; // Always starts at zero.
439 int m_baseOffset;
440 DataVector m_data;
441 UChar m_orAllData;
442
443 // For StartTag and EndTag
444 bool m_selfClosing;
445 AttributeList m_attributes;
446
447 // A pointer into m_attributes used during lexing.
448 Attribute* m_currentAttribute;
449
450 // For DOCTYPE
451 OwnPtr<DoctypeData> m_doctypeData;
452 };
453
454 }
455
456 #endif
457