1 /*
2 * Copyright (C) 2010 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 */
24
25 #include "config.h"
26 #include "core/html/parser/HTMLParserIdioms.h"
27
28 #include "core/HTMLNames.h"
29 #include <limits>
30 #include "wtf/MathExtras.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/StringBuilder.h"
33 #include "wtf/text/StringHash.h"
34 #include "wtf/text/TextEncoding.h"
35
36 namespace blink {
37
38 using namespace HTMLNames;
39
40 template <typename CharType>
stripLeadingAndTrailingHTMLSpaces(String string,const CharType * characters,unsigned length)41 static String stripLeadingAndTrailingHTMLSpaces(String string, const CharType* characters, unsigned length)
42 {
43 unsigned numLeadingSpaces = 0;
44 unsigned numTrailingSpaces = 0;
45
46 for (; numLeadingSpaces < length; ++numLeadingSpaces) {
47 if (isNotHTMLSpace<CharType>(characters[numLeadingSpaces]))
48 break;
49 }
50
51 if (numLeadingSpaces == length)
52 return string.isNull() ? string : emptyAtom.string();
53
54 for (; numTrailingSpaces < length; ++numTrailingSpaces) {
55 if (isNotHTMLSpace<CharType>(characters[length - numTrailingSpaces - 1]))
56 break;
57 }
58
59 ASSERT(numLeadingSpaces + numTrailingSpaces < length);
60
61 if (!(numLeadingSpaces | numTrailingSpaces))
62 return string;
63
64 return string.substring(numLeadingSpaces, length - (numLeadingSpaces + numTrailingSpaces));
65 }
66
// Strips leading and trailing HTML spaces from |string|, dispatching to the
// 8-bit or 16-bit implementation depending on how the string is stored.
// A zero-length input is returned as-is when null, otherwise as the shared
// empty string.
String stripLeadingAndTrailingHTMLSpaces(const String& string)
{
    const unsigned length = string.length();
    if (!length)
        return string.isNull() ? string : emptyAtom.string();

    return string.is8Bit()
        ? stripLeadingAndTrailingHTMLSpaces<LChar>(string, string.characters8(), length)
        : stripLeadingAndTrailingHTMLSpaces<UChar>(string, string.characters16(), length);
}
79
serializeForNumberType(const Decimal & number)80 String serializeForNumberType(const Decimal& number)
81 {
82 if (number.isZero()) {
83 // Decimal::toString appends exponent, e.g. "0e-18"
84 return number.isNegative() ? "-0" : "0";
85 }
86 return number.toString();
87 }
88
serializeForNumberType(double number)89 String serializeForNumberType(double number)
90 {
91 // According to HTML5, "the best representation of the number n as a floating
92 // point number" is a string produced by applying ToString() to n.
93 return String::numberToStringECMAScript(number);
94 }
95
// Parses |string| as a floating-point number into a Decimal.
// Returns |fallbackValue| when the text does not start like a number, does
// not parse to a finite Decimal, or lies outside the range of a double.
// A zero result is always returned as Decimal(0).
Decimal parseToDecimalForNumberType(const String& string, const Decimal& fallbackValue)
{
    // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers and parseToDoubleForNumberType
    // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
    // NOTE(review): this relies on string[0] being safe for an empty string — confirm.
    const UChar leadingCharacter = string[0];
    const bool startsLikeNumber = leadingCharacter == '-' || leadingCharacter == '.' || isASCIIDigit(leadingCharacter);
    if (!startsLikeNumber)
        return fallbackValue;

    const Decimal parsed = Decimal::fromString(string);
    if (!parsed.isFinite())
        return fallbackValue;

    // Numbers are considered finite IEEE 754 Double-precision floating point values.
    const Decimal limit = Decimal::fromDouble(std::numeric_limits<double>::max());
    const bool outOfDoubleRange = parsed < -limit || parsed > limit;
    if (outOfDoubleRange)
        return fallbackValue;

    // We return +0 for -0 case.
    if (parsed.isZero())
        return Decimal(0);
    return parsed;
}
116
parseToDoubleForNumberType(const String & string,double fallbackValue)117 double parseToDoubleForNumberType(const String& string, double fallbackValue)
118 {
119 // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers
120 // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
121 UChar firstCharacter = string[0];
122 if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter))
123 return fallbackValue;
124
125 bool valid = false;
126 double value = string.toDouble(&valid);
127 if (!valid)
128 return fallbackValue;
129
130 // NaN and infinity are considered valid by String::toDouble, but not valid here.
131 if (!std::isfinite(value))
132 return fallbackValue;
133
134 // Numbers are considered finite IEEE 754 Double-precision floating point values.
135 if (-std::numeric_limits<double>::max() > value || value > std::numeric_limits<double>::max())
136 return fallbackValue;
137
138 // The following expression converts -0 to +0.
139 return value ? value : 0;
140 }
141
142 template <typename CharacterType>
parseHTMLIntegerInternal(const CharacterType * position,const CharacterType * end,int & value)143 static bool parseHTMLIntegerInternal(const CharacterType* position, const CharacterType* end, int& value)
144 {
145 // Step 3
146 int sign = 1;
147
148 // Step 4
149 while (position < end) {
150 if (!isHTMLSpace<CharacterType>(*position))
151 break;
152 ++position;
153 }
154
155 // Step 5
156 if (position == end)
157 return false;
158 ASSERT(position < end);
159
160 // Step 6
161 if (*position == '-') {
162 sign = -1;
163 ++position;
164 } else if (*position == '+')
165 ++position;
166 if (position == end)
167 return false;
168 ASSERT(position < end);
169
170 // Step 7
171 if (!isASCIIDigit(*position))
172 return false;
173
174 // Step 8
175 StringBuilder digits;
176 while (position < end) {
177 if (!isASCIIDigit(*position))
178 break;
179 digits.append(*position++);
180 }
181
182 // Step 9
183 bool ok;
184 if (digits.is8Bit())
185 value = sign * charactersToIntStrict(digits.characters8(), digits.length(), &ok);
186 else
187 value = sign * charactersToIntStrict(digits.characters16(), digits.length(), &ok);
188 return ok;
189 }
190
191 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers
parseHTMLInteger(const String & input,int & value)192 bool parseHTMLInteger(const String& input, int& value)
193 {
194 // Step 1
195 // Step 2
196 unsigned length = input.length();
197 if (!length || input.is8Bit()) {
198 const LChar* start = input.characters8();
199 return parseHTMLIntegerInternal(start, start + length, value);
200 }
201
202 const UChar* start = input.characters16();
203 return parseHTMLIntegerInternal(start, start + length, value);
204 }
205
206 template <typename CharacterType>
parseHTMLNonNegativeIntegerInternal(const CharacterType * position,const CharacterType * end,unsigned & value)207 static bool parseHTMLNonNegativeIntegerInternal(const CharacterType* position, const CharacterType* end, unsigned& value)
208 {
209 // Step 3
210 while (position < end) {
211 if (!isHTMLSpace<CharacterType>(*position))
212 break;
213 ++position;
214 }
215
216 // Step 4
217 if (position == end)
218 return false;
219 ASSERT(position < end);
220
221 // Step 5
222 if (*position == '+')
223 ++position;
224
225 // Step 6
226 if (position == end)
227 return false;
228 ASSERT(position < end);
229
230 // Step 7
231 if (!isASCIIDigit(*position))
232 return false;
233
234 // Step 8
235 StringBuilder digits;
236 while (position < end) {
237 if (!isASCIIDigit(*position))
238 break;
239 digits.append(*position++);
240 }
241
242 // Step 9
243 bool ok;
244 if (digits.is8Bit())
245 value = charactersToUIntStrict(digits.characters8(), digits.length(), &ok);
246 else
247 value = charactersToUIntStrict(digits.characters16(), digits.length(), &ok);
248 return ok;
249 }
250
251
252 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-non-negative-integers
parseHTMLNonNegativeInteger(const String & input,unsigned & value)253 bool parseHTMLNonNegativeInteger(const String& input, unsigned& value)
254 {
255 // Step 1
256 // Step 2
257 unsigned length = input.length();
258 if (length && input.is8Bit()) {
259 const LChar* start = input.characters8();
260 return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
261 }
262
263 const UChar* start = input.characters16();
264 return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
265 }
266
267 static const char charsetString[] = "charset";
268 static const size_t charsetLength = sizeof("charset") - 1;
269
extractCharset(const String & value)270 String extractCharset(const String& value)
271 {
272 size_t pos = 0;
273 unsigned length = value.length();
274
275 while (pos < length) {
276 pos = value.find(charsetString, pos, false);
277 if (pos == kNotFound)
278 break;
279
280 pos += charsetLength;
281
282 // Skip whitespace.
283 while (pos < length && value[pos] <= ' ')
284 ++pos;
285
286 if (value[pos] != '=')
287 continue;
288
289 ++pos;
290
291 while (pos < length && value[pos] <= ' ')
292 ++pos;
293
294 char quoteMark = 0;
295 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) {
296 quoteMark = static_cast<char>(value[pos++]);
297 ASSERT(!(quoteMark & 0x80));
298 }
299
300 if (pos == length)
301 break;
302
303 unsigned end = pos;
304 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';')))
305 ++end;
306
307 if (quoteMark && (end == length))
308 break; // Close quote not found.
309
310 return value.substring(pos, end - pos);
311 }
312
313 return "";
314 }
315
// Records where encodingFromMetaAttributes() obtained its charset from.
enum Mode {
    // No charset has been found.
    None,
    // The charset came directly from a charset attribute.
    Charset,
    // The charset was extracted from a content attribute; it is only used
    // when an http-equiv="content-type" attribute is also present.
    Pragma,
};
321
encodingFromMetaAttributes(const HTMLAttributeList & attributes)322 WTF::TextEncoding encodingFromMetaAttributes(const HTMLAttributeList& attributes)
323 {
324 bool gotPragma = false;
325 Mode mode = None;
326 String charset;
327
328 for (HTMLAttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) {
329 const String& attributeName = iter->first;
330 const String& attributeValue = AtomicString(iter->second);
331
332 if (threadSafeMatch(attributeName, http_equivAttr)) {
333 if (equalIgnoringCase(attributeValue, "content-type"))
334 gotPragma = true;
335 } else if (charset.isEmpty()) {
336 if (threadSafeMatch(attributeName, charsetAttr)) {
337 charset = attributeValue;
338 mode = Charset;
339 } else if (threadSafeMatch(attributeName, contentAttr)) {
340 charset = extractCharset(attributeValue);
341 if (charset.length())
342 mode = Pragma;
343 }
344 }
345 }
346
347 if (mode == Charset || (mode == Pragma && gotPragma))
348 return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset));
349
350 return WTF::TextEncoding();
351 }
352
threadSafeEqual(const StringImpl * a,const StringImpl * b)353 static bool threadSafeEqual(const StringImpl* a, const StringImpl* b)
354 {
355 if (a == b)
356 return true;
357 if (a->hash() != b->hash())
358 return false;
359 return equalNonNull(a, b);
360 }
361
threadSafeMatch(const QualifiedName & a,const QualifiedName & b)362 bool threadSafeMatch(const QualifiedName& a, const QualifiedName& b)
363 {
364 return threadSafeEqual(a.localName().impl(), b.localName().impl());
365 }
366
threadSafeMatch(const String & localName,const QualifiedName & qName)367 bool threadSafeMatch(const String& localName, const QualifiedName& qName)
368 {
369 return threadSafeEqual(localName.impl(), qName.localName().impl());
370 }
371
372 template<typename CharType>
findStringIfStatic(const CharType * characters,unsigned length)373 inline StringImpl* findStringIfStatic(const CharType* characters, unsigned length)
374 {
375 // We don't need to try hashing if we know the string is too long.
376 if (length > StringImpl::highestStaticStringLength())
377 return 0;
378 // computeHashAndMaskTop8Bits is the function StringImpl::hash() uses.
379 unsigned hash = StringHasher::computeHashAndMaskTop8Bits(characters, length);
380 const WTF::StaticStringsTable& table = StringImpl::allStaticStrings();
381 ASSERT(!table.isEmpty());
382
383 WTF::StaticStringsTable::const_iterator it = table.find(hash);
384 if (it == table.end())
385 return 0;
386 // It's possible to have hash collisions between arbitrary strings and
387 // known identifiers (e.g. "bvvfg" collides with "script").
388 // However ASSERTs in StringImpl::createStatic guard against there ever being collisions
389 // between static strings.
390 if (!equal(it->value, characters, length))
391 return 0;
392 return it->value;
393 }
394
attemptStaticStringCreation(const LChar * characters,size_t size)395 String attemptStaticStringCreation(const LChar* characters, size_t size)
396 {
397 String string(findStringIfStatic(characters, size));
398 if (string.impl())
399 return string;
400 return String(characters, size);
401 }
402
attemptStaticStringCreation(const UChar * characters,size_t size,CharacterWidth width)403 String attemptStaticStringCreation(const UChar* characters, size_t size, CharacterWidth width)
404 {
405 String string(findStringIfStatic(characters, size));
406 if (string.impl())
407 return string;
408 if (width == Likely8Bit)
409 string = StringImpl::create8BitIfPossible(characters, size);
410 else if (width == Force8Bit)
411 string = String::make8BitFrom16BitSource(characters, size);
412 else
413 string = String(characters, size);
414
415 return string;
416 }
417
418 }
419