• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1.  Redistributions of source code must retain the above copyright
8  *     notice, this list of conditions and the following disclaimer.
9  * 2.  Redistributions in binary form must reproduce the above copyright
10  *     notice, this list of conditions and the following disclaimer in the
11  *     documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16  * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
17  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
20  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  */
24 
25 #include "config.h"
26 #include "core/html/parser/HTMLParserIdioms.h"
27 
28 #include "core/HTMLNames.h"
29 #include <limits>
30 #include "wtf/MathExtras.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/StringBuilder.h"
33 #include "wtf/text/StringHash.h"
34 #include "wtf/text/TextEncoding.h"
35 
36 namespace blink {
37 
38 using namespace HTMLNames;
39 
40 template <typename CharType>
stripLeadingAndTrailingHTMLSpaces(String string,const CharType * characters,unsigned length)41 static String stripLeadingAndTrailingHTMLSpaces(String string, const CharType* characters, unsigned length)
42 {
43     unsigned numLeadingSpaces = 0;
44     unsigned numTrailingSpaces = 0;
45 
46     for (; numLeadingSpaces < length; ++numLeadingSpaces) {
47         if (isNotHTMLSpace<CharType>(characters[numLeadingSpaces]))
48             break;
49     }
50 
51     if (numLeadingSpaces == length)
52         return string.isNull() ? string : emptyAtom.string();
53 
54     for (; numTrailingSpaces < length; ++numTrailingSpaces) {
55         if (isNotHTMLSpace<CharType>(characters[length - numTrailingSpaces - 1]))
56             break;
57     }
58 
59     ASSERT(numLeadingSpaces + numTrailingSpaces < length);
60 
61     if (!(numLeadingSpaces | numTrailingSpaces))
62         return string;
63 
64     return string.substring(numLeadingSpaces, length - (numLeadingSpaces + numTrailingSpaces));
65 }
66 
// Strips leading and trailing HTML whitespace from |string|, dispatching on
// the string's character width. A null string stays null; an empty string
// yields the empty string.
String stripLeadingAndTrailingHTMLSpaces(const String& string)
{
    const unsigned length = string.length();

    if (length) {
        if (string.is8Bit())
            return stripLeadingAndTrailingHTMLSpaces<LChar>(string, string.characters8(), length);
        return stripLeadingAndTrailingHTMLSpaces<UChar>(string, string.characters16(), length);
    }

    if (string.isNull())
        return string;
    return emptyAtom.string();
}
79 
serializeForNumberType(const Decimal & number)80 String serializeForNumberType(const Decimal& number)
81 {
82     if (number.isZero()) {
83         // Decimal::toString appends exponent, e.g. "0e-18"
84         return number.isNegative() ? "-0" : "0";
85     }
86     return number.toString();
87 }
88 
serializeForNumberType(double number)89 String serializeForNumberType(double number)
90 {
91     // According to HTML5, "the best representation of the number n as a floating
92     // point number" is a string produced by applying ToString() to n.
93     return String::numberToStringECMAScript(number);
94 }
95 
// Parses |string| as an HTML floating-point number into a Decimal.
// Returns |fallbackValue| when the string does not start with '-', '.' or an
// ASCII digit, when the parsed value is not finite, or when it lies outside
// the range of a finite double. A zero result is always returned as +0.
// See http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers
// and parseToDoubleForNumberType.
Decimal parseToDecimalForNumberType(const String& string, const Decimal& fallbackValue)
{
    // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
    const UChar leading = string[0];
    const bool startsLikeNumber = leading == '-' || leading == '.' || isASCIIDigit(leading);
    if (!startsLikeNumber)
        return fallbackValue;

    const Decimal parsed = Decimal::fromString(string);
    if (!parsed.isFinite())
        return fallbackValue;

    // Numbers are considered finite IEEE 754 Double-precision floating point values.
    const Decimal limit = Decimal::fromDouble(std::numeric_limits<double>::max());
    if (parsed > limit || parsed < -limit)
        return fallbackValue;

    // We return +0 for -0 case.
    if (parsed.isZero())
        return Decimal(0);
    return parsed;
}
116 
parseToDoubleForNumberType(const String & string,double fallbackValue)117 double parseToDoubleForNumberType(const String& string, double fallbackValue)
118 {
119     // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers
120     // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
121     UChar firstCharacter = string[0];
122     if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter))
123         return fallbackValue;
124 
125     bool valid = false;
126     double value = string.toDouble(&valid);
127     if (!valid)
128         return fallbackValue;
129 
130     // NaN and infinity are considered valid by String::toDouble, but not valid here.
131     if (!std::isfinite(value))
132         return fallbackValue;
133 
134     // Numbers are considered finite IEEE 754 Double-precision floating point values.
135     if (-std::numeric_limits<double>::max() > value || value > std::numeric_limits<double>::max())
136         return fallbackValue;
137 
138     // The following expression converts -0 to +0.
139     return value ? value : 0;
140 }
141 
142 template <typename CharacterType>
parseHTMLIntegerInternal(const CharacterType * position,const CharacterType * end,int & value)143 static bool parseHTMLIntegerInternal(const CharacterType* position, const CharacterType* end, int& value)
144 {
145     // Step 3
146     int sign = 1;
147 
148     // Step 4
149     while (position < end) {
150         if (!isHTMLSpace<CharacterType>(*position))
151             break;
152         ++position;
153     }
154 
155     // Step 5
156     if (position == end)
157         return false;
158     ASSERT(position < end);
159 
160     // Step 6
161     if (*position == '-') {
162         sign = -1;
163         ++position;
164     } else if (*position == '+')
165         ++position;
166     if (position == end)
167         return false;
168     ASSERT(position < end);
169 
170     // Step 7
171     if (!isASCIIDigit(*position))
172         return false;
173 
174     // Step 8
175     StringBuilder digits;
176     while (position < end) {
177         if (!isASCIIDigit(*position))
178             break;
179         digits.append(*position++);
180     }
181 
182     // Step 9
183     bool ok;
184     if (digits.is8Bit())
185         value = sign * charactersToIntStrict(digits.characters8(), digits.length(), &ok);
186     else
187         value = sign * charactersToIntStrict(digits.characters16(), digits.length(), &ok);
188     return ok;
189 }
190 
191 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers
parseHTMLInteger(const String & input,int & value)192 bool parseHTMLInteger(const String& input, int& value)
193 {
194     // Step 1
195     // Step 2
196     unsigned length = input.length();
197     if (!length || input.is8Bit()) {
198         const LChar* start = input.characters8();
199         return parseHTMLIntegerInternal(start, start + length, value);
200     }
201 
202     const UChar* start = input.characters16();
203     return parseHTMLIntegerInternal(start, start + length, value);
204 }
205 
206 template <typename CharacterType>
parseHTMLNonNegativeIntegerInternal(const CharacterType * position,const CharacterType * end,unsigned & value)207 static bool parseHTMLNonNegativeIntegerInternal(const CharacterType* position, const CharacterType* end, unsigned& value)
208 {
209     // Step 3
210     while (position < end) {
211         if (!isHTMLSpace<CharacterType>(*position))
212             break;
213         ++position;
214     }
215 
216     // Step 4
217     if (position == end)
218         return false;
219     ASSERT(position < end);
220 
221     // Step 5
222     if (*position == '+')
223         ++position;
224 
225     // Step 6
226     if (position == end)
227         return false;
228     ASSERT(position < end);
229 
230     // Step 7
231     if (!isASCIIDigit(*position))
232         return false;
233 
234     // Step 8
235     StringBuilder digits;
236     while (position < end) {
237         if (!isASCIIDigit(*position))
238             break;
239         digits.append(*position++);
240     }
241 
242     // Step 9
243     bool ok;
244     if (digits.is8Bit())
245         value = charactersToUIntStrict(digits.characters8(), digits.length(), &ok);
246     else
247         value = charactersToUIntStrict(digits.characters16(), digits.length(), &ok);
248     return ok;
249 }
250 
251 
252 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-non-negative-integers
parseHTMLNonNegativeInteger(const String & input,unsigned & value)253 bool parseHTMLNonNegativeInteger(const String& input, unsigned& value)
254 {
255     // Step 1
256     // Step 2
257     unsigned length = input.length();
258     if (length && input.is8Bit()) {
259         const LChar* start = input.characters8();
260         return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
261     }
262 
263     const UChar* start = input.characters16();
264     return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
265 }
266 
// Keyword searched for by extractCharset(), and its length excluding the
// terminating NUL.
static const char charsetString[] = "charset";
static const size_t charsetLength = sizeof(charsetString) - 1;
269 
// Extracts the charset name from a string such as "text/html; charset=utf-8".
// Searches for "charset" (the search is requested as case-insensitive),
// followed by optional whitespace, '=', optional whitespace and then a value
// that is either quoted with ' or " or runs up to the next whitespace, quote
// or ';'. Returns the value without quotes, or "" when no charset is found or
// a quoted value is missing its closing quote.
String extractCharset(const String& value)
{
    const unsigned valueLength = value.length();
    size_t cursor = 0;

    while (cursor < valueLength) {
        cursor = value.find(charsetString, cursor, false);
        if (cursor == kNotFound)
            break;

        // Step over the keyword itself.
        cursor += charsetLength;

        // Skip whitespace.
        while (cursor < valueLength && value[cursor] <= ' ')
            ++cursor;

        // Not followed by '=': look for the next occurrence of the keyword.
        if (value[cursor] != '=')
            continue;

        ++cursor;

        // Skip whitespace after '='.
        while (cursor < valueLength && value[cursor] <= ' ')
            ++cursor;

        // Remember an opening quote, if any, and step past it.
        char quote = 0;
        if (cursor < valueLength && (value[cursor] == '"' || value[cursor] == '\'')) {
            quote = static_cast<char>(value[cursor++]);
            ASSERT(!(quote & 0x80));
        }

        if (cursor == valueLength)
            break;

        unsigned valueEnd = cursor;
        if (quote) {
            // Quoted value: runs up to the matching quote.
            while (valueEnd < valueLength && value[valueEnd] != quote)
                ++valueEnd;
            if (valueEnd == valueLength)
                break; // Close quote not found.
        } else {
            // Unquoted value: runs up to whitespace, a quote or ';'.
            while (valueEnd < valueLength) {
                const UChar current = value[valueEnd];
                if (current <= ' ' || current == '"' || current == '\'' || current == ';')
                    break;
                ++valueEnd;
            }
        }

        return value.substring(cursor, valueEnd - cursor);
    }

    return "";
}
315 
// Records which kind of <meta> attribute supplied the charset in
// encodingFromMetaAttributes().
enum Mode {
    None, // No charset has been found.
    Charset, // The charset came from a "charset" attribute.
    Pragma, // The charset was extracted from a "content" attribute.
};
321 
encodingFromMetaAttributes(const HTMLAttributeList & attributes)322 WTF::TextEncoding encodingFromMetaAttributes(const HTMLAttributeList& attributes)
323 {
324     bool gotPragma = false;
325     Mode mode = None;
326     String charset;
327 
328     for (HTMLAttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) {
329         const String& attributeName = iter->first;
330         const String& attributeValue = AtomicString(iter->second);
331 
332         if (threadSafeMatch(attributeName, http_equivAttr)) {
333             if (equalIgnoringCase(attributeValue, "content-type"))
334                 gotPragma = true;
335         } else if (charset.isEmpty()) {
336             if (threadSafeMatch(attributeName, charsetAttr)) {
337                 charset = attributeValue;
338                 mode = Charset;
339             } else if (threadSafeMatch(attributeName, contentAttr)) {
340                 charset = extractCharset(attributeValue);
341                 if (charset.length())
342                     mode = Pragma;
343             }
344         }
345     }
346 
347     if (mode == Charset || (mode == Pragma && gotPragma))
348         return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset));
349 
350     return WTF::TextEncoding();
351 }
352 
threadSafeEqual(const StringImpl * a,const StringImpl * b)353 static bool threadSafeEqual(const StringImpl* a, const StringImpl* b)
354 {
355     if (a == b)
356         return true;
357     if (a->hash() != b->hash())
358         return false;
359     return equalNonNull(a, b);
360 }
361 
threadSafeMatch(const QualifiedName & a,const QualifiedName & b)362 bool threadSafeMatch(const QualifiedName& a, const QualifiedName& b)
363 {
364     return threadSafeEqual(a.localName().impl(), b.localName().impl());
365 }
366 
threadSafeMatch(const String & localName,const QualifiedName & qName)367 bool threadSafeMatch(const String& localName, const QualifiedName& qName)
368 {
369     return threadSafeEqual(localName.impl(), qName.localName().impl());
370 }
371 
372 template<typename CharType>
findStringIfStatic(const CharType * characters,unsigned length)373 inline StringImpl* findStringIfStatic(const CharType* characters, unsigned length)
374 {
375     // We don't need to try hashing if we know the string is too long.
376     if (length > StringImpl::highestStaticStringLength())
377         return 0;
378     // computeHashAndMaskTop8Bits is the function StringImpl::hash() uses.
379     unsigned hash = StringHasher::computeHashAndMaskTop8Bits(characters, length);
380     const WTF::StaticStringsTable& table = StringImpl::allStaticStrings();
381     ASSERT(!table.isEmpty());
382 
383     WTF::StaticStringsTable::const_iterator it = table.find(hash);
384     if (it == table.end())
385         return 0;
386     // It's possible to have hash collisions between arbitrary strings and
387     // known identifiers (e.g. "bvvfg" collides with "script").
388     // However ASSERTs in StringImpl::createStatic guard against there ever being collisions
389     // between static strings.
390     if (!equal(it->value, characters, length))
391         return 0;
392     return it->value;
393 }
394 
attemptStaticStringCreation(const LChar * characters,size_t size)395 String attemptStaticStringCreation(const LChar* characters, size_t size)
396 {
397     String string(findStringIfStatic(characters, size));
398     if (string.impl())
399         return string;
400     return String(characters, size);
401 }
402 
attemptStaticStringCreation(const UChar * characters,size_t size,CharacterWidth width)403 String attemptStaticStringCreation(const UChar* characters, size_t size, CharacterWidth width)
404 {
405     String string(findStringIfStatic(characters, size));
406     if (string.impl())
407         return string;
408     if (width == Likely8Bit)
409         string = StringImpl::create8BitIfPossible(characters, size);
410     else if (width == Force8Bit)
411         string = String::make8BitFrom16BitSource(characters, size);
412     else
413         string = String(characters, size);
414 
415     return string;
416 }
417 
418 }
419