1 /*
2 * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met:
8 *
9 * * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following disclaimer
13 * in the documentation and/or other materials provided with the
14 * distribution.
15 * * Neither the name of Google Inc. nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include "config.h"
33
34 #if USE(GOOGLEURL)
35 #include "KURL.h"
36
37 #ifndef NDEBUG
38 #include <stdio.h>
39 #endif
40
41 #include <algorithm>
42
43 #include "NotImplemented.h"
44 #include "TextEncoding.h"
45 #include <wtf/HashMap.h>
46 #include <wtf/Vector.h>
47 #include <wtf/StdLibExtras.h>
48 #include <wtf/text/CString.h>
49 #include <wtf/text/StringHash.h>
50
51 #include <googleurl/src/url_util.h>
52
53 using WTF::isASCIILower;
54 using WTF::toASCIILower;
55 using std::binary_search;
56
57 namespace WebCore {
58
// Port limits used by KURL::port(). 0xFFFF is the marker value port() returns
// for a port that could not be parsed or is out of range, so the largest port
// treated as valid is one below it.
static const int invalidPortNumber = 0xFFFF;
static const int maximumValidPortNumber = invalidPortNumber - 1;
61
62 // Wraps WebCore's text encoding in a character set converter for the
63 // canonicalizer.
64 class KURLCharsetConverter : public url_canon::CharsetConverter {
65 public:
66 // The encoding parameter may be 0, but in this case the object must not be called.
KURLCharsetConverter(const TextEncoding * encoding)67 KURLCharsetConverter(const TextEncoding* encoding)
68 : m_encoding(encoding)
69 {
70 }
71
ConvertFromUTF16(const url_parse::UTF16Char * input,int inputLength,url_canon::CanonOutput * output)72 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength,
73 url_canon::CanonOutput* output)
74 {
75 CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables);
76 output->Append(encoded.data(), static_cast<int>(encoded.length()));
77 }
78
79 private:
80 const TextEncoding* m_encoding;
81 };
82
83 // Note that this function must be named differently than the one in KURL.cpp
84 // since our unit tests evilly include both files, and their local definition
85 // will be ambiguous.
assertProtocolIsGood(const char * protocol)86 static inline void assertProtocolIsGood(const char* protocol)
87 {
88 #ifndef NDEBUG
89 const char* p = protocol;
90 while (*p) {
91 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
92 ++p;
93 }
94 #endif
95 }
96
97 // Returns the characters for the given string, or a pointer to a static empty
98 // string if the input string is null. This will always ensure we have a non-
99 // null character pointer since ReplaceComponents has special meaning for null.
CharactersOrEmpty(const String & str)100 static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str)
101 {
102 static const url_parse::UTF16Char zero = 0;
103 return str.characters() ?
104 reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) :
105 &zero;
106 }
107
isUnicodeEncoding(const TextEncoding * encoding)108 static inline bool isUnicodeEncoding(const TextEncoding* encoding)
109 {
110 return encoding->encodingForFormSubmission() == UTF8Encoding();
111 }
112
lowerCaseEqualsASCII(const char * begin,const char * end,const char * str)113 static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str)
114 {
115 while (begin != end && *str) {
116 ASSERT(toASCIILower(*str) == *str);
117 if (toASCIILower(*begin++) != *str++)
118 return false;
119 }
120
121 // Both strings are equal (ignoring case) if and only if all of the characters were equal,
122 // and the end of both has been reached.
123 return begin == end && !*str;
124 }
125
// Returns true if |c| may start a URL scheme, i.e. it is an ASCII letter
// (RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )).
static inline bool isSchemeFirstChar(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// Returns true if |c| may appear after the first character of a URL scheme:
// an ASCII letter, a digit, '+', '-' or '.'.
//
// This used to accept '*' in place of '+', which both rejected legitimate
// schemes such as "svn+ssh" and accepted a character the RFC does not allow.
static inline bool isSchemeChar(char c)
{
    return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '+';
}
135
isValidProtocol(const String & protocol)136 bool isValidProtocol(const String& protocol)
137 {
138 // NOTE This is a copy of the function in KURL.cpp.
139 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
140 if (protocol.isEmpty())
141 return false;
142 if (!isSchemeFirstChar(protocol[0]))
143 return false;
144 unsigned protocolLength = protocol.length();
145 for (unsigned i = 1; i < protocolLength; i++) {
146 if (!isSchemeChar(protocol[i]))
147 return false;
148 }
149 return true;
150 }
151
152
153 // KURLGooglePrivate -----------------------------------------------------------
154
// Creates the state for a URL that has not been initialized yet: it is marked
// invalid and outside the HTTP family, and the cached 16-bit string is
// flagged as stale so that string() builds it on first use.
KURLGooglePrivate::KURLGooglePrivate()
    : m_isValid(false)
    , m_protocolInHTTPFamily(false)
    , m_utf8IsASCII(true)
    , m_stringIsValid(false)
{
}
162
// Creates the state from an already computed parse result and validity flag.
// The URL text itself is not set here; m_protocolInHTTPFamily starts out
// false and is recomputed by setUtf8()/setAscii() when the text is supplied.
KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid)
    : m_isValid(isValid)
    , m_protocolInHTTPFamily(false)
    , m_parsed(parsed)
    , m_utf8IsASCII(true)
    , m_stringIsValid(false)
{
}
171
KURLGooglePrivate(WTF::HashTableDeletedValueType)172 KURLGooglePrivate::KURLGooglePrivate(WTF::HashTableDeletedValueType)
173 : m_string(WTF::HashTableDeletedValue)
174 {
175 }
176
177 // Setters for the data. Using the ASCII version when you know the
178 // data is ASCII will be slightly more efficient. The UTF-8 version
179 // will always be correct if the caller is unsure.
setUtf8(const CString & str)180 void KURLGooglePrivate::setUtf8(const CString& str)
181 {
182 const char* data = str.data();
183 unsigned dataLength = str.length();
184
185 // The m_utf8IsASCII must always be correct since the DeprecatedString
186 // getter must create it with the proper constructor. This test can be
187 // removed when DeprecatedString is gone, but it still might be a
188 // performance win.
189 m_utf8IsASCII = true;
190 for (unsigned i = 0; i < dataLength; i++) {
191 if (static_cast<unsigned char>(data[i]) >= 0x80) {
192 m_utf8IsASCII = false;
193 break;
194 }
195 }
196
197 m_utf8 = str;
198 m_stringIsValid = false;
199 initProtocolInHTTPFamily();
200 }
201
setAscii(const CString & str)202 void KURLGooglePrivate::setAscii(const CString& str)
203 {
204 m_utf8 = str;
205 m_utf8IsASCII = true;
206 m_stringIsValid = false;
207 initProtocolInHTTPFamily();
208 }
209
init(const KURL & base,const String & relative,const TextEncoding * queryEncoding)210 void KURLGooglePrivate::init(const KURL& base,
211 const String& relative,
212 const TextEncoding* queryEncoding)
213 {
214 init(base, relative.characters(), relative.length(), queryEncoding);
215 }
216
217 template <typename CHAR>
init(const KURL & base,const CHAR * rel,int relLength,const TextEncoding * queryEncoding)218 void KURLGooglePrivate::init(const KURL& base, const CHAR* rel, int relLength,
219 const TextEncoding* queryEncoding)
220 {
221 // As a performance optimization, we do not use the charset converter
222 // if encoding is UTF-8 or other Unicode encodings. Note that this is
223 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more
224 // efficient with no charset converter object because it can do UTF-8
225 // internally with no extra copies.
226
227 // We feel free to make the charset converter object every time since it's
228 // just a wrapper around a reference.
229 KURLCharsetConverter charsetConverterObject(queryEncoding);
230 KURLCharsetConverter* charsetConverter =
231 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
232 &charsetConverterObject;
233
234 url_canon::RawCanonOutputT<char> output;
235 const CString& baseStr = base.m_url.utf8String();
236 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
237 base.m_url.m_parsed, rel, relLength,
238 charsetConverter,
239 &output, &m_parsed);
240
241 // See FIXME in KURLGooglePrivate in the header. If canonicalization has not
242 // changed the string, we can avoid an extra allocation by using assignment.
243 //
244 // When KURL encounters an error such that the URL is invalid and empty
245 // (for example, resolving a relative URL on a non-hierarchical base), it
246 // will produce an isNull URL, and calling setUtf8 will produce an empty
247 // non-null URL. This is unlikely to affect anything, but we preserve this
248 // just in case.
249 if (m_isValid || output.length()) {
250 // Without ref, the whole url is guaranteed to be ASCII-only.
251 if (m_parsed.ref.is_nonempty())
252 setUtf8(CString(output.data(), output.length()));
253 else
254 setAscii(CString(output.data(), output.length()));
255 } else {
256 // WebCore expects resolved URLs to be empty rather than null.
257 setUtf8(CString("", 0));
258 }
259 }
260
initProtocolInHTTPFamily()261 void KURLGooglePrivate::initProtocolInHTTPFamily()
262 {
263 if (!m_isValid) {
264 m_protocolInHTTPFamily = false;
265 return;
266 }
267
268 const char* scheme = m_utf8.data() + m_parsed.scheme.begin;
269 if (m_parsed.scheme.len == 4)
270 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http");
271 else if (m_parsed.scheme.len == 5)
272 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https");
273 else
274 m_protocolInHTTPFamily = false;
275 }
276
copyTo(KURLGooglePrivate * dest) const277 void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const
278 {
279 dest->m_isValid = m_isValid;
280 dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily;
281 dest->m_parsed = m_parsed;
282
283 // Don't copy the 16-bit string since that will be regenerated as needed.
284 dest->m_utf8 = CString(m_utf8.data(), m_utf8.length());
285 dest->m_utf8IsASCII = m_utf8IsASCII;
286 dest->m_stringIsValid = false;
287 }
288
componentString(const url_parse::Component & comp) const289 String KURLGooglePrivate::componentString(const url_parse::Component& comp) const
290 {
291 if (!m_isValid || comp.len <= 0) {
292 // KURL returns a null string if the URL is itself a null string, and an
293 // empty string for other nonexistent entities.
294 if (utf8String().isNull())
295 return String();
296 return String("", 0);
297 }
298 // begin and len are in terms of bytes which do not match
299 // if string() is UTF-16 and input contains non-ASCII characters.
300 // However, the only part in urlString that can contain non-ASCII
301 // characters is 'ref' at the end of the string. In that case,
302 // begin will always match the actual value and len (in terms of
303 // byte) will be longer than what's needed by 'mid'. However, mid
304 // truncates len to avoid go past the end of a string so that we can
305 // get away withtout doing anything here.
306 return string().substring(comp.begin, comp.len);
307 }
308
replaceComponents(const Replacements & replacements)309 void KURLGooglePrivate::replaceComponents(const Replacements& replacements)
310 {
311 url_canon::RawCanonOutputT<char> output;
312 url_parse::Parsed newParsed;
313
314 m_isValid = url_util::ReplaceComponents(utf8String().data(),
315 utf8String().length(), m_parsed, replacements, 0, &output, &newParsed);
316
317 m_parsed = newParsed;
318 if (m_parsed.ref.is_nonempty())
319 setUtf8(CString(output.data(), output.length()));
320 else
321 setAscii(CString(output.data(), output.length()));
322 }
323
string() const324 const String& KURLGooglePrivate::string() const
325 {
326 if (!m_stringIsValid) {
327 // Handle the null case separately. Otherwise, constructing
328 // the string like we do below would generate the empty string,
329 // not the null string.
330 if (m_utf8.isNull())
331 m_string = String();
332 else if (m_utf8IsASCII)
333 m_string = String(m_utf8.data(), m_utf8.length());
334 else
335 m_string = String::fromUTF8(m_utf8.data(), m_utf8.length());
336 m_stringIsValid = true;
337 }
338 return m_string;
339 }
340
341 // KURL ------------------------------------------------------------------------
342
343 // Creates with null-terminated string input representing an absolute URL.
344 // WebCore generally calls this only with hardcoded strings, so the input is
345 // ASCII. We treat it as UTF-8 just in case.
KURL(ParsedURLStringTag,const char * url)346 KURL::KURL(ParsedURLStringTag, const char *url)
347 {
348 // FIXME The Mac code checks for beginning with a slash and converts it to
349 // file: URL. We will want to add this as well once we can compile on a
350 // system like that.
351 m_url.init(KURL(), url, strlen(url), 0);
352
353 // The one-argument constructors should never generate a null string.
354 // This is a funny quirk of KURL.cpp (probably a bug) which we preserve.
355 if (m_url.utf8String().isNull())
356 m_url.setAscii(CString("", 0));
357 }
358
359 // Initializes with a string representing an absolute URL. No encoding
360 // information is specified. This generally happens when a KURL is converted
361 // to a string and then converted back. In this case, the URL is already
362 // canonical and in proper escaped form so needs no encoding. We treat it as
363 // UTF-8 just in case.
KURL(ParsedURLStringTag,const String & url)364 KURL::KURL(ParsedURLStringTag, const String& url)
365 {
366 if (!url.isNull())
367 m_url.init(KURL(), url, 0);
368 else {
369 // WebCore expects us to preserve the nullness of strings when this
370 // constructor is used. In all other cases, it expects a non-null
371 // empty string, which is what init() will create.
372 m_url.m_isValid = false;
373 m_url.m_protocolInHTTPFamily = false;
374 }
375 }
376
377 // Constructs a new URL given a base URL and a possibly relative input URL.
378 // This assumes UTF-8 encoding.
KURL(const KURL & base,const String & relative)379 KURL::KURL(const KURL& base, const String& relative)
380 {
381 m_url.init(base, relative, 0);
382 }
383
384 // Constructs a new URL given a base URL and a possibly relative input URL.
385 // Any query portion of the relative URL will be encoded in the given encoding.
KURL(const KURL & base,const String & relative,const TextEncoding & encoding)386 KURL::KURL(const KURL& base,
387 const String& relative,
388 const TextEncoding& encoding)
389 {
390 m_url.init(base, relative, &encoding.encodingForFormSubmission());
391 }
392
KURL(const CString & canonicalSpec,const url_parse::Parsed & parsed,bool isValid)393 KURL::KURL(const CString& canonicalSpec,
394 const url_parse::Parsed& parsed, bool isValid)
395 : m_url(parsed, isValid)
396 {
397 // We know the reference fragment is the only part that can be UTF-8, so
398 // we know it's ASCII when there is no ref.
399 if (parsed.ref.is_nonempty())
400 m_url.setUtf8(canonicalSpec);
401 else
402 m_url.setAscii(canonicalSpec);
403 }
404
405 #if USE(CF)
// Construction from a CoreFoundation URL is not implemented for this backend:
// the argument is ignored and the resulting URL is marked invalid.
KURL::KURL(CFURLRef)
{
    notImplemented();
    invalidate();
}
411
// Conversion to a CoreFoundation URL is not implemented for this backend;
// always returns 0.
CFURLRef KURL::createCFURL() const
{
    notImplemented();
    return 0;
}
417 #endif
418
copy() const419 KURL KURL::copy() const
420 {
421 KURL result = *this;
422 m_url.copyTo(&result.m_url);
423 return result;
424 }
425
isNull() const426 bool KURL::isNull() const
427 {
428 return m_url.utf8String().isNull();
429 }
430
isEmpty() const431 bool KURL::isEmpty() const
432 {
433 return !m_url.utf8String().length();
434 }
435
// Returns the validity flag recorded when the URL was last parsed or
// modified.
bool KURL::isValid() const
{
    return m_url.m_isValid;
}
440
hasPort() const441 bool KURL::hasPort() const
442 {
443 return hostEnd() < pathStart();
444 }
445
// Returns the cached flag computed by initProtocolInHTTPFamily(): true for a
// valid URL whose scheme is "http" or "https".
bool KURL::protocolInHTTPFamily() const
{
    return m_url.m_protocolInHTTPFamily;
}
450
hasPath() const451 bool KURL::hasPath() const
452 {
453 // Note that http://www.google.com/" has a path, the path is "/". This can
454 // return false only for invalid or nonstandard URLs.
455 return m_url.m_parsed.path.len >= 0;
456 }
457
458 // We handle "parameters" separated by a semicolon, while KURL.cpp does not,
459 // which can lead to different results in some cases.
lastPathComponent() const460 String KURL::lastPathComponent() const
461 {
462 // When the output ends in a slash, WebCore has different expectations than
463 // the GoogleURL library. For "/foo/bar/" the library will return the empty
464 // string, but WebCore wants "bar".
465 url_parse::Component path = m_url.m_parsed.path;
466 if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/')
467 path.len--;
468
469 url_parse::Component file;
470 url_parse::ExtractFileName(m_url.utf8String().data(), path, &file);
471
472 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
473 // a null string when the path is empty, which we duplicate here.
474 if (!file.is_nonempty())
475 return String();
476 return m_url.componentString(file);
477 }
478
protocol() const479 String KURL::protocol() const
480 {
481 return m_url.componentString(m_url.m_parsed.scheme);
482 }
483
host() const484 String KURL::host() const
485 {
486 // Note: KURL.cpp unescapes here.
487 return m_url.componentString(m_url.m_parsed.host);
488 }
489
490 // Returns 0 when there is no port.
491 //
492 // We treat URL's with out-of-range port numbers as invalid URLs, and they will
493 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but
494 // return invalidPortNumber from this port() function, so we mirror that behavior here.
port() const495 unsigned short KURL::port() const
496 {
497 if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0)
498 return 0;
499 int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port);
500 ASSERT(port != url_parse::PORT_UNSPECIFIED); // Checked port.len <= 0 before.
501
502 if (port == url_parse::PORT_INVALID || port > maximumValidPortNumber) // Mimic KURL::port()
503 port = invalidPortNumber;
504
505 return static_cast<unsigned short>(port);
506 }
507
508 // Returns the empty string if there is no password.
pass() const509 String KURL::pass() const
510 {
511 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
512 // a null string when the password is empty, which we duplicate here.
513 if (!m_url.m_parsed.password.is_nonempty())
514 return String();
515
516 // Note: KURL.cpp unescapes here.
517 return m_url.componentString(m_url.m_parsed.password);
518 }
519
520 // Returns the empty string if there is no username.
user() const521 String KURL::user() const
522 {
523 // Note: KURL.cpp unescapes here.
524 return m_url.componentString(m_url.m_parsed.username);
525 }
526
fragmentIdentifier() const527 String KURL::fragmentIdentifier() const
528 {
529 // Empty but present refs ("foo.com/bar#") should result in the empty
530 // string, which m_url.componentString will produce. Nonexistent refs
531 // should be the null string.
532 if (!m_url.m_parsed.ref.is_valid())
533 return String();
534
535 // Note: KURL.cpp unescapes here.
536 return m_url.componentString(m_url.m_parsed.ref);
537 }
538
hasFragmentIdentifier() const539 bool KURL::hasFragmentIdentifier() const
540 {
541 // Note: KURL.cpp unescapes here.
542 // FIXME determine if KURL.cpp agrees about an empty ref
543 return m_url.m_parsed.ref.len >= 0;
544 }
545
copyParsedQueryTo(ParsedURLParameters & parameters) const546 void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const
547 {
548 String query = m_url.componentString(m_url.m_parsed.query);
549 const UChar* pos = query.characters();
550 const UChar* end = query.characters() + query.length();
551 while (pos < end) {
552 const UChar* parameterStart = pos;
553 while (pos < end && *pos != '&')
554 ++pos;
555 const UChar* parameterEnd = pos;
556 if (pos < end) {
557 ASSERT(*pos == '&');
558 ++pos;
559 }
560 if (parameterStart == parameterEnd)
561 continue;
562 const UChar* nameStart = parameterStart;
563 const UChar* equalSign = parameterStart;
564 while (equalSign < parameterEnd && *equalSign != '=')
565 ++equalSign;
566 if (equalSign == nameStart)
567 continue;
568 String name(nameStart, equalSign - nameStart);
569 String value = equalSign == parameterEnd ? String() : String(equalSign + 1, parameterEnd - equalSign - 1);
570 parameters.set(name, value);
571 }
572 }
573
baseAsString() const574 String KURL::baseAsString() const
575 {
576 // FIXME: There is probably a more efficient way to do this?
577 return string().left(pathAfterLastSlash());
578 }
579
query() const580 String KURL::query() const
581 {
582 if (m_url.m_parsed.query.len >= 0)
583 return m_url.componentString(m_url.m_parsed.query);
584
585 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
586 // an empty string when the query is empty rather than a null (not sure
587 // which is right).
588 // Returns a null if the query is not specified, instead of empty.
589 if (m_url.m_parsed.query.is_valid())
590 return String("", 0);
591 return String();
592 }
593
path() const594 String KURL::path() const
595 {
596 // Note: KURL.cpp unescapes here.
597 return m_url.componentString(m_url.m_parsed.path);
598 }
599
setProtocol(const String & protocol)600 bool KURL::setProtocol(const String& protocol)
601 {
602 // Firefox and IE remove everything after the first ':'.
603 int separatorPosition = protocol.find(':');
604 String newProtocol = protocol.substring(0, separatorPosition);
605
606 // If KURL is given an invalid scheme, it returns failure without modifying
607 // the URL at all. This is in contrast to most other setters which modify
608 // the URL and set "m_isValid."
609 url_canon::RawCanonOutputT<char> canonProtocol;
610 url_parse::Component protocolComponent;
611 if (!url_canon::CanonicalizeScheme(newProtocol.characters(),
612 url_parse::Component(0, newProtocol.length()),
613 &canonProtocol, &protocolComponent)
614 || !protocolComponent.is_nonempty())
615 return false;
616
617 KURLGooglePrivate::Replacements replacements;
618 replacements.SetScheme(CharactersOrEmpty(newProtocol),
619 url_parse::Component(0, newProtocol.length()));
620 m_url.replaceComponents(replacements);
621
622 // isValid could be false but we still return true here. This is because
623 // WebCore or JS scripts can build up a URL by setting individual
624 // components, and a JS exception is based on the return value of this
625 // function. We want to throw the exception and stop the script only when
626 // its trying to set a bad protocol, and not when it maybe just hasn't
627 // finished building up its final scheme.
628 return true;
629 }
630
setHost(const String & host)631 void KURL::setHost(const String& host)
632 {
633 KURLGooglePrivate::Replacements replacements;
634 replacements.SetHost(CharactersOrEmpty(host),
635 url_parse::Component(0, host.length()));
636 m_url.replaceComponents(replacements);
637 }
638
setHostAndPort(const String & s)639 void KURL::setHostAndPort(const String& s)
640 {
641 String host = s;
642 String port;
643 int hostEnd = s.find(":");
644 if (hostEnd != -1) {
645 host = s.left(hostEnd);
646 port = s.substring(hostEnd + 1);
647 }
648
649 KURLGooglePrivate::Replacements replacements;
650 // Host can't be removed, so we always set.
651 replacements.SetHost(CharactersOrEmpty(host),
652 url_parse::Component(0, host.length()));
653
654 if (port.isEmpty()) // Port may be removed, so we support clearing.
655 replacements.ClearPort();
656 else
657 replacements.SetPort(CharactersOrEmpty(port), url_parse::Component(0, port.length()));
658 m_url.replaceComponents(replacements);
659 }
660
removePort()661 void KURL::removePort()
662 {
663 if (hasPort()) {
664 String urlWithoutPort = m_url.string().left(hostEnd()) + m_url.string().substring(pathStart());
665 m_url.setUtf8(urlWithoutPort.utf8());
666 }
667 }
668
setPort(unsigned short i)669 void KURL::setPort(unsigned short i)
670 {
671 KURLGooglePrivate::Replacements replacements;
672 String portStr;
673 if (i) {
674 portStr = String::number(i);
675 replacements.SetPort(
676 reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()),
677 url_parse::Component(0, portStr.length()));
678
679 } else {
680 // Clear any existing port when it is set to 0.
681 replacements.ClearPort();
682 }
683 m_url.replaceComponents(replacements);
684 }
685
setUser(const String & user)686 void KURL::setUser(const String& user)
687 {
688 // This function is commonly called to clear the username, which we
689 // normally don't have, so we optimize this case.
690 if (user.isEmpty() && !m_url.m_parsed.username.is_valid())
691 return;
692
693 // The canonicalizer will clear any usernames that are empty, so we
694 // don't have to explicitly call ClearUsername() here.
695 KURLGooglePrivate::Replacements replacements;
696 replacements.SetUsername(CharactersOrEmpty(user),
697 url_parse::Component(0, user.length()));
698 m_url.replaceComponents(replacements);
699 }
700
setPass(const String & pass)701 void KURL::setPass(const String& pass)
702 {
703 // This function is commonly called to clear the password, which we
704 // normally don't have, so we optimize this case.
705 if (pass.isEmpty() && !m_url.m_parsed.password.is_valid())
706 return;
707
708 // The canonicalizer will clear any passwords that are empty, so we
709 // don't have to explicitly call ClearUsername() here.
710 KURLGooglePrivate::Replacements replacements;
711 replacements.SetPassword(CharactersOrEmpty(pass),
712 url_parse::Component(0, pass.length()));
713 m_url.replaceComponents(replacements);
714 }
715
setFragmentIdentifier(const String & s)716 void KURL::setFragmentIdentifier(const String& s)
717 {
718 // This function is commonly called to clear the ref, which we
719 // normally don't have, so we optimize this case.
720 if (s.isNull() && !m_url.m_parsed.ref.is_valid())
721 return;
722
723 KURLGooglePrivate::Replacements replacements;
724 if (s.isNull())
725 replacements.ClearRef();
726 else
727 replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length()));
728 m_url.replaceComponents(replacements);
729 }
730
// Removes the reference fragment by clearing it through the canonicalizer.
// Unlike setFragmentIdentifier(), this always re-canonicalizes, even when the
// URL has no ref.
void KURL::removeFragmentIdentifier()
{
    KURLGooglePrivate::Replacements replacements;
    replacements.ClearRef();
    m_url.replaceComponents(replacements);
}
737
setQuery(const String & query)738 void KURL::setQuery(const String& query)
739 {
740 KURLGooglePrivate::Replacements replacements;
741 if (query.isNull()) {
742 // KURL.cpp sets to null to clear any query.
743 replacements.ClearQuery();
744 } else if (query.length() > 0 && query[0] == '?') {
745 // WebCore expects the query string to begin with a question mark, but
746 // GoogleURL doesn't. So we trim off the question mark when setting.
747 replacements.SetQuery(CharactersOrEmpty(query),
748 url_parse::Component(1, query.length() - 1));
749 } else {
750 // When set with the empty string or something that doesn't begin with
751 // a question mark, KURL.cpp will add a question mark for you. The only
752 // way this isn't compatible is if you call this function with an empty
753 // string. KURL.cpp will leave a '?' with nothing following it in the
754 // URL, whereas we'll clear it.
755 // FIXME We should eliminate this difference.
756 replacements.SetQuery(CharactersOrEmpty(query),
757 url_parse::Component(0, query.length()));
758 }
759 m_url.replaceComponents(replacements);
760 }
761
setPath(const String & path)762 void KURL::setPath(const String& path)
763 {
764 // Empty paths will be canonicalized to "/", so we don't have to worry
765 // about calling ClearPath().
766 KURLGooglePrivate::Replacements replacements;
767 replacements.SetPath(CharactersOrEmpty(path),
768 url_parse::Component(0, path.length()));
769 m_url.replaceComponents(replacements);
770 }
771
772 // On Mac, this just seems to return the same URL, but with "/foo/bar" for
773 // file: URLs instead of file:///foo/bar. We don't bother with any of this,
774 // at least for now.
prettyURL() const775 String KURL::prettyURL() const
776 {
777 if (!m_url.m_isValid)
778 return String();
779 return m_url.string();
780 }
781
// Decodes the URL escape sequences in |str|, passing UTF-8 as the encoding
// to the two-argument overload.
String decodeURLEscapeSequences(const String& str)
{
    const TextEncoding& utf8 = UTF8Encoding();
    return decodeURLEscapeSequences(str, utf8);
}
786
787 // In KURL.cpp's implementation, this is called by every component getter.
788 // It will unescape every character, including '\0'. This is scary, and may
789 // cause security holes. We never call this function for components, and
790 // just return the ASCII versions instead.
791 //
792 // This function is also used to decode javascript: URLs and as a general
793 // purpose unescaping function.
794 //
795 // FIXME These should be merged to the KURL.cpp implementation.
decodeURLEscapeSequences(const String & str,const TextEncoding & encoding)796 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding)
797 {
798 // FIXME We can probably use KURL.cpp's version of this function
799 // without modification. However, I'm concerned about
800 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old
801 // custom code for now. Using their version will also fix the bug that
802 // we ignore the encoding.
803 //
804 // FIXME b/1350291: This does not get called very often. We just convert
805 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of
806 // sucks, and we don't use the encoding properly, which will make some
807 // obscure anchor navigations fail.
808 CString cstr = str.utf8();
809
810 const char* input = cstr.data();
811 int inputLength = cstr.length();
812
813 url_canon::RawCanonOutputT<url_parse::UTF16Char> unescaped;
814
815 url_util::DecodeURLEscapeSequences(input, inputLength, &unescaped);
816
817 return String(reinterpret_cast<UChar*>(unescaped.data()),
818 unescaped.length());
819 }
820
protocolIs(const char * protocol) const821 bool KURL::protocolIs(const char* protocol) const
822 {
823 assertProtocolIsGood(protocol);
824
825 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
826 // The free function protocolIsJavaScript() should be used instead.
827 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript"));
828
829 if (m_url.m_parsed.scheme.len <= 0)
830 return !protocol;
831 return lowerCaseEqualsASCII(
832 m_url.utf8String().data() + m_url.m_parsed.scheme.begin,
833 m_url.utf8String().data() + m_url.m_parsed.scheme.end(),
834 protocol);
835 }
836
837 // This is called to escape a URL string. It is only used externally when
838 // constructing mailto: links to set the query section. Since our query setter
839 // will automatically do the correct escaping, this function does not have to
840 // do any work.
841 //
842 // There is a possibility that a future caller may use this function in other
843 // ways, and may expect to get a valid URL string. The dangerous thing we want
844 // to protect against here is accidentally getting '\0' characters in a string
845 // that is not supposed to have them. Therefore, we escape these characters.
encodeWithURLEscapeSequences(const String & notEncodedString)846 String encodeWithURLEscapeSequences(const String& notEncodedString)
847 {
848 CString utf8 = UTF8Encoding().encode(
849 reinterpret_cast<const UChar*>(notEncodedString.characters()),
850 notEncodedString.length(),
851 URLEncodedEntitiesForUnencodables);
852 const char* input = utf8.data();
853 int inputLength = utf8.length();
854
855 Vector<char, 2048> buffer;
856 for (int i = 0; i < inputLength; i++) {
857 if (!input[i])
858 buffer.append("%00", 3);
859 else
860 buffer.append(input[i]);
861 }
862 return String(buffer.data(), buffer.size());
863 }
864
isHierarchical() const865 bool KURL::isHierarchical() const
866 {
867 if (!m_url.m_parsed.scheme.is_nonempty())
868 return false;
869 return url_util::IsStandard(
870 &m_url.utf8String().data()[m_url.m_parsed.scheme.begin],
871 m_url.m_parsed.scheme);
872 }
873
874 #ifndef NDEBUG
print() const875 void KURL::print() const
876 {
877 printf("%s\n", m_url.utf8String().data());
878 }
879 #endif
880
invalidate()881 void KURL::invalidate()
882 {
883 // This is only called from the constructor so resetting the (automatically
884 // initialized) string and parsed structure would be a waste of time.
885 m_url.m_isValid = false;
886 m_url.m_protocolInHTTPFamily = false;
887 }
888
889 // Equal up to reference fragments, if any.
equalIgnoringFragmentIdentifier(const KURL & a,const KURL & b)890 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
891 {
892 // Compute the length of each URL without its ref. Note that the reference
893 // begin (if it exists) points to the character *after* the '#', so we need
894 // to subtract one.
895 int aLength = a.m_url.utf8String().length();
896 if (a.m_url.m_parsed.ref.len >= 0)
897 aLength = a.m_url.m_parsed.ref.begin - 1;
898
899 int bLength = b.m_url.utf8String().length();
900 if (b.m_url.m_parsed.ref.len >= 0)
901 bLength = b.m_url.m_parsed.ref.begin - 1;
902
903 return aLength == bLength
904 && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength);
905 }
906
hostStart() const907 unsigned KURL::hostStart() const
908 {
909 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false);
910 }
911
hostEnd() const912 unsigned KURL::hostEnd() const
913 {
914 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true);
915 }
916
pathStart() const917 unsigned KURL::pathStart() const
918 {
919 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
920 }
921
pathEnd() const922 unsigned KURL::pathEnd() const
923 {
924 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true);
925 }
926
pathAfterLastSlash() const927 unsigned KURL::pathAfterLastSlash() const
928 {
929 // When there's no path, ask for what would be the beginning of it.
930 if (!m_url.m_parsed.path.is_valid())
931 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
932
933 url_parse::Component filename;
934 url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path,
935 &filename);
936 return filename.begin;
937 }
938
protocolIs(const String & url,const char * protocol)939 bool protocolIs(const String& url, const char* protocol)
940 {
941 // Do the comparison without making a new string object.
942 assertProtocolIsGood(protocol);
943
944 // Check the scheme like GURL does.
945 return url_util::FindAndCompareScheme(url.characters(), url.length(),
946 protocol, 0);
947 }
948
protocolIs(const String & string,const char * protocol)949 inline bool KURL::protocolIs(const String& string, const char* protocol)
950 {
951 return WebCore::protocolIs(string, protocol);
952 }
953
protocolHostAndPortAreEqual(const KURL & a,const KURL & b)954 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
955 {
956 if (a.parsed().scheme.end() != b.parsed().scheme.end())
957 return false;
958
959 int hostStartA = a.hostStart();
960 int hostLengthA = a.hostEnd() - hostStartA;
961 int hostStartB = b.hostStart();
962 int hostLengthB = b.hostEnd() - b.hostStart();
963 if (hostLengthA != hostLengthB)
964 return false;
965
966 // Check the scheme
967 for (int i = 0; i < a.parsed().scheme.end(); ++i)
968 if (a.string()[i] != b.string()[i])
969 return false;
970
971 // And the host
972 for (int i = 0; i < hostLengthA; ++i)
973 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
974 return false;
975
976 if (a.port() != b.port())
977 return false;
978
979 return true;
980 }
981
982 } // namespace WebCore
983
984 #endif // USE(GOOGLEURL)
985