• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are
7  * met:
8  *
9  *     * Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *     * Redistributions in binary form must reproduce the above
12  * copyright notice, this list of conditions and the following disclaimer
13  * in the documentation and/or other materials provided with the
14  * distribution.
15  *     * Neither the name of Google Inc. nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "config.h"
33 
34 #if USE(GOOGLEURL)
35 #include "KURL.h"
36 
37 #ifndef NDEBUG
38 #include <stdio.h>
39 #endif
40 
41 #include "CString.h"
42 #include "NotImplemented.h"
43 #include "TextEncoding.h"
44 #include <wtf/Vector.h>
45 
46 #include <googleurl/src/url_canon_internal.h>
47 #include <googleurl/src/url_util.h>
48 
49 using WTF::isASCIILower;
50 using WTF::toASCIILower;
51 
52 namespace WebCore {
53 
54 // Wraps WebCore's text encoding in a character set converter for the
55 // canonicalizer.
56 class KURLCharsetConverter : public url_canon::CharsetConverter {
57 public:
58     // The encoding parameter may be NULL, but in this case the object must not
59     // be called.
KURLCharsetConverter(const TextEncoding * encoding)60     KURLCharsetConverter(const TextEncoding* encoding)
61         : m_encoding(encoding)
62     {
63     }
64 
ConvertFromUTF16(const url_parse::UTF16Char * input,int inputLength,url_canon::CanonOutput * output)65     virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength,
66                                   url_canon::CanonOutput* output)
67     {
68         CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables);
69         output->Append(encoded.data(), static_cast<int>(encoded.length()));
70     }
71 
72 private:
73     const TextEncoding* m_encoding;
74 };
75 
76 // Note that this function must be named differently than the one in KURL.cpp
77 // since our unit tests evilly include both files, and their local definition
78 // will be ambiguous.
assertProtocolIsGood(const char * protocol)79 static inline void assertProtocolIsGood(const char* protocol)
80 {
81 #ifndef NDEBUG
82     const char* p = protocol;
83     while (*p) {
84         ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
85         ++p;
86     }
87 #endif
88 }
89 
90 // Returns the characters for the given string, or a pointer to a static empty
91 // string if the input string is NULL. This will always ensure we have a non-
92 // NULL character pointer since ReplaceComponents has special meaning for NULL.
CharactersOrEmpty(const String & str)93 static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str)
94 {
95     static const url_parse::UTF16Char zero = 0;
96     return str.characters() ?
97            reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) :
98            &zero;
99 }
100 
isUnicodeEncoding(const TextEncoding * encoding)101 static inline bool isUnicodeEncoding(const TextEncoding* encoding)
102 {
103     return encoding->encodingForFormSubmission() == UTF8Encoding();
104 }
105 
lowerCaseEqualsASCII(const char * begin,const char * end,const char * str)106 static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str)
107 {
108     while (begin != end && *str) {
109         ASSERT(isASCIILower(*str));
110         if (toASCIILower(*begin++) != *str++)
111             return false;
112     }
113 
114     // Both strings are equal (ignoring case) if and only if all of the characters were equal,
115     // and the end of both has been reached.
116     return begin == end && !*str;
117 }
118 
119 
120 // KURLGooglePrivate -----------------------------------------------------------
121 
KURLGooglePrivate()122 KURLGooglePrivate::KURLGooglePrivate()
123     : m_isValid(false)
124     , m_protocolInHTTPFamily(false)
125     , m_utf8IsASCII(true)
126     , m_stringIsValid(false)
127 {
128 }
129 
KURLGooglePrivate(const url_parse::Parsed & parsed,bool isValid)130 KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid)
131     : m_isValid(isValid)
132     , m_protocolInHTTPFamily(false)
133     , m_parsed(parsed)
134     , m_utf8IsASCII(true)
135     , m_stringIsValid(false)
136 {
137 }
138 
139 // Setters for the data. Using the ASCII version when you know the
140 // data is ASCII will be slightly more efficient. The UTF-8 version
141 // will always be correct if the caller is unsure.
setUtf8(const CString & str)142 void KURLGooglePrivate::setUtf8(const CString& str)
143 {
144     const char* data = str.data();
145     unsigned dataLength = str.length();
146 
147     // The m_utf8IsASCII must always be correct since the DeprecatedString
148     // getter must create it with the proper constructor. This test can be
149     // removed when DeprecatedString is gone, but it still might be a
150     // performance win.
151     m_utf8IsASCII = true;
152     for (unsigned i = 0; i < dataLength; i++) {
153         if (static_cast<unsigned char>(data[i]) >= 0x80) {
154             m_utf8IsASCII = false;
155             break;
156         }
157     }
158 
159     m_utf8 = str;
160     m_stringIsValid = false;
161     initProtocolInHTTPFamily();
162 }
163 
setAscii(const CString & str)164 void KURLGooglePrivate::setAscii(const CString& str)
165 {
166     m_utf8 = str;
167     m_utf8IsASCII = true;
168     m_stringIsValid = false;
169     initProtocolInHTTPFamily();
170 }
171 
init(const KURL & base,const String & relative,const TextEncoding * queryEncoding)172 void KURLGooglePrivate::init(const KURL& base,
173                              const String& relative,
174                              const TextEncoding* queryEncoding)
175 {
176     init(base, relative.characters(), relative.length(), queryEncoding);
177 }
178 
179 // Note: code mostly duplicated below.
init(const KURL & base,const char * rel,int relLength,const TextEncoding * queryEncoding)180 void KURLGooglePrivate::init(const KURL& base, const char* rel, int relLength,
181                              const TextEncoding* queryEncoding)
182 {
183     // As a performance optimization, we do not use the charset converter if
184     // encoding is UTF-8 or other Unicode encodings. Note that this is
185     // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be
186     // more efficient with no charset converter object because it
187     // can do UTF-8 internally with no extra copies.
188 
189     // We feel free to make the charset converter object every time since it's
190     // just a wrapper around a reference.
191     KURLCharsetConverter charsetConverterObject(queryEncoding);
192     KURLCharsetConverter* charsetConverter =
193         (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
194         &charsetConverterObject;
195 
196     url_canon::RawCanonOutputT<char> output;
197     const CString& baseStr = base.m_url.utf8String();
198     m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
199                                           base.m_url.m_parsed, rel, relLength,
200                                           charsetConverter,
201                                           &output, &m_parsed);
202 
203     // See FIXME in KURLGooglePrivate in the header. If canonicalization has not
204     // changed the string, we can avoid an extra allocation by using assignment.
205     //
206     // When KURL encounters an error such that the URL is invalid and empty
207     // (for example, resolving a relative URL on a non-hierarchical base), it
208     // will produce an isNull URL, and calling setUtf8 will produce an empty
209     // non-null URL. This is unlikely to affect anything, but we preserve this
210     // just in case.
211     if (m_isValid || output.length()) {
212         // Without ref, the whole url is guaranteed to be ASCII-only.
213         if (m_parsed.ref.is_nonempty())
214             setUtf8(CString(output.data(), output.length()));
215         else
216             setAscii(CString(output.data(), output.length()));
217     } else {
218         // WebCore expects resolved URLs to be empty rather than NULL.
219         setUtf8(CString("", 0));
220     }
221 }
222 
223 // Note: code mostly duplicated above. See FIXMEs and comments there.
init(const KURL & base,const UChar * rel,int relLength,const TextEncoding * queryEncoding)224 void KURLGooglePrivate::init(const KURL& base, const UChar* rel, int relLength,
225                              const TextEncoding* queryEncoding)
226 {
227     KURLCharsetConverter charsetConverterObject(queryEncoding);
228     KURLCharsetConverter* charsetConverter =
229         (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
230         &charsetConverterObject;
231 
232     url_canon::RawCanonOutputT<char> output;
233     const CString& baseStr = base.m_url.utf8String();
234     m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
235                                           base.m_url.m_parsed, rel, relLength,
236                                           charsetConverter,
237                                           &output, &m_parsed);
238 
239 
240     if (m_isValid || output.length()) {
241         if (m_parsed.ref.is_nonempty())
242             setUtf8(CString(output.data(), output.length()));
243         else
244             setAscii(CString(output.data(), output.length()));
245     } else
246         setUtf8(CString("", 0));
247 }
248 
initProtocolInHTTPFamily()249 void KURLGooglePrivate::initProtocolInHTTPFamily()
250 {
251     if (!m_isValid) {
252         m_protocolInHTTPFamily = false;
253         return;
254     }
255 
256     const char* scheme = m_utf8.data() + m_parsed.scheme.begin;
257     if (m_parsed.scheme.len == 4)
258         m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http");
259     else if (m_parsed.scheme.len == 5)
260         m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https");
261     else
262         m_protocolInHTTPFamily = false;
263 }
264 
copyTo(KURLGooglePrivate * dest) const265 void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const
266 {
267     dest->m_isValid = m_isValid;
268     dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily;
269     dest->m_parsed = m_parsed;
270 
271     // Don't copy the 16-bit string since that will be regenerated as needed.
272     dest->m_utf8 = CString(m_utf8.data(), m_utf8.length());
273     dest->m_utf8IsASCII = m_utf8IsASCII;
274     dest->m_stringIsValid = false;
275 }
276 
componentString(const url_parse::Component & comp) const277 String KURLGooglePrivate::componentString(const url_parse::Component& comp) const
278 {
279     if (!m_isValid || comp.len <= 0) {
280         // KURL returns a NULL string if the URL is itself a NULL string, and an
281         // empty string for other nonexistant entities.
282         if (utf8String().isNull())
283             return String();
284         return String("", 0);
285     }
286     // begin and len are in terms of bytes which do not match
287     // if string() is UTF-16 and input contains non-ASCII characters.
288     // However, the only part in urlString that can contain non-ASCII
289     // characters is 'ref' at the end of the string. In that case,
290     // begin will always match the actual value and len (in terms of
291     // byte) will be longer than what's needed by 'mid'. However, mid
292     // truncates len to avoid go past the end of a string so that we can
293     // get away withtout doing anything here.
294     return string().substring(comp.begin, comp.len);
295 }
296 
replaceComponents(const Replacements & replacements)297 void KURLGooglePrivate::replaceComponents(const Replacements& replacements)
298 {
299     url_canon::RawCanonOutputT<char> output;
300     url_parse::Parsed newParsed;
301 
302     m_isValid = url_util::ReplaceComponents(utf8String().data(),
303                                             utf8String().length(), m_parsed, replacements, 0, &output, &newParsed);
304 
305     m_parsed = newParsed;
306     if (m_parsed.ref.is_nonempty())
307         setUtf8(CString(output.data(), output.length()));
308     else
309         setAscii(CString(output.data(), output.length()));
310 }
311 
string() const312 const String& KURLGooglePrivate::string() const
313 {
314     if (!m_stringIsValid) {
315         // Must special case the NULL case, since constructing the
316         // string like we do below will generate an empty rather than
317         // a NULL string.
318         if (m_utf8.isNull())
319             m_string = String();
320         else if (m_utf8IsASCII)
321             m_string = String(m_utf8.data(), m_utf8.length());
322         else
323             m_string = String::fromUTF8(m_utf8.data(), m_utf8.length());
324         m_stringIsValid = true;
325     }
326     return m_string;
327 }
328 
329 // KURL ------------------------------------------------------------------------
330 
331 // Creates with NULL-terminated string input representing an absolute URL.
332 // WebCore generally calls this only with hardcoded strings, so the input is
333 // ASCII. We treat is as UTF-8 just in case.
KURL(const char * url)334 KURL::KURL(const char *url)
335 {
336     // FIXME The Mac code checks for beginning with a slash and converting to a
337     // file: URL. We will want to add this as well once we can compile on a
338     // system like that.
339     m_url.init(KURL(), url, strlen(url), 0);
340 
341     // The one-argument constructors should never generate a NULL string.
342     // This is a funny quirk of KURL.cpp (probably a bug) which we preserve.
343     if (m_url.utf8String().isNull())
344         m_url.setAscii(CString("", 0));
345 }
346 
347 // Initializes with a string representing an absolute URL. No encoding
348 // information is specified. This generally happens when a KURL is converted
349 // to a string and then converted back. In this case, the URL is already
350 // canonical and in proper escaped form so needs no encoding. We treat it was
351 // UTF-8 just in case.
KURL(const String & url)352 KURL::KURL(const String& url)
353 {
354     if (!url.isNull())
355         m_url.init(KURL(), url, 0);
356     else {
357         // WebCore expects us to preserve the nullness of strings when this
358         // constructor is used. In all other cases, it expects a non-null
359         // empty string, which is what init() will create.
360         m_url.m_isValid = false;
361         m_url.m_protocolInHTTPFamily = false;
362     }
363 }
364 
365 // Constructs a new URL given a base URL and a possibly relative input URL.
366 // This assumes UTF-8 encoding.
KURL(const KURL & base,const String & relative)367 KURL::KURL(const KURL& base, const String& relative)
368 {
369     m_url.init(base, relative, 0);
370 }
371 
372 // Constructs a new URL given a base URL and a possibly relative input URL.
373 // Any query portion of the relative URL will be encoded in the given encoding.
KURL(const KURL & base,const String & relative,const TextEncoding & encoding)374 KURL::KURL(const KURL& base,
375            const String& relative,
376            const TextEncoding& encoding)
377 {
378     m_url.init(base, relative, &encoding.encodingForFormSubmission());
379 }
380 
KURL(const CString & canonicalSpec,const url_parse::Parsed & parsed,bool isValid)381 KURL::KURL(const CString& canonicalSpec,
382            const url_parse::Parsed& parsed, bool isValid)
383     : m_url(parsed, isValid)
384 {
385     // We know the reference fragment is the only part that can be UTF-8, so
386     // we know it's ASCII when there is no ref.
387     if (parsed.ref.is_nonempty())
388         m_url.setUtf8(canonicalSpec);
389     else
390         m_url.setAscii(canonicalSpec);
391 }
392 
393 #if PLATFORM(CF)
KURL(CFURLRef)394 KURL::KURL(CFURLRef)
395 {
396     notImplemented();
397     invalidate();
398 }
399 
createCFURL() const400 CFURLRef KURL::createCFURL() const
401 {
402     notImplemented();
403     return 0;
404 }
405 #endif
406 
copy() const407 KURL KURL::copy() const
408 {
409     KURL result = *this;
410     m_url.copyTo(&result.m_url);
411     return result;
412 }
413 
isNull() const414 bool KURL::isNull() const
415 {
416     return m_url.utf8String().isNull();
417 }
418 
isEmpty() const419 bool KURL::isEmpty() const
420 {
421     return !m_url.utf8String().length();
422 }
423 
isValid() const424 bool KURL::isValid() const
425 {
426     return m_url.m_isValid;
427 }
428 
protocolInHTTPFamily() const429 bool KURL::protocolInHTTPFamily() const
430 {
431     return m_url.m_protocolInHTTPFamily;
432 }
433 
hasPath() const434 bool KURL::hasPath() const
435 {
436     // Note that http://www.google.com/" has a path, the path is "/". This can
437     // return false only for invalid or nonstandard URLs.
438     return m_url.m_parsed.path.len >= 0;
439 }
440 
441 // We handle "parameters" separated by a semicolon, while KURL.cpp does not,
442 // which can lead to different results in some cases.
lastPathComponent() const443 String KURL::lastPathComponent() const
444 {
445     // When the output ends in a slash, WebCore has different expectations than
446     // the GoogleURL library. For "/foo/bar/" the library will return the empty
447     // string, but WebCore wants "bar".
448     url_parse::Component path = m_url.m_parsed.path;
449     if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/')
450         path.len--;
451 
452     url_parse::Component file;
453     url_parse::ExtractFileName(m_url.utf8String().data(), path, &file);
454 
455     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
456     // a null string when the path is empty, which we duplicate here.
457     if (!file.is_nonempty())
458         return String();
459     return m_url.componentString(file);
460 }
461 
protocol() const462 String KURL::protocol() const
463 {
464     return m_url.componentString(m_url.m_parsed.scheme);
465 }
466 
host() const467 String KURL::host() const
468 {
469     // Note: KURL.cpp unescapes here.
470     return m_url.componentString(m_url.m_parsed.host);
471 }
472 
473 // Returns 0 when there is no port or it is invalid.
474 //
475 // We treat URL's with out-of-range port numbers as invalid URLs, and they will
476 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but
477 // return 0 from this port() function, so we mirror that behavior here.
port() const478 unsigned short KURL::port() const
479 {
480     if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0)
481         return 0;
482     int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port);
483     if (port == url_parse::PORT_UNSPECIFIED)
484         return 0;
485     return static_cast<unsigned short>(port);
486 }
487 
488 // Returns the empty string if there is no password.
pass() const489 String KURL::pass() const
490 {
491     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
492     // a null string when the password is empty, which we duplicate here.
493     if (!m_url.m_parsed.password.is_nonempty())
494         return String();
495 
496     // Note: KURL.cpp unescapes here.
497     return m_url.componentString(m_url.m_parsed.password);
498 }
499 
500 // Returns the empty string if there is no username.
user() const501 String KURL::user() const
502 {
503     // Note: KURL.cpp unescapes here.
504     return m_url.componentString(m_url.m_parsed.username);
505 }
506 
fragmentIdentifier() const507 String KURL::fragmentIdentifier() const
508 {
509     // Empty but present refs ("foo.com/bar#") should result in the empty
510     // string, which m_url.componentString will produce. Nonexistant refs should be
511     // the NULL string.
512     if (!m_url.m_parsed.ref.is_valid())
513         return String();
514 
515     // Note: KURL.cpp unescapes here.
516     return m_url.componentString(m_url.m_parsed.ref);
517 }
518 
hasFragmentIdentifier() const519 bool KURL::hasFragmentIdentifier() const
520 {
521     // Note: KURL.cpp unescapes here.
522     // FIXME determine if KURL.cpp agrees about an empty ref
523     return m_url.m_parsed.ref.len >= 0;
524 }
525 
baseAsString() const526 String KURL::baseAsString() const
527 {
528     // FIXME: There is probably a more efficient way to do this?
529     return string().left(pathAfterLastSlash());
530 }
531 
query() const532 String KURL::query() const
533 {
534     if (m_url.m_parsed.query.len >= 0)
535         return m_url.componentString(m_url.m_parsed.query);
536 
537     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
538     // an empty string when the query is empty rather than a null (not sure
539     // which is right).
540     return String("", 0);
541 }
542 
path() const543 String KURL::path() const
544 {
545     // Note: KURL.cpp unescapes here.
546     return m_url.componentString(m_url.m_parsed.path);
547 }
548 
setProtocol(const String & protocol)549 void KURL::setProtocol(const String& protocol)
550 {
551     KURLGooglePrivate::Replacements replacements;
552     replacements.SetScheme(CharactersOrEmpty(protocol),
553                            url_parse::Component(0, protocol.length()));
554     m_url.replaceComponents(replacements);
555 }
556 
setHost(const String & host)557 void KURL::setHost(const String& host)
558 {
559     KURLGooglePrivate::Replacements replacements;
560     replacements.SetHost(CharactersOrEmpty(host),
561                          url_parse::Component(0, host.length()));
562     m_url.replaceComponents(replacements);
563 }
564 
565 // This function is used only in the JSC build.
setHostAndPort(const String & s)566 void KURL::setHostAndPort(const String& s)
567 {
568     String newhost = s.left(s.find(":"));
569     String newport = s.substring(s.find(":") + 1);
570 
571     KURLGooglePrivate::Replacements replacements;
572     // Host can't be removed, so we always set.
573     replacements.SetHost(CharactersOrEmpty(newhost),
574                          url_parse::Component(0, newhost.length()));
575 
576     if (newport.isEmpty())  // Port may be removed, so we support clearing.
577         replacements.ClearPort();
578     else
579         replacements.SetPort(CharactersOrEmpty(newport), url_parse::Component(0, newport.length()));
580     m_url.replaceComponents(replacements);
581 }
582 
setPort(unsigned short i)583 void KURL::setPort(unsigned short i)
584 {
585     KURLGooglePrivate::Replacements replacements;
586     String portStr;
587     if (i) {
588         portStr = String::number(static_cast<int>(i));
589         replacements.SetPort(
590             reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()),
591             url_parse::Component(0, portStr.length()));
592 
593     } else {
594         // Clear any existing port when it is set to 0.
595         replacements.ClearPort();
596     }
597     m_url.replaceComponents(replacements);
598 }
599 
setUser(const String & user)600 void KURL::setUser(const String& user)
601 {
602     // This function is commonly called to clear the username, which we
603     // normally don't have, so we optimize this case.
604     if (user.isEmpty() && !m_url.m_parsed.username.is_valid())
605         return;
606 
607     // The canonicalizer will clear any usernames that are empty, so we
608     // don't have to explicitly call ClearUsername() here.
609     KURLGooglePrivate::Replacements replacements;
610     replacements.SetUsername(CharactersOrEmpty(user),
611                              url_parse::Component(0, user.length()));
612     m_url.replaceComponents(replacements);
613 }
614 
setPass(const String & pass)615 void KURL::setPass(const String& pass)
616 {
617     // This function is commonly called to clear the password, which we
618     // normally don't have, so we optimize this case.
619     if (pass.isEmpty() && !m_url.m_parsed.password.is_valid())
620         return;
621 
622     // The canonicalizer will clear any passwords that are empty, so we
623     // don't have to explicitly call ClearUsername() here.
624     KURLGooglePrivate::Replacements replacements;
625     replacements.SetPassword(CharactersOrEmpty(pass),
626                              url_parse::Component(0, pass.length()));
627     m_url.replaceComponents(replacements);
628 }
629 
setFragmentIdentifier(const String & s)630 void KURL::setFragmentIdentifier(const String& s)
631 {
632     // This function is commonly called to clear the ref, which we
633     // normally don't have, so we optimize this case.
634     if (s.isNull() && !m_url.m_parsed.ref.is_valid())
635         return;
636 
637     KURLGooglePrivate::Replacements replacements;
638     if (s.isNull())
639         replacements.ClearRef();
640     else
641         replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length()));
642     m_url.replaceComponents(replacements);
643 }
644 
removeFragmentIdentifier()645 void KURL::removeFragmentIdentifier()
646 {
647     KURLGooglePrivate::Replacements replacements;
648     replacements.ClearRef();
649     m_url.replaceComponents(replacements);
650 }
651 
setQuery(const String & query)652 void KURL::setQuery(const String& query)
653 {
654     KURLGooglePrivate::Replacements replacements;
655     if (query.isNull()) {
656         // KURL.cpp sets to NULL to clear any query.
657         replacements.ClearQuery();
658     } else if (query.length() > 0 && query[0] == '?') {
659         // WebCore expects the query string to begin with a question mark, but
660         // GoogleURL doesn't. So we trim off the question mark when setting.
661         replacements.SetQuery(CharactersOrEmpty(query),
662                               url_parse::Component(1, query.length() - 1));
663     } else {
664         // When set with the empty string or something that doesn't begin with
665         // a question mark, KURL.cpp will add a question mark for you. The only
666         // way this isn't compatible is if you call this function with an empty
667         // string. KURL.cpp will leave a '?' with nothing following it in the
668         // URL, whereas we'll clear it.
669         // FIXME We should eliminate this difference.
670         replacements.SetQuery(CharactersOrEmpty(query),
671                               url_parse::Component(0, query.length()));
672     }
673     m_url.replaceComponents(replacements);
674 }
675 
setPath(const String & path)676 void KURL::setPath(const String& path)
677 {
678     // Empty paths will be canonicalized to "/", so we don't have to worry
679     // about calling ClearPath().
680     KURLGooglePrivate::Replacements replacements;
681     replacements.SetPath(CharactersOrEmpty(path),
682                          url_parse::Component(0, path.length()));
683     m_url.replaceComponents(replacements);
684 }
685 
686 // On Mac, this just seems to return the same URL, but with "/foo/bar" for
687 // file: URLs instead of file:///foo/bar. We don't bother with any of this,
688 // at least for now.
prettyURL() const689 String KURL::prettyURL() const
690 {
691     if (!m_url.m_isValid)
692         return String();
693     return m_url.string();
694 }
695 
protocolIsJavaScript(const String & url)696 bool protocolIsJavaScript(const String& url)
697 {
698     return protocolIs(url, "javascript");
699 }
700 
701 // We copied the KURL version here on Sept 12, 2008 while doing a WebKit
702 // merge.
703 //
704 // FIXME Somehow share this with KURL? Like we'd theoretically merge with
705 // decodeURLEscapeSequences below?
mimeTypeFromDataURL(const String & url)706 String mimeTypeFromDataURL(const String& url)
707 {
708     ASSERT(protocolIs(url, "data"));
709     int index = url.find(';');
710     if (index == -1)
711         index = url.find(',');
712     if (index != -1) {
713         int len = index - 5;
714         if (len > 0)
715             return url.substring(5, len);
716         return "text/plain"; // Data URLs with no MIME type are considered text/plain.
717     }
718     return "";
719 }
720 
// Convenience overload that decodes |str| with UTF-8 as the encoding.
String decodeURLEscapeSequences(const String& str)
{
    const TextEncoding& utf8 = UTF8Encoding();
    return decodeURLEscapeSequences(str, utf8);
}
725 
726 // In KURL.cpp's implementation, this is called by every component getter.
727 // It will unescape every character, including NULL. This is scary, and may
728 // cause security holes. We never call this function for components, and
729 // just return the ASCII versions instead.
730 //
731 // However, this static function is called directly in some cases. It appears
732 // that this only happens for javascript: URLs, so this is essentially the
733 // JavaScript URL decoder. It assumes UTF-8 encoding.
734 //
735 // IE doesn't unescape %00, forcing you to use \x00 in JS strings, so we do
736 // the same. This also eliminates NULL-related problems should a consumer
737 // incorrectly call this function for non-JavaScript.
738 //
739 // FIXME These should be merged to the KURL.cpp implementation.
decodeURLEscapeSequences(const String & str,const TextEncoding & encoding)740 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding)
741 {
742     // FIXME We can probably use KURL.cpp's version of this function
743     // without modification. However, I'm concerned about
744     // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old
745     // custom code for now. Using their version will also fix the bug that
746     // we ignore the encoding.
747     //
748     // FIXME b/1350291: This does not get called very often. We just convert
749     // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of
750     // sucks, and we don't use the encoding properly, which will make some
751     // obscure anchor navigations fail.
752     CString cstr = str.utf8();
753 
754     const char* input = cstr.data();
755     int inputLength = cstr.length();
756     url_canon::RawCanonOutputT<char> unescaped;
757     for (int i = 0; i < inputLength; i++) {
758         if (input[i] == '%') {
759             unsigned char ch;
760             if (url_canon::DecodeEscaped(input, &i, inputLength, &ch)) {
761                 if (!ch) {
762                     // Never unescape NULLs.
763                     unescaped.push_back('%');
764                     unescaped.push_back('0');
765                     unescaped.push_back('0');
766                 } else
767                     unescaped.push_back(ch);
768             } else {
769                 // Invalid escape sequence, copy the percent literal.
770                 unescaped.push_back('%');
771             }
772         } else {
773             // Regular non-escaped 8-bit character.
774             unescaped.push_back(input[i]);
775         }
776     }
777 
778     // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
779     // JavaScript URLs, but Firefox and Safari do.
780     url_canon::RawCanonOutputT<url_parse::UTF16Char> utf16;
781     for (int i = 0; i < unescaped.length(); i++) {
782         unsigned char uch = static_cast<unsigned char>(unescaped.at(i));
783         if (uch < 0x80) {
784             // Non-UTF-8, just append directly
785             utf16.push_back(uch);
786         } else {
787             // next_ch will point to the last character of the decoded
788             // character.
789             int nextCharacter = i;
790             unsigned codePoint;
791             if (url_canon::ReadUTFChar(unescaped.data(), &nextCharacter,
792                                        unescaped.length(), &codePoint)) {
793                 // Valid UTF-8 character, convert to UTF-16.
794                 url_canon::AppendUTF16Value(codePoint, &utf16);
795                 i = nextCharacter;
796             } else {
797                 // KURL.cpp strips any sequences that are not valid UTF-8. This
798                 // sounds scary. Instead, we just keep those invalid code
799                 // points and promote to UTF-16. We copy all characters from
800                 // the current position to the end of the identified sqeuqnce.
801                 while (i < nextCharacter) {
802                     utf16.push_back(static_cast<unsigned char>(unescaped.at(i)));
803                     i++;
804                 }
805                 utf16.push_back(static_cast<unsigned char>(unescaped.at(i)));
806             }
807         }
808     }
809 
810     return String(reinterpret_cast<UChar*>(utf16.data()), utf16.length());
811 }
812 
protocolIs(const char * protocol) const813 bool KURL::protocolIs(const char* protocol) const
814 {
815     assertProtocolIsGood(protocol);
816 
817     // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
818     // The free function protocolIsJavaScript() should be used instead.
819     // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript"));
820 
821     if (m_url.m_parsed.scheme.len <= 0)
822         return !protocol;
823     return lowerCaseEqualsASCII(
824         m_url.utf8String().data() + m_url.m_parsed.scheme.begin,
825         m_url.utf8String().data() + m_url.m_parsed.scheme.end(),
826         protocol);
827 }
828 
isLocalFile() const829 bool KURL::isLocalFile() const
830 {
831     return protocolIs("file");
832 }
833 
834 // This is called to escape a URL string. It is only used externally when
835 // constructing mailto: links to set the query section. Since our query setter
836 // will automatically do the correct escaping, this function does not have to
837 // do any work.
838 //
// There is a possibility that a future caller may use this function in other
840 // ways, and may expect to get a valid URL string. The dangerous thing we want
841 // to protect against here is accidentally getting NULLs in a string that is
842 // not supposed to have NULLs. Therefore, we escape NULLs here to prevent this.
String encodeWithURLEscapeSequences(const String& notEncodedString)
{
    // Encode the input as UTF-8 first; the escaping below works on bytes.
    CString utf8 = UTF8Encoding().encode(
        reinterpret_cast<const UChar*>(notEncodedString.characters()),
        notEncodedString.length(),
        URLEncodedEntitiesForUnencodables);

    const int byteCount = utf8.length();
    const char* cursor = utf8.data();
    const char* const end = cursor + byteCount;

    // Every byte is copied through verbatim except NUL, which is rewritten as
    // the literal escape "%00".
    Vector<char, 2048> escaped;
    for (; cursor != end; ++cursor) {
        if (*cursor)
            escaped.append(*cursor);
        else
            escaped.append("%00", 3);
    }
    return String(escaped.data(), escaped.size());
}
861 
isHierarchical() const862 bool KURL::isHierarchical() const
863 {
864     if (!m_url.m_parsed.scheme.is_nonempty())
865         return false;
866     return url_util::IsStandard(
867         &m_url.utf8String().data()[m_url.m_parsed.scheme.begin],
868         m_url.utf8String().length(),
869         m_url.m_parsed.scheme);
870 }
871 
872 #ifndef NDEBUG
print() const873 void KURL::print() const
874 {
875     printf("%s\n", m_url.utf8String().data());
876 }
877 #endif
878 
invalidate()879 void KURL::invalidate()
880 {
881     // This is only called from the constructor so resetting the (automatically
882     // initialized) string and parsed structure would be a waste of time.
883     m_url.m_isValid = false;
884     m_url.m_protocolInHTTPFamily = false;
885 }
886 
887 // Equal up to reference fragments, if any.
equalIgnoringFragmentIdentifier(const KURL & a,const KURL & b)888 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
889 {
890     // Compute the length of each URL without its ref. Note that the reference
891     // begin (if it exists) points to the character *after* the '#', so we need
892     // to subtract one.
893     int aLength = a.m_url.utf8String().length();
894     if (a.m_url.m_parsed.ref.len >= 0)
895         aLength = a.m_url.m_parsed.ref.begin - 1;
896 
897     int bLength = b.m_url.utf8String().length();
898     if (b.m_url.m_parsed.ref.len >= 0)
899         bLength = b.m_url.m_parsed.ref.begin - 1;
900 
901     return aLength == bLength
902         && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength);
903 }
904 
hostStart() const905 unsigned KURL::hostStart() const
906 {
907     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false);
908 }
909 
hostEnd() const910 unsigned KURL::hostEnd() const
911 {
912     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true);
913 }
914 
pathStart() const915 unsigned KURL::pathStart() const
916 {
917     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
918 }
919 
pathEnd() const920 unsigned KURL::pathEnd() const
921 {
922     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true);
923 }
924 
pathAfterLastSlash() const925 unsigned KURL::pathAfterLastSlash() const
926 {
927     // When there's no path, ask for what would be the beginning of it.
928     if (!m_url.m_parsed.path.is_valid())
929         return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
930 
931     url_parse::Component filename;
932     url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path,
933                                &filename);
934     return filename.begin;
935 }
936 
blankURL()937 const KURL& blankURL()
938 {
939     static KURL staticBlankURL("about:blank");
940     return staticBlankURL;
941 }
942 
protocolIs(const String & url,const char * protocol)943 bool protocolIs(const String& url, const char* protocol)
944 {
945     // Do the comparison without making a new string object.
946     assertProtocolIsGood(protocol);
947     for (int i = 0; ; ++i) {
948         if (!protocol[i])
949             return url[i] == ':';
950         if (toASCIILower(url[i]) != protocol[i])
951             return false;
952     }
953 }
954 
protocolIs(const String & string,const char * protocol)955 inline bool KURL::protocolIs(const String& string, const char* protocol)
956 {
957     return WebCore::protocolIs(string, protocol);
958 }
959 
protocolHostAndPortAreEqual(const KURL & a,const KURL & b)960 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
961 {
962     if (a.parsed().scheme.end() != b.parsed().scheme.end())
963         return false;
964 
965     int hostStartA = a.hostStart();
966     int hostStartB = b.hostStart();
967     if (a.hostEnd() - hostStartA != b.hostEnd() - hostStartB)
968         return false;
969 
970     // Check the scheme
971     for (int i = 0; i < a.parsed().scheme.end(); ++i)
972         if (a.string()[i] != b.string()[i])
973             return false;
974 
975     // And the host
976     for (int i = hostStartA; i < static_cast<int>(a.hostEnd()); ++i)
977         if (a.string()[i] != b.string()[i])
978             return false;
979 
980     if (a.port() != b.port())
981         return false;
982 
983     return true;
984 }
985 
986 } // namespace WebCore
987 
988 #endif // USE(GOOGLEURL)
989