1 /*
2 * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met:
8 *
9 * * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following disclaimer
13 * in the documentation and/or other materials provided with the
14 * distribution.
15 * * Neither the name of Google Inc. nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include "config.h"
33
34 #if USE(GOOGLEURL)
35 #include "KURL.h"
36
37 #ifndef NDEBUG
38 #include <stdio.h>
39 #endif
40
41 #include "CString.h"
42 #include "NotImplemented.h"
43 #include "TextEncoding.h"
44 #include <wtf/Vector.h>
45
46 #include <googleurl/src/url_canon_internal.h>
47 #include <googleurl/src/url_util.h>
48
49 using WTF::isASCIILower;
50 using WTF::toASCIILower;
51
52 namespace WebCore {
53
54 // Wraps WebCore's text encoding in a character set converter for the
55 // canonicalizer.
56 class KURLCharsetConverter : public url_canon::CharsetConverter {
57 public:
58 // The encoding parameter may be NULL, but in this case the object must not
59 // be called.
KURLCharsetConverter(const TextEncoding * encoding)60 KURLCharsetConverter(const TextEncoding* encoding)
61 : m_encoding(encoding)
62 {
63 }
64
ConvertFromUTF16(const url_parse::UTF16Char * input,int inputLength,url_canon::CanonOutput * output)65 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength,
66 url_canon::CanonOutput* output)
67 {
68 CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables);
69 output->Append(encoded.data(), static_cast<int>(encoded.length()));
70 }
71
72 private:
73 const TextEncoding* m_encoding;
74 };
75
76 // Note that this function must be named differently than the one in KURL.cpp
77 // since our unit tests evilly include both files, and their local definition
78 // will be ambiguous.
assertProtocolIsGood(const char * protocol)79 static inline void assertProtocolIsGood(const char* protocol)
80 {
81 #ifndef NDEBUG
82 const char* p = protocol;
83 while (*p) {
84 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
85 ++p;
86 }
87 #endif
88 }
89
90 // Returns the characters for the given string, or a pointer to a static empty
91 // string if the input string is NULL. This will always ensure we have a non-
92 // NULL character pointer since ReplaceComponents has special meaning for NULL.
CharactersOrEmpty(const String & str)93 static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str)
94 {
95 static const url_parse::UTF16Char zero = 0;
96 return str.characters() ?
97 reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) :
98 &zero;
99 }
100
isUnicodeEncoding(const TextEncoding * encoding)101 static inline bool isUnicodeEncoding(const TextEncoding* encoding)
102 {
103 return encoding->encodingForFormSubmission() == UTF8Encoding();
104 }
105
lowerCaseEqualsASCII(const char * begin,const char * end,const char * str)106 static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str)
107 {
108 while (begin != end && *str) {
109 ASSERT(isASCIILower(*str));
110 if (toASCIILower(*begin++) != *str++)
111 return false;
112 }
113
114 // Both strings are equal (ignoring case) if and only if all of the characters were equal,
115 // and the end of both has been reached.
116 return begin == end && !*str;
117 }
118
119
120 // KURLGooglePrivate -----------------------------------------------------------
121
KURLGooglePrivate()122 KURLGooglePrivate::KURLGooglePrivate()
123 : m_isValid(false)
124 , m_protocolInHTTPFamily(false)
125 , m_utf8IsASCII(true)
126 , m_stringIsValid(false)
127 {
128 }
129
KURLGooglePrivate(const url_parse::Parsed & parsed,bool isValid)130 KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid)
131 : m_isValid(isValid)
132 , m_protocolInHTTPFamily(false)
133 , m_parsed(parsed)
134 , m_utf8IsASCII(true)
135 , m_stringIsValid(false)
136 {
137 }
138
139 // Setters for the data. Using the ASCII version when you know the
140 // data is ASCII will be slightly more efficient. The UTF-8 version
141 // will always be correct if the caller is unsure.
setUtf8(const CString & str)142 void KURLGooglePrivate::setUtf8(const CString& str)
143 {
144 const char* data = str.data();
145 unsigned dataLength = str.length();
146
147 // The m_utf8IsASCII must always be correct since the DeprecatedString
148 // getter must create it with the proper constructor. This test can be
149 // removed when DeprecatedString is gone, but it still might be a
150 // performance win.
151 m_utf8IsASCII = true;
152 for (unsigned i = 0; i < dataLength; i++) {
153 if (static_cast<unsigned char>(data[i]) >= 0x80) {
154 m_utf8IsASCII = false;
155 break;
156 }
157 }
158
159 m_utf8 = str;
160 m_stringIsValid = false;
161 initProtocolInHTTPFamily();
162 }
163
setAscii(const CString & str)164 void KURLGooglePrivate::setAscii(const CString& str)
165 {
166 m_utf8 = str;
167 m_utf8IsASCII = true;
168 m_stringIsValid = false;
169 initProtocolInHTTPFamily();
170 }
171
init(const KURL & base,const String & relative,const TextEncoding * queryEncoding)172 void KURLGooglePrivate::init(const KURL& base,
173 const String& relative,
174 const TextEncoding* queryEncoding)
175 {
176 init(base, relative.characters(), relative.length(), queryEncoding);
177 }
178
179 // Note: code mostly duplicated below.
init(const KURL & base,const char * rel,int relLength,const TextEncoding * queryEncoding)180 void KURLGooglePrivate::init(const KURL& base, const char* rel, int relLength,
181 const TextEncoding* queryEncoding)
182 {
183 // As a performance optimization, we do not use the charset converter if
184 // encoding is UTF-8 or other Unicode encodings. Note that this is
185 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be
186 // more efficient with no charset converter object because it
187 // can do UTF-8 internally with no extra copies.
188
189 // We feel free to make the charset converter object every time since it's
190 // just a wrapper around a reference.
191 KURLCharsetConverter charsetConverterObject(queryEncoding);
192 KURLCharsetConverter* charsetConverter =
193 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
194 &charsetConverterObject;
195
196 url_canon::RawCanonOutputT<char> output;
197 const CString& baseStr = base.m_url.utf8String();
198 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
199 base.m_url.m_parsed, rel, relLength,
200 charsetConverter,
201 &output, &m_parsed);
202
203 // See FIXME in KURLGooglePrivate in the header. If canonicalization has not
204 // changed the string, we can avoid an extra allocation by using assignment.
205 //
206 // When KURL encounters an error such that the URL is invalid and empty
207 // (for example, resolving a relative URL on a non-hierarchical base), it
208 // will produce an isNull URL, and calling setUtf8 will produce an empty
209 // non-null URL. This is unlikely to affect anything, but we preserve this
210 // just in case.
211 if (m_isValid || output.length()) {
212 // Without ref, the whole url is guaranteed to be ASCII-only.
213 if (m_parsed.ref.is_nonempty())
214 setUtf8(CString(output.data(), output.length()));
215 else
216 setAscii(CString(output.data(), output.length()));
217 } else {
218 // WebCore expects resolved URLs to be empty rather than NULL.
219 setUtf8(CString("", 0));
220 }
221 }
222
223 // Note: code mostly duplicated above. See FIXMEs and comments there.
init(const KURL & base,const UChar * rel,int relLength,const TextEncoding * queryEncoding)224 void KURLGooglePrivate::init(const KURL& base, const UChar* rel, int relLength,
225 const TextEncoding* queryEncoding)
226 {
227 KURLCharsetConverter charsetConverterObject(queryEncoding);
228 KURLCharsetConverter* charsetConverter =
229 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
230 &charsetConverterObject;
231
232 url_canon::RawCanonOutputT<char> output;
233 const CString& baseStr = base.m_url.utf8String();
234 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
235 base.m_url.m_parsed, rel, relLength,
236 charsetConverter,
237 &output, &m_parsed);
238
239
240 if (m_isValid || output.length()) {
241 if (m_parsed.ref.is_nonempty())
242 setUtf8(CString(output.data(), output.length()));
243 else
244 setAscii(CString(output.data(), output.length()));
245 } else
246 setUtf8(CString("", 0));
247 }
248
initProtocolInHTTPFamily()249 void KURLGooglePrivate::initProtocolInHTTPFamily()
250 {
251 if (!m_isValid) {
252 m_protocolInHTTPFamily = false;
253 return;
254 }
255
256 const char* scheme = m_utf8.data() + m_parsed.scheme.begin;
257 if (m_parsed.scheme.len == 4)
258 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http");
259 else if (m_parsed.scheme.len == 5)
260 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https");
261 else
262 m_protocolInHTTPFamily = false;
263 }
264
copyTo(KURLGooglePrivate * dest) const265 void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const
266 {
267 dest->m_isValid = m_isValid;
268 dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily;
269 dest->m_parsed = m_parsed;
270
271 // Don't copy the 16-bit string since that will be regenerated as needed.
272 dest->m_utf8 = CString(m_utf8.data(), m_utf8.length());
273 dest->m_utf8IsASCII = m_utf8IsASCII;
274 dest->m_stringIsValid = false;
275 }
276
componentString(const url_parse::Component & comp) const277 String KURLGooglePrivate::componentString(const url_parse::Component& comp) const
278 {
279 if (!m_isValid || comp.len <= 0) {
280 // KURL returns a NULL string if the URL is itself a NULL string, and an
281 // empty string for other nonexistant entities.
282 if (utf8String().isNull())
283 return String();
284 return String("", 0);
285 }
286 // begin and len are in terms of bytes which do not match
287 // if string() is UTF-16 and input contains non-ASCII characters.
288 // However, the only part in urlString that can contain non-ASCII
289 // characters is 'ref' at the end of the string. In that case,
290 // begin will always match the actual value and len (in terms of
291 // byte) will be longer than what's needed by 'mid'. However, mid
292 // truncates len to avoid go past the end of a string so that we can
293 // get away withtout doing anything here.
294 return string().substring(comp.begin, comp.len);
295 }
296
replaceComponents(const Replacements & replacements)297 void KURLGooglePrivate::replaceComponents(const Replacements& replacements)
298 {
299 url_canon::RawCanonOutputT<char> output;
300 url_parse::Parsed newParsed;
301
302 m_isValid = url_util::ReplaceComponents(utf8String().data(),
303 utf8String().length(), m_parsed, replacements, 0, &output, &newParsed);
304
305 m_parsed = newParsed;
306 if (m_parsed.ref.is_nonempty())
307 setUtf8(CString(output.data(), output.length()));
308 else
309 setAscii(CString(output.data(), output.length()));
310 }
311
string() const312 const String& KURLGooglePrivate::string() const
313 {
314 if (!m_stringIsValid) {
315 // Must special case the NULL case, since constructing the
316 // string like we do below will generate an empty rather than
317 // a NULL string.
318 if (m_utf8.isNull())
319 m_string = String();
320 else if (m_utf8IsASCII)
321 m_string = String(m_utf8.data(), m_utf8.length());
322 else
323 m_string = String::fromUTF8(m_utf8.data(), m_utf8.length());
324 m_stringIsValid = true;
325 }
326 return m_string;
327 }
328
329 // KURL ------------------------------------------------------------------------
330
331 // Creates with NULL-terminated string input representing an absolute URL.
332 // WebCore generally calls this only with hardcoded strings, so the input is
333 // ASCII. We treat is as UTF-8 just in case.
KURL(const char * url)334 KURL::KURL(const char *url)
335 {
336 // FIXME The Mac code checks for beginning with a slash and converting to a
337 // file: URL. We will want to add this as well once we can compile on a
338 // system like that.
339 m_url.init(KURL(), url, strlen(url), 0);
340
341 // The one-argument constructors should never generate a NULL string.
342 // This is a funny quirk of KURL.cpp (probably a bug) which we preserve.
343 if (m_url.utf8String().isNull())
344 m_url.setAscii(CString("", 0));
345 }
346
347 // Initializes with a string representing an absolute URL. No encoding
348 // information is specified. This generally happens when a KURL is converted
349 // to a string and then converted back. In this case, the URL is already
350 // canonical and in proper escaped form so needs no encoding. We treat it was
351 // UTF-8 just in case.
KURL(const String & url)352 KURL::KURL(const String& url)
353 {
354 if (!url.isNull())
355 m_url.init(KURL(), url, 0);
356 else {
357 // WebCore expects us to preserve the nullness of strings when this
358 // constructor is used. In all other cases, it expects a non-null
359 // empty string, which is what init() will create.
360 m_url.m_isValid = false;
361 m_url.m_protocolInHTTPFamily = false;
362 }
363 }
364
365 // Constructs a new URL given a base URL and a possibly relative input URL.
366 // This assumes UTF-8 encoding.
KURL(const KURL & base,const String & relative)367 KURL::KURL(const KURL& base, const String& relative)
368 {
369 m_url.init(base, relative, 0);
370 }
371
372 // Constructs a new URL given a base URL and a possibly relative input URL.
373 // Any query portion of the relative URL will be encoded in the given encoding.
KURL(const KURL & base,const String & relative,const TextEncoding & encoding)374 KURL::KURL(const KURL& base,
375 const String& relative,
376 const TextEncoding& encoding)
377 {
378 m_url.init(base, relative, &encoding.encodingForFormSubmission());
379 }
380
KURL(const CString & canonicalSpec,const url_parse::Parsed & parsed,bool isValid)381 KURL::KURL(const CString& canonicalSpec,
382 const url_parse::Parsed& parsed, bool isValid)
383 : m_url(parsed, isValid)
384 {
385 // We know the reference fragment is the only part that can be UTF-8, so
386 // we know it's ASCII when there is no ref.
387 if (parsed.ref.is_nonempty())
388 m_url.setUtf8(canonicalSpec);
389 else
390 m_url.setAscii(canonicalSpec);
391 }
392
393 #if PLATFORM(CF)
KURL(CFURLRef)394 KURL::KURL(CFURLRef)
395 {
396 notImplemented();
397 invalidate();
398 }
399
createCFURL() const400 CFURLRef KURL::createCFURL() const
401 {
402 notImplemented();
403 return 0;
404 }
405 #endif
406
copy() const407 KURL KURL::copy() const
408 {
409 KURL result = *this;
410 m_url.copyTo(&result.m_url);
411 return result;
412 }
413
isNull() const414 bool KURL::isNull() const
415 {
416 return m_url.utf8String().isNull();
417 }
418
isEmpty() const419 bool KURL::isEmpty() const
420 {
421 return !m_url.utf8String().length();
422 }
423
isValid() const424 bool KURL::isValid() const
425 {
426 return m_url.m_isValid;
427 }
428
protocolInHTTPFamily() const429 bool KURL::protocolInHTTPFamily() const
430 {
431 return m_url.m_protocolInHTTPFamily;
432 }
433
hasPath() const434 bool KURL::hasPath() const
435 {
436 // Note that http://www.google.com/" has a path, the path is "/". This can
437 // return false only for invalid or nonstandard URLs.
438 return m_url.m_parsed.path.len >= 0;
439 }
440
441 // We handle "parameters" separated by a semicolon, while KURL.cpp does not,
442 // which can lead to different results in some cases.
lastPathComponent() const443 String KURL::lastPathComponent() const
444 {
445 // When the output ends in a slash, WebCore has different expectations than
446 // the GoogleURL library. For "/foo/bar/" the library will return the empty
447 // string, but WebCore wants "bar".
448 url_parse::Component path = m_url.m_parsed.path;
449 if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/')
450 path.len--;
451
452 url_parse::Component file;
453 url_parse::ExtractFileName(m_url.utf8String().data(), path, &file);
454
455 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
456 // a null string when the path is empty, which we duplicate here.
457 if (!file.is_nonempty())
458 return String();
459 return m_url.componentString(file);
460 }
461
protocol() const462 String KURL::protocol() const
463 {
464 return m_url.componentString(m_url.m_parsed.scheme);
465 }
466
host() const467 String KURL::host() const
468 {
469 // Note: KURL.cpp unescapes here.
470 return m_url.componentString(m_url.m_parsed.host);
471 }
472
473 // Returns 0 when there is no port or it is invalid.
474 //
475 // We treat URL's with out-of-range port numbers as invalid URLs, and they will
476 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but
477 // return 0 from this port() function, so we mirror that behavior here.
port() const478 unsigned short KURL::port() const
479 {
480 if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0)
481 return 0;
482 int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port);
483 if (port == url_parse::PORT_UNSPECIFIED)
484 return 0;
485 return static_cast<unsigned short>(port);
486 }
487
488 // Returns the empty string if there is no password.
pass() const489 String KURL::pass() const
490 {
491 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
492 // a null string when the password is empty, which we duplicate here.
493 if (!m_url.m_parsed.password.is_nonempty())
494 return String();
495
496 // Note: KURL.cpp unescapes here.
497 return m_url.componentString(m_url.m_parsed.password);
498 }
499
500 // Returns the empty string if there is no username.
user() const501 String KURL::user() const
502 {
503 // Note: KURL.cpp unescapes here.
504 return m_url.componentString(m_url.m_parsed.username);
505 }
506
fragmentIdentifier() const507 String KURL::fragmentIdentifier() const
508 {
509 // Empty but present refs ("foo.com/bar#") should result in the empty
510 // string, which m_url.componentString will produce. Nonexistant refs should be
511 // the NULL string.
512 if (!m_url.m_parsed.ref.is_valid())
513 return String();
514
515 // Note: KURL.cpp unescapes here.
516 return m_url.componentString(m_url.m_parsed.ref);
517 }
518
hasFragmentIdentifier() const519 bool KURL::hasFragmentIdentifier() const
520 {
521 // Note: KURL.cpp unescapes here.
522 // FIXME determine if KURL.cpp agrees about an empty ref
523 return m_url.m_parsed.ref.len >= 0;
524 }
525
baseAsString() const526 String KURL::baseAsString() const
527 {
528 // FIXME: There is probably a more efficient way to do this?
529 return string().left(pathAfterLastSlash());
530 }
531
query() const532 String KURL::query() const
533 {
534 if (m_url.m_parsed.query.len >= 0)
535 return m_url.componentString(m_url.m_parsed.query);
536
537 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
538 // an empty string when the query is empty rather than a null (not sure
539 // which is right).
540 return String("", 0);
541 }
542
path() const543 String KURL::path() const
544 {
545 // Note: KURL.cpp unescapes here.
546 return m_url.componentString(m_url.m_parsed.path);
547 }
548
setProtocol(const String & protocol)549 void KURL::setProtocol(const String& protocol)
550 {
551 KURLGooglePrivate::Replacements replacements;
552 replacements.SetScheme(CharactersOrEmpty(protocol),
553 url_parse::Component(0, protocol.length()));
554 m_url.replaceComponents(replacements);
555 }
556
setHost(const String & host)557 void KURL::setHost(const String& host)
558 {
559 KURLGooglePrivate::Replacements replacements;
560 replacements.SetHost(CharactersOrEmpty(host),
561 url_parse::Component(0, host.length()));
562 m_url.replaceComponents(replacements);
563 }
564
565 // This function is used only in the JSC build.
setHostAndPort(const String & s)566 void KURL::setHostAndPort(const String& s)
567 {
568 String newhost = s.left(s.find(":"));
569 String newport = s.substring(s.find(":") + 1);
570
571 KURLGooglePrivate::Replacements replacements;
572 // Host can't be removed, so we always set.
573 replacements.SetHost(CharactersOrEmpty(newhost),
574 url_parse::Component(0, newhost.length()));
575
576 if (newport.isEmpty()) // Port may be removed, so we support clearing.
577 replacements.ClearPort();
578 else
579 replacements.SetPort(CharactersOrEmpty(newport), url_parse::Component(0, newport.length()));
580 m_url.replaceComponents(replacements);
581 }
582
setPort(unsigned short i)583 void KURL::setPort(unsigned short i)
584 {
585 KURLGooglePrivate::Replacements replacements;
586 String portStr;
587 if (i) {
588 portStr = String::number(static_cast<int>(i));
589 replacements.SetPort(
590 reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()),
591 url_parse::Component(0, portStr.length()));
592
593 } else {
594 // Clear any existing port when it is set to 0.
595 replacements.ClearPort();
596 }
597 m_url.replaceComponents(replacements);
598 }
599
setUser(const String & user)600 void KURL::setUser(const String& user)
601 {
602 // This function is commonly called to clear the username, which we
603 // normally don't have, so we optimize this case.
604 if (user.isEmpty() && !m_url.m_parsed.username.is_valid())
605 return;
606
607 // The canonicalizer will clear any usernames that are empty, so we
608 // don't have to explicitly call ClearUsername() here.
609 KURLGooglePrivate::Replacements replacements;
610 replacements.SetUsername(CharactersOrEmpty(user),
611 url_parse::Component(0, user.length()));
612 m_url.replaceComponents(replacements);
613 }
614
setPass(const String & pass)615 void KURL::setPass(const String& pass)
616 {
617 // This function is commonly called to clear the password, which we
618 // normally don't have, so we optimize this case.
619 if (pass.isEmpty() && !m_url.m_parsed.password.is_valid())
620 return;
621
622 // The canonicalizer will clear any passwords that are empty, so we
623 // don't have to explicitly call ClearUsername() here.
624 KURLGooglePrivate::Replacements replacements;
625 replacements.SetPassword(CharactersOrEmpty(pass),
626 url_parse::Component(0, pass.length()));
627 m_url.replaceComponents(replacements);
628 }
629
setFragmentIdentifier(const String & s)630 void KURL::setFragmentIdentifier(const String& s)
631 {
632 // This function is commonly called to clear the ref, which we
633 // normally don't have, so we optimize this case.
634 if (s.isNull() && !m_url.m_parsed.ref.is_valid())
635 return;
636
637 KURLGooglePrivate::Replacements replacements;
638 if (s.isNull())
639 replacements.ClearRef();
640 else
641 replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length()));
642 m_url.replaceComponents(replacements);
643 }
644
removeFragmentIdentifier()645 void KURL::removeFragmentIdentifier()
646 {
647 KURLGooglePrivate::Replacements replacements;
648 replacements.ClearRef();
649 m_url.replaceComponents(replacements);
650 }
651
setQuery(const String & query)652 void KURL::setQuery(const String& query)
653 {
654 KURLGooglePrivate::Replacements replacements;
655 if (query.isNull()) {
656 // KURL.cpp sets to NULL to clear any query.
657 replacements.ClearQuery();
658 } else if (query.length() > 0 && query[0] == '?') {
659 // WebCore expects the query string to begin with a question mark, but
660 // GoogleURL doesn't. So we trim off the question mark when setting.
661 replacements.SetQuery(CharactersOrEmpty(query),
662 url_parse::Component(1, query.length() - 1));
663 } else {
664 // When set with the empty string or something that doesn't begin with
665 // a question mark, KURL.cpp will add a question mark for you. The only
666 // way this isn't compatible is if you call this function with an empty
667 // string. KURL.cpp will leave a '?' with nothing following it in the
668 // URL, whereas we'll clear it.
669 // FIXME We should eliminate this difference.
670 replacements.SetQuery(CharactersOrEmpty(query),
671 url_parse::Component(0, query.length()));
672 }
673 m_url.replaceComponents(replacements);
674 }
675
setPath(const String & path)676 void KURL::setPath(const String& path)
677 {
678 // Empty paths will be canonicalized to "/", so we don't have to worry
679 // about calling ClearPath().
680 KURLGooglePrivate::Replacements replacements;
681 replacements.SetPath(CharactersOrEmpty(path),
682 url_parse::Component(0, path.length()));
683 m_url.replaceComponents(replacements);
684 }
685
686 // On Mac, this just seems to return the same URL, but with "/foo/bar" for
687 // file: URLs instead of file:///foo/bar. We don't bother with any of this,
688 // at least for now.
prettyURL() const689 String KURL::prettyURL() const
690 {
691 if (!m_url.m_isValid)
692 return String();
693 return m_url.string();
694 }
695
protocolIsJavaScript(const String & url)696 bool protocolIsJavaScript(const String& url)
697 {
698 return protocolIs(url, "javascript");
699 }
700
701 // We copied the KURL version here on Sept 12, 2008 while doing a WebKit
702 // merge.
703 //
704 // FIXME Somehow share this with KURL? Like we'd theoretically merge with
705 // decodeURLEscapeSequences below?
mimeTypeFromDataURL(const String & url)706 String mimeTypeFromDataURL(const String& url)
707 {
708 ASSERT(protocolIs(url, "data"));
709 int index = url.find(';');
710 if (index == -1)
711 index = url.find(',');
712 if (index != -1) {
713 int len = index - 5;
714 if (len > 0)
715 return url.substring(5, len);
716 return "text/plain"; // Data URLs with no MIME type are considered text/plain.
717 }
718 return "";
719 }
720
// Convenience overload that forwards to the two-argument version with the
// UTF-8 encoding.
String decodeURLEscapeSequences(const String& str)
{
    const TextEncoding& utf8 = UTF8Encoding();
    return decodeURLEscapeSequences(str, utf8);
}
725
726 // In KURL.cpp's implementation, this is called by every component getter.
727 // It will unescape every character, including NULL. This is scary, and may
728 // cause security holes. We never call this function for components, and
729 // just return the ASCII versions instead.
730 //
731 // However, this static function is called directly in some cases. It appears
732 // that this only happens for javascript: URLs, so this is essentially the
733 // JavaScript URL decoder. It assumes UTF-8 encoding.
734 //
735 // IE doesn't unescape %00, forcing you to use \x00 in JS strings, so we do
736 // the same. This also eliminates NULL-related problems should a consumer
737 // incorrectly call this function for non-JavaScript.
738 //
739 // FIXME These should be merged to the KURL.cpp implementation.
decodeURLEscapeSequences(const String & str,const TextEncoding & encoding)740 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding)
741 {
742 // FIXME We can probably use KURL.cpp's version of this function
743 // without modification. However, I'm concerned about
744 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old
745 // custom code for now. Using their version will also fix the bug that
746 // we ignore the encoding.
747 //
748 // FIXME b/1350291: This does not get called very often. We just convert
749 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of
750 // sucks, and we don't use the encoding properly, which will make some
751 // obscure anchor navigations fail.
752 CString cstr = str.utf8();
753
754 const char* input = cstr.data();
755 int inputLength = cstr.length();
756 url_canon::RawCanonOutputT<char> unescaped;
757 for (int i = 0; i < inputLength; i++) {
758 if (input[i] == '%') {
759 unsigned char ch;
760 if (url_canon::DecodeEscaped(input, &i, inputLength, &ch)) {
761 if (!ch) {
762 // Never unescape NULLs.
763 unescaped.push_back('%');
764 unescaped.push_back('0');
765 unescaped.push_back('0');
766 } else
767 unescaped.push_back(ch);
768 } else {
769 // Invalid escape sequence, copy the percent literal.
770 unescaped.push_back('%');
771 }
772 } else {
773 // Regular non-escaped 8-bit character.
774 unescaped.push_back(input[i]);
775 }
776 }
777
778 // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
779 // JavaScript URLs, but Firefox and Safari do.
780 url_canon::RawCanonOutputT<url_parse::UTF16Char> utf16;
781 for (int i = 0; i < unescaped.length(); i++) {
782 unsigned char uch = static_cast<unsigned char>(unescaped.at(i));
783 if (uch < 0x80) {
784 // Non-UTF-8, just append directly
785 utf16.push_back(uch);
786 } else {
787 // next_ch will point to the last character of the decoded
788 // character.
789 int nextCharacter = i;
790 unsigned codePoint;
791 if (url_canon::ReadUTFChar(unescaped.data(), &nextCharacter,
792 unescaped.length(), &codePoint)) {
793 // Valid UTF-8 character, convert to UTF-16.
794 url_canon::AppendUTF16Value(codePoint, &utf16);
795 i = nextCharacter;
796 } else {
797 // KURL.cpp strips any sequences that are not valid UTF-8. This
798 // sounds scary. Instead, we just keep those invalid code
799 // points and promote to UTF-16. We copy all characters from
800 // the current position to the end of the identified sqeuqnce.
801 while (i < nextCharacter) {
802 utf16.push_back(static_cast<unsigned char>(unescaped.at(i)));
803 i++;
804 }
805 utf16.push_back(static_cast<unsigned char>(unescaped.at(i)));
806 }
807 }
808 }
809
810 return String(reinterpret_cast<UChar*>(utf16.data()), utf16.length());
811 }
812
protocolIs(const char * protocol) const813 bool KURL::protocolIs(const char* protocol) const
814 {
815 assertProtocolIsGood(protocol);
816
817 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
818 // The free function protocolIsJavaScript() should be used instead.
819 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript"));
820
821 if (m_url.m_parsed.scheme.len <= 0)
822 return !protocol;
823 return lowerCaseEqualsASCII(
824 m_url.utf8String().data() + m_url.m_parsed.scheme.begin,
825 m_url.utf8String().data() + m_url.m_parsed.scheme.end(),
826 protocol);
827 }
828
isLocalFile() const829 bool KURL::isLocalFile() const
830 {
831 return protocolIs("file");
832 }
833
834 // This is called to escape a URL string. It is only used externally when
835 // constructing mailto: links to set the query section. Since our query setter
836 // will automatically do the correct escaping, this function does not have to
837 // do any work.
838 //
839 // There is a possibility that a future called may use this function in other
840 // ways, and may expect to get a valid URL string. The dangerous thing we want
841 // to protect against here is accidentally getting NULLs in a string that is
842 // not supposed to have NULLs. Therefore, we escape NULLs here to prevent this.
encodeWithURLEscapeSequences(const String & notEncodedString)843 String encodeWithURLEscapeSequences(const String& notEncodedString)
844 {
845 CString utf8 = UTF8Encoding().encode(
846 reinterpret_cast<const UChar*>(notEncodedString.characters()),
847 notEncodedString.length(),
848 URLEncodedEntitiesForUnencodables);
849 const char* input = utf8.data();
850 int inputLength = utf8.length();
851
852 Vector<char, 2048> buffer;
853 for (int i = 0; i < inputLength; i++) {
854 if (!input[i])
855 buffer.append("%00", 3);
856 else
857 buffer.append(input[i]);
858 }
859 return String(buffer.data(), buffer.size());
860 }
861
isHierarchical() const862 bool KURL::isHierarchical() const
863 {
864 if (!m_url.m_parsed.scheme.is_nonempty())
865 return false;
866 return url_util::IsStandard(
867 &m_url.utf8String().data()[m_url.m_parsed.scheme.begin],
868 m_url.utf8String().length(),
869 m_url.m_parsed.scheme);
870 }
871
872 #ifndef NDEBUG
print() const873 void KURL::print() const
874 {
875 printf("%s\n", m_url.utf8String().data());
876 }
877 #endif
878
invalidate()879 void KURL::invalidate()
880 {
881 // This is only called from the constructor so resetting the (automatically
882 // initialized) string and parsed structure would be a waste of time.
883 m_url.m_isValid = false;
884 m_url.m_protocolInHTTPFamily = false;
885 }
886
887 // Equal up to reference fragments, if any.
equalIgnoringFragmentIdentifier(const KURL & a,const KURL & b)888 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
889 {
890 // Compute the length of each URL without its ref. Note that the reference
891 // begin (if it exists) points to the character *after* the '#', so we need
892 // to subtract one.
893 int aLength = a.m_url.utf8String().length();
894 if (a.m_url.m_parsed.ref.len >= 0)
895 aLength = a.m_url.m_parsed.ref.begin - 1;
896
897 int bLength = b.m_url.utf8String().length();
898 if (b.m_url.m_parsed.ref.len >= 0)
899 bLength = b.m_url.m_parsed.ref.begin - 1;
900
901 return aLength == bLength
902 && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength);
903 }
904
hostStart() const905 unsigned KURL::hostStart() const
906 {
907 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false);
908 }
909
hostEnd() const910 unsigned KURL::hostEnd() const
911 {
912 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true);
913 }
914
pathStart() const915 unsigned KURL::pathStart() const
916 {
917 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
918 }
919
pathEnd() const920 unsigned KURL::pathEnd() const
921 {
922 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true);
923 }
924
pathAfterLastSlash() const925 unsigned KURL::pathAfterLastSlash() const
926 {
927 // When there's no path, ask for what would be the beginning of it.
928 if (!m_url.m_parsed.path.is_valid())
929 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
930
931 url_parse::Component filename;
932 url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path,
933 &filename);
934 return filename.begin;
935 }
936
blankURL()937 const KURL& blankURL()
938 {
939 static KURL staticBlankURL("about:blank");
940 return staticBlankURL;
941 }
942
protocolIs(const String & url,const char * protocol)943 bool protocolIs(const String& url, const char* protocol)
944 {
945 // Do the comparison without making a new string object.
946 assertProtocolIsGood(protocol);
947 for (int i = 0; ; ++i) {
948 if (!protocol[i])
949 return url[i] == ':';
950 if (toASCIILower(url[i]) != protocol[i])
951 return false;
952 }
953 }
954
protocolIs(const String & string,const char * protocol)955 inline bool KURL::protocolIs(const String& string, const char* protocol)
956 {
957 return WebCore::protocolIs(string, protocol);
958 }
959
protocolHostAndPortAreEqual(const KURL & a,const KURL & b)960 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
961 {
962 if (a.parsed().scheme.end() != b.parsed().scheme.end())
963 return false;
964
965 int hostStartA = a.hostStart();
966 int hostStartB = b.hostStart();
967 if (a.hostEnd() - hostStartA != b.hostEnd() - hostStartB)
968 return false;
969
970 // Check the scheme
971 for (int i = 0; i < a.parsed().scheme.end(); ++i)
972 if (a.string()[i] != b.string()[i])
973 return false;
974
975 // And the host
976 for (int i = hostStartA; i < static_cast<int>(a.hostEnd()); ++i)
977 if (a.string()[i] != b.string()[i])
978 return false;
979
980 if (a.port() != b.port())
981 return false;
982
983 return true;
984 }
985
986 } // namespace WebCore
987
988 #endif // USE(GOOGLEURL)
989