// Copyright 2023 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Functions to canonicalize non-special URLs. #include "url/url_canon.h" #include "url/url_canon_internal.h" namespace url { namespace { template bool DoCanonicalizeNonSpecialURL(const URLComponentSource& source, const Parsed& parsed, CharsetConverter* query_converter, CanonOutput& output, Parsed& new_parsed) { // The implementation is similar to `DoCanonicalizeStandardURL()`, but there // are many subtle differences. So we have a different function for // canonicalizing non-special URLs. // // Since canonicalization is also used from url::ReplaceComponents(), // we have to handle an invalid URL replacement here, such as: // // > const url = "git:///"; // > url.username = "x"; // > url.href // "git:///" (this should not be "git://x@"). DCHECK(!parsed.has_opaque_path); // Scheme: this will append the colon. bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output, &new_parsed.scheme); bool have_authority = (parsed.username.is_valid() || parsed.password.is_valid() || parsed.host.is_valid() || parsed.port.is_valid()); // Non-special URL examples which should be carefully handled: // // | URL | parsed.user | parsed.host | have_authority | Valid URL? | // |----------+---------------+---------------+----------------+------------| // | git:/a | invalid | invalid | false | valid | // | git://@/ | valid (empty) | invalid | true | invalid | // | git:/// | invalid | valid (empty) | true | valid | if (have_authority) { // Only write the authority separators when we have a scheme. if (parsed.scheme.is_valid()) { output.push_back('/'); output.push_back('/'); } // Username and Password // // URL Standard: // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port // - https://url.spec.whatwg.org/#dom-url-username // - https://url.spec.whatwg.org/#dom-url-password if (parsed.host.is_nonempty()) { // User info: the canonicalizer will handle the : and @. success &= CanonicalizeUserInfo( source.username, parsed.username, source.password, parsed.password, &output, &new_parsed.username, &new_parsed.password); } else { new_parsed.username.reset(); new_parsed.password.reset(); } // Host if (parsed.host.is_valid()) { success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output, new_parsed.host); } else { new_parsed.host.reset(); // URL is invalid if `have_authority` is true, but `parsed.host` is // invalid. Example: "git://@/". success = false; } // Port // // URL Standard: // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port // - https://url.spec.whatwg.org/#dom-url-port if (parsed.host.is_nonempty()) { success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED, &output, &new_parsed.port); } else { new_parsed.port.reset(); } } else { // No authority, clear the components. new_parsed.host.reset(); new_parsed.username.reset(); new_parsed.password.reset(); new_parsed.port.reset(); } // Path if (parsed.path.is_valid()) { if (!parsed.host.is_valid() && parsed.path.is_empty()) { // Handle an edge case: Replacing non-special path-only URL's pathname // with an empty path. // // Path-only non-special URLs cannot have their paths erased. // // Example: // // > const url = new URL("git:/a"); // > url.pathname = ''; // > url.href // => The result should be "git:/", instead of "git:". // > url.pathname // => The result should be "/", instead of "". // // URL Standard is https://url.spec.whatwg.org/#dom-url-pathname, however, // it would take some time to understand why url.pathname ends up as "/" // in this case. Please read the URL Standard carefully to understand // that. new_parsed.path.begin = output.length(); output.push_back('/'); new_parsed.path.len = output.length() - new_parsed.path.begin; } else { success &= CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL, &output, &new_parsed.path); if (!parsed.host.is_valid() && new_parsed.path.is_valid() && new_parsed.path.as_string_view_on(output.view().data()) .starts_with("//")) { // To avoid path being treated as the host, prepend "/." to the path". // // Examples: // // > const url = new URL("git:/.//a"); // > url.href // => The result should be "git:/.//a", instead of "git://a". // // > const url = new URL("git:/"); // > url.pathname = "/.//a" // > url.href // => The result should be "git:/.//a", instead of "git://a". // // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer // // > 3. If url’s host is null, url does not have an opaque path, url’s // > path’s size is greater than 1, and url’s path[0] is the empty // > string, then append U+002F (/) followed by U+002E (.) to output. // // Since the path length is unknown in advance, we post-process the new // path here. This case is likely to be infrequent, so the performance // impact should be minimal. size_t prior_output_length = output.length(); output.Insert(new_parsed.path.begin, "/."); // Adjust path. new_parsed.path.begin += output.length() - prior_output_length; } } } else { new_parsed.path.reset(); } // Query CanonicalizeQuery(source.query, parsed.query, query_converter, &output, &new_parsed.query); // Ref: ignore failure for this, since the page can probably still be loaded. CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref); // Carry over the flag for potentially dangling markup: if (parsed.potentially_dangling_markup) { new_parsed.potentially_dangling_markup = true; } return success; } } // namespace bool CanonicalizeNonSpecialURL(const char* spec, int spec_len, const Parsed& parsed, CharsetConverter* query_converter, CanonOutput& output, Parsed& new_parsed) { // Carry over the flag. new_parsed.has_opaque_path = parsed.has_opaque_path; if (parsed.has_opaque_path) { return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed); } return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed, query_converter, output, new_parsed); } bool CanonicalizeNonSpecialURL(const char16_t* spec, int spec_len, const Parsed& parsed, CharsetConverter* query_converter, CanonOutput& output, Parsed& new_parsed) { // Carry over the flag. new_parsed.has_opaque_path = parsed.has_opaque_path; if (parsed.has_opaque_path) { return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed); } return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed, query_converter, output, new_parsed); } bool ReplaceNonSpecialURL(const char* base, const Parsed& base_parsed, const Replacements& replacements, CharsetConverter* query_converter, CanonOutput& output, Parsed& new_parsed) { // Carry over the flag. new_parsed.has_opaque_path = base_parsed.has_opaque_path; if (base_parsed.has_opaque_path) { return ReplacePathURL(base, base_parsed, replacements, &output, &new_parsed); } URLComponentSource source(base); Parsed parsed(base_parsed); SetupOverrideComponents(base, replacements, &source, &parsed); return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output, new_parsed); } // For 16-bit replacements, we turn all the replacements into UTF-8 so the // regular code path can be used. bool ReplaceNonSpecialURL(const char* base, const Parsed& base_parsed, const Replacements& replacements, CharsetConverter* query_converter, CanonOutput& output, Parsed& new_parsed) { // Carry over the flag. new_parsed.has_opaque_path = base_parsed.has_opaque_path; if (base_parsed.has_opaque_path) { return ReplacePathURL(base, base_parsed, replacements, &output, &new_parsed); } RawCanonOutput<1024> utf8; URLComponentSource source(base); Parsed parsed(base_parsed); SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output, new_parsed); } } // namespace url