// Copyright 2013 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "net/tools/tld_cleanup/tld_cleanup_util.h" #include #include #include "base/containers/contains.h" #include "base/files/file_util.h" #include "base/logging.h" #include "base/ranges/algorithm.h" #include "base/strings/strcat.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "url/gurl.h" #include "url/third_party/mozilla/url_parse.h" namespace { const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; const int kExceptionRule = 1; const int kWildcardRule = 2; const int kPrivateRule = 4; } namespace net::tld_cleanup { std::string RulesToGperf(const RuleMap& rules) { std::string data; data.append("%{\n" "// Copyright 2012 The Chromium Authors\n" "// Use of this source code is governed by a BSD-style license " "that can be\n" "// found in the LICENSE file.\n\n" "// This file is generated by net/tools/tld_cleanup/.\n" "// DO NOT MANUALLY EDIT!\n" "%}\n" "struct DomainRule {\n" " int name_offset;\n" " int type; // flags: 1: exception, 2: wildcard, 4: private\n" "};\n" "%%\n"); for (const auto& [domain, rule] : rules) { data.append(domain); data.append(", "); int type = 0; if (rule.exception) { type = kExceptionRule; } else if (rule.wildcard) { type = kWildcardRule; } if (rule.is_private) { type += kPrivateRule; } data.append(base::NumberToString(type)); data.append("\n"); } data.append("%%\n"); return data; } // Adjusts the rule to a standard form: removes single extraneous dots and // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as // valid; logs a warning and returns kWarning if it is probably invalid; and // logs an error and returns kError if the rule is (almost) certainly invalid. NormalizeResult NormalizeRule(std::string& domain, Rule& rule) { NormalizeResult result = NormalizeResult::kSuccess; // Strip single leading and trailing dots. if (domain.starts_with(".")) { domain.erase(0, 1); } if (domain.ends_with(".")) { domain.pop_back(); } // Allow single leading '*.' or '!', saved here so it's not canonicalized. if (domain.starts_with("!")) { domain.erase(0, 1); rule.exception = true; } else if (domain.starts_with("*.")) { domain.erase(0, 2); rule.wildcard = true; } if (domain.empty()) { LOG(WARNING) << "Ignoring empty rule"; return NormalizeResult::kWarning; } // Warn about additional '*.' or '!'. if (base::Contains(domain, "*.") || base::Contains(domain, '!')) { LOG(WARNING) << "Keeping probably invalid rule: " << domain; result = NormalizeResult::kWarning; } // Make a GURL and normalize it, then get the host back out. GURL gurl(base::StrCat({"http://", domain})); const std::string& spec = gurl.possibly_invalid_spec(); url::Component host = gurl.parsed_for_possibly_invalid_spec().host; if (!host.is_valid()) { LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain; return NormalizeResult::kError; } if (!gurl.is_valid()) { LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain; result = NormalizeResult::kWarning; } domain.assign(spec.substr(host.begin, host.len)); return result; } NormalizeResult NormalizeDataToRuleMap(const std::string& data, RuleMap& rules) { // We do a lot of string assignment during parsing, but simplicity is more // important than performance here. NormalizeResult result = NormalizeResult::kSuccess; std::istringstream data_stream(data); bool in_private_section = false; RuleMap extra_rules; for (std::string line; std::getline(data_stream, line, '\n');) { if (line.starts_with(kBeginPrivateDomainsComment)) { in_private_section = true; continue; } if (line.starts_with(kEndPrivateDomainsComment)) { in_private_section = false; continue; } if (line.starts_with("//")) { // Skip comments. continue; } if (line.empty()) { continue; } // Truncate at first whitespace. if (size_t first_whitespace = line.find_first_of("\r\n \t"); first_whitespace != std::string::npos) { line.erase(first_whitespace); } std::string domain = line; Rule rule{/*exception=*/false, /*wildcard=*/false, /*is_private=*/in_private_section}; NormalizeResult new_result = NormalizeRule(domain, rule); result = std::max(result, new_result); if (new_result == NormalizeResult::kError) { continue; } // Check the existing rules to make sure we don't have an exception and // wildcard for the same rule, or that the same domain is listed as both // private and not private. If we did, we'd have to update our // parsing code to handle this case. CHECK(!base::Contains(rules, domain)) << "Duplicate rule found for " << domain; rules[domain] = rule; // Add true TLD for multi-level rules. We don't add them right now, in // case there's an exception or wild card that either exists or might be // added in a later iteration. In those cases, there's no need to add // it and it would just slow down parsing the data. size_t tld_start = domain.find_last_of('.'); if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { std::string extra_rule_domain = domain.substr(tld_start + 1); RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); // If a rule already exists, we ensure that if any of the entries is not // private the result should be that the entry is not private. An example // is .au which is not listed as a real TLD, but only lists second-level // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are // also listed in the private section, which is processed later, so this // ensures that the real TLD (eg. .au) is listed as public. bool is_private = in_private_section && (iter == extra_rules.end() || iter->second.is_private); extra_rules[extra_rule_domain] = Rule{/*exception=*/false, /*wildcard=*/false, is_private}; } } base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()), [&](const auto& extra_rule) { return !base::Contains(rules, extra_rule.first); }); return result; } NormalizeResult NormalizeFile(const base::FilePath& in_filename, const base::FilePath& out_filename) { RuleMap rules; std::string data; if (!base::ReadFileToString(in_filename, &data)) { LOG(ERROR) << "Unable to read file"; // We return success since we've already reported the error. return NormalizeResult::kSuccess; } NormalizeResult result = NormalizeDataToRuleMap(data, rules); if (!base::WriteFile(out_filename, RulesToGperf(rules))) { LOG(ERROR) << "Error(s) writing output file"; result = NormalizeResult::kError; } return result; } } // namespace net::tld_cleanup