• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
6 
7 #include "base/files/file_util.h"
8 #include "base/logging.h"
9 #include "base/strings/string_number_conversions.h"
10 #include "base/strings/string_util.h"
11 #include "url/gurl.h"
12 #include "url/url_parse.h"
13 
namespace {

// Marker lines in the input data. NormalizeDataToRuleMap() compares the start
// of each line against these to switch the "private" state on and off.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags that WriteRules() combines into the numeric "type" column of each
// emitted rule (1: exception, 2: wildcard, 4: private).
const int kExceptionRule = 1 << 0;
const int kWildcardRule = 1 << 1;
const int kPrivateRule = 1 << 2;

}  // namespace
23 
24 namespace net {
25 namespace tld_cleanup {
26 
27 // Writes the list of domain rules contained in the 'rules' set to the
28 // 'outfile', with each rule terminated by a LF.  The file must already have
29 // been created with write access.
WriteRules(const RuleMap & rules,const base::FilePath & outfile)30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
31   std::string data;
32   data.append("%{\n"
33               "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
34               "// Use of this source code is governed by a BSD-style license "
35               "that can be\n"
36               "// found in the LICENSE file.\n\n"
37               "// This file is generated by net/tools/tld_cleanup/.\n"
38               "// DO NOT MANUALLY EDIT!\n"
39               "%}\n"
40               "struct DomainRule {\n"
41               "  int name_offset;\n"
42               "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
43               "};\n"
44               "%%\n");
45 
46   for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
47     data.append(i->first);
48     data.append(", ");
49     int type = 0;
50     if (i->second.exception) {
51       type = kExceptionRule;
52     } else if (i->second.wildcard) {
53       type = kWildcardRule;
54     }
55     if (i->second.is_private) {
56       type += kPrivateRule;
57     }
58     data.append(base::IntToString(type));
59     data.append("\n");
60   }
61 
62   data.append("%%\n");
63 
64   int written = base::WriteFile(outfile,
65                                      data.data(),
66                                      static_cast<int>(data.size()));
67 
68   return written == static_cast<int>(data.size());
69 }
70 
71 // Adjusts the rule to a standard form: removes single extraneous dots and
72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
73 // valid; logs a warning and returns kWarning if it is probably invalid; and
74 // logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeRule(std::string * domain,Rule * rule)75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
76   NormalizeResult result = kSuccess;
77 
78   // Strip single leading and trailing dots.
79   if (domain->at(0) == '.')
80     domain->erase(0, 1);
81   if (domain->empty()) {
82     LOG(WARNING) << "Ignoring empty rule";
83     return kWarning;
84   }
85   if (domain->at(domain->size() - 1) == '.')
86     domain->erase(domain->size() - 1, 1);
87   if (domain->empty()) {
88     LOG(WARNING) << "Ignoring empty rule";
89     return kWarning;
90   }
91 
92   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
93   size_t start_offset = 0;
94   if (domain->at(0) == '!') {
95     domain->erase(0, 1);
96     rule->exception = true;
97   } else if (domain->find("*.") == 0) {
98     domain->erase(0, 2);
99     rule->wildcard = true;
100   }
101   if (domain->empty()) {
102     LOG(WARNING) << "Ignoring empty rule";
103     return kWarning;
104   }
105 
106   // Warn about additional '*.' or '!'.
107   if (domain->find("*.", start_offset) != std::string::npos ||
108       domain->find('!', start_offset) != std::string::npos) {
109     LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
110     result = kWarning;
111   }
112 
113   // Make a GURL and normalize it, then get the host back out.
114   std::string url = "http://";
115   url.append(*domain);
116   GURL gurl(url);
117   const std::string& spec = gurl.possibly_invalid_spec();
118   url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
119   if (host.len < 0) {
120     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
121     return kError;
122   }
123   if (!gurl.is_valid()) {
124     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
125     result = kWarning;
126   }
127   domain->assign(spec.substr(host.begin, host.len));
128 
129   return result;
130 }
131 
NormalizeDataToRuleMap(const std::string data,RuleMap * rules)132 NormalizeResult NormalizeDataToRuleMap(const std::string data,
133                                        RuleMap* rules) {
134   CHECK(rules);
135   // We do a lot of string assignment during parsing, but simplicity is more
136   // important than performance here.
137   std::string domain;
138   NormalizeResult result = kSuccess;
139   size_t line_start = 0;
140   size_t line_end = 0;
141   bool is_private = false;
142   RuleMap extra_rules;
143   int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
144   int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
145   while (line_start < data.size()) {
146     if (line_start + begin_private_length < data.size() &&
147         !data.compare(line_start, begin_private_length,
148                       kBeginPrivateDomainsComment)) {
149       is_private = true;
150       line_end = line_start + begin_private_length;
151     } else if (line_start + end_private_length < data.size() &&
152         !data.compare(line_start, end_private_length,
153                       kEndPrivateDomainsComment)) {
154       is_private = false;
155       line_end = line_start + end_private_length;
156     } else if (line_start + 1 < data.size() &&
157         data[line_start] == '/' &&
158         data[line_start + 1] == '/') {
159       // Skip comments.
160       line_end = data.find_first_of("\r\n", line_start);
161       if (line_end == std::string::npos)
162         line_end = data.size();
163     } else {
164       // Truncate at first whitespace.
165       line_end = data.find_first_of("\r\n \t", line_start);
166       if (line_end == std::string::npos)
167         line_end = data.size();
168       domain.assign(data.data(), line_start, line_end - line_start);
169 
170       Rule rule;
171       rule.wildcard = false;
172       rule.exception = false;
173       rule.is_private = is_private;
174       NormalizeResult new_result = NormalizeRule(&domain, &rule);
175       if (new_result != kError) {
176         // Check the existing rules to make sure we don't have an exception and
177         // wildcard for the same rule, or that the same domain is listed as both
178         // private and not private. If we did, we'd have to update our
179         // parsing code to handle this case.
180         CHECK(rules->find(domain) == rules->end())
181             << "Duplicate rule found for " << domain;
182 
183         (*rules)[domain] = rule;
184         // Add true TLD for multi-level rules.  We don't add them right now, in
185         // case there's an exception or wild card that either exists or might be
186         // added in a later iteration.  In those cases, there's no need to add
187         // it and it would just slow down parsing the data.
188         size_t tld_start = domain.find_last_of('.');
189         if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
190           std::string extra_rule_domain = domain.substr(tld_start + 1);
191           RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
192           Rule extra_rule;
193           extra_rule.exception = false;
194           extra_rule.wildcard = false;
195           if (iter == extra_rules.end()) {
196             extra_rule.is_private = is_private;
197           } else {
198             // A rule already exists, so we ensure that if any of the entries is
199             // not private the result should be that the entry is not private.
200             // An example is .au which is not listed as a real TLD, but only
201             // lists second-level domains such as com.au. Subdomains of .au
202             // (eg. blogspot.com.au) are also listed in the private section,
203             // which is processed later, so this ensures that the real TLD
204             // (eg. .au) is listed as public.
205             extra_rule.is_private = is_private && iter->second.is_private;
206           }
207           extra_rules[extra_rule_domain] = extra_rule;
208         }
209       }
210       result = std::max(result, new_result);
211     }
212 
213     // Find beginning of next non-empty line.
214     line_start = data.find_first_of("\r\n", line_end);
215     if (line_start == std::string::npos)
216       line_start = data.size();
217     line_start = data.find_first_not_of("\r\n", line_start);
218     if (line_start == std::string::npos)
219       line_start = data.size();
220   }
221 
222   for (RuleMap::const_iterator iter = extra_rules.begin();
223        iter != extra_rules.end();
224        ++iter) {
225     if (rules->find(iter->first) == rules->end()) {
226       (*rules)[iter->first] = iter->second;
227     }
228   }
229 
230   return result;
231 }
232 
NormalizeFile(const base::FilePath & in_filename,const base::FilePath & out_filename)233 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
234                               const base::FilePath& out_filename) {
235   RuleMap rules;
236   std::string data;
237   if (!base::ReadFileToString(in_filename, &data)) {
238     LOG(ERROR) << "Unable to read file";
239     // We return success since we've already reported the error.
240     return kSuccess;
241   }
242 
243   NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
244 
245   if (!WriteRules(rules, out_filename)) {
246     LOG(ERROR) << "Error(s) writing output file";
247     result = kError;
248   }
249 
250   return result;
251 }
252 
253 
254 }  // namespace tld_cleanup
255 }  // namespace net
256