• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // This command-line program converts an effective-TLD data file in UTF-8 from
6 // the format provided by Mozilla to the format expected by Chrome.  This
7 // program generates an intermediate file which is then used by gperf to
8 // generate a perfect hash map.  The benefit of this approach is that no time is
9 // spent on program initialization to generate the map of this data.
10 //
11 // Running this program finds "effective_tld_names.dat" in the expected location
12 // in the source checkout and generates "effective_tld_names.gperf" next to it.
13 //
14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
15 //
16 // In particular, it
17 //  * Strips blank lines and comments, as well as notes for individual rules.
18 //  * Strips a single leading and/or trailing dot from each rule, if present.
19 //  * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20 //    of the rule.  (This also catches multiple ! or *. at the start of a rule.)
21 //  * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22 //  * Canonicalizes each rule's domain by converting it to a GURL and back.
23 //  * Adds explicit rules for true TLDs found in any rule.
24 
25 #include <map>
26 #include <set>
27 #include <string>
28 
29 #include "base/at_exit.h"
30 #include "base/command_line.h"
31 #include "base/file_util.h"
32 #include "base/i18n/icu_util.h"
33 #include "base/logging.h"
34 #include "base/file_path.h"
35 #include "base/file_util.h"
36 #include "base/path_service.h"
37 #include "base/process_util.h"
38 #include "base/string_util.h"
39 #include "googleurl/src/gurl.h"
40 #include "googleurl/src/url_parse.h"
41 
namespace {

// Flags describing how a single effective-TLD rule was written in the input.
// WriteRules() emits them as the gperf "type" column: 1 for an exception
// rule, 2 for a wildcard rule, and 0 when neither flag is set.
struct Rule {
  // Start out as a plain rule so that a default-constructed Rule never
  // carries indeterminate flags.
  Rule() : exception(false), wildcard(false) {}

  bool exception;  // Set when the rule was written with a leading '!'.
  bool wildcard;   // Set when the rule was written with a leading '*.'.
};

// Maps a normalized domain to the flags of its rule.
typedef std::map<std::string, Rule> RuleMap;

// Set of domain strings; NormalizeFile() uses it to collect the bare TLDs of
// multi-level rules.
typedef std::set<std::string> RuleSet;

}  // namespace
51 
52 // Writes the list of domain rules contained in the 'rules' set to the
53 // 'outfile', with each rule terminated by a LF.  The file must already have
54 // been created with write access.
WriteRules(const RuleMap & rules,FilePath outfile)55 bool WriteRules(const RuleMap& rules, FilePath outfile) {
56   std::string data;
57   data.append(
58       "%{\n"
59       "// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n"
60       "// Use of this source code is governed by a BSD-style license that\n"
61       "// can be found in the LICENSE file.\n\n"
62       "// This file is generated by net/tools/tld_cleanup/.\n"
63       "// DO NOT MANUALLY EDIT!\n"
64       "%}\n"
65       "struct DomainRule {\n"
66       "  const char *name;\n"
67       "  int type;  // 1: exception, 2: wildcard\n"
68       "};\n"
69       "%%\n"
70   );
71 
72   for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
73     data.append(i->first);
74     data.append(", ");
75     if (i->second.exception) {
76       data.append("1");
77     } else if (i->second.wildcard) {
78       data.append("2");
79     } else {
80       data.append("0");
81     }
82     data.append("\n");
83   }
84 
85   data.append("%%\n");
86 
87   int written = file_util::WriteFile(outfile, data.data(), data.size());
88 
89   return written == static_cast<int>(data.size());
90 }
91 
// Outcome of normalizing one rule or a whole file.  The enumerators are listed
// from least to most severe so that results can be combined by keeping the
// larger value; preserve that ordering when adding new codes.
enum NormalizeResult {
  kSuccess = 0,
  kWarning = 1,
  kError = 2
};
98 
99 // Adjusts the rule to a standard form: removes single extraneous dots and
100 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
101 // valid; logs a warning and returns kWarning if it is probably invalid; and
102 // logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeRule(std::string * domain,Rule * rule)103 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
104   NormalizeResult result = kSuccess;
105 
106   // Strip single leading and trailing dots.
107   if (domain->at(0) == '.')
108     domain->erase(0, 1);
109   if (domain->empty()) {
110     LOG(WARNING) << "Ignoring empty rule";
111     return kWarning;
112   }
113   if (domain->at(domain->size() - 1) == '.')
114     domain->erase(domain->size() - 1, 1);
115   if (domain->empty()) {
116     LOG(WARNING) << "Ignoring empty rule";
117     return kWarning;
118   }
119 
120   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
121   size_t start_offset = 0;
122   if (domain->at(0) == '!') {
123     domain->erase(0, 1);
124     rule->exception = true;
125   } else if (domain->find("*.") == 0) {
126     domain->erase(0, 2);
127     rule->wildcard = true;
128   }
129   if (domain->empty()) {
130     LOG(WARNING) << "Ignoring empty rule";
131     return kWarning;
132   }
133 
134   // Warn about additional '*.' or '!'.
135   if (domain->find("*.", start_offset) != std::string::npos ||
136       domain->find('!', start_offset) != std::string::npos) {
137     LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
138     result = kWarning;
139   }
140 
141   // Make a GURL and normalize it, then get the host back out.
142   std::string url = "http://";
143   url.append(*domain);
144   GURL gurl(url);
145   const std::string& spec = gurl.possibly_invalid_spec();
146   url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
147   if (host.len < 0) {
148     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
149     return kError;
150   }
151   if (!gurl.is_valid()) {
152     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
153     result = kWarning;
154   }
155   domain->assign(spec.substr(host.begin, host.len));
156 
157   return result;
158 }
159 
160 // Loads the file described by 'in_filename', converts it to the desired format
161 // (see the file comments above), and saves it into 'out_filename'.  Returns
162 // the most severe of the result codes encountered when normalizing the rules.
NormalizeFile(const FilePath & in_filename,const FilePath & out_filename)163 NormalizeResult NormalizeFile(const FilePath& in_filename,
164                               const FilePath& out_filename) {
165   std::string data;
166   if (!file_util::ReadFileToString(in_filename, &data)) {
167     LOG(ERROR) << "Unable to read file";
168     // We return success since we've already reported the error.
169     return kSuccess;
170   }
171 
172   // We do a lot of string assignment during parsing, but simplicity is more
173   // important than performance here.
174   std::string domain;
175   NormalizeResult result = kSuccess;
176   size_t line_start = 0;
177   size_t line_end = 0;
178   RuleMap rules;
179   RuleSet extra_rules;
180   while (line_start < data.size()) {
181     // Skip comments.
182     if (line_start + 1 < data.size() &&
183         data[line_start] == '/' &&
184         data[line_start + 1] == '/') {
185       line_end = data.find_first_of("\r\n", line_start);
186       if (line_end == std::string::npos)
187         line_end = data.size();
188     } else {
189       // Truncate at first whitespace.
190       line_end = data.find_first_of("\r\n \t", line_start);
191       if (line_end == std::string::npos)
192         line_end = data.size();
193       domain.assign(data.data(), line_start, line_end - line_start);
194 
195       Rule rule;
196       rule.wildcard = false;
197       rule.exception = false;
198       NormalizeResult new_result = NormalizeRule(&domain, &rule);
199       if (new_result != kError) {
200         // Check the existing rules to make sure we don't have an exception and
201         // wildcard for the same rule.  If we did, we'd have to update our
202         // parsing code to handle this case.
203         CHECK(rules.find(domain) == rules.end());
204 
205         rules[domain] = rule;
206         // Add true TLD for multi-level rules.  We don't add them right now, in
207         // case there's an exception or wild card that either exists or might be
208         // added in a later iteration.  In those cases, there's no need to add
209         // it and it would just slow down parsing the data.
210         size_t tld_start = domain.find_last_of('.');
211         if (tld_start != std::string::npos && tld_start + 1 < domain.size())
212           extra_rules.insert(domain.substr(tld_start + 1));
213       }
214       result = std::max(result, new_result);
215     }
216 
217     // Find beginning of next non-empty line.
218     line_start = data.find_first_of("\r\n", line_end);
219     if (line_start == std::string::npos)
220       line_start = data.size();
221     line_start = data.find_first_not_of("\r\n", line_start);
222     if (line_start == std::string::npos)
223       line_start = data.size();
224   }
225 
226   for (RuleSet::const_iterator iter = extra_rules.begin();
227        iter != extra_rules.end();
228        ++iter) {
229     if (rules.find(*iter) == rules.end()) {
230       Rule rule;
231       rule.exception = false;
232       rule.wildcard = false;
233       rules[*iter] = rule;
234     }
235   }
236 
237   if (!WriteRules(rules, out_filename)) {
238     LOG(ERROR) << "Error(s) writing output file";
239     result = kError;
240   }
241 
242   return result;
243 }
244 
main(int argc,const char * argv[])245 int main(int argc, const char* argv[]) {
246   base::EnableTerminationOnHeapCorruption();
247   if (argc != 1) {
248     fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
249     fprintf(stderr, "Usage: %s\n", argv[0]);
250     return 1;
251   }
252 
253   // Manages the destruction of singletons.
254   base::AtExitManager exit_manager;
255 
256   // Only use OutputDebugString in debug mode.
257 #ifdef NDEBUG
258   logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
259 #else
260   logging::LoggingDestination destination =
261       logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
262 #endif
263 
264   CommandLine::Init(argc, argv);
265 
266   FilePath log_filename;
267   PathService::Get(base::DIR_EXE, &log_filename);
268   log_filename = log_filename.AppendASCII("tld_cleanup.log");
269   logging::InitLogging(
270       log_filename.value().c_str(),
271       destination,
272       logging::LOCK_LOG_FILE,
273       logging::DELETE_OLD_LOG_FILE,
274       logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
275 
276   icu_util::Initialize();
277 
278   FilePath input_file;
279   PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
280   input_file = input_file.Append(FILE_PATH_LITERAL("net"))
281                          .Append(FILE_PATH_LITERAL("base"))
282                          .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
283   FilePath output_file;
284   PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
285   output_file = output_file.Append(FILE_PATH_LITERAL("net"))
286                            .Append(FILE_PATH_LITERAL("base"))
287                            .Append(FILE_PATH_LITERAL(
288                                "effective_tld_names.gperf"));
289   NormalizeResult result = NormalizeFile(input_file, output_file);
290   if (result != kSuccess) {
291     fprintf(stderr,
292             "Errors or warnings processing file.  See log in tld_cleanup.log.");
293   }
294 
295   if (result == kError)
296     return 1;
297   return 0;
298 }
299