1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // This command-line program converts an effective-TLD data file in UTF-8 from
6 // the format provided by Mozilla to the format expected by Chrome. This
7 // program generates an intermediate file which is then used by gperf to
8 // generate a perfect hash map. The benefit of this approach is that no time is
9 // spent on program initialization to generate the map of this data.
10 //
// Running this program finds "effective_tld_names.dat" in the expected location
// in the source checkout and generates "effective_tld_names.gperf" next to it.
13 //
14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
15 //
16 // In particular, it
17 // * Strips blank lines and comments, as well as notes for individual rules.
18 // * Strips a single leading and/or trailing dot from each rule, if present.
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22 // * Canonicalizes each rule's domain by converting it to a GURL and back.
23 // * Adds explicit rules for true TLDs found in any rule.
24
#include <stdio.h>

#include <algorithm>
#include <map>
#include <set>
#include <string>

#include "base/at_exit.h"
#include "base/command_line.h"
#include "base/file_path.h"
#include "base/file_util.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/process_util.h"
#include "base/string_util.h"
#include "googleurl/src/gurl.h"
#include "googleurl/src/url_parse.h"
41
namespace {

// Flags describing how a single domain rule was written in the input file.
struct Rule {
  // Start with both flags cleared so that a default-constructed Rule is a
  // plain rule rather than holding indeterminate values.
  Rule() : exception(false), wildcard(false) {}

  bool exception;  // The rule was prefixed with '!'.
  bool wildcard;   // The rule was prefixed with '*.'.
};

// Maps a normalized domain to the flags of its rule.
typedef std::map<std::string, Rule> RuleMap;

// A set of domain names.
typedef std::set<std::string> RuleSet;

}  // namespace
51
52 // Writes the list of domain rules contained in the 'rules' set to the
53 // 'outfile', with each rule terminated by a LF. The file must already have
54 // been created with write access.
WriteRules(const RuleMap & rules,FilePath outfile)55 bool WriteRules(const RuleMap& rules, FilePath outfile) {
56 std::string data;
57 data.append(
58 "%{\n"
59 "// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n"
60 "// Use of this source code is governed by a BSD-style license that\n"
61 "// can be found in the LICENSE file.\n\n"
62 "// This file is generated by net/tools/tld_cleanup/.\n"
63 "// DO NOT MANUALLY EDIT!\n"
64 "%}\n"
65 "struct DomainRule {\n"
66 " const char *name;\n"
67 " int type; // 1: exception, 2: wildcard\n"
68 "};\n"
69 "%%\n"
70 );
71
72 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
73 data.append(i->first);
74 data.append(", ");
75 if (i->second.exception) {
76 data.append("1");
77 } else if (i->second.wildcard) {
78 data.append("2");
79 } else {
80 data.append("0");
81 }
82 data.append("\n");
83 }
84
85 data.append("%%\n");
86
87 int written = file_util::WriteFile(outfile, data.data(), data.size());
88
89 return written == static_cast<int>(data.size());
90 }
91
// Outcome of normalizing a rule or a whole file. The enumerators must stay in
// increasing order of severity: results are combined by keeping the larger of
// two values.
enum NormalizeResult {
  kSuccess,
  kWarning,
  kError
};
98
99 // Adjusts the rule to a standard form: removes single extraneous dots and
100 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
101 // valid; logs a warning and returns kWarning if it is probably invalid; and
102 // logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeRule(std::string * domain,Rule * rule)103 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
104 NormalizeResult result = kSuccess;
105
106 // Strip single leading and trailing dots.
107 if (domain->at(0) == '.')
108 domain->erase(0, 1);
109 if (domain->empty()) {
110 LOG(WARNING) << "Ignoring empty rule";
111 return kWarning;
112 }
113 if (domain->at(domain->size() - 1) == '.')
114 domain->erase(domain->size() - 1, 1);
115 if (domain->empty()) {
116 LOG(WARNING) << "Ignoring empty rule";
117 return kWarning;
118 }
119
120 // Allow single leading '*.' or '!', saved here so it's not canonicalized.
121 size_t start_offset = 0;
122 if (domain->at(0) == '!') {
123 domain->erase(0, 1);
124 rule->exception = true;
125 } else if (domain->find("*.") == 0) {
126 domain->erase(0, 2);
127 rule->wildcard = true;
128 }
129 if (domain->empty()) {
130 LOG(WARNING) << "Ignoring empty rule";
131 return kWarning;
132 }
133
134 // Warn about additional '*.' or '!'.
135 if (domain->find("*.", start_offset) != std::string::npos ||
136 domain->find('!', start_offset) != std::string::npos) {
137 LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
138 result = kWarning;
139 }
140
141 // Make a GURL and normalize it, then get the host back out.
142 std::string url = "http://";
143 url.append(*domain);
144 GURL gurl(url);
145 const std::string& spec = gurl.possibly_invalid_spec();
146 url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
147 if (host.len < 0) {
148 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
149 return kError;
150 }
151 if (!gurl.is_valid()) {
152 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
153 result = kWarning;
154 }
155 domain->assign(spec.substr(host.begin, host.len));
156
157 return result;
158 }
159
160 // Loads the file described by 'in_filename', converts it to the desired format
161 // (see the file comments above), and saves it into 'out_filename'. Returns
162 // the most severe of the result codes encountered when normalizing the rules.
NormalizeFile(const FilePath & in_filename,const FilePath & out_filename)163 NormalizeResult NormalizeFile(const FilePath& in_filename,
164 const FilePath& out_filename) {
165 std::string data;
166 if (!file_util::ReadFileToString(in_filename, &data)) {
167 LOG(ERROR) << "Unable to read file";
168 // We return success since we've already reported the error.
169 return kSuccess;
170 }
171
172 // We do a lot of string assignment during parsing, but simplicity is more
173 // important than performance here.
174 std::string domain;
175 NormalizeResult result = kSuccess;
176 size_t line_start = 0;
177 size_t line_end = 0;
178 RuleMap rules;
179 RuleSet extra_rules;
180 while (line_start < data.size()) {
181 // Skip comments.
182 if (line_start + 1 < data.size() &&
183 data[line_start] == '/' &&
184 data[line_start + 1] == '/') {
185 line_end = data.find_first_of("\r\n", line_start);
186 if (line_end == std::string::npos)
187 line_end = data.size();
188 } else {
189 // Truncate at first whitespace.
190 line_end = data.find_first_of("\r\n \t", line_start);
191 if (line_end == std::string::npos)
192 line_end = data.size();
193 domain.assign(data.data(), line_start, line_end - line_start);
194
195 Rule rule;
196 rule.wildcard = false;
197 rule.exception = false;
198 NormalizeResult new_result = NormalizeRule(&domain, &rule);
199 if (new_result != kError) {
200 // Check the existing rules to make sure we don't have an exception and
201 // wildcard for the same rule. If we did, we'd have to update our
202 // parsing code to handle this case.
203 CHECK(rules.find(domain) == rules.end());
204
205 rules[domain] = rule;
206 // Add true TLD for multi-level rules. We don't add them right now, in
207 // case there's an exception or wild card that either exists or might be
208 // added in a later iteration. In those cases, there's no need to add
209 // it and it would just slow down parsing the data.
210 size_t tld_start = domain.find_last_of('.');
211 if (tld_start != std::string::npos && tld_start + 1 < domain.size())
212 extra_rules.insert(domain.substr(tld_start + 1));
213 }
214 result = std::max(result, new_result);
215 }
216
217 // Find beginning of next non-empty line.
218 line_start = data.find_first_of("\r\n", line_end);
219 if (line_start == std::string::npos)
220 line_start = data.size();
221 line_start = data.find_first_not_of("\r\n", line_start);
222 if (line_start == std::string::npos)
223 line_start = data.size();
224 }
225
226 for (RuleSet::const_iterator iter = extra_rules.begin();
227 iter != extra_rules.end();
228 ++iter) {
229 if (rules.find(*iter) == rules.end()) {
230 Rule rule;
231 rule.exception = false;
232 rule.wildcard = false;
233 rules[*iter] = rule;
234 }
235 }
236
237 if (!WriteRules(rules, out_filename)) {
238 LOG(ERROR) << "Error(s) writing output file";
239 result = kError;
240 }
241
242 return result;
243 }
244
main(int argc,const char * argv[])245 int main(int argc, const char* argv[]) {
246 base::EnableTerminationOnHeapCorruption();
247 if (argc != 1) {
248 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
249 fprintf(stderr, "Usage: %s\n", argv[0]);
250 return 1;
251 }
252
253 // Manages the destruction of singletons.
254 base::AtExitManager exit_manager;
255
256 // Only use OutputDebugString in debug mode.
257 #ifdef NDEBUG
258 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
259 #else
260 logging::LoggingDestination destination =
261 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
262 #endif
263
264 CommandLine::Init(argc, argv);
265
266 FilePath log_filename;
267 PathService::Get(base::DIR_EXE, &log_filename);
268 log_filename = log_filename.AppendASCII("tld_cleanup.log");
269 logging::InitLogging(
270 log_filename.value().c_str(),
271 destination,
272 logging::LOCK_LOG_FILE,
273 logging::DELETE_OLD_LOG_FILE,
274 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
275
276 icu_util::Initialize();
277
278 FilePath input_file;
279 PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
280 input_file = input_file.Append(FILE_PATH_LITERAL("net"))
281 .Append(FILE_PATH_LITERAL("base"))
282 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
283 FilePath output_file;
284 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
285 output_file = output_file.Append(FILE_PATH_LITERAL("net"))
286 .Append(FILE_PATH_LITERAL("base"))
287 .Append(FILE_PATH_LITERAL(
288 "effective_tld_names.gperf"));
289 NormalizeResult result = NormalizeFile(input_file, output_file);
290 if (result != kSuccess) {
291 fprintf(stderr,
292 "Errors or warnings processing file. See log in tld_cleanup.log.");
293 }
294
295 if (result == kError)
296 return 1;
297 return 0;
298 }
299