1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "net/dns/dns_hosts.h"
11
12 #include <string>
13 #include <string_view>
14 #include <utility>
15
16 #include "base/check.h"
17 #include "base/files/file_path.h"
18 #include "base/files/file_util.h"
19 #include "base/metrics/histogram_functions.h"
20 #include "base/strings/string_util.h"
21 #include "base/trace_event/memory_usage_estimator.h"
22 #include "build/build_config.h"
23 #include "net/base/cronet_buildflags.h"
24 #include "net/base/url_util.h"
25 #include "net/dns/dns_util.h"
26 #include "url/url_canon.h"
27
28 namespace net {
29
30 namespace {
31
32 // Parses the contents of a hosts file. Returns one token (IP or hostname) at
33 // a time. Doesn't copy anything; accepts the file as a std::string_view and
34 // returns tokens as StringPieces.
35 class HostsParser {
36 public:
HostsParser(std::string_view text,ParseHostsCommaMode comma_mode)37 explicit HostsParser(std::string_view text, ParseHostsCommaMode comma_mode)
38 : text_(text),
39 data_(text.data()),
40 end_(text.size()),
41 comma_mode_(comma_mode) {}
42
43 HostsParser(const HostsParser&) = delete;
44 HostsParser& operator=(const HostsParser&) = delete;
45
46 // Advances to the next token (IP or hostname). Returns whether another
47 // token was available. |token_is_ip| and |token| can be used to find out
48 // the type and text of the token.
Advance()49 bool Advance() {
50 bool next_is_ip = (pos_ == 0);
51 while (pos_ < end_ && pos_ != std::string::npos) {
52 switch (text_[pos_]) {
53 case ' ':
54 case '\t':
55 SkipWhitespace();
56 break;
57
58 case '\r':
59 case '\n':
60 next_is_ip = true;
61 pos_++;
62 break;
63
64 case '#':
65 SkipRestOfLine();
66 break;
67
68 case ',':
69 if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
70 SkipWhitespace();
71 break;
72 }
73
74 // If comma_mode_ is COMMA_IS_TOKEN, fall through:
75 [[fallthrough]];
76
77 default: {
78 size_t token_start = pos_;
79 SkipToken();
80 size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
81
82 token_ =
83 std::string_view(data_ + token_start, token_end - token_start);
84 token_is_ip_ = next_is_ip;
85
86 return true;
87 }
88 }
89 }
90
91 return false;
92 }
93
94 // Fast-forwards the parser to the next line. Should be called if an IP
95 // address doesn't parse, to avoid wasting time tokenizing hostnames that
96 // will be ignored.
SkipRestOfLine()97 void SkipRestOfLine() { pos_ = text_.find("\n", pos_); }
98
99 // Returns whether the last-parsed token is an IP address (true) or a
100 // hostname (false).
token_is_ip()101 bool token_is_ip() { return token_is_ip_; }
102
103 // Returns the text of the last-parsed token as a std::string_view referencing
104 // the same underlying memory as the std::string_view passed to the
105 // constructor. Returns an empty std::string_view if no token has been parsed
106 // or the end of the input string has been reached.
token()107 std::string_view token() { return token_; }
108
109 private:
SkipToken()110 void SkipToken() {
111 switch (comma_mode_) {
112 case PARSE_HOSTS_COMMA_IS_TOKEN:
113 pos_ = text_.find_first_of(" \t\n\r#", pos_);
114 break;
115 case PARSE_HOSTS_COMMA_IS_WHITESPACE:
116 pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
117 break;
118 }
119 }
120
SkipWhitespace()121 void SkipWhitespace() {
122 switch (comma_mode_) {
123 case PARSE_HOSTS_COMMA_IS_TOKEN:
124 pos_ = text_.find_first_not_of(" \t", pos_);
125 break;
126 case PARSE_HOSTS_COMMA_IS_WHITESPACE:
127 pos_ = text_.find_first_not_of(" ,\t", pos_);
128 break;
129 }
130 }
131
132 const std::string_view text_;
133 const char* data_;
134 const size_t end_;
135
136 size_t pos_ = 0;
137 std::string_view token_;
138 bool token_is_ip_ = false;
139
140 const ParseHostsCommaMode comma_mode_;
141 };
142
ParseHostsWithCommaMode(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)143 void ParseHostsWithCommaMode(const std::string& contents,
144 DnsHosts* dns_hosts,
145 ParseHostsCommaMode comma_mode) {
146 CHECK(dns_hosts);
147
148 std::string_view ip_text;
149 IPAddress ip;
150 AddressFamily family = ADDRESS_FAMILY_IPV4;
151 HostsParser parser(contents, comma_mode);
152 while (parser.Advance()) {
153 if (parser.token_is_ip()) {
154 std::string_view new_ip_text = parser.token();
155 // Some ad-blocking hosts files contain thousands of entries pointing to
156 // the same IP address (usually 127.0.0.1). Don't bother parsing the IP
157 // again if it's the same as the one above it.
158 if (new_ip_text != ip_text) {
159 IPAddress new_ip;
160 if (new_ip.AssignFromIPLiteral(parser.token())) {
161 ip_text = new_ip_text;
162 ip = new_ip;
163 family = (ip.IsIPv4()) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
164 } else {
165 parser.SkipRestOfLine();
166 }
167 }
168 } else {
169 url::CanonHostInfo canonicalization_info;
170 std::string canonicalized_host =
171 CanonicalizeHost(parser.token(), &canonicalization_info);
172
173 // Skip if token is invalid for host canonicalization, or if it
174 // canonicalizes as an IP address.
175 if (canonicalization_info.family != url::CanonHostInfo::NEUTRAL)
176 continue;
177
178 DnsHostsKey key(std::move(canonicalized_host), family);
179 if (!IsCanonicalizedHostCompliant(key.first))
180 continue;
181 IPAddress* mapped_ip = &(*dns_hosts)[key];
182 if (mapped_ip->empty())
183 *mapped_ip = ip;
184 // else ignore this entry (first hit counts)
185 }
186 }
187 }
188
189 } // namespace
190
ParseHostsWithCommaModeForTesting(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)191 void ParseHostsWithCommaModeForTesting(const std::string& contents,
192 DnsHosts* dns_hosts,
193 ParseHostsCommaMode comma_mode) {
194 ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
195 }
196
ParseHosts(const std::string & contents,DnsHosts * dns_hosts)197 void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
198 ParseHostsCommaMode comma_mode;
199 #if BUILDFLAG(IS_APPLE)
200 // Mac OS X allows commas to separate hostnames.
201 comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
202 #else
203 // Linux allows commas in hostnames.
204 comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
205 #endif
206
207 ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
208
209 // TODO(crbug.com/40874231): Remove this when we have enough data.
210 base::UmaHistogramCounts100000("Net.DNS.DnsHosts.Count", dns_hosts->size());
211
212 #if !BUILDFLAG(CRONET_BUILD)
213 // Cronet disables tracing and doesn't provide an implementation of
214 // base::trace_event::EstimateMemoryUsage for DnsHosts. Having this
215 // conditional is preferred over a fake implementation to avoid reporting fake
216 // metrics.
217 base::UmaHistogramMemoryKB(
218 "Net.DNS.DnsHosts.EstimateMemoryUsage",
219 base::trace_event::EstimateMemoryUsage(*dns_hosts));
220 #endif // !BUILDFLAG(CRONET_BUILD)
221 }
222
223 DnsHostsParser::~DnsHostsParser() = default;
224
DnsHostsFileParser(base::FilePath hosts_file_path)225 DnsHostsFileParser::DnsHostsFileParser(base::FilePath hosts_file_path)
226 : hosts_file_path_(std::move(hosts_file_path)) {}
227
228 DnsHostsFileParser::~DnsHostsFileParser() = default;
229
ParseHosts(DnsHosts * dns_hosts) const230 bool DnsHostsFileParser::ParseHosts(DnsHosts* dns_hosts) const {
231 dns_hosts->clear();
232 // Missing file indicates empty HOSTS.
233 if (!base::PathExists(hosts_file_path_))
234 return true;
235
236 std::optional<int64_t> size = base::GetFileSize(hosts_file_path_);
237 if (!size.has_value()) {
238 return false;
239 }
240
241 // Reject HOSTS files larger than |kMaxHostsSize| bytes.
242 const int64_t kMaxHostsSize = 1 << 25; // 32MB
243
244 // TODO(crbug.com/40874231): Remove this when we have enough data.
245 base::UmaHistogramCustomCounts("Net.DNS.DnsHosts.FileSize", size.value(), 1,
246 kMaxHostsSize * 2, 50);
247 if (size.value() > kMaxHostsSize) {
248 return false;
249 }
250
251 std::string contents;
252 if (!base::ReadFileToString(hosts_file_path_, &contents))
253 return false;
254
255 net::ParseHosts(contents, dns_hosts);
256 return true;
257 }
258
259 } // namespace net
260