• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "net/dns/dns_hosts.h"
11 
12 #include <string>
13 #include <string_view>
14 #include <utility>
15 
16 #include "base/check.h"
17 #include "base/files/file_path.h"
18 #include "base/files/file_util.h"
19 #include "base/metrics/histogram_functions.h"
20 #include "base/strings/string_util.h"
21 #include "base/trace_event/memory_usage_estimator.h"
22 #include "build/build_config.h"
23 #include "net/base/cronet_buildflags.h"
24 #include "net/base/url_util.h"
25 #include "net/dns/dns_util.h"
26 #include "url/url_canon.h"
27 
28 namespace net {
29 
30 namespace {
31 
32 // Parses the contents of a hosts file.  Returns one token (IP or hostname) at
33 // a time.  Doesn't copy anything; accepts the file as a std::string_view and
34 // returns tokens as StringPieces.
35 class HostsParser {
36  public:
HostsParser(std::string_view text,ParseHostsCommaMode comma_mode)37   explicit HostsParser(std::string_view text, ParseHostsCommaMode comma_mode)
38       : text_(text),
39         data_(text.data()),
40         end_(text.size()),
41         comma_mode_(comma_mode) {}
42 
43   HostsParser(const HostsParser&) = delete;
44   HostsParser& operator=(const HostsParser&) = delete;
45 
46   // Advances to the next token (IP or hostname).  Returns whether another
47   // token was available.  |token_is_ip| and |token| can be used to find out
48   // the type and text of the token.
Advance()49   bool Advance() {
50     bool next_is_ip = (pos_ == 0);
51     while (pos_ < end_ && pos_ != std::string::npos) {
52       switch (text_[pos_]) {
53         case ' ':
54         case '\t':
55           SkipWhitespace();
56           break;
57 
58         case '\r':
59         case '\n':
60           next_is_ip = true;
61           pos_++;
62           break;
63 
64         case '#':
65           SkipRestOfLine();
66           break;
67 
68         case ',':
69           if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
70             SkipWhitespace();
71             break;
72           }
73 
74           // If comma_mode_ is COMMA_IS_TOKEN, fall through:
75           [[fallthrough]];
76 
77         default: {
78           size_t token_start = pos_;
79           SkipToken();
80           size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
81 
82           token_ =
83               std::string_view(data_ + token_start, token_end - token_start);
84           token_is_ip_ = next_is_ip;
85 
86           return true;
87         }
88       }
89     }
90 
91     return false;
92   }
93 
94   // Fast-forwards the parser to the next line.  Should be called if an IP
95   // address doesn't parse, to avoid wasting time tokenizing hostnames that
96   // will be ignored.
SkipRestOfLine()97   void SkipRestOfLine() { pos_ = text_.find("\n", pos_); }
98 
99   // Returns whether the last-parsed token is an IP address (true) or a
100   // hostname (false).
token_is_ip()101   bool token_is_ip() { return token_is_ip_; }
102 
103   // Returns the text of the last-parsed token as a std::string_view referencing
104   // the same underlying memory as the std::string_view passed to the
105   // constructor. Returns an empty std::string_view if no token has been parsed
106   // or the end of the input string has been reached.
token()107   std::string_view token() { return token_; }
108 
109  private:
SkipToken()110   void SkipToken() {
111     switch (comma_mode_) {
112       case PARSE_HOSTS_COMMA_IS_TOKEN:
113         pos_ = text_.find_first_of(" \t\n\r#", pos_);
114         break;
115       case PARSE_HOSTS_COMMA_IS_WHITESPACE:
116         pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
117         break;
118     }
119   }
120 
SkipWhitespace()121   void SkipWhitespace() {
122     switch (comma_mode_) {
123       case PARSE_HOSTS_COMMA_IS_TOKEN:
124         pos_ = text_.find_first_not_of(" \t", pos_);
125         break;
126       case PARSE_HOSTS_COMMA_IS_WHITESPACE:
127         pos_ = text_.find_first_not_of(" ,\t", pos_);
128         break;
129     }
130   }
131 
132   const std::string_view text_;
133   const char* data_;
134   const size_t end_;
135 
136   size_t pos_ = 0;
137   std::string_view token_;
138   bool token_is_ip_ = false;
139 
140   const ParseHostsCommaMode comma_mode_;
141 };
142 
ParseHostsWithCommaMode(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)143 void ParseHostsWithCommaMode(const std::string& contents,
144                              DnsHosts* dns_hosts,
145                              ParseHostsCommaMode comma_mode) {
146   CHECK(dns_hosts);
147 
148   std::string_view ip_text;
149   IPAddress ip;
150   AddressFamily family = ADDRESS_FAMILY_IPV4;
151   HostsParser parser(contents, comma_mode);
152   while (parser.Advance()) {
153     if (parser.token_is_ip()) {
154       std::string_view new_ip_text = parser.token();
155       // Some ad-blocking hosts files contain thousands of entries pointing to
156       // the same IP address (usually 127.0.0.1).  Don't bother parsing the IP
157       // again if it's the same as the one above it.
158       if (new_ip_text != ip_text) {
159         IPAddress new_ip;
160         if (new_ip.AssignFromIPLiteral(parser.token())) {
161           ip_text = new_ip_text;
162           ip = new_ip;
163           family = (ip.IsIPv4()) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
164         } else {
165           parser.SkipRestOfLine();
166         }
167       }
168     } else {
169       url::CanonHostInfo canonicalization_info;
170       std::string canonicalized_host =
171           CanonicalizeHost(parser.token(), &canonicalization_info);
172 
173       // Skip if token is invalid for host canonicalization, or if it
174       // canonicalizes as an IP address.
175       if (canonicalization_info.family != url::CanonHostInfo::NEUTRAL)
176         continue;
177 
178       DnsHostsKey key(std::move(canonicalized_host), family);
179       if (!IsCanonicalizedHostCompliant(key.first))
180         continue;
181       IPAddress* mapped_ip = &(*dns_hosts)[key];
182       if (mapped_ip->empty())
183         *mapped_ip = ip;
184       // else ignore this entry (first hit counts)
185     }
186   }
187 }
188 
189 }  // namespace
190 
ParseHostsWithCommaModeForTesting(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)191 void ParseHostsWithCommaModeForTesting(const std::string& contents,
192                                        DnsHosts* dns_hosts,
193                                        ParseHostsCommaMode comma_mode) {
194   ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
195 }
196 
ParseHosts(const std::string & contents,DnsHosts * dns_hosts)197 void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
198   ParseHostsCommaMode comma_mode;
199 #if BUILDFLAG(IS_APPLE)
200   // Mac OS X allows commas to separate hostnames.
201   comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
202 #else
203   // Linux allows commas in hostnames.
204   comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
205 #endif
206 
207   ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
208 
209   // TODO(crbug.com/40874231): Remove this when we have enough data.
210   base::UmaHistogramCounts100000("Net.DNS.DnsHosts.Count", dns_hosts->size());
211 
212 #if !BUILDFLAG(CRONET_BUILD)
213   // Cronet disables tracing and doesn't provide an implementation of
214   // base::trace_event::EstimateMemoryUsage for DnsHosts. Having this
215   // conditional is preferred over a fake implementation to avoid reporting fake
216   // metrics.
217   base::UmaHistogramMemoryKB(
218       "Net.DNS.DnsHosts.EstimateMemoryUsage",
219       base::trace_event::EstimateMemoryUsage(*dns_hosts));
220 #endif  // !BUILDFLAG(CRONET_BUILD)
221 }
222 
223 DnsHostsParser::~DnsHostsParser() = default;
224 
DnsHostsFileParser(base::FilePath hosts_file_path)225 DnsHostsFileParser::DnsHostsFileParser(base::FilePath hosts_file_path)
226     : hosts_file_path_(std::move(hosts_file_path)) {}
227 
228 DnsHostsFileParser::~DnsHostsFileParser() = default;
229 
ParseHosts(DnsHosts * dns_hosts) const230 bool DnsHostsFileParser::ParseHosts(DnsHosts* dns_hosts) const {
231   dns_hosts->clear();
232   // Missing file indicates empty HOSTS.
233   if (!base::PathExists(hosts_file_path_))
234     return true;
235 
236   std::optional<int64_t> size = base::GetFileSize(hosts_file_path_);
237   if (!size.has_value()) {
238     return false;
239   }
240 
241   // Reject HOSTS files larger than |kMaxHostsSize| bytes.
242   const int64_t kMaxHostsSize = 1 << 25;  // 32MB
243 
244   // TODO(crbug.com/40874231): Remove this when we have enough data.
245   base::UmaHistogramCustomCounts("Net.DNS.DnsHosts.FileSize", size.value(), 1,
246                                  kMaxHostsSize * 2, 50);
247   if (size.value() > kMaxHostsSize) {
248     return false;
249   }
250 
251   std::string contents;
252   if (!base::ReadFileToString(hosts_file_path_, &contents))
253     return false;
254 
255   net::ParseHosts(contents, dns_hosts);
256   return true;
257 }
258 
259 }  // namespace net
260