• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2015 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "net/base/lookup_string_in_fixed_set.h"
11 
12 #include <cstdint>
13 
14 #include "base/check.h"
15 #include "base/containers/span.h"
16 
17 namespace net {
18 
19 namespace {
20 
21 // Read next offset from `bytes`, increment `offset_bytes` by that amount, and
22 // increment `bytes` either to point to the start of the next encoded offset in
23 // its node, or set it to an empty span, if there are no remaining offsets.
24 //
25 // Returns true if an offset could be read; false otherwise.
GetNextOffset(base::span<const uint8_t> * bytes,base::span<const uint8_t> * offset_bytes)26 inline bool GetNextOffset(base::span<const uint8_t>* bytes,
27                           base::span<const uint8_t>* offset_bytes) {
28   if (bytes->empty()) {
29     return false;
30   }
31 
32   size_t bytes_consumed;
33   switch ((*bytes)[0] & 0x60) {
34     case 0x60:  // Read three byte offset
35       *offset_bytes = offset_bytes->subspan(static_cast<size_t>(
36           (((*bytes)[0] & 0x1F) << 16) | ((*bytes)[1] << 8) | (*bytes)[2]));
37       bytes_consumed = 3;
38       break;
39     case 0x40:  // Read two byte offset
40       *offset_bytes = offset_bytes->subspan(
41           static_cast<size_t>((((*bytes)[0] & 0x1F) << 8) | (*bytes)[1]));
42       bytes_consumed = 2;
43       break;
44     default:
45       *offset_bytes =
46           offset_bytes->subspan(static_cast<size_t>((*bytes)[0] & 0x3F));
47       bytes_consumed = 1;
48   }
49   if ((*bytes)[0] & 0x80) {
50     *bytes = base::span<const uint8_t>();
51   } else {
52     *bytes = bytes->subspan(bytes_consumed);
53   }
54   return true;
55 }
56 
57 // Check if byte at `byte` is last in label.
IsEOL(uint8_t byte)58 bool IsEOL(uint8_t byte) {
59   return (byte & 0x80) != 0;
60 }
61 
62 // Check if byte at `byte` matches key. This version matches both end-of-label
63 // chars and not-end-of-label chars.
IsMatch(uint8_t byte,char key)64 bool IsMatch(uint8_t byte, char key) {
65   return (byte & 0x7F) == key;
66 }
67 
68 // Read return value at `byte`, if it is a return value. Returns true if a
69 // return value could be read, false otherwise.
GetReturnValue(uint8_t byte,int * return_value)70 bool GetReturnValue(uint8_t byte, int* return_value) {
71   // Return values are always encoded as end-of-label chars (so the high bit is
72   // set). So byte values in the inclusive range [0x80, 0x9F] encode the return
73   // values 0 through 31 (though make_dafsa.py doesn't currently encode values
74   // higher than 7). The following code does that translation.
75   if ((byte & 0xE0) == 0x80) {
76     *return_value = byte & 0x1F;
77     return true;
78   }
79   return false;
80 }
81 
82 }  // namespace
83 
FixedSetIncrementalLookup(base::span<const uint8_t> graph)84 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
85     base::span<const uint8_t> graph)
86     : bytes_(graph), original_bytes_(graph) {}
87 
88 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
89     const FixedSetIncrementalLookup& other) = default;
90 
91 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
92     const FixedSetIncrementalLookup& other) = default;
93 
94 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() = default;
95 
Advance(char input)96 bool FixedSetIncrementalLookup::Advance(char input) {
97   if (bytes_.empty()) {
98     // A previous input exhausted the graph, so there are no possible matches.
99     return false;
100   }
101 
102   // Only ASCII printable chars are supported by the current DAFSA format -- the
103   // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
104   // low values (values 0x00-0x1F) are reserved to encode the return values. So
105   // values outside this range will never be in the dictionary.
106   if (input >= 0x20) {
107     if (bytes_starts_with_label_character_) {
108       // Currently processing a label, so it is only necessary to check the byte
109       // pointed by `bytes_` to see if it encodes a character matching `input`.
110       bool is_last_char_in_label = IsEOL(bytes_.front());
111       bool is_match = IsMatch(bytes_.front(), input);
112       if (is_match) {
113         // If this is not the last character in the label, the next byte should
114         // be interpreted as a character or return value. Otherwise, the next
115         // byte should be interpreted as a list of child node offsets.
116         bytes_ = bytes_.subspan<1>();
117         DCHECK(!bytes_.empty());
118         bytes_starts_with_label_character_ = !is_last_char_in_label;
119         return true;
120       }
121     } else {
122       base::span<const uint8_t> offset_bytes = bytes_;
123       // Read offsets from `bytes_` until the label of the child node at
124       // `offset_bytes` matches `input`, or until there are no more offsets.
125       while (GetNextOffset(&bytes_, &offset_bytes)) {
126         DCHECK(!offset_bytes.empty());
127 
128         // `offset_bytes` points to a DAFSA node that is a child of the original
129         // node.
130         //
131         // The low 7 bits of a node encodes a character value; the high bit
132         // indicates whether it's the last character in the label.
133         //
134         // Note that `*offset_bytes` could also be a result code value, but
135         // these are really just out-of-range ASCII values, encoded the same way
136         // as characters. Since `input` was already validated as a printable
137         // ASCII value, IsMatch will never return true if `offset_bytes` is a
138         // result code.
139         bool is_last_char_in_label = IsEOL(offset_bytes.front());
140         bool is_match = IsMatch(offset_bytes.front(), input);
141 
142         if (is_match) {
143           // If this is not the last character in the label, the next byte
144           // should be interpreted as a character or return value. Otherwise,
145           // the next byte should be interpreted as a list of child node
146           // offsets.
147           bytes_ = offset_bytes.subspan<1>();
148           DCHECK(!bytes_.empty());
149           bytes_starts_with_label_character_ = !is_last_char_in_label;
150           return true;
151         }
152       }
153     }
154   }
155 
156   // If no match was found, then end of the DAFSA has been reached.
157   bytes_ = base::span<const uint8_t>();
158   bytes_starts_with_label_character_ = false;
159   return false;
160 }
161 
GetResultForCurrentSequence() const162 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
163   int value = kDafsaNotFound;
164   // Look to see if there is a next character that's a return value.
165   if (bytes_starts_with_label_character_) {
166     // Currently processing a label, so it is only necessary to check the byte
167     // at `bytes_` to see if encodes a return value.
168     GetReturnValue(bytes_.front(), &value);
169   } else {
170     // Otherwise, `bytes_` is an offset list. Explore the list of child nodes
171     // (given by their offsets) to find one whose label is a result code.
172     //
173     // This search uses a temporary copy of `bytes_`, since mutating `bytes_`
174     // could skip over a node that would be important to a subsequent Advance()
175     // call.
176     base::span<const uint8_t> temp_bytes = bytes_;
177 
178     // Read offsets from `temp_bytes` until either `temp_bytes` is exhausted or
179     // until the byte at `offset_bytes` contains a result code (encoded as an
180     // ASCII character below 0x20).
181     base::span<const uint8_t> offset_bytes = bytes_;
182     while (GetNextOffset(&temp_bytes, &offset_bytes)) {
183       DCHECK(!offset_bytes.empty());
184       if (GetReturnValue(offset_bytes.front(), &value)) {
185         break;
186       }
187     }
188   }
189   return value;
190 }
191 
LookupStringInFixedSet(base::span<const uint8_t> graph,const char * key,size_t key_length)192 int LookupStringInFixedSet(base::span<const uint8_t> graph,
193                            const char* key,
194                            size_t key_length) {
195   // Do an incremental lookup until either the end of the graph is reached, or
196   // until every character in |key| is consumed.
197   FixedSetIncrementalLookup lookup(graph);
198   const char* key_end = key + key_length;
199   while (key != key_end) {
200     if (!lookup.Advance(*key))
201       return kDafsaNotFound;
202     key++;
203   }
204   // The entire input was consumed without reaching the end of the graph. Return
205   // the result code (if present) for the current position, or kDafsaNotFound.
206   return lookup.GetResultForCurrentSequence();
207 }
208 
209 // This function is only used by GetRegistryLengthInStrippedHost(), but is
210 // implemented here to allow inlining of
211 // LookupStringInFixedSet::GetResultForCurrentSequence() and
212 // LookupStringInFixedSet::Advance() at compile time. Tests on x86_64 linux
213 // indicated about 10% increased runtime cost for GetRegistryLength() in average
214 // if the implementation of this function was separated from the lookup methods.
LookupSuffixInReversedSet(base::span<const uint8_t> graph,bool include_private,std::string_view host,size_t * suffix_length)215 int LookupSuffixInReversedSet(base::span<const uint8_t> graph,
216                               bool include_private,
217                               std::string_view host,
218                               size_t* suffix_length) {
219   FixedSetIncrementalLookup lookup(graph);
220   *suffix_length = 0;
221   int result = kDafsaNotFound;
222   std::string_view::const_iterator pos = host.end();
223   // Look up host from right to left.
224   while (pos != host.begin() && lookup.Advance(*--pos)) {
225     // Only host itself or a part that follows a dot can match.
226     if (pos == host.begin() || *(pos - 1) == '.') {
227       int value = lookup.GetResultForCurrentSequence();
228       if (value != kDafsaNotFound) {
229         // Break if private and private rules should be excluded.
230         if ((value & kDafsaPrivateRule) && !include_private)
231           break;
232         // Save length and return value. Since hosts are looked up from right to
233         // left, the last saved values will be from the longest match.
234         *suffix_length = host.end() - pos;
235         result = value;
236       }
237     }
238   }
239   return result;
240 }
241 
242 }  // namespace net
243