1 // Copyright 2015 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "net/base/lookup_string_in_fixed_set.h"
11
12 #include <cstdint>
13
14 #include "base/check.h"
15 #include "base/containers/span.h"
16
17 namespace net {
18
19 namespace {
20
21 // Read next offset from `bytes`, increment `offset_bytes` by that amount, and
22 // increment `bytes` either to point to the start of the next encoded offset in
23 // its node, or set it to an empty span, if there are no remaining offsets.
24 //
25 // Returns true if an offset could be read; false otherwise.
GetNextOffset(base::span<const uint8_t> * bytes,base::span<const uint8_t> * offset_bytes)26 inline bool GetNextOffset(base::span<const uint8_t>* bytes,
27 base::span<const uint8_t>* offset_bytes) {
28 if (bytes->empty()) {
29 return false;
30 }
31
32 size_t bytes_consumed;
33 switch ((*bytes)[0] & 0x60) {
34 case 0x60: // Read three byte offset
35 *offset_bytes = offset_bytes->subspan(static_cast<size_t>(
36 (((*bytes)[0] & 0x1F) << 16) | ((*bytes)[1] << 8) | (*bytes)[2]));
37 bytes_consumed = 3;
38 break;
39 case 0x40: // Read two byte offset
40 *offset_bytes = offset_bytes->subspan(
41 static_cast<size_t>((((*bytes)[0] & 0x1F) << 8) | (*bytes)[1]));
42 bytes_consumed = 2;
43 break;
44 default:
45 *offset_bytes =
46 offset_bytes->subspan(static_cast<size_t>((*bytes)[0] & 0x3F));
47 bytes_consumed = 1;
48 }
49 if ((*bytes)[0] & 0x80) {
50 *bytes = base::span<const uint8_t>();
51 } else {
52 *bytes = bytes->subspan(bytes_consumed);
53 }
54 return true;
55 }
56
57 // Check if byte at `byte` is last in label.
IsEOL(uint8_t byte)58 bool IsEOL(uint8_t byte) {
59 return (byte & 0x80) != 0;
60 }
61
62 // Check if byte at `byte` matches key. This version matches both end-of-label
63 // chars and not-end-of-label chars.
IsMatch(uint8_t byte,char key)64 bool IsMatch(uint8_t byte, char key) {
65 return (byte & 0x7F) == key;
66 }
67
68 // Read return value at `byte`, if it is a return value. Returns true if a
69 // return value could be read, false otherwise.
GetReturnValue(uint8_t byte,int * return_value)70 bool GetReturnValue(uint8_t byte, int* return_value) {
71 // Return values are always encoded as end-of-label chars (so the high bit is
72 // set). So byte values in the inclusive range [0x80, 0x9F] encode the return
73 // values 0 through 31 (though make_dafsa.py doesn't currently encode values
74 // higher than 7). The following code does that translation.
75 if ((byte & 0xE0) == 0x80) {
76 *return_value = byte & 0x1F;
77 return true;
78 }
79 return false;
80 }
81
82 } // namespace
83
FixedSetIncrementalLookup(base::span<const uint8_t> graph)84 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
85 base::span<const uint8_t> graph)
86 : bytes_(graph), original_bytes_(graph) {}
87
88 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
89 const FixedSetIncrementalLookup& other) = default;
90
91 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
92 const FixedSetIncrementalLookup& other) = default;
93
94 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() = default;
95
Advance(char input)96 bool FixedSetIncrementalLookup::Advance(char input) {
97 if (bytes_.empty()) {
98 // A previous input exhausted the graph, so there are no possible matches.
99 return false;
100 }
101
102 // Only ASCII printable chars are supported by the current DAFSA format -- the
103 // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
104 // low values (values 0x00-0x1F) are reserved to encode the return values. So
105 // values outside this range will never be in the dictionary.
106 if (input >= 0x20) {
107 if (bytes_starts_with_label_character_) {
108 // Currently processing a label, so it is only necessary to check the byte
109 // pointed by `bytes_` to see if it encodes a character matching `input`.
110 bool is_last_char_in_label = IsEOL(bytes_.front());
111 bool is_match = IsMatch(bytes_.front(), input);
112 if (is_match) {
113 // If this is not the last character in the label, the next byte should
114 // be interpreted as a character or return value. Otherwise, the next
115 // byte should be interpreted as a list of child node offsets.
116 bytes_ = bytes_.subspan<1>();
117 DCHECK(!bytes_.empty());
118 bytes_starts_with_label_character_ = !is_last_char_in_label;
119 return true;
120 }
121 } else {
122 base::span<const uint8_t> offset_bytes = bytes_;
123 // Read offsets from `bytes_` until the label of the child node at
124 // `offset_bytes` matches `input`, or until there are no more offsets.
125 while (GetNextOffset(&bytes_, &offset_bytes)) {
126 DCHECK(!offset_bytes.empty());
127
128 // `offset_bytes` points to a DAFSA node that is a child of the original
129 // node.
130 //
131 // The low 7 bits of a node encodes a character value; the high bit
132 // indicates whether it's the last character in the label.
133 //
134 // Note that `*offset_bytes` could also be a result code value, but
135 // these are really just out-of-range ASCII values, encoded the same way
136 // as characters. Since `input` was already validated as a printable
137 // ASCII value, IsMatch will never return true if `offset_bytes` is a
138 // result code.
139 bool is_last_char_in_label = IsEOL(offset_bytes.front());
140 bool is_match = IsMatch(offset_bytes.front(), input);
141
142 if (is_match) {
143 // If this is not the last character in the label, the next byte
144 // should be interpreted as a character or return value. Otherwise,
145 // the next byte should be interpreted as a list of child node
146 // offsets.
147 bytes_ = offset_bytes.subspan<1>();
148 DCHECK(!bytes_.empty());
149 bytes_starts_with_label_character_ = !is_last_char_in_label;
150 return true;
151 }
152 }
153 }
154 }
155
156 // If no match was found, then end of the DAFSA has been reached.
157 bytes_ = base::span<const uint8_t>();
158 bytes_starts_with_label_character_ = false;
159 return false;
160 }
161
GetResultForCurrentSequence() const162 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
163 int value = kDafsaNotFound;
164 // Look to see if there is a next character that's a return value.
165 if (bytes_starts_with_label_character_) {
166 // Currently processing a label, so it is only necessary to check the byte
167 // at `bytes_` to see if encodes a return value.
168 GetReturnValue(bytes_.front(), &value);
169 } else {
170 // Otherwise, `bytes_` is an offset list. Explore the list of child nodes
171 // (given by their offsets) to find one whose label is a result code.
172 //
173 // This search uses a temporary copy of `bytes_`, since mutating `bytes_`
174 // could skip over a node that would be important to a subsequent Advance()
175 // call.
176 base::span<const uint8_t> temp_bytes = bytes_;
177
178 // Read offsets from `temp_bytes` until either `temp_bytes` is exhausted or
179 // until the byte at `offset_bytes` contains a result code (encoded as an
180 // ASCII character below 0x20).
181 base::span<const uint8_t> offset_bytes = bytes_;
182 while (GetNextOffset(&temp_bytes, &offset_bytes)) {
183 DCHECK(!offset_bytes.empty());
184 if (GetReturnValue(offset_bytes.front(), &value)) {
185 break;
186 }
187 }
188 }
189 return value;
190 }
191
LookupStringInFixedSet(base::span<const uint8_t> graph,const char * key,size_t key_length)192 int LookupStringInFixedSet(base::span<const uint8_t> graph,
193 const char* key,
194 size_t key_length) {
195 // Do an incremental lookup until either the end of the graph is reached, or
196 // until every character in |key| is consumed.
197 FixedSetIncrementalLookup lookup(graph);
198 const char* key_end = key + key_length;
199 while (key != key_end) {
200 if (!lookup.Advance(*key))
201 return kDafsaNotFound;
202 key++;
203 }
204 // The entire input was consumed without reaching the end of the graph. Return
205 // the result code (if present) for the current position, or kDafsaNotFound.
206 return lookup.GetResultForCurrentSequence();
207 }
208
209 // This function is only used by GetRegistryLengthInStrippedHost(), but is
210 // implemented here to allow inlining of
211 // LookupStringInFixedSet::GetResultForCurrentSequence() and
212 // LookupStringInFixedSet::Advance() at compile time. Tests on x86_64 linux
213 // indicated about 10% increased runtime cost for GetRegistryLength() in average
214 // if the implementation of this function was separated from the lookup methods.
LookupSuffixInReversedSet(base::span<const uint8_t> graph,bool include_private,std::string_view host,size_t * suffix_length)215 int LookupSuffixInReversedSet(base::span<const uint8_t> graph,
216 bool include_private,
217 std::string_view host,
218 size_t* suffix_length) {
219 FixedSetIncrementalLookup lookup(graph);
220 *suffix_length = 0;
221 int result = kDafsaNotFound;
222 std::string_view::const_iterator pos = host.end();
223 // Look up host from right to left.
224 while (pos != host.begin() && lookup.Advance(*--pos)) {
225 // Only host itself or a part that follows a dot can match.
226 if (pos == host.begin() || *(pos - 1) == '.') {
227 int value = lookup.GetResultForCurrentSequence();
228 if (value != kDafsaNotFound) {
229 // Break if private and private rules should be excluded.
230 if ((value & kDafsaPrivateRule) && !include_private)
231 break;
232 // Save length and return value. Since hosts are looked up from right to
233 // left, the last saved values will be from the longest match.
234 *suffix_length = host.end() - pos;
235 result = value;
236 }
237 }
238 }
239 return result;
240 }
241
242 } // namespace net
243