1 // Copyright 2015 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 7 8 #include <stddef.h> 9 10 #include <cstdint> 11 #include <string_view> 12 13 #include "base/containers/span.h" 14 #include "base/memory/raw_ptr_exclusion.h" 15 #include "base/memory/raw_span.h" 16 #include "net/base/net_export.h" 17 18 namespace net { 19 20 enum { 21 kDafsaNotFound = -1, // key is not in set 22 kDafsaFound = 0, // key is in set 23 // The following return values are used by the implementation of 24 // GetDomainAndRegistry() and are probably not generally useful. 25 kDafsaExceptionRule = 1, // key excluded from set via exception 26 kDafsaWildcardRule = 2, // key matched a wildcard rule 27 kDafsaPrivateRule = 4, // key matched a private rule 28 }; 29 30 // Looks up the string |key| with length |key_length| in a fixed set of 31 // strings. The set of strings must be known at compile time. It is converted to 32 // a graph structure named a DAFSA (Deterministic Acyclic Finite State 33 // Automaton) by the script make_dafsa.py during compilation. This permits 34 // efficient (in time and space) lookup. The graph generated by make_dafsa.py 35 // takes the form of a constant byte array which should be supplied via the 36 // |graph| parameter. The return value is kDafsaNotFound, kDafsaFound, or a 37 // bitmap consisting of one or more of kDafsaExceptionRule, kDafsaWildcardRule 38 // and kDafsaPrivateRule ORed together. 39 // 40 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere. 41 NET_EXPORT int LookupStringInFixedSet(base::span<const uint8_t> graph, 42 const char* key, 43 size_t key_length); 44 45 // Looks up the longest matching suffix for |host| with length |length| in a 46 // reversed DAFSA. Partial matches must begin at a new component, i.e. host 47 // itself could match or a host part starting after a dot could match. 48 // If no match was found a value of 0 is written to |suffix_length| and the 49 // value kDafsaNotFound is returned, otherwise the length of the longest match 50 // is written to |suffix_length| and the type of the longest match is returned. 51 int LookupSuffixInReversedSet(base::span<const uint8_t> graph, 52 bool include_private, 53 std::string_view host, 54 size_t* suffix_length); 55 56 // FixedSetIncrementalLookup provides efficient membership and prefix queries 57 // against a fixed set of strings. The set of strings must be known at compile 58 // time. The set is converted to a graph structure named a DAFSA (Deterministic 59 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py 60 // during compilation. The conversion generates a C++ header file defining the 61 // encoded graph as a constant byte array. This class provides a fast, constant- 62 // space lookup operation against such byte arrays. 63 // 64 // The lookup proceeds incrementally, with input characters provided one at a 65 // time. This approach allow queries of the form: "given an input string, which 66 // prefixes of that string that appear in the fixed set?" As the matching 67 // prefixes (and their result codes) are enumerated, the most suitable match 68 // among them can be selected in a single pass. 69 // 70 // This class can also be used to perform suffix queries (instead of prefix 71 // queries) against a fixed set, so long as the DAFSA is constructed on reversed 72 // values, and the input is provided in reverse order. 73 // 74 // Example usage for simple membership query; |input| is a std::string: 75 // 76 // FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa)); 77 // for (char c : input) { 78 // if (!lookup.Advance(c)) 79 // return false; 80 // } 81 // return lookup.GetResultForCurrentSequence() != kDafsaNotFound; 82 // 83 // Example usage for 'find longest prefix in set with result code == 3' query: 84 // 85 // FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa)); 86 // size_t longest_match_end = 0; 87 // for (size_t i = 0; i < input.length(); ++i) { 88 // if (!prefix_lookup.Advance(input[i])) 89 // break; 90 // if (prefix_lookup.GetResultForCurrentSequence() == 3) 91 // longest_match_end = (i + 1); 92 // } 93 // return input.substr(0, longest_match_end); 94 // 95 class NET_EXPORT FixedSetIncrementalLookup { 96 public: 97 // Begin a lookup against the provided fixed set. `graph` describes a byte 98 // buffer generated by the make_dafsa.py script, as described in the class 99 // comment. 100 // 101 // FixedSetIncrementalLookup is initialized to a state corresponding to the 102 // empty input sequence. Calling GetResultForCurrentSequence() in the initial 103 // state would indicate whether the empty string appears in the fixed set. 104 // Characters can be added to the sequence by calling Advance(), and the 105 // lookup result can be checked after each addition by calling 106 // GetResultForCurrentSequence(). 107 explicit FixedSetIncrementalLookup(base::span<const uint8_t> graph); 108 109 // FixedSetIncrementalLookup is copyable so that callers can save/restore 110 // their position in the search. This is for cases where branching or 111 // backtracking might be required (e.g. to probe for the presence of a 112 // designated wildcard character). 113 FixedSetIncrementalLookup(const FixedSetIncrementalLookup&); 114 FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&); 115 116 ~FixedSetIncrementalLookup(); 117 118 // Advance the query by adding a character to the input sequence. |input| can 119 // be any char value, but only ASCII characters will ever result in matches, 120 // since the fixed set itself is limited to ASCII strings. 121 // 122 // Returns true if the resulting input sequence either appears in the fixed 123 // set itself, or is a prefix of some longer string in the fixed set. Returns 124 // false otherwise, implying that the graph is exhausted and 125 // GetResultForCurrentSequence() will return kDafsaNotFound. 126 // 127 // Once Advance() has returned false, the caller can safely stop feeding more 128 // characters, as subsequent calls to Advance() will return false and have no 129 // effect. 130 bool Advance(char input); 131 132 // Returns the result code corresponding to the input sequence provided thus 133 // far to Advance(). 134 // 135 // If the sequence does not appear in the fixed set, the return value is 136 // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently 137 // limited to 0-7) corresponding to the result code for that string, as listed 138 // in the .gperf file from which the DAFSA was generated. For 139 // GetDomainAndRegistry DAFSAs, these values should be interpreted as a 140 // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule. 141 // 142 // It is okay to call this function, and then extend the sequence further by 143 // calling Advance(). 144 int GetResultForCurrentSequence() const; 145 146 private: 147 // Span start points to the current position in the graph indicating the 148 // current state of the automaton. Is as empty span if the graph is exhausted. 149 // RAW_PTR_EXCLUSION: Already protected by `original_bytes_` (see below). 150 RAW_PTR_EXCLUSION base::span<const uint8_t> bytes_; 151 152 // `original_bytes_` bytes isn't used per se, as we care only about the 153 // current state in `bytes_` (which is a subspan), but is kept here as 154 // raw_span<> because of its anti-UaF properties. `bytes_`, due to being 155 // constantly modified, isn't a good candidate for raw_span<> for performance 156 // reasons: `bytes_ = bytes_.subspan(N)` would be costly. 157 base::raw_span<const uint8_t> original_bytes_; 158 159 // Contains the current decoder state. If true, `bytes_` points to a label 160 // character or a return code. If false, `bytes_` points to a sequence of 161 // offsets that indicate the child nodes of the current state. 162 bool bytes_starts_with_label_character_ = false; 163 }; 164 165 } // namespace net 166 167 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 168