1 // Copyright 2015 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 7 8 #include <stddef.h> 9 10 #include "base/strings/string_piece.h" 11 #include "net/base/net_export.h" 12 13 namespace net { 14 15 enum { 16 kDafsaNotFound = -1, // key is not in set 17 kDafsaFound = 0, // key is in set 18 // The following return values are used by the implementation of 19 // GetDomainAndRegistry() and are probably not generally useful. 20 kDafsaExceptionRule = 1, // key excluded from set via exception 21 kDafsaWildcardRule = 2, // key matched a wildcard rule 22 kDafsaPrivateRule = 4, // key matched a private rule 23 }; 24 25 // Looks up the string |key| with length |key_length| in a fixed set of 26 // strings. The set of strings must be known at compile time. It is converted to 27 // a graph structure named a DAFSA (Deterministic Acyclic Finite State 28 // Automaton) by the script make_dafsa.py during compilation. This permits 29 // efficient (in time and space) lookup. The graph generated by make_dafsa.py 30 // takes the form of a constant byte array which should be supplied via the 31 // |graph| and |length| parameters. The return value is kDafsaNotFound, 32 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, 33 // kDafsaWildcardRule and kDafsaPrivateRule ORed together. 34 // 35 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere. 36 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph, 37 size_t length, 38 const char* key, 39 size_t key_length); 40 41 // Looks up the longest matching suffix for |host| with length |length| in a 42 // reversed DAFSA. Partial matches must begin at a new component, i.e. host 43 // itself could match or a host part starting after a dot could match. 44 // If no match was found a value of 0 is written to |suffix_length| and the 45 // value kDafsaNotFound is returned, otherwise the length of the longest match 46 // is written to |suffix_length| and the type of the longest match is returned. 47 int LookupSuffixInReversedSet(const unsigned char* graph, 48 size_t length, 49 bool include_private, 50 base::StringPiece host, 51 size_t* suffix_length); 52 53 // FixedSetIncrementalLookup provides efficient membership and prefix queries 54 // against a fixed set of strings. The set of strings must be known at compile 55 // time. The set is converted to a graph structure named a DAFSA (Deterministic 56 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py 57 // during compilation. The conversion generates a C++ header file defining the 58 // encoded graph as a constant byte array. This class provides a fast, constant- 59 // space lookup operation against such byte arrays. 60 // 61 // The lookup proceeds incrementally, with input characters provided one at a 62 // time. This approach allow queries of the form: "given an input string, which 63 // prefixes of that string that appear in the fixed set?" As the matching 64 // prefixes (and their result codes) are enumerated, the most suitable match 65 // among them can be selected in a single pass. 66 // 67 // This class can also be used to perform suffix queries (instead of prefix 68 // queries) against a fixed set, so long as the DAFSA is constructed on reversed 69 // values, and the input is provided in reverse order. 70 // 71 // Example usage for simple membership query; |input| is a std::string: 72 // 73 // FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa)); 74 // for (char c : input) { 75 // if (!lookup.Advance(c)) 76 // return false; 77 // } 78 // return lookup.GetResultForCurrentSequence() != kDafsaNotFound; 79 // 80 // Example usage for 'find longest prefix in set with result code == 3' query: 81 // 82 // FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa)); 83 // size_t longest_match_end = 0; 84 // for (size_t i = 0; i < input.length(); ++i) { 85 // if (!prefix_lookup.Advance(input[i])) 86 // break; 87 // if (prefix_lookup.GetResultForCurrentSequence() == 3) 88 // longest_match_end = (i + 1); 89 // } 90 // return input.substr(0, longest_match_end); 91 // 92 class NET_EXPORT FixedSetIncrementalLookup { 93 public: 94 // Begin a lookup against the provided fixed set. |graph| and |length| 95 // describe a byte buffer generated by the make_dafsa.py script, as described 96 // in the class comment. 97 // 98 // FixedSetIncrementalLookup is initialized to a state corresponding to the 99 // empty input sequence. Calling GetResultForCurrentSequence() in the initial 100 // state would indicate whether the empty string appears in the fixed set. 101 // Characters can be added to the sequence by calling Advance(), and the 102 // lookup result can be checked after each addition by calling 103 // GetResultForCurrentSequence(). 104 FixedSetIncrementalLookup(const unsigned char* graph, size_t length); 105 106 // FixedSetIncrementalLookup is copyable so that callers can save/restore 107 // their position in the search. This is for cases where branching or 108 // backtracking might be required (e.g. to probe for the presence of a 109 // designated wildcard character). 110 FixedSetIncrementalLookup(const FixedSetIncrementalLookup&); 111 FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&); 112 113 ~FixedSetIncrementalLookup(); 114 115 // Advance the query by adding a character to the input sequence. |input| can 116 // be any char value, but only ASCII characters will ever result in matches, 117 // since the fixed set itself is limited to ASCII strings. 118 // 119 // Returns true if the resulting input sequence either appears in the fixed 120 // set itself, or is a prefix of some longer string in the fixed set. Returns 121 // false otherwise, implying that the graph is exhausted and 122 // GetResultForCurrentSequence() will return kDafsaNotFound. 123 // 124 // Once Advance() has returned false, the caller can safely stop feeding more 125 // characters, as subsequent calls to Advance() will return false and have no 126 // effect. 127 bool Advance(char input); 128 129 // Returns the result code corresponding to the input sequence provided thus 130 // far to Advance(). 131 // 132 // If the sequence does not appear in the fixed set, the return value is 133 // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently 134 // limited to 0-7) corresponding to the result code for that string, as listed 135 // in the .gperf file from which the DAFSA was generated. For 136 // GetDomainAndRegistry DAFSAs, these values should be interpreted as a 137 // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule. 138 // 139 // It is okay to call this function, and then extend the sequence further by 140 // calling Advance(). 141 int GetResultForCurrentSequence() const; 142 143 private: 144 // Pointer to the current position in the graph indicating the current state 145 // of the automaton, or nullptr if the graph is exhausted. 146 const unsigned char* pos_; 147 148 // Pointer just past the end of the graph. |pos_| should never get here. This 149 // is used only in DCHECKs. 150 const unsigned char* end_; 151 152 // Contains the current decoder state. If true, |pos_| points to a label 153 // character or a return code. If false, |pos_| points to a sequence of 154 // offsets that indicate the child nodes of the current state. 155 bool pos_is_label_character_ = false; 156 }; 157 158 } // namespace net 159 160 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 161