• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2015 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
7 
8 #include <stddef.h>
9 
10 #include <cstdint>
11 #include <string_view>
12 
13 #include "base/containers/span.h"
14 #include "base/memory/raw_ptr_exclusion.h"
15 #include "base/memory/raw_span.h"
16 #include "net/base/net_export.h"
17 
18 namespace net {
19 
20 enum {
21   kDafsaNotFound = -1,  // key is not in set
22   kDafsaFound = 0,      // key is in set
23   // The following return values are used by the implementation of
24   // GetDomainAndRegistry() and are probably not generally useful.
25   kDafsaExceptionRule = 1,  // key excluded from set via exception
26   kDafsaWildcardRule = 2,   // key matched a wildcard rule
27   kDafsaPrivateRule = 4,    // key matched a private rule
28 };
29 
30 // Looks up the string |key| with length |key_length| in a fixed set of
31 // strings. The set of strings must be known at compile time. It is converted to
32 // a graph structure named a DAFSA (Deterministic Acyclic Finite State
33 // Automaton) by the script make_dafsa.py during compilation. This permits
34 // efficient (in time and space) lookup. The graph generated by make_dafsa.py
35 // takes the form of a constant byte array which should be supplied via the
36 // |graph| parameter.  The return value is kDafsaNotFound, kDafsaFound, or a
37 // bitmap consisting of one or more of kDafsaExceptionRule, kDafsaWildcardRule
38 // and kDafsaPrivateRule ORed together.
39 //
40 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere.
41 NET_EXPORT int LookupStringInFixedSet(base::span<const uint8_t> graph,
42                                       const char* key,
43                                       size_t key_length);
44 
45 // Looks up the longest matching suffix for |host| with length |length| in a
46 // reversed DAFSA. Partial matches must begin at a new component, i.e. host
47 // itself could match or a host part starting after a dot could match.
48 // If no match was found a value of 0 is written to |suffix_length| and the
49 // value kDafsaNotFound is returned, otherwise the length of the longest match
50 // is written to |suffix_length| and the type of the longest match is returned.
51 int LookupSuffixInReversedSet(base::span<const uint8_t> graph,
52                               bool include_private,
53                               std::string_view host,
54                               size_t* suffix_length);
55 
56 // FixedSetIncrementalLookup provides efficient membership and prefix queries
57 // against a fixed set of strings. The set of strings must be known at compile
58 // time. The set is converted to a graph structure named a DAFSA (Deterministic
59 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py
60 // during compilation. The conversion generates a C++ header file defining the
61 // encoded graph as a constant byte array. This class provides a fast, constant-
62 // space lookup operation against such byte arrays.
63 //
64 // The lookup proceeds incrementally, with input characters provided one at a
65 // time. This approach allow queries of the form: "given an input string, which
66 // prefixes of that string that appear in the fixed set?" As the matching
67 // prefixes (and their result codes) are enumerated, the most suitable match
68 // among them can be selected in a single pass.
69 //
70 // This class can also be used to perform suffix queries (instead of prefix
71 // queries) against a fixed set, so long as the DAFSA is constructed on reversed
72 // values, and the input is provided in reverse order.
73 //
74 // Example usage for simple membership query; |input| is a std::string:
75 //
76 //    FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa));
77 //    for (char c : input) {
78 //      if (!lookup.Advance(c))
79 //         return false;
80 //    }
81 //    return lookup.GetResultForCurrentSequence() != kDafsaNotFound;
82 //
83 // Example usage for 'find longest prefix in set with result code == 3' query:
84 //
85 //    FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa));
86 //    size_t longest_match_end = 0;
87 //    for (size_t i = 0; i < input.length(); ++i) {
88 //      if (!prefix_lookup.Advance(input[i]))
89 //         break;
90 //      if (prefix_lookup.GetResultForCurrentSequence() == 3)
91 //        longest_match_end = (i + 1);
92 //    }
93 //    return input.substr(0, longest_match_end);
94 //
95 class NET_EXPORT FixedSetIncrementalLookup {
96  public:
97   // Begin a lookup against the provided fixed set. `graph` describes a byte
98   // buffer generated by the make_dafsa.py script, as described in the class
99   // comment.
100   //
101   // FixedSetIncrementalLookup is initialized to a state corresponding to the
102   // empty input sequence. Calling GetResultForCurrentSequence() in the initial
103   // state would indicate whether the empty string appears in the fixed set.
104   // Characters can be added to the sequence by calling Advance(), and the
105   // lookup result can be checked after each addition by calling
106   // GetResultForCurrentSequence().
107   explicit FixedSetIncrementalLookup(base::span<const uint8_t> graph);
108 
109   // FixedSetIncrementalLookup is copyable so that callers can save/restore
110   // their position in the search. This is for cases where branching or
111   // backtracking might be required (e.g. to probe for the presence of a
112   // designated wildcard character).
113   FixedSetIncrementalLookup(const FixedSetIncrementalLookup&);
114   FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&);
115 
116   ~FixedSetIncrementalLookup();
117 
118   // Advance the query by adding a character to the input sequence. |input| can
119   // be any char value, but only ASCII characters will ever result in matches,
120   // since the fixed set itself is limited to ASCII strings.
121   //
122   // Returns true if the resulting input sequence either appears in the fixed
123   // set itself, or is a prefix of some longer string in the fixed set. Returns
124   // false otherwise, implying that the graph is exhausted and
125   // GetResultForCurrentSequence() will return kDafsaNotFound.
126   //
127   // Once Advance() has returned false, the caller can safely stop feeding more
128   // characters, as subsequent calls to Advance() will return false and have no
129   // effect.
130   bool Advance(char input);
131 
132   // Returns the result code corresponding to the input sequence provided thus
133   // far to Advance().
134   //
135   // If the sequence does not appear in the fixed set, the return value is
136   // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently
137   // limited to 0-7) corresponding to the result code for that string, as listed
138   // in the .gperf file from which the DAFSA was generated. For
139   // GetDomainAndRegistry DAFSAs, these values should be interpreted as a
140   // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule.
141   //
142   // It is okay to call this function, and then extend the sequence further by
143   // calling Advance().
144   int GetResultForCurrentSequence() const;
145 
146  private:
147   // Span start points to the current position in the graph indicating the
148   // current state of the automaton. Is as empty span if the graph is exhausted.
149   // RAW_PTR_EXCLUSION: Already protected by `original_bytes_` (see below).
150   RAW_PTR_EXCLUSION base::span<const uint8_t> bytes_;
151 
152   // `original_bytes_` bytes isn't used per se, as we care only about the
153   // current state in `bytes_` (which is a subspan), but is kept here as
154   // raw_span<> because of its anti-UaF properties. `bytes_`, due to being
155   // constantly modified, isn't a good candidate for raw_span<> for performance
156   // reasons: `bytes_ = bytes_.subspan(N)` would be costly.
157   base::raw_span<const uint8_t> original_bytes_;
158 
159   // Contains the current decoder state. If true, `bytes_` points to a label
160   // character or a return code. If false, `bytes_` points to a sequence of
161   // offsets that indicate the child nodes of the current state.
162   bool bytes_starts_with_label_character_ = false;
163 };
164 
165 }  // namespace net
166 
167 #endif  // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
168