1 // Copyright 2018 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef NET_EXTRAS_PRELOAD_DATA_DECODER_H_ 6 #define NET_EXTRAS_PRELOAD_DATA_DECODER_H_ 7 8 #include <stdint.h> 9 10 #include <string> 11 12 #include "base/memory/raw_ptr.h" 13 14 namespace net::extras { 15 16 // Decodes an entry from preloaded data. 17 // Clients must implement ReadEntry() method to read the specific type of data 18 // they are interested in. 19 class PreloadDecoder { 20 public: 21 // These must match the values in net/tools/huffman_trie/trie/trie_writer.h. 22 enum : char { kEndOfString = 0, kEndOfTable = 127 }; 23 24 // BitReader is a class that allows a bytestring to be read bit-by-bit. 25 class BitReader { 26 public: 27 BitReader(const uint8_t* bytes, size_t num_bits); 28 29 BitReader(const BitReader&) = delete; 30 BitReader& operator=(const BitReader&) = delete; 31 32 // Next sets |*out| to the next bit from the input. It returns false if no 33 // more bits are available or true otherwise. 34 bool Next(bool* out); 35 36 // Read sets the |num_bits| least-significant bits of |*out| to the value of 37 // the next |num_bits| bits from the input. It returns false if there are 38 // insufficient bits in the input or true otherwise. 39 bool Read(unsigned num_bits, uint32_t* out); 40 41 // Decodes a size_t from the reader, putting the resulting value in |*out|. 42 // Returns false if there are insufficient bits to read and true otherwise. 43 // 44 // This function's inverse is TrieBitBuffer::WriteSize. 45 // 46 // The encoding is a prefix code optimized for small values (less than 4). 47 // It is designed for the lengths of prefixes in the HSTS Preload list trie. 48 // Compared to the unary encoding that was previously used (where the number 49 // of bits used is one plus the value being encoded), this uses one more bit 50 // for encoding 0 and 1, and the same number of bits for encoding 2, and 51 // fewer bits for encoding values greater than 2. At the time of writing, 52 // 35% of the lengths encoded in the trie were 0 or 1, 11% were 2, and the 53 // remaining 54% were greater than 2. 54 // 55 // This encoding scheme uses a variable number of bits to encode each value. 56 // There are fixed values for 0, 1, 2, and 3, and then a simple rule is used 57 // for 4 and greater. 0 uses 2 bits; 1 through 3 use 3 bits. The fixed 58 // values are as follows: 59 // 60 // 0: 0b00 61 // 1: 0b100 62 // 2: 0b101 63 // 3: 0b110 64 // 65 // Note that none of the fixed values are prefixed with 0b01 or 0b111. These 66 // prefixes are used with a unary-like encoding for values 4 and above. 67 // Zero or more 1s, followed by a 0, are appended to one of those prefixes. 68 // Even values use the prefix 0b01, and odd values use the prefix 0b111. The 69 // number of 1s to append is half the value (rounded down) minus 1. 70 bool DecodeSize(size_t* out); 71 72 // Seek sets the current offest in the input to bit number |offset|. It 73 // returns true if |offset| is within the range of the input and false 74 // otherwise. 75 bool Seek(size_t offset); 76 77 private: 78 const raw_ptr<const uint8_t, AllowPtrArithmetic> bytes_; 79 const size_t num_bits_; 80 const size_t num_bytes_; 81 // current_byte_index_ contains the current byte offset in |bytes_|. 82 size_t current_byte_index_ = 0; 83 // current_byte_ contains the current byte of the input. 84 uint8_t current_byte_; 85 // num_bits_used_ contains the number of bits of |current_byte_| that have 86 // been read. 87 unsigned num_bits_used_ = 8; 88 }; 89 90 // HuffmanDecoder is a very simple Huffman reader. The input Huffman tree is 91 // simply encoded as a series of two-byte structures. The first byte 92 // determines the "0" pointer for that node and the second the "1" pointer. 93 // Each byte either has the MSB set, in which case the bottom 7 bits are the 94 // value for that position, or else the bottom seven bits contain the index of 95 // a node. 96 // 97 // The tree is decoded by walking rather than a table-driven approach. 98 class HuffmanDecoder { 99 public: 100 HuffmanDecoder(const uint8_t* tree, size_t tree_bytes); 101 102 HuffmanDecoder(const HuffmanDecoder&) = delete; 103 HuffmanDecoder& operator=(const HuffmanDecoder&) = delete; 104 105 bool Decode(PreloadDecoder::BitReader* reader, char* out) const; 106 107 private: 108 const raw_ptr<const uint8_t, AllowPtrArithmetic> tree_; 109 const size_t tree_bytes_; 110 }; 111 112 PreloadDecoder(const uint8_t* huffman_tree, 113 size_t huffman_tree_size, 114 const uint8_t* trie, 115 size_t trie_bits, 116 size_t trie_root_position); 117 118 PreloadDecoder(const PreloadDecoder&) = delete; 119 PreloadDecoder& operator=(const PreloadDecoder&) = delete; 120 121 virtual ~PreloadDecoder(); 122 123 // Resolves search keyword given by |search| in the preloaded data. Returns 124 // false on internal error and true otherwise. After a successful return, 125 // |*out_found| is true iff a relevant entry has been found. In the case of 126 // HSTS data, |search| is the hostname being searched. 127 // 128 // Although this code should be robust, it never processes attacker-controlled 129 // data -- it only operates on the preloaded data built into the binary. 130 // 131 // The preloaded data is represented as a trie and matches |search| 132 // backwards. Each node in the trie starts with a number of characters, which 133 // must match exactly. After that is a dispatch table which maps the next 134 // character in the search keyword to another node in the trie. 135 // 136 // In the dispatch table, the zero character represents the "end of string" 137 // (which is the *beginning* of the search keyword since we process it 138 // backwards). The value in that case is special -- rather than an offset to 139 // another trie node, it contains the searched entry (for HSTS data, it 140 // contains whether subdomains are included, pinsets etc.). Clients must 141 // implement ReadEntry to read the entry at this location. 142 // 143 // Dispatch tables are always given in order, but the "end of string" (zero) 144 // value always comes before an entry for '.'. 145 bool Decode(const std::string& search, bool* out_found); 146 147 protected: 148 virtual bool ReadEntry(BitReader* reader, 149 const std::string& search, 150 size_t current_search_offset, 151 bool* out_found) = 0; 152 huffman_decoder()153 const HuffmanDecoder& huffman_decoder() const { return huffman_decoder_; } 154 155 private: 156 HuffmanDecoder huffman_decoder_; 157 BitReader bit_reader_; 158 159 const size_t trie_root_position_; 160 }; 161 162 } // namespace net::extras 163 164 #endif // NET_EXTRAS_PRELOAD_DATA_DECODER_H_ 165