1 // Copyright 2020 The Pigweed Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not 4 // use this file except in compliance with the License. You may obtain a copy of 5 // the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 // License for the specific language governing permissions and limitations under 13 // the License. 14 #pragma once 15 16 #include <array> 17 #include <cstddef> 18 #include <cstdint> 19 #include <iterator> 20 21 namespace pw::tokenizer { 22 23 // Reads entries from a binary token string database. This class does not copy 24 // or modify the contents of the database. 25 // 26 // A binary token database is comprised of a 16-byte header followed by an array 27 // of 8-byte entries and a table of null-terminated strings. The header 28 // specifies the number of entries. Each entry contains information about a 29 // tokenized string: the token and removal date, if any. All fields are 30 // little-endian. 31 // 32 // Header 33 // ====== 34 // Offset Size Field 35 // ----------------------------------- 36 // 0 6 Magic number (TOKENS) 37 // 6 2 Version (00 00) 38 // 8 4 Entry count 39 // 12 4 Reserved 40 // 41 // Entry 42 // ===== 43 // Offset Size Field 44 // ----------------------------------- 45 // 0 4 Token 46 // 4 4 Removal date (d m yy) 47 // 48 // Entries are sorted by token. A string table with a null-terminated string for 49 // each entry in order follows the entries. 50 // 51 // Entries are accessed by iterating over the database. A O(n) Find function is 52 // also provided. In typical use, a TokenDatabase is preprocessed by a 53 // Detokenizer into a std::unordered_map. 54 class TokenDatabase { 55 public: 56 // Internal struct that describes how the underlying binary token database 57 // stores entries. RawEntries generally should not be used directly. Instead, 58 // use an Entry, which contains a pointer to the entry's string. 59 struct RawEntry { 60 uint32_t token; 61 uint32_t date_removed; 62 }; 63 64 static_assert(sizeof(RawEntry) == 8u); 65 66 // An entry in the token database. This struct adds the string to a RawEntry. 67 struct Entry { 68 // The token calculated for this string. 69 uint32_t token; 70 71 // The date the token and string was removed from the database, or 72 // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural 73 // integer sorting sorts from oldest to newest dates. The day is stored an 74 // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian 75 // uint32_t. 76 uint32_t date_removed; 77 78 // The null-terminated string represented by this token. 79 const char* string; 80 }; 81 82 // Iterator for TokenDatabase values. Note that this is not a normal STL-style 83 // iterator, since * returns a value instead of a reference. 84 class Iterator { 85 public: Iterator(const RawEntry * raw_entry,const char * string)86 constexpr Iterator(const RawEntry* raw_entry, const char* string) 87 : raw_(raw_entry), string_(string) {} 88 89 // Constructs a TokenDatabase::Entry for the entry this iterator refers to. entry()90 constexpr Entry entry() const { 91 return {raw_->token, raw_->date_removed, string_}; 92 } 93 94 constexpr Iterator& operator++() { 95 raw_ += 1; 96 // Move string_ to the character beyond the next null terminator. 97 while (*string_++ != '\0') { 98 } 99 return *this; 100 } 101 constexpr Iterator operator++(int) { 102 Iterator previous(raw_, string_); 103 operator++(); 104 return previous; 105 } 106 constexpr bool operator==(const Iterator& rhs) const { 107 return raw_ == rhs.raw_; 108 } 109 constexpr bool operator!=(const Iterator& rhs) const { 110 return raw_ != rhs.raw_; 111 } 112 113 // Derefencing a TokenDatabase::Iterator returns an Entry, not an Entry&. 114 constexpr Entry operator*() const { return entry(); } 115 116 // Point directly into the underlying RawEntry. Strings are not accessible 117 // via this operator. 118 constexpr const RawEntry* operator->() const { return raw_; } 119 120 constexpr ptrdiff_t operator-(const Iterator& rhs) const { 121 return raw_ - rhs.raw_; 122 } 123 124 private: 125 const RawEntry* raw_; 126 const char* string_; 127 }; 128 129 // A list of token entries returned from a Find operation. This object can be 130 // iterated over or indexed as an array. 131 class Entries { 132 public: Entries(const Iterator & begin,const Iterator & end)133 constexpr Entries(const Iterator& begin, const Iterator& end) 134 : begin_(begin), end_(end) {} 135 136 // The number of entries in this list. size()137 constexpr size_t size() const { return end_ - begin_; } 138 139 // True of the list is empty. empty()140 constexpr bool empty() const { return begin_ == end_; } 141 142 // Accesses the specified entry in this set. Returns an Entry object, which 143 // is constructed from the underlying raw entry. The index must be less than 144 // size(). This operation is O(n) in size(). 145 Entry operator[](size_t index) const; 146 begin()147 constexpr const Iterator& begin() const { return begin_; } end()148 constexpr const Iterator& end() const { return end_; } 149 150 private: 151 Iterator begin_; 152 Iterator end_; 153 }; 154 155 // Returns true if the provided data is a valid token database. This checks 156 // the magic number ("TOKENS"), version (which must be 0), and that there is 157 // is one string for each entry in the database. A database with extra strings 158 // or other trailing data is considered valid. 159 template <typename ByteArray> IsValid(const ByteArray & bytes)160 static constexpr bool IsValid(const ByteArray& bytes) { 161 return HasValidHeader(bytes) && EachEntryHasAString(bytes); 162 } 163 164 // Creates a TokenDatabase and checks if the provided data is valid at compile 165 // time. Accepts references to constexpr containers (array, span, string_view, 166 // etc.) with static storage duration. For example: 167 // 168 // constexpr char kMyData[] = ...; 169 // constexpr TokenDatabase db = TokenDatabase::Create<kMyData>(); 170 // 171 template <const auto& kDatabaseBytes> Create()172 static constexpr TokenDatabase Create() { 173 static_assert( 174 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes), 175 "Databases must start with a 16-byte header that begins with TOKENS."); 176 177 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes), 178 "The database must have at least one string for each entry."); 179 180 return TokenDatabase(std::data(kDatabaseBytes)); 181 } 182 183 // Creates a TokenDatabase from the provided byte array. The array may be a 184 // span, array, or other container type. If the data is not valid, returns a 185 // default-constructed database for which ok() is false. 186 // 187 // Prefer the Create overload that takes the data as a template parameter 188 // whenever possible, since that function checks the integrity of the data at 189 // compile time. 190 template <typename ByteArray> Create(const ByteArray & database_bytes)191 static constexpr TokenDatabase Create(const ByteArray& database_bytes) { 192 return IsValid<ByteArray>(database_bytes) 193 ? TokenDatabase(std::data(database_bytes)) 194 : TokenDatabase(); // Invalid database. 195 } 196 // Creates a database with no data. ok() returns false. TokenDatabase()197 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {} 198 199 // Returns all entries associated with this token. This is a O(n) operation. 200 Entries Find(uint32_t token) const; 201 202 // Returns the total number of entries (unique token-string pairs). size()203 constexpr size_t size() const { 204 return (end_.data - begin_.data) / sizeof(RawEntry); 205 } 206 207 // True if this database was constructed with valid data. ok()208 constexpr bool ok() const { return begin_.data != nullptr; } 209 begin()210 Iterator begin() const { return Iterator(begin_.entry, end_.data); } end()211 Iterator end() const { return Iterator(end_.entry, nullptr); } 212 213 private: 214 struct Header { 215 std::array<char, 6> magic; 216 uint16_t version; 217 uint32_t entry_count; 218 uint32_t reserved; 219 }; 220 221 static_assert(sizeof(Header) == 2 * sizeof(RawEntry)); 222 223 template <typename ByteArray> HasValidHeader(const ByteArray & bytes)224 static constexpr bool HasValidHeader(const ByteArray& bytes) { 225 static_assert(sizeof(*std::data(bytes)) == 1u); 226 227 if (std::size(bytes) < sizeof(Header)) { 228 return false; 229 } 230 231 // Check the magic number and version. 232 for (size_t i = 0; i < kMagicAndVersion.size(); ++i) { 233 if (bytes[i] != kMagicAndVersion[i]) { 234 return false; 235 } 236 } 237 238 return true; 239 } 240 241 template <typename ByteArray> EachEntryHasAString(const ByteArray & bytes)242 static constexpr bool EachEntryHasAString(const ByteArray& bytes) { 243 const size_t entries = ReadEntryCount(std::data(bytes)); 244 245 // Check that the data is large enough to have a string table. 246 if (std::size(bytes) < StringTable(entries)) { 247 return false; 248 } 249 250 // Count the strings in the string table. 251 size_t string_count = 0; 252 for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes); 253 ++i) { 254 string_count += (*i == '\0') ? 1 : 0; 255 } 256 257 // Check that there is at least one string for each entry. 258 return string_count >= entries; 259 } 260 261 // Reads the number of entries from a database header. Cast to the bytes to 262 // uint8_t to avoid sign extension if T is signed. 263 template <typename T> ReadEntryCount(const T * header_bytes)264 static constexpr uint32_t ReadEntryCount(const T* header_bytes) { 265 const T* bytes = header_bytes + offsetof(Header, entry_count); 266 return static_cast<uint8_t>(bytes[0]) | 267 static_cast<uint8_t>(bytes[1]) << 8 | 268 static_cast<uint8_t>(bytes[2]) << 16 | 269 static_cast<uint8_t>(bytes[3]) << 24; 270 } 271 272 // Calculates the offset of the string table. StringTable(size_t entries)273 static constexpr size_t StringTable(size_t entries) { 274 return sizeof(Header) + entries * sizeof(RawEntry); 275 } 276 277 // The magic number that starts the table is "TOKENS". The version is encoded 278 // next as two bytes. 279 static constexpr std::array<char, 8> kMagicAndVersion = { 280 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'}; 281 282 template <typename Byte> TokenDatabase(const Byte bytes[])283 constexpr TokenDatabase(const Byte bytes[]) 284 : TokenDatabase(bytes + sizeof(Header), 285 bytes + StringTable(ReadEntryCount(bytes))) { 286 static_assert(sizeof(Byte) == 1u); 287 } 288 289 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to 290 // use unions. Instead of using a reinterpret_cast to change the byte pointer 291 // to a RawEntry pointer, have a separate overload for each byte pointer type 292 // and store them in a union. TokenDatabase(const char * begin,const char * end)293 constexpr TokenDatabase(const char* begin, const char* end) 294 : begin_{.data = begin}, end_{.data = end} {} 295 TokenDatabase(const unsigned char * begin,const unsigned char * end)296 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end) 297 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {} 298 TokenDatabase(const signed char * begin,const signed char * end)299 constexpr TokenDatabase(const signed char* begin, const signed char* end) 300 : begin_{.signed_data = begin}, end_{.signed_data = end} {} 301 302 // Store the beginning and end pointers as a union to avoid breaking constexpr 303 // rules for reinterpret_cast. 304 union { 305 const RawEntry* entry; 306 const char* data; 307 const unsigned char* unsigned_data; 308 const signed char* signed_data; 309 } begin_, end_; 310 }; 311 312 } // namespace pw::tokenizer 313