// Copyright 2020 The Pigweed Authors // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. #pragma once #include #include #include #include namespace pw::tokenizer { // Reads entries from a binary token string database. This class does not copy // or modify the contents of the database. // // A binary token database is comprised of a 16-byte header followed by an array // of 8-byte entries and a table of null-terminated strings. The header // specifies the number of entries. Each entry contains information about a // tokenized string: the token and removal date, if any. All fields are // little-endian. // // Header // ====== // Offset Size Field // ----------------------------------- // 0 6 Magic number (TOKENS) // 6 2 Version (00 00) // 8 4 Entry count // 12 4 Reserved // // Entry // ===== // Offset Size Field // ----------------------------------- // 0 4 Token // 4 4 Removal date (d m yy) // // Entries are sorted by token. A string table with a null-terminated string for // each entry in order follows the entries. // // Entries are accessed by iterating over the database. A O(n) Find function is // also provided. In typical use, a TokenDatabase is preprocessed by a // Detokenizer into a std::unordered_map. class TokenDatabase { public: // Internal struct that describes how the underlying binary token database // stores entries. RawEntries generally should not be used directly. Instead, // use an Entry, which contains a pointer to the entry's string. struct RawEntry { uint32_t token; uint32_t date_removed; }; static_assert(sizeof(RawEntry) == 8u); // An entry in the token database. This struct adds the string to a RawEntry. struct Entry { // The token calculated for this string. uint32_t token; // The date the token and string was removed from the database, or // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural // integer sorting sorts from oldest to newest dates. The day is stored an // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian // uint32_t. uint32_t date_removed; // The null-terminated string represented by this token. const char* string; }; // Iterator for TokenDatabase values. Note that this is not a normal STL-style // iterator, since * returns a value instead of a reference. class Iterator { public: constexpr Iterator(const RawEntry* raw_entry, const char* string) : raw_(raw_entry), string_(string) {} // Constructs a TokenDatabase::Entry for the entry this iterator refers to. constexpr Entry entry() const { return {raw_->token, raw_->date_removed, string_}; } constexpr Iterator& operator++() { raw_ += 1; // Move string_ to the character beyond the next null terminator. while (*string_++ != '\0') { } return *this; } constexpr Iterator operator++(int) { Iterator previous(raw_, string_); operator++(); return previous; } constexpr bool operator==(const Iterator& rhs) const { return raw_ == rhs.raw_; } constexpr bool operator!=(const Iterator& rhs) const { return raw_ != rhs.raw_; } // Derefencing a TokenDatabase::Iterator returns an Entry, not an Entry&. constexpr Entry operator*() const { return entry(); } // Point directly into the underlying RawEntry. Strings are not accessible // via this operator. constexpr const RawEntry* operator->() const { return raw_; } constexpr ptrdiff_t operator-(const Iterator& rhs) const { return raw_ - rhs.raw_; } private: const RawEntry* raw_; const char* string_; }; // A list of token entries returned from a Find operation. This object can be // iterated over or indexed as an array. class Entries { public: constexpr Entries(const Iterator& begin, const Iterator& end) : begin_(begin), end_(end) {} // The number of entries in this list. constexpr size_t size() const { return end_ - begin_; } // True of the list is empty. constexpr bool empty() const { return begin_ == end_; } // Accesses the specified entry in this set. Returns an Entry object, which // is constructed from the underlying raw entry. The index must be less than // size(). This operation is O(n) in size(). Entry operator[](size_t index) const; constexpr const Iterator& begin() const { return begin_; } constexpr const Iterator& end() const { return end_; } private: Iterator begin_; Iterator end_; }; // Returns true if the provided data is a valid token database. This checks // the magic number ("TOKENS"), version (which must be 0), and that there is // is one string for each entry in the database. A database with extra strings // or other trailing data is considered valid. template static constexpr bool IsValid(const ByteArray& bytes) { return HasValidHeader(bytes) && EachEntryHasAString(bytes); } // Creates a TokenDatabase and checks if the provided data is valid at compile // time. Accepts references to constexpr containers (array, span, string_view, // etc.) with static storage duration. For example: // // constexpr char kMyData[] = ...; // constexpr TokenDatabase db = TokenDatabase::Create(); // template static constexpr TokenDatabase Create() { static_assert( HasValidHeader(kDatabaseBytes), "Databases must start with a 16-byte header that begins with TOKENS."); static_assert(EachEntryHasAString(kDatabaseBytes), "The database must have at least one string for each entry."); return TokenDatabase(std::data(kDatabaseBytes)); } // Creates a TokenDatabase from the provided byte array. The array may be a // span, array, or other container type. If the data is not valid, returns a // default-constructed database for which ok() is false. // // Prefer the Create overload that takes the data as a template parameter // whenever possible, since that function checks the integrity of the data at // compile time. template static constexpr TokenDatabase Create(const ByteArray& database_bytes) { return IsValid(database_bytes) ? TokenDatabase(std::data(database_bytes)) : TokenDatabase(); // Invalid database. } // Creates a database with no data. ok() returns false. constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {} // Returns all entries associated with this token. This is a O(n) operation. Entries Find(uint32_t token) const; // Returns the total number of entries (unique token-string pairs). constexpr size_t size() const { return (end_.data - begin_.data) / sizeof(RawEntry); } // True if this database was constructed with valid data. constexpr bool ok() const { return begin_.data != nullptr; } Iterator begin() const { return Iterator(begin_.entry, end_.data); } Iterator end() const { return Iterator(end_.entry, nullptr); } private: struct Header { std::array magic; uint16_t version; uint32_t entry_count; uint32_t reserved; }; static_assert(sizeof(Header) == 2 * sizeof(RawEntry)); template static constexpr bool HasValidHeader(const ByteArray& bytes) { static_assert(sizeof(*std::data(bytes)) == 1u); if (std::size(bytes) < sizeof(Header)) { return false; } // Check the magic number and version. for (size_t i = 0; i < kMagicAndVersion.size(); ++i) { if (bytes[i] != kMagicAndVersion[i]) { return false; } } return true; } template static constexpr bool EachEntryHasAString(const ByteArray& bytes) { const size_t entries = ReadEntryCount(std::data(bytes)); // Check that the data is large enough to have a string table. if (std::size(bytes) < StringTable(entries)) { return false; } // Count the strings in the string table. size_t string_count = 0; for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes); ++i) { string_count += (*i == '\0') ? 1 : 0; } // Check that there is at least one string for each entry. return string_count >= entries; } // Reads the number of entries from a database header. Cast to the bytes to // uint8_t to avoid sign extension if T is signed. template static constexpr uint32_t ReadEntryCount(const T* header_bytes) { const T* bytes = header_bytes + offsetof(Header, entry_count); return static_cast(bytes[0]) | static_cast(bytes[1]) << 8 | static_cast(bytes[2]) << 16 | static_cast(bytes[3]) << 24; } // Calculates the offset of the string table. static constexpr size_t StringTable(size_t entries) { return sizeof(Header) + entries * sizeof(RawEntry); } // The magic number that starts the table is "TOKENS". The version is encoded // next as two bytes. static constexpr std::array kMagicAndVersion = { 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'}; template constexpr TokenDatabase(const Byte bytes[]) : TokenDatabase(bytes + sizeof(Header), bytes + StringTable(ReadEntryCount(bytes))) { static_assert(sizeof(Byte) == 1u); } // It is illegal to reinterpret_cast in constexpr functions, but acceptable to // use unions. Instead of using a reinterpret_cast to change the byte pointer // to a RawEntry pointer, have a separate overload for each byte pointer type // and store them in a union. constexpr TokenDatabase(const char* begin, const char* end) : begin_{.data = begin}, end_{.data = end} {} constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end) : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {} constexpr TokenDatabase(const signed char* begin, const signed char* end) : begin_{.signed_data = begin}, end_{.signed_data = end} {} // Store the beginning and end pointers as a union to avoid breaking constexpr // rules for reinterpret_cast. union { const RawEntry* entry; const char* data; const unsigned char* unsigned_data; const signed char* signed_data; } begin_, end_; }; } // namespace pw::tokenizer