• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <iterator>

namespace pw::tokenizer {

// Reads entries from a binary token string database. This class does not copy
// or modify the contents of the database.
//
// A binary token database is comprised of a 16-byte header followed by an array
// of 8-byte entries and a table of null-terminated strings. The header
// specifies the number of entries. Each entry contains information about a
// tokenized string: the token and removal date, if any. All fields are
// little-endian.
//
//            Header
//            ======
//   Offset  Size  Field
//   -----------------------------------
//        0     6  Magic number (TOKENS)
//        6     2  Version (00 00)
//        8     4  Entry count
//       12     4  Reserved
//
//             Entry
//             =====
//   Offset  Size  Field
//   -----------------------------------
//        0     4  Token
//        4     4  Removal date (d m yy)
//
// Entries are sorted by token. A string table with a null-terminated string for
// each entry in order follows the entries.
//
// Entries are accessed by iterating over the database. An O(n) Find function is
// also provided. In typical use, a TokenDatabase is preprocessed by a
// Detokenizer into a std::unordered_map.
class TokenDatabase {
 public:
  // Internal struct that describes how the underlying binary token database
  // stores entries. RawEntries generally should not be used directly. Instead,
  // use an Entry, which contains a pointer to the entry's string.
  struct RawEntry {
    uint32_t token;
    uint32_t date_removed;
  };

  // The binary format packs each entry into exactly 8 bytes; pointer
  // arithmetic below depends on this.
  static_assert(sizeof(RawEntry) == 8u);

  // An entry in the token database. This struct adds the string to a RawEntry.
  struct Entry {
    // The token calculated for this string.
    uint32_t token;

    // The date the token and string was removed from the database, or
    // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural
    // integer sorting sorts from oldest to newest dates. The date is stored as
    // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian
    // uint32_t.
    uint32_t date_removed;

    // The null-terminated string represented by this token.
    const char* string;
  };

  // Iterator for TokenDatabase values. Note that this is not a normal STL-style
  // iterator, since * returns a value instead of a reference.
  class Iterator {
   public:
    // Creates an iterator positioned at raw_entry, whose corresponding string
    // is at string in the string table.
    constexpr Iterator(const RawEntry* raw_entry, const char* string)
        : raw_(raw_entry), string_(string) {}

    // Constructs a TokenDatabase::Entry for the entry this iterator refers to.
    constexpr Entry entry() const {
      return {raw_->token, raw_->date_removed, string_};
    }

    constexpr Iterator& operator++() {
      raw_ += 1;
      // Move string_ to the character beyond the next null terminator.
      while (*string_++ != '\0') {
      }
      return *this;
    }
    constexpr Iterator operator++(int) {
      Iterator previous(raw_, string_);
      operator++();
      return previous;
    }
    // Iterators are compared by raw entry position only; the string pointer is
    // derived from it and need not be checked.
    constexpr bool operator==(const Iterator& rhs) const {
      return raw_ == rhs.raw_;
    }
    constexpr bool operator!=(const Iterator& rhs) const {
      return raw_ != rhs.raw_;
    }

    // Dereferencing a TokenDatabase::Iterator returns an Entry, not an Entry&.
    constexpr Entry operator*() const { return entry(); }

    // Point directly into the underlying RawEntry. Strings are not accessible
    // via this operator.
    constexpr const RawEntry* operator->() const { return raw_; }

    // Distance between two iterators, in entries.
    constexpr ptrdiff_t operator-(const Iterator& rhs) const {
      return raw_ - rhs.raw_;
    }

   private:
    const RawEntry* raw_;
    const char* string_;
  };

  // A list of token entries returned from a Find operation. This object can be
  // iterated over or indexed as an array.
  class Entries {
   public:
    constexpr Entries(const Iterator& begin, const Iterator& end)
        : begin_(begin), end_(end) {}

    // The number of entries in this list.
    constexpr size_t size() const { return end_ - begin_; }

    // True if the list is empty.
    constexpr bool empty() const { return begin_ == end_; }

    // Accesses the specified entry in this set. Returns an Entry object, which
    // is constructed from the underlying raw entry. The index must be less than
    // size(). This operation is O(n) in size().
    Entry operator[](size_t index) const;

    constexpr const Iterator& begin() const { return begin_; }
    constexpr const Iterator& end() const { return end_; }

   private:
    Iterator begin_;
    Iterator end_;
  };

  // Returns true if the provided data is a valid token database. This checks
  // the magic number ("TOKENS"), version (which must be 0), and that there is
  // one string for each entry in the database. A database with extra strings
  // or other trailing data is considered valid.
  template <typename ByteArray>
  static constexpr bool IsValid(const ByteArray& bytes) {
    return HasValidHeader(bytes) && EachEntryHasAString(bytes);
  }

  // Creates a TokenDatabase and checks if the provided data is valid at compile
  // time. Accepts references to constexpr containers (array, span, string_view,
  // etc.) with static storage duration. For example:
  //
  //   constexpr char kMyData[] = ...;
  //   constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
  //
  template <const auto& kDatabaseBytes>
  static constexpr TokenDatabase Create() {
    static_assert(
        HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
        "Databases must start with a 16-byte header that begins with TOKENS.");

    static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
                  "The database must have at least one string for each entry.");

    return TokenDatabase(std::data(kDatabaseBytes));
  }

  // Creates a TokenDatabase from the provided byte array. The array may be a
  // span, array, or other container type. If the data is not valid, returns a
  // default-constructed database for which ok() is false.
  //
  // Prefer the Create overload that takes the data as a template parameter
  // whenever possible, since that function checks the integrity of the data at
  // compile time.
  template <typename ByteArray>
  static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
    return IsValid<ByteArray>(database_bytes)
               ? TokenDatabase(std::data(database_bytes))
               : TokenDatabase();  // Invalid database.
  }

  // Creates a database with no data. ok() returns false.
  constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}

  // Returns all entries associated with this token. This is a O(n) operation.
  Entries Find(uint32_t token) const;

  // Returns the total number of entries (unique token-string pairs).
  constexpr size_t size() const {
    return (end_.data - begin_.data) / sizeof(RawEntry);
  }

  // True if this database was constructed with valid data.
  constexpr bool ok() const { return begin_.data != nullptr; }

  // Iteration starts at the first raw entry; the string table begins
  // immediately after the last entry (end_.data).
  Iterator begin() const { return Iterator(begin_.entry, end_.data); }
  Iterator end() const { return Iterator(end_.entry, nullptr); }

 private:
  // Mirrors the on-disk 16-byte header layout described in the file comment.
  struct Header {
    std::array<char, 6> magic;
    uint16_t version;
    uint32_t entry_count;
    uint32_t reserved;
  };

  static_assert(sizeof(Header) == 2 * sizeof(RawEntry));

  // Checks that the data is at least header-sized and starts with the expected
  // magic number and version bytes.
  template <typename ByteArray>
  static constexpr bool HasValidHeader(const ByteArray& bytes) {
    static_assert(sizeof(*std::data(bytes)) == 1u);

    if (std::size(bytes) < sizeof(Header)) {
      return false;
    }

    // Check the magic number and version.
    for (size_t i = 0; i < kMagicAndVersion.size(); ++i) {
      if (bytes[i] != kMagicAndVersion[i]) {
        return false;
      }
    }

    return true;
  }

  // Checks that the string table contains at least one null-terminated string
  // per entry. Assumes the header has already been validated.
  template <typename ByteArray>
  static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
    const size_t entries = ReadEntryCount(std::data(bytes));

    // Check that the data is large enough to have a string table.
    if (std::size(bytes) < StringTable(entries)) {
      return false;
    }

    // Count the strings in the string table.
    size_t string_count = 0;
    for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
         ++i) {
      string_count += (*i == '\0') ? 1 : 0;
    }

    // Check that there is at least one string for each entry.
    return string_count >= entries;
  }

  // Reads the number of entries from a database header. Casts the bytes to
  // uint8_t to avoid sign extension if T is signed.
  template <typename T>
  static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
    const T* bytes = header_bytes + offsetof(Header, entry_count);
    return static_cast<uint8_t>(bytes[0]) |
           static_cast<uint8_t>(bytes[1]) << 8 |
           static_cast<uint8_t>(bytes[2]) << 16 |
           static_cast<uint8_t>(bytes[3]) << 24;
  }

  // Calculates the offset of the string table.
  static constexpr size_t StringTable(size_t entries) {
    return sizeof(Header) + entries * sizeof(RawEntry);
  }

  // The magic number that starts the table is "TOKENS". The version is encoded
  // next as two bytes.
  static constexpr std::array<char, 8> kMagicAndVersion = {
      'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};

  // Constructs a database from a validated byte buffer: the entries start
  // after the header and end where the string table begins.
  template <typename Byte>
  constexpr TokenDatabase(const Byte bytes[])
      : TokenDatabase(bytes + sizeof(Header),
                      bytes + StringTable(ReadEntryCount(bytes))) {
    static_assert(sizeof(Byte) == 1u);
  }

  // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
  // use unions. Instead of using a reinterpret_cast to change the byte pointer
  // to a RawEntry pointer, have a separate overload for each byte pointer type
  // and store them in a union.
  constexpr TokenDatabase(const char* begin, const char* end)
      : begin_{.data = begin}, end_{.data = end} {}

  constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
      : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}

  constexpr TokenDatabase(const signed char* begin, const signed char* end)
      : begin_{.signed_data = begin}, end_{.signed_data = end} {}

  // Store the beginning and end pointers as a union to avoid breaking constexpr
  // rules for reinterpret_cast.
  union {
    const RawEntry* entry;
    const char* data;
    const unsigned char* unsigned_data;
    const signed char* signed_data;
  } begin_, end_;
};

}  // namespace pw::tokenizer