1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Copyright 2011 Google Inc. All Rights Reserved. 16 // Author: ulas@google.com (Ulas Kirazci) 17 // 18 // Trie for word prefix lookups. Features: 19 // 20 // - Dynamic additions (but not deletions) 21 // - Low memory usage 22 // - Reasonable latency but not QPS 23 // - Revive from persistence is a disk read 24 // - Stores a 4-byte value associated with every key 25 // 26 // Associated with each value in the trie is a set of property ids. For 27 // efficiency, property ids should start at 0 and be densely packed. A value 28 // may have more than one id set. There is an additional deleted property 29 // for each value, which is set only when all the property ids associated with a 30 // value have been cleared. In the flash_index, property ids are used to track 31 // corpus ids. 32 // 33 // Not thread-safe. 34 35 #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 36 #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 37 38 #include <cstdint> 39 #include <memory> 40 #include <string> 41 #include <unordered_map> 42 #include <vector> 43 44 #include "icing/legacy/core/icing-compat.h" 45 #include "icing/legacy/core/icing-packed-pod.h" 46 #include "icing/legacy/index/icing-filesystem.h" 47 #include "icing/legacy/index/icing-mmapper.h" 48 #include "icing/legacy/index/icing-storage.h" 49 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h" 50 #include "icing/util/i18n-utils.h" 51 #include "unicode/utf8.h" 52 53 namespace icing { 54 namespace lib { 55 56 class IcingFlashBitmap; 57 58 class IcingDynamicTrie : public IIcingStorage { 59 class Dumper; 60 class IcingDynamicTrieStorage; 61 62 public: 63 // Adjacent bit fields are usually packed automatically. However, that is 64 // implementation specific: 65 // http://en.cppreference.com/w/cpp/language/bit_field 66 // So we'll set packed to be explicit. 67 class Node { 68 public: 69 // This object is only ever used by an ArrayStorage, which allocates 70 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 71 Node() = delete; 72 next_index()73 uint32_t next_index() const { return next_index_; } set_next_index(uint32_t next_index)74 void set_next_index(uint32_t next_index) { next_index_ = next_index; } 75 is_leaf()76 bool is_leaf() const { return is_leaf_; } set_is_leaf(bool is_leaf)77 void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; } 78 log2_num_children()79 uint8_t log2_num_children() const { return log2_num_children_; } set_log2_num_children(uint8_t log2_num_children)80 void set_log2_num_children(uint8_t log2_num_children) { 81 log2_num_children_ = log2_num_children; 82 } 83 84 private: 85 uint32_t next_index_ : 27; 86 uint32_t is_leaf_ : 1; 87 uint32_t log2_num_children_ : 4; 88 } __attribute__((packed)); 89 static_assert(sizeof(Node) == 4, ""); 90 static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan"); 91 92 // Adjacent bit fields are usually packed automatically. However, that is 93 // implementation specific: 94 // http://en.cppreference.com/w/cpp/language/bit_field. 95 // So we'll set packed to be explicit. 96 union Next { Next(uint8_t val,uint32_t node_index)97 Next(uint8_t val, uint32_t node_index) { 98 used.val = val; 99 used.node_index = node_index; 100 } 101 val()102 uint8_t val() const { return used.val; } set_val(uint8_t val)103 void set_val(uint8_t val) { used.val = val; } 104 node_index()105 uint32_t node_index() const { return used.node_index; } set_node_index(uint32_t node_index)106 void set_node_index(uint32_t node_index) { used.node_index = node_index; } 107 next_index()108 uint32_t next_index() const { return freelink.next_index; } set_next_index(uint32_t next_index)109 void set_next_index(uint32_t next_index) { 110 freelink.next_index = next_index; 111 } 112 113 bool operator<(const Next &next2) const { 114 if (val() == next2.val()) { 115 return node_index() < next2.node_index(); 116 } 117 return val() < next2.val(); 118 } 119 120 private: 121 // This object is only ever used by an ArrayStorage, which allocates 122 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 123 Next() = default; 124 125 struct { 126 uint32_t val : 8; 127 uint32_t node_index : 24; 128 } used; 129 struct { 130 uint32_t next_index : 32; 131 } freelink; 132 } __attribute__((packed)); 133 static_assert(sizeof(Next) == 4, ""); 134 static_assert(sizeof(Next) % alignof(Next) == 0, ""); 135 static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan"); 136 137 static const int kMaxNextArraySize = 256; 138 static const int kNumNextAllocationBuckets = 9; // [log2(1), log2(256)] 139 140 static const uint32_t kMaxPropertyId = (1 << 16) - 1; 141 142 static const uint32_t kInvalidValueIndex = 0; 143 144 static const uint32_t kNoCrc = 0; 145 146 struct Stats { 147 uint32_t num_keys; 148 149 // Node stats 150 151 uint32_t num_nodes; 152 uint32_t max_nodes; 153 // Count of intermediate nodes. 154 uint32_t num_intermediates; 155 // Total and maximum number of children of intermediate nodes. 156 uint32_t sum_children, max_children; 157 158 // Count of leaf nodes. 159 uint32_t num_leaves; 160 // Total and maximum depth of leaf nodes. 161 uint32_t sum_depth, max_depth; 162 163 // Next stats 164 165 uint32_t num_nexts; 166 uint32_t max_nexts; 167 // Count of next arrays by size. 168 uint32_t child_counts[kMaxNextArraySize]; 169 // Wasted next array space per allocation bucket (in Nexts, not 170 // bytes). 171 uint32_t wasted[kNumNextAllocationBuckets]; 172 // Sum of wasted array. 173 uint32_t total_wasted; 174 175 // Suffix stats 176 177 uint32_t suffixes_size; 178 uint32_t max_suffixes_size; 179 // Bytes actually used by suffixes. 180 uint32_t suffixes_used; 181 // Number of suffixes that are just empty strings. 182 uint32_t null_suffixes; 183 184 // Next free-list stats 185 uint32_t num_free[kNumNextAllocationBuckets]; 186 // Total Next nodes free (weighted sum of the above). 187 uint32_t total_free; 188 189 // Dirty pages. 190 uint32_t dirty_pages_nodes; 191 uint32_t dirty_pages_nexts; 192 uint32_t dirty_pages_suffixes; 193 194 // TODO(b/222349894) Convert the string output to a protocol buffer instead. 195 std::string DumpStats(int verbosity) const; 196 }; 197 198 // Options when creating the trie. Maximums for the node/next/suffix 199 // arrays must be specified in advance. 200 struct Options { 201 // Absolute maximums. 202 static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize; 203 204 // The default takes 13MB of memory and can take about 1M English 205 // words. OptionsOptions206 Options() 207 : max_nodes(1U << 20), 208 max_nexts(1U << 20), 209 max_suffixes_size(5U << 20), 210 value_size(sizeof(uint32_t)) {} OptionsOptions211 Options(uint32_t max_nodes_in, uint32_t max_nexts_in, 212 uint32_t max_suffixes_size_in, uint32_t value_size_in) 213 : max_nodes(max_nodes_in), 214 max_nexts(max_nexts_in), 215 max_suffixes_size(max_suffixes_size_in), 216 value_size(value_size_in) {} 217 218 uint32_t max_nodes; 219 uint32_t max_nexts; 220 uint32_t max_suffixes_size; 221 uint32_t value_size; 222 223 // True if options do not exceed absolute maximums. 224 bool is_valid() const; 225 }; 226 227 // These can be supplied during runtime, as opposed to the persisted 228 // Options above. 229 struct RuntimeOptions { 230 enum StoragePolicy { 231 // Changes are reflected in the underlying file immediately but 232 // more vulnerable to corruption. 233 kMapSharedWithCrc, 234 235 // Changes only applied during Flush. Smaller window of 236 // vulnerability to corruption. 237 kExplicitFlush, 238 }; 239 set_storage_policyRuntimeOptions240 RuntimeOptions &set_storage_policy(StoragePolicy sp) { 241 storage_policy = sp; 242 return *this; 243 } 244 245 StoragePolicy storage_policy = kExplicitFlush; 246 }; 247 max_value_index(const Options & options)248 static uint32_t max_value_index(const Options &options) { 249 return options.max_suffixes_size; 250 } 251 252 // Light-weight constructor. Real work happens in Create or Init. 253 IcingDynamicTrie(const std::string &filename_base, 254 const RuntimeOptions &runtime_options, 255 const IcingFilesystem *filesystem); 256 ~IcingDynamicTrie() override; 257 is_initialized()258 bool is_initialized() const { return is_initialized_; } 259 260 // Create, but do not Init, a new trie with options if the file does 261 // not already exist. 262 // 263 // Returns true if successfully created all files or files already 264 // exist. Does not do a complete sanity check for when files seem to 265 // exist. Cleans up files if creation fails midstream. 266 bool CreateIfNotExist(const Options &options); 267 UpgradeTo(int new_version)268 bool UpgradeTo(int new_version) override { return true; } 269 bool Init() override; 270 void Close() override; 271 bool Remove() override; 272 uint64_t GetDiskUsage() const override; 273 274 // Returns the size of the elements held in the trie. This excludes the size 275 // of any internal metadata of the trie, e.g. the trie's header. 276 uint64_t GetElementsSize() const; 277 278 // REQUIRED: For all functions below is_initialized() == true. 279 280 // Number of keys in trie. 281 uint32_t size() const; 282 283 // Collecting stats. 284 void CollectStats(Stats *stats) const; 285 286 // Gets all of the contents of the trie for debugging purposes. Note: this 287 // stores the entire set of terms in memory. 288 // pretty_print - The tree structure of the trie will be written to this. 289 // keys - All keys in the trie are appended to this vector. 290 void DumpTrie(std::ostream *pretty_print, 291 std::vector<std::string> *keys) const; 292 293 // Empty out the trie without closing or removing. 294 void Clear(); 295 296 // Clears the suffix and value at the given index. Returns true on success. 297 bool ClearSuffixAndValue(uint32_t suffix_value_index); 298 299 // Resets the next at the given index so that it points to no node. 300 // Returns true on success. 301 bool ResetNext(uint32_t next_index); 302 303 // Sorts the next array of the node. Returns true on success. 304 bool SortNextArray(const Node *node); 305 306 // Sync to disk. 307 bool Sync() override; 308 309 // Tell kernel we will access the memory shortly. 310 void Warm() const; 311 312 // Potentially about to get nuked. 313 void OnSleep() override; 314 315 // Compact trie into out for value indices present in old_tvi_to_new_value. 316 class NewValueMap { 317 public: 318 virtual ~NewValueMap(); 319 320 // Returns the new value we want to assign to the entry at old 321 // value index. We don't take ownership of the pointer. 322 virtual const void *GetNewValue(uint32_t old_value_index) const = 0; 323 }; 324 // Compacts this trie. This drops all deleted keys, drops all keys for which 325 // old_tvi_to_new_value returns nullptr, updates values to be the values 326 // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into 327 // the trie given in 'out'. 'old_to_new_tvi' is be populated with a mapping of 328 // old value_index to new value_index. 329 bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out, 330 std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const; 331 332 // Insert value at key. If key already exists and replace == true, 333 // replaces old value with value. We take a copy of value. 334 // 335 // If value_index is not NULL, returns a pointer to value in 336 // value_index. This can then be used with SetValueAtIndex 337 // below. value_index is not valid past a Clear/Read/Write. 338 // 339 // Returns false if there is no space left in the trie. 340 // 341 // REQUIRES: value a buffer of size value_size() Insert(const char * key,const void * value)342 bool Insert(const char *key, const void *value) { 343 return Insert(key, value, nullptr, true, nullptr); 344 } Insert(const char * key,const void * value,uint32_t * value_index,bool replace)345 bool Insert(const char *key, const void *value, uint32_t *value_index, 346 bool replace) { 347 return Insert(key, value, value_index, replace, nullptr); 348 } 349 bool Insert(const char *key, const void *value, uint32_t *value_index, 350 bool replace, bool *pnew_key); 351 352 // Get a value returned by Insert value_index. This points to the 353 // value in the trie. The pointer is immutable and always valid 354 // while the trie is alive. 355 const void *GetValueAtIndex(uint32_t value_index) const; 356 357 // Set a value returned by Insert value_index. We take a copy of 358 // value. 359 // 360 // REQUIRES: value a buffer of size value_size() 361 void SetValueAtIndex(uint32_t value_index, const void *value); 362 363 // Returns true if key is found and sets value. If value_index is 364 // not NULL, returns value_index (see Insert discussion above). 365 // If the key is not found, returns false and neither value nor 366 // value_index is modified. 367 // 368 // REQUIRES: value a buffer of size value_size() Find(const char * key,void * value)369 bool Find(const char *key, void *value) const { 370 return Find(key, value, nullptr); 371 } 372 bool Find(const char *key, void *value, uint32_t *value_index) const; 373 374 // Find the input key and all keys that are a variant of the input 375 // key according to a variant map. Currently supports 376 // transliteration. For example "a" is a variant for "à" or "á" so 377 // an "a" in the input key can match those characters in the trie in 378 // addition to itself. 379 // 380 // If prefix is set, also returns any prefix matches (so value_index 381 // will be invalid). 382 // 383 // REQUIRES: all terms in the lexicon to be valid utf8. 384 struct OriginalMatch { 385 uint32_t value_index; 386 std::string orig; 387 OriginalMatchOriginalMatch388 OriginalMatch() : value_index(kInvalidValueIndex) {} 389 is_full_matchOriginalMatch390 bool is_full_match() const { return value_index != kInvalidValueIndex; } 391 }; 392 393 static constexpr int kNoBranchFound = -1; 394 // Return prefix of any new branches created if key were inserted. If utf8 is 395 // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches 396 // would be created. 397 int FindNewBranchingPrefixLength(const char *key, bool utf8) const; 398 399 // Find all prefixes of key where the trie branches. Excludes the key 400 // itself. If utf8 is true, does not cut key mid-utf8. 401 std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const; 402 403 void GetDebugInfo(int verbosity, std::string *out) const override; 404 405 double min_free_fraction() const; 406 407 uint32_t value_size() const; 408 409 uint32_t max_value_index() const; 410 411 // If in kMapSharedWithCrc mode, update crcs and return the master 412 // crc, else return kNoCrc. This crc includes both the trie files 413 // and property bitmaps. 414 uint32_t UpdateCrc(); 415 416 // Store dynamic properties for each value. When a property is added to 417 // a value, the deleted flag is cleared for it (if it was previously set). 418 bool SetProperty(uint32_t value_index, uint32_t property_id); 419 bool ClearProperty(uint32_t value_index, uint32_t property_id); 420 421 // Store deleted property for each value. 422 // This method is not the only way the deleted property can be set; the trie 423 // may set this property itself during other operations if it can determine a 424 // value becomes superfluous. 425 bool SetDeleted(uint32_t value_index); 426 427 // Clears the deleted property for each value. 428 bool ClearDeleted(uint32_t value_index); 429 430 // Deletes the entry associated with the key. Data can not be recovered after 431 // the deletion. Returns true on success. 432 bool Delete(std::string_view key); 433 434 // Clear a specific property id from all values. For each value that has this 435 // property cleared, also check to see if it was the only property set; if 436 // so, set the deleted property for the value to indicate it no longer has any 437 // properties associated with it. 438 bool ClearPropertyForAllValues(uint32_t property_id); 439 440 // Access properties. Usage: 441 // 442 // IcingDynamicTrie::PropertyReader reader(trie, 10); 443 // char value[SIZE]; 444 // uint32_t value_index; 445 // if (trie.Find("abc", value, &value_index) && 446 // reader.HasProperty(value_index)) { 447 // ... 448 // } 449 // 450 // Readers are valid as long as the underlying trie is open. 451 class PropertyReaderBase { 452 public: 453 // Whether underlying file exists. 454 bool Exists() const; 455 456 // Returns false for all values if underlying file is missing. 457 bool HasProperty(uint32_t value_index) const; 458 459 protected: 460 PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted, 461 uint32_t property_id); 462 463 // Does not own. 464 const IcingFlashBitmap *bitmap_; 465 const IcingDynamicTrie &trie_; 466 }; 467 468 // Reader for a given property. It is invalidated when the underlying property 469 // is deleted, or the trie is closed. 470 class PropertyReader : public PropertyReaderBase { 471 public: PropertyReader(const IcingDynamicTrie & trie,uint32_t property_id)472 PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id) 473 : PropertyReaderBase(trie, false, property_id) {} 474 }; 475 476 // Reader for the deleted property. It is invalidated when the trie is closed. 477 class PropertyDeletedReader : public PropertyReaderBase { 478 public: PropertyDeletedReader(const IcingDynamicTrie & trie)479 explicit PropertyDeletedReader(const IcingDynamicTrie &trie) 480 : PropertyReaderBase(trie, true, 0) {} 481 }; 482 483 // Reader for all properties (but not the deleted one). It is invalidated when 484 // the trie is closed. 485 class PropertyReadersAll { 486 public: 487 explicit PropertyReadersAll(const IcingDynamicTrie &trie); 488 489 // Whether underlying file for property_id exists. 490 bool Exists(uint32_t property_id) const; 491 492 // Returns false if underlying file or property doesn't exist. 493 bool HasProperty(uint32_t property_id, uint32_t value_index) const; 494 495 // Returns true if the value at value_index is set for the only the supplied 496 // property_id, and none of the other properties. 497 bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const; 498 499 // For iterating. 500 size_t size() const; 501 502 private: 503 const IcingDynamicTrie &trie_; 504 }; 505 506 // Iterate through trie in lexicographic order. 507 // 508 // Not thread-safe. 509 // 510 // Change in underlying trie invalidates iterator. 511 class Iterator { 512 public: 513 Iterator(const IcingDynamicTrie &trie, const char *prefix); 514 void Reset(); 515 bool Advance(); 516 517 // If !IsValid(), GetKey() will return NULL and GetValue() will 518 // return 0. 519 bool IsValid() const; 520 const char *GetKey() const; 521 // This points directly to the underlying data and is valid while 522 // the trie is alive. We keep ownership of the pointer. 523 const void *GetValue() const; 524 uint32_t GetValueIndex() const; 525 526 private: 527 Iterator(); 528 // Copy is ok. 529 530 // Helper function that takes the left-most branch down 531 // intermediate nodes to a leaf. 532 void LeftBranchToLeaf(uint32_t node_index); 533 534 std::string cur_key_; 535 const char *cur_suffix_; 536 int cur_suffix_len_; 537 struct Branch { 538 uint32_t node_idx; 539 int child_idx; 540 BranchBranch541 explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {} 542 }; 543 std::vector<Branch> branch_stack_; 544 bool single_leaf_match_; 545 546 const IcingDynamicTrie &trie_; 547 }; 548 549 // Represents a non-leaf node or a "virtual" trie node in the suffix 550 // region. 551 struct LogicalNode { 552 const Node *node; 553 int suffix_offset; 554 LogicalNodeLogicalNode555 LogicalNode() : node(nullptr), suffix_offset(0) {} LogicalNodeLogicalNode556 LogicalNode(const Node *node_in, int suffix_offset_in) 557 : node(node_in), suffix_offset(suffix_offset_in) {} 558 }; 559 560 // Iterate over all utf8 chars in the trie anchored at prefix (or 561 // node). If trie has invalid utf8 chars, behavior is undefined (but 562 // won't crash). 563 class Utf8Iterator { 564 public: 565 void Reset(); 566 bool Advance(); 567 568 bool IsValid() const; 569 570 private: 571 struct Branch { 572 const Node *node; 573 const Next *child; 574 const Next *child_end; 575 576 bool IsFinished(); 577 }; 578 579 Utf8Iterator(); 580 // Copy is ok. 581 582 void LeftBranchToUtf8End(); 583 void InitBranch(Branch *branch, const Node *start, char key_char); 584 void GoIntoSuffix(const Node *node); 585 586 char cur_[U8_MAX_LENGTH + 1]; // NULL-terminated 587 int cur_len_; 588 LogicalNode cur_logical_node_; 589 590 Branch branch_stack_[U8_MAX_LENGTH]; 591 Branch *branch_end_; 592 593 const IcingDynamicTrie &trie_; 594 const Node *start_node_; 595 }; 596 597 private: 598 class CandidateSet; 599 600 // For testing only. 601 friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test; 602 friend class IcingDynamicTrieTest_SyncErrorRecovery_Test; 603 friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test; 604 void GetHeader(IcingDynamicTrieHeader *hdr) const; 605 void SetHeader(const IcingDynamicTrieHeader &new_hdr); 606 607 static const uint32_t kInvalidSuffixIndex; 608 609 // Stats helpers. 610 void CollectStatsRecursive(const Node &node, Stats *stats, 611 uint32_t depth = 0) const; 612 613 // Helpers for Find and Insert. 614 const Next *GetNextByChar(const Node *node, uint8_t key_char) const; 615 const Next *LowerBound(const Next *start, const Next *end, 616 uint8_t key_char) const; 617 void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset, 618 bool prefix, bool utf8 = false) const; 619 620 // For value properties. This truncates the data by clearing it, but leaving 621 // the storage intact. 622 bool InitPropertyBitmaps(); 623 624 // Returns a pointer to a bitmap that is successfully opened. 625 static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap( 626 const std::string &filename, bool verify, 627 const IcingFilesystem *filesystem); 628 629 // Returns a pointer to a writable bitmap, creating it if necessary. Returned 630 // pointer should not be freed, it will be maintained by property_bitmaps_. 631 // Returns null if bitmap failed to load. 632 IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id); 633 634 uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const; 635 636 const std::string filename_base_; 637 bool is_initialized_; 638 const RuntimeOptions runtime_options_; 639 std::unique_ptr<IcingDynamicTrieStorage> storage_; 640 const std::string property_bitmaps_prefix_; 641 std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_; 642 const std::string deleted_bitmap_filename_; 643 std::unique_ptr<IcingFlashBitmap> deleted_bitmap_; 644 const IcingFilesystem *const filesystem_; 645 }; 646 647 } // namespace lib 648 } // namespace icing 649 650 #endif // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 651