1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Copyright 2011 Google Inc. All Rights Reserved. 16 // Author: ulas@google.com (Ulas Kirazci) 17 // 18 // Trie for word prefix lookups. Features: 19 // 20 // - Dynamic additions (but not deletions) 21 // - Low memory usage 22 // - Reasonable latency but not QPS 23 // - Revive from persistence is a disk read 24 // - Stores a 4-byte value associated with every key 25 // 26 // Associated with each value in the trie is a set of property ids. For 27 // efficiency, property ids should start at 0 and be densely packed. A value 28 // may have more than one id set. There is an additional deleted property 29 // for each value, which is set only when all the property ids associated with a 30 // value have been cleared. In the flash_index, property ids are used to track 31 // corpus ids. 32 // 33 // Not thread-safe. 34 35 #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 36 #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 37 38 #include <cstdint> 39 #include <memory> 40 #include <string> 41 #include <unordered_map> 42 #include <vector> 43 44 #include "icing/text_classifier/lib3/utils/base/status.h" 45 #include "icing/text_classifier/lib3/utils/base/statusor.h" 46 #include "icing/legacy/core/icing-compat.h" 47 #include "icing/legacy/core/icing-packed-pod.h" 48 #include "icing/legacy/index/icing-filesystem.h" 49 #include "icing/legacy/index/icing-mmapper.h" 50 #include "icing/legacy/index/icing-storage.h" 51 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h" 52 #include "icing/util/i18n-utils.h" 53 #include "unicode/utf8.h" 54 55 namespace icing { 56 namespace lib { 57 58 class IcingFlashBitmap; 59 60 class IcingDynamicTrie : public IIcingStorage { 61 class Dumper; 62 class IcingDynamicTrieStorage; 63 64 public: 65 // Adjacent bit fields are usually packed automatically. However, that is 66 // implementation specific: 67 // http://en.cppreference.com/w/cpp/language/bit_field 68 // So we'll set packed to be explicit. 69 class Node { 70 public: 71 // This object is only ever used by an ArrayStorage, which allocates 72 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 73 Node() = delete; 74 next_index()75 uint32_t next_index() const { return next_index_; } set_next_index(uint32_t next_index)76 void set_next_index(uint32_t next_index) { next_index_ = next_index; } 77 is_leaf()78 bool is_leaf() const { return is_leaf_; } set_is_leaf(bool is_leaf)79 void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; } 80 log2_num_children()81 uint8_t log2_num_children() const { return log2_num_children_; } set_log2_num_children(uint8_t log2_num_children)82 void set_log2_num_children(uint8_t log2_num_children) { 83 log2_num_children_ = log2_num_children; 84 } 85 86 private: 87 uint32_t next_index_ : 27; 88 uint32_t is_leaf_ : 1; 89 uint32_t log2_num_children_ : 4; 90 } __attribute__((packed)); 91 static_assert(sizeof(Node) == 4, ""); 92 static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan"); 93 94 // Adjacent bit fields are usually packed automatically. However, that is 95 // implementation specific: 96 // http://en.cppreference.com/w/cpp/language/bit_field. 97 // So we'll set packed to be explicit. 98 union Next { Next(uint8_t val,uint32_t node_index)99 Next(uint8_t val, uint32_t node_index) { 100 used.val = val; 101 used.node_index = node_index; 102 } 103 val()104 uint8_t val() const { return used.val; } set_val(uint8_t val)105 void set_val(uint8_t val) { used.val = val; } 106 node_index()107 uint32_t node_index() const { return used.node_index; } set_node_index(uint32_t node_index)108 void set_node_index(uint32_t node_index) { used.node_index = node_index; } 109 next_index()110 uint32_t next_index() const { return freelink.next_index; } set_next_index(uint32_t next_index)111 void set_next_index(uint32_t next_index) { 112 freelink.next_index = next_index; 113 } 114 115 bool operator<(const Next &next2) const { 116 if (val() == next2.val()) { 117 return node_index() < next2.node_index(); 118 } 119 return val() < next2.val(); 120 } 121 122 private: 123 // This object is only ever used by an ArrayStorage, which allocates 124 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 125 Next() = default; 126 127 struct { 128 uint32_t val : 8; 129 uint32_t node_index : 24; 130 } used; 131 struct { 132 uint32_t next_index : 32; 133 } freelink; 134 } __attribute__((packed)); 135 static_assert(sizeof(Next) == 4, ""); 136 static_assert(sizeof(Next) % alignof(Next) == 0, ""); 137 static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan"); 138 139 static const int kMaxNextArraySize = 256; 140 static const int kNumNextAllocationBuckets = 9; // [log2(1), log2(256)] 141 142 static const uint32_t kMaxPropertyId = (1 << 16) - 1; 143 144 static const uint32_t kInvalidValueIndex = 0; 145 146 static const uint32_t kNoCrc = 0; 147 148 struct Stats { 149 uint32_t num_keys; 150 151 // Node stats 152 153 uint32_t num_nodes; 154 uint32_t max_nodes; 155 // Count of intermediate nodes. 156 uint32_t num_intermediates; 157 // Total and maximum number of children of intermediate nodes. 158 uint32_t sum_children, max_children; 159 160 // Count of leaf nodes. 161 uint32_t num_leaves; 162 // Total and maximum depth of leaf nodes. 163 uint32_t sum_depth, max_depth; 164 165 // Next stats 166 167 uint32_t num_nexts; 168 uint32_t max_nexts; 169 // Count of next arrays by size. 170 uint32_t child_counts[kMaxNextArraySize]; 171 // Wasted next array space per allocation bucket (in Nexts, not 172 // bytes). 173 uint32_t wasted[kNumNextAllocationBuckets]; 174 // Sum of wasted array. 175 uint32_t total_wasted; 176 177 // Suffix stats 178 179 uint32_t suffixes_size; 180 uint32_t max_suffixes_size; 181 // Bytes actually used by suffixes. 182 uint32_t suffixes_used; 183 // Number of suffixes that are just empty strings. 184 uint32_t null_suffixes; 185 186 // Next free-list stats 187 uint32_t num_free[kNumNextAllocationBuckets]; 188 // Total Next nodes free (weighted sum of the above). 189 uint32_t total_free; 190 191 // Dirty pages. 192 uint32_t dirty_pages_nodes; 193 uint32_t dirty_pages_nexts; 194 uint32_t dirty_pages_suffixes; 195 196 // TODO(b/222349894) Convert the string output to a protocol buffer instead. 197 std::string DumpStats(int verbosity) const; 198 }; 199 200 // Options when creating the trie. Maximums for the node/next/suffix 201 // arrays must be specified in advance. 202 struct Options { 203 // Absolute maximums. 204 static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize; 205 206 // The default takes 13MB of memory and can take about 1M English 207 // words. OptionsOptions208 Options() 209 : max_nodes(1U << 20), 210 max_nexts(1U << 20), 211 max_suffixes_size(5U << 20), 212 value_size(sizeof(uint32_t)) {} OptionsOptions213 Options(uint32_t max_nodes_in, uint32_t max_nexts_in, 214 uint32_t max_suffixes_size_in, uint32_t value_size_in) 215 : max_nodes(max_nodes_in), 216 max_nexts(max_nexts_in), 217 max_suffixes_size(max_suffixes_size_in), 218 value_size(value_size_in) {} 219 220 uint32_t max_nodes; 221 uint32_t max_nexts; 222 uint32_t max_suffixes_size; 223 uint32_t value_size; 224 225 // True if options do not exceed absolute maximums. 226 bool is_valid() const; 227 }; 228 229 // These can be supplied during runtime, as opposed to the persisted 230 // Options above. 231 struct RuntimeOptions { 232 enum StoragePolicy { 233 // Changes are reflected in the underlying file immediately but 234 // more vulnerable to corruption. 235 kMapSharedWithCrc, 236 237 // Changes only applied during Flush. Smaller window of 238 // vulnerability to corruption. 239 kExplicitFlush, 240 }; 241 set_storage_policyRuntimeOptions242 RuntimeOptions &set_storage_policy(StoragePolicy sp) { 243 storage_policy = sp; 244 return *this; 245 } 246 247 StoragePolicy storage_policy = kExplicitFlush; 248 }; 249 max_value_index(const Options & options)250 static uint32_t max_value_index(const Options &options) { 251 return options.max_suffixes_size; 252 } 253 254 // Light-weight constructor. Real work happens in Create or Init. 255 IcingDynamicTrie(const std::string &filename_base, 256 const RuntimeOptions &runtime_options, 257 const IcingFilesystem *filesystem); 258 ~IcingDynamicTrie() override; 259 is_initialized()260 bool is_initialized() const { return is_initialized_; } 261 262 // Create, but do not Init, a new trie with options if the file does 263 // not already exist. 264 // 265 // Returns true if successfully created all files or files already 266 // exist. Does not do a complete sanity check for when files seem to 267 // exist. Cleans up files if creation fails midstream. 268 bool CreateIfNotExist(const Options &options); 269 UpgradeTo(int new_version)270 bool UpgradeTo(int new_version) override { return true; } 271 bool Init() override; 272 void Close() override; 273 bool Remove() override; 274 uint64_t GetDiskUsage() const override; 275 276 // Returns the size of the elements held in the trie. This excludes the size 277 // of any internal metadata of the trie, e.g. the trie's header. 278 uint64_t GetElementsSize() const; 279 280 // REQUIRED: For all functions below is_initialized() == true. 281 282 // Number of keys in trie. 283 uint32_t size() const; 284 285 // Collecting stats. 286 void CollectStats(Stats *stats) const; 287 288 // Gets all of the contents of the trie for debugging purposes. Note: this 289 // stores the entire set of terms in memory. 290 // pretty_print - The tree structure of the trie will be written to this. 291 // keys - All keys in the trie are appended to this vector. 292 void DumpTrie(std::ostream *pretty_print, 293 std::vector<std::string> *keys) const; 294 295 // Empty out the trie without closing or removing. 296 void Clear(); 297 298 // Clears the suffix and value at the given index. Returns true on success. 299 bool ClearSuffixAndValue(uint32_t suffix_value_index); 300 301 // Resets the next at the given index so that it points to no node. 302 // Returns true on success. 303 bool ResetNext(uint32_t next_index); 304 305 // Sorts the next array of the node. Returns true on success. 306 bool SortNextArray(const Node *node); 307 308 // Sync to disk. 309 bool Sync() override; 310 311 // Tell kernel we will access the memory shortly. 312 void Warm() const; 313 314 // Potentially about to get nuked. 315 void OnSleep() override; 316 317 // Insert value at key. If key already exists and replace == true, 318 // replaces old value with value. We take a copy of value. 319 // 320 // If value_index is not NULL, returns a pointer to value in 321 // value_index. This can then be used with SetValueAtIndex 322 // below. value_index is not valid past a Clear/Read/Write. 323 // 324 // REQUIRES: value a buffer of size value_size() 325 // 326 // Returns: 327 // OK on success 328 // RESOURCE_EXHAUSTED if no disk space is available 329 // INTERNAL_ERROR if there are inconsistencies in the dynamic trie. Insert(const char * key,const void * value)330 libtextclassifier3::Status Insert(const char *key, const void *value) { 331 return Insert(key, value, nullptr, true, nullptr); 332 } Insert(const char * key,const void * value,uint32_t * value_index,bool replace)333 libtextclassifier3::Status Insert(const char *key, const void *value, 334 uint32_t *value_index, bool replace) { 335 return Insert(key, value, value_index, replace, nullptr); 336 } 337 libtextclassifier3::Status Insert(const char *key, const void *value, 338 uint32_t *value_index, bool replace, 339 bool *pnew_key); 340 341 // Get a value returned by Insert value_index. This points to the 342 // value in the trie. The pointer is immutable and always valid 343 // while the trie is alive. 344 const void *GetValueAtIndex(uint32_t value_index) const; 345 346 // Set a value returned by Insert value_index. We take a copy of 347 // value. 348 // 349 // REQUIRES: value a buffer of size value_size() 350 void SetValueAtIndex(uint32_t value_index, const void *value); 351 352 // Returns true if key is found and sets value. If value_index is 353 // not NULL, returns value_index (see Insert discussion above). 354 // If the key is not found, returns false and neither value nor 355 // value_index is modified. 356 // 357 // REQUIRES: value a buffer of size value_size() Find(const char * key,void * value)358 bool Find(const char *key, void *value) const { 359 return Find(key, value, nullptr); 360 } 361 bool Find(const char *key, void *value, uint32_t *value_index) const; 362 363 // Find the input key and all keys that are a variant of the input 364 // key according to a variant map. Currently supports 365 // transliteration. For example "a" is a variant for "à" or "á" so 366 // an "a" in the input key can match those characters in the trie in 367 // addition to itself. 368 // 369 // If prefix is set, also returns any prefix matches (so value_index 370 // will be invalid). 371 // 372 // REQUIRES: all terms in the lexicon to be valid utf8. 373 struct OriginalMatch { 374 uint32_t value_index; 375 std::string orig; 376 OriginalMatchOriginalMatch377 OriginalMatch() : value_index(kInvalidValueIndex) {} 378 is_full_matchOriginalMatch379 bool is_full_match() const { return value_index != kInvalidValueIndex; } 380 }; 381 382 static constexpr int kNoBranchFound = -1; 383 // Return prefix of any new branches created if key were inserted. If utf8 is 384 // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches 385 // would be created. 386 int FindNewBranchingPrefixLength(const char *key, bool utf8) const; 387 388 // Find all prefixes of key where the trie branches. Excludes the key 389 // itself. If utf8 is true, does not cut key mid-utf8. 390 std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const; 391 392 // Check if key is a branching term. 393 // 394 // key is a branching term, if and only if there exists terms s1 and s2 in the 395 // trie such that key is the maximum common prefix of s1 and s2, but s1 and s2 396 // are not prefixes of each other. 397 bool IsBranchingTerm(const char *key) const; 398 399 void GetDebugInfo(int verbosity, std::string *out) const override; 400 401 double min_free_fraction() const; 402 403 uint32_t value_size() const; 404 405 uint32_t max_value_index() const; 406 407 // If in kMapSharedWithCrc mode, update crcs and return the master 408 // crc, else return kNoCrc. This crc includes both the trie files 409 // and property bitmaps. 410 uint32_t UpdateCrc(); 411 412 // Store dynamic properties for each value. When a property is added to 413 // a value, the deleted flag is cleared for it (if it was previously set). 414 bool SetProperty(uint32_t value_index, uint32_t property_id); 415 bool ClearProperty(uint32_t value_index, uint32_t property_id); 416 417 // Store deleted property for each value. 418 // This method is not the only way the deleted property can be set; the trie 419 // may set this property itself during other operations if it can determine a 420 // value becomes superfluous. 421 bool SetDeleted(uint32_t value_index); 422 423 // Clears the deleted property for each value. 424 bool ClearDeleted(uint32_t value_index); 425 426 // Deletes the entry associated with the key. Data can not be recovered after 427 // the deletion. Returns true on success. 428 bool Delete(std::string_view key); 429 430 // Clear a specific property id from all values. For each value that has this 431 // property cleared, also check to see if it was the only property set; if 432 // so, set the deleted property for the value to indicate it no longer has any 433 // properties associated with it. 434 bool ClearPropertyForAllValues(uint32_t property_id); 435 436 // Access properties. Usage: 437 // 438 // IcingDynamicTrie::PropertyReader reader(trie, 10); 439 // char value[SIZE]; 440 // uint32_t value_index; 441 // if (trie.Find("abc", value, &value_index) && 442 // reader.HasProperty(value_index)) { 443 // ... 444 // } 445 // 446 // Readers are valid as long as the underlying trie is open. 447 class PropertyReaderBase { 448 public: 449 // Whether underlying file exists. 450 bool Exists() const; 451 452 // Returns false for all values if underlying file is missing. 453 bool HasProperty(uint32_t value_index) const; 454 455 protected: 456 PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted, 457 uint32_t property_id); 458 459 // Does not own. 460 const IcingFlashBitmap *bitmap_; 461 const IcingDynamicTrie &trie_; 462 }; 463 464 // Reader for a given property. It is invalidated when the underlying property 465 // is deleted, or the trie is closed. 466 class PropertyReader : public PropertyReaderBase { 467 public: PropertyReader(const IcingDynamicTrie & trie,uint32_t property_id)468 PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id) 469 : PropertyReaderBase(trie, false, property_id) {} 470 }; 471 472 // Reader for the deleted property. It is invalidated when the trie is closed. 473 class PropertyDeletedReader : public PropertyReaderBase { 474 public: PropertyDeletedReader(const IcingDynamicTrie & trie)475 explicit PropertyDeletedReader(const IcingDynamicTrie &trie) 476 : PropertyReaderBase(trie, true, 0) {} 477 }; 478 479 // Reader for all properties (but not the deleted one). It is invalidated when 480 // the trie is closed. 481 class PropertyReadersAll { 482 public: 483 explicit PropertyReadersAll(const IcingDynamicTrie &trie); 484 485 // Whether underlying file for property_id exists. 486 bool Exists(uint32_t property_id) const; 487 488 // Returns false if underlying file or property doesn't exist. 489 bool HasProperty(uint32_t property_id, uint32_t value_index) const; 490 491 // Returns true if the value at value_index is set for the only the supplied 492 // property_id, and none of the other properties. 493 bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const; 494 495 // For iterating. 496 size_t size() const; 497 498 private: 499 const IcingDynamicTrie &trie_; 500 }; 501 502 // Iterate through trie in lexicographic order. 503 // 504 // Not thread-safe. 505 // 506 // Change in underlying trie invalidates iterator. 507 // 508 // TODO(b/241784804): change IcingDynamicTrie::Iterator to follow the common 509 // iterator pattern in our codebase. 510 class Iterator { 511 public: 512 Iterator(const IcingDynamicTrie &trie, const char *prefix, 513 bool reverse = false); 514 void Reset(); 515 bool Advance(); 516 517 // If !IsValid(), GetKey() will return NULL and GetValue() will 518 // return 0. 519 bool IsValid() const; 520 const char *GetKey() const; 521 // This points directly to the underlying data and is valid while 522 // the trie is alive. We keep ownership of the pointer. 523 const void *GetValue() const; 524 uint32_t GetValueIndex() const; 525 526 private: 527 Iterator(); 528 // Copy is ok. 529 530 enum class BranchType { kLeftMost = 0, kRightMost = 1 }; 531 // Helper function that takes the left-most or the right-most branch down 532 // intermediate nodes to a leaf, based on branch_type. 533 void BranchToLeaf(uint32_t node_index, BranchType branch_type); 534 535 std::string cur_key_; 536 const char *cur_suffix_; 537 int cur_suffix_len_; 538 struct Branch { 539 uint32_t node_idx; 540 int child_idx; 541 BranchBranch542 explicit Branch(uint32_t node_index, int child_index) 543 : node_idx(node_index), child_idx(child_index) {} 544 }; 545 std::vector<Branch> branch_stack_; 546 bool single_leaf_match_; 547 bool reverse_; 548 549 const IcingDynamicTrie &trie_; 550 }; 551 552 // Represents a non-leaf node or a "virtual" trie node in the suffix 553 // region. 554 struct LogicalNode { 555 const Node *node; 556 int suffix_offset; 557 LogicalNodeLogicalNode558 LogicalNode() : node(nullptr), suffix_offset(0) {} LogicalNodeLogicalNode559 LogicalNode(const Node *node_in, int suffix_offset_in) 560 : node(node_in), suffix_offset(suffix_offset_in) {} 561 }; 562 563 // Iterate over all utf8 chars in the trie anchored at prefix (or 564 // node). If trie has invalid utf8 chars, behavior is undefined (but 565 // won't crash). 566 class Utf8Iterator { 567 public: 568 void Reset(); 569 bool Advance(); 570 571 bool IsValid() const; 572 573 private: 574 struct Branch { 575 const Node *node; 576 const Next *child; 577 const Next *child_end; 578 579 bool IsFinished(); 580 }; 581 582 Utf8Iterator(); 583 // Copy is ok. 584 585 void LeftBranchToUtf8End(); 586 void InitBranch(Branch *branch, const Node *start, char key_char); 587 void GoIntoSuffix(const Node *node); 588 589 char cur_[U8_MAX_LENGTH + 1]; // NULL-terminated 590 int cur_len_; 591 LogicalNode cur_logical_node_; 592 593 Branch branch_stack_[U8_MAX_LENGTH]; 594 Branch *branch_end_; 595 596 const IcingDynamicTrie &trie_; 597 const Node *start_node_; 598 }; 599 600 private: 601 class CandidateSet; 602 603 // For testing only. 604 friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test; 605 friend class IcingDynamicTrieTest_SyncErrorRecovery_Test; 606 friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test; 607 void GetHeader(IcingDynamicTrieHeader *hdr) const; 608 void SetHeader(const IcingDynamicTrieHeader &new_hdr); 609 610 static const uint32_t kInvalidSuffixIndex; 611 612 // Stats helpers. 613 void CollectStatsRecursive(const Node &node, Stats *stats, 614 uint32_t depth = 0) const; 615 616 // Helpers for Find and Insert. 617 const Next *GetNextByChar(const Node *node, uint8_t key_char) const; 618 const Next *LowerBound(const Next *start, const Next *end, uint8_t key_char, 619 uint32_t node_index = 0) const; 620 // Returns the number of valid nexts in the array. 621 int GetValidNextsSize(const IcingDynamicTrie::Next *next_array_start, 622 int next_array_length) const; 623 void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset, 624 bool prefix, bool utf8 = false) const; 625 626 // For value properties. This truncates the data by clearing it, but leaving 627 // the storage intact. 628 bool InitPropertyBitmaps(); 629 630 // Returns a pointer to a bitmap that is successfully opened. 631 static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap( 632 const std::string &filename, bool verify, 633 const IcingFilesystem *filesystem); 634 635 // Returns a pointer to a writable bitmap, creating it if necessary. Returned 636 // pointer should not be freed, it will be maintained by property_bitmaps_. 637 // Returns null if bitmap failed to load. 638 IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id); 639 640 uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const; 641 642 const std::string filename_base_; 643 bool is_initialized_; 644 const RuntimeOptions runtime_options_; 645 std::unique_ptr<IcingDynamicTrieStorage> storage_; 646 const std::string property_bitmaps_prefix_; 647 std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_; 648 const std::string deleted_bitmap_filename_; 649 std::unique_ptr<IcingFlashBitmap> deleted_bitmap_; 650 const IcingFilesystem *const filesystem_; 651 }; 652 653 } // namespace lib 654 } // namespace icing 655 656 #endif // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 657