• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Copyright 2011 Google Inc. All Rights Reserved.
16 // Author: ulas@google.com (Ulas Kirazci)
17 //
18 // Trie for word prefix lookups. Features:
19 //
20 // - Dynamic additions (but not deletions)
21 // - Low memory usage
22 // - Reasonable latency but not QPS
23 // - Revive from persistence is a disk read
24 // - Stores a 4-byte value associated with every key
25 //
26 // Associated with each value in the trie is a set of property ids.  For
27 // efficiency, property ids should start at 0 and be densely packed.  A value
28 // may have more than one id set.  There is an additional deleted property
29 // for each value, which is set only when all the property ids associated with a
30 // value have been cleared.  In the flash_index, property ids are used to track
31 // corpus ids.
32 //
33 // Not thread-safe.
34 
35 #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
36 #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
37 
38 #include <cstdint>
39 #include <memory>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
43 
44 #include "icing/legacy/core/icing-compat.h"
45 #include "icing/legacy/core/icing-packed-pod.h"
46 #include "icing/legacy/index/icing-filesystem.h"
47 #include "icing/legacy/index/icing-mmapper.h"
48 #include "icing/legacy/index/icing-storage.h"
49 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
50 #include "icing/util/i18n-utils.h"
51 #include "unicode/utf8.h"
52 
53 namespace icing {
54 namespace lib {
55 
56 class IcingFlashBitmap;
57 
58 class IcingDynamicTrie : public IIcingStorage {
59   class Dumper;
60   class IcingDynamicTrieStorage;
61 
62  public:
63   // Adjacent bit fields are usually packed automatically. However, that is
64   // implementation specific:
65   // http://en.cppreference.com/w/cpp/language/bit_field
66   // So we'll set packed to be explicit.
67   class Node {
68    public:
69     // This object is only ever used by an ArrayStorage, which allocates
70     // sizeof(Node) bytes, zeroes them out and then casts to a Node.
71     Node() = delete;
72 
next_index()73     uint32_t next_index() const { return next_index_; }
set_next_index(uint32_t next_index)74     void set_next_index(uint32_t next_index) { next_index_ = next_index; }
75 
is_leaf()76     bool is_leaf() const { return is_leaf_; }
set_is_leaf(bool is_leaf)77     void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; }
78 
log2_num_children()79     uint8_t log2_num_children() const { return log2_num_children_; }
set_log2_num_children(uint8_t log2_num_children)80     void set_log2_num_children(uint8_t log2_num_children) {
81       log2_num_children_ = log2_num_children;
82     }
83 
84    private:
85     uint32_t next_index_ : 27;
86     uint32_t is_leaf_ : 1;
87     uint32_t log2_num_children_ : 4;
88   } __attribute__((packed));
89   static_assert(sizeof(Node) == 4, "");
90   static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan");
91 
92   // Adjacent bit fields are usually packed automatically. However, that is
93   // implementation specific:
94   // http://en.cppreference.com/w/cpp/language/bit_field.
95   // So we'll set packed to be explicit.
96   union Next {
Next(uint8_t val,uint32_t node_index)97     Next(uint8_t val, uint32_t node_index) {
98       used.val = val;
99       used.node_index = node_index;
100     }
101 
val()102     uint8_t val() const { return used.val; }
set_val(uint8_t val)103     void set_val(uint8_t val) { used.val = val; }
104 
node_index()105     uint32_t node_index() const { return used.node_index; }
set_node_index(uint32_t node_index)106     void set_node_index(uint32_t node_index) { used.node_index = node_index; }
107 
next_index()108     uint32_t next_index() const { return freelink.next_index; }
set_next_index(uint32_t next_index)109     void set_next_index(uint32_t next_index) {
110       freelink.next_index = next_index;
111     }
112 
113     bool operator<(const Next &next2) const {
114       if (val() == next2.val()) {
115         return node_index() < next2.node_index();
116       }
117       return val() < next2.val();
118     }
119 
120    private:
121     // This object is only ever used by an ArrayStorage, which allocates
122     // sizeof(Node) bytes, zeroes them out and then casts to a Node.
123     Next() = default;
124 
125     struct {
126       uint32_t val : 8;
127       uint32_t node_index : 24;
128     } used;
129     struct {
130       uint32_t next_index : 32;
131     } freelink;
132   } __attribute__((packed));
133   static_assert(sizeof(Next) == 4, "");
134   static_assert(sizeof(Next) % alignof(Next) == 0, "");
135   static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan");
136 
137   static const int kMaxNextArraySize = 256;
138   static const int kNumNextAllocationBuckets = 9;  // [log2(1), log2(256)]
139 
140   static const uint32_t kMaxPropertyId = (1 << 16) - 1;
141 
142   static const uint32_t kInvalidValueIndex = 0;
143 
144   static const uint32_t kNoCrc = 0;
145 
146   struct Stats {
147     uint32_t num_keys;
148 
149     // Node stats
150 
151     uint32_t num_nodes;
152     uint32_t max_nodes;
153     // Count of intermediate nodes.
154     uint32_t num_intermediates;
155     // Total and maximum number of children of intermediate nodes.
156     uint32_t sum_children, max_children;
157 
158     // Count of leaf nodes.
159     uint32_t num_leaves;
160     // Total and maximum depth of leaf nodes.
161     uint32_t sum_depth, max_depth;
162 
163     // Next stats
164 
165     uint32_t num_nexts;
166     uint32_t max_nexts;
167     // Count of next arrays by size.
168     uint32_t child_counts[kMaxNextArraySize];
169     // Wasted next array space per allocation bucket (in Nexts, not
170     // bytes).
171     uint32_t wasted[kNumNextAllocationBuckets];
172     // Sum of wasted array.
173     uint32_t total_wasted;
174 
175     // Suffix stats
176 
177     uint32_t suffixes_size;
178     uint32_t max_suffixes_size;
179     // Bytes actually used by suffixes.
180     uint32_t suffixes_used;
181     // Number of suffixes that are just empty strings.
182     uint32_t null_suffixes;
183 
184     // Next free-list stats
185     uint32_t num_free[kNumNextAllocationBuckets];
186     // Total Next nodes free (weighted sum of the above).
187     uint32_t total_free;
188 
189     // Dirty pages.
190     uint32_t dirty_pages_nodes;
191     uint32_t dirty_pages_nexts;
192     uint32_t dirty_pages_suffixes;
193 
194     // TODO(b/222349894) Convert the string output to a protocol buffer instead.
195     std::string DumpStats(int verbosity) const;
196   };
197 
198   // Options when creating the trie. Maximums for the node/next/suffix
199   // arrays must be specified in advance.
200   struct Options {
201     // Absolute maximums.
202     static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize;
203 
204     // The default takes 13MB of memory and can take about 1M English
205     // words.
OptionsOptions206     Options()
207         : max_nodes(1U << 20),
208           max_nexts(1U << 20),
209           max_suffixes_size(5U << 20),
210           value_size(sizeof(uint32_t)) {}
OptionsOptions211     Options(uint32_t max_nodes_in, uint32_t max_nexts_in,
212             uint32_t max_suffixes_size_in, uint32_t value_size_in)
213         : max_nodes(max_nodes_in),
214           max_nexts(max_nexts_in),
215           max_suffixes_size(max_suffixes_size_in),
216           value_size(value_size_in) {}
217 
218     uint32_t max_nodes;
219     uint32_t max_nexts;
220     uint32_t max_suffixes_size;
221     uint32_t value_size;
222 
223     // True if options do not exceed absolute maximums.
224     bool is_valid() const;
225   };
226 
227   // These can be supplied during runtime, as opposed to the persisted
228   // Options above.
229   struct RuntimeOptions {
230     enum StoragePolicy {
231       // Changes are reflected in the underlying file immediately but
232       // more vulnerable to corruption.
233       kMapSharedWithCrc,
234 
235       // Changes only applied during Flush. Smaller window of
236       // vulnerability to corruption.
237       kExplicitFlush,
238     };
239 
set_storage_policyRuntimeOptions240     RuntimeOptions &set_storage_policy(StoragePolicy sp) {
241       storage_policy = sp;
242       return *this;
243     }
244 
245     StoragePolicy storage_policy = kExplicitFlush;
246   };
247 
max_value_index(const Options & options)248   static uint32_t max_value_index(const Options &options) {
249     return options.max_suffixes_size;
250   }
251 
252   // Light-weight constructor. Real work happens in Create or Init.
253   IcingDynamicTrie(const std::string &filename_base,
254                    const RuntimeOptions &runtime_options,
255                    const IcingFilesystem *filesystem);
256   ~IcingDynamicTrie() override;
257 
is_initialized()258   bool is_initialized() const { return is_initialized_; }
259 
260   // Create, but do not Init, a new trie with options if the file does
261   // not already exist.
262   //
263   // Returns true if successfully created all files or files already
264   // exist. Does not do a complete sanity check for when files seem to
265   // exist. Cleans up files if creation fails midstream.
266   bool CreateIfNotExist(const Options &options);
267 
UpgradeTo(int new_version)268   bool UpgradeTo(int new_version) override { return true; }
269   bool Init() override;
270   void Close() override;
271   bool Remove() override;
272   uint64_t GetDiskUsage() const override;
273 
274   // Returns the size of the elements held in the trie. This excludes the size
275   // of any internal metadata of the trie, e.g. the trie's header.
276   uint64_t GetElementsSize() const;
277 
278   // REQUIRED: For all functions below is_initialized() == true.
279 
280   // Number of keys in trie.
281   uint32_t size() const;
282 
283   // Collecting stats.
284   void CollectStats(Stats *stats) const;
285 
286   // Gets all of the contents of the trie for debugging purposes. Note: this
287   // stores the entire set of terms in memory.
288   //   pretty_print - The tree structure of the trie will be written to this.
289   //   keys - All keys in the trie are appended to this vector.
290   void DumpTrie(std::ostream *pretty_print,
291                 std::vector<std::string> *keys) const;
292 
293   // Empty out the trie without closing or removing.
294   void Clear();
295 
296   // Clears the suffix and value at the given index. Returns true on success.
297   bool ClearSuffixAndValue(uint32_t suffix_value_index);
298 
299   // Resets the next at the given index so that it points to no node.
300   // Returns true on success.
301   bool ResetNext(uint32_t next_index);
302 
303   // Sorts the next array of the node. Returns true on success.
304   bool SortNextArray(const Node *node);
305 
306   // Sync to disk.
307   bool Sync() override;
308 
309   // Tell kernel we will access the memory shortly.
310   void Warm() const;
311 
312   // Potentially about to get nuked.
313   void OnSleep() override;
314 
315   // Compact trie into out for value indices present in old_tvi_to_new_value.
316   class NewValueMap {
317    public:
318     virtual ~NewValueMap();
319 
320     // Returns the new value we want to assign to the entry at old
321     // value index. We don't take ownership of the pointer.
322     virtual const void *GetNewValue(uint32_t old_value_index) const = 0;
323   };
324   // Compacts this trie. This drops all deleted keys, drops all keys for which
325   // old_tvi_to_new_value returns nullptr, updates values to be the values
326   // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into
327   // the trie given in 'out'. 'old_to_new_tvi' is be populated with a mapping of
328   // old value_index to new value_index.
329   bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
330                std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const;
331 
332   // Insert value at key. If key already exists and replace == true,
333   // replaces old value with value. We take a copy of value.
334   //
335   // If value_index is not NULL, returns a pointer to value in
336   // value_index. This can then be used with SetValueAtIndex
337   // below. value_index is not valid past a Clear/Read/Write.
338   //
339   // Returns false if there is no space left in the trie.
340   //
341   // REQUIRES: value a buffer of size value_size()
Insert(const char * key,const void * value)342   bool Insert(const char *key, const void *value) {
343     return Insert(key, value, nullptr, true, nullptr);
344   }
Insert(const char * key,const void * value,uint32_t * value_index,bool replace)345   bool Insert(const char *key, const void *value, uint32_t *value_index,
346               bool replace) {
347     return Insert(key, value, value_index, replace, nullptr);
348   }
349   bool Insert(const char *key, const void *value, uint32_t *value_index,
350               bool replace, bool *pnew_key);
351 
352   // Get a value returned by Insert value_index. This points to the
353   // value in the trie. The pointer is immutable and always valid
354   // while the trie is alive.
355   const void *GetValueAtIndex(uint32_t value_index) const;
356 
357   // Set a value returned by Insert value_index. We take a copy of
358   // value.
359   //
360   // REQUIRES: value a buffer of size value_size()
361   void SetValueAtIndex(uint32_t value_index, const void *value);
362 
363   // Returns true if key is found and sets value. If value_index is
364   // not NULL, returns value_index (see Insert discussion above).
365   // If the key is not found, returns false and neither value nor
366   // value_index is modified.
367   //
368   // REQUIRES: value a buffer of size value_size()
Find(const char * key,void * value)369   bool Find(const char *key, void *value) const {
370     return Find(key, value, nullptr);
371   }
372   bool Find(const char *key, void *value, uint32_t *value_index) const;
373 
374   // Find the input key and all keys that are a variant of the input
375   // key according to a variant map. Currently supports
376   // transliteration. For example "a" is a variant for "à" or "á" so
377   // an "a" in the input key can match those characters in the trie in
378   // addition to itself.
379   //
380   // If prefix is set, also returns any prefix matches (so value_index
381   // will be invalid).
382   //
383   // REQUIRES: all terms in the lexicon to be valid utf8.
384   struct OriginalMatch {
385     uint32_t value_index;
386     std::string orig;
387 
OriginalMatchOriginalMatch388     OriginalMatch() : value_index(kInvalidValueIndex) {}
389 
is_full_matchOriginalMatch390     bool is_full_match() const { return value_index != kInvalidValueIndex; }
391   };
392 
393   static constexpr int kNoBranchFound = -1;
394   // Return prefix of any new branches created if key were inserted. If utf8 is
395   // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches
396   // would be created.
397   int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
398 
399   // Find all prefixes of key where the trie branches. Excludes the key
400   // itself. If utf8 is true, does not cut key mid-utf8.
401   std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
402 
403   void GetDebugInfo(int verbosity, std::string *out) const override;
404 
405   double min_free_fraction() const;
406 
407   uint32_t value_size() const;
408 
409   uint32_t max_value_index() const;
410 
411   // If in kMapSharedWithCrc mode, update crcs and return the master
412   // crc, else return kNoCrc. This crc includes both the trie files
413   // and property bitmaps.
414   uint32_t UpdateCrc();
415 
416   // Store dynamic properties for each value.  When a property is added to
417   // a value, the deleted flag is cleared for it (if it was previously set).
418   bool SetProperty(uint32_t value_index, uint32_t property_id);
419   bool ClearProperty(uint32_t value_index, uint32_t property_id);
420 
421   // Store deleted property for each value.
422   // This method is not the only way the deleted property can be set; the trie
423   // may set this property itself during other operations if it can determine a
424   // value becomes superfluous.
425   bool SetDeleted(uint32_t value_index);
426 
427   // Clears the deleted property for each value.
428   bool ClearDeleted(uint32_t value_index);
429 
430   // Deletes the entry associated with the key. Data can not be recovered after
431   // the deletion. Returns true on success.
432   bool Delete(std::string_view key);
433 
434   // Clear a specific property id from all values.  For each value that has this
435   // property cleared, also check to see if it was the only property set;  if
436   // so, set the deleted property for the value to indicate it no longer has any
437   // properties associated with it.
438   bool ClearPropertyForAllValues(uint32_t property_id);
439 
440   // Access properties. Usage:
441   //
442   // IcingDynamicTrie::PropertyReader reader(trie, 10);
443   // char value[SIZE];
444   // uint32_t value_index;
445   // if (trie.Find("abc", value, &value_index) &&
446   //     reader.HasProperty(value_index)) {
447   //     ...
448   // }
449   //
450   // Readers are valid as long as the underlying trie is open.
451   class PropertyReaderBase {
452    public:
453     // Whether underlying file exists.
454     bool Exists() const;
455 
456     // Returns false for all values if underlying file is missing.
457     bool HasProperty(uint32_t value_index) const;
458 
459    protected:
460     PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted,
461                        uint32_t property_id);
462 
463     // Does not own.
464     const IcingFlashBitmap *bitmap_;
465     const IcingDynamicTrie &trie_;
466   };
467 
468   // Reader for a given property. It is invalidated when the underlying property
469   // is deleted, or the trie is closed.
470   class PropertyReader : public PropertyReaderBase {
471    public:
PropertyReader(const IcingDynamicTrie & trie,uint32_t property_id)472     PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id)
473         : PropertyReaderBase(trie, false, property_id) {}
474   };
475 
476   // Reader for the deleted property. It is invalidated when the trie is closed.
477   class PropertyDeletedReader : public PropertyReaderBase {
478    public:
PropertyDeletedReader(const IcingDynamicTrie & trie)479     explicit PropertyDeletedReader(const IcingDynamicTrie &trie)
480         : PropertyReaderBase(trie, true, 0) {}
481   };
482 
483   // Reader for all properties (but not the deleted one). It is invalidated when
484   // the trie is closed.
485   class PropertyReadersAll {
486    public:
487     explicit PropertyReadersAll(const IcingDynamicTrie &trie);
488 
489     // Whether underlying file for property_id exists.
490     bool Exists(uint32_t property_id) const;
491 
492     // Returns false if underlying file or property doesn't exist.
493     bool HasProperty(uint32_t property_id, uint32_t value_index) const;
494 
495     // Returns true if the value at value_index is set for the only the supplied
496     // property_id, and none of the other properties.
497     bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const;
498 
499     // For iterating.
500     size_t size() const;
501 
502    private:
503     const IcingDynamicTrie &trie_;
504   };
505 
506   // Iterate through trie in lexicographic order.
507   //
508   // Not thread-safe.
509   //
510   // Change in underlying trie invalidates iterator.
511   class Iterator {
512    public:
513     Iterator(const IcingDynamicTrie &trie, const char *prefix);
514     void Reset();
515     bool Advance();
516 
517     // If !IsValid(), GetKey() will return NULL and GetValue() will
518     // return 0.
519     bool IsValid() const;
520     const char *GetKey() const;
521     // This points directly to the underlying data and is valid while
522     // the trie is alive. We keep ownership of the pointer.
523     const void *GetValue() const;
524     uint32_t GetValueIndex() const;
525 
526    private:
527     Iterator();
528     // Copy is ok.
529 
530     // Helper function that takes the left-most branch down
531     // intermediate nodes to a leaf.
532     void LeftBranchToLeaf(uint32_t node_index);
533 
534     std::string cur_key_;
535     const char *cur_suffix_;
536     int cur_suffix_len_;
537     struct Branch {
538       uint32_t node_idx;
539       int child_idx;
540 
BranchBranch541       explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
542     };
543     std::vector<Branch> branch_stack_;
544     bool single_leaf_match_;
545 
546     const IcingDynamicTrie &trie_;
547   };
548 
549   // Represents a non-leaf node or a "virtual" trie node in the suffix
550   // region.
551   struct LogicalNode {
552     const Node *node;
553     int suffix_offset;
554 
LogicalNodeLogicalNode555     LogicalNode() : node(nullptr), suffix_offset(0) {}
LogicalNodeLogicalNode556     LogicalNode(const Node *node_in, int suffix_offset_in)
557         : node(node_in), suffix_offset(suffix_offset_in) {}
558   };
559 
560   // Iterate over all utf8 chars in the trie anchored at prefix (or
561   // node). If trie has invalid utf8 chars, behavior is undefined (but
562   // won't crash).
563   class Utf8Iterator {
564    public:
565     void Reset();
566     bool Advance();
567 
568     bool IsValid() const;
569 
570    private:
571     struct Branch {
572       const Node *node;
573       const Next *child;
574       const Next *child_end;
575 
576       bool IsFinished();
577     };
578 
579     Utf8Iterator();
580     // Copy is ok.
581 
582     void LeftBranchToUtf8End();
583     void InitBranch(Branch *branch, const Node *start, char key_char);
584     void GoIntoSuffix(const Node *node);
585 
586     char cur_[U8_MAX_LENGTH + 1];  // NULL-terminated
587     int cur_len_;
588     LogicalNode cur_logical_node_;
589 
590     Branch branch_stack_[U8_MAX_LENGTH];
591     Branch *branch_end_;
592 
593     const IcingDynamicTrie &trie_;
594     const Node *start_node_;
595   };
596 
597  private:
598   class CandidateSet;
599 
600   // For testing only.
601   friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test;
602   friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
603   friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
604   void GetHeader(IcingDynamicTrieHeader *hdr) const;
605   void SetHeader(const IcingDynamicTrieHeader &new_hdr);
606 
607   static const uint32_t kInvalidSuffixIndex;
608 
609   // Stats helpers.
610   void CollectStatsRecursive(const Node &node, Stats *stats,
611                              uint32_t depth = 0) const;
612 
613   // Helpers for Find and Insert.
614   const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
615   const Next *LowerBound(const Next *start, const Next *end,
616                          uint8_t key_char) const;
617   void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
618                     bool prefix, bool utf8 = false) const;
619 
620   // For value properties.  This truncates the data by clearing it, but leaving
621   // the storage intact.
622   bool InitPropertyBitmaps();
623 
624   // Returns a pointer to a bitmap that is successfully opened.
625   static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap(
626       const std::string &filename, bool verify,
627       const IcingFilesystem *filesystem);
628 
629   // Returns a pointer to a writable bitmap, creating it if necessary.  Returned
630   // pointer should not be freed, it will be maintained by property_bitmaps_.
631   // Returns null if bitmap failed to load.
632   IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id);
633 
634   uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const;
635 
636   const std::string filename_base_;
637   bool is_initialized_;
638   const RuntimeOptions runtime_options_;
639   std::unique_ptr<IcingDynamicTrieStorage> storage_;
640   const std::string property_bitmaps_prefix_;
641   std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_;
642   const std::string deleted_bitmap_filename_;
643   std::unique_ptr<IcingFlashBitmap> deleted_bitmap_;
644   const IcingFilesystem *const filesystem_;
645 };
646 
647 }  // namespace lib
648 }  // namespace icing
649 
650 #endif  // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
651