• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // We store the trie in three areas: nodes, nexts and suffixes.
16 //
17 // Nodes contain an index to a children array (kept in nexts) or to
18 // suffixes (for leaf nodes). Nexts contain children arrays of
19 // different sizes. Each child entry has the matched char and an index
20 // back into the nodes. Leaf nodes index into suffixes instead of the
21 // nexts array. Each suffix is a NULL-terminated suffix off the trie,
22 // followed by a 4-byte value associated with that key.
23 //
24 // Allocation
25 //
26 // Nodes are allocated and never removed. Nexts contain arrays of
27 // sizes in power-of-2 increments, i.e. 1, 2, 4, ..., 256. When the
28 // number of children of a node increases, it is relocated to an array
29 // with the proper size. The (smaller) unused array is added to a free
30 // list. A free list is kept for each array size. Allocations happen
31 // from the free list first, and then from the end of the nexts
32 // array. Suffixes are never freed or compacted. If a node wants to
33 // refer to a smaller suffix, it moves the pointer forward and the
34 // characters before the new pointer are wasted.
35 //
36 // Keys can contain any character except '\0'. The '\0' char is
37 // special in that it specifies an end-of-key in the child array.
38 //
39 // Ideas to try:
40 //
41 // - Put suffix index in a Next instead of creating a leaf node.
42 // - Change allocation buckets to 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, ..., 256
43 // - Compact next array
44 // - GroupVarByte and delta-encode the next array
45 // - Collapse nodes with single children
46 //
47 // Persistence
48 //
49 // We persist the trie in a binary format such that resurrecting the
50 // trie is simply a few file reads. The file is laid out as such:
51 //
52 // - Header
53 // - Nodes
54 // - Nexts
55 // - Suffixes
56 //
57 // Each section is aligned to IcingMMapper::system_page_size(). The max
58 // requested value for each array is pre-allocated in the file. When
59 // we make modifications to the arrays, we set bits in a dirty bitmap
60 // of pages. No changes get written to disk until an explicit call to
61 // Flush. Then we only write the pages that have their dirty bit set.
62 
63 #include "icing/legacy/index/icing-dynamic-trie.h"
64 
65 #include <fcntl.h>
66 #include <sys/mman.h>
67 #include <sys/stat.h>
68 #include <unistd.h>
69 
70 #include <algorithm>
71 #include <cerrno>
72 #include <cinttypes>
73 #include <cstdint>
74 #include <cstring>
75 #include <memory>
76 #include <utility>
77 
78 #include "icing/absl_ports/canonical_errors.h"
79 #include "icing/legacy/core/icing-packed-pod.h"
80 #include "icing/legacy/core/icing-string-util.h"
81 #include "icing/legacy/core/icing-timer.h"
82 #include "icing/legacy/index/icing-array-storage.h"
83 #include "icing/legacy/index/icing-filesystem.h"
84 #include "icing/legacy/index/icing-flash-bitmap.h"
85 #include "icing/legacy/index/icing-mmapper.h"
86 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
87 #include "icing/util/i18n-utils.h"
88 #include "icing/util/logging.h"
89 #include "icing/util/math-util.h"
90 #include "icing/util/status-macros.h"
91 
92 using std::inplace_merge;
93 using std::lower_bound;
94 using std::max;
95 using std::mismatch;
96 using std::pair;
97 using std::sort;
98 using std::vector;
99 
100 namespace icing {
101 namespace lib {
102 
103 namespace {
104 constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
105 constexpr uint32_t kInvalidNextIndex = ~0U;
106 
ResetMutableNext(IcingDynamicTrie::Next & mutable_next)107 void ResetMutableNext(IcingDynamicTrie::Next &mutable_next) {
108   mutable_next.set_val(0xff);
109   mutable_next.set_node_index(kInvalidNodeIndex);
110 }
111 }  // namespace
112 
113 // Based on the bit field widths.
114 const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
115 const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
116 const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
117 const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
118 
119 const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
120 
121 const int IcingDynamicTrie::kMaxNextArraySize;
122 const int IcingDynamicTrie::kNumNextAllocationBuckets;
123 
124 const uint32_t IcingDynamicTrie::kMaxPropertyId;
125 
126 const uint32_t IcingDynamicTrie::kInvalidValueIndex;
127 
128 const uint32_t IcingDynamicTrie::kNoCrc;
129 
130 // Manages logical node candidates while searching for possible
131 // variant matches. Currently implemented as depth first search. The
132 // max stack depth is key length * variant fanout. Since max variant
133 // fanout is 3, we don't need to worry about blowup of the depth first
134 // search stack.
135 //
136 // Keeps track of original matched string (the string actually present
137 // in the trie) for every candidate.
138 class IcingDynamicTrie::CandidateSet {
139  public:
140   struct Candidate {
141     LogicalNode logical_node;
142     const char *key;
143     int matched_prefix_len;
144     std::string matched_span;
145 
Candidateicing::lib::IcingDynamicTrie::CandidateSet::Candidate146     Candidate() {}
147 
Candidateicing::lib::IcingDynamicTrie::CandidateSet::Candidate148     Candidate(const LogicalNode &logical_node_in, const char *key_in,
149               int matched_prefix_len_in, const char *matched_span_in,
150               int matched_span_len_in)
151         : logical_node(logical_node_in),
152           key(key_in),
153           matched_prefix_len(matched_prefix_len_in),
154           matched_span(matched_span_in, matched_span_len_in) {}
155 
matched_lenicing::lib::IcingDynamicTrie::CandidateSet::Candidate156     int matched_len() const { return matched_prefix_len + matched_span.size(); }
157   };
158 
CandidateSet(bool prefix)159   explicit CandidateSet(bool prefix) : prefix_(prefix) {}
160 
IsTerminal(const char * key,uint32_t value_index) const161   bool IsTerminal(const char *key, uint32_t value_index) const {
162     // Terminal match condition:
163     //
164     // 1. Key was entirely consumed.
165     // 2. The entire suffix was consumed (hence value index is
166     //    valid). OR, we are ok with prefix matches.
167     return *key == 0 && (value_index != kInvalidValueIndex || prefix_);
168   }
169 
170   // Push a terminal or non-terminal.
Push(const LogicalNode & logical_node,const char * key,uint32_t value_index,int matched_prefix_len,const char * matched_span,int matched_span_len)171   void Push(const LogicalNode &logical_node, const char *key,
172             uint32_t value_index, int matched_prefix_len,
173             const char *matched_span, int matched_span_len) {
174     if (!AddMatchIfTerminal(key, value_index, matched_span, matched_span_len)) {
175       PushNonTerminal(logical_node, key, matched_prefix_len, matched_span,
176                       matched_span_len);
177     }
178   }
179 
AddMatchIfTerminal(const char * key,uint32_t value_index,const char * matched_span,int matched_span_len)180   bool AddMatchIfTerminal(const char *key, uint32_t value_index,
181                           const char *matched_span, int matched_span_len) {
182     if (!IsTerminal(key, value_index)) {
183       return false;
184     }
185 
186     // Terminal match.
187     matches_.push_back(OriginalMatch());
188     OriginalMatch *match = &matches_.back();
189     match->value_index = value_index;
190     match->orig.reserve(cur_prefix_.size() + matched_span_len);
191     match->orig.append(cur_prefix_).append(matched_span, matched_span_len);
192     return true;
193   }
194 
195   // Push a definite non-terminal.
PushNonTerminal(const LogicalNode & logical_node,const char * key,int matched_prefix_len,const char * matched_span,int matched_span_len)196   void PushNonTerminal(const LogicalNode &logical_node, const char *key,
197                        int matched_prefix_len, const char *matched_span,
198                        int matched_span_len) {
199     candidates_.push_back(Candidate(logical_node, key, matched_prefix_len,
200                                     matched_span, matched_span_len));
201   }
202 
Pop(Candidate * candidate)203   void Pop(Candidate *candidate) {
204     *candidate = candidates_.back();
205     if (cur_prefix_.size() < candidate->matched_prefix_len) {
206       ICING_LOG(FATAL)
207           << "Length of current prefix is smaller than length of matched "
208              "prefer, there're inconsistencies in dynamic trie.";
209     }
210 
211     cur_prefix_.resize(candidate->matched_prefix_len);
212     cur_prefix_.append(candidate->matched_span);
213     candidates_.pop_back();
214   }
215 
empty() const216   bool empty() const { return candidates_.empty(); }
217 
Release(vector<OriginalMatch> * ret)218   void Release(vector<OriginalMatch> *ret) {
219     if (!empty()) {
220       ICING_LOG(FATAL) << "Candidate set not empty before releasing matches";
221     }
222 
223     ret->swap(matches_);
224 
225     cur_prefix_.clear();
226     candidates_.clear();
227     matches_.clear();
228   }
229 
230  private:
231   const bool prefix_;
232 
233   std::string cur_prefix_;
234   vector<Candidate> candidates_;
235 
236   vector<IcingDynamicTrie::OriginalMatch> matches_;
237 };
238 
239 // Options.
is_valid() const240 bool IcingDynamicTrie::Options::is_valid() const {
241   return max_nodes <= kMaxNodes && max_nodes > 0 && max_nexts <= kMaxNexts &&
242          max_nexts > 0 && max_suffixes_size <= kMaxSuffixesSize &&
243          max_suffixes_size > 0 && value_size <= kMaxValueSize;
244 }
245 
246 // IcingDynamicTrieStorage
247 class IcingDynamicTrie::IcingDynamicTrieStorage {
248  public:
249   IcingDynamicTrieStorage(const std::string &file_basename,
250                           const RuntimeOptions &runtime_options,
251                           const IcingFilesystem *filesystem);
252   ~IcingDynamicTrieStorage();
253 
is_initialized() const254   bool is_initialized() const { return hdr_mmapper_.is_valid(); }
255 
256   bool CreateIfNotExist(const Options &options);
257   bool Init();
258   static bool Remove(const std::string &file_basename,
259                      const IcingFilesystem &filesystem);
260   bool Sync();
261   uint64_t GetDiskUsage() const;
262 
263   // Returns the size of the elements held in the trie. This excludes the size
264   // of any internal metadata of the trie, e.g. the trie's header.
265   uint64_t GetElementsFileSize() const;
266 
267   void Warm();
268 
269   void Clear();
270 
empty() const271   bool empty() const { return hdr().num_nodes() == 0; }
272 
273   // Never cast off these consts when writing to the arrays. Always
274   // use the GetMutable* helpers above.
GetNode(uint32_t idx) const275   const Node *GetNode(uint32_t idx) const {
276     return &array_storage_[NODE].array_cast<Node>()[idx];
277   }
GetRootNode() const278   const Node *GetRootNode() const { return GetNode(0); }
GetNext(uint32_t idx,int child) const279   const Next *GetNext(uint32_t idx, int child) const {
280     return &array_storage_[NEXT].array_cast<Next>()[idx + child];
281   }
GetSuffix(uint32_t idx) const282   const char *GetSuffix(uint32_t idx) const {
283     return &array_storage_[SUFFIX].array_cast<char>()[idx];
284   }
285 
GetNodeIndex(const Node * node) const286   uint32_t GetNodeIndex(const Node *node) const { return node - GetNode(0); }
GetNextArrayIndex(const Next * next) const287   uint32_t GetNextArrayIndex(const Next *next) const {
288     return next - GetNext(0, 0);
289   }
GetSuffixIndex(const char * suffix) const290   uint32_t GetSuffixIndex(const char *suffix) const {
291     return suffix - GetSuffix(0);
292   }
293 
294   // By default, nodes_, nexts_ and suffixes_ are read-only. This
295   // returns a writable element or array within and sets
296   // dirty_pages_[array_type] as a side effect, assuming the mutable
297   // area will get written to.
298   Node *GetMutableNode(uint32_t idx);
299   Next *GetMutableNextArray(uint32_t idx, uint32_t len);
300   char *GetMutableSuffix(uint32_t idx, uint32_t len);
301 
302   // Update crcs based on current contents. Returns all_crc or kNoCrc.
303   uint32_t UpdateCrc();
304 
305   // Allocators.
306   uint32_t nodes_left() const;
307   uint32_t nexts_left() const;
308   uint32_t suffixes_left() const;
309 
310   // REQUIRES: nodes_left() > 0.
311   Node *AllocNode();
312   // REQUIRES: nexts_left() >= kMaxNextArraySize.
313   libtextclassifier3::StatusOr<Next *> AllocNextArray(int size);
314   void FreeNextArray(Next *next, int log2_size);
315   // REQUIRES: suffixes_left() >= strlen(suffix) + 1 + value_size()
316   uint32_t MakeSuffix(const char *suffix, const void *value,
317                       uint32_t *value_index);
318 
hdr() const319   const IcingDynamicTrieHeader &hdr() const { return hdr_.hdr; }
320 
value_size() const321   uint32_t value_size() const { return hdr().value_size(); }
322 
323   void FillDirtyPageStats(Stats *stats) const;
324 
inc_num_keys()325   void inc_num_keys() { hdr_.hdr.set_num_keys(hdr_.hdr.num_keys() + 1); }
326 
327  private:
328   friend void IcingDynamicTrie::SetHeader(
329       const IcingDynamicTrieHeader &new_hdr);
330 
331   enum ArrayType { NODE, NEXT, SUFFIX, NUM_ARRAY_TYPES };
332 
333   // Returns all filenames that are part of the storage. First
334   // filename is the header and the rest correspond to ArrayType enum
335   // values.
336   static void GetFilenames(const std::string &file_basename,
337                            vector<std::string> *filenames);
338   static std::string GetHeaderFilename(const std::string &file_basename);
339 
340   uint32_t GetHeaderCrc() const;
341 
342   uint32_t GetAllCrc() const;
343 
344   uint32_t UpdateCrcInternal(bool write_hdr);
345 
346   // Initializes hdr_ with options and writes the resulting header to disk.
347   bool CreateNewHeader(IcingScopedFd sfd, const Options &options);
348   bool WriteHeader();
349 
350   // Header block. On-disk header block format is as follows:
351   //
352   // |serialized-header|pad|crcs|
353   // <--- system_page_size() --->
354 
355   // Wrapper for header protobuf.
356   class Header {
357     // Serialized format:
358     //
359     // magic(4)|size(4)|serialized hdr(size)
360     static const uint32_t kMagic;
361     // TODO(b/77482303) : Remove version from the IcingFlashBitmap header -
362     // magic makes it unnecessary.
363     static const uint32_t kCurVersion;
364 
365    public:
366     void Init(const Options &options);
367     bool Init(const uint8_t *buf, uint32_t buf_size);
Invalidate()368     void Invalidate() { hdr.Clear(); }
369     bool SerializeToArray(uint8_t *buf, uint32_t buf_size) const;
370     bool Verify();
371 
372     IcingDynamicTrieHeader hdr;
373   };
374 
375   std::string file_basename_;
376 
377   Header hdr_;
378 
379   IcingMMapper hdr_mmapper_;
380 
381   struct Crcs {
382     uint32_t all_crc;
383     uint32_t header_crc;
384     uint32_t array_crcs[NUM_ARRAY_TYPES];
385   };
386   Crcs *crcs_;
387 
serialized_header_max()388   static uint32_t serialized_header_max() {
389     return IcingMMapper::system_page_size() - sizeof(Crcs);
390   }
391 
392   RuntimeOptions runtime_options_;
393 
394   // Info kept about each array (NODE, NEXT, SUFFIX) to manage
395   // storage.
396   IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
397   std::vector<IcingArrayStorage> array_storage_;
398 
399   // Legacy file system. Switch to use the new Filesystem class instead.
400   const IcingFilesystem *filesystem_;
401 };
402 
IcingDynamicTrieStorage(const std::string & file_basename,const RuntimeOptions & runtime_options,const IcingFilesystem * filesystem)403 IcingDynamicTrie::IcingDynamicTrieStorage::IcingDynamicTrieStorage(
404     const std::string &file_basename, const RuntimeOptions &runtime_options,
405     const IcingFilesystem *filesystem)
406     : file_basename_(file_basename),
407       hdr_mmapper_(false, MAP_SHARED),
408       crcs_(nullptr),
409       runtime_options_(runtime_options),
410       array_storage_(NUM_ARRAY_TYPES, IcingArrayStorage(*filesystem)),
411       filesystem_(filesystem) {}
412 
~IcingDynamicTrieStorage()413 IcingDynamicTrie::IcingDynamicTrieStorage::~IcingDynamicTrieStorage() {
414   if (is_initialized()) {
415     for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
416       array_storage_[i].Reset();
417     }
418   }
419 }
420 
GetFilenames(const std::string & file_basename,vector<std::string> * filenames)421 void IcingDynamicTrie::IcingDynamicTrieStorage::GetFilenames(
422     const std::string &file_basename, vector<std::string> *filenames) {
423   const char *kArrayFilenameSuffixes[NUM_ARRAY_TYPES] = {
424       ".n",
425       ".x",
426       ".s",
427   };
428 
429   filenames->clear();
430   filenames->push_back(GetHeaderFilename(file_basename));
431   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
432     filenames->push_back(file_basename + kArrayFilenameSuffixes[i]);
433   }
434 }
435 
GetHeaderFilename(const std::string & file_basename)436 std::string IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderFilename(
437     const std::string &file_basename) {
438   constexpr char kHeaderFilenameSuffix[] = ".h";
439   return file_basename + kHeaderFilenameSuffix;
440 }
441 
Init()442 bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
443   bool init_crcs = false;
444   const bool map_shared =
445       runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc;
446 
447   // Open files.
448   vector<std::string> filenames;
449   GetFilenames(file_basename_, &filenames);
450   for (size_t i = 0; i < filenames.size(); i++) {
451     uint64_t file_size = filesystem_->GetFileSize(filenames[i].c_str());
452     if (file_size == IcingFilesystem::kBadFileSize) {
453       goto failed;
454     }
455     IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
456     if (!sfd.is_valid()) {
457       goto failed;
458     }
459     // The first filename is the header and the rest correspond to ArrayType
460     // enum values. The header's fd can be closed immediately after mmapping
461     // (see b/114830334). Other files' fds are tracked in array_fds_ for later
462     // closing.
463     if (i == 0) {
464       // Header.
465       if (file_size != IcingMMapper::system_page_size()) {
466         ICING_LOG(ERROR) << "Trie hdr wrong size: " << file_size;
467         goto failed;
468       }
469 
470       // Open hdr.
471       hdr_mmapper_.Remap(sfd.get(), 0, IcingMMapper::system_page_size());
472       if (!hdr_mmapper_.is_valid()) {
473         ICING_LOG(ERROR) << "Trie map header failed";
474         goto failed;
475       }
476     } else {
477       array_fds_[i - 1] = std::move(sfd);
478     }
479   }
480 
481   // Point crcs_ to correct region.
482   crcs_ = reinterpret_cast<Crcs *>(hdr_mmapper_.address() +
483                                    serialized_header_max());
484   if (crcs_->header_crc == kNoCrc) {
485     // Create crcs.
486     crcs_->header_crc = GetHeaderCrc();
487 
488     // Do the same for the arrays.
489     init_crcs = true;
490   } else {
491     // Verify crc.
492     if (crcs_->header_crc != GetHeaderCrc()) {
493       ICING_LOG(ERROR) << "Trie header crc failed";
494       goto failed;
495     }
496   }
497 
498   // Deserialize and verify header.
499   if (!hdr_.Init(hdr_mmapper_.address(),
500                  IcingMMapper::system_page_size() - sizeof(Crcs)) ||
501       !hdr_.Verify()) {
502     ICING_LOG(ERROR) << "Trie reading header failed";
503     goto failed;
504   }
505 
506   // We have the header set up. Now read in the arrays.
507   if (!array_storage_[NODE].Init(array_fds_[NODE].get(), 0, map_shared,
508                                  sizeof(Node), hdr_.hdr.num_nodes(),
509                                  hdr_.hdr.max_nodes(), &crcs_->array_crcs[NODE],
510                                  init_crcs)) {
511     ICING_LOG(ERROR) << "Trie mmap node failed";
512     goto failed;
513   }
514 
515   if (!array_storage_[NEXT].Init(array_fds_[NEXT].get(), 0, map_shared,
516                                  sizeof(Next), hdr_.hdr.num_nexts(),
517                                  hdr_.hdr.max_nexts(), &crcs_->array_crcs[NEXT],
518                                  init_crcs)) {
519     ICING_LOG(ERROR) << "Trie mmap next failed";
520     goto failed;
521   }
522 
523   if (!array_storage_[SUFFIX].Init(array_fds_[SUFFIX].get(), 0, map_shared,
524                                    sizeof(char), hdr_.hdr.suffixes_size(),
525                                    hdr_.hdr.max_suffixes_size(),
526                                    &crcs_->array_crcs[SUFFIX], init_crcs)) {
527     ICING_LOG(ERROR) << "Trie mmap suffix failed";
528     goto failed;
529   }
530 
531   // Overall crc.
532   if (init_crcs) {
533     crcs_->all_crc = GetAllCrc();
534   } else {
535     // Verify crc.
536     if (crcs_->all_crc != GetAllCrc()) {
537       ICING_LOG(ERROR) << "Trie all crc failed";
538       goto failed;
539     }
540   }
541 
542   return true;
543 
544 failed:
545   crcs_ = nullptr;
546   hdr_mmapper_.Unmap();
547   hdr_.Invalidate();
548   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
549     array_storage_[i].Reset();
550     array_fds_[i].reset();
551   }
552 
553   return false;
554 }
555 
CreateIfNotExist(const Options & options)556 bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateIfNotExist(
557     const Options &options) {
558   vector<std::string> filenames;
559   GetFilenames(file_basename_, &filenames);
560 
561   // Check already exists. Just header file check is enough.
562   if (filesystem_->FileExists(filenames[0].c_str())) {
563     return true;
564   }
565 
566   // Ensure the storage directory exists
567   std::string storage_dir = filesystem_->GetDirname(filenames[0].c_str());
568   if (!filesystem_->CreateDirectoryRecursively(storage_dir.c_str())) {
569     return false;
570   }
571 
572   // Create files.
573   for (size_t i = 0; i < filenames.size(); i++) {
574     IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
575     if (!sfd.is_valid()) {
576       Remove(file_basename_, *filesystem_);
577       return false;
578     }
579 
580     if (i == 0) {
581       if (!CreateNewHeader(std::move(sfd), options)) {
582         ICING_LOG(ERROR) << "Serialize trie header failed";
583         Remove(file_basename_, *filesystem_);
584         return false;
585       }
586     } else {
587       // Crcs are automatically kNoCrc so they will be initialized
588       // upon first call to Init.
589       if (!filesystem_->Truncate(*sfd, 0)) {
590         Remove(file_basename_, *filesystem_);
591         return false;
592       }
593     }
594   }
595   return true;
596 }
597 
CreateNewHeader(IcingScopedFd sfd,const Options & options)598 bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateNewHeader(
599     IcingScopedFd sfd, const Options &options) {
600   ICING_VLOG(1) << "Creating header with write+sync";
601   hdr_.Init(options);
602   auto buf = std::make_unique<uint8_t[]>(IcingMMapper::system_page_size());
603   // serialized_header_max must be less than system_page_size so we don't
604   // overflow buf when serializing the header.
605   if (serialized_header_max() > IcingMMapper::system_page_size()) {
606     ICING_LOG(FATAL) << "serialized_header_max exceeds system page size";
607   }
608 
609   return hdr_.SerializeToArray(buf.get(), serialized_header_max()) &&
610          filesystem_->Write(sfd.get(), buf.get(),
611                             IcingMMapper::system_page_size()) &&
612          filesystem_->DataSync(sfd.get());
613 }
614 
Remove(const std::string & file_basename,const IcingFilesystem & filesystem)615 bool IcingDynamicTrie::IcingDynamicTrieStorage::Remove(
616     const std::string &file_basename, const IcingFilesystem &filesystem) {
617   bool success = true;
618   vector<std::string> files;
619   GetFilenames(file_basename, &files);
620   for (size_t i = 0; i < files.size(); i++) {
621     if (!filesystem.DeleteFile(files[i].c_str())) {
622       success = false;
623     }
624   }
625   return success;
626 }
627 
Warm()628 void IcingDynamicTrie::IcingDynamicTrieStorage::Warm() {
629   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
630     array_storage_[i].Warm();
631   }
632 }
633 
Clear()634 void IcingDynamicTrie::IcingDynamicTrieStorage::Clear() {
635   if (!is_initialized()) {
636     ICING_LOG(FATAL) << "DynamicTrie not initialized";
637   }
638 
639   // Clear header.
640   hdr_.hdr.set_num_nodes(0);
641   hdr_.hdr.set_num_nexts(0);
642   hdr_.hdr.set_suffixes_size(0);
643   for (int i = 0; i < hdr_.hdr.free_lists_size(); i++) {
644     hdr_.hdr.set_free_lists(i, kInvalidNextIndex);
645   }
646   hdr_.hdr.set_num_keys(0);
647 
648   // Clear array storage.
649   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
650     array_storage_[i].Clear();
651   }
652 
653   // Copy to persistence.
654   WriteHeader();
655 }
656 
Sync()657 bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
658   if (!is_initialized()) {
659     ICING_LOG(FATAL) << "DynamicTrie not initialized";
660   }
661 
662   uint32_t total_flushed = 0;
663   bool success = true;
664 
665   // Sync all array types.
666   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
667     total_flushed += array_storage_[i].Sync();
668     if (!filesystem_->DataSync(array_fds_[i].get())) {
669       ICING_LOG(ERROR) << "Unable to sync data for flushing";
670       success = false;
671     }
672   }
673 
674   if (!WriteHeader()) {
675     ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
676     success = false;
677   }
678 
679   // Need to update CRCs before we sync the header mmap.
680   UpdateCrcInternal(false);
681 
682   // Sync header.
683   if (!hdr_mmapper_.Sync()) {
684     ICING_LOG(ERROR) << "Unable to sync trie header for flushing";
685     success = false;
686   }
687 
688   if (total_flushed > 0) {
689     ICING_VLOG(1) << "Flushing " << total_flushed << " pages of trie";
690   }
691 
692   return success;
693 }
694 
GetDiskUsage() const695 uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const {
696   // Trie files themselves.
697   uint64_t total = 0;
698   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
699     IcingFilesystem::IncrementByOrSetInvalid(
700         filesystem_->GetDiskUsage(array_fds_[i].get()), &total);
701   }
702 
703   // Header.
704   std::string header_filename = GetHeaderFilename(file_basename_);
705   IcingFilesystem::IncrementByOrSetInvalid(
706       filesystem_->GetFileDiskUsage(header_filename.c_str()), &total);
707 
708   return total;
709 }
710 
GetElementsFileSize() const711 uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize()
712     const {
713   // Trie files themselves, exclude size of the header. These arrays are dense,
714   // not sparse, so use file size for more accurate numbers.
715   uint64_t total = 0;
716   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
717     IcingFilesystem::IncrementByOrSetInvalid(
718         filesystem_->GetFileSize(array_fds_[i].get()), &total);
719   }
720   return total;
721 }
722 
AllocNode()723 IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
724   if (nodes_left() == 0) {
725     ICING_LOG(FATAL) << "No allocated nodes left";
726   }
727 
728   hdr_.hdr.set_num_nodes(hdr_.hdr.num_nodes() + 1);
729   return GetMutableNode(hdr_.hdr.num_nodes() - 1);
730 }
731 
732 libtextclassifier3::StatusOr<IcingDynamicTrie::Next *>
AllocNextArray(int size)733 IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) {
734   if (size > kMaxNextArraySize) {
735     return absl_ports::InternalError(
736         "Array size exceeds the max 'next' array size");
737   }
738 
739   if (nexts_left() < static_cast<uint32_t>(kMaxNextArraySize)) {
740     ICING_LOG(FATAL) << "'next' buffer not enough";
741   }
742 
743   // Compute ceil(log2(size)).
744   int log2_size = 0;
745   while ((1 << log2_size) < size) log2_size++;
746   // Note: size <= aligned_size <= kMaxNextArraySize
747   int aligned_size = 1 << log2_size;
748 
749   // Look in free list.
750   Next *ret;
751   if (hdr_.hdr.free_lists(log2_size) != kInvalidNextIndex) {
752     ret = GetMutableNextArray(hdr_.hdr.free_lists(log2_size), aligned_size);
753     uint32_t next_link = ret->next_index();
754     if (next_link != kInvalidNextIndex && next_link >= hdr_.hdr.max_nexts()) {
755       ICING_LOG(FATAL) << "'next' index is out of range";
756     }
757     hdr_.hdr.set_free_lists(log2_size, next_link);
758   } else {
759     // Allocate a new one.
760     ret = GetMutableNextArray(hdr_.hdr.num_nexts(), aligned_size);
761     hdr_.hdr.set_num_nexts(hdr_.hdr.num_nexts() + aligned_size);
762   }
763 
764   // Fill with char 0xff so we are sorted properly.
765   for (int i = 0; i < aligned_size; i++) {
766     ResetMutableNext(ret[i]);
767   }
768   return ret;
769 }
770 
FreeNextArray(Next * next,int log2_size)771 void IcingDynamicTrie::IcingDynamicTrieStorage::FreeNextArray(Next *next,
772                                                               int log2_size) {
773   if (GetNextArrayIndex(next) + (1 << log2_size) > hdr_.hdr.max_nexts()) {
774     ICING_LOG(FATAL) << "'next' array is out of range";
775   }
776 
777   // Put it in free list.
778   next->set_next_index(hdr_.hdr.free_lists(log2_size));
779   hdr_.hdr.set_free_lists(log2_size, GetNextArrayIndex(next));
780 }
781 
MakeSuffix(const char * suffix,const void * value,uint32_t * value_index)782 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::MakeSuffix(
783     const char *suffix, const void *value, uint32_t *value_index) {
784   int suffix_len = strlen(suffix);
785   if (suffixes_left() < suffix_len + 1 + value_size()) {
786     ICING_LOG(FATAL) << "'suffix' buffer not enough";
787   }
788 
789   char *start =
790       GetMutableSuffix(hdr_.hdr.suffixes_size(), suffix_len + 1 + value_size());
791   memcpy(start, suffix, suffix_len + 1);
792   memcpy(start + suffix_len + 1, value, value_size());
793   if (value_index) *value_index = GetSuffixIndex(start + suffix_len + 1);
794   hdr_.hdr.set_suffixes_size(hdr_.hdr.suffixes_size() + suffix_len + 1 +
795                              value_size());
796 
797   return GetSuffixIndex(start);
798 }
799 
GetHeaderCrc() const800 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderCrc() const {
801   return IcingStringUtil::UpdateCrc32(
802       0, reinterpret_cast<const char *>(hdr_mmapper_.address()),
803       serialized_header_max());
804 }
805 
GetAllCrc() const806 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::GetAllCrc() const {
807   // Append array crcs to header crc.
808   return IcingStringUtil::UpdateCrc32(
809       crcs_->header_crc, reinterpret_cast<const char *>(crcs_->array_crcs),
810       sizeof(crcs_->array_crcs));
811 }
812 
UpdateCrc()813 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrc() {
814   return UpdateCrcInternal(true);
815 }
816 
UpdateCrcInternal(bool write_hdr)817 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrcInternal(
818     bool write_hdr) {
819   if (write_hdr && !WriteHeader()) {
820     ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
821   }
822 
823   crcs_->header_crc = GetHeaderCrc();
824 
825   for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
826     array_storage_[i].UpdateCrc();
827   }
828 
829   crcs_->all_crc = GetAllCrc();
830 
831   return crcs_->all_crc;
832 }
833 
WriteHeader()834 bool IcingDynamicTrie::IcingDynamicTrieStorage::WriteHeader() {
835   return hdr_.SerializeToArray(hdr_mmapper_.address(), serialized_header_max());
836 }
837 
838 IcingDynamicTrie::Node *
GetMutableNode(uint32_t idx)839 IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNode(uint32_t idx) {
840   return array_storage_[NODE].GetMutableMem<Node>(idx, 1);
841 }
842 
843 IcingDynamicTrie::Next *
GetMutableNextArray(uint32_t idx,uint32_t len)844 IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNextArray(uint32_t idx,
845                                                                uint32_t len) {
846   return array_storage_[NEXT].GetMutableMem<Next>(idx, len);
847 }
848 
GetMutableSuffix(uint32_t idx,uint32_t len)849 char *IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableSuffix(
850     uint32_t idx, uint32_t len) {
851   return array_storage_[SUFFIX].GetMutableMem<char>(idx, len);
852 }
853 
854 // Header functions.
855 const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kMagic =
856     0x6dfba6ae;
857 // For future revisions, this should be synced with global index version.
858 // See comments on Upgrade() in native-index-impl.h for versioning.
859 const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kCurVersion =
860     4;
861 
Init(const IcingDynamicTrie::Options & options)862 void IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
863     const IcingDynamicTrie::Options &options) {
864   hdr.Clear();
865 
866   hdr.set_version(kCurVersion);
867   hdr.set_max_nodes(options.max_nodes);
868   hdr.set_max_nexts(options.max_nexts);
869   hdr.set_max_suffixes_size(options.max_suffixes_size);
870   hdr.set_value_size(options.value_size);
871 
872   for (int i = 0; i < kNumNextAllocationBuckets; i++) {
873     hdr.add_free_lists(kInvalidNextIndex);
874   }
875 }
876 
Init(const uint8_t * buf,uint32_t buf_size)877 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
878     const uint8_t *buf, uint32_t buf_size) {
879   // Check magic and length.
880   if (buf_size <= sizeof(kMagic) + sizeof(uint32_t)) {
881     ICING_LOG(ERROR) << "Trie header too short";
882     return false;
883   }
884 
885   uint32_t magic;
886   memcpy(&magic, buf, sizeof(magic));
887   if (magic != kMagic) {
888     ICING_LOG(ERROR) << "Trie header magic mismatch";
889     return false;
890   }
891   uint32_t len;
892   memcpy(&len, buf + sizeof(magic), sizeof(len));
893   if (len > buf_size - sizeof(magic) - sizeof(len)) {
894     ICING_LOG(ERROR) << "Trie header too short";
895     return false;
896   }
897 
898   return hdr.ParseFromArray(buf + sizeof(magic) + sizeof(len), len);
899 }
900 
SerializeToArray(uint8_t * buf,uint32_t buf_size) const901 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
902     uint8_t *buf, uint32_t buf_size) const {
903   uint32_t size = hdr.ByteSizeLong();
904   if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
905   memcpy(buf, &kMagic, sizeof(kMagic));
906   memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
907   hdr.SerializeWithCachedSizesToArray(buf + sizeof(kMagic) + sizeof(uint32_t));
908   return true;
909 }
910 
Verify()911 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Verify() {
912   // Check version.
913   if (hdr.version() != kCurVersion) {
914     ICING_LOG(ERROR) << "Trie version " << hdr.version() << " mismatch";
915     return false;
916   }
917 
918   // Check that indices in hdr are within bounds. Note that this is
919   // not a comprehensive integrity check for the entire trie.
920   if (hdr.num_nodes() > hdr.max_nodes() || hdr.num_nexts() > hdr.max_nexts() ||
921       hdr.suffixes_size() > hdr.max_suffixes_size() ||
922       hdr.value_size() >= hdr.max_suffixes_size()) {
923     ICING_LOG(ERROR) << "Trie header array size out of bounds";
924     return false;
925   }
926 
927   if (hdr.free_lists_size() != kNumNextAllocationBuckets) {
928     ICING_LOG(ERROR) << "Bad number of free lists";
929     return false;
930   }
931 
932   for (int i = 0; i < kNumNextAllocationBuckets; i++) {
933     if (hdr.free_lists(i) != kInvalidNextIndex &&
934         hdr.free_lists(i) >= hdr.max_nexts()) {
935       ICING_LOG(ERROR) << "Free list index out of bounds";
936       return false;
937     }
938   }
939 
940   return true;
941 }
942 
nodes_left() const943 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nodes_left() const {
944   return hdr_.hdr.max_nodes() - hdr_.hdr.num_nodes();
945 }
946 
nexts_left() const947 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nexts_left() const {
948   return hdr_.hdr.max_nexts() - hdr_.hdr.num_nexts();
949 }
950 
suffixes_left() const951 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::suffixes_left() const {
952   return hdr_.hdr.max_suffixes_size() - hdr_.hdr.suffixes_size();
953 }
954 
FillDirtyPageStats(Stats * stats) const955 void IcingDynamicTrie::IcingDynamicTrieStorage::FillDirtyPageStats(
956     Stats *stats) const {
957   stats->dirty_pages_nodes = array_storage_[NODE].num_dirty_pages();
958   stats->dirty_pages_nexts = array_storage_[NEXT].num_dirty_pages();
959   stats->dirty_pages_suffixes = array_storage_[SUFFIX].num_dirty_pages();
960 }
961 
962 // Dumper.
963 class IcingDynamicTrie::Dumper {
964  public:
Dumper(const IcingDynamicTrie & trie)965   explicit Dumper(const IcingDynamicTrie &trie)
966       : all_props_(trie), del_prop_(trie), storage_(trie.storage_.get()) {}
967 
Dump(std::ostream * pretty_print,vector<std::string> * keys) const968   void Dump(std::ostream *pretty_print, vector<std::string> *keys) const {
969     if (storage_->empty()) {
970       *pretty_print << "(empty)\n";
971     } else {
972       DumpNodeRecursive("", *storage_->GetRootNode(), 0, pretty_print, keys);
973     }
974   }
975 
976  private:
SuffixToValueAsString(const char * suffix) const977   std::string SuffixToValueAsString(const char *suffix) const {
978     int suffix_len = strlen(suffix);
979     std::string ret;
980     ret.reserve(storage_->value_size() * 2);
981     for (uint32_t i = 0; i < storage_->value_size(); i++) {
982       IcingStringUtil::SStringAppendF(&ret, 10, "%02x",
983                                       suffix[suffix_len + 1 + i]);
984     }
985 
986     // Now dump set properties.
987     uint32_t value_index = storage_->GetSuffixIndex(suffix + suffix_len + 1);
988     if (del_prop_.HasProperty(value_index)) {
989       ret += " (deleted)";
990     }
991     ret += " [";
992     for (size_t i = 0; i < all_props_.size(); i++) {
993       if (all_props_.HasProperty(i, value_index)) {
994         IcingStringUtil::SStringAppendF(&ret, 10, "%zu", i);
995       }
996     }
997     ret += ']';
998 
999     return ret;
1000   }
1001 
1002   // Inputs:
1003   //   prefix - the key prefix of the current node (so we can rebuild the key)
1004   //   node - the node we're at
1005   //   level - how many levels deep we are in the trie
1006   //   ret - the stream to pretty print to
1007   //   keys - the keys encountered are appended to this
DumpNodeRecursive(const std::string & prefix,const Node & node,int level,std::ostream * ret,vector<std::string> * keys) const1008   void DumpNodeRecursive(const std::string &prefix, const Node &node, int level,
1009                          std::ostream *ret, vector<std::string> *keys) const {
1010     if (node.is_leaf()) {
1011       // Dump suffix and value.
1012       for (int i = 0; i < level; i++) {
1013         *ret << ' ';
1014       }
1015       const char *suffix = storage_->GetSuffix(node.next_index());
1016       *ret << suffix;
1017       *ret << ' ';
1018       *ret << SuffixToValueAsString(suffix);
1019       *ret << '\n';
1020       keys->push_back(prefix + suffix);
1021     } else {
1022       // Go through each child (next) node. Print char and recursively
1023       // print trie underneath.
1024       for (uint32_t i = 0; i < (1U << node.log2_num_children()); i++) {
1025         const Next &next = *storage_->GetNext(node.next_index(), i);
1026         if (next.node_index() == kInvalidNodeIndex) break;
1027         for (int j = 0; j < level; j++) {
1028           *ret << ' ';
1029         }
1030         std::string new_prefix = prefix;
1031         if (next.val()) {
1032           *ret << static_cast<char>(next.val());
1033           new_prefix += next.val();
1034         } else {
1035           *ret << "null";
1036         }
1037         *ret << '\n';
1038         DumpNodeRecursive(new_prefix, *storage_->GetNode(next.node_index()),
1039                           level + 1, ret, keys);
1040       }
1041     }
1042   }
1043 
1044   PropertyReadersAll all_props_;
1045   PropertyDeletedReader del_prop_;
1046   const IcingDynamicTrie::IcingDynamicTrieStorage *storage_;
1047 };
1048 
1049 // IcingDynamicTrie.
IcingDynamicTrie(const std::string & filename_base,const RuntimeOptions & runtime_options,const IcingFilesystem * filesystem)1050 IcingDynamicTrie::IcingDynamicTrie(const std::string &filename_base,
1051                                    const RuntimeOptions &runtime_options,
1052                                    const IcingFilesystem *filesystem)
1053     : IIcingStorage(),
1054       filename_base_(filename_base),
1055       is_initialized_(false),
1056       runtime_options_(runtime_options),
1057       storage_(nullptr),
1058       property_bitmaps_prefix_(filename_base_ + ".prop."),
1059       deleted_bitmap_filename_(filename_base_ + ".deleted"),
1060       deleted_bitmap_(nullptr),
1061       filesystem_(filesystem) {}
1062 
~IcingDynamicTrie()1063 IcingDynamicTrie::~IcingDynamicTrie() { Close(); }
1064 
Init()1065 bool IcingDynamicTrie::Init() {
1066   if (is_initialized_) return true;
1067 
1068   if (storage_ != nullptr) {
1069     ICING_LOG(FATAL) << "Storage is not null before initialization";
1070   }
1071 
1072   storage_ = std::make_unique<IcingDynamicTrieStorage>(
1073       filename_base_, runtime_options_, filesystem_);
1074   if (!storage_->Init() || !InitPropertyBitmaps()) {
1075     storage_.reset();
1076     return false;
1077   }
1078   is_initialized_ = true;
1079   return true;
1080 }
1081 
CreateIfNotExist(const Options & options)1082 bool IcingDynamicTrie::CreateIfNotExist(const Options &options) {
1083   // Initialized means exists.
1084   if (is_initialized_) return true;
1085 
1086   if (!options.is_valid()) {
1087     ICING_LOG(ERROR) << "Trie options invalid";
1088     return false;
1089   }
1090 
1091   auto storage = std::make_unique<IcingDynamicTrieStorage>(
1092       filename_base_, runtime_options_, filesystem_);
1093   return storage->CreateIfNotExist(options);
1094 }
1095 
Close()1096 void IcingDynamicTrie::Close() {
1097   if (!is_initialized_) return;
1098 
1099   UpdateCrc();
1100 
1101   storage_.reset();
1102   property_bitmaps_.clear();
1103   deleted_bitmap_.reset();
1104   is_initialized_ = false;
1105 }
1106 
Remove()1107 bool IcingDynamicTrie::Remove() {
1108   if (is_initialized()) {
1109     Close();
1110   }
1111 
1112   bool success = true;
1113 
1114   // Remove storage files.
1115   if (!IcingDynamicTrieStorage::Remove(filename_base_, *filesystem_)) {
1116     success = false;
1117   }
1118 
1119   // Also remove property bitmaps.
1120   vector<std::string> files;
1121   if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
1122                                      &files)) {
1123     return false;
1124   }
1125   for (size_t i = 0; i < files.size(); i++) {
1126     if (!filesystem_->DeleteFile(files[i].c_str())) success = false;
1127   }
1128   // And deleted bitmap.
1129   if (!filesystem_->DeleteFile(deleted_bitmap_filename_.c_str()))
1130     success = false;
1131 
1132   return success;
1133 }
1134 
Sync()1135 bool IcingDynamicTrie::Sync() {
1136   if (!is_initialized_) {
1137     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1138   }
1139 
1140   bool success = true;
1141   IcingTimer timer;
1142 
1143   // Sync property bitmaps.
1144   for (size_t i = 0; i < property_bitmaps_.size(); i++) {
1145     if (property_bitmaps_[i]) {
1146       if (!property_bitmaps_[i]->Sync()) success = false;
1147     }
1148   }
1149   if (!deleted_bitmap_->Sync()) success = false;
1150 
1151   // Sync storage.
1152   if (!storage_->Sync()) success = false;
1153 
1154   Warm();
1155 
1156   ICING_VLOG(1) << "Syncing dynamic trie " << filename_base_.c_str()
1157       << " took " << timer.Elapsed() * 1000 << "ms";
1158 
1159   return success;
1160 }
1161 
GetDiskUsage() const1162 uint64_t IcingDynamicTrie::GetDiskUsage() const {
1163   uint64_t total = 0;
1164   // Property bitmaps.
1165   IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
1166                                            &total);
1167 
1168   for (auto &bitmap : property_bitmaps_) {
1169     if (bitmap == nullptr) continue;
1170     IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
1171   }
1172 
1173   // Storage.
1174   IcingFilesystem::IncrementByOrSetInvalid(storage_->GetDiskUsage(), &total);
1175   return total;
1176 }
1177 
GetElementsSize() const1178 uint64_t IcingDynamicTrie::GetElementsSize() const {
1179   uint64_t total = 0;
1180 
1181   // Bitmaps are sparsely populated, so disk usage is more accurate for those.
1182   // Property bitmaps.
1183   IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
1184                                            &total);
1185   // The deleted bitmap is always initially grown to kGrowSize, whether there
1186   // are elements or not. So even if there are no elements in the trie, we'll
1187   // still have the bitmap of size kGrowSize, so subtract that from the size of
1188   // the trie's elements.
1189   total -= IcingFlashBitmap::kGrowSize;
1190 
1191   for (auto &bitmap : property_bitmaps_) {
1192     if (bitmap == nullptr) continue;
1193     IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
1194   }
1195 
1196   // Storage. We can use file size here since the storage files aren't sparse.
1197   IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(),
1198                                            &total);
1199   return total;
1200 }
1201 
OpenAndInitBitmap(const std::string & filename,bool verify,const IcingFilesystem * filesystem)1202 std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
1203     const std::string &filename, bool verify,
1204     const IcingFilesystem *filesystem) {
1205   auto bitmap = std::make_unique<IcingFlashBitmap>(filename, filesystem);
1206   if (!bitmap->Init() || (verify && !bitmap->Verify())) {
1207     ICING_LOG(ERROR) << "Init of " << filename.c_str() << " failed";
1208     return nullptr;
1209   }
1210   return bitmap;
1211 }
1212 
InitPropertyBitmaps()1213 bool IcingDynamicTrie::InitPropertyBitmaps() {
1214   // Only called on init.
1215   if (!property_bitmaps_.empty()) {
1216     ICING_LOG(FATAL) << "Property bitmaps not empty before initialization";
1217   }
1218 
1219   if (deleted_bitmap_ != nullptr) {
1220     ICING_LOG(FATAL) << "Deleted bitmap not null before initialization";
1221   }
1222 
1223   // Truncate property bitmap files at current value index. Last value
1224   // is at suffixes_size - value_size(). We want to clear everything
1225   // after that.
1226   uint64_t truncate_idx =
1227       storage_->hdr().suffixes_size() > 0
1228           ? ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size() -
1229                                             value_size()) +
1230                 1
1231           : 0;
1232 
1233   // Discover property bitmaps by scanning the dir.
1234   vector<std::string> files;
1235   if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
1236                                      &files)) {
1237     ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
1238     goto failed;
1239   }
1240   for (size_t i = 0; i < files.size(); i++) {
1241     // Decode property id from filename.
1242     size_t property_id_start_idx = files[i].rfind('.');
1243     if (property_id_start_idx == std::string::npos) {
1244       ICING_LOG(ERROR) << "Malformed filename " << files[i];
1245       continue;
1246     }
1247     property_id_start_idx++;  // skip dot
1248     char *end;
1249     uint32_t property_id =
1250         strtol(files[i].c_str() + property_id_start_idx, &end, 10);  // NOLINT
1251     if (!end || end != (files[i].c_str() + files[i].size())) {
1252       ICING_LOG(ERROR) << "Malformed filename " << files[i];
1253       continue;
1254     }
1255     std::unique_ptr<IcingFlashBitmap> bitmap = OpenAndInitBitmap(
1256         files[i],
1257         runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
1258         filesystem_);
1259     if (!bitmap) {
1260       ICING_LOG(ERROR) << "Open prop bitmap failed: " << files[i];
1261       goto failed;
1262     }
1263     bitmap->Truncate(truncate_idx);
1264     if (property_id >= property_bitmaps_.size()) {
1265       property_bitmaps_.resize(property_id + 1);
1266     }
1267     property_bitmaps_[property_id] = std::move(bitmap);
1268   }
1269 
1270   deleted_bitmap_ = OpenAndInitBitmap(
1271       deleted_bitmap_filename_,
1272       runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
1273       filesystem_);
1274   if (!deleted_bitmap_) {
1275     goto failed;
1276   }
1277   deleted_bitmap_->Truncate(truncate_idx);
1278 
1279   return true;
1280 
1281 failed:
1282   property_bitmaps_.clear();
1283   deleted_bitmap_.reset();
1284   return false;
1285 }
1286 
Warm() const1287 void IcingDynamicTrie::Warm() const {
1288   if (!is_initialized()) {
1289     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1290   }
1291 
1292   return storage_->Warm();
1293 }
1294 
OnSleep()1295 void IcingDynamicTrie::OnSleep() {
1296   if (!is_initialized()) {
1297     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1298   }
1299 
1300   // Update crcs so we can verify when we come back.
1301   UpdateCrc();
1302 }
1303 
size() const1304 uint32_t IcingDynamicTrie::size() const {
1305   if (!is_initialized()) {
1306     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1307   }
1308   return storage_->hdr().num_keys();
1309 }
1310 
CollectStatsRecursive(const Node & node,Stats * stats,uint32_t depth) const1311 void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
1312                                              uint32_t depth) const {
1313   if (node.is_leaf()) {
1314     stats->num_leaves++;
1315     stats->sum_depth += depth;
1316     stats->max_depth = max(stats->max_depth, depth);
1317     const char *suffix = storage_->GetSuffix(node.next_index());
1318     stats->suffixes_used += strlen(suffix) + 1 + value_size();
1319     if (!suffix[0]) {
1320       stats->null_suffixes++;
1321     }
1322   } else {
1323     stats->num_intermediates++;
1324     uint32_t i = 0;
1325     for (; i < (1U << node.log2_num_children()); i++) {
1326       const Next &next = *storage_->GetNext(node.next_index(), i);
1327       if (next.node_index() == kInvalidNodeIndex) break;
1328       CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
1329                             depth + 1);
1330     }
1331 
1332     // At least one valid node in each next array
1333     if (i == 0) {
1334       ICING_LOG(FATAL) << "No valid node in 'next' array";
1335     }
1336     stats->sum_children += i;
1337     stats->max_children = max(stats->max_children, i);
1338 
1339     stats->child_counts[i - 1]++;
1340     stats->wasted[node.log2_num_children()] +=
1341         (1 << node.log2_num_children()) - i;
1342     stats->total_wasted += (1 << node.log2_num_children()) - i;
1343   }
1344 }
1345 
CollectStats(Stats * stats) const1346 void IcingDynamicTrie::CollectStats(Stats *stats) const {
1347   if (!is_initialized()) {
1348     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1349   }
1350 
1351   memset(stats, 0, sizeof(*stats));
1352 
1353   stats->num_keys = storage_->hdr().num_keys();
1354   stats->num_nodes = storage_->hdr().num_nodes();
1355   stats->max_nodes = storage_->hdr().max_nodes();
1356   stats->num_nexts = storage_->hdr().num_nexts();
1357   stats->max_nexts = storage_->hdr().max_nexts();
1358   stats->suffixes_size = storage_->hdr().suffixes_size();
1359   stats->max_suffixes_size = storage_->hdr().max_suffixes_size();
1360 
1361   // Stats collected from traversing the trie.
1362   if (!storage_->empty()) {
1363     CollectStatsRecursive(*storage_->GetRootNode(), stats);
1364   }
1365 
1366   // Free-list stats.
1367   for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1368     for (uint32_t cur = storage_->hdr().free_lists(i); cur != kInvalidNextIndex;
1369          cur = storage_->GetNext(cur, 0)->next_index()) {
1370       stats->num_free[i]++;
1371     }
1372     stats->total_free += stats->num_free[i] * (1 << i);
1373   }
1374 
1375   // Dirty page counts.
1376   storage_->FillDirtyPageStats(stats);
1377 }
1378 
DumpStats(int verbosity) const1379 std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
1380   std::string ret;
1381   IcingStringUtil::SStringAppendF(
1382       &ret, 0,
1383       "Keys %u "
1384       "Nodes (%u/%u) %.3f%% "
1385       "Nexts (%u/%u) %.3f%% "
1386       "Suffixes (%u/%u) %.3f%%\n",
1387       num_keys, num_nodes, max_nodes,
1388       100. * math_util::SafeDivide(num_nodes, max_nodes), num_nexts, max_nexts,
1389       100. * math_util::SafeDivide(num_nexts, max_nexts), suffixes_size,
1390       max_suffixes_size,
1391       100. * math_util::SafeDivide(suffixes_size, max_suffixes_size));
1392 
1393   if (verbosity > 0) {
1394     for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1395       if (num_free[i] > 0) {
1396         IcingStringUtil::SStringAppendF(&ret, 0, "Freelist@%d: %u\n", 1 << i,
1397                                         num_free[i]);
1398       }
1399     }
1400     IcingStringUtil::SStringAppendF(
1401         &ret, 0, "Freelist total: %u/%u %.3f%%\n", total_free, num_nexts,
1402         100. * math_util::SafeDivide(total_free, num_nexts));
1403 
1404     for (int i = 0; i < 256; i++) {
1405       if (child_counts[i] > 0) {
1406         IcingStringUtil::SStringAppendF(&ret, 0, "Child count@%d: %u\n", i + 1,
1407                                         child_counts[i]);
1408       }
1409     }
1410     for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1411       IcingStringUtil::SStringAppendF(&ret, 0, "Wasted@%d: %u\n", 1 << i,
1412                                       wasted[i]);
1413     }
1414     IcingStringUtil::SStringAppendF(
1415         &ret, 0,
1416         "Wasted total: %u\n"
1417         "Num intermediates %u num leaves %u "
1418         "suffixes used %u null %u\n"
1419         "avg and max children for intermediates: %.3f, %u\n"
1420         "avg and max depth for leaves: %.3f, %u\n"
1421         "Total next frag: %.3f%%\n",
1422         total_wasted, num_intermediates, num_leaves, suffixes_used,
1423         null_suffixes, 1. * sum_children / num_intermediates, max_children,
1424         1. * sum_depth / num_leaves, max_depth,
1425         100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
1426   }
1427   IcingStringUtil::SStringAppendF(
1428       &ret, 0, "Memory usage: %zu/%zu bytes\n",
1429       num_nodes * sizeof(Node) + num_nexts * sizeof(Next) + suffixes_size,
1430       max_nodes * sizeof(Node) + max_nexts * sizeof(Next) + max_suffixes_size);
1431 
1432   IcingStringUtil::SStringAppendF(
1433       &ret, 0, "Dirty pages: nodes %u/%.0f nexts %u/%.0f suffixes %u/%.0f\n",
1434       dirty_pages_nodes,
1435       math_util::SafeDivide(num_nodes * sizeof(Node) + getpagesize() - 1,
1436                             getpagesize()),
1437       dirty_pages_nexts,
1438       math_util::SafeDivide(num_nexts * sizeof(Next) + getpagesize() - 1,
1439                             getpagesize()),
1440       dirty_pages_suffixes,
1441       math_util::SafeDivide(suffixes_size + getpagesize() - 1, getpagesize()));
1442 
1443   return ret;
1444 }
1445 
DumpTrie(std::ostream * pretty_print,vector<std::string> * keys) const1446 void IcingDynamicTrie::DumpTrie(std::ostream *pretty_print,
1447                                 vector<std::string> *keys) const {
1448   if (!is_initialized()) {
1449     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1450   }
1451 
1452   Dumper dumper(*this);
1453   dumper.Dump(pretty_print, keys);
1454 }
1455 
Clear()1456 void IcingDynamicTrie::Clear() {
1457   if (!is_initialized()) {
1458     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1459   }
1460 
1461   storage_->Clear();
1462   for (auto &bitmap : property_bitmaps_) {
1463     if (bitmap) {
1464       bitmap->Delete();
1465       bitmap.reset();
1466     }
1467   }
1468   deleted_bitmap_->Truncate(0);
1469 }
1470 
ClearSuffixAndValue(uint32_t suffix_value_index)1471 bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
1472   // The size 1 below is for a '\0' between the suffix and the value.
1473   size_t suffix_and_value_length =
1474       strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
1475       this->value_size();
1476   char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
1477       suffix_value_index, suffix_and_value_length);
1478 
1479   if (mutable_suffix_and_value == nullptr) {
1480     return false;
1481   }
1482 
1483   memset(mutable_suffix_and_value, 0, suffix_and_value_length);
1484   return true;
1485 }
1486 
ResetNext(uint32_t next_index)1487 bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
1488   Next *mutable_next =
1489       this->storage_->GetMutableNextArray(next_index, /*len=*/1);
1490 
1491   if (mutable_next == nullptr) {
1492     return false;
1493   }
1494   ResetMutableNext(*mutable_next);
1495   return true;
1496 }
1497 
SortNextArray(const Node * node)1498 bool IcingDynamicTrie::SortNextArray(const Node *node) {
1499   if (node == nullptr) {
1500     // Nothing to sort, return success directly.
1501     return true;
1502   }
1503 
1504   uint32_t next_array_buffer_size = 1u << node->log2_num_children();
1505   Next *next_array_start = this->storage_->GetMutableNextArray(
1506       node->next_index(), next_array_buffer_size);
1507 
1508   if (next_array_start == nullptr) {
1509     return false;
1510   }
1511 
1512   std::sort(next_array_start, next_array_start + next_array_buffer_size);
1513   return true;
1514 }
1515 
Insert(const char * key,const void * value,uint32_t * value_index,bool replace,bool * pnew_key)1516 libtextclassifier3::Status IcingDynamicTrie::Insert(const char *key,
1517                                                     const void *value,
1518                                                     uint32_t *value_index,
1519                                                     bool replace,
1520                                                     bool *pnew_key) {
1521   if (!is_initialized()) {
1522     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1523   }
1524 
1525   if (pnew_key) *pnew_key = false;
1526 
1527   // Find out ahead of time whether things will fit. A conservative
1528   // check based on allocations made below.
1529   //
1530   // IMPORTANT: This needs to be updated if the alloc patterns below
1531   // change.
1532   size_t key_len = strlen(key);
1533   if (!(storage_->nodes_left() >= 2 + key_len + 1 &&
1534         storage_->nexts_left() >= 2 + key_len + 1 + kMaxNextArraySize &&
1535         storage_->suffixes_left() >= key_len + 1 + value_size())) {
1536     return absl_ports::ResourceExhaustedError("No more space left");
1537   }
1538 
1539   uint32_t best_node_index;
1540   int key_offset;
1541   FindBestNode(key, &best_node_index, &key_offset, false);
1542 
1543   // A negative key_offset indicates that storage_ is empty
1544   if (key_offset < 0) {
1545     // First key.
1546     if (!storage_->empty()) {
1547       ICING_LOG(FATAL) << "Key offset is negative but storage is not empty, "
1548                           "there're inconsistencies in dynamic trie.";
1549     }
1550     Node *node = storage_->AllocNode();
1551     node->set_next_index(storage_->MakeSuffix(key, value, value_index));
1552     node->set_is_leaf(true);
1553     node->set_log2_num_children(0);
1554   } else if (storage_->GetNode(best_node_index)->is_leaf()) {
1555     // Prefix in the trie. Split at leaf.
1556     Node *split_node = storage_->GetMutableNode(best_node_index);
1557     const char *prev_suffix = storage_->GetSuffix(split_node->next_index());
1558 
1559     // Find the common prefix length.
1560     const char *prev_suffix_cur = prev_suffix;
1561     const char *key_cur = key + key_offset;
1562     while (*prev_suffix_cur && *prev_suffix_cur == *key_cur) {
1563       prev_suffix_cur++;
1564       key_cur++;
1565     }
1566 
1567     // Equal strings?
1568     if (*prev_suffix_cur == 0 && *key_cur == 0) {
1569       // Update value if replace == true and return.
1570       if (value_index) {
1571         *value_index = storage_->GetSuffixIndex(prev_suffix_cur + 1);
1572       }
1573       if (replace) {
1574         char *mutable_prev_suffix_cur = storage_->GetMutableSuffix(
1575             storage_->GetSuffixIndex(prev_suffix_cur + 1), value_size());
1576         memcpy(mutable_prev_suffix_cur, value, value_size());
1577       }
1578       return libtextclassifier3::Status::OK;
1579     }
1580 
1581     if (*prev_suffix_cur == *key_cur) {
1582       ICING_LOG(FATAL) << "The suffix cursor and key cursor should diverge "
1583                           "after finding the common prefix.";
1584     }
1585 
1586     // Create single-branch children for the common prefix
1587     // length. After the loop, split_node points to the node that
1588     // will have more than 1 char.
1589     int common_len = prev_suffix_cur - prev_suffix;
1590     for (int i = 0; i < common_len; i++) {
1591       // Create a single-branch child node.
1592       ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(1));
1593       split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
1594       split_node->set_is_leaf(false);
1595       split_node->set_log2_num_children(0);
1596       Node *child_node = storage_->AllocNode();
1597       split_next[0].set_val(*(prev_suffix + i));
1598       split_next[0].set_node_index(storage_->GetNodeIndex(child_node));
1599 
1600       split_node = child_node;
1601     }
1602 
1603     // Fill a split.
1604     ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(2));
1605     split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
1606     split_node->set_is_leaf(false);
1607     split_node->set_log2_num_children(1);
1608     Node *prev_suffix_node = storage_->AllocNode();
1609     Node *key_node = storage_->AllocNode();
1610     split_next[0].set_val(*(prev_suffix + common_len));
1611     split_next[0].set_node_index(storage_->GetNodeIndex(prev_suffix_node));
1612     if (*(prev_suffix + common_len)) {
1613       uint32_t next_index =
1614           storage_->GetSuffixIndex(prev_suffix + common_len) + 1;
1615       prev_suffix_node->set_next_index(next_index);
1616     } else {
1617       uint32_t next_index = storage_->GetSuffixIndex(prev_suffix + common_len);
1618       prev_suffix_node->set_next_index(next_index);
1619     }
1620     prev_suffix_node->set_is_leaf(true);
1621     prev_suffix_node->set_log2_num_children(0);
1622     split_next[1].set_val(*(key + key_offset + common_len));
1623     split_next[1].set_node_index(storage_->GetNodeIndex(key_node));
1624     if (*(key + key_offset + common_len)) {
1625       uint32_t next_index = storage_->MakeSuffix(
1626           key + key_offset + common_len + 1, value, value_index);
1627       key_node->set_next_index(next_index);
1628     } else {
1629       uint32_t next_index = storage_->MakeSuffix(key + key_offset + common_len,
1630                                                  value, value_index);
1631       key_node->set_next_index(next_index);
1632     }
1633     key_node->set_is_leaf(true);
1634     key_node->set_log2_num_children(0);
1635 
1636     std::sort(split_next, split_next + 2);
1637   } else {
1638     // Insert into intermediate node.
1639     const Node *best_node = storage_->GetNode(best_node_index);
1640 
1641     // Add our value as a node + suffix.
1642     Node *new_leaf_node = storage_->AllocNode();
1643     if (*(key + key_offset)) {
1644       uint32_t next_index =
1645           storage_->MakeSuffix(key + key_offset + 1, value, value_index);
1646       new_leaf_node->set_next_index(next_index);
1647     } else {
1648       uint32_t next_index =
1649           storage_->MakeSuffix(key + key_offset, value, value_index);
1650       new_leaf_node->set_next_index(next_index);
1651     }
1652     new_leaf_node->set_is_leaf(true);
1653     new_leaf_node->set_log2_num_children(0);
1654 
1655     // Figure out the real length of the existing next array.
1656     uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
1657     Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
1658                                                    next_array_buffer_size);
1659     int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
1660     Next *new_next = cur_next;
1661     if (next_len == (next_array_buffer_size)) {
1662       // Allocate a new, larger, array.
1663       ICING_ASSIGN_OR_RETURN(new_next, storage_->AllocNextArray(next_len + 1));
1664       memcpy(new_next, cur_next, sizeof(Next) * next_len);
1665     }
1666 
1667     // Write a link to our new leaf node and sort.
1668     new_next[next_len].set_val(*(key + key_offset));
1669     new_next[next_len].set_node_index(storage_->GetNodeIndex(new_leaf_node));
1670     inplace_merge(new_next, new_next + next_len, new_next + next_len + 1);
1671     next_len++;
1672 
1673     // If this was new, update the parent node and free the old next
1674     // array.
1675     if (new_next != cur_next) {
1676       Node *mutable_best_node =
1677           storage_->GetMutableNode(storage_->GetNodeIndex(best_node));
1678       mutable_best_node->set_next_index(storage_->GetNextArrayIndex(new_next));
1679       mutable_best_node->set_is_leaf(false);
1680       uint8_t log2_num_children = mutable_best_node->log2_num_children();
1681 
1682       // 8 == log2(256)
1683       if (log2_num_children >= 8) {
1684         return absl_ports::InternalError(
1685             "Number of children exceeds the max allowed size");
1686       }
1687 
1688       mutable_best_node->set_log2_num_children(log2_num_children + 1);
1689 
1690       storage_->FreeNextArray(cur_next,
1691                               mutable_best_node->log2_num_children() - 1);
1692     }
1693   }
1694 
1695   // We added a new key.
1696   storage_->inc_num_keys();
1697 
1698   if (pnew_key) *pnew_key = true;
1699   return libtextclassifier3::Status::OK;
1700 }
1701 
GetValueAtIndex(uint32_t value_index) const1702 const void *IcingDynamicTrie::GetValueAtIndex(uint32_t value_index) const {
1703   if (!is_initialized()) {
1704     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1705   }
1706 
1707   return static_cast<const void *>(storage_->GetSuffix(value_index));
1708 }
1709 
SetValueAtIndex(uint32_t value_index,const void * value)1710 void IcingDynamicTrie::SetValueAtIndex(uint32_t value_index,
1711                                        const void *value) {
1712   if (!is_initialized()) {
1713     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1714   }
1715 
1716   if (value_index > storage_->hdr().max_suffixes_size() - value_size()) {
1717     ICING_LOG(FATAL) << "Value index is out of range";
1718   }
1719 
1720   memcpy(storage_->GetMutableSuffix(value_index, value_size()), value,
1721          value_size());
1722 }
1723 
Find(const char * key,void * value,uint32_t * value_index) const1724 bool IcingDynamicTrie::Find(const char *key, void *value,
1725                             uint32_t *value_index) const {
1726   if (!is_initialized()) {
1727     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1728   }
1729 
1730   uint32_t best_node_index;
1731   int key_offset;
1732   FindBestNode(key, &best_node_index, &key_offset, false);
1733 
1734   const Node *best_node = storage_->GetNode(best_node_index);
1735   if (key_offset >= 0 && best_node->is_leaf() &&
1736       !strcmp(key + key_offset, storage_->GetSuffix(best_node->next_index()))) {
1737     uint32_t vidx = best_node->next_index() +
1738                     strlen(storage_->GetSuffix(best_node->next_index())) + 1;
1739     if (value_index) *value_index = vidx;
1740     if (value) memcpy(value, storage_->GetSuffix(vidx), value_size());
1741     return true;
1742   } else {
1743     return false;
1744   }
1745 }
1746 
Iterator(const IcingDynamicTrie & trie,const char * prefix,bool reverse)1747 IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
1748                                      const char *prefix, bool reverse)
1749     : cur_key_(prefix),
1750       cur_suffix_(nullptr),
1751       cur_suffix_len_(0),
1752       single_leaf_match_(false),
1753       reverse_(reverse),
1754       trie_(trie) {
1755   if (!trie.is_initialized()) {
1756     ICING_LOG(FATAL) << "DynamicTrie not initialized";
1757   }
1758 
1759   Reset();
1760 }
1761 
BranchToLeaf(uint32_t node_index,BranchType branch_type)1762 void IcingDynamicTrie::Iterator::BranchToLeaf(uint32_t node_index,
1763                                               BranchType branch_type) {
1764   // Go down the trie, following the left-most child until we hit a
1765   // leaf. Push to stack and cur_key nodes and chars as we go.
1766   // When reverse_ is true, the method will follow the right-most child.
1767   const Node *node = trie_.storage_->GetNode(node_index);
1768   while (!node->is_leaf()) {
1769     const Next *next_start = trie_.storage_->GetNext(node->next_index(), 0);
1770     int child_idx;
1771     if (branch_type == BranchType::kRightMost) {
1772       uint32_t next_array_size = 1u << node->log2_num_children();
1773       child_idx = trie_.GetValidNextsSize(next_start, next_array_size) - 1;
1774     } else {
1775       // node isn't a leaf. So it must have >0 children.
1776       // 0 is the left-most child.
1777       child_idx = 0;
1778     }
1779     const Next &child_next = next_start[child_idx];
1780     branch_stack_.push_back(Branch(node_index, child_idx));
1781     cur_key_.push_back(child_next.val());
1782 
1783     node_index = child_next.node_index();
1784     node = trie_.storage_->GetNode(node_index);
1785   }
1786 
1787   // We're at a leaf.
1788   cur_suffix_ = trie_.storage_->GetSuffix(
1789       trie_.storage_->GetNode(node_index)->next_index());
1790   cur_suffix_len_ = strlen(cur_suffix_);
1791   cur_key_.append(cur_suffix_, cur_suffix_len_);
1792 }
1793 
Reset()1794 void IcingDynamicTrie::Iterator::Reset() {
1795   size_t strip_len = branch_stack_.size() + cur_suffix_len_;
1796 
1797   if (cur_key_.size() < strip_len) {
1798     ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
1799                         "size, there're inconsistencies in dynamic trie";
1800   }
1801 
1802   // Trim back cur_key_ to original prefix.
1803   cur_key_.resize(cur_key_.size() - strip_len);
1804   cur_suffix_ = nullptr;
1805   cur_suffix_len_ = 0;
1806   single_leaf_match_ = false;
1807   branch_stack_.clear();
1808 
1809   // Nothing to do with an empty trie.
1810   if (trie_.storage_->empty()) return;
1811 
1812   // Find node matching prefix.
1813   uint32_t node_index;
1814   int key_offset;
1815   trie_.FindBestNode(cur_key_.c_str(), &node_index, &key_offset, true);
1816 
1817   // Two cases/states:
1818   //
1819   // - Found an intermediate node. If we matched all of prefix
1820   //   (cur_key_), BranchToLeaf.
1821   //
1822   // - Found a leaf node, which is the ONLY matching key for this
1823   //   prefix. Check that suffix matches the prefix. Then we set
1824   //   single_leaf_match_ = true and apply different logic for
1825   //   Advance.
1826   if (key_offset < 0) {
1827     // A negative key_offset indicates that trie_.storage_ is empty
1828     ICING_LOG(FATAL) << "Trie storage is empty";
1829   }
1830 
1831   const Node *best_node = trie_.storage_->GetNode(node_index);
1832   if (best_node->is_leaf() &&
1833       !strncmp(cur_key_.c_str() + key_offset,
1834                trie_.storage_->GetSuffix(best_node->next_index()),
1835                cur_key_.size() - key_offset)) {
1836     // Copy the entire suffix into the current key.
1837     cur_key_.resize(key_offset);
1838     cur_key_.append(trie_.storage_->GetSuffix(best_node->next_index()));
1839     cur_suffix_ = trie_.storage_->GetSuffix(best_node->next_index());
1840     cur_suffix_len_ = strlen(cur_suffix_);
1841     single_leaf_match_ = true;
1842   } else if (static_cast<size_t>(key_offset) == cur_key_.size()) {
1843     BranchType branch_type =
1844         (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
1845     BranchToLeaf(node_index, branch_type);
1846   }
1847 }
1848 
Advance()1849 bool IcingDynamicTrie::Iterator::Advance() {
1850   if (!IsValid()) return false;
1851   if (single_leaf_match_) {
1852     // If we only have an exact match, the Advance logic does not
1853     // apply. Invalidate the iterator and return.
1854     cur_suffix_ = nullptr;
1855     cur_suffix_len_ = 0;
1856     return false;
1857   }
1858 
1859   if (cur_key_.size() < (branch_stack_.size() + cur_suffix_len_)) {
1860     ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
1861                         "size, there're inconsistencies in dynamic trie";
1862   }
1863 
1864   // Move up from the current leaf.
1865   cur_key_.resize(cur_key_.size() - cur_suffix_len_);
1866   cur_suffix_ = nullptr;
1867   cur_suffix_len_ = 0;
1868 
1869   while (!branch_stack_.empty()) {
1870     Branch *branch = &branch_stack_.back();
1871     const Node *node = trie_.storage_->GetNode(branch->node_idx);
1872     if (reverse_) {
1873       branch->child_idx--;
1874     } else {
1875       branch->child_idx++;
1876     }
1877     if (branch->child_idx >= 0 &&
1878         branch->child_idx < (1 << node->log2_num_children())) {
1879       const Next *child_next =
1880           trie_.storage_->GetNext(node->next_index(), branch->child_idx);
1881       if (child_next->node_index() != kInvalidNodeIndex) {
1882         // Successfully incremented to the next child. Update the char
1883         // value at this depth.
1884         cur_key_[cur_key_.size() - 1] = child_next->val();
1885         // We successfully found a sub-trie to explore.
1886         BranchType branch_type =
1887             (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
1888         BranchToLeaf(child_next->node_index(), branch_type);
1889         return true;
1890       }
1891     }
1892     branch_stack_.pop_back();
1893     cur_key_.resize(cur_key_.size() - 1);
1894   }
1895 
1896   // Un-wound the entire stack. We are done.
1897   return false;
1898 }
1899 
IsValid() const1900 bool IcingDynamicTrie::Iterator::IsValid() const {
1901   return cur_suffix_ != nullptr;
1902 }
1903 
GetKey() const1904 const char *IcingDynamicTrie::Iterator::GetKey() const {
1905   // cur_key_ can have a NULL in it so cur_key_ can be wrong but
1906   // cur_key_.c_str() is always right.
1907   return IsValid() ? cur_key_.c_str() : nullptr;
1908 }
1909 
GetValue() const1910 const void *IcingDynamicTrie::Iterator::GetValue() const {
1911   if (!IsValid()) return nullptr;
1912 
1913   return static_cast<const void *>(cur_suffix_ + cur_suffix_len_ + 1);
1914 }
1915 
GetValueIndex() const1916 uint32_t IcingDynamicTrie::Iterator::GetValueIndex() const {
1917   if (!IsValid()) return kInvalidSuffixIndex;
1918 
1919   return trie_.storage_->GetSuffixIndex(cur_suffix_ + cur_suffix_len_ + 1);
1920 }
1921 
LeftBranchToUtf8End()1922 void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() {
1923   if (cur_len_ <= 0) {
1924     ICING_LOG(FATAL) << "Invalid UTF-8 character length";
1925   }
1926 
1927   if (branch_end_ - branch_stack_ != cur_len_) {
1928     ICING_LOG(FATAL) << "Depth from first visited node to last visited node "
1929                         "doesn't match the current UTF-8 character length";
1930   }
1931 
1932   // Use branch at top of stack to determine where to follow.
1933   const Branch &branch = *(branch_end_ - 1);
1934   const Node *node = trie_.storage_->GetNode(branch.child->node_index());
1935 
1936   // If we start with non-ascii, take all left branches while there is
1937   // a continuation byte.
1938   if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) {
1939     while (!node->is_leaf()) {
1940       if (cur_len_ >= U8_MAX_LENGTH) break;
1941 
1942       InitBranch(branch_end_, node, 0);
1943       // When we are looking to complete a utf8 char, skip 0s.
1944       if (branch_end_->child->val() == 0) {
1945         // Check if we already have a valid cur_.
1946         cur_[cur_len_] = 0;
1947         UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0);
1948         if (uchar32 == i18n_utils::kInvalidUChar32 &&
1949             node->log2_num_children() > 0) {
1950           branch_end_->child++;
1951         } else {
1952           // Good termination. Just break.
1953           break;
1954         }
1955       }
1956 
1957       if (!IcingStringUtil::IsContinuationByte(branch_end_->child->val()))
1958         break;
1959 
1960       cur_[cur_len_++] = branch_end_->child->val();
1961       node = trie_.storage_->GetNode(branch_end_->child->node_index());
1962       branch_end_++;
1963     }
1964 
1965     cur_logical_node_.node = node;
1966 
1967     // Maybe go into suffixes and set suffix_offset.
1968     if (node->is_leaf()) {
1969       GoIntoSuffix(node);
1970     } else {
1971       cur_logical_node_.suffix_offset = 0;
1972     }
1973   } else {  // ascii
1974     cur_logical_node_.node = node;
1975     cur_logical_node_.suffix_offset = 0;
1976   }
1977 
1978   // NULL-terminate.
1979   cur_[cur_len_] = 0;
1980 }
1981 
GoIntoSuffix(const Node * node)1982 void IcingDynamicTrie::Utf8Iterator::GoIntoSuffix(const Node *node) {
1983   const char *suffix = trie_.storage_->GetSuffix(node->next_index());
1984   const char *cur_suffix;
1985   for (cur_suffix = suffix; cur_len_ < U8_MAX_LENGTH &&
1986                             IcingStringUtil::IsContinuationByte(*cur_suffix);
1987        cur_suffix++) {
1988     cur_[cur_len_++] = *cur_suffix;
1989   }
1990   cur_logical_node_.suffix_offset = cur_suffix - suffix;
1991 }
1992 
Reset()1993 void IcingDynamicTrie::Utf8Iterator::Reset() {
1994   cur_[0] = 0;
1995   cur_len_ = 0;
1996   branch_end_ = branch_stack_;
1997 
1998   if (start_node_) {
1999     // Take the first char node's children.
2000     const Next *next = trie_.storage_->GetNext(start_node_->next_index(), 0);
2001     branch_end_->node = start_node_;
2002     branch_end_->child_end = next + (1 << start_node_->log2_num_children());
2003     if (next->val() == 0) {
2004       // Skip any nulls at this position. We don't return empty string
2005       // as an iteration.
2006       next++;
2007     }
2008     branch_end_->child = next;
2009     cur_[cur_len_++] = next->val();
2010     branch_end_++;
2011 
2012     // Will NULL-terminate cur_.
2013     LeftBranchToUtf8End();
2014   } else {
2015     // Nothing to return.
2016     cur_logical_node_.node = nullptr;
2017     cur_logical_node_.suffix_offset = 0;
2018   }
2019 }
2020 
Advance()2021 bool IcingDynamicTrie::Utf8Iterator::Advance() {
2022   if (!IsValid()) return false;
2023 
2024   // Clip to branch.
2025   cur_len_ = branch_end_ - branch_stack_;
2026 
2027   while (branch_end_ > branch_stack_) {
2028     Branch *branch = branch_end_ - 1;
2029     branch->child++;
2030     if (!branch->IsFinished()) {
2031       // Successfully incremented to the next child. Update the char
2032       // value at this depth.
2033       cur_[cur_len_ - 1] = branch->child->val();
2034 
2035       // We successfully found a sub-trie to explore.
2036       LeftBranchToUtf8End();
2037       return true;
2038     }
2039     cur_len_--;
2040     branch_end_--;
2041   }
2042 
2043   // Un-wound the entire stack. We are done.
2044   return false;
2045 }
2046 
InitBranch(Branch * branch,const Node * start,char key_char)2047 void IcingDynamicTrie::Utf8Iterator::InitBranch(Branch *branch,
2048                                                 const Node *start,
2049                                                 char key_char) {
2050   branch->node = start;
2051   branch->child = trie_.storage_->GetNext(start->next_index(), 0);
2052   branch->child_end = branch->child + (1 << start->log2_num_children());
2053   if (key_char) {
2054     branch->child =
2055         trie_.LowerBound(branch->child, branch->child_end, key_char);
2056   }
2057 }
2058 
IsFinished()2059 bool IcingDynamicTrie::Utf8Iterator::Branch::IsFinished() {
2060   return child >= child_end || child->node_index() == kInvalidNodeIndex;
2061 }
2062 
IsValid() const2063 bool IcingDynamicTrie::Utf8Iterator::IsValid() const { return cur_len_ > 0; }
2064 
GetNextByChar(const Node * node,uint8_t key_char) const2065 const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar(
2066     const Node *node, uint8_t key_char) const {
2067   const Next *next_start = storage_->GetNext(node->next_index(), 0);
2068   const Next *next_end = next_start + (1 << node->log2_num_children());
2069 
2070   const Next *found = LowerBound(next_start, next_end, key_char);
2071   if (found >= next_end || found->val() != key_char ||
2072       found->node_index() == kInvalidNodeIndex) {
2073     return nullptr;
2074   }
2075 
2076   return found;
2077 }
2078 
GetValidNextsSize(const IcingDynamicTrie::Next * next_array_start,int next_array_length) const2079 int IcingDynamicTrie::GetValidNextsSize(
2080     const IcingDynamicTrie::Next *next_array_start,
2081     int next_array_length) const {
2082   // Only searching for key char 0xff is not sufficient, as 0xff can be a valid
2083   // character. We must also specify kInvalidNodeIndex as the target node index
2084   // when searching the next array.
2085   return LowerBound(next_array_start, next_array_start + next_array_length,
2086                     /*key_char=*/0xff, /*node_index=*/kInvalidNodeIndex) -
2087          next_array_start;
2088 }
2089 
LowerBound(const Next * start,const Next * end,uint8_t key_char,uint32_t node_index) const2090 const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
2091     const Next *start, const Next *end, uint8_t key_char,
2092     uint32_t node_index) const {
2093   // Above this value will use binary search instead of linear
2094   // search. 16 was chosen from running some benchmarks with
2095   // different values.
2096   static const uint32_t kBinarySearchCutoff = 16;
2097 
2098   Next key_next(key_char, node_index);
2099   if (end - start >= kBinarySearchCutoff) {
2100     // Binary search.
2101     return lower_bound(start, end, key_next);
2102   } else {
2103     // Linear search.
2104     const Next *found;
2105     for (found = start; found < end; found++) {
2106       if (!(*found < key_next)) {
2107         // Should have gotten match.
2108         break;
2109       }
2110     }
2111     return found;
2112   }
2113 }
2114 
FindBestNode(const char * key,uint32_t * best_node_index,int * key_offset,bool prefix,bool utf8) const2115 void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
2116                                     int *key_offset, bool prefix,
2117                                     bool utf8) const {
2118   // Find the best node such that:
2119   //
2120   // - If key is NOT in the trie, key[0..key_offset) is a prefix to
2121   //   everything under best_node_index.
2122   //
2123   // - If key is in the trie, best_node_index is the leaf that points
2124   //   to the key suffix and key_offset == strlen(key).
2125   //
2126   // If prefix is true, when key is both in the trie AND a prefix
2127   // (e.g. "ab" and "abc" are in the trie), we return the intermediate
2128   // node with key as the prefix as opposed to the exactly matching
2129   // leaf node.
2130   if (storage_->empty()) {
2131     *best_node_index = 0;
2132     *key_offset = -1;
2133     return;
2134   }
2135 
2136   const Node *cur_node = storage_->GetRootNode();
2137   const char *cur_key = key;
2138   const Node *utf8_node = cur_node;
2139   const char *utf8_key = cur_key;
2140   while (!cur_node->is_leaf()) {
2141     const Next *found = GetNextByChar(cur_node, *cur_key);
2142     if (!found) break;
2143 
2144     if (prefix && found->val() == 0) {
2145       break;
2146     }
2147 
2148     cur_node = storage_->GetNode(found->node_index());
2149 
2150     // End of key.
2151     if (*cur_key == 0) {
2152       break;
2153     }
2154     cur_key++;
2155 
2156     if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) {
2157       utf8_node = cur_node;
2158       utf8_key = cur_key;
2159     }
2160   }
2161 
2162   if (utf8) {
2163     // Rewind.
2164     cur_node = utf8_node;
2165     cur_key = utf8_key;
2166   }
2167 
2168   *best_node_index = storage_->GetNodeIndex(cur_node);
2169   *key_offset = reinterpret_cast<const char *>(cur_key) - key;
2170 }
2171 
FindNewBranchingPrefixLength(const char * key,bool utf8) const2172 int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key,
2173                                                    bool utf8) const {
2174   if (storage_->empty()) {
2175     return kNoBranchFound;
2176   }
2177 
2178   uint32_t best_node_index;
2179   int key_offset;
2180   FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
2181   const Node *cur_node = storage_->GetNode(best_node_index);
2182   const char *cur_key = key + key_offset;
2183   if (cur_node->is_leaf()) {
2184     // Prefix in the trie. Split at leaf.
2185     const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
2186     while (*prev_suffix != '\0' && *prev_suffix == *cur_key) {
2187       prev_suffix++;
2188       cur_key++;
2189     }
2190 
2191     // Equal strings? No branching.
2192     if (*prev_suffix == '\0' && *cur_key == '\0') {
2193       return kNoBranchFound;
2194     }
2195 
2196     if (utf8) {
2197       // Rewind to utf8 boundary.
2198       size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key);
2199       cur_key = key + offset;
2200     }
2201 
2202     return cur_key - key;
2203   } else if (cur_node->log2_num_children() == 0) {
2204     // Intermediate node going from no branching to branching.
2205     return cur_key - key;
2206   }
2207 
2208   // If we've reached this point, then we're already at a branch point. So there
2209   // is no *new* branch point.
2210   return kNoBranchFound;
2211 }
2212 
FindBranchingPrefixLengths(const char * key,bool utf8) const2213 std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key,
2214                                                               bool utf8) const {
2215   std::vector<int> prefix_lengths;
2216 
2217   if (storage_->empty()) {
2218     return prefix_lengths;
2219   }
2220 
2221   const Node *cur_node = storage_->GetRootNode();
2222   const char *cur_key = key;
2223   while (*cur_key && !cur_node->is_leaf()) {
2224     // Branching prefix?
2225     if (cur_node->log2_num_children() > 0) {
2226       int len = cur_key - key;
2227       if (utf8) {
2228         // Do not cut mid-utf8. Walk up to utf8 boundary.
2229         len = i18n_utils::SafeTruncateUtf8Length(key, len);
2230         if (prefix_lengths.empty() || len != prefix_lengths.back()) {
2231           prefix_lengths.push_back(len);
2232         }
2233       } else {
2234         prefix_lengths.push_back(len);
2235       }
2236     }
2237 
2238     // Move to next.
2239     const Next *found = GetNextByChar(cur_node, *cur_key);
2240     if (found == nullptr) {
2241       break;
2242     }
2243     cur_node = storage_->GetNode(found->node_index());
2244 
2245     ++cur_key;
2246   }
2247   return prefix_lengths;
2248 }
2249 
IsBranchingTerm(const char * key) const2250 bool IcingDynamicTrie::IsBranchingTerm(const char *key) const {
2251   if (!is_initialized()) {
2252     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2253   }
2254 
2255   if (storage_->empty()) {
2256     return false;
2257   }
2258 
2259   uint32_t best_node_index;
2260   int key_offset;
2261   FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true);
2262   const Node *cur_node = storage_->GetNode(best_node_index);
2263 
2264   if (cur_node->is_leaf()) {
2265     return false;
2266   }
2267 
2268   // There is no intermediate node for key in the trie.
2269   if (key[key_offset] != '\0') {
2270     return false;
2271   }
2272 
2273   // Found key as an intermediate node, but key is not a valid term stored in
2274   // the trie. In this case, we need at least two children for key to be a
2275   // branching term.
2276   if (GetNextByChar(cur_node, '\0') == nullptr) {
2277     return cur_node->log2_num_children() >= 1;
2278   }
2279 
2280   // The intermediate node for key must have more than two children for key to
2281   // be a branching term, one of which represents the leaf node for key itself.
2282   return cur_node->log2_num_children() > 1;
2283 }
2284 
GetDebugInfo(int verbosity,std::string * out) const2285 void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
2286   Stats stats;
2287   CollectStats(&stats);
2288   out->append(stats.DumpStats(verbosity));
2289 
2290   // Property files.
2291   vector<std::string> files;
2292   if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
2293                                      &files)) {
2294     ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
2295     return;
2296   }
2297   for (size_t i = 0; i < files.size(); i++) {
2298     IcingStringUtil::SStringAppendF(
2299         out, 1000, "Prop file %s size %" PRIu64 "\n",
2300         filesystem_->GetBasename(files[i].c_str()).c_str(),
2301         filesystem_->GetFileSize(files[i].c_str()));
2302   }
2303   IcingStringUtil::SStringAppendF(
2304       out, 1000, "Deleted file %s size %" PRIu64 "\n",
2305       filesystem_->GetBasename(deleted_bitmap_filename_.c_str()).c_str(),
2306       filesystem_->GetFileSize(deleted_bitmap_filename_.c_str()));
2307 }
2308 
min_free_fraction() const2309 double IcingDynamicTrie::min_free_fraction() const {
2310   if (!is_initialized()) {
2311     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2312   }
2313 
2314   return 1.0 - max(max(static_cast<double>(storage_->hdr().num_nodes()) /
2315                            storage_->hdr().max_nodes(),
2316                        static_cast<double>(storage_->hdr().num_nexts()) /
2317                            storage_->hdr().max_nexts()),
2318                    static_cast<double>(storage_->hdr().suffixes_size()) /
2319                        storage_->hdr().max_suffixes_size());
2320 }
2321 
value_size() const2322 uint32_t IcingDynamicTrie::value_size() const {
2323   return storage_->hdr().value_size();
2324 }
2325 
max_value_index() const2326 uint32_t IcingDynamicTrie::max_value_index() const {
2327   return storage_->hdr().max_suffixes_size();
2328 }
2329 
UpdateCrc()2330 uint32_t IcingDynamicTrie::UpdateCrc() {
2331   if (!is_initialized()) {
2332     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2333   }
2334 
2335   if (runtime_options_.storage_policy != RuntimeOptions::kMapSharedWithCrc) {
2336     return kNoCrc;
2337   }
2338 
2339   // Combine storage crc with property bitmap crcs.
2340   uint32_t crc = storage_->UpdateCrc();
2341 
2342   // Update crcs on bitmaps.
2343   for (size_t i = 0; i < property_bitmaps_.size(); ++i) {
2344     if (property_bitmaps_[i]) {
2345       // Combine property id with the bitmap crc.
2346       uint64_t this_crc = property_bitmaps_[i]->UpdateCrc();
2347       this_crc = (this_crc << 32) | i;
2348       crc = IcingStringUtil::UpdateCrc32(
2349           crc, reinterpret_cast<const char *>(&this_crc), sizeof(this_crc));
2350     }
2351   }
2352   uint32_t this_crc = deleted_bitmap_->UpdateCrc();
2353   crc = IcingStringUtil::UpdateCrc32(
2354       crc, reinterpret_cast<const char *>(&this_crc), sizeof(this_crc));
2355 
2356   return crc;
2357 }
2358 
OpenOrCreatePropertyBitmap(uint32_t property_id)2359 IcingFlashBitmap *IcingDynamicTrie::OpenOrCreatePropertyBitmap(
2360     uint32_t property_id) {
2361   if (!is_initialized()) {
2362     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2363   }
2364 
2365   if (property_id > kMaxPropertyId) {
2366     ICING_LOG(ERROR) << "Property id " << property_id << " out of range";
2367     return nullptr;
2368   }
2369 
2370   if (property_id >= property_bitmaps_.size()) {
2371     property_bitmaps_.resize(property_id + 1);
2372   }
2373   if (!property_bitmaps_[property_id]) {
2374     std::string filename;
2375     IcingStringUtil::SStringAppendF(
2376         &filename, property_bitmaps_prefix_.size() + 10, "%s%u",
2377         property_bitmaps_prefix_.c_str(), property_id);
2378     property_bitmaps_[property_id] =
2379         OpenAndInitBitmap(filename, false, filesystem_);
2380   }
2381   return property_bitmaps_[property_id].get();
2382 }
2383 
SetProperty(uint32_t value_index,uint32_t property_id)2384 bool IcingDynamicTrie::SetProperty(uint32_t value_index, uint32_t property_id) {
2385   IcingFlashBitmap *bitmap = OpenOrCreatePropertyBitmap(property_id);
2386   if (!bitmap) {
2387     return false;
2388   }
2389   uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2390 
2391   // Also clear deleted bit.
2392   return bitmap->SetBit(idx, true) && deleted_bitmap_->SetBit(idx, false);
2393 }
2394 
ClearProperty(uint32_t value_index,uint32_t property_id)2395 bool IcingDynamicTrie::ClearProperty(uint32_t value_index,
2396                                      uint32_t property_id) {
2397   if (property_id >= property_bitmaps_.size() ||
2398       !property_bitmaps_[property_id]) {
2399     // No bitmap is ok for clearing.
2400     return true;
2401   }
2402 
2403   uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2404   return property_bitmaps_[property_id]->SetBit(idx, false);
2405 }
2406 
SetDeleted(uint32_t value_index)2407 bool IcingDynamicTrie::SetDeleted(uint32_t value_index) {
2408   uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2409   return deleted_bitmap_->SetBit(idx, true);
2410 }
2411 
ClearDeleted(uint32_t value_index)2412 bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) {
2413   uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2414   return deleted_bitmap_->SetBit(idx, false);
2415 }
2416 
2417 // Steps:
2418 // 1. Find the key in the trie.
2419 // 2. Remove the suffix and the value.
2420 // 3. Reset the nexts that point to the nodes to be removed.
2421 // 4. Sort any next array if needed.
Delete(const std::string_view key)2422 bool IcingDynamicTrie::Delete(const std::string_view key) {
2423   if (!is_initialized()) {
2424     ICING_LOG(ERROR) << "DynamicTrie not initialized";
2425     return false;
2426   }
2427 
2428   if (storage_->empty()) {
2429     // Nothing to delete.
2430     return true;
2431   }
2432 
2433   // Tries to find the key in the trie, starting from the root.
2434   const Node *current_node = storage_->GetRootNode();
2435 
2436   // The node after which we start to remove data.
2437   const Node *last_multichild_node = nullptr;
2438 
2439   // While visiting the trie nodes, we store the indices of Nexts that point
2440   // to all the nodes after last_multichild_node. Those nodes must be
2441   // consecutive and all have only one child. Resetting those Nexts means that
2442   // we remove the data of the key.
2443   std::vector<uint32_t> nexts_to_reset;
2444   nexts_to_reset.reserve(key.length());
2445 
2446   // Iterates through chars in the key, finds nodes in the trie until a leaf
2447   // node is reached. The max number of loops is key.length() + 1 because we
2448   // start from the root.
2449   for (size_t i = 0; i <= key.length(); ++i) {
2450     if (current_node->is_leaf()) {
2451       // Leaf node, now check the suffix.
2452       if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
2453         // Key does not exist in the trie, nothing to delete.
2454         return true;
2455       }
2456       // Otherwise, key is found.
2457       break;
2458     }
2459 
2460     // Finds the next char.
2461     const Next *next;
2462     if (i == key.length()) {
2463       // When we're at the end of the key, the next char is the termination char
2464       // '\0'.
2465       next = GetNextByChar(current_node, '\0');
2466     } else {
2467       next = GetNextByChar(current_node, key[i]);
2468     }
2469 
2470     if (next == nullptr) {
2471       // Key does not exist in the trie, nothing to delete.
2472       return true;
2473     }
2474 
2475     // Checks the real size of next array.
2476     uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
2477     Next *next_array_start = storage_->GetMutableNextArray(
2478         current_node->next_index(), next_array_buffer_size);
2479     int valid_next_array_size =
2480         GetValidNextsSize(next_array_start, next_array_buffer_size);
2481     if (valid_next_array_size == 0) {
2482       // Key does not exist in the trie, nothing to delete.
2483       // This shouldn't happen, but we put a sanity check here in case something
2484       // is wrong.
2485       return true;
2486     } else if (valid_next_array_size == 1) {
2487       // Single-child branch will be deleted.
2488       nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
2489     } else {
2490       // We see a new node with multiple children, all the previously seen nodes
2491       // shouldn't be removed.
2492       last_multichild_node = current_node;
2493       nexts_to_reset.clear();
2494       nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
2495     }
2496 
2497     // Updates current_node.
2498     current_node = storage_->GetNode(next->node_index());
2499   }
2500   // Now we've found the key in the trie.
2501 
2502   ClearSuffixAndValue(current_node->next_index());
2503 
2504   // Resets nexts to remove key information.
2505   for (uint32_t next_index : nexts_to_reset) {
2506     ResetNext(next_index);
2507   }
2508 
2509   if (last_multichild_node != nullptr) {
2510     SortNextArray(last_multichild_node);
2511     uint32_t next_array_buffer_size =
2512         1u << last_multichild_node->log2_num_children();
2513     Next *next_array_start = this->storage_->GetMutableNextArray(
2514         last_multichild_node->next_index(), next_array_buffer_size);
2515     uint32_t num_children =
2516         GetValidNextsSize(next_array_start, next_array_buffer_size);
2517     // Shrink the next array if we can.
2518     if (num_children == next_array_buffer_size / 2) {
2519       Node *mutable_node = storage_->GetMutableNode(
2520           storage_->GetNodeIndex(last_multichild_node));
2521       mutable_node->set_log2_num_children(mutable_node->log2_num_children() -
2522                                           1);
2523       // Add the unused second half of the next array to the free list.
2524       storage_->FreeNextArray(next_array_start + next_array_buffer_size / 2,
2525                               mutable_node->log2_num_children());
2526     }
2527   }
2528 
2529   return true;
2530 }
2531 
ClearPropertyForAllValues(uint32_t property_id)2532 bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
2533   if (!is_initialized()) {
2534     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2535   }
2536 
2537   PropertyReadersAll readers(*this);
2538   if (!readers.Exists(property_id)) {
2539     ICING_VLOG(1) << "Properties for id " << property_id << " don't exist";
2540     return true;
2541   }
2542 
2543   // Mark values that have no other properties set as as deleted.
2544   uint64_t max_idx =
2545       ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size());
2546   // TODO(vishwajith) Inefficient to do this bit by bit, should be word by
2547   // word. Removing a corpus is likely rare enough that this is low priority.
2548   for (uint64_t i = 0; i < max_idx; ++i) {
2549     // See if the bit is set in our property map.
2550     if (readers.IsPropertyUnique(property_id, i)) {
2551       deleted_bitmap_->SetBit(i, true);
2552     }
2553   }
2554 
2555   // Now delete the bitmap file for this property.
2556   std::unique_ptr<IcingFlashBitmap> bitmap(
2557       std::move(property_bitmaps_[property_id]));
2558   // bitmap cannot be null here, because then readers.Exists(property_id) would
2559   // have returned false earlier, and we wouldn't get here.
2560   if (bitmap == nullptr) {
2561     ICING_LOG(ERROR) << "Property bitmap is null";
2562     return false;
2563   }
2564 
2565   return bitmap->Delete();
2566 }
2567 
Exists() const2568 bool IcingDynamicTrie::PropertyReaderBase::Exists() const {
2569   return bitmap_ != nullptr;
2570 }
2571 
HasProperty(uint32_t value_index) const2572 bool IcingDynamicTrie::PropertyReaderBase::HasProperty(
2573     uint32_t value_index) const {
2574   return bitmap_ &&
2575          bitmap_->GetBit(trie_.ValueIndexToPropertyBitmapIndex(value_index));
2576 }
2577 
PropertyReaderBase(const IcingDynamicTrie & trie,bool deleted,uint32_t property_id)2578 IcingDynamicTrie::PropertyReaderBase::PropertyReaderBase(
2579     const IcingDynamicTrie &trie, bool deleted, uint32_t property_id)
2580     : trie_(trie) {
2581   if (!trie.is_initialized()) {
2582     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2583   }
2584 
2585   if (deleted) {
2586     bitmap_ = trie.deleted_bitmap_.get();
2587   } else if (property_id < trie.property_bitmaps_.size()) {
2588     bitmap_ = trie.property_bitmaps_[property_id].get();
2589   } else {
2590     bitmap_ = nullptr;
2591   }
2592 }
2593 
PropertyReadersAll(const IcingDynamicTrie & trie)2594 IcingDynamicTrie::PropertyReadersAll::PropertyReadersAll(
2595     const IcingDynamicTrie &trie)
2596     : trie_(trie) {
2597   if (!trie.is_initialized()) {
2598     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2599   }
2600 }
2601 
Exists(uint32_t property_id) const2602 bool IcingDynamicTrie::PropertyReadersAll::Exists(uint32_t property_id) const {
2603   return property_id < trie_.property_bitmaps_.size() &&
2604          trie_.property_bitmaps_[property_id];
2605 }
2606 
HasProperty(uint32_t property_id,uint32_t value_index) const2607 bool IcingDynamicTrie::PropertyReadersAll::HasProperty(
2608     uint32_t property_id, uint32_t value_index) const {
2609   return property_id < trie_.property_bitmaps_.size() &&
2610          trie_.property_bitmaps_[property_id] &&
2611          trie_.property_bitmaps_[property_id]->GetBit(
2612              trie_.ValueIndexToPropertyBitmapIndex(value_index));
2613 }
2614 
IsPropertyUnique(uint32_t property_id,uint32_t value_index) const2615 bool IcingDynamicTrie::PropertyReadersAll::IsPropertyUnique(
2616     uint32_t property_id, uint32_t value_index) const {
2617   uint32_t idx = trie_.ValueIndexToPropertyBitmapIndex(value_index);
2618 
2619   // First check that value is set for the requested id.
2620   if (property_id >= trie_.property_bitmaps_.size() ||
2621       !trie_.property_bitmaps_[property_id] ||
2622       !trie_.property_bitmaps_[property_id]->GetBit(idx)) {
2623     return false;
2624   }
2625 
2626   // Now check that the value is not set for the rest.
2627   for (size_t i = 0; i < trie_.property_bitmaps_.size(); ++i) {
2628     if (i == property_id) {
2629       continue;
2630     }
2631     if (trie_.property_bitmaps_[i] && trie_.property_bitmaps_[i]->GetBit(idx)) {
2632       return false;
2633     }
2634   }
2635   return true;
2636 }
2637 
size() const2638 size_t IcingDynamicTrie::PropertyReadersAll::size() const {
2639   return trie_.property_bitmaps_.size();
2640 }
2641 
ValueIndexToPropertyBitmapIndex(uint32_t value_index) const2642 uint64_t IcingDynamicTrie::ValueIndexToPropertyBitmapIndex(
2643     uint32_t value_index) const {
2644   // We know that value indices are separated by at least 1 +
2645   // value_size() bytes (for the null terminator and the value).
2646   return value_index / (value_size() + 1);
2647 }
2648 
2649 // Testing hooks.
GetHeader(IcingDynamicTrieHeader * hdr) const2650 void IcingDynamicTrie::GetHeader(IcingDynamicTrieHeader *hdr) const {
2651   if (!is_initialized()) {
2652     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2653   }
2654 
2655   *hdr = storage_->hdr();
2656 }
2657 
SetHeader(const IcingDynamicTrieHeader & new_hdr)2658 void IcingDynamicTrie::SetHeader(const IcingDynamicTrieHeader &new_hdr) {
2659   if (!is_initialized()) {
2660     ICING_LOG(FATAL) << "DynamicTrie not initialized";
2661   }
2662 
2663   storage_->hdr_.hdr = new_hdr;
2664   storage_->WriteHeader();
2665 }
2666 
2667 }  // namespace lib
2668 }  // namespace icing
2669