// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/main/main-index.h"

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <unordered_set>

#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/destructible-directory.h"
#include "icing/file/posting_list/flash-index-storage.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

// Finds the shortest, valid prefix term with prefix hits in lexicon for which
// "prefix" is a prefix.
// Returns a valid FindTermResult with found=true if either:
//   1. prefix exists as a term in lexicon.
//   2. the shortest, valid prefix in the lexicon exists and contains prefix
//      hits.
// Returns a FindTermResult with found=false and undefined values of tvi and
// exact if no term was found.
struct FindTermResult {
  // TVI of the term that was found. Undefined if found=false.
  uint32_t tvi;
  // Whether or not a valid term with prefix hits was found.
  bool found;
  // Whether or not that term is equal to 'prefix'.
  bool exact;
};
FindTermResult FindShortestValidTermWithPrefixHits(
    const IcingDynamicTrie* lexicon, const std::string& prefix) {
  // For prefix indexing: when we are doing a prefix match for "prefix", find
  // the tvi to the equivalent posting list. prefix's own posting list might not
  // exist but one of its children acts as a proxy.
  IcingDynamicTrie::PropertyReader hits_in_prefix_section(
      *lexicon, GetHasHitsInPrefixSectionPropertyId());
  uint32_t tvi = 0;
  bool found = false;
  bool exact = false;
  for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
       it.Advance()) {
    PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
    memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));

    // Posting list id might be invalid if this is also a backfill term.
    // Suppose that the main index has two pre-existing prefix hits "foot" and
    // "fool" - it will have a branch point posting list for "foo". Then, let's
    // suppose that the other index adds hits for "foul", "four" and "far". This
    // will result in branch points for "fo" and "f".
    // If "fo" was added before "f", then the iterator would first give us "fo".
    // "fo" will have an invalid posting_list_id because it hasn't been
    // backfilled yet, so we need to continue iterating to "foo".
    if (posting_list_id.is_valid()) {
      exact = (prefix.size() == strlen(it.GetKey()));
      tvi = it.GetValueIndex();
      // Found it. Does it have prefix hits?
      found = exact || hits_in_prefix_section.HasProperty(tvi);
      break;
    }
  }
  FindTermResult result = {tvi, found, exact};
  return result;
}

std::string MakeFlashIndexFilename(const std::string& base_dir) {
  return base_dir + "/main_index";
}

}  // namespace

MainIndex::MainIndex(const std::string& index_directory,
                     const Filesystem* filesystem,
                     const IcingFilesystem* icing_filesystem)
    : base_dir_(index_directory),
      filesystem_(filesystem),
      icing_filesystem_(icing_filesystem),
      posting_list_hit_serializer_(
          std::make_unique<PostingListHitSerializer>()) {}

libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
    const std::string& index_directory, const Filesystem* filesystem,
    const IcingFilesystem* icing_filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);
  ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
  std::unique_ptr<MainIndex> main_index(
      new MainIndex(index_directory, filesystem, icing_filesystem));
  ICING_RETURN_IF_ERROR(main_index->Init());
  return main_index;
}

/* static */ libtextclassifier3::StatusOr<int> MainIndex::ReadFlashIndexMagic(
    const Filesystem* filesystem, const std::string& index_directory) {
  return FlashIndexStorage::ReadHeaderMagic(
      filesystem, MakeFlashIndexFilename(index_directory));
}

// TODO(b/139087650) : Migrate off of IcingFilesystem.
libtextclassifier3::Status MainIndex::Init() {
  if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) {
    return absl_ports::InternalError("Unable to create main index directory.");
  }
  std::string flash_index_file = MakeFlashIndexFilename(base_dir_);
  ICING_ASSIGN_OR_RETURN(
      FlashIndexStorage flash_index,
      FlashIndexStorage::Create(flash_index_file, filesystem_,
                                posting_list_hit_serializer_.get()));
  flash_index_storage_ =
      std::make_unique<FlashIndexStorage>(std::move(flash_index));

  std::string lexicon_file = base_dir_ + "/main-lexicon";
  IcingDynamicTrie::RuntimeOptions runtime_options;
  main_lexicon_ = std::make_unique<IcingDynamicTrie>(
      lexicon_file, runtime_options, icing_filesystem_);
  IcingDynamicTrie::Options lexicon_options;
  if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
      !main_lexicon_->Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
  IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
  if (storage_info.main_index_storage_size() == -1 ||
      storage_info.main_index_lexicon_size() == -1) {
    return absl_ports::AbortedError(
        "Failed to get size of MainIndex's members.");
  }
  return storage_info.main_index_storage_size() +
         storage_info.main_index_lexicon_size();
}

IndexStorageInfoProto MainIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  storage_info.set_main_index_lexicon_size(
      IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
  storage_info.set_main_index_storage_size(
      Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
  storage_info.set_main_index_block_size(flash_index_storage_->block_size());
  storage_info.set_num_blocks(flash_index_storage_->num_blocks());
  storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
  return storage_info;
}

libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
MainIndex::GetAccessorForExactTerm(const std::string& term) {
  PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
  if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
        "Term %s is not present in main lexicon.", term.c_str()));
  }
  return PostingListHitAccessor::CreateFromExisting(
      flash_index_storage_.get(), posting_list_hit_serializer_.get(),
      posting_list_id);
}

libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
  bool exact = false;
  // For prefix indexing: when we are doing a prefix match for
  // "prefix", find the tvi to the equivalent posting list. prefix's
  // own posting list might not exist but its shortest child acts as a proxy.
  //
  // For example, if the only two hits in the index are prefix hits for
  // "bar" and "bat", then both will appear on a posting list for "ba". "b"
  // won't have a posting list, but "ba" will suffice.
  IcingDynamicTrie::PropertyReader hits_in_prefix_section(
      *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
  IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
  if (!main_itr.IsValid()) {
    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
        "Term: %s is not present in the main lexicon.", prefix.c_str()));
  }
  exact = (prefix.length() == strlen(main_itr.GetKey()));

  if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
    // Found it, but it doesn't have prefix hits. Exit early. No need to
    // retrieve the posting list because there's nothing there for us.
    return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
  }
  PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
  memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<PostingListHitAccessor> pl_accessor,
      PostingListHitAccessor::CreateFromExisting(
          flash_index_storage_.get(), posting_list_hit_serializer_.get(),
          posting_list_id));
  return GetPrefixAccessorResult(std::move(pl_accessor), exact);
}

// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
bool IsTermInNamespaces(
    const IcingDynamicTrie::PropertyReadersAll& property_reader,
    uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
  if (namespace_ids.empty()) {
    return true;
  }
  for (NamespaceId namespace_id : namespace_ids) {
    if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
                                    value_index)) {
      return true;
    }
  }

  return false;
}

libtextclassifier3::StatusOr<std::vector<TermMetadata>>
MainIndex::FindTermsByPrefix(
    const std::string& prefix, TermMatchType::Code scoring_match_type,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  // Finds all the terms that start with the given prefix in the lexicon.
  IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());

  std::vector<TermMetadata> term_metadata_list;
  while (term_iterator.IsValid()) {
    int score = 0;
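    // Hits for the same document are stored contiguously on a posting list,
    // so tracking the last seen document id lets us consult the result
    // checker only once per document.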
    DocumentId last_document_id = kInvalidDocumentId;
    bool is_last_document_in_desired = false;

    PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
    memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<PostingListHitAccessor> pl_accessor,
        PostingListHitAccessor::CreateFromExisting(
            flash_index_storage_.get(), posting_list_hit_serializer_.get(),
            posting_list_id));
    ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
                           pl_accessor->GetNextHitsBatch());
    while (!hits.empty()) {
      for (const Hit& hit : hits) {
        // Check whether this Hit is desired.
        DocumentId document_id = hit.document_id();
        bool is_new_document = document_id != last_document_id;
        if (is_new_document) {
          last_document_id = document_id;
          is_last_document_in_desired =
              suggestion_result_checker->BelongsToTargetResults(
                  document_id, hit.section_id());
        }
        if (!is_last_document_in_desired) {
          // The document is removed or expired or does not belong to the
          // target namespaces.
          continue;
        }
        if (scoring_match_type == TermMatchType::EXACT_ONLY &&
            hit.is_prefix_hit()) {
          continue;
        }

        // Score the hit by the strategy.
        if (score_by ==
            SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE) {
          // Give 1 to all matched terms and return them in arbitrary order.
          score = 1;
          break;
        } else if (score_by == SuggestionScoringSpecProto::
                                   SuggestionRankingStrategy::DOCUMENT_COUNT &&
                   is_new_document) {
          ++score;
        } else if (score_by == SuggestionScoringSpecProto::
                                   SuggestionRankingStrategy::TERM_FREQUENCY) {
          if (hit.has_term_frequency()) {
            score += hit.term_frequency();
          } else {
            ++score;
          }
        }
      }
      if (score_by ==
              SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE &&
          score == 1) {
        // The term is desired and doesn't need to be scored further.
        break;
      }
      ICING_ASSIGN_OR_RETURN(hits, pl_accessor->GetNextHitsBatch());
    }
    if (score > 0) {
      term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), score));
    }

    term_iterator.Advance();
  }
  return term_metadata_list;
}

libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
  // Maps new branching points in the main lexicon to the term such that
  // branching_point_term is a prefix of term and there are no terms smaller
  // than term and greater than branching_point_term.
  std::string prefix;
  LexiconMergeOutputs outputs;
  for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
       other_term_itr.IsValid(); other_term_itr.Advance()) {
    // If term were inserted in the main lexicon, what new branching would it
    // create? (It always creates at most one.)
    int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
        other_term_itr.GetKey(), /*utf8=*/true);
    if (prefix_len <= 0) {
      continue;
    }
    prefix.assign(other_term_itr.GetKey(), prefix_len);

    // Figure out the backfill tvi. It might not exist since all child terms
    // could contain hits from non-prefix sections only.
    //
    // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
    // we're adding "foul". The new branching prefix will be "fo". The backfill
    // prefix will be "foo" - all hits in the prefix section on "foo" will need
    // to be added to the new "fo" posting list later.
    FindTermResult result =
        FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
    if (!result.found || result.exact) {
      continue;
    }

    // This is a new prefix that will need backfilling from its next-in-line
    // posting list. This new prefix will have to have a posting list
    // eventually, so insert a default PostingListIdentifier as a placeholder.
    uint32_t branching_prefix_tvi;
    bool new_key;
    PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
    libtextclassifier3::Status status = main_lexicon_->Insert(
        prefix.c_str(), &posting_list_id, &branching_prefix_tvi,
        /*replace=*/false, &new_key);
    if (!status.ok()) {
      ICING_LOG(DBG) << "Could not insert branching prefix\n"
                     << status.error_message();
      return status;
    }

    // Backfills only contain prefix hits by default, so set these properties
    // here; they may be overridden when hits from the other index are added
    // later.
    if (!main_lexicon_->SetProperty(branching_prefix_tvi,
                                    GetHasNoExactHitsPropertyId()) ||
        !main_lexicon_->SetProperty(branching_prefix_tvi,
                                    GetHasHitsInPrefixSectionPropertyId())) {
      return absl_ports::InternalError("Setting prefix prop failed");
    }

    outputs.backfill_map[branching_prefix_tvi] = result.tvi;
  }
  return outputs;
}

libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
                    LexiconMergeOutputs&& outputs) {
  IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
  for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
       other_term_itr.IsValid(); other_term_itr.Advance()) {
    uint32_t new_main_tvi;
    PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
    libtextclassifier3::Status status = main_lexicon_->Insert(
        other_term_itr.GetKey(), &posting_list_id, &new_main_tvi,
        /*replace=*/false);
    if (!status.ok()) {
      ICING_LOG(DBG) << "Could not insert term: " << other_term_itr.GetKey()
                     << "\n"
                     << status.error_message();
      return status;
    }

    // Copy the properties from the other lexicon over to the main lexicon.
    uint32_t other_tvi = other_term_itr.GetValueIndex();
    if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
                        new_main_tvi)) {
      return absl_ports::InternalError(absl_ports::StrCat(
          "Could not insert term: ", other_term_itr.GetKey()));
    }

    // Add other to main mapping.
    outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);

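    // Read back the posting list id currently stored for this term in the
    // main lexicon; if it already points at a flash block, record that block
    // index for the caller.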
    memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(new_main_tvi),
           sizeof(posting_list_id));
    if (posting_list_id.block_index() != kInvalidBlockIndex) {
      outputs.main_tvi_to_block_index[new_main_tvi] =
          posting_list_id.block_index();
    }
  }
  return std::move(outputs);
}

libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
                           LexiconMergeOutputs&& outputs) {
  IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
      other_lexicon, GetHasHitsInPrefixSectionPropertyId());
  if (!has_prefix_prop_reader.Exists()) {
    return std::move(outputs);
  }
  std::string prefix;
  for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
       other_term_itr.IsValid(); other_term_itr.Advance()) {
    // Only expand terms that have hits in prefix sections.
    if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
      continue;
    }

    // Get prefixes where there is already a branching point in the main
    // lexicon. We skip prefixes which don't already have a branching point.
    std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
        other_term_itr.GetKey(), /*utf8=*/true);

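    // Remember where this term's prefixes start in prefix_tvis_buf so the
    // (start, length) range can be recorded in other_tvi_to_prefix_main_tvis
    // below.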
    int buf_start = outputs.prefix_tvis_buf.size();
    // Add prefixes.
    for (int prefix_length : prefix_lengths) {
      if (prefix_length <= 0) {
        continue;
      }

      prefix.assign(other_term_itr.GetKey(), prefix_length);
      uint32_t prefix_tvi;
      bool new_key;
      PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
      libtextclassifier3::Status status =
          main_lexicon_->Insert(prefix.c_str(), &posting_list_id, &prefix_tvi,
                                /*replace=*/false, &new_key);
      if (!status.ok()) {
        ICING_LOG(DBG) << "Could not insert prefix: " << prefix << "\n"
                       << status.error_message();
        return status;
      }

      // Prefix tvi will have hits in prefix section.
      if (!main_lexicon_->SetProperty(prefix_tvi,
                                      GetHasHitsInPrefixSectionPropertyId())) {
        return absl_ports::InternalError(
            "Setting has hits in prefix section prop failed");
      }

      // If it hasn't been added by non-prefix term insertions in
      // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
      if (new_key && !main_lexicon_->SetProperty(
                         prefix_tvi, GetHasNoExactHitsPropertyId())) {
        return absl_ports::InternalError("Setting no exact hits prop failed");
      }

      outputs.prefix_tvis_buf.push_back(prefix_tvi);

      memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(prefix_tvi),
             sizeof(posting_list_id));
      if (posting_list_id.block_index() != kInvalidBlockIndex) {
        outputs.main_tvi_to_block_index[prefix_tvi] =
            posting_list_id.block_index();
      }
    }

    // Any prefixes added? Then add to map.
    if (buf_start < outputs.prefix_tvis_buf.size()) {
      outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
          buf_start, outputs.prefix_tvis_buf.size() - buf_start};
    }
  }
  return std::move(outputs);
}

bool MainIndex::CopyProperties(
    const IcingDynamicTrie::PropertyReadersAll& prop_reader,
    const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
    uint32_t new_main_tvi) {
  for (uint32_t property_id = 0; property_id < prop_reader.size();
       ++property_id) {
    if (property_id == GetHasNoExactHitsPropertyId()) {
      // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
      // for this term, then HasNoExactHits needs to be set to false in
      // main_lexicon. If other_lexicon has no exact hits for this term, then
      // HasNoExactHits in the main_lexicon should not be modified.
      if (!prop_reader.HasProperty(property_id, other_tvi) &&
          !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
        ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed";
        return false;
      }
    } else {
      // If other_lexicon has this property set for this term, then that
      // property needs to be set for the main_lexicon. If other_lexicon
      // doesn't have this property set, then the property in the main lexicon
      // should not be modified.
      if (prop_reader.HasProperty(property_id, other_tvi) &&
          !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
        return false;
      }
    }
  }
  return true;
}

libtextclassifier3::Status MainIndex::AddHits(
    const TermIdCodec& term_id_codec,
    std::unordered_map<uint32_t, uint32_t>&& backfill_map,
    std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id) {
  if (hits.empty()) {
    flash_index_storage_->set_last_indexed_docid(last_added_document_id);
    return libtextclassifier3::Status::OK;
  }
  uint32_t cur_term_id = hits[0].term_id();
  ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
                         term_id_codec.DecodeTermInfo(cur_term_id));
  // Iterate through all hits. If these hits are for a term that also needs
  // backfill, then backfill first and then add the new hits.
  size_t k_start = 0;
  size_t k_end = 0;
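  // [k_start, k_end) delimits the contiguous run of entries in 'hits' that
  // share cur_term_id.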
  while (k_start < hits.size()) {
    uint32_t term_id = hits[k_end].term_id();
    while (term_id == cur_term_id && ++k_end < hits.size()) {
      term_id = hits[k_end].term_id();
    }

    // Look for backfill.
    PostingListIdentifier backfill_posting_list_id =
        PostingListIdentifier::kInvalid;
    auto itr = backfill_map.find(cur_decoded_term.tvi);
    if (itr != backfill_map.end()) {
      const void* value = main_lexicon_->GetValueAtIndex(itr->second);
      memcpy(&backfill_posting_list_id, value,
             sizeof(backfill_posting_list_id));
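      // Erase the entry so that the "remaining backfills" pass below only
      // handles backfill-only terms that received no new hits.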
      backfill_map.erase(itr);
    }
    ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi,
                                         backfill_posting_list_id,
                                         &hits[k_start], k_end - k_start));
    cur_term_id = term_id;
    ICING_ASSIGN_OR_RETURN(cur_decoded_term,
                           term_id_codec.DecodeTermInfo(cur_term_id));
    k_start = k_end;
  }

  // Now copy remaining backfills.
  ICING_VLOG(1) << "Remaining backfills " << backfill_map.size();
  for (auto other_tvi_main_tvi_pair : backfill_map) {
    PostingListIdentifier backfill_posting_list_id =
        PostingListIdentifier::kInvalid;
    memcpy(&backfill_posting_list_id,
           main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second),
           sizeof(backfill_posting_list_id));
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<PostingListHitAccessor> hit_accum,
        PostingListHitAccessor::Create(flash_index_storage_.get(),
                                       posting_list_hit_serializer_.get()));
    ICING_RETURN_IF_ERROR(
        AddPrefixBackfillHits(backfill_posting_list_id, hit_accum.get()));
    PostingListAccessor::FinalizeResult result =
        std::move(*hit_accum).Finalize();
    if (result.id.is_valid()) {
      main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id);
    }
  }
  flash_index_storage_->set_last_indexed_docid(last_added_document_id);
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status MainIndex::AddHitsForTerm(
    uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
    const TermIdHitPair* hit_elements, size_t len) {
  // 1. Create a PostingListHitAccessor - either from the pre-existing block, if
  // one exists, or from scratch.
  PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
  memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi),
         sizeof(posting_list_id));
  std::unique_ptr<PostingListHitAccessor> pl_accessor;
  if (posting_list_id.is_valid()) {
    if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
      ICING_LOG(ERROR) << "Index dropped hits. Invalid block index "
                       << posting_list_id.block_index()
                       << " >= " << flash_index_storage_->num_blocks();
      // TODO(b/159918304) : Consider revising the checksumming strategy in the
      // main index. Providing some mechanism to check for corruption - either
      // during initialization or some later time would allow us to avoid
      // whack-a-mole with odd corruption issues like this one (b/62820689).
      return absl_ports::InternalError(
          "Valid posting list has an invalid block index!");
    }
    ICING_ASSIGN_OR_RETURN(
        pl_accessor, PostingListHitAccessor::CreateFromExisting(
                         flash_index_storage_.get(),
                         posting_list_hit_serializer_.get(), posting_list_id));
  } else {
    // New posting list.
    ICING_ASSIGN_OR_RETURN(
        pl_accessor,
        PostingListHitAccessor::Create(flash_index_storage_.get(),
                                       posting_list_hit_serializer_.get()));
  }

  // 2. Backfill any hits if necessary.
  if (backfill_posting_list_id.is_valid()) {
    ICING_RETURN_IF_ERROR(
        AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get()));
  }

  // 3. Add all the new hits.
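  // Prepending in reverse array order means that, when the posting list is
  // read back (most recently prepended hits first), the hits come back in the
  // same order as hit_elements.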
  for (int i = len - 1; i >= 0; --i) {
    Hit hit = hit_elements[i].hit();
    ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit));
  }

  // 4. Finalize this posting list and put its identifier in the lexicon.
  PostingListAccessor::FinalizeResult result =
      std::move(*pl_accessor).Finalize();
  if (result.id.is_valid()) {
    main_lexicon_->SetValueAtIndex(tvi, &result.id);
  }
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
    PostingListIdentifier backfill_posting_list_id,
    PostingListHitAccessor* hit_accum) {
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<PostingListHitAccessor> backfill_accessor,
      PostingListHitAccessor::CreateFromExisting(
          flash_index_storage_.get(), posting_list_hit_serializer_.get(),
          backfill_posting_list_id));
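  // Drain the backfill posting list batch by batch into backfill_hits.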
  std::vector<Hit> backfill_hits;
  ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
                         backfill_accessor->GetNextHitsBatch());
  while (!tmp.empty()) {
    std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits));
    ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch());
  }

  Hit last_added_hit;
  // The hits in backfill_hits are in the reverse order of how they were added.
  // Iterate in reverse to add them to this new posting list in the correct
  // order.
  for (auto itr = backfill_hits.rbegin(); itr != backfill_hits.rend(); ++itr) {
    const Hit& hit = *itr;
    // Skip hits from non-prefix-enabled sections.
    if (!hit.is_in_prefix_section()) {
      continue;
    }

    // A backfill hit is a prefix hit in a prefix section.
    const Hit backfill_hit(hit.section_id(), hit.document_id(),
                           hit.term_frequency(),
                           /*is_in_prefix_section=*/true,
                           /*is_prefix_hit=*/true);
    if (backfill_hit == last_added_hit) {
      // Skip duplicate values due to overriding of the is_prefix flag.
      continue;
    }
    last_added_hit = backfill_hit;
    ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit));
  }
  return libtextclassifier3::Status::OK;
}

std::string MainIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
  std::string res;

  // Lexicon.
  std::string lexicon_info;
  main_lexicon_->GetDebugInfo(verbosity, &lexicon_info);

  IcingStringUtil::SStringAppendF(&res, 0,
                                  "last_added_document_id: %u\n"
                                  "\n"
                                  "main_lexicon_info:\n%s\n",
                                  last_added_document_id(),
                                  lexicon_info.c_str());

  if (verbosity == DebugInfoVerbosity::BASIC) {
    return res;
  }

  std::string flash_index_storage_info;
  flash_index_storage_->GetDebugInfo(verbosity, &flash_index_storage_info);
  IcingStringUtil::SStringAppendF(&res, 0, "flash_index_storage_info:\n%s\n",
                                  flash_index_storage_info.c_str());
  return res;
}

libtextclassifier3::Status MainIndex::Optimize(
    const std::vector<DocumentId>& document_id_old_to_new) {
  std::string temporary_index_dir_path = base_dir_ + "_temp";
  if (!filesystem_->DeleteDirectoryRecursively(
          temporary_index_dir_path.c_str())) {
    ICING_LOG(ERROR) << "Failed to recursively delete "
                     << temporary_index_dir_path;
    return absl_ports::InternalError(
        "Unable to delete temp directory to prepare to build new index.");
  }

  DestructibleDirectory temporary_index_dir(
      filesystem_, std::move(temporary_index_dir_path));
  if (!temporary_index_dir.is_valid()) {
    return absl_ports::InternalError(
        "Unable to create temp directory to build new index.");
  }

  ICING_ASSIGN_OR_RETURN(std::unique_ptr<MainIndex> new_index,
                         MainIndex::Create(temporary_index_dir.dir(),
                                           filesystem_, icing_filesystem_));
  ICING_RETURN_IF_ERROR(TransferIndex(document_id_old_to_new, new_index.get()));
  ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
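  // Release the new index and this index's own storage and lexicon so their
  // underlying files are closed before the directory swap below.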
  new_index = nullptr;
  flash_index_storage_ = nullptr;
  main_lexicon_ = nullptr;

  if (!filesystem_->SwapFiles(temporary_index_dir.dir().c_str(),
                              base_dir_.c_str())) {
    return absl_ports::InternalError(
        "Unable to apply new index due to failed swap!");
  }

  // Reinitialize the index so that flash_index_storage_ and main_lexicon_ are
  // properly updated.
  return Init();
}

libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits(
    const std::vector<DocumentId>& document_id_old_to_new, const char* term,
    PostingListHitAccessor& old_pl_accessor, MainIndex* new_index) {
  std::vector<Hit> new_hits;
  bool has_no_exact_hits = true;
  bool has_hits_in_prefix_section = false;
  // The largest document id after translating hits.
  DocumentId largest_document_id = kInvalidDocumentId;
  ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
                         old_pl_accessor.GetNextHitsBatch());
  while (!tmp.empty()) {
    for (const Hit& hit : tmp) {
      DocumentId new_document_id = document_id_old_to_new[hit.document_id()];
      // Transfer the document id of the hit, if the document is not deleted
      // or outdated.
      if (new_document_id != kInvalidDocumentId) {
        if (hit.is_in_prefix_section()) {
          has_hits_in_prefix_section = true;
        }
        if (!hit.is_prefix_hit()) {
          has_no_exact_hits = false;
        }
        if (largest_document_id == kInvalidDocumentId ||
            new_document_id > largest_document_id) {
          largest_document_id = new_document_id;
        }
        new_hits.push_back(Hit::TranslateHit(hit, new_document_id));
      }
    }
    ICING_ASSIGN_OR_RETURN(tmp, old_pl_accessor.GetNextHitsBatch());
  }
  // A term without exact hits indicates that it is a purely backfill term. If
  // the term is not branching in the new trie, backfilling is no longer
  // necessary, so we can skip it.
  if (new_hits.empty() ||
      (has_no_exact_hits && !new_index->main_lexicon_->IsBranchingTerm(term))) {
    return largest_document_id;
  }

  ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> hit_accum,
                         PostingListHitAccessor::Create(
                             new_index->flash_index_storage_.get(),
                             new_index->posting_list_hit_serializer_.get()));
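  // Prepend in reverse so that the new posting list, when read back, returns
  // the translated hits in the same order they were read from the old posting
  // list.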
  for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) {
    ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*itr));
  }
  PostingListAccessor::FinalizeResult result = std::move(*hit_accum).Finalize();
  if (!result.id.is_valid()) {
    return absl_ports::InternalError(
        absl_ports::StrCat("Failed to add translated hits for term: ", term));
  }
  uint32_t tvi;
  libtextclassifier3::Status status =
      new_index->main_lexicon_->Insert(term, &result.id, &tvi,
                                       /*replace=*/false);
  if (!status.ok()) {
    ICING_LOG(DBG) << "Could not transfer main index for term: " << term << "\n"
                   << status.error_message();
    return status;
  }
  if (has_no_exact_hits && !new_index->main_lexicon_->SetProperty(
                               tvi, GetHasNoExactHitsPropertyId())) {
    return absl_ports::InternalError("Setting prefix prop failed");
  }
  if (has_hits_in_prefix_section &&
      !new_index->main_lexicon_->SetProperty(
          tvi, GetHasHitsInPrefixSectionPropertyId())) {
    return absl_ports::InternalError("Setting prefix prop failed");
  }
  return largest_document_id;
}

libtextclassifier3::Status MainIndex::TransferIndex(
    const std::vector<DocumentId>& document_id_old_to_new,
    MainIndex* new_index) {
  DocumentId largest_document_id = kInvalidDocumentId;
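  // Iterating the lexicon in reverse appears intended to visit longer terms
  // before their prefixes, so that a backfill-only prefix term is transferred
  // only after its child terms already exist in new_index (which the
  // IsBranchingTerm check in TransferAndAddHits relies on).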
  for (IcingDynamicTrie::Iterator term_itr(*main_lexicon_, /*prefix=*/"",
                                           /*reverse=*/true);
       term_itr.IsValid(); term_itr.Advance()) {
    PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
    memcpy(&posting_list_id, term_itr.GetValue(), sizeof(posting_list_id));
    if (posting_list_id == PostingListIdentifier::kInvalid) {
      // Why?
      ICING_LOG(ERROR)
          << "Got invalid posting_list_id from previous main index";
      continue;
    }
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<PostingListHitAccessor> pl_accessor,
        PostingListHitAccessor::CreateFromExisting(
            flash_index_storage_.get(), posting_list_hit_serializer_.get(),
            posting_list_id));
    ICING_ASSIGN_OR_RETURN(
        DocumentId curr_largest_document_id,
        TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(),
                           *pl_accessor, new_index));
    if (curr_largest_document_id == kInvalidDocumentId) {
      continue;
    }
    if (largest_document_id == kInvalidDocumentId ||
        curr_largest_document_id > largest_document_id) {
      largest_document_id = curr_largest_document_id;
    }
  }
  new_index->flash_index_storage_->set_last_indexed_docid(largest_document_id);
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing