// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/lite/lite-index.h"

#include <sys/mman.h>

#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index-header.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {

namespace {

// Point at which we declare the trie full.
constexpr double kTrieFullFraction = 0.95;

std::string MakeHitBufferFilename(const std::string& filename_base) {
  return filename_base + "hb";
}

size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }

}  // namespace

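// The sentinel "invalid" value: term_id 0 paired with a default-constructed
// Hit, which no valid hit buffer entry is expected to equal.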
const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
    TermIdHitPair(0, Hit()).value();

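// Illustrative usage (a sketch only; LiteIndex::Options is declared in
// lite-index.h, and the fields named here are assumed from how this file
// reads them):
//
//   LiteIndex::Options options = ...;  // filename_base, hit_buffer_size,
//                                      // lexicon_options
//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<LiteIndex> index,
//                          LiteIndex::Create(options, &icing_filesystem));
//   ...
//   ICING_RETURN_IF_ERROR(index->PersistToDisk());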
libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
    const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);

  std::unique_ptr<LiteIndex> lite_index =
      std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
  ICING_RETURN_IF_ERROR(lite_index->Initialize());
  return std::move(lite_index);
}

// size is max size in elements. An appropriate lexicon and display
// mapping size will be chosen based on hit buffer size.
LiteIndex::LiteIndex(const LiteIndex::Options& options,
                     const IcingFilesystem* filesystem)
    : hit_buffer_(*filesystem),
      hit_buffer_crc_(0),
      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
               filesystem),
      header_mmap_(false, MAP_SHARED),
      options_(options),
      filesystem_(filesystem) {}

LiteIndex::~LiteIndex() {
  if (initialized()) {
    libtextclassifier3::Status unused = PersistToDisk();
  }
}

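// The lexicon trie is mapped MAP_SHARED with crc maintenance enabled;
// ComputeChecksum() below relies on lexicon_.UpdateCrc() being available
// under this storage policy.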
IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
  return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
      IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
}

libtextclassifier3::Status LiteIndex::Initialize() {
  // Size of hit buffer's header struct, rounded up to the nearest number of
  // system memory pages.
  const size_t header_padded_size =
      IcingMMapper::page_aligned_size(header_size());

  // Variable declarations cannot cross goto jumps, so declare these up top.
  libtextclassifier3::Status status;
  uint64_t file_size;
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
      !lexicon_.Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }

  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
      MakeHitBufferFilename(options_.filename_base).c_str()));
  if (!hit_buffer_fd_.is_valid()) {
    status = absl_ports::InternalError("Failed to open hit buffer file");
    goto error;
  }

  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    status = absl_ports::InternalError("Failed to query hit buffer file size");
    goto error;
  }

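  // Two cases from here: a zero-length file is a brand-new index and gets
  // created from scratch; a file at least header_padded_size long is an
  // existing index and is remapped and integrity-checked. Any other size
  // indicates corruption.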
  if (file_size < header_padded_size) {
    if (file_size != 0) {
      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Hit buffer had unexpected size %" PRIu64, file_size));
      goto error;
    }

    ICING_VLOG(2) << "Creating new hit buffer";
    // Make sure files are fresh.
    if (!lexicon_.Remove() ||
        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
        !lexicon_.Init()) {
      status =
          absl_ports::InternalError("Failed to refresh lexicon during clear");
      goto error;
    }

    // Create fresh hit buffer by first emptying the hit buffer file and then
    // allocating header_padded_size of the cleared space.
    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
      status = absl_ports::InternalError("Failed to truncate hit buffer file");
      goto error;
    }

    // Set up header.
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));
    header_->Reset();

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError("Failed to initialize new hit buffer");
      goto error;
    }

    UpdateChecksum();
  } else {
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError(
          "Failed to re-initialize existing hit buffer");
      goto error;
    }

    // Check integrity.
    if (!header_->check_magic()) {
      status = absl_ports::InternalError("Lite index header magic mismatch");
      goto error;
    }
    Crc32 crc = ComputeChecksum();
    if (crc.Get() != header_->lite_index_crc()) {
      status = absl_ports::DataLossError(
          IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
                                        crc.Get(), header_->lite_index_crc()));
      goto error;
    }
  }

  ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
  return status;

error:
  header_ = nullptr;
  header_mmap_.Unmap();
  lexicon_.Close();
  hit_buffer_crc_ = 0;
  hit_buffer_.Reset();
  hit_buffer_fd_.reset();
  if (status.ok()) {
    return absl_ports::InternalError(
        "Error handling code ran but status was ok");
  }
  return status;
}

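// The master crc covers three components: the header (excluding the stored
// crc field itself), the hit buffer, and the lexicon. The two dependent crcs
// are appended to the header crc as raw bytes.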
Crc32 LiteIndex::ComputeChecksum() {
  IcingTimer timer;

  // Update crcs.
  uint32_t dependent_crcs[2];
  hit_buffer_.UpdateCrc();
  dependent_crcs[0] = hit_buffer_crc_;
  dependent_crcs[1] = lexicon_.UpdateCrc();

  // Compute the master crc.

  // Header crc, excluding the actual crc field.
  Crc32 all_crc(header_->CalculateHeaderCrc());
  all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
                                  sizeof(dependent_crcs)));
  ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000
                << "ms";

  return all_crc;
}

libtextclassifier3::Status LiteIndex::Reset() {
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  // TODO(b/140436942): When these components have been changed to return errors
  // they should be propagated from here.
  lexicon_.Clear();
  hit_buffer_.Clear();
  header_->Reset();
  UpdateChecksum();

  ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
  return libtextclassifier3::Status::OK;
}

void LiteIndex::Warm() {
  absl_ports::shared_lock l(&mutex_);
  hit_buffer_.Warm();
  lexicon_.Warm();
}

libtextclassifier3::Status LiteIndex::PersistToDisk() {
  absl_ports::unique_lock l(&mutex_);
  bool success = true;
  if (!lexicon_.Sync()) {
    ICING_VLOG(1) << "Failed to sync the lexicon.";
    success = false;
  }
  hit_buffer_.Sync();
  UpdateChecksum();
  header_mmap_.Sync();

  return (success) ? libtextclassifier3::Status::OK
                   : absl_ports::InternalError(
                         "Unable to sync lite index components.");
}

void LiteIndex::UpdateChecksum() {
  header_->set_lite_index_crc(ComputeChecksum().Get());
}

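// The returned value is the term's value index (tvi) in the lexicon trie.
// Note that this is not the same id space that AddHit() consumes: callers
// are expected to encode the tvi through their TermIdCodec first (Optimize()
// below performs the inverse decoding).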
libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
    const std::string& term, TermMatchType::Code term_match_type,
    NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  uint32_t tvi;
  libtextclassifier3::Status status =
      lexicon_.Insert(term.c_str(), "", &tvi, false);
  if (!status.ok()) {
    ICING_LOG(DBG) << "Unable to add term " << term << " to lexicon!\n"
                   << status.error_message();
    return status;
  }
  ICING_RETURN_IF_ERROR(UpdateTermPropertiesImpl(
      tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
  return tvi;
}

libtextclassifier3::Status LiteIndex::UpdateTermProperties(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  return UpdateTermPropertiesImpl(tvi, hasPrefixHits, namespace_id);
}

libtextclassifier3::Status LiteIndex::UpdateTermPropertiesImpl(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  if (hasPrefixHits &&
      !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create prefix property!");
  }

  if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create namespace property!");
  }

  return libtextclassifier3::Status::OK;
}

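// Typical write path, sketched under the assumption that the caller's
// TermIdCodec (whose DecodeTermInfo() is used in Optimize() below) exposes
// an inverse EncodeTvi() operation:
//
//   ICING_ASSIGN_OR_RETURN(uint32_t tvi,
//                          index->InsertTerm(term, match_type, ns_id));
//   ICING_ASSIGN_OR_RETURN(uint32_t term_id, codec->EncodeTvi(tvi, ...));
//   ICING_RETURN_IF_ERROR(index->AddHit(term_id, hit));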
libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
  absl_ports::unique_lock l(&mutex_);
  if (is_full()) {
    return absl_ports::ResourceExhaustedError("Hit buffer is full!");
  }

  TermIdHitPair term_id_hit_pair(term_id, hit);
  uint32_t cur_size = header_->cur_size();
  TermIdHitPair::Value* valp =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
  if (valp == nullptr) {
    return absl_ports::ResourceExhaustedError(
        "Allocating more space in hit buffer failed!");
  }
  *valp = term_id_hit_pair.value();
  header_->set_cur_size(cur_size + 1);

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
    const std::string& term) const {
  absl_ports::shared_lock l(&mutex_);
  char dummy;
  uint32_t tvi;
  if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
    return absl_ports::NotFoundError(
        absl_ports::StrCat("Could not find ", term, " in the lexicon."));
  }
  return tvi;
}

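// Returns the accumulated score for term_id under the requested ranking
// strategy. When hits_out is non-null, one DocHitInfo per matching document
// is appended to it (and, when term_frequency_out is also non-null, a
// parallel array of per-section term frequencies).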
int LiteIndex::FetchHits(
    uint32_t term_id, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
  int score = 0;
  DocumentId last_document_id = kInvalidDocumentId;
  // Record whether the last document belongs to the given namespaces.
  bool is_last_document_desired = false;

  if (NeedSort()) {
    // Transitioning from the shared_lock in NeedSort to the unique_lock here
    // is safe because it doesn't hurt to sort again if sorting was already
    // done by another thread after NeedSort was evaluated. NeedSort is called
    // before sorting to improve concurrency, as threads can avoid acquiring
    // the unique lock if no sorting is needed.
    absl_ports::unique_lock l(&mutex_);
    SortHits();
  }

  // This downgrade from a unique_lock to a shared_lock is safe because we're
  // searching for the term in the searchable (sorted) section of the HitBuffer
  // only in Seek().
  // Any operations that might execute in between the transition of downgrading
  // the lock here are guaranteed not to alter the searchable section (or the
  // LiteIndex due to a global lock in IcingSearchEngine).
  absl_ports::shared_lock l(&mutex_);
  for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) {
    TermIdHitPair term_id_hit_pair =
        hit_buffer_.array_cast<TermIdHitPair>()[idx];
    if (term_id_hit_pair.term_id() != term_id) break;

    const Hit& hit = term_id_hit_pair.hit();
    // Check sections.
    if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
      continue;
    }
    // Check prefix section only.
    if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
      continue;
    }
    // TODO(b/230553264) Move common logic into helper function once we support
    //  score term by prefix_hit in lite_index.
    // Check whether this Hit is desired.
    DocumentId document_id = hit.document_id();
    bool is_new_document = document_id != last_document_id;
    if (is_new_document) {
      last_document_id = document_id;
      is_last_document_desired =
          suggestion_result_checker == nullptr ||
          suggestion_result_checker->BelongsToTargetResults(document_id,
                                                            hit.section_id());
    }
    if (!is_last_document_desired) {
      // The document is removed, expired, or otherwise not desired.
      continue;
    }

    // Score the hit by the strategy.
    switch (score_by) {
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
        score = 1;
        break;
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
          DOCUMENT_COUNT:
        if (is_new_document) {
          ++score;
        }
        break;
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
          TERM_FREQUENCY:
        if (hit.has_term_frequency()) {
          score += hit.term_frequency();
        } else {
          ++score;
        }
        break;
    }

    // Append the Hit to the output vector, or update the existing entry's
    // section info.
    if (is_new_document && hits_out != nullptr) {
      hits_out->push_back(DocHitInfo(document_id));
      if (term_frequency_out != nullptr) {
        term_frequency_out->push_back(Hit::TermFrequencyArray());
      }
    }
    if (hits_out != nullptr) {
      hits_out->back().UpdateSection(hit.section_id());
      if (term_frequency_out != nullptr) {
        term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
      }
    }
  }
  return score;
}

libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
    uint32_t term_id,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  return FetchHits(term_id, kSectionIdMaskAll,
                   /*only_from_prefix_sections=*/false, score_by,
                   suggestion_result_checker,
                   /*hits_out=*/nullptr);
}

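// The index is full when either the hit buffer has reached its configured
// element capacity or the lexicon trie has crossed kTrieFullFraction, i.e.
// less than 5% of its storage remains free.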
bool LiteIndex::is_full() const {
  return (header_->cur_size() == options_.hit_buffer_size ||
          lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}

std::string LiteIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) {
  absl_ports::unique_lock l(&mutex_);
  std::string res;
  std::string lexicon_info;
  lexicon_.GetDebugInfo(verbosity, &lexicon_info);
  IcingStringUtil::SStringAppendF(
      &res, 0,
      "curr_size: %u\n"
      "hit_buffer_size: %u\n"
      "last_added_document_id %u\n"
      "searchable_end: %u\n"
      "index_crc: %u\n"
      "\n"
      "lite_lexicon_info:\n%s\n",
      header_->cur_size(), options_.hit_buffer_size,
      header_->last_added_docid(), header_->searchable_end(),
      ComputeChecksum().Get(), lexicon_info.c_str());
  return res;
}

libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
  IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
  if (storage_info.lite_index_hit_buffer_size() == -1 ||
      storage_info.lite_index_lexicon_size() == -1) {
    return absl_ports::AbortedError(
        "Failed to get size of LiteIndex's members.");
  }
  // On initialization, we grow the file to a padded size first, so that
  // padding won't count towards the size taken up by elements.
  size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
  return storage_info.lite_index_hit_buffer_size() - header_padded_size +
         storage_info.lite_index_lexicon_size();
}

IndexStorageInfoProto LiteIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  absl_ports::shared_lock l(&mutex_);
  int64_t header_and_hit_buffer_file_size =
      filesystem_->GetFileSize(hit_buffer_fd_.get());
  storage_info.set_lite_index_hit_buffer_size(
      IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
  int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
  if (lexicon_disk_usage != Filesystem::kBadFileSize) {
    storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
  } else {
    storage_info.set_lite_index_lexicon_size(-1);
  }
  return storage_info;
}

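// The hit buffer is kept in two regions: [0, searchable_end) is sorted and
// deduplicated, and [searchable_end, cur_size) is the unsorted tail of recent
// AddHit() appends. SortHits() sorts the tail with std::sort and then folds
// it into the sorted prefix with std::inplace_merge.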
void LiteIndex::SortHits() {
  // Make the unsorted tail of the hit buffer searchable by sorting it.
  uint32_t sort_len = header_->cur_size() - header_->searchable_end();
  if (sort_len == 0) {
    return;
  }
  IcingTimer timer;

  auto* array_start =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
  TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
  std::sort(sort_start, array_start + header_->cur_size());

  // Now merge with the previous region. Since the previous region is already
  // sorted and deduplicated, optimize the merge by skipping everything less
  // than the new region's smallest value.
  if (header_->searchable_end() > 0) {
    std::inplace_merge(array_start, array_start + header_->searchable_end(),
                       array_start + header_->cur_size());
  }
  ICING_VLOG(2) << "Lite index sort and merge " << sort_len << " into "
                << header_->searchable_end() << " in " << timer.Elapsed() * 1000
                << "ms";

  // Now the entire array is sorted.
  header_->set_searchable_end(header_->cur_size());

  // Update crc in-line.
  UpdateChecksum();
}

uint32_t LiteIndex::Seek(uint32_t term_id) const {
  // Binary search for our term_id. Make sure we get the first element:
  // pairing the term_id with Hit(Hit::kMaxDocumentIdSortValue,
  // Hit::kDefaultTermFrequency) yields a value that sorts before any real hit
  // for the term, so std::lower_bound lands on the term's first entry.
  TermIdHitPair term_id_hit_pair(
      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));

  const TermIdHitPair::Value* array =
      hit_buffer_.array_cast<TermIdHitPair::Value>();
  if (header_->searchable_end() != header_->cur_size()) {
    ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current "
                       << "size during Seek(): "
                       << header_->searchable_end() << " vs "
                       << header_->cur_size();
  }
  const TermIdHitPair::Value* ptr = std::lower_bound(
      array, array + header_->searchable_end(), term_id_hit_pair.value());
  return ptr - array;
}

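// Compacts the hit buffer against a document id remapping produced by
// document store optimization: hits whose documents were deleted (i.e. mapped
// to kInvalidDocumentId) are dropped, surviving hits are rewritten in place
// with their new document ids, and terms left with no hits at all are marked
// deleted in the lexicon.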
libtextclassifier3::Status LiteIndex::Optimize(
    const std::vector<DocumentId>& document_id_old_to_new,
    const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
  absl_ports::unique_lock l(&mutex_);
  header_->set_last_added_docid(new_last_added_document_id);
  if (header_->cur_size() == 0) {
    return libtextclassifier3::Status::OK;
  }
  // Sort the hits so that hits with the same term id are grouped together,
  // which helps later to determine which terms will be unused after
  // compaction.
  SortHits();
  uint32_t new_size = 0;
  uint32_t curr_term_id = 0;
  uint32_t curr_tvi = 0;
  std::unordered_set<uint32_t> tvi_to_delete;
  for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
    TermIdHitPair term_id_hit_pair(
        hit_buffer_.array_cast<TermIdHitPair>()[idx]);
    if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
      curr_term_id = term_id_hit_pair.term_id();
      ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
                             term_id_codec->DecodeTermInfo(curr_term_id));
      curr_tvi = term_info.tvi;
      // Mark the property of the current term as not having hits in a prefix
      // section. The property will be set below if there are any valid hits
      // from a prefix section.
      lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
      // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
      // below if there are any valid hits pointing to that term id.
      tvi_to_delete.insert(curr_tvi);
    }
    DocumentId new_document_id =
        document_id_old_to_new[term_id_hit_pair.hit().document_id()];
    if (new_document_id == kInvalidDocumentId) {
      continue;
    }
    if (term_id_hit_pair.hit().is_in_prefix_section()) {
      lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
    }
    tvi_to_delete.erase(curr_tvi);
    TermIdHitPair new_term_id_hit_pair(
        term_id_hit_pair.term_id(),
        Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
    // Rewrite the hit_buffer in place. new_size never exceeds idx, so it is
    // safe to overwrite the entry at new_size, and valp should never be
    // nullptr since it is within the already allocated region of hit_buffer_.
    TermIdHitPair::Value* valp =
        hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
    *valp = new_term_id_hit_pair.value();
  }
  header_->set_cur_size(new_size);
  header_->set_searchable_end(new_size);

  // Delete unused terms.
  std::unordered_set<std::string> terms_to_delete;
  for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
       term_iter.IsValid(); term_iter.Advance()) {
    if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
      terms_to_delete.insert(term_iter.GetKey());
    }
  }
  for (const std::string& term : terms_to_delete) {
    // Mark "term" as deleted. This won't actually free space in the lexicon;
    // it will simply make it impossible to Find "term" in subsequent calls
    // (which saves an unnecessary search through the hit buffer). This is
    // acceptable because the free space will eventually be reclaimed the next
    // time the lite index is merged with the main index.
    if (!lexicon_.Delete(term)) {
      return absl_ports::InternalError(
          "Could not delete invalid terms in lite lexicon during compaction.");
    }
  }
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing