// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/lite/lite-index.h"

#include <sys/mman.h>

#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index-header.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

// Point at which we declare the trie full.
constexpr double kTrieFullFraction = 0.95;

std::string MakeHitBufferFilename(const std::string& filename_base) {
  return filename_base + "hb";
}

size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }

}  // namespace

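// kInvalidValue packs term id 0 with a default-constructed (invalid) Hit; it
// serves as the sentinel for unused entries in the hit buffer.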
const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
    TermIdHitPair(0, Hit()).value();

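// Example usage (a sketch, not part of this file; assumes the caller owns a
// valid Options instance and an IcingFilesystem):
//
//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<LiteIndex> index,
//                          LiteIndex::Create(options, &filesystem));
//   ICING_ASSIGN_OR_RETURN(uint32_t tvi,
//                          index->InsertTerm("foo", TermMatchType::EXACT_ONLY,
//                                            /*namespace_id=*/0));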
libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
    const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);

  std::unique_ptr<LiteIndex> lite_index =
      std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
  ICING_RETURN_IF_ERROR(lite_index->Initialize());
  return std::move(lite_index);
}

// options.hit_buffer_size is the maximum size in elements. An appropriate
// lexicon and display mapping size will be chosen based on the hit buffer
// size.
LiteIndex::LiteIndex(const LiteIndex::Options& options,
                     const IcingFilesystem* filesystem)
    : hit_buffer_(*filesystem),
      hit_buffer_crc_(0),
      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
               filesystem),
      header_mmap_(false, MAP_SHARED),
      options_(options),
      filesystem_(filesystem) {}

LiteIndex::~LiteIndex() {
  if (initialized()) {
    libtextclassifier3::Status unused = PersistToDisk();
  }
}

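// The kMapSharedWithCrc storage policy backs the trie with MAP_SHARED
// mappings and maintains a crc for integrity checks; ComputeChecksum() folds
// that crc into the index-wide checksum via lexicon_.UpdateCrc().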
IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
  return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
      IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
}

libtextclassifier3::Status LiteIndex::Initialize() {
  // Size of hit buffer's header struct, rounded up to the nearest number of
  // system memory pages.
  const size_t header_padded_size =
      IcingMMapper::page_aligned_size(header_size());

  // Variable declarations cannot cross goto jumps, so declare these up top.
  libtextclassifier3::Status status;
  uint64_t file_size;
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
      !lexicon_.Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }

  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
      MakeHitBufferFilename(options_.filename_base).c_str()));
  if (!hit_buffer_fd_.is_valid()) {
    status = absl_ports::InternalError("Failed to open hit buffer file");
    goto error;
  }

  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    status = absl_ports::InternalError("Failed to query hit buffer file size");
    goto error;
  }

  if (file_size < header_padded_size) {
    if (file_size != 0) {
      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Hit buffer had unexpected size %" PRIu64, file_size));
      goto error;
    }

    ICING_VLOG(2) << "Creating new hit buffer";
    // Make sure files are fresh.
    if (!lexicon_.Remove() ||
        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
        !lexicon_.Init()) {
      status =
          absl_ports::InternalError("Failed to refresh lexicon during clear");
      goto error;
    }

    // Create fresh hit buffer by first emptying the hit buffer file and then
    // allocating header_padded_size of the cleared space.
    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
      status = absl_ports::InternalError("Failed to truncate hit buffer file");
      goto error;
    }

    // Set up header.
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));
    header_->Reset();

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status =
          absl_ports::InternalError("Failed to initialize new hit buffer");
      goto error;
    }

    UpdateChecksum();
  } else {
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError(
          "Failed to re-initialize existing hit buffer");
      goto error;
    }

    // Check integrity.
    if (!header_->check_magic()) {
      status = absl_ports::InternalError("Lite index header magic mismatch");
      goto error;
    }
    Crc32 crc = ComputeChecksum();
    if (crc.Get() != header_->lite_index_crc()) {
      status = absl_ports::DataLossError(
          IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
                                        crc.Get(), header_->lite_index_crc()));
      goto error;
    }
  }

  ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
  return status;

error:
  header_ = nullptr;
  header_mmap_.Unmap();
  lexicon_.Close();
  hit_buffer_crc_ = 0;
  hit_buffer_.Reset();
  hit_buffer_fd_.reset();
  if (status.ok()) {
    return absl_ports::InternalError(
        "Error handling code ran but status was ok");
  }
  return status;
}

Crc32 LiteIndex::ComputeChecksum() {
  IcingTimer timer;

  // Update crcs.
  uint32_t dependent_crcs[2];
  hit_buffer_.UpdateCrc();
  dependent_crcs[0] = hit_buffer_crc_;
  dependent_crcs[1] = lexicon_.UpdateCrc();

  // Compute the master crc.

  // Header crc, excluding the actual crc field.
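  // Excluding the crc field keeps the checksum from depending on its own
  // stored value.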
  Crc32 all_crc(header_->CalculateHeaderCrc());
  all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
                                  sizeof(dependent_crcs)));
  ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000
                << "ms";

  return all_crc;
}

libtextclassifier3::Status LiteIndex::Reset() {
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  // TODO(b/140436942): When these components have been changed to return
  // errors they should be propagated from here.
  lexicon_.Clear();
  hit_buffer_.Clear();
  header_->Reset();
  UpdateChecksum();

  ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
  return libtextclassifier3::Status::OK;
}

void LiteIndex::Warm() {
  absl_ports::shared_lock l(&mutex_);
  hit_buffer_.Warm();
  lexicon_.Warm();
}

libtextclassifier3::Status LiteIndex::PersistToDisk() {
  absl_ports::unique_lock l(&mutex_);
  bool success = true;
  if (!lexicon_.Sync()) {
    ICING_VLOG(1) << "Failed to sync the lexicon.";
    success = false;
  }
  hit_buffer_.Sync();
  UpdateChecksum();
  header_mmap_.Sync();

  return (success) ? libtextclassifier3::Status::OK
                   : absl_ports::InternalError(
                         "Unable to sync lite index components.");
}

void LiteIndex::UpdateChecksum() {
  header_->set_lite_index_crc(ComputeChecksum().Get());
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
    const std::string& term, TermMatchType::Code term_match_type,
    NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  uint32_t tvi;
  libtextclassifier3::Status status =
      lexicon_.Insert(term.c_str(), "", &tvi, false);
  if (!status.ok()) {
    ICING_LOG(DBG) << "Unable to add term " << term << " to lexicon!\n"
                   << status.error_message();
    return status;
  }
  ICING_RETURN_IF_ERROR(UpdateTermPropertiesImpl(
      tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
  return tvi;
}

libtextclassifier3::Status LiteIndex::UpdateTermProperties(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  return UpdateTermPropertiesImpl(tvi, hasPrefixHits, namespace_id);
}

libtextclassifier3::Status LiteIndex::UpdateTermPropertiesImpl(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  if (hasPrefixHits &&
      !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create prefix property!");
  }

  if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create namespace property!");
  }

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
  absl_ports::unique_lock l(&mutex_);
  if (is_full()) {
    return absl_ports::ResourceExhaustedError("Hit buffer is full!");
  }

  TermIdHitPair term_id_hit_pair(term_id, hit);
  uint32_t cur_size = header_->cur_size();
  TermIdHitPair::Value* valp =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
  if (valp == nullptr) {
    return absl_ports::ResourceExhaustedError(
        "Allocating more space in hit buffer failed!");
  }
  *valp = term_id_hit_pair.value();
  header_->set_cur_size(cur_size + 1);
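  // The new hit lands in the unsorted tail of the hit buffer
  // ([searchable_end, cur_size)); it becomes visible to Seek() only after the
  // next SortHits() extends the searchable region.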

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
    const std::string& term) const {
  absl_ports::shared_lock l(&mutex_);
  char dummy;
  uint32_t tvi;
  if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
    return absl_ports::NotFoundError(
        absl_ports::StrCat("Could not find ", term, " in the lexicon."));
  }
  return tvi;
}

int LiteIndex::FetchHits(
    uint32_t term_id, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
  int score = 0;
  DocumentId last_document_id = kInvalidDocumentId;
  // Record whether the last document belongs to the given namespaces.
  bool is_last_document_desired = false;

  if (NeedSort()) {
    // Transitioning from the shared_lock in NeedSort to a unique_lock here is
    // safe because it doesn't hurt to sort again if sorting was already done
    // by another thread after NeedSort was evaluated. NeedSort is called
    // before sorting to improve concurrency, as threads can avoid acquiring
    // the unique lock if no sorting is needed.
    absl_ports::unique_lock l(&mutex_);
    SortHits();
  }

  // This downgrade from a unique_lock to a shared_lock is safe because Seek()
  // only searches for the term in the searchable (sorted) section of the
  // HitBuffer.
  // Any operations that might execute between the lock transitions here are
  // guaranteed not to alter the searchable section (or the LiteIndex, due to
  // a global lock in IcingSearchEngine).
  absl_ports::shared_lock l(&mutex_);
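  // In the sorted region, all hits for a given term id are contiguous, so
  // scan forward from the first matching entry until the term id changes.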
  for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) {
    TermIdHitPair term_id_hit_pair =
        hit_buffer_.array_cast<TermIdHitPair>()[idx];
    if (term_id_hit_pair.term_id() != term_id) break;

    const Hit& hit = term_id_hit_pair.hit();
    // Check sections.
    if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
      continue;
    }
    // Check prefix section only.
    if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
      continue;
    }
    // TODO(b/230553264) Move common logic into helper function once we support
    // score term by prefix_hit in lite_index.
    // Check whether this Hit is desired.
    DocumentId document_id = hit.document_id();
    bool is_new_document = document_id != last_document_id;
    if (is_new_document) {
      last_document_id = document_id;
      is_last_document_desired =
          suggestion_result_checker == nullptr ||
          suggestion_result_checker->BelongsToTargetResults(document_id,
                                                            hit.section_id());
    }
    if (!is_last_document_desired) {
      // The document is removed or expired or not desired.
      continue;
    }

    // Score the hit by the requested strategy.
    switch (score_by) {
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
        score = 1;
        break;
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
          DOCUMENT_COUNT:
        if (is_new_document) {
          ++score;
        }
        break;
      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
          TERM_FREQUENCY:
        if (hit.has_term_frequency()) {
          score += hit.term_frequency();
        } else {
          ++score;
        }
        break;
    }

    // Append a new DocHitInfo to the output, or update the section info of
    // the last one.
    if (is_new_document && hits_out != nullptr) {
      hits_out->push_back(DocHitInfo(document_id));
      if (term_frequency_out != nullptr) {
        term_frequency_out->push_back(Hit::TermFrequencyArray());
      }
    }
    if (hits_out != nullptr) {
      hits_out->back().UpdateSection(hit.section_id());
      if (term_frequency_out != nullptr) {
        term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
      }
    }
  }
  return score;
}

libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
    uint32_t term_id,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  return FetchHits(term_id, kSectionIdMaskAll,
                   /*only_from_prefix_sections=*/false, score_by,
                   suggestion_result_checker,
                   /*hits_out=*/nullptr);
}

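// The index is full when the hit buffer has reached its configured capacity
// or when less than (1 - kTrieFullFraction) of the lexicon remains free.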
bool LiteIndex::is_full() const {
  return (header_->cur_size() == options_.hit_buffer_size ||
          lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}

std::string LiteIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) {
  absl_ports::unique_lock l(&mutex_);
  std::string res;
  std::string lexicon_info;
  lexicon_.GetDebugInfo(verbosity, &lexicon_info);
  IcingStringUtil::SStringAppendF(
      &res, 0,
      "curr_size: %u\n"
      "hit_buffer_size: %u\n"
      "last_added_document_id: %u\n"
      "searchable_end: %u\n"
      "index_crc: %u\n"
      "\n"
      "lite_lexicon_info:\n%s\n",
      header_->cur_size(), options_.hit_buffer_size,
      header_->last_added_docid(), header_->searchable_end(),
      ComputeChecksum().Get(), lexicon_info.c_str());
  return res;
}

libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
  IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
  if (storage_info.lite_index_hit_buffer_size() == -1 ||
      storage_info.lite_index_lexicon_size() == -1) {
    return absl_ports::AbortedError(
        "Failed to get size of LiteIndex's members.");
  }
  // On initialization, we grow the file to a padded size first, so this
  // padding doesn't count towards the size taken up by elements.
  size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
  return storage_info.lite_index_hit_buffer_size() - header_padded_size +
         storage_info.lite_index_lexicon_size();
}

IndexStorageInfoProto LiteIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  absl_ports::shared_lock l(&mutex_);
  int64_t header_and_hit_buffer_file_size =
      filesystem_->GetFileSize(hit_buffer_fd_.get());
  storage_info.set_lite_index_hit_buffer_size(
      IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
  int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
  if (lexicon_disk_usage != Filesystem::kBadFileSize) {
    storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
  } else {
    storage_info.set_lite_index_lexicon_size(-1);
  }
  return storage_info;
}

void LiteIndex::SortHits() {
  // Make the unsorted tail of the hit buffer searchable by sorting it.
  uint32_t sort_len = header_->cur_size() - header_->searchable_end();
  if (sort_len == 0) {
    return;
  }
  IcingTimer timer;

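  // The buffer holds two regions: [0, searchable_end) is sorted (and is the
  // only part Seek() examines), while [searchable_end, cur_size) contains
  // hits in insertion order.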
  auto* array_start =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
  TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
  std::sort(sort_start, array_start + header_->cur_size());

  // Now merge the newly sorted region with the previously sorted (and
  // deduplicated) region so the entire buffer is ordered.
  if (header_->searchable_end() > 0) {
    std::inplace_merge(array_start, array_start + header_->searchable_end(),
                       array_start + header_->cur_size());
  }
  ICING_VLOG(2) << "Lite index sort and merge " << sort_len << " into "
                << header_->searchable_end() << " in " << timer.Elapsed() * 1000
                << "ms";

  // Now the entire array is sorted.
  header_->set_searchable_end(header_->cur_size());

  // Update crc in-line.
  UpdateChecksum();
}

uint32_t LiteIndex::Seek(uint32_t term_id) const {
  // Binary search for our term_id. Make sure we get the first matching
  // element. Using kMaxDocumentIdSortValue ensures this for the hit value.
  TermIdHitPair term_id_hit_pair(
      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));

  const TermIdHitPair::Value* array =
      hit_buffer_.array_cast<TermIdHitPair::Value>();
  if (header_->searchable_end() != header_->cur_size()) {
    ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current "
                       << "size during Seek(): "
                       << header_->searchable_end() << " vs "
                       << header_->cur_size();
  }
  const TermIdHitPair::Value* ptr = std::lower_bound(
      array, array + header_->searchable_end(), term_id_hit_pair.value());
  return ptr - array;
}

libtextclassifier3::Status LiteIndex::Optimize(
    const std::vector<DocumentId>& document_id_old_to_new,
    const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
  absl_ports::unique_lock l(&mutex_);
  header_->set_last_added_docid(new_last_added_document_id);
  if (header_->cur_size() == 0) {
    return libtextclassifier3::Status::OK;
  }
  // Sort the hits so that hits with the same term id will be grouped together,
  // which helps later to determine which terms will be unused after
  // compaction.
  SortHits();
  uint32_t new_size = 0;
  uint32_t curr_term_id = 0;
  uint32_t curr_tvi = 0;
  std::unordered_set<uint32_t> tvi_to_delete;
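  // Compaction pass: walk the sorted buffer once, translating each hit's
  // document id and rewriting surviving hits to the front of the buffer.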
  for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
    TermIdHitPair term_id_hit_pair(
        hit_buffer_.array_cast<TermIdHitPair>()[idx]);
    if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
      curr_term_id = term_id_hit_pair.term_id();
      ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
                             term_id_codec->DecodeTermInfo(curr_term_id));
      curr_tvi = term_info.tvi;
      // Mark the property of the current term as not having hits in prefix
      // section. The property will be set below if there are any valid hits
      // from a prefix section.
      lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
      // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
      // below if there are any valid hits pointing to that termid.
      tvi_to_delete.insert(curr_tvi);
    }
    DocumentId new_document_id =
        document_id_old_to_new[term_id_hit_pair.hit().document_id()];
    if (new_document_id == kInvalidDocumentId) {
      continue;
    }
    if (term_id_hit_pair.hit().is_in_prefix_section()) {
      lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
    }
    tvi_to_delete.erase(curr_tvi);
    TermIdHitPair new_term_id_hit_pair(
        term_id_hit_pair.term_id(),
        Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
    // Rewrite the hit_buffer in place. new_size is always <= idx, so it is
    // safe to overwrite the entry at new_size, and valp should never be
    // nullptr since it lies within the already-allocated region of
    // hit_buffer_.
    TermIdHitPair::Value* valp =
        hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
    *valp = new_term_id_hit_pair.value();
  }
  header_->set_cur_size(new_size);
  header_->set_searchable_end(new_size);
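  // The rewrite above preserves the order produced by SortHits(), so the
  // entire compacted buffer remains sorted and fully searchable.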

  // Delete unused terms.
  std::unordered_set<std::string> terms_to_delete;
  for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
       term_iter.IsValid(); term_iter.Advance()) {
    if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
      terms_to_delete.insert(term_iter.GetKey());
    }
  }
  for (const std::string& term : terms_to_delete) {
    // Mark "term" as deleted. This won't actually free space in the lexicon.
    // It will simply make it impossible to Find "term" in subsequent calls
    // (which saves an unnecessary search through the hit buffer). This is
    // acceptable because the free space will eventually be reclaimed the next
    // time that the lite index is merged with the main index.
    if (!lexicon_.Delete(term)) {
      return absl_ports::InternalError(
          "Could not delete invalid terms in lite lexicon during compaction.");
    }
  }
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing