// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/lite/lite-index.h"

#include <sys/mman.h>

#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index-header.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/store/suggestion-result-checker.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

// Point at which we declare the trie full.
constexpr double kTrieFullFraction = 0.95;

std::string MakeHitBufferFilename(const std::string& filename_base) {
  return filename_base + "hb";
}

size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }

}  // namespace
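
// Sentinel "invalid" value: term id 0 paired with an invalid hit.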
const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
    TermIdHitPair(0, Hit(Hit::kInvalidValue)).value();

libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
    const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);

  std::unique_ptr<LiteIndex> lite_index =
      std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
  ICING_RETURN_IF_ERROR(lite_index->Initialize());
  return std::move(lite_index);
}
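
// A minimal usage sketch (illustrative only; the Options value is whatever
// the caller has configured):
//
//   IcingFilesystem filesystem;
//   LiteIndex::Options options = ...;
//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<LiteIndex> lite_index,
//                          LiteIndex::Create(options, &filesystem));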
// options.hit_buffer_size is the maximum size in elements. An appropriate
// lexicon and display mapping size will be chosen based on the hit buffer
// size.
LiteIndex::LiteIndex(const LiteIndex::Options& options,
                     const IcingFilesystem* filesystem)
    : hit_buffer_(*filesystem),
      hit_buffer_crc_(0),
      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
               filesystem),
      header_mmap_(false, MAP_SHARED),
      options_(options),
      filesystem_(filesystem) {}

LiteIndex::~LiteIndex() {
  if (initialized()) {
    libtextclassifier3::Status unused = PersistToDisk();
  }
}
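
// Storage policy for the lexicon trie: per the kMapSharedWithCrc name, the
// trie's files are mapped shared with crc maintenance enabled, which lets
// ComputeChecksum() below fold the lexicon's crc into the overall lite index
// checksum.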
IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
  return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
      IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
}

libtextclassifier3::Status LiteIndex::Initialize() {
  // Size of hit buffer's header struct, rounded up to the nearest number of
  // system memory pages.
  const size_t header_padded_size =
      IcingMMapper::page_aligned_size(header_size());

  // Variable declarations cannot cross goto jumps, so declare these up top.
  libtextclassifier3::Status status;
  uint64_t file_size;
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
      !lexicon_.Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }

  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
      MakeHitBufferFilename(options_.filename_base).c_str()));
  if (!hit_buffer_fd_.is_valid()) {
    status = absl_ports::InternalError("Failed to open hit buffer file");
    goto error;
  }

  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    status = absl_ports::InternalError("Failed to query hit buffer file size");
    goto error;
  }

  if (file_size < header_padded_size) {
    if (file_size != 0) {
      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Hit buffer had unexpected size %" PRIu64, file_size));
      goto error;
    }

    ICING_VLOG(2) << "Creating new hit buffer";
    // Make sure files are fresh.
    if (!lexicon_.Remove() ||
        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
        !lexicon_.Init()) {
      status =
          absl_ports::InternalError("Failed to refresh lexicon during clear");
      goto error;
    }

    // Create fresh hit buffer by first emptying the hit buffer file and then
    // allocating header_padded_size of the cleared space.
    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
      status = absl_ports::InternalError("Failed to truncate hit buffer file");
      goto error;
    }

    // Set up header.
    header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));
    header_->Reset();

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status =
          absl_ports::InternalError("Failed to initialize new hit buffer");
      goto error;
    }

    UpdateChecksum();
  } else {
    header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError(
          "Failed to re-initialize existing hit buffer");
      goto error;
    }

    // Check integrity.
    if (!header_->check_magic()) {
      status = absl_ports::InternalError("Lite index header magic mismatch");
      goto error;
    }
    Crc32 crc = ComputeChecksum();
    if (crc.Get() != header_->lite_index_crc()) {
      status = absl_ports::DataLossError(
          IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
                                        crc.Get(), header_->lite_index_crc()));
      goto error;
    }
  }

  ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
  return status;

error:
  header_ = nullptr;
  header_mmap_.Unmap();
  lexicon_.Close();
  hit_buffer_crc_ = 0;
  hit_buffer_.Reset();
  hit_buffer_fd_.reset();
  if (status.ok()) {
    return absl_ports::InternalError(
        "Error handling code ran but status was ok");
  }
  return status;
}

Crc32 LiteIndex::ComputeChecksum() {
  IcingTimer timer;

  // Update crcs.
  uint32_t dependent_crcs[2];
  hit_buffer_.UpdateCrc();
  dependent_crcs[0] = hit_buffer_crc_;
  dependent_crcs[1] = lexicon_.UpdateCrc();

  // Compute the master crc.

  // Header crc, excluding the actual crc field.
  Crc32 all_crc(header_->CalculateHeaderCrc());
  all_crc.Append(std::string_view(
      reinterpret_cast<const char*>(dependent_crcs), sizeof(dependent_crcs)));
  ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000
                << "ms";

  return all_crc;
}

libtextclassifier3::Status LiteIndex::Reset() {
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  // TODO(b/140436942): When these components have been changed to return
  // errors they should be propagated from here.
  lexicon_.Clear();
  hit_buffer_.Clear();
  header_->Reset();
  UpdateChecksum();

  ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
  return libtextclassifier3::Status::OK;
}

void LiteIndex::Warm() {
  absl_ports::shared_lock l(&mutex_);
  hit_buffer_.Warm();
  lexicon_.Warm();
}

libtextclassifier3::Status LiteIndex::PersistToDisk() {
  absl_ports::unique_lock l(&mutex_);
  bool success = true;
  if (!lexicon_.Sync()) {
    ICING_VLOG(1) << "Failed to sync the lexicon.";
    success = false;
  }
  hit_buffer_.Sync();
  UpdateChecksum();
  header_mmap_.Sync();

  return (success) ? libtextclassifier3::Status::OK
                   : absl_ports::InternalError(
                         "Unable to sync lite index components.");
}

void LiteIndex::UpdateChecksum() {
  header_->set_lite_index_crc(ComputeChecksum().Get());
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
    const std::string& term, TermMatchType::Code term_match_type,
    NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  uint32_t tvi;
  libtextclassifier3::Status status =
      lexicon_.Insert(term.c_str(), "", &tvi, false);
  if (!status.ok()) {
    ICING_LOG(DBG) << "Unable to add term " << term << " to lexicon!\n"
                   << status.error_message();
    return status;
  }
  ICING_RETURN_IF_ERROR(UpdateTermPropertiesImpl(
      tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
  return tvi;
}

libtextclassifier3::Status LiteIndex::UpdateTermProperties(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  return UpdateTermPropertiesImpl(tvi, hasPrefixHits, namespace_id);
}

libtextclassifier3::Status LiteIndex::UpdateTermPropertiesImpl(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  if (hasPrefixHits &&
      !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create prefix property!");
  }

  if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create namespace property!");
  }

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
  absl_ports::unique_lock l(&mutex_);
  if (is_full()) {
    return absl_ports::ResourceExhaustedError("Hit buffer is full!");
  }
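
  // Hits are added in increasing order of document id (FetchHits relies on
  // this to scan the unsorted tail in reverse), so a new hit is simply
  // appended at the current end of the buffer.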
  TermIdHitPair term_id_hit_pair(term_id, hit);
  uint32_t cur_size = header_->cur_size();
  TermIdHitPair::Value* valp =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
  if (valp == nullptr) {
    return absl_ports::ResourceExhaustedError(
        "Allocating more space in hit buffer failed!");
  }
  *valp = term_id_hit_pair.value();
  header_->set_cur_size(cur_size + 1);

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
    const std::string& term) const {
  absl_ports::shared_lock l(&mutex_);
  char dummy;
  uint32_t tvi;
  if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
    return absl_ports::NotFoundError(
        absl_ports::StrCat("Could not find ", term, " in the lexicon."));
  }
  return tvi;
}
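
// Scores a single fetched hit and, when the output vectors are non-null,
// appends it as a new DocHitInfo or updates the section info of the current
// one. last_document_id and is_last_document_desired carry iteration state
// across successive calls for the same term.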
void LiteIndex::ScoreAndAppendFetchedHit(
    const Hit& hit, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    DocumentId& last_document_id, bool& is_last_document_desired,
    int& total_score_out, std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) const {
  // Check sections.
  if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
    return;
  }
  // Check prefix section only.
  if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
    return;
  }
  // Check whether this Hit is desired.
  // TODO(b/230553264) Move common logic into helper function once we support
  // score term by prefix_hit in lite_index.
  DocumentId document_id = hit.document_id();
  bool is_new_document = document_id != last_document_id;
  if (is_new_document) {
    last_document_id = document_id;
    is_last_document_desired =
        suggestion_result_checker == nullptr ||
        suggestion_result_checker->BelongsToTargetResults(document_id,
                                                          hit.section_id());
  }
  if (!is_last_document_desired) {
    // The document is removed, expired, or otherwise not desired.
    return;
  }

  // Score the hit by the given strategy.
  switch (score_by) {
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
      total_score_out = 1;
      break;
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT:
      if (is_new_document) {
        ++total_score_out;
      }
      break;
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY:
      if (hit.has_term_frequency()) {
        total_score_out += hit.term_frequency();
      } else {
        ++total_score_out;
      }
      break;
  }

  // Append the hit to the output vector, or update the section info of the
  // last DocHitInfo if the hit belongs to the same document.
  if (is_new_document && hits_out != nullptr) {
    hits_out->push_back(DocHitInfo(document_id));
    if (term_frequency_out != nullptr) {
      term_frequency_out->push_back(Hit::TermFrequencyArray());
    }
  }
  if (hits_out != nullptr) {
    hits_out->back().UpdateSection(hit.section_id());
    if (term_frequency_out != nullptr) {
      term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
    }
  }
}
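
// Retrieves all hits for term_id and returns the accumulated score under
// score_by. hits_out and term_frequency_out may be nullptr when only the
// score is needed, as in ScoreHits() below.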
int LiteIndex::FetchHits(
    uint32_t term_id, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
  bool need_sort_at_querying = false;
  {
    absl_ports::shared_lock l(&mutex_);

    // We sort here when:
    // 1. We don't enable sorting at indexing time (i.e. we sort at querying
    //    time), and there is an unsorted tail portion. OR
    // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold,
    //    regardless of whether or not hit_buffer_sort_at_indexing is enabled.
    //    This is more of a sanity check. We should not really be encountering
    //    this case.
    need_sort_at_querying = NeedSortAtQuerying();
  }
  if (need_sort_at_querying) {
    absl_ports::unique_lock l(&mutex_);
    IcingTimer timer;

    // Transition from shared_lock to unique_lock is safe here because it
    // doesn't hurt to sort again if sorting was done already by another thread
    // after need_sort_at_querying is evaluated.
    // We check need_sort_at_querying to improve query concurrency as threads
    // can avoid acquiring the unique lock if no sorting is needed.
    SortHitsImpl();

    if (options_.hit_buffer_sort_at_indexing) {
      // This is the second case for sort. Log, as this should be a very rare
      // occasion.
      ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when "
                            "hit_buffer_sort_at_indexing is enabled. Sort and "
                            "merge HitBuffer in "
                         << timer.Elapsed() * 1000 << " ms.";
    }
  }

  // This downgrade from a unique_lock to a shared_lock is safe because we're
  // searching for the term in the searchable (sorted) section of the HitBuffer
  // only in Seek().
  // Any operations that might execute in between the transition of downgrading
  // the lock here are guaranteed not to alter the searchable section (or the
  // LiteIndex) due to a global lock in IcingSearchEngine.
  absl_ports::shared_lock l(&mutex_);

  // Search in the HitBuffer array for Hits with the corresponding term_id.
  // Hits are added in increasing order of doc ids, so hits that get appended
  // later have larger docIds. This means that:
  // 1. Hits in the unsorted tail will have larger docIds than hits in the
  //    sorted portion.
  // 2. Hits at the end of the unsorted tail will have larger docIds than hits
  //    in the front of the tail.
  // We want to retrieve hits in descending order of docIds. Therefore we
  // should search by doing:
  // 1. Linear search first in reverse iteration order over the unsorted tail
  //    portion.
  // 2. Followed by binary search on the sorted portion.
  const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>();

  DocumentId last_document_id = kInvalidDocumentId;
  // Record whether the last document belongs to the given namespaces.
  bool is_last_document_desired = false;
  int total_score = 0;

  // Linear search over unsorted tail in reverse iteration order.
  // This should only be performed when hit_buffer_sort_at_indexing is enabled.
  // When disabled, the entire HitBuffer should be sorted already and only
  // binary search is needed.
  if (options_.hit_buffer_sort_at_indexing) {
    uint32_t unsorted_length = GetHitBufferUnsortedSizeImpl();
    for (uint32_t i = 1; i <= unsorted_length; ++i) {
      TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i];
      if (term_id_hit_pair.term_id() == term_id) {
        // We've found a matched hit.
        const Hit& matched_hit = term_id_hit_pair.hit();
        // Score the hit and add to total_score. Also add the hit and its term
        // frequency info to hits_out and term_frequency_out if the two vectors
        // are non-null.
        ScoreAndAppendFetchedHit(matched_hit, section_id_mask,
                                 only_from_prefix_sections, score_by,
                                 suggestion_result_checker, last_document_id,
                                 is_last_document_desired, total_score,
                                 hits_out, term_frequency_out);
      }
    }
  }

  // Do binary search over the sorted section and repeat the above steps.
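  // The search target pairs term_id with the hit that sorts first for that
  // term (kMaxDocumentIdSortValue), so lower_bound lands on the first entry
  // for term_id in the sorted section.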
  TermIdHitPair target_term_id_hit_pair(
      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kNoEnabledFlags,
                   Hit::kDefaultTermFrequency));
  for (const TermIdHitPair* ptr = std::lower_bound(
           array, array + header_->searchable_end(), target_term_id_hit_pair);
       ptr < array + header_->searchable_end(); ++ptr) {
    if (ptr->term_id() != term_id) {
      // We've processed all matches. Stop iterating further.
      break;
    }

    const Hit& matched_hit = ptr->hit();
    // Score the hit and add to total_score. Also add the hit and its term
    // frequency info to hits_out and term_frequency_out if the two vectors are
    // non-null.
    ScoreAndAppendFetchedHit(
        matched_hit, section_id_mask, only_from_prefix_sections, score_by,
        suggestion_result_checker, last_document_id, is_last_document_desired,
        total_score, hits_out, term_frequency_out);
  }
  return total_score;
}

libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
    uint32_t term_id,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  return FetchHits(term_id, kSectionIdMaskAll,
                   /*only_from_prefix_sections=*/false, score_by,
                   suggestion_result_checker,
                   /*hits_out=*/nullptr);
}
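
// The index is considered full when the hit buffer has reached capacity or
// when more than kTrieFullFraction of the lexicon is in use.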
bool LiteIndex::is_full() const {
  return (header_->cur_size() == options_.hit_buffer_size ||
          lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}

std::string LiteIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) {
  absl_ports::unique_lock l(&mutex_);
  std::string res;
  std::string lexicon_info;
  lexicon_.GetDebugInfo(verbosity, &lexicon_info);
  IcingStringUtil::SStringAppendF(
      &res, 0,
      "curr_size: %u\n"
      "hit_buffer_size: %u\n"
      "last_added_document_id: %u\n"
      "searchable_end: %u\n"
      "index_crc: %u\n"
      "\n"
      "lite_lexicon_info:\n%s\n",
      header_->cur_size(), options_.hit_buffer_size,
      header_->last_added_docid(), header_->searchable_end(),
      ComputeChecksum().Get(), lexicon_info.c_str());
  return res;
}

libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
  IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
  if (storage_info.lite_index_hit_buffer_size() == -1 ||
      storage_info.lite_index_lexicon_size() == -1) {
    return absl_ports::AbortedError(
        "Failed to get size of LiteIndex's members.");
  }
  // On initialization, we grow the file to a padded size first, so this size
  // won't count towards the size taken up by elements.
  size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
  return storage_info.lite_index_hit_buffer_size() - header_padded_size +
         storage_info.lite_index_lexicon_size();
}

IndexStorageInfoProto LiteIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  absl_ports::shared_lock l(&mutex_);
  int64_t header_and_hit_buffer_file_size =
      filesystem_->GetFileSize(hit_buffer_fd_.get());
  storage_info.set_lite_index_hit_buffer_size(
      IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
  int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
  if (lexicon_disk_usage != Filesystem::kBadFileSize) {
    storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
  } else {
    storage_info.set_lite_index_lexicon_size(-1);
  }
  return storage_info;
}
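
// Hit buffer layout (see also the comments in FetchHits):
//
//   [ sorted, searchable section | unsorted tail ]
//   0                  searchable_end()       cur_size()
//
// SortHitsImpl sorts the unsorted tail and merges it into the sorted section,
// after which searchable_end() == cur_size().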
void LiteIndex::SortHitsImpl() {
  // Make the hit buffer searchable by sorting it.
  uint32_t need_sort_len = GetHitBufferUnsortedSizeImpl();
  if (need_sort_len <= 0) {
    return;
  }
  IcingTimer timer;

  TermIdHitPair::Value* array_start =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
  TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
  std::sort(sort_start, array_start + header_->cur_size());

  // Now merge with the previous region. Since the previous region is already
  // sorted and deduplicated, optimize the merge by skipping everything less
  // than the new region's smallest value.
  if (header_->searchable_end() > 0) {
    std::inplace_merge(array_start, array_start + header_->searchable_end(),
                       array_start + header_->cur_size());
  }
  ICING_VLOG(2) << "Lite index sort and merge " << need_sort_len << " into "
                << header_->searchable_end() << " in "
                << timer.Elapsed() * 1000 << "ms";

  // Now the entire array is sorted.
  header_->set_searchable_end(header_->cur_size());

  // Update crc in-line.
  UpdateChecksum();
}
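
// Optimize compacts the hit buffer after document GC: each hit's document id
// is remapped through document_id_old_to_new, hits whose documents were
// deleted (mapped to kInvalidDocumentId) are dropped, and terms left with no
// hits are deleted from the lexicon.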
libtextclassifier3::Status LiteIndex::Optimize(
    const std::vector<DocumentId>& document_id_old_to_new,
    const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
  absl_ports::unique_lock l(&mutex_);
  header_->set_last_added_docid(new_last_added_document_id);
  if (header_->cur_size() == 0) {
    return libtextclassifier3::Status::OK;
  }
  // Sort the hits so that hits with the same term id will be grouped together,
  // which helps later to determine which terms will be unused after
  // compaction.
  SortHitsImpl();
  uint32_t new_size = 0;
  uint32_t curr_term_id = 0;
  uint32_t curr_tvi = 0;
  std::unordered_set<uint32_t> tvi_to_delete;
  for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
    TermIdHitPair term_id_hit_pair(
        hit_buffer_.array_cast<TermIdHitPair>()[idx]);
    if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
      curr_term_id = term_id_hit_pair.term_id();
      ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
                             term_id_codec->DecodeTermInfo(curr_term_id));
      curr_tvi = term_info.tvi;
      // Mark the property of the current term as not having hits in prefix
      // section. The property will be set below if there are any valid hits
      // from a prefix section.
      lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
      // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
      // below if there are any valid hits pointing to that term id.
      tvi_to_delete.insert(curr_tvi);
    }
    DocumentId new_document_id =
        document_id_old_to_new[term_id_hit_pair.hit().document_id()];
    if (new_document_id == kInvalidDocumentId) {
      continue;
    }
    if (term_id_hit_pair.hit().is_in_prefix_section()) {
      lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
    }
    tvi_to_delete.erase(curr_tvi);
    TermIdHitPair new_term_id_hit_pair(
        term_id_hit_pair.term_id(),
        Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
    // Rewriting the hit_buffer in place.
    // new_size is weakly less than idx so we are okay to overwrite the entry
    // at new_size, and valp should never be nullptr since it is within the
    // already allocated region of hit_buffer_.
    TermIdHitPair::Value* valp =
        hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
    *valp = new_term_id_hit_pair.value();
  }
  header_->set_cur_size(new_size);
  header_->set_searchable_end(new_size);

  // Delete unused terms.
  std::unordered_set<std::string> terms_to_delete;
  for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
       term_iter.IsValid(); term_iter.Advance()) {
    if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
      terms_to_delete.insert(term_iter.GetKey());
    }
  }
  for (const std::string& term : terms_to_delete) {
    // Mark "term" as deleted. This won't actually free space in the lexicon.
    // It will simply make it impossible to Find "term" in subsequent calls
    // (which saves an unnecessary search through the hit buffer). This is
    // acceptable because the free space will eventually be reclaimed the next
    // time that the lite index is merged with the main index.
    if (!lexicon_.Delete(term)) {
      return absl_ports::InternalError(
          "Could not delete invalid terms in lite lexicon during compaction.");
    }
  }
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing