1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/embed/embedding-index.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/file/posting_list/flash-index-storage.h"
36 #include "icing/file/posting_list/posting-list-identifier.h"
37 #include "icing/index/embed/embedding-hit.h"
38 #include "icing/index/embed/embedding-scorer.h"
39 #include "icing/index/embed/posting-list-embedding-hit-accessor.h"
40 #include "icing/index/embed/quantizer.h"
41 #include "icing/index/hit/hit.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/document-filter-data.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/document-store.h"
46 #include "icing/store/dynamic-trie-key-mapper.h"
47 #include "icing/store/key-mapper.h"
48 #include "icing/util/clock.h"
49 #include "icing/util/crc32.h"
50 #include "icing/util/encode-util.h"
51 #include "icing/util/logging.h"
52 #include "icing/util/status-macros.h"
53
54 namespace icing {
55 namespace lib {
56
57 namespace {
58
59 // The maximum size of the embedding hit list mmapper.
60 // We use 64MiB for 32-bit platforms and 128MiB for 64-bit platforms.
61 #ifdef ICING_ARCH_BIT_64
62 constexpr uint32_t kEmbeddingHitListMapperMaxSize = 128 * 1024 * 1024;
63 #else
64 constexpr uint32_t kEmbeddingHitListMapperMaxSize = 64 * 1024 * 1024;
65 #endif
66
67 // The maximum length returned by encode_util::EncodeIntToCString is 5 for
68 // uint32_t.
69 constexpr uint32_t kEncodedDimensionLength = 5;
70
GetMetadataFilePath(std::string_view working_path)71 std::string GetMetadataFilePath(std::string_view working_path) {
72 return absl_ports::StrCat(working_path, "/metadata");
73 }
74
GetFlashIndexStorageFilePath(std::string_view working_path)75 std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
76 return absl_ports::StrCat(working_path, "/flash_index_storage");
77 }
78
GetEmbeddingHitListMapperPath(std::string_view working_path)79 std::string GetEmbeddingHitListMapperPath(std::string_view working_path) {
80 return absl_ports::StrCat(working_path, "/embedding_hit_list_mapper");
81 }
82
GetEmbeddingVectorsFilePath(std::string_view working_path)83 std::string GetEmbeddingVectorsFilePath(std::string_view working_path) {
84 return absl_ports::StrCat(working_path, "/embedding_vectors");
85 }
86
GetQuantizedEmbeddingVectorsFilePath(std::string_view working_path)87 std::string GetQuantizedEmbeddingVectorsFilePath(
88 std::string_view working_path) {
89 return absl_ports::StrCat(working_path, "/quantized_embedding_vectors");
90 }
91
92 // An injective function that maps the ordered pair (dimension, model_signature)
93 // to a string, which is used to form a key for embedding_posting_list_mapper_.
GetPostingListKey(uint32_t dimension,std::string_view model_signature)94 std::string GetPostingListKey(uint32_t dimension,
95 std::string_view model_signature) {
96 std::string encoded_dimension_str =
97 encode_util::EncodeIntToCString(dimension);
98 // Make encoded_dimension_str to fixed kEncodedDimensionLength bytes.
99 while (encoded_dimension_str.size() < kEncodedDimensionLength) {
100 // C string cannot contain 0 bytes, so we append it using 1, just like what
101 // we do in encode_util::EncodeIntToCString.
102 //
103 // The reason that this works is because DecodeIntToString decodes a byte
104 // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded
105 // dimension that is less than 5 bytes, it means that the dimension contains
106 // unencoded leading 0x00. So here we're explicitly encoding those bytes as
107 // 0x01.
108 encoded_dimension_str.push_back(1);
109 }
110 return absl_ports::StrCat(encoded_dimension_str, model_signature);
111 }
112
GetPostingListKey(const PropertyProto::VectorProto & vector)113 std::string GetPostingListKey(const PropertyProto::VectorProto& vector) {
114 return GetPostingListKey(vector.values().size(), vector.model_signature());
115 }
116
CreateQuantizer(const PropertyProto::VectorProto & vector)117 libtextclassifier3::StatusOr<Quantizer> CreateQuantizer(
118 const PropertyProto::VectorProto& vector) {
119 if (vector.values().empty()) {
120 return absl_ports::InvalidArgumentError("Vector dimension is 0");
121 }
122 auto minmax_pair =
123 std::minmax_element(vector.values().begin(), vector.values().end());
124 return Quantizer::Create(*minmax_pair.first, *minmax_pair.second);
125 }
126
127 } // namespace
128
129 libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>>
Create(const Filesystem * filesystem,std::string working_path,const Clock * clock,const FeatureFlags * feature_flags)130 EmbeddingIndex::Create(const Filesystem* filesystem, std::string working_path,
131 const Clock* clock, const FeatureFlags* feature_flags) {
132 ICING_RETURN_ERROR_IF_NULL(filesystem);
133 ICING_RETURN_ERROR_IF_NULL(clock);
134
135 std::unique_ptr<EmbeddingIndex> index =
136 std::unique_ptr<EmbeddingIndex>(new EmbeddingIndex(
137 *filesystem, std::move(working_path), clock, feature_flags));
138 ICING_RETURN_IF_ERROR(index->Initialize());
139 return index;
140 }
141
CreateStorageDataIfNonEmpty()142 libtextclassifier3::Status EmbeddingIndex::CreateStorageDataIfNonEmpty() {
143 if (is_empty()) {
144 return libtextclassifier3::Status::OK;
145 }
146
147 ICING_ASSIGN_OR_RETURN(FlashIndexStorage flash_index_storage,
148 FlashIndexStorage::Create(
149 GetFlashIndexStorageFilePath(working_path_),
150 &filesystem_, posting_list_hit_serializer_.get()));
151 flash_index_storage_ =
152 std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
153
154 ICING_ASSIGN_OR_RETURN(
155 embedding_posting_list_mapper_,
156 DynamicTrieKeyMapper<PostingListIdentifier>::Create(
157 filesystem_, GetEmbeddingHitListMapperPath(working_path_),
158 kEmbeddingHitListMapperMaxSize));
159
160 ICING_ASSIGN_OR_RETURN(
161 embedding_vectors_,
162 FileBackedVector<float>::Create(
163 filesystem_, GetEmbeddingVectorsFilePath(working_path_),
164 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
165
166 ICING_ASSIGN_OR_RETURN(
167 quantized_embedding_vectors_,
168 FileBackedVector<char>::Create(
169 filesystem_, GetQuantizedEmbeddingVectorsFilePath(working_path_),
170 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
171
172 return libtextclassifier3::Status::OK;
173 }
174
MarkIndexNonEmpty()175 libtextclassifier3::Status EmbeddingIndex::MarkIndexNonEmpty() {
176 if (!is_empty()) {
177 return libtextclassifier3::Status::OK;
178 }
179 info().is_empty = false;
180 return CreateStorageDataIfNonEmpty();
181 }
182
Initialize()183 libtextclassifier3::Status EmbeddingIndex::Initialize() {
184 bool is_new = false;
185 if (!filesystem_.FileExists(GetMetadataFilePath(working_path_).c_str())) {
186 // Create working directory.
187 if (!filesystem_.CreateDirectoryRecursively(working_path_.c_str())) {
188 return absl_ports::InternalError(
189 absl_ports::StrCat("Failed to create directory: ", working_path_));
190 }
191 is_new = true;
192 }
193
194 ICING_ASSIGN_OR_RETURN(
195 MemoryMappedFile metadata_mmapped_file,
196 MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_),
197 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
198 /*max_file_size=*/kMetadataFileSize,
199 /*pre_mapping_file_offset=*/0,
200 /*pre_mapping_mmap_size=*/kMetadataFileSize));
201 metadata_mmapped_file_ =
202 std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
203
204 if (is_new) {
205 ICING_RETURN_IF_ERROR(metadata_mmapped_file_->GrowAndRemapIfNecessary(
206 /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
207 info().magic = Info::kMagic;
208 info().last_added_document_id = kInvalidDocumentId;
209 info().is_empty = true;
210 memset(Info().padding_, 0, Info::kPaddingSize);
211 ICING_RETURN_IF_ERROR(InitializeNewStorage());
212 } else {
213 if (metadata_mmapped_file_->available_size() != kMetadataFileSize) {
214 return absl_ports::FailedPreconditionError(
215 "Incorrect metadata file size");
216 }
217 if (info().magic != Info::kMagic) {
218 return absl_ports::FailedPreconditionError("Incorrect magic value");
219 }
220 ICING_RETURN_IF_ERROR(CreateStorageDataIfNonEmpty());
221 ICING_RETURN_IF_ERROR(InitializeExistingStorage());
222 }
223 return libtextclassifier3::Status::OK;
224 }
225
Clear()226 libtextclassifier3::Status EmbeddingIndex::Clear() {
227 pending_embedding_hits_.clear();
228 metadata_mmapped_file_.reset();
229 flash_index_storage_.reset();
230 embedding_posting_list_mapper_.reset();
231 embedding_vectors_.reset();
232 quantized_embedding_vectors_.reset();
233 if (filesystem_.DirectoryExists(working_path_.c_str())) {
234 ICING_RETURN_IF_ERROR(Discard(filesystem_, working_path_));
235 }
236 is_initialized_ = false;
237 return Initialize();
238 }
239
240 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
GetAccessor(uint32_t dimension,std::string_view model_signature) const241 EmbeddingIndex::GetAccessor(uint32_t dimension,
242 std::string_view model_signature) const {
243 if (dimension == 0) {
244 return absl_ports::InvalidArgumentError("Dimension is 0");
245 }
246 if (is_empty()) {
247 return absl_ports::NotFoundError("EmbeddingIndex is empty");
248 }
249
250 std::string key = GetPostingListKey(dimension, model_signature);
251 ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
252 embedding_posting_list_mapper_->Get(key));
253 return PostingListEmbeddingHitAccessor::CreateFromExisting(
254 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
255 posting_list_id);
256 }
257
AppendEmbeddingVector(const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)258 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::AppendEmbeddingVector(
259 const PropertyProto::VectorProto& vector,
260 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
261 uint32_t dimension = vector.values().size();
262 uint32_t location;
263 if (!feature_flags_->enable_embedding_quantization() ||
264 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
265 location = embedding_vectors_->num_elements();
266 ICING_ASSIGN_OR_RETURN(
267 FileBackedVector<float>::MutableArrayView mutable_arr,
268 embedding_vectors_->Allocate(dimension));
269 mutable_arr.SetArray(/*idx=*/0, vector.values().data(), dimension);
270 } else {
271 ICING_ASSIGN_OR_RETURN(Quantizer quantizer, CreateQuantizer(vector));
272 // Quantize the vector
273 std::vector<uint8_t> quantized_values;
274 quantized_values.reserve(vector.values().size());
275 for (float value : vector.values()) {
276 quantized_values.push_back(quantizer.Quantize(value));
277 }
278
279 // Store the quantizer and the quantized vector
280 location = quantized_embedding_vectors_->num_elements();
281 ICING_ASSIGN_OR_RETURN(
282 FileBackedVector<char>::MutableArrayView mutable_arr,
283 quantized_embedding_vectors_->Allocate(sizeof(Quantizer) + dimension));
284 mutable_arr.SetArray(/*idx=*/0, reinterpret_cast<char*>(&quantizer),
285 sizeof(Quantizer));
286 mutable_arr.SetArray(/*idx=*/sizeof(Quantizer),
287 reinterpret_cast<char*>(quantized_values.data()),
288 dimension);
289 }
290 return location;
291 }
292
BufferEmbedding(const BasicHit & basic_hit,const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)293 libtextclassifier3::Status EmbeddingIndex::BufferEmbedding(
294 const BasicHit& basic_hit, const PropertyProto::VectorProto& vector,
295 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
296 if (vector.values().empty()) {
297 return absl_ports::InvalidArgumentError("Vector dimension is 0");
298 }
299 ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
300
301 std::string key = GetPostingListKey(vector);
302 ICING_ASSIGN_OR_RETURN(uint32_t location,
303 AppendEmbeddingVector(vector, quantization_type));
304
305 // Buffer the embedding hit.
306 pending_embedding_hits_.push_back(
307 {std::move(key), EmbeddingHit(basic_hit, location)});
308 return libtextclassifier3::Status::OK;
309 }
310
CommitBufferToIndex()311 libtextclassifier3::Status EmbeddingIndex::CommitBufferToIndex() {
312 if (pending_embedding_hits_.empty()) {
313 return libtextclassifier3::Status::OK;
314 }
315 ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
316
317 std::sort(pending_embedding_hits_.begin(), pending_embedding_hits_.end());
318 auto iter_curr_key = pending_embedding_hits_.rbegin();
319 while (iter_curr_key != pending_embedding_hits_.rend()) {
320 // In order to batch putting embedding hits with the same key (dimension,
321 // model_signature) to the same posting list, we find the range
322 // [iter_curr_key, iter_next_key) of embedding hits with the same key and
323 // put them into their corresponding posting list together.
324 auto iter_next_key = iter_curr_key;
325 while (iter_next_key != pending_embedding_hits_.rend() &&
326 iter_next_key->first == iter_curr_key->first) {
327 iter_next_key++;
328 }
329
330 const std::string& key = iter_curr_key->first;
331 libtextclassifier3::StatusOr<PostingListIdentifier> posting_list_id_or =
332 embedding_posting_list_mapper_->Get(key);
333 std::unique_ptr<PostingListEmbeddingHitAccessor> pl_accessor;
334 if (posting_list_id_or.ok()) {
335 // Existing posting list.
336 ICING_ASSIGN_OR_RETURN(
337 pl_accessor,
338 PostingListEmbeddingHitAccessor::CreateFromExisting(
339 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
340 posting_list_id_or.ValueOrDie()));
341 } else if (absl_ports::IsNotFound(posting_list_id_or.status())) {
342 // New posting list.
343 ICING_ASSIGN_OR_RETURN(
344 pl_accessor,
345 PostingListEmbeddingHitAccessor::Create(
346 flash_index_storage_.get(), posting_list_hit_serializer_.get()));
347 } else {
348 // Errors
349 return std::move(posting_list_id_or).status();
350 }
351
352 // Adding the embedding hits.
353 for (auto iter = iter_curr_key; iter != iter_next_key; ++iter) {
354 ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(iter->second));
355 }
356
357 // Finalize this posting list and add the posting list id in
358 // embedding_posting_list_mapper_.
359 PostingListEmbeddingHitAccessor::FinalizeResult result =
360 std::move(*pl_accessor).Finalize();
361 if (!result.id.is_valid()) {
362 return absl_ports::InternalError("Failed to finalize posting list");
363 }
364 ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->Put(key, result.id));
365
366 // Advance to the next key.
367 iter_curr_key = iter_next_key;
368 }
369 pending_embedding_hits_.clear();
370 return libtextclassifier3::Status::OK;
371 }
372
TransferEmbeddingVector(const EmbeddingHit & old_hit,uint32_t dimension,EmbeddingIndexingConfig::QuantizationType::Code quantization_type,EmbeddingIndex * new_index) const373 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::TransferEmbeddingVector(
374 const EmbeddingHit& old_hit, uint32_t dimension,
375 EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
376 EmbeddingIndex* new_index) const {
377 uint32_t new_location;
378 if (!feature_flags_->enable_embedding_quantization() ||
379 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
380 ICING_ASSIGN_OR_RETURN(const float* old_vector,
381 GetEmbeddingVector(old_hit, dimension));
382 new_location = new_index->embedding_vectors_->num_elements();
383
384 // Copy the embedding vector of the hit to the new index.
385 ICING_ASSIGN_OR_RETURN(
386 FileBackedVector<float>::MutableArrayView mutable_arr,
387 new_index->embedding_vectors_->Allocate(dimension));
388 mutable_arr.SetArray(/*idx=*/0, old_vector, dimension);
389 } else {
390 ICING_ASSIGN_OR_RETURN(const char* old_data,
391 GetQuantizedEmbeddingVector(old_hit, dimension));
392 new_location = new_index->quantized_embedding_vectors_->num_elements();
393
394 // Copy the embedding vector of the hit to the new index.
395 ICING_ASSIGN_OR_RETURN(FileBackedVector<char>::MutableArrayView mutable_arr,
396 new_index->quantized_embedding_vectors_->Allocate(
397 sizeof(Quantizer) + dimension));
398 mutable_arr.SetArray(/*idx=*/0, old_data, sizeof(Quantizer) + dimension);
399 }
400 return new_location;
401 }
402
TransferIndex(const DocumentStore & document_store,const SchemaStore & schema_store,const std::vector<DocumentId> & document_id_old_to_new,EmbeddingIndex * new_index) const403 libtextclassifier3::Status EmbeddingIndex::TransferIndex(
404 const DocumentStore& document_store, const SchemaStore& schema_store,
405 const std::vector<DocumentId>& document_id_old_to_new,
406 EmbeddingIndex* new_index) const {
407 if (is_empty()) {
408 return absl_ports::FailedPreconditionError("EmbeddingIndex is empty");
409 }
410
411 const int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
412 std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> itr =
413 embedding_posting_list_mapper_->GetIterator();
414 while (itr->Advance()) {
415 std::string_view key = itr->GetKey();
416 // This should never happen unless there is an inconsistency, or the index
417 // is corrupted.
418 if (key.size() < kEncodedDimensionLength) {
419 return absl_ports::InternalError(
420 "Got invalid key from embedding posting list mapper.");
421 }
422 uint32_t dimension = encode_util::DecodeIntFromCString(
423 std::string_view(key.begin(), kEncodedDimensionLength));
424
425 // Transfer hits
426 std::vector<EmbeddingHit> new_hits;
427 ICING_ASSIGN_OR_RETURN(
428 std::unique_ptr<PostingListEmbeddingHitAccessor> old_pl_accessor,
429 PostingListEmbeddingHitAccessor::CreateFromExisting(
430 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
431 /*existing_posting_list_id=*/itr->GetValue()));
432 DocumentId last_new_document_id = kInvalidDocumentId;
433 SchemaTypeId schema_type_id = kInvalidSchemaTypeId;
434 while (true) {
435 ICING_ASSIGN_OR_RETURN(std::vector<EmbeddingHit> batch,
436 old_pl_accessor->GetNextHitsBatch());
437 if (batch.empty()) {
438 break;
439 }
440 for (EmbeddingHit& old_hit : batch) {
441 // Safety checks to add robustness to the codebase, so to make sure
442 // that we never access invalid memory, in case that hit from the
443 // posting list is corrupted.
444 if (old_hit.basic_hit().document_id() < 0 ||
445 old_hit.basic_hit().document_id() >=
446 document_id_old_to_new.size()) {
447 return absl_ports::InternalError(
448 "Embedding hit document id is out of bound. The provided map is "
449 "too small, or the index may have been corrupted.");
450 }
451
452 // Construct transferred hit and add the embedding vector to the new
453 // index.
454 DocumentId new_document_id =
455 document_id_old_to_new[old_hit.basic_hit().document_id()];
456 if (new_document_id == kInvalidDocumentId) {
457 continue;
458 }
459 if (new_document_id != last_new_document_id) {
460 schema_type_id =
461 document_store.GetSchemaTypeId(new_document_id, current_time_ms);
462 }
463 last_new_document_id = new_document_id;
464 if (schema_type_id == kInvalidSchemaTypeId) {
465 // This should not happen, since document store is optimized first,
466 // so that new_document_id here should be alive.
467 continue;
468 }
469 ICING_ASSIGN_OR_RETURN(
470 EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
471 schema_store.GetQuantizationType(schema_type_id,
472 old_hit.basic_hit().section_id()));
473 ICING_RETURN_IF_ERROR(new_index->MarkIndexNonEmpty());
474
475 ICING_ASSIGN_OR_RETURN(
476 uint32_t new_location,
477 TransferEmbeddingVector(old_hit, dimension, quantization_type,
478 new_index));
479 new_hits.push_back(EmbeddingHit(
480 BasicHit(old_hit.basic_hit().section_id(), new_document_id),
481 new_location));
482 }
483 }
484 // No hit needs to be added to the new index.
485 if (new_hits.empty()) {
486 continue;
487 }
488 // Add transferred hits to the new index.
489 ICING_ASSIGN_OR_RETURN(
490 std::unique_ptr<PostingListEmbeddingHitAccessor> hit_accum,
491 PostingListEmbeddingHitAccessor::Create(
492 new_index->flash_index_storage_.get(),
493 new_index->posting_list_hit_serializer_.get()));
494 for (auto new_hit_itr = new_hits.rbegin(); new_hit_itr != new_hits.rend();
495 ++new_hit_itr) {
496 ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*new_hit_itr));
497 }
498 PostingListEmbeddingHitAccessor::FinalizeResult result =
499 std::move(*hit_accum).Finalize();
500 if (!result.id.is_valid()) {
501 return absl_ports::InternalError("Failed to finalize posting list");
502 }
503 ICING_RETURN_IF_ERROR(
504 new_index->embedding_posting_list_mapper_->Put(key, result.id));
505 }
506 return libtextclassifier3::Status::OK;
507 }
508
Optimize(const DocumentStore * document_store,const SchemaStore * schema_store,const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)509 libtextclassifier3::Status EmbeddingIndex::Optimize(
510 const DocumentStore* document_store, const SchemaStore* schema_store,
511 const std::vector<DocumentId>& document_id_old_to_new,
512 DocumentId new_last_added_document_id) {
513 ICING_RETURN_ERROR_IF_NULL(document_store);
514 ICING_RETURN_ERROR_IF_NULL(schema_store);
515 if (is_empty()) {
516 info().last_added_document_id = new_last_added_document_id;
517 return libtextclassifier3::Status::OK;
518 }
519
520 // This is just for completeness, but this should never be necessary, since we
521 // should never have pending hits at the time when Optimize is run.
522 ICING_RETURN_IF_ERROR(CommitBufferToIndex());
523
524 std::string temporary_index_working_path = working_path_ + "_temp";
525 if (!filesystem_.DeleteDirectoryRecursively(
526 temporary_index_working_path.c_str())) {
527 ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_working_path;
528 return absl_ports::InternalError(
529 "Unable to delete temp directory to prepare to build new index.");
530 }
531
532 DestructibleDirectory temporary_index_dir(
533 &filesystem_, std::move(temporary_index_working_path));
534 if (!temporary_index_dir.is_valid()) {
535 return absl_ports::InternalError(
536 "Unable to create temp directory to build new index.");
537 }
538
539 {
540 ICING_ASSIGN_OR_RETURN(
541 std::unique_ptr<EmbeddingIndex> new_index,
542 EmbeddingIndex::Create(&filesystem_, temporary_index_dir.dir(), &clock_,
543 feature_flags_));
544 ICING_RETURN_IF_ERROR(TransferIndex(*document_store, *schema_store,
545 document_id_old_to_new,
546 new_index.get()));
547 new_index->set_last_added_document_id(new_last_added_document_id);
548 ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
549 }
550
551 // Destruct current storage instances to safely swap directories.
552 metadata_mmapped_file_.reset();
553 flash_index_storage_.reset();
554 embedding_posting_list_mapper_.reset();
555 embedding_vectors_.reset();
556 quantized_embedding_vectors_.reset();
557
558 if (!filesystem_.SwapFiles(temporary_index_dir.dir().c_str(),
559 working_path_.c_str())) {
560 return absl_ports::InternalError(
561 "Unable to apply new index due to failed swap!");
562 }
563
564 // Reinitialize the index.
565 is_initialized_ = false;
566 return Initialize();
567 }
568
ScoreEmbeddingHit(const EmbeddingScorer & scorer,const PropertyProto::VectorProto & query,const EmbeddingHit & hit,EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const569 libtextclassifier3::StatusOr<float> EmbeddingIndex::ScoreEmbeddingHit(
570 const EmbeddingScorer& scorer, const PropertyProto::VectorProto& query,
571 const EmbeddingHit& hit,
572 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const {
573 int dimension = query.values().size();
574 float semantic_score;
575 if (!feature_flags_->enable_embedding_quantization() ||
576 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
577 ICING_ASSIGN_OR_RETURN(const float* vector,
578 GetEmbeddingVector(hit, dimension));
579 semantic_score = scorer.Score(dimension,
580 /*v1=*/query.values().data(),
581 /*v2=*/vector);
582 } else {
583 ICING_ASSIGN_OR_RETURN(const char* data,
584 GetQuantizedEmbeddingVector(hit, dimension));
585 Quantizer quantizer(data);
586 const uint8_t* quantized_vector =
587 reinterpret_cast<const uint8_t*>(data + sizeof(Quantizer));
588 semantic_score = scorer.Score(dimension,
589 /*v1=*/query.values().data(),
590 /*v2=*/quantized_vector, quantizer);
591 }
592 return semantic_score;
593 }
594
PersistMetadataToDisk()595 libtextclassifier3::Status EmbeddingIndex::PersistMetadataToDisk() {
596 return metadata_mmapped_file_->PersistToDisk();
597 }
598
PersistStoragesToDisk()599 libtextclassifier3::Status EmbeddingIndex::PersistStoragesToDisk() {
600 if (is_empty()) {
601 return libtextclassifier3::Status::OK;
602 }
603 if (!flash_index_storage_->PersistToDisk()) {
604 return absl_ports::InternalError("Fail to persist flash index to disk");
605 }
606 ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->PersistToDisk());
607 ICING_RETURN_IF_ERROR(embedding_vectors_->PersistToDisk());
608 ICING_RETURN_IF_ERROR(quantized_embedding_vectors_->PersistToDisk());
609 return libtextclassifier3::Status::OK;
610 }
611
UpdateStoragesChecksum()612 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::UpdateStoragesChecksum() {
613 if (is_empty()) {
614 return Crc32(0);
615 }
616 ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
617 embedding_posting_list_mapper_->UpdateChecksum());
618 ICING_ASSIGN_OR_RETURN(Crc32 embedding_vectors_crc,
619 embedding_vectors_->UpdateChecksum());
620 ICING_ASSIGN_OR_RETURN(Crc32 quantized_embedding_vectors_crc,
621 quantized_embedding_vectors_->UpdateChecksum());
622 return Crc32(embedding_posting_list_mapper_crc.Get() ^
623 embedding_vectors_crc.Get() ^
624 quantized_embedding_vectors_crc.Get());
625 }
626
GetStoragesChecksum() const627 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::GetStoragesChecksum()
628 const {
629 if (is_empty()) {
630 return Crc32(0);
631 }
632 ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
633 embedding_posting_list_mapper_->GetChecksum());
634 Crc32 embedding_vectors_crc = embedding_vectors_->GetChecksum();
635 Crc32 quantized_embedding_vectors_crc =
636 quantized_embedding_vectors_->GetChecksum();
637 return Crc32(embedding_posting_list_mapper_crc.Get() ^
638 embedding_vectors_crc.Get() ^
639 quantized_embedding_vectors_crc.Get());
640 }
641
642 } // namespace lib
643 } // namespace icing
644