• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/embed/embedding-index.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/file/posting_list/flash-index-storage.h"
36 #include "icing/file/posting_list/posting-list-identifier.h"
37 #include "icing/index/embed/embedding-hit.h"
38 #include "icing/index/embed/embedding-scorer.h"
39 #include "icing/index/embed/posting-list-embedding-hit-accessor.h"
40 #include "icing/index/embed/quantizer.h"
41 #include "icing/index/hit/hit.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/document-filter-data.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/document-store.h"
46 #include "icing/store/dynamic-trie-key-mapper.h"
47 #include "icing/store/key-mapper.h"
48 #include "icing/util/clock.h"
49 #include "icing/util/crc32.h"
50 #include "icing/util/encode-util.h"
51 #include "icing/util/logging.h"
52 #include "icing/util/status-macros.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 namespace {
58 
59 // The maximum size of the embedding hit list mmapper.
60 // We use 64MiB for 32-bit platforms and 128MiB for 64-bit platforms.
61 #ifdef ICING_ARCH_BIT_64
62   constexpr uint32_t kEmbeddingHitListMapperMaxSize = 128 * 1024 * 1024;
63 #else
64   constexpr uint32_t kEmbeddingHitListMapperMaxSize = 64 * 1024 * 1024;
65 #endif
66 
67 // The maximum length returned by encode_util::EncodeIntToCString is 5 for
68 // uint32_t.
69 constexpr uint32_t kEncodedDimensionLength = 5;
70 
GetMetadataFilePath(std::string_view working_path)71 std::string GetMetadataFilePath(std::string_view working_path) {
72   return absl_ports::StrCat(working_path, "/metadata");
73 }
74 
GetFlashIndexStorageFilePath(std::string_view working_path)75 std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
76   return absl_ports::StrCat(working_path, "/flash_index_storage");
77 }
78 
GetEmbeddingHitListMapperPath(std::string_view working_path)79 std::string GetEmbeddingHitListMapperPath(std::string_view working_path) {
80   return absl_ports::StrCat(working_path, "/embedding_hit_list_mapper");
81 }
82 
GetEmbeddingVectorsFilePath(std::string_view working_path)83 std::string GetEmbeddingVectorsFilePath(std::string_view working_path) {
84   return absl_ports::StrCat(working_path, "/embedding_vectors");
85 }
86 
GetQuantizedEmbeddingVectorsFilePath(std::string_view working_path)87 std::string GetQuantizedEmbeddingVectorsFilePath(
88     std::string_view working_path) {
89   return absl_ports::StrCat(working_path, "/quantized_embedding_vectors");
90 }
91 
92 // An injective function that maps the ordered pair (dimension, model_signature)
93 // to a string, which is used to form a key for embedding_posting_list_mapper_.
GetPostingListKey(uint32_t dimension,std::string_view model_signature)94 std::string GetPostingListKey(uint32_t dimension,
95                               std::string_view model_signature) {
96   std::string encoded_dimension_str =
97       encode_util::EncodeIntToCString(dimension);
98   // Make encoded_dimension_str to fixed kEncodedDimensionLength bytes.
99   while (encoded_dimension_str.size() < kEncodedDimensionLength) {
100     // C string cannot contain 0 bytes, so we append it using 1, just like what
101     // we do in encode_util::EncodeIntToCString.
102     //
103     // The reason that this works is because DecodeIntToString decodes a byte
104     // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded
105     // dimension that is less than 5 bytes, it means that the dimension contains
106     // unencoded leading 0x00. So here we're explicitly encoding those bytes as
107     // 0x01.
108     encoded_dimension_str.push_back(1);
109   }
110   return absl_ports::StrCat(encoded_dimension_str, model_signature);
111 }
112 
GetPostingListKey(const PropertyProto::VectorProto & vector)113 std::string GetPostingListKey(const PropertyProto::VectorProto& vector) {
114   return GetPostingListKey(vector.values().size(), vector.model_signature());
115 }
116 
CreateQuantizer(const PropertyProto::VectorProto & vector)117 libtextclassifier3::StatusOr<Quantizer> CreateQuantizer(
118     const PropertyProto::VectorProto& vector) {
119   if (vector.values().empty()) {
120     return absl_ports::InvalidArgumentError("Vector dimension is 0");
121   }
122   auto minmax_pair =
123       std::minmax_element(vector.values().begin(), vector.values().end());
124   return Quantizer::Create(*minmax_pair.first, *minmax_pair.second);
125 }
126 
127 }  // namespace
128 
129 libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>>
Create(const Filesystem * filesystem,std::string working_path,const Clock * clock,const FeatureFlags * feature_flags)130 EmbeddingIndex::Create(const Filesystem* filesystem, std::string working_path,
131                        const Clock* clock, const FeatureFlags* feature_flags) {
132   ICING_RETURN_ERROR_IF_NULL(filesystem);
133   ICING_RETURN_ERROR_IF_NULL(clock);
134 
135   std::unique_ptr<EmbeddingIndex> index =
136       std::unique_ptr<EmbeddingIndex>(new EmbeddingIndex(
137           *filesystem, std::move(working_path), clock, feature_flags));
138   ICING_RETURN_IF_ERROR(index->Initialize());
139   return index;
140 }
141 
CreateStorageDataIfNonEmpty()142 libtextclassifier3::Status EmbeddingIndex::CreateStorageDataIfNonEmpty() {
143   if (is_empty()) {
144     return libtextclassifier3::Status::OK;
145   }
146 
147   ICING_ASSIGN_OR_RETURN(FlashIndexStorage flash_index_storage,
148                          FlashIndexStorage::Create(
149                              GetFlashIndexStorageFilePath(working_path_),
150                              &filesystem_, posting_list_hit_serializer_.get()));
151   flash_index_storage_ =
152       std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
153 
154   ICING_ASSIGN_OR_RETURN(
155       embedding_posting_list_mapper_,
156       DynamicTrieKeyMapper<PostingListIdentifier>::Create(
157           filesystem_, GetEmbeddingHitListMapperPath(working_path_),
158           kEmbeddingHitListMapperMaxSize));
159 
160   ICING_ASSIGN_OR_RETURN(
161       embedding_vectors_,
162       FileBackedVector<float>::Create(
163           filesystem_, GetEmbeddingVectorsFilePath(working_path_),
164           MemoryMappedFile::READ_WRITE_AUTO_SYNC));
165 
166   ICING_ASSIGN_OR_RETURN(
167       quantized_embedding_vectors_,
168       FileBackedVector<char>::Create(
169           filesystem_, GetQuantizedEmbeddingVectorsFilePath(working_path_),
170           MemoryMappedFile::READ_WRITE_AUTO_SYNC));
171 
172   return libtextclassifier3::Status::OK;
173 }
174 
MarkIndexNonEmpty()175 libtextclassifier3::Status EmbeddingIndex::MarkIndexNonEmpty() {
176   if (!is_empty()) {
177     return libtextclassifier3::Status::OK;
178   }
179   info().is_empty = false;
180   return CreateStorageDataIfNonEmpty();
181 }
182 
Initialize()183 libtextclassifier3::Status EmbeddingIndex::Initialize() {
184   bool is_new = false;
185   if (!filesystem_.FileExists(GetMetadataFilePath(working_path_).c_str())) {
186     // Create working directory.
187     if (!filesystem_.CreateDirectoryRecursively(working_path_.c_str())) {
188       return absl_ports::InternalError(
189           absl_ports::StrCat("Failed to create directory: ", working_path_));
190     }
191     is_new = true;
192   }
193 
194   ICING_ASSIGN_OR_RETURN(
195       MemoryMappedFile metadata_mmapped_file,
196       MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_),
197                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
198                                /*max_file_size=*/kMetadataFileSize,
199                                /*pre_mapping_file_offset=*/0,
200                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
201   metadata_mmapped_file_ =
202       std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
203 
204   if (is_new) {
205     ICING_RETURN_IF_ERROR(metadata_mmapped_file_->GrowAndRemapIfNecessary(
206         /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
207     info().magic = Info::kMagic;
208     info().last_added_document_id = kInvalidDocumentId;
209     info().is_empty = true;
210     memset(Info().padding_, 0, Info::kPaddingSize);
211     ICING_RETURN_IF_ERROR(InitializeNewStorage());
212   } else {
213     if (metadata_mmapped_file_->available_size() != kMetadataFileSize) {
214       return absl_ports::FailedPreconditionError(
215           "Incorrect metadata file size");
216     }
217     if (info().magic != Info::kMagic) {
218       return absl_ports::FailedPreconditionError("Incorrect magic value");
219     }
220     ICING_RETURN_IF_ERROR(CreateStorageDataIfNonEmpty());
221     ICING_RETURN_IF_ERROR(InitializeExistingStorage());
222   }
223   return libtextclassifier3::Status::OK;
224 }
225 
Clear()226 libtextclassifier3::Status EmbeddingIndex::Clear() {
227   pending_embedding_hits_.clear();
228   metadata_mmapped_file_.reset();
229   flash_index_storage_.reset();
230   embedding_posting_list_mapper_.reset();
231   embedding_vectors_.reset();
232   quantized_embedding_vectors_.reset();
233   if (filesystem_.DirectoryExists(working_path_.c_str())) {
234     ICING_RETURN_IF_ERROR(Discard(filesystem_, working_path_));
235   }
236   is_initialized_ = false;
237   return Initialize();
238 }
239 
240 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
GetAccessor(uint32_t dimension,std::string_view model_signature) const241 EmbeddingIndex::GetAccessor(uint32_t dimension,
242                             std::string_view model_signature) const {
243   if (dimension == 0) {
244     return absl_ports::InvalidArgumentError("Dimension is 0");
245   }
246   if (is_empty()) {
247     return absl_ports::NotFoundError("EmbeddingIndex is empty");
248   }
249 
250   std::string key = GetPostingListKey(dimension, model_signature);
251   ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
252                          embedding_posting_list_mapper_->Get(key));
253   return PostingListEmbeddingHitAccessor::CreateFromExisting(
254       flash_index_storage_.get(), posting_list_hit_serializer_.get(),
255       posting_list_id);
256 }
257 
AppendEmbeddingVector(const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)258 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::AppendEmbeddingVector(
259     const PropertyProto::VectorProto& vector,
260     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
261   uint32_t dimension = vector.values().size();
262   uint32_t location;
263   if (!feature_flags_->enable_embedding_quantization() ||
264       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
265     location = embedding_vectors_->num_elements();
266     ICING_ASSIGN_OR_RETURN(
267         FileBackedVector<float>::MutableArrayView mutable_arr,
268         embedding_vectors_->Allocate(dimension));
269     mutable_arr.SetArray(/*idx=*/0, vector.values().data(), dimension);
270   } else {
271     ICING_ASSIGN_OR_RETURN(Quantizer quantizer, CreateQuantizer(vector));
272     // Quantize the vector
273     std::vector<uint8_t> quantized_values;
274     quantized_values.reserve(vector.values().size());
275     for (float value : vector.values()) {
276       quantized_values.push_back(quantizer.Quantize(value));
277     }
278 
279     // Store the quantizer and the quantized vector
280     location = quantized_embedding_vectors_->num_elements();
281     ICING_ASSIGN_OR_RETURN(
282         FileBackedVector<char>::MutableArrayView mutable_arr,
283         quantized_embedding_vectors_->Allocate(sizeof(Quantizer) + dimension));
284     mutable_arr.SetArray(/*idx=*/0, reinterpret_cast<char*>(&quantizer),
285                          sizeof(Quantizer));
286     mutable_arr.SetArray(/*idx=*/sizeof(Quantizer),
287                          reinterpret_cast<char*>(quantized_values.data()),
288                          dimension);
289   }
290   return location;
291 }
292 
BufferEmbedding(const BasicHit & basic_hit,const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)293 libtextclassifier3::Status EmbeddingIndex::BufferEmbedding(
294     const BasicHit& basic_hit, const PropertyProto::VectorProto& vector,
295     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
296   if (vector.values().empty()) {
297     return absl_ports::InvalidArgumentError("Vector dimension is 0");
298   }
299   ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
300 
301   std::string key = GetPostingListKey(vector);
302   ICING_ASSIGN_OR_RETURN(uint32_t location,
303                          AppendEmbeddingVector(vector, quantization_type));
304 
305   // Buffer the embedding hit.
306   pending_embedding_hits_.push_back(
307       {std::move(key), EmbeddingHit(basic_hit, location)});
308   return libtextclassifier3::Status::OK;
309 }
310 
CommitBufferToIndex()311 libtextclassifier3::Status EmbeddingIndex::CommitBufferToIndex() {
312   if (pending_embedding_hits_.empty()) {
313     return libtextclassifier3::Status::OK;
314   }
315   ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
316 
317   std::sort(pending_embedding_hits_.begin(), pending_embedding_hits_.end());
318   auto iter_curr_key = pending_embedding_hits_.rbegin();
319   while (iter_curr_key != pending_embedding_hits_.rend()) {
320     // In order to batch putting embedding hits with the same key (dimension,
321     // model_signature) to the same posting list, we find the range
322     // [iter_curr_key, iter_next_key) of embedding hits with the same key and
323     // put them into their corresponding posting list together.
324     auto iter_next_key = iter_curr_key;
325     while (iter_next_key != pending_embedding_hits_.rend() &&
326            iter_next_key->first == iter_curr_key->first) {
327       iter_next_key++;
328     }
329 
330     const std::string& key = iter_curr_key->first;
331     libtextclassifier3::StatusOr<PostingListIdentifier> posting_list_id_or =
332         embedding_posting_list_mapper_->Get(key);
333     std::unique_ptr<PostingListEmbeddingHitAccessor> pl_accessor;
334     if (posting_list_id_or.ok()) {
335       // Existing posting list.
336       ICING_ASSIGN_OR_RETURN(
337           pl_accessor,
338           PostingListEmbeddingHitAccessor::CreateFromExisting(
339               flash_index_storage_.get(), posting_list_hit_serializer_.get(),
340               posting_list_id_or.ValueOrDie()));
341     } else if (absl_ports::IsNotFound(posting_list_id_or.status())) {
342       // New posting list.
343       ICING_ASSIGN_OR_RETURN(
344           pl_accessor,
345           PostingListEmbeddingHitAccessor::Create(
346               flash_index_storage_.get(), posting_list_hit_serializer_.get()));
347     } else {
348       // Errors
349       return std::move(posting_list_id_or).status();
350     }
351 
352     // Adding the embedding hits.
353     for (auto iter = iter_curr_key; iter != iter_next_key; ++iter) {
354       ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(iter->second));
355     }
356 
357     // Finalize this posting list and add the posting list id in
358     // embedding_posting_list_mapper_.
359     PostingListEmbeddingHitAccessor::FinalizeResult result =
360         std::move(*pl_accessor).Finalize();
361     if (!result.id.is_valid()) {
362       return absl_ports::InternalError("Failed to finalize posting list");
363     }
364     ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->Put(key, result.id));
365 
366     // Advance to the next key.
367     iter_curr_key = iter_next_key;
368   }
369   pending_embedding_hits_.clear();
370   return libtextclassifier3::Status::OK;
371 }
372 
TransferEmbeddingVector(const EmbeddingHit & old_hit,uint32_t dimension,EmbeddingIndexingConfig::QuantizationType::Code quantization_type,EmbeddingIndex * new_index) const373 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::TransferEmbeddingVector(
374     const EmbeddingHit& old_hit, uint32_t dimension,
375     EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
376     EmbeddingIndex* new_index) const {
377   uint32_t new_location;
378   if (!feature_flags_->enable_embedding_quantization() ||
379       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
380     ICING_ASSIGN_OR_RETURN(const float* old_vector,
381                            GetEmbeddingVector(old_hit, dimension));
382     new_location = new_index->embedding_vectors_->num_elements();
383 
384     // Copy the embedding vector of the hit to the new index.
385     ICING_ASSIGN_OR_RETURN(
386         FileBackedVector<float>::MutableArrayView mutable_arr,
387         new_index->embedding_vectors_->Allocate(dimension));
388     mutable_arr.SetArray(/*idx=*/0, old_vector, dimension);
389   } else {
390     ICING_ASSIGN_OR_RETURN(const char* old_data,
391                            GetQuantizedEmbeddingVector(old_hit, dimension));
392     new_location = new_index->quantized_embedding_vectors_->num_elements();
393 
394     // Copy the embedding vector of the hit to the new index.
395     ICING_ASSIGN_OR_RETURN(FileBackedVector<char>::MutableArrayView mutable_arr,
396                            new_index->quantized_embedding_vectors_->Allocate(
397                                sizeof(Quantizer) + dimension));
398     mutable_arr.SetArray(/*idx=*/0, old_data, sizeof(Quantizer) + dimension);
399   }
400   return new_location;
401 }
402 
TransferIndex(const DocumentStore & document_store,const SchemaStore & schema_store,const std::vector<DocumentId> & document_id_old_to_new,EmbeddingIndex * new_index) const403 libtextclassifier3::Status EmbeddingIndex::TransferIndex(
404     const DocumentStore& document_store, const SchemaStore& schema_store,
405     const std::vector<DocumentId>& document_id_old_to_new,
406     EmbeddingIndex* new_index) const {
407   if (is_empty()) {
408     return absl_ports::FailedPreconditionError("EmbeddingIndex is empty");
409   }
410 
411   const int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
412   std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> itr =
413       embedding_posting_list_mapper_->GetIterator();
414   while (itr->Advance()) {
415     std::string_view key = itr->GetKey();
416     // This should never happen unless there is an inconsistency, or the index
417     // is corrupted.
418     if (key.size() < kEncodedDimensionLength) {
419       return absl_ports::InternalError(
420           "Got invalid key from embedding posting list mapper.");
421     }
422     uint32_t dimension = encode_util::DecodeIntFromCString(
423         std::string_view(key.begin(), kEncodedDimensionLength));
424 
425     // Transfer hits
426     std::vector<EmbeddingHit> new_hits;
427     ICING_ASSIGN_OR_RETURN(
428         std::unique_ptr<PostingListEmbeddingHitAccessor> old_pl_accessor,
429         PostingListEmbeddingHitAccessor::CreateFromExisting(
430             flash_index_storage_.get(), posting_list_hit_serializer_.get(),
431             /*existing_posting_list_id=*/itr->GetValue()));
432     DocumentId last_new_document_id = kInvalidDocumentId;
433     SchemaTypeId schema_type_id = kInvalidSchemaTypeId;
434     while (true) {
435       ICING_ASSIGN_OR_RETURN(std::vector<EmbeddingHit> batch,
436                              old_pl_accessor->GetNextHitsBatch());
437       if (batch.empty()) {
438         break;
439       }
440       for (EmbeddingHit& old_hit : batch) {
441         // Safety checks to add robustness to the codebase, so to make sure
442         // that we never access invalid memory, in case that hit from the
443         // posting list is corrupted.
444         if (old_hit.basic_hit().document_id() < 0 ||
445             old_hit.basic_hit().document_id() >=
446                 document_id_old_to_new.size()) {
447           return absl_ports::InternalError(
448               "Embedding hit document id is out of bound. The provided map is "
449               "too small, or the index may have been corrupted.");
450         }
451 
452         // Construct transferred hit and add the embedding vector to the new
453         // index.
454         DocumentId new_document_id =
455             document_id_old_to_new[old_hit.basic_hit().document_id()];
456         if (new_document_id == kInvalidDocumentId) {
457           continue;
458         }
459         if (new_document_id != last_new_document_id) {
460           schema_type_id =
461               document_store.GetSchemaTypeId(new_document_id, current_time_ms);
462         }
463         last_new_document_id = new_document_id;
464         if (schema_type_id == kInvalidSchemaTypeId) {
465           // This should not happen, since document store is optimized first,
466           // so that new_document_id here should be alive.
467           continue;
468         }
469         ICING_ASSIGN_OR_RETURN(
470             EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
471             schema_store.GetQuantizationType(schema_type_id,
472                                              old_hit.basic_hit().section_id()));
473         ICING_RETURN_IF_ERROR(new_index->MarkIndexNonEmpty());
474 
475         ICING_ASSIGN_OR_RETURN(
476             uint32_t new_location,
477             TransferEmbeddingVector(old_hit, dimension, quantization_type,
478                                     new_index));
479         new_hits.push_back(EmbeddingHit(
480             BasicHit(old_hit.basic_hit().section_id(), new_document_id),
481             new_location));
482       }
483     }
484     // No hit needs to be added to the new index.
485     if (new_hits.empty()) {
486       continue;
487     }
488     // Add transferred hits to the new index.
489     ICING_ASSIGN_OR_RETURN(
490         std::unique_ptr<PostingListEmbeddingHitAccessor> hit_accum,
491         PostingListEmbeddingHitAccessor::Create(
492             new_index->flash_index_storage_.get(),
493             new_index->posting_list_hit_serializer_.get()));
494     for (auto new_hit_itr = new_hits.rbegin(); new_hit_itr != new_hits.rend();
495          ++new_hit_itr) {
496       ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*new_hit_itr));
497     }
498     PostingListEmbeddingHitAccessor::FinalizeResult result =
499         std::move(*hit_accum).Finalize();
500     if (!result.id.is_valid()) {
501       return absl_ports::InternalError("Failed to finalize posting list");
502     }
503     ICING_RETURN_IF_ERROR(
504         new_index->embedding_posting_list_mapper_->Put(key, result.id));
505   }
506   return libtextclassifier3::Status::OK;
507 }
508 
Optimize(const DocumentStore * document_store,const SchemaStore * schema_store,const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)509 libtextclassifier3::Status EmbeddingIndex::Optimize(
510     const DocumentStore* document_store, const SchemaStore* schema_store,
511     const std::vector<DocumentId>& document_id_old_to_new,
512     DocumentId new_last_added_document_id) {
513   ICING_RETURN_ERROR_IF_NULL(document_store);
514   ICING_RETURN_ERROR_IF_NULL(schema_store);
515   if (is_empty()) {
516     info().last_added_document_id = new_last_added_document_id;
517     return libtextclassifier3::Status::OK;
518   }
519 
520   // This is just for completeness, but this should never be necessary, since we
521   // should never have pending hits at the time when Optimize is run.
522   ICING_RETURN_IF_ERROR(CommitBufferToIndex());
523 
524   std::string temporary_index_working_path = working_path_ + "_temp";
525   if (!filesystem_.DeleteDirectoryRecursively(
526           temporary_index_working_path.c_str())) {
527     ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_working_path;
528     return absl_ports::InternalError(
529         "Unable to delete temp directory to prepare to build new index.");
530   }
531 
532   DestructibleDirectory temporary_index_dir(
533       &filesystem_, std::move(temporary_index_working_path));
534   if (!temporary_index_dir.is_valid()) {
535     return absl_ports::InternalError(
536         "Unable to create temp directory to build new index.");
537   }
538 
539   {
540     ICING_ASSIGN_OR_RETURN(
541         std::unique_ptr<EmbeddingIndex> new_index,
542         EmbeddingIndex::Create(&filesystem_, temporary_index_dir.dir(), &clock_,
543                                feature_flags_));
544     ICING_RETURN_IF_ERROR(TransferIndex(*document_store, *schema_store,
545                                         document_id_old_to_new,
546                                         new_index.get()));
547     new_index->set_last_added_document_id(new_last_added_document_id);
548     ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
549   }
550 
551   // Destruct current storage instances to safely swap directories.
552   metadata_mmapped_file_.reset();
553   flash_index_storage_.reset();
554   embedding_posting_list_mapper_.reset();
555   embedding_vectors_.reset();
556   quantized_embedding_vectors_.reset();
557 
558   if (!filesystem_.SwapFiles(temporary_index_dir.dir().c_str(),
559                              working_path_.c_str())) {
560     return absl_ports::InternalError(
561         "Unable to apply new index due to failed swap!");
562   }
563 
564   // Reinitialize the index.
565   is_initialized_ = false;
566   return Initialize();
567 }
568 
ScoreEmbeddingHit(const EmbeddingScorer & scorer,const PropertyProto::VectorProto & query,const EmbeddingHit & hit,EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const569 libtextclassifier3::StatusOr<float> EmbeddingIndex::ScoreEmbeddingHit(
570     const EmbeddingScorer& scorer, const PropertyProto::VectorProto& query,
571     const EmbeddingHit& hit,
572     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const {
573   int dimension = query.values().size();
574   float semantic_score;
575   if (!feature_flags_->enable_embedding_quantization() ||
576       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
577     ICING_ASSIGN_OR_RETURN(const float* vector,
578                            GetEmbeddingVector(hit, dimension));
579     semantic_score = scorer.Score(dimension,
580                                   /*v1=*/query.values().data(),
581                                   /*v2=*/vector);
582   } else {
583     ICING_ASSIGN_OR_RETURN(const char* data,
584                            GetQuantizedEmbeddingVector(hit, dimension));
585     Quantizer quantizer(data);
586     const uint8_t* quantized_vector =
587         reinterpret_cast<const uint8_t*>(data + sizeof(Quantizer));
588     semantic_score = scorer.Score(dimension,
589                                   /*v1=*/query.values().data(),
590                                   /*v2=*/quantized_vector, quantizer);
591   }
592   return semantic_score;
593 }
594 
PersistMetadataToDisk()595 libtextclassifier3::Status EmbeddingIndex::PersistMetadataToDisk() {
596   return metadata_mmapped_file_->PersistToDisk();
597 }
598 
PersistStoragesToDisk()599 libtextclassifier3::Status EmbeddingIndex::PersistStoragesToDisk() {
600   if (is_empty()) {
601     return libtextclassifier3::Status::OK;
602   }
603   if (!flash_index_storage_->PersistToDisk()) {
604     return absl_ports::InternalError("Fail to persist flash index to disk");
605   }
606   ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->PersistToDisk());
607   ICING_RETURN_IF_ERROR(embedding_vectors_->PersistToDisk());
608   ICING_RETURN_IF_ERROR(quantized_embedding_vectors_->PersistToDisk());
609   return libtextclassifier3::Status::OK;
610 }
611 
UpdateStoragesChecksum()612 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::UpdateStoragesChecksum() {
613   if (is_empty()) {
614     return Crc32(0);
615   }
616   ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
617                          embedding_posting_list_mapper_->UpdateChecksum());
618   ICING_ASSIGN_OR_RETURN(Crc32 embedding_vectors_crc,
619                          embedding_vectors_->UpdateChecksum());
620   ICING_ASSIGN_OR_RETURN(Crc32 quantized_embedding_vectors_crc,
621                          quantized_embedding_vectors_->UpdateChecksum());
622   return Crc32(embedding_posting_list_mapper_crc.Get() ^
623                embedding_vectors_crc.Get() ^
624                quantized_embedding_vectors_crc.Get());
625 }
626 
GetStoragesChecksum() const627 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::GetStoragesChecksum()
628     const {
629   if (is_empty()) {
630     return Crc32(0);
631   }
632   ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
633                          embedding_posting_list_mapper_->GetChecksum());
634   Crc32 embedding_vectors_crc = embedding_vectors_->GetChecksum();
635   Crc32 quantized_embedding_vectors_crc =
636       quantized_embedding_vectors_->GetChecksum();
637   return Crc32(embedding_posting_list_mapper_crc.Get() ^
638                embedding_vectors_crc.Get() ^
639                quantized_embedding_vectors_crc.Get());
640 }
641 
642 }  // namespace lib
643 }  // namespace icing
644