1 // Copyright (C) 2024 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_EMBED_EMBEDDING_INDEX_H_ 16 #define ICING_INDEX_EMBED_EMBEDDING_INDEX_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/absl_ports/canonical_errors.h" 28 #include "icing/file/file-backed-vector.h" 29 #include "icing/file/filesystem.h" 30 #include "icing/file/memory-mapped-file.h" 31 #include "icing/file/persistent-storage.h" 32 #include "icing/file/posting_list/flash-index-storage.h" 33 #include "icing/file/posting_list/posting-list-identifier.h" 34 #include "icing/index/embed/embedding-hit.h" 35 #include "icing/index/embed/posting-list-embedding-hit-accessor.h" 36 #include "icing/index/embed/posting-list-embedding-hit-serializer.h" 37 #include "icing/index/hit/hit.h" 38 #include "icing/store/document-id.h" 39 #include "icing/store/key-mapper.h" 40 #include "icing/util/crc32.h" 41 42 namespace icing { 43 namespace lib { 44 45 class EmbeddingIndex : public PersistentStorage { 46 public: 47 struct Info { 48 static constexpr int32_t kMagic = 0x61e7cbf1; 49 50 int32_t magic; 51 DocumentId last_added_document_id; 52 bool is_empty; 53 54 static constexpr int kPaddingSize = 1000; 55 // Padding exists just to reserve space for additional values. 56 uint8_t padding_[kPaddingSize]; 57 ComputeChecksumInfo58 Crc32 ComputeChecksum() const { 59 return Crc32( 60 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 61 } 62 }; 63 static_assert(sizeof(Info) == 1012, ""); 64 65 // Metadata file layout: <Crcs><Info> 66 static constexpr int32_t kCrcsMetadataBufferOffset = 0; 67 static constexpr int32_t kInfoMetadataBufferOffset = 68 static_cast<int32_t>(sizeof(Crcs)); 69 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 70 static_assert(kMetadataFileSize == 1024, ""); 71 72 static constexpr WorkingPathType kWorkingPathType = 73 WorkingPathType::kDirectory; 74 75 EmbeddingIndex(const EmbeddingIndex&) = delete; 76 EmbeddingIndex& operator=(const EmbeddingIndex&) = delete; 77 78 // Creates a new EmbeddingIndex instance to index embeddings. 79 // 80 // Returns: 81 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 82 // checksum. 83 // - INTERNAL_ERROR on I/O errors. 84 // - Any error from MemoryMappedFile, FlashIndexStorage, 85 // DynamicTrieKeyMapper, or FileBackedVector. 86 static libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>> Create( 87 const Filesystem* filesystem, std::string working_path); 88 Discard(const Filesystem & filesystem,const std::string & working_path)89 static libtextclassifier3::Status Discard(const Filesystem& filesystem, 90 const std::string& working_path) { 91 return PersistentStorage::Discard(filesystem, working_path, 92 kWorkingPathType); 93 } 94 95 libtextclassifier3::Status Clear(); 96 97 // Buffer an embedding pending to be added to the index. This is required 98 // since EmbeddingHits added in posting lists must be decreasing, which means 99 // that section ids and location indexes for a single document must be added 100 // decreasingly. 101 // 102 // Returns: 103 // - OK on success 104 // - INVALID_ARGUMENT error if the dimension is 0. 105 // - INTERNAL_ERROR on I/O error 106 libtextclassifier3::Status BufferEmbedding( 107 const BasicHit& basic_hit, const PropertyProto::VectorProto& vector); 108 109 // Commit the embedding hits in the buffer to the index. 110 // 111 // Returns: 112 // - OK on success 113 // - INTERNAL_ERROR on I/O error 114 // - Any error from posting lists 115 libtextclassifier3::Status CommitBufferToIndex(); 116 117 // Returns a PostingListEmbeddingHitAccessor for all embedding hits that match 118 // with the provided dimension and signature. 119 // 120 // Returns: 121 // - a PostingListEmbeddingHitAccessor instance on success. 122 // - INVALID_ARGUMENT error if the dimension is 0. 123 // - NOT_FOUND error if there is no matching embedding hit. 124 // - Any error from posting lists. 125 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>> 126 GetAccessor(uint32_t dimension, std::string_view model_signature) const; 127 128 // Returns a PostingListEmbeddingHitAccessor for all embedding hits that match 129 // with the provided vector's dimension and signature. 130 // 131 // Returns: 132 // - a PostingListEmbeddingHitAccessor instance on success. 133 // - INVALID_ARGUMENT error if the dimension is 0. 134 // - NOT_FOUND error if there is no matching embedding hit. 135 // - Any error from posting lists. 136 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>> GetAccessorForVector(const PropertyProto::VectorProto & vector)137 GetAccessorForVector(const PropertyProto::VectorProto& vector) const { 138 return GetAccessor(vector.values_size(), vector.model_signature()); 139 } 140 141 // Reduces internal file sizes by reclaiming space of deleted documents. 142 // new_last_added_document_id will be used to update the last added document 143 // id in the lite index. 144 // 145 // Returns: 146 // - OK on success 147 // - INTERNAL_ERROR on IO error, this indicates that the index may be in an 148 // invalid state and should be cleared. 149 libtextclassifier3::Status Optimize( 150 const std::vector<DocumentId>& document_id_old_to_new, 151 DocumentId new_last_added_document_id); 152 153 // Returns a pointer to the embedding vector for the given hit. 154 // 155 // Returns: 156 // - a pointer to the embedding vector on success. 157 // - OUT_OF_RANGE error if the referred vector is out of range based on the 158 // location and dimension. GetEmbeddingVector(const EmbeddingHit & hit,uint32_t dimension)159 libtextclassifier3::StatusOr<const float*> GetEmbeddingVector( 160 const EmbeddingHit& hit, uint32_t dimension) const { 161 if (static_cast<int64_t>(hit.location()) + dimension > 162 GetTotalVectorSize()) { 163 return absl_ports::OutOfRangeError( 164 "Got an embedding hit that refers to a vector out of range."); 165 } 166 return embedding_vectors_->array() + hit.location(); 167 } 168 GetRawEmbeddingData()169 libtextclassifier3::StatusOr<const float*> GetRawEmbeddingData() const { 170 if (is_empty()) { 171 return absl_ports::NotFoundError("EmbeddingIndex is empty"); 172 } 173 return embedding_vectors_->array(); 174 } 175 GetTotalVectorSize()176 int32_t GetTotalVectorSize() const { 177 if (is_empty()) { 178 return 0; 179 } 180 return embedding_vectors_->num_elements(); 181 } 182 last_added_document_id()183 DocumentId last_added_document_id() const { 184 return info().last_added_document_id; 185 } 186 set_last_added_document_id(DocumentId document_id)187 void set_last_added_document_id(DocumentId document_id) { 188 Info& info_ref = info(); 189 if (info_ref.last_added_document_id == kInvalidDocumentId || 190 document_id > info_ref.last_added_document_id) { 191 info_ref.last_added_document_id = document_id; 192 } 193 } 194 is_empty()195 bool is_empty() const { return info().is_empty; } 196 197 private: EmbeddingIndex(const Filesystem & filesystem,std::string working_path)198 explicit EmbeddingIndex(const Filesystem& filesystem, 199 std::string working_path) 200 : PersistentStorage(filesystem, std::move(working_path), 201 kWorkingPathType) {} 202 203 // Creates the storage data if the index is not empty. This will initialize 204 // flash_index_storage_, embedding_posting_list_mapper_, embedding_vectors_. 205 // 206 // Returns: 207 // - OK on success 208 // - Any error from FlashIndexStorage, DynamicTrieKeyMapper, or 209 // FileBackedVector. 210 libtextclassifier3::Status CreateStorageDataIfNonEmpty(); 211 212 // Marks the index's header to indicate that the index is non-empty. 213 // 214 // If the index is already marked as non-empty, this is a no-op. Otherwise, 215 // CreateStorageDataIfNonEmpty will be called to create the storage data. 216 // 217 // Returns: 218 // - OK on success 219 // - Any error when calling CreateStorageDataIfNonEmpty. 220 libtextclassifier3::Status MarkIndexNonEmpty(); 221 222 libtextclassifier3::Status Initialize(); 223 224 // Transfers embedding data and hits from the current index to new_index. 225 // 226 // Returns: 227 // - OK on success 228 // - FAILED_PRECONDITION_ERROR if the current index is empty. 229 // - INTERNAL_ERROR on I/O error. This could potentially leave the storages 230 // in an invalid state and the caller should handle it properly (e.g. 231 // discard and rebuild) 232 libtextclassifier3::Status TransferIndex( 233 const std::vector<DocumentId>& document_id_old_to_new, 234 EmbeddingIndex* new_index) const; 235 236 // Flushes contents of metadata file. 237 // 238 // Returns: 239 // - OK on success 240 // - INTERNAL_ERROR on I/O error 241 libtextclassifier3::Status PersistMetadataToDisk(bool force) override; 242 243 // Flushes contents of all storages to underlying files. 244 // 245 // Returns: 246 // - OK on success 247 // - INTERNAL_ERROR on I/O error 248 libtextclassifier3::Status PersistStoragesToDisk(bool force) override; 249 250 // Computes and returns Info checksum. 251 // 252 // Returns: 253 // - Crc of the Info on success 254 libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; 255 256 // Computes and returns all storages checksum. 257 // 258 // Returns: 259 // - Crc of all storages on success 260 // - INTERNAL_ERROR if any data inconsistency 261 libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( 262 bool force) override; 263 crcs()264 Crcs& crcs() override { 265 return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 266 kCrcsMetadataBufferOffset); 267 } 268 crcs()269 const Crcs& crcs() const override { 270 return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 271 kCrcsMetadataBufferOffset); 272 } 273 info()274 Info& info() { 275 return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 276 kInfoMetadataBufferOffset); 277 } 278 info()279 const Info& info() const { 280 return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 281 kInfoMetadataBufferOffset); 282 } 283 284 // In memory data: 285 // Pending embedding hits with their embedding keys used for 286 // embedding_posting_list_mapper_. 287 std::vector<std::pair<std::string, EmbeddingHit>> pending_embedding_hits_; 288 289 // Metadata 290 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 291 292 // Posting list storage 293 std::unique_ptr<PostingListEmbeddingHitSerializer> 294 posting_list_hit_serializer_ = 295 std::make_unique<PostingListEmbeddingHitSerializer>(); 296 297 // null if the index is empty. 298 std::unique_ptr<FlashIndexStorage> flash_index_storage_; 299 300 // The mapper from embedding keys to the corresponding posting list identifier 301 // that stores all embedding hits with the same key. 302 // 303 // The key for an embedding hit is a one-to-one encoded string of the ordered 304 // pair (dimension, model_signature) corresponding to the embedding. 305 // 306 // null if the index is empty. 307 std::unique_ptr<KeyMapper<PostingListIdentifier>> 308 embedding_posting_list_mapper_; 309 310 // A single FileBackedVector that holds all embedding vectors. 311 // 312 // null if the index is empty. 313 std::unique_ptr<FileBackedVector<float>> embedding_vectors_; 314 }; 315 316 } // namespace lib 317 } // namespace icing 318 319 #endif // ICING_INDEX_EMBED_EMBEDDING_INDEX_H_ 320