• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_EMBED_EMBEDDING_INDEX_H_
16 #define ICING_INDEX_EMBED_EMBEDDING_INDEX_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/file/file-backed-vector.h"
29 #include "icing/file/filesystem.h"
30 #include "icing/file/memory-mapped-file.h"
31 #include "icing/file/persistent-storage.h"
32 #include "icing/file/posting_list/flash-index-storage.h"
33 #include "icing/file/posting_list/posting-list-identifier.h"
34 #include "icing/index/embed/embedding-hit.h"
35 #include "icing/index/embed/posting-list-embedding-hit-accessor.h"
36 #include "icing/index/embed/posting-list-embedding-hit-serializer.h"
37 #include "icing/index/hit/hit.h"
38 #include "icing/store/document-id.h"
39 #include "icing/store/key-mapper.h"
40 #include "icing/util/crc32.h"
41 
42 namespace icing {
43 namespace lib {
44 
45 class EmbeddingIndex : public PersistentStorage {
46  public:
47   struct Info {
48     static constexpr int32_t kMagic = 0x61e7cbf1;
49 
50     int32_t magic;
51     DocumentId last_added_document_id;
52     bool is_empty;
53 
54     static constexpr int kPaddingSize = 1000;
55     // Padding exists just to reserve space for additional values.
56     uint8_t padding_[kPaddingSize];
57 
ComputeChecksumInfo58     Crc32 ComputeChecksum() const {
59       return Crc32(
60           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
61     }
62   };
63   static_assert(sizeof(Info) == 1012, "");
64 
65   // Metadata file layout: <Crcs><Info>
66   static constexpr int32_t kCrcsMetadataBufferOffset = 0;
67   static constexpr int32_t kInfoMetadataBufferOffset =
68       static_cast<int32_t>(sizeof(Crcs));
69   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
70   static_assert(kMetadataFileSize == 1024, "");
71 
72   static constexpr WorkingPathType kWorkingPathType =
73       WorkingPathType::kDirectory;
74 
75   EmbeddingIndex(const EmbeddingIndex&) = delete;
76   EmbeddingIndex& operator=(const EmbeddingIndex&) = delete;
77 
78   // Creates a new EmbeddingIndex instance to index embeddings.
79   //
80   // Returns:
81   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
82   //                               checksum.
83   //   - INTERNAL_ERROR on I/O errors.
84   //   - Any error from MemoryMappedFile, FlashIndexStorage,
85   //     DynamicTrieKeyMapper, or FileBackedVector.
86   static libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>> Create(
87       const Filesystem* filesystem, std::string working_path);
88 
Discard(const Filesystem & filesystem,const std::string & working_path)89   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
90                                             const std::string& working_path) {
91     return PersistentStorage::Discard(filesystem, working_path,
92                                       kWorkingPathType);
93   }
94 
95   libtextclassifier3::Status Clear();
96 
97   // Buffer an embedding pending to be added to the index. This is required
98   // since EmbeddingHits added in posting lists must be decreasing, which means
99   // that section ids and location indexes for a single document must be added
100   // decreasingly.
101   //
102   // Returns:
103   //   - OK on success
104   //   - INVALID_ARGUMENT error if the dimension is 0.
105   //   - INTERNAL_ERROR on I/O error
106   libtextclassifier3::Status BufferEmbedding(
107       const BasicHit& basic_hit, const PropertyProto::VectorProto& vector);
108 
109   // Commit the embedding hits in the buffer to the index.
110   //
111   // Returns:
112   //   - OK on success
113   //   - INTERNAL_ERROR on I/O error
114   //   - Any error from posting lists
115   libtextclassifier3::Status CommitBufferToIndex();
116 
117   // Returns a PostingListEmbeddingHitAccessor for all embedding hits that match
118   // with the provided dimension and signature.
119   //
120   // Returns:
121   //   - a PostingListEmbeddingHitAccessor instance on success.
122   //   - INVALID_ARGUMENT error if the dimension is 0.
123   //   - NOT_FOUND error if there is no matching embedding hit.
124   //   - Any error from posting lists.
125   libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
126   GetAccessor(uint32_t dimension, std::string_view model_signature) const;
127 
128   // Returns a PostingListEmbeddingHitAccessor for all embedding hits that match
129   // with the provided vector's dimension and signature.
130   //
131   // Returns:
132   //   - a PostingListEmbeddingHitAccessor instance on success.
133   //   - INVALID_ARGUMENT error if the dimension is 0.
134   //   - NOT_FOUND error if there is no matching embedding hit.
135   //   - Any error from posting lists.
136   libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
GetAccessorForVector(const PropertyProto::VectorProto & vector)137   GetAccessorForVector(const PropertyProto::VectorProto& vector) const {
138     return GetAccessor(vector.values_size(), vector.model_signature());
139   }
140 
141   // Reduces internal file sizes by reclaiming space of deleted documents.
142   // new_last_added_document_id will be used to update the last added document
143   // id in the lite index.
144   //
145   // Returns:
146   //   - OK on success
147   //   - INTERNAL_ERROR on IO error, this indicates that the index may be in an
148   //     invalid state and should be cleared.
149   libtextclassifier3::Status Optimize(
150       const std::vector<DocumentId>& document_id_old_to_new,
151       DocumentId new_last_added_document_id);
152 
153   // Returns a pointer to the embedding vector for the given hit.
154   //
155   // Returns:
156   //   - a pointer to the embedding vector on success.
157   //   - OUT_OF_RANGE error if the referred vector is out of range based on the
158   //     location and dimension.
GetEmbeddingVector(const EmbeddingHit & hit,uint32_t dimension)159   libtextclassifier3::StatusOr<const float*> GetEmbeddingVector(
160       const EmbeddingHit& hit, uint32_t dimension) const {
161     if (static_cast<int64_t>(hit.location()) + dimension >
162         GetTotalVectorSize()) {
163       return absl_ports::OutOfRangeError(
164           "Got an embedding hit that refers to a vector out of range.");
165     }
166     return embedding_vectors_->array() + hit.location();
167   }
168 
GetRawEmbeddingData()169   libtextclassifier3::StatusOr<const float*> GetRawEmbeddingData() const {
170     if (is_empty()) {
171       return absl_ports::NotFoundError("EmbeddingIndex is empty");
172     }
173     return embedding_vectors_->array();
174   }
175 
GetTotalVectorSize()176   int32_t GetTotalVectorSize() const {
177     if (is_empty()) {
178       return 0;
179     }
180     return embedding_vectors_->num_elements();
181   }
182 
last_added_document_id()183   DocumentId last_added_document_id() const {
184     return info().last_added_document_id;
185   }
186 
set_last_added_document_id(DocumentId document_id)187   void set_last_added_document_id(DocumentId document_id) {
188     Info& info_ref = info();
189     if (info_ref.last_added_document_id == kInvalidDocumentId ||
190         document_id > info_ref.last_added_document_id) {
191       info_ref.last_added_document_id = document_id;
192     }
193   }
194 
is_empty()195   bool is_empty() const { return info().is_empty; }
196 
197  private:
EmbeddingIndex(const Filesystem & filesystem,std::string working_path)198   explicit EmbeddingIndex(const Filesystem& filesystem,
199                           std::string working_path)
200       : PersistentStorage(filesystem, std::move(working_path),
201                           kWorkingPathType) {}
202 
203   // Creates the storage data if the index is not empty. This will initialize
204   // flash_index_storage_, embedding_posting_list_mapper_, embedding_vectors_.
205   //
206   // Returns:
207   //   - OK on success
208   //   - Any error from FlashIndexStorage, DynamicTrieKeyMapper, or
209   //     FileBackedVector.
210   libtextclassifier3::Status CreateStorageDataIfNonEmpty();
211 
212   // Marks the index's header to indicate that the index is non-empty.
213   //
214   // If the index is already marked as non-empty, this is a no-op. Otherwise,
215   // CreateStorageDataIfNonEmpty will be called to create the storage data.
216   //
217   // Returns:
218   //   - OK on success
219   //   - Any error when calling CreateStorageDataIfNonEmpty.
220   libtextclassifier3::Status MarkIndexNonEmpty();
221 
222   libtextclassifier3::Status Initialize();
223 
224   // Transfers embedding data and hits from the current index to new_index.
225   //
226   // Returns:
227   //   - OK on success
228   //   - FAILED_PRECONDITION_ERROR if the current index is empty.
229   //   - INTERNAL_ERROR on I/O error. This could potentially leave the storages
230   //     in an invalid state and the caller should handle it properly (e.g.
231   //     discard and rebuild)
232   libtextclassifier3::Status TransferIndex(
233       const std::vector<DocumentId>& document_id_old_to_new,
234       EmbeddingIndex* new_index) const;
235 
236   // Flushes contents of metadata file.
237   //
238   // Returns:
239   //   - OK on success
240   //   - INTERNAL_ERROR on I/O error
241   libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
242 
243   // Flushes contents of all storages to underlying files.
244   //
245   // Returns:
246   //   - OK on success
247   //   - INTERNAL_ERROR on I/O error
248   libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
249 
250   // Computes and returns Info checksum.
251   //
252   // Returns:
253   //   - Crc of the Info on success
254   libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
255 
256   // Computes and returns all storages checksum.
257   //
258   // Returns:
259   //   - Crc of all storages on success
260   //   - INTERNAL_ERROR if any data inconsistency
261   libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
262       bool force) override;
263 
crcs()264   Crcs& crcs() override {
265     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
266                                     kCrcsMetadataBufferOffset);
267   }
268 
crcs()269   const Crcs& crcs() const override {
270     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
271                                           kCrcsMetadataBufferOffset);
272   }
273 
info()274   Info& info() {
275     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
276                                     kInfoMetadataBufferOffset);
277   }
278 
info()279   const Info& info() const {
280     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
281                                           kInfoMetadataBufferOffset);
282   }
283 
284   // In memory data:
285   // Pending embedding hits with their embedding keys used for
286   // embedding_posting_list_mapper_.
287   std::vector<std::pair<std::string, EmbeddingHit>> pending_embedding_hits_;
288 
289   // Metadata
290   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
291 
292   // Posting list storage
293   std::unique_ptr<PostingListEmbeddingHitSerializer>
294       posting_list_hit_serializer_ =
295           std::make_unique<PostingListEmbeddingHitSerializer>();
296 
297   // null if the index is empty.
298   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
299 
300   // The mapper from embedding keys to the corresponding posting list identifier
301   // that stores all embedding hits with the same key.
302   //
303   // The key for an embedding hit is a one-to-one encoded string of the ordered
304   // pair (dimension, model_signature) corresponding to the embedding.
305   //
306   // null if the index is empty.
307   std::unique_ptr<KeyMapper<PostingListIdentifier>>
308       embedding_posting_list_mapper_;
309 
310   // A single FileBackedVector that holds all embedding vectors.
311   //
312   // null if the index is empty.
313   std::unique_ptr<FileBackedVector<float>> embedding_vectors_;
314 };
315 
316 }  // namespace lib
317 }  // namespace icing
318 
319 #endif  // ICING_INDEX_EMBED_EMBEDDING_INDEX_H_
320