// Copyright (C) 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/persistent-storage.h"
30 #include "icing/file/posting_list/flash-index-storage.h"
31 #include "icing/file/posting_list/posting-list-identifier.h"
32 #include "icing/join/doc-join-info.h"
33 #include "icing/join/document-id-to-join-info.h"
34 #include "icing/join/posting-list-join-data-accessor.h"
35 #include "icing/join/posting-list-join-data-serializer.h"
36 #include "icing/join/qualified-id-join-index.h"
37 #include "icing/schema/joinable-property.h"
38 #include "icing/store/document-filter-data.h"
39 #include "icing/store/document-id.h"
40 #include "icing/store/key-mapper.h"
41 #include "icing/store/namespace-fingerprint-identifier.h"
42 #include "icing/store/namespace-id.h"
43 #include "icing/util/crc32.h"
44 
45 namespace icing {
46 namespace lib {
47 
48 // QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to
49 // referenced NamespaceFingerprintIdentifier). It stores join data in posting
50 // lists and bucketizes them by (schema_type_id, joinable_property_id).
51 class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex {
52  public:
53   using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>;
54 
55   class JoinDataIterator : public JoinDataIteratorBase {
56    public:
JoinDataIterator(std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)57     explicit JoinDataIterator(
58         std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)
59         : pl_accessor_(std::move(pl_accessor)),
60           should_retrieve_next_batch_(true) {}
61 
62     ~JoinDataIterator() override = default;
63 
64     // Advances to the next data.
65     //
66     // Returns:
67     //   - OK on success
68     //   - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant
69     //     data)
70     //   - Any other PostingListJoinDataAccessor errors
71     libtextclassifier3::Status Advance() override;
72 
GetCurrent()73     const JoinDataType& GetCurrent() const override { return *curr_; }
74 
75    private:
76     // Gets next batch of data from the posting list chain, caches in
77     // cached_batch_integer_index_data_, and sets curr_ to the begin of the
78     // cache.
79     libtextclassifier3::Status GetNextDataBatch();
80 
81     std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_;
82     std::vector<JoinDataType> cached_batch_join_data_;
83     std::vector<JoinDataType>::const_iterator curr_;
84     bool should_retrieve_next_batch_;
85   };
86 
87   struct Info {
88     static constexpr int32_t kMagic = 0x12d1c074;
89 
90     int32_t magic;
91     int32_t num_data;
92     DocumentId last_added_document_id;
93 
ComputeChecksumInfo94     Crc32 ComputeChecksum() const {
95       return Crc32(
96           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
97     }
98   } __attribute__((packed));
99   static_assert(sizeof(Info) == 12, "");
100 
101   // Metadata file layout: <Crcs><Info>
102   static constexpr int32_t kCrcsMetadataBufferOffset = 0;
103   static constexpr int32_t kInfoMetadataBufferOffset =
104       static_cast<int32_t>(sizeof(Crcs));
105   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
106   static_assert(kMetadataFileSize == 24, "");
107 
108   static constexpr WorkingPathType kWorkingPathType =
109       WorkingPathType::kDirectory;
110 
111   // Creates a QualifiedIdJoinIndexImplV2 instance to store join data
112   // (DocumentId to referenced NamespaceFingerPrintIdentifier) for future
113   // joining search. If any of the underlying file is missing, then delete the
114   // whole working_path and (re)initialize with new ones. Otherwise initialize
115   // and create the instance by existing files.
116   //
117   // filesystem: Object to make system level calls
118   // working_path: Specifies the working path for PersistentStorage.
119   //               QualifiedIdJoinIndexImplV2 uses working path as working
120   //               directory and all related files will be stored under this
121   //               directory. It takes full ownership and of working_path_,
122   //               including creation/deletion. It is the caller's
123   //               responsibility to specify correct working path and avoid
124   //               mixing different persistent storages together under the same
125   //               path. Also the caller has the ownership for the parent
126   //               directory of working_path_, and it is responsible for parent
127   //               directory creation/deletion. See PersistentStorage for more
128   //               details about the concept of working_path.
129   // pre_mapping_fbv: flag indicating whether memory map max possible file size
130   //                  for underlying FileBackedVector before growing the actual
131   //                  file size.
132   //
133   // Returns:
134   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
135   //                               checksum
136   //   - INTERNAL_ERROR on I/O errors
137   //   - Any KeyMapper errors
138   static libtextclassifier3::StatusOr<
139       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
140   Create(const Filesystem& filesystem, std::string working_path,
141          bool pre_mapping_fbv);
142 
143   // Delete copy and move constructor/assignment operator.
144   QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete;
145   QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) =
146       delete;
147 
148   QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete;
149   QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete;
150 
151   ~QualifiedIdJoinIndexImplV2() override;
152 
153   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Put(const DocJoinInfo & doc_join_info,std::string_view ref_qualified_id_str)154   libtextclassifier3::Status Put(
155       const DocJoinInfo& doc_join_info,
156       std::string_view ref_qualified_id_str) override {
157     return absl_ports::UnimplementedError("This API is not supported in V2");
158   }
159 
160   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Get(const DocJoinInfo & doc_join_info)161   libtextclassifier3::StatusOr<std::string_view> Get(
162       const DocJoinInfo& doc_join_info) const override {
163     return absl_ports::UnimplementedError("This API is not supported in V2");
164   }
165 
166   // Puts a list of referenced (parent) NamespaceFingerprintIdentifiers into
167   // the join index, given the (child) DocumentId, SchemaTypeId and
168   // JoinablePropertyId.
169   //
170   // Returns:
171   //   - OK on success
172   //   - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or
173   //     document_id is invalid
174   //   - Any KeyMapper/FlashIndexStorage errors
175   libtextclassifier3::Status Put(SchemaTypeId schema_type_id,
176                                  JoinablePropertyId joinable_property_id,
177                                  DocumentId document_id,
178                                  std::vector<NamespaceFingerprintIdentifier>&&
179                                      ref_namespace_fingerprint_ids) override;
180 
181   // Returns a JoinDataIterator for iterating through all join data of the
182   // specified (schema_type_id, joinable_property_id).
183   //
184   // Returns:
185   //   - On success: a JoinDataIterator
186   //   - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is
187   //     invalid
188   //   - Any KeyMapper/FlashIndexStorage errors
189   libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
190   GetIterator(SchemaTypeId schema_type_id,
191               JoinablePropertyId joinable_property_id) const override;
192 
193   // Reduces internal file sizes by reclaiming space and ids of deleted
194   // documents. Qualified id join index will convert all entries to the new
195   // document ids and namespace ids.
196   //
197   // - document_id_old_to_new: a map for converting old document id to new
198   //   document id.
199   // - namespace_id_old_to_new: a map for converting old namespace id to new
200   //   namespace id.
201   // - new_last_added_document_id: will be used to update the last added
202   //                               document id in the qualified id join index.
203   //
204   // Returns:
205   //   - OK on success
206   //   - INTERNAL_ERROR on I/O error. This could potentially leave the index in
207   //     an invalid state and the caller should handle it properly (e.g. discard
208   //     and rebuild)
209   libtextclassifier3::Status Optimize(
210       const std::vector<DocumentId>& document_id_old_to_new,
211       const std::vector<NamespaceId>& namespace_id_old_to_new,
212       DocumentId new_last_added_document_id) override;
213 
214   // Clears all data and set last_added_document_id to kInvalidDocumentId.
215   //
216   // Returns:
217   //   - OK on success
218   //   - INTERNAL_ERROR on I/O error
219   libtextclassifier3::Status Clear() override;
220 
is_v2()221   bool is_v2() const override { return true; }
222 
size()223   int32_t size() const override { return info().num_data; }
224 
empty()225   bool empty() const override { return size() == 0; }
226 
last_added_document_id()227   DocumentId last_added_document_id() const override {
228     return info().last_added_document_id;
229   }
230 
set_last_added_document_id(DocumentId document_id)231   void set_last_added_document_id(DocumentId document_id) override {
232     SetInfoDirty();
233 
234     Info& info_ref = info();
235     if (info_ref.last_added_document_id == kInvalidDocumentId ||
236         document_id > info_ref.last_added_document_id) {
237       info_ref.last_added_document_id = document_id;
238     }
239   }
240 
241  private:
QualifiedIdJoinIndexImplV2(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<uint8_t[]> metadata_buffer,std::unique_ptr<KeyMapper<PostingListIdentifier>> schema_joinable_id_to_posting_list_mapper,std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> posting_list_serializer,std::unique_ptr<FlashIndexStorage> flash_index_storage,bool pre_mapping_fbv)242   explicit QualifiedIdJoinIndexImplV2(
243       const Filesystem& filesystem, std::string&& working_path,
244       std::unique_ptr<uint8_t[]> metadata_buffer,
245       std::unique_ptr<KeyMapper<PostingListIdentifier>>
246           schema_joinable_id_to_posting_list_mapper,
247       std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
248           posting_list_serializer,
249       std::unique_ptr<FlashIndexStorage> flash_index_storage,
250       bool pre_mapping_fbv)
251       : QualifiedIdJoinIndex(filesystem, std::move(working_path)),
252         metadata_buffer_(std::move(metadata_buffer)),
253         schema_joinable_id_to_posting_list_mapper_(
254             std::move(schema_joinable_id_to_posting_list_mapper)),
255         posting_list_serializer_(std::move(posting_list_serializer)),
256         flash_index_storage_(std::move(flash_index_storage)),
257         pre_mapping_fbv_(pre_mapping_fbv),
258         is_info_dirty_(false),
259         is_storage_dirty_(false) {}
260 
261   static libtextclassifier3::StatusOr<
262       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
263   InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
264                      bool pre_mapping_fbv);
265 
266   static libtextclassifier3::StatusOr<
267       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
268   InitializeExistingFiles(const Filesystem& filesystem,
269                           std::string&& working_path, bool pre_mapping_fbv);
270 
271   // Transfers qualified id join index data from the current to new_index and
272   // convert to new document id according to document_id_old_to_new and
273   // namespace_id_old_to_new. It is a helper function for Optimize.
274   //
275   // Returns:
276   //   - OK on success
277   //   - INTERNAL_ERROR on I/O error
278   libtextclassifier3::Status TransferIndex(
279       const std::vector<DocumentId>& document_id_old_to_new,
280       const std::vector<NamespaceId>& namespace_id_old_to_new,
281       QualifiedIdJoinIndexImplV2* new_index) const;
282 
283   // Flushes contents of metadata file.
284   //
285   // Returns:
286   //   - OK on success
287   //   - INTERNAL_ERROR on I/O error
288   libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
289 
290   // Flushes contents of all storages to underlying files.
291   //
292   // Returns:
293   //   - OK on success
294   //   - INTERNAL_ERROR on I/O error
295   libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
296 
297   // Computes and returns Info checksum.
298   //
299   // Returns:
300   //   - Crc of the Info on success
301   libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
302 
303   // Computes and returns all storages checksum.
304   //
305   // Returns:
306   //   - Crc of all storages on success
307   //   - INTERNAL_ERROR if any data inconsistency
308   libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
309       bool force) override;
310 
crcs()311   Crcs& crcs() override {
312     return *reinterpret_cast<Crcs*>(metadata_buffer_.get() +
313                                     kCrcsMetadataBufferOffset);
314   }
315 
crcs()316   const Crcs& crcs() const override {
317     return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() +
318                                           kCrcsMetadataBufferOffset);
319   }
320 
info()321   Info& info() {
322     return *reinterpret_cast<Info*>(metadata_buffer_.get() +
323                                     kInfoMetadataBufferOffset);
324   }
325 
info()326   const Info& info() const {
327     return *reinterpret_cast<const Info*>(metadata_buffer_.get() +
328                                           kInfoMetadataBufferOffset);
329   }
330 
SetInfoDirty()331   void SetInfoDirty() { is_info_dirty_ = true; }
332   // When storage is dirty, we have to set info dirty as well. So just expose
333   // SetDirty to set both.
SetDirty()334   void SetDirty() {
335     is_info_dirty_ = true;
336     is_storage_dirty_ = true;
337   }
338 
is_info_dirty()339   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()340   bool is_storage_dirty() const { return is_storage_dirty_; }
341 
342   // Metadata buffer
343   std::unique_ptr<uint8_t[]> metadata_buffer_;
344 
345   // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to
346   // PostingListIdentifier.
347   std::unique_ptr<KeyMapper<PostingListIdentifier>>
348       schema_joinable_id_to_posting_list_mapper_;
349 
350   // Posting list related members. Use posting list to store join data
351   // (document id to referenced NamespaceFingerprintIdentifier).
352   std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
353       posting_list_serializer_;
354   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
355 
356   // TODO(b/268521214): add delete propagation storage
357 
358   // Flag indicating whether memory map max possible file size for underlying
359   // FileBackedVector before growing the actual file size.
360   bool pre_mapping_fbv_;
361 
362   bool is_info_dirty_;
363   bool is_storage_dirty_;
364 };
365 
366 }  // namespace lib
367 }  // namespace icing
368 
369 #endif  // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
370