1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/absl_ports/canonical_errors.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/file/persistent-storage.h" 30 #include "icing/file/posting_list/flash-index-storage.h" 31 #include "icing/file/posting_list/posting-list-identifier.h" 32 #include "icing/join/doc-join-info.h" 33 #include "icing/join/document-id-to-join-info.h" 34 #include "icing/join/posting-list-join-data-accessor.h" 35 #include "icing/join/posting-list-join-data-serializer.h" 36 #include "icing/join/qualified-id-join-index.h" 37 #include "icing/schema/joinable-property.h" 38 #include "icing/store/document-filter-data.h" 39 #include "icing/store/document-id.h" 40 #include "icing/store/key-mapper.h" 41 #include "icing/store/namespace-fingerprint-identifier.h" 42 #include "icing/store/namespace-id.h" 43 #include "icing/util/crc32.h" 44 45 namespace icing { 46 namespace lib { 47 48 // QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to 49 // referenced NamespaceFingerprintIdentifier). It stores join data in posting 50 // lists and bucketizes them by (schema_type_id, joinable_property_id). 51 class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex { 52 public: 53 using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>; 54 55 class JoinDataIterator : public JoinDataIteratorBase { 56 public: JoinDataIterator(std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)57 explicit JoinDataIterator( 58 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor) 59 : pl_accessor_(std::move(pl_accessor)), 60 should_retrieve_next_batch_(true) {} 61 62 ~JoinDataIterator() override = default; 63 64 // Advances to the next data. 65 // 66 // Returns: 67 // - OK on success 68 // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant 69 // data) 70 // - Any other PostingListJoinDataAccessor errors 71 libtextclassifier3::Status Advance() override; 72 GetCurrent()73 const JoinDataType& GetCurrent() const override { return *curr_; } 74 75 private: 76 // Gets next batch of data from the posting list chain, caches in 77 // cached_batch_integer_index_data_, and sets curr_ to the begin of the 78 // cache. 79 libtextclassifier3::Status GetNextDataBatch(); 80 81 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_; 82 std::vector<JoinDataType> cached_batch_join_data_; 83 std::vector<JoinDataType>::const_iterator curr_; 84 bool should_retrieve_next_batch_; 85 }; 86 87 struct Info { 88 static constexpr int32_t kMagic = 0x12d1c074; 89 90 int32_t magic; 91 int32_t num_data; 92 DocumentId last_added_document_id; 93 ComputeChecksumInfo94 Crc32 ComputeChecksum() const { 95 return Crc32( 96 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 97 } 98 } __attribute__((packed)); 99 static_assert(sizeof(Info) == 12, ""); 100 101 // Metadata file layout: <Crcs><Info> 102 static constexpr int32_t kCrcsMetadataBufferOffset = 0; 103 static constexpr int32_t kInfoMetadataBufferOffset = 104 static_cast<int32_t>(sizeof(Crcs)); 105 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 106 static_assert(kMetadataFileSize == 24, ""); 107 108 static constexpr WorkingPathType kWorkingPathType = 109 WorkingPathType::kDirectory; 110 111 // Creates a QualifiedIdJoinIndexImplV2 instance to store join data 112 // (DocumentId to referenced NamespaceFingerPrintIdentifier) for future 113 // joining search. If any of the underlying file is missing, then delete the 114 // whole working_path and (re)initialize with new ones. Otherwise initialize 115 // and create the instance by existing files. 116 // 117 // filesystem: Object to make system level calls 118 // working_path: Specifies the working path for PersistentStorage. 119 // QualifiedIdJoinIndexImplV2 uses working path as working 120 // directory and all related files will be stored under this 121 // directory. It takes full ownership and of working_path_, 122 // including creation/deletion. It is the caller's 123 // responsibility to specify correct working path and avoid 124 // mixing different persistent storages together under the same 125 // path. Also the caller has the ownership for the parent 126 // directory of working_path_, and it is responsible for parent 127 // directory creation/deletion. See PersistentStorage for more 128 // details about the concept of working_path. 129 // pre_mapping_fbv: flag indicating whether memory map max possible file size 130 // for underlying FileBackedVector before growing the actual 131 // file size. 132 // 133 // Returns: 134 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 135 // checksum 136 // - INTERNAL_ERROR on I/O errors 137 // - Any KeyMapper errors 138 static libtextclassifier3::StatusOr< 139 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 140 Create(const Filesystem& filesystem, std::string working_path, 141 bool pre_mapping_fbv); 142 143 // Delete copy and move constructor/assignment operator. 144 QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete; 145 QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) = 146 delete; 147 148 QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete; 149 QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete; 150 151 ~QualifiedIdJoinIndexImplV2() override; 152 153 // v1 only API. Returns UNIMPLEMENTED_ERROR. Put(const DocJoinInfo & doc_join_info,std::string_view ref_qualified_id_str)154 libtextclassifier3::Status Put( 155 const DocJoinInfo& doc_join_info, 156 std::string_view ref_qualified_id_str) override { 157 return absl_ports::UnimplementedError("This API is not supported in V2"); 158 } 159 160 // v1 only API. Returns UNIMPLEMENTED_ERROR. Get(const DocJoinInfo & doc_join_info)161 libtextclassifier3::StatusOr<std::string_view> Get( 162 const DocJoinInfo& doc_join_info) const override { 163 return absl_ports::UnimplementedError("This API is not supported in V2"); 164 } 165 166 // Puts a list of referenced (parent) NamespaceFingerprintIdentifiers into 167 // the join index, given the (child) DocumentId, SchemaTypeId and 168 // JoinablePropertyId. 169 // 170 // Returns: 171 // - OK on success 172 // - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or 173 // document_id is invalid 174 // - Any KeyMapper/FlashIndexStorage errors 175 libtextclassifier3::Status Put(SchemaTypeId schema_type_id, 176 JoinablePropertyId joinable_property_id, 177 DocumentId document_id, 178 std::vector<NamespaceFingerprintIdentifier>&& 179 ref_namespace_fingerprint_ids) override; 180 181 // Returns a JoinDataIterator for iterating through all join data of the 182 // specified (schema_type_id, joinable_property_id). 183 // 184 // Returns: 185 // - On success: a JoinDataIterator 186 // - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is 187 // invalid 188 // - Any KeyMapper/FlashIndexStorage errors 189 libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> 190 GetIterator(SchemaTypeId schema_type_id, 191 JoinablePropertyId joinable_property_id) const override; 192 193 // Reduces internal file sizes by reclaiming space and ids of deleted 194 // documents. Qualified id join index will convert all entries to the new 195 // document ids and namespace ids. 196 // 197 // - document_id_old_to_new: a map for converting old document id to new 198 // document id. 199 // - namespace_id_old_to_new: a map for converting old namespace id to new 200 // namespace id. 201 // - new_last_added_document_id: will be used to update the last added 202 // document id in the qualified id join index. 203 // 204 // Returns: 205 // - OK on success 206 // - INTERNAL_ERROR on I/O error. This could potentially leave the index in 207 // an invalid state and the caller should handle it properly (e.g. discard 208 // and rebuild) 209 libtextclassifier3::Status Optimize( 210 const std::vector<DocumentId>& document_id_old_to_new, 211 const std::vector<NamespaceId>& namespace_id_old_to_new, 212 DocumentId new_last_added_document_id) override; 213 214 // Clears all data and set last_added_document_id to kInvalidDocumentId. 215 // 216 // Returns: 217 // - OK on success 218 // - INTERNAL_ERROR on I/O error 219 libtextclassifier3::Status Clear() override; 220 is_v2()221 bool is_v2() const override { return true; } 222 size()223 int32_t size() const override { return info().num_data; } 224 empty()225 bool empty() const override { return size() == 0; } 226 last_added_document_id()227 DocumentId last_added_document_id() const override { 228 return info().last_added_document_id; 229 } 230 set_last_added_document_id(DocumentId document_id)231 void set_last_added_document_id(DocumentId document_id) override { 232 SetInfoDirty(); 233 234 Info& info_ref = info(); 235 if (info_ref.last_added_document_id == kInvalidDocumentId || 236 document_id > info_ref.last_added_document_id) { 237 info_ref.last_added_document_id = document_id; 238 } 239 } 240 241 private: QualifiedIdJoinIndexImplV2(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<uint8_t[]> metadata_buffer,std::unique_ptr<KeyMapper<PostingListIdentifier>> schema_joinable_id_to_posting_list_mapper,std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> posting_list_serializer,std::unique_ptr<FlashIndexStorage> flash_index_storage,bool pre_mapping_fbv)242 explicit QualifiedIdJoinIndexImplV2( 243 const Filesystem& filesystem, std::string&& working_path, 244 std::unique_ptr<uint8_t[]> metadata_buffer, 245 std::unique_ptr<KeyMapper<PostingListIdentifier>> 246 schema_joinable_id_to_posting_list_mapper, 247 std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> 248 posting_list_serializer, 249 std::unique_ptr<FlashIndexStorage> flash_index_storage, 250 bool pre_mapping_fbv) 251 : QualifiedIdJoinIndex(filesystem, std::move(working_path)), 252 metadata_buffer_(std::move(metadata_buffer)), 253 schema_joinable_id_to_posting_list_mapper_( 254 std::move(schema_joinable_id_to_posting_list_mapper)), 255 posting_list_serializer_(std::move(posting_list_serializer)), 256 flash_index_storage_(std::move(flash_index_storage)), 257 pre_mapping_fbv_(pre_mapping_fbv), 258 is_info_dirty_(false), 259 is_storage_dirty_(false) {} 260 261 static libtextclassifier3::StatusOr< 262 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 263 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 264 bool pre_mapping_fbv); 265 266 static libtextclassifier3::StatusOr< 267 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 268 InitializeExistingFiles(const Filesystem& filesystem, 269 std::string&& working_path, bool pre_mapping_fbv); 270 271 // Transfers qualified id join index data from the current to new_index and 272 // convert to new document id according to document_id_old_to_new and 273 // namespace_id_old_to_new. It is a helper function for Optimize. 274 // 275 // Returns: 276 // - OK on success 277 // - INTERNAL_ERROR on I/O error 278 libtextclassifier3::Status TransferIndex( 279 const std::vector<DocumentId>& document_id_old_to_new, 280 const std::vector<NamespaceId>& namespace_id_old_to_new, 281 QualifiedIdJoinIndexImplV2* new_index) const; 282 283 // Flushes contents of metadata file. 284 // 285 // Returns: 286 // - OK on success 287 // - INTERNAL_ERROR on I/O error 288 libtextclassifier3::Status PersistMetadataToDisk(bool force) override; 289 290 // Flushes contents of all storages to underlying files. 291 // 292 // Returns: 293 // - OK on success 294 // - INTERNAL_ERROR on I/O error 295 libtextclassifier3::Status PersistStoragesToDisk(bool force) override; 296 297 // Computes and returns Info checksum. 298 // 299 // Returns: 300 // - Crc of the Info on success 301 libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; 302 303 // Computes and returns all storages checksum. 304 // 305 // Returns: 306 // - Crc of all storages on success 307 // - INTERNAL_ERROR if any data inconsistency 308 libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( 309 bool force) override; 310 crcs()311 Crcs& crcs() override { 312 return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + 313 kCrcsMetadataBufferOffset); 314 } 315 crcs()316 const Crcs& crcs() const override { 317 return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() + 318 kCrcsMetadataBufferOffset); 319 } 320 info()321 Info& info() { 322 return *reinterpret_cast<Info*>(metadata_buffer_.get() + 323 kInfoMetadataBufferOffset); 324 } 325 info()326 const Info& info() const { 327 return *reinterpret_cast<const Info*>(metadata_buffer_.get() + 328 kInfoMetadataBufferOffset); 329 } 330 SetInfoDirty()331 void SetInfoDirty() { is_info_dirty_ = true; } 332 // When storage is dirty, we have to set info dirty as well. So just expose 333 // SetDirty to set both. SetDirty()334 void SetDirty() { 335 is_info_dirty_ = true; 336 is_storage_dirty_ = true; 337 } 338 is_info_dirty()339 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()340 bool is_storage_dirty() const { return is_storage_dirty_; } 341 342 // Metadata buffer 343 std::unique_ptr<uint8_t[]> metadata_buffer_; 344 345 // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to 346 // PostingListIdentifier. 347 std::unique_ptr<KeyMapper<PostingListIdentifier>> 348 schema_joinable_id_to_posting_list_mapper_; 349 350 // Posting list related members. Use posting list to store join data 351 // (document id to referenced NamespaceFingerprintIdentifier). 352 std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> 353 posting_list_serializer_; 354 std::unique_ptr<FlashIndexStorage> flash_index_storage_; 355 356 // TODO(b/268521214): add delete propagation storage 357 358 // Flag indicating whether memory map max possible file size for underlying 359 // FileBackedVector before growing the actual file size. 360 bool pre_mapping_fbv_; 361 362 bool is_info_dirty_; 363 bool is_storage_dirty_; 364 }; 365 366 } // namespace lib 367 } // namespace icing 368 369 #endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 370