1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ 16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/absl_ports/canonical_errors.h" 28 #include "icing/file/file-backed-vector.h" 29 #include "icing/file/filesystem.h" 30 #include "icing/file/persistent-storage.h" 31 #include "icing/join/doc-join-info.h" 32 #include "icing/join/qualified-id-join-index.h" 33 #include "icing/schema/joinable-property.h" 34 #include "icing/store/document-filter-data.h" 35 #include "icing/store/document-id.h" 36 #include "icing/store/key-mapper.h" 37 #include "icing/store/namespace-fingerprint-identifier.h" 38 #include "icing/store/namespace-id.h" 39 #include "icing/util/crc32.h" 40 41 namespace icing { 42 namespace lib { 43 44 // QualifiedIdJoinIndexImplV1: a class to maintain data mapping DocJoinInfo to 45 // joinable qualified ids and delete propagation info. 46 class QualifiedIdJoinIndexImplV1 : public QualifiedIdJoinIndex { 47 public: 48 struct Info { 49 static constexpr int32_t kMagic = 0x48cabdc6; 50 51 int32_t magic; 52 DocumentId last_added_document_id; 53 ComputeChecksumInfo54 Crc32 ComputeChecksum() const { 55 return Crc32( 56 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 57 } 58 } __attribute__((packed)); 59 static_assert(sizeof(Info) == 8, ""); 60 61 // Metadata file layout: <Crcs><Info> 62 static constexpr int32_t kCrcsMetadataBufferOffset = 0; 63 static constexpr int32_t kInfoMetadataBufferOffset = 64 static_cast<int32_t>(sizeof(Crcs)); 65 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 66 static_assert(kMetadataFileSize == 20, ""); 67 68 // Creates a QualifiedIdJoinIndexImplV1 instance to store qualified ids for 69 // future joining search. If any of the underlying file is missing, then 70 // delete the whole working_path and (re)initialize with new ones. Otherwise 71 // initialize and create the instance by existing files. 72 // 73 // filesystem: Object to make system level calls 74 // working_path: Specifies the working path for PersistentStorage. 75 // QualifiedIdJoinIndexImplV1 uses working path as working 76 // directory and all related files will be stored under this 77 // directory. It takes full ownership and of working_path_, 78 // including creation/deletion. It is the caller's 79 // responsibility to specify correct working path and avoid 80 // mixing different persistent storages together under the same 81 // path. Also the caller has the ownership for the parent 82 // directory of working_path_, and it is responsible for parent 83 // directory creation/deletion. See PersistentStorage for more 84 // details about the concept of working_path. 85 // pre_mapping_fbv: flag indicating whether memory map max possible file size 86 // for underlying FileBackedVector before growing the actual 87 // file size. 88 // use_persistent_hash_map: flag indicating whether use persistent hash map as 89 // the key mapper (if false, then fall back to 90 // dynamic trie key mapper). 91 // 92 // Returns: 93 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 94 // checksum 95 // - INTERNAL_ERROR on I/O errors 96 // - Any KeyMapper errors 97 static libtextclassifier3::StatusOr< 98 std::unique_ptr<QualifiedIdJoinIndexImplV1>> 99 Create(const Filesystem& filesystem, std::string working_path, 100 bool pre_mapping_fbv, bool use_persistent_hash_map); 101 102 // Delete copy and move constructor/assignment operator. 103 QualifiedIdJoinIndexImplV1(const QualifiedIdJoinIndexImplV1&) = delete; 104 QualifiedIdJoinIndexImplV1& operator=(const QualifiedIdJoinIndexImplV1&) = 105 delete; 106 107 QualifiedIdJoinIndexImplV1(QualifiedIdJoinIndexImplV1&&) = delete; 108 QualifiedIdJoinIndexImplV1& operator=(QualifiedIdJoinIndexImplV1&&) = delete; 109 110 ~QualifiedIdJoinIndexImplV1() override; 111 112 // v2 only API. Returns UNIMPLEMENTED_ERROR. Put(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id,DocumentId document_id,std::vector<NamespaceFingerprintIdentifier> && ref_namespace_fingerprint_ids)113 libtextclassifier3::Status Put(SchemaTypeId schema_type_id, 114 JoinablePropertyId joinable_property_id, 115 DocumentId document_id, 116 std::vector<NamespaceFingerprintIdentifier>&& 117 ref_namespace_fingerprint_ids) override { 118 return absl_ports::UnimplementedError("This API is not supported in V2"); 119 } 120 121 // v2 only API. Returns UNIMPLEMENTED_ERROR. 122 libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> GetIterator(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id)123 GetIterator(SchemaTypeId schema_type_id, 124 JoinablePropertyId joinable_property_id) const override { 125 return absl_ports::UnimplementedError("This API is not supported in V2"); 126 } 127 128 // Puts a new data into index: DocJoinInfo (DocumentId, JoinablePropertyId) 129 // references to ref_qualified_id_str (the identifier of another document). 130 // 131 // REQUIRES: ref_qualified_id_str contains no '\0'. 132 // 133 // Returns: 134 // - OK on success 135 // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid 136 // - Any KeyMapper errors 137 libtextclassifier3::Status Put( 138 const DocJoinInfo& doc_join_info, 139 std::string_view ref_qualified_id_str) override; 140 141 // Gets the referenced document's qualified id string by DocJoinInfo. 142 // 143 // Returns: 144 // - A qualified id string referenced by the given DocJoinInfo (DocumentId, 145 // JoinablePropertyId) on success 146 // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid 147 // - NOT_FOUND_ERROR if doc_join_info doesn't exist 148 // - Any KeyMapper errors 149 libtextclassifier3::StatusOr<std::string_view> Get( 150 const DocJoinInfo& doc_join_info) const override; 151 152 // Reduces internal file sizes by reclaiming space and ids of deleted 153 // documents. Qualified id type joinable index will convert all entries to the 154 // new document ids. 155 // 156 // - document_id_old_to_new: a map for converting old document id to new 157 // document id. 158 // - namespace_id_old_to_new: a map for converting old namespace id to new 159 // namespace id. It is unused in this implementation since we store raw 160 // qualified id string (which contains raw namespace string). 161 // - new_last_added_document_id: will be used to update the last added 162 // document id in the qualified id type joinable 163 // index. 164 // 165 // Returns: 166 // - OK on success 167 // - INTERNAL_ERROR on I/O error. This could potentially leave the index in 168 // an invalid state and the caller should handle it properly (e.g. discard 169 // and rebuild) 170 libtextclassifier3::Status Optimize( 171 const std::vector<DocumentId>& document_id_old_to_new, 172 const std::vector<NamespaceId>& namespace_id_old_to_new, 173 DocumentId new_last_added_document_id) override; 174 175 // Clears all data and set last_added_document_id to kInvalidDocumentId. 176 // 177 // Returns: 178 // - OK on success 179 // - INTERNAL_ERROR on I/O error 180 libtextclassifier3::Status Clear() override; 181 is_v2()182 bool is_v2() const override { return false; } 183 size()184 int32_t size() const override { return doc_join_info_mapper_->num_keys(); } 185 empty()186 bool empty() const override { return size() == 0; } 187 last_added_document_id()188 DocumentId last_added_document_id() const override { 189 return info().last_added_document_id; 190 } 191 set_last_added_document_id(DocumentId document_id)192 void set_last_added_document_id(DocumentId document_id) override { 193 SetInfoDirty(); 194 195 Info& info_ref = info(); 196 if (info_ref.last_added_document_id == kInvalidDocumentId || 197 document_id > info_ref.last_added_document_id) { 198 info_ref.last_added_document_id = document_id; 199 } 200 } 201 202 private: QualifiedIdJoinIndexImplV1(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<uint8_t[]> metadata_buffer,std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper,std::unique_ptr<FileBackedVector<char>> qualified_id_storage,bool pre_mapping_fbv,bool use_persistent_hash_map)203 explicit QualifiedIdJoinIndexImplV1( 204 const Filesystem& filesystem, std::string&& working_path, 205 std::unique_ptr<uint8_t[]> metadata_buffer, 206 std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper, 207 std::unique_ptr<FileBackedVector<char>> qualified_id_storage, 208 bool pre_mapping_fbv, bool use_persistent_hash_map) 209 : QualifiedIdJoinIndex(filesystem, std::move(working_path)), 210 metadata_buffer_(std::move(metadata_buffer)), 211 doc_join_info_mapper_(std::move(doc_join_info_mapper)), 212 qualified_id_storage_(std::move(qualified_id_storage)), 213 pre_mapping_fbv_(pre_mapping_fbv), 214 use_persistent_hash_map_(use_persistent_hash_map), 215 is_info_dirty_(false), 216 is_storage_dirty_(false) {} 217 218 static libtextclassifier3::StatusOr< 219 std::unique_ptr<QualifiedIdJoinIndexImplV1>> 220 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 221 bool pre_mapping_fbv, bool use_persistent_hash_map); 222 223 static libtextclassifier3::StatusOr< 224 std::unique_ptr<QualifiedIdJoinIndexImplV1>> 225 InitializeExistingFiles(const Filesystem& filesystem, 226 std::string&& working_path, bool pre_mapping_fbv, 227 bool use_persistent_hash_map); 228 229 // Transfers qualified id join index data from the current to new_index and 230 // convert to new document id according to document_id_old_to_new. It is a 231 // helper function for Optimize. 232 // 233 // Returns: 234 // - OK on success 235 // - INTERNAL_ERROR on I/O error 236 libtextclassifier3::Status TransferIndex( 237 const std::vector<DocumentId>& document_id_old_to_new, 238 QualifiedIdJoinIndexImplV1* new_index) const; 239 240 // Flushes contents of metadata file. 241 // 242 // Returns: 243 // - OK on success 244 // - INTERNAL_ERROR on I/O error 245 libtextclassifier3::Status PersistMetadataToDisk(bool force) override; 246 247 // Flushes contents of all storages to underlying files. 248 // 249 // Returns: 250 // - OK on success 251 // - INTERNAL_ERROR on I/O error 252 libtextclassifier3::Status PersistStoragesToDisk(bool force) override; 253 254 // Computes and returns Info checksum. 255 // 256 // Returns: 257 // - Crc of the Info on success 258 libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; 259 260 // Computes and returns all storages checksum. 261 // 262 // Returns: 263 // - Crc of all storages on success 264 // - INTERNAL_ERROR if any data inconsistency 265 libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( 266 bool force) override; 267 crcs()268 Crcs& crcs() override { 269 return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + 270 kCrcsMetadataBufferOffset); 271 } 272 crcs()273 const Crcs& crcs() const override { 274 return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() + 275 kCrcsMetadataBufferOffset); 276 } 277 info()278 Info& info() { 279 return *reinterpret_cast<Info*>(metadata_buffer_.get() + 280 kInfoMetadataBufferOffset); 281 } 282 info()283 const Info& info() const { 284 return *reinterpret_cast<const Info*>(metadata_buffer_.get() + 285 kInfoMetadataBufferOffset); 286 } 287 SetInfoDirty()288 void SetInfoDirty() { is_info_dirty_ = true; } 289 // When storage is dirty, we have to set info dirty as well. So just expose 290 // SetDirty to set both. SetDirty()291 void SetDirty() { 292 is_info_dirty_ = true; 293 is_storage_dirty_ = true; 294 } 295 is_info_dirty()296 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()297 bool is_storage_dirty() const { return is_storage_dirty_; } 298 299 // Metadata buffer 300 std::unique_ptr<uint8_t[]> metadata_buffer_; 301 302 // Persistent KeyMapper for mapping (encoded) DocJoinInfo (DocumentId, 303 // JoinablePropertyId) to another referenced document's qualified id string 304 // index in qualified_id_storage_. 305 std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper_; 306 307 // Storage for qualified id strings. 308 std::unique_ptr<FileBackedVector<char>> qualified_id_storage_; 309 310 // TODO(b/268521214): add delete propagation storage 311 312 // Flag indicating whether memory map max possible file size for underlying 313 // FileBackedVector before growing the actual file size. 314 bool pre_mapping_fbv_; 315 316 // Flag indicating whether use persistent hash map as the key mapper (if 317 // false, then fall back to dynamic trie key mapper). 318 bool use_persistent_hash_map_; 319 320 bool is_info_dirty_; 321 bool is_storage_dirty_; 322 }; 323 324 } // namespace lib 325 } // namespace icing 326 327 #endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_ 328