1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_FILE_PERSISTENT_STORAGE_H_ 16 #define ICING_FILE_PERSISTENT_STORAGE_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 22 #include "icing/text_classifier/lib3/utils/base/status.h" 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/absl_ports/canonical_errors.h" 25 #include "icing/absl_ports/str_cat.h" 26 #include "icing/file/filesystem.h" 27 #include "icing/util/crc32.h" 28 #include "icing/util/status-macros.h" 29 30 namespace icing { 31 namespace lib { 32 33 // PersistentStorage: an abstract class for all persistent data structures. 34 // - It provides some common persistent file methods, e.g. PersistToDisk. 35 // - It encapsulates most of the checksum handling logics (including update and 36 // validation). 37 // 38 // Terminology: 39 // - Crcs: checksum section 40 // - Info: (custom) information for derived class 41 // - Metadata: Crcs + Info 42 // 43 // Usually a persistent data structure will have its own custom Info and 44 // storages (single or composite storages) definition. To create a new 45 // persistent data structure via PersistentStorage: 46 // - Decide what type the working path is (single file or directory). See 47 // working_path_ and WorkingPathType for more details. 
// - Create a new class that inherits PersistentStorage:
//   - Declare custom Info and design the metadata section layout.
//     Usually the layout is <Crcs><Info>, and there are 2 common ways to
//     manage the metadata section:
//     - Have a separate file for metadata. In this case, the new persistent
//       data structure contains multiple files, so working path should be used
//       as directory path and multiple files will be stored under it. Example:
//       PersistentHashMap.
//     - Have a single file for both metadata and storage data. In this case,
//       the file layout should be <Crcs><Info><Storage Data>, and
//       working path should be used as file path. Example: FileBackedVector.
//   - Handle working path file/directory creation and deletion.
//     PersistentStorage only provides static Discard() method to use. The
//     derived class should implement other logic, e.g. working path
//     (file/directory) creation, check condition to discard working path and
//     start over new file(s).
//   - Implement all pure virtual methods:
//     - PersistStoragesToDisk: persist all (composite) storages. In general,
//       the implementation will be calling PersistToDisk for all composite
//       storages.
//     - PersistMetadataToDisk: persist metadata, including Crcs and Info.
//       - If the derived class maintains a concrete Crcs and (custom) Info
//         instance, then it should perform write/pwrite into the metadata
//         section.
//       - If the derived class uses memory-mapped region directly for
//         metadata, then it should call MemoryMappedFile::PersistToDisk.
//       - See crcs() for more details.
//     - ComputeInfoChecksum: compute the checksum for custom Info.
//     - ComputeStoragesChecksum: compute the (combined) checksum for all
//       (composite) storages. In general, the implementation will be calling
//       UpdateChecksums for all composite storages and XOR all checksums.
79 // - crcs(): provide the reference for PersistentStorage to write checksums. 80 // The derived class can either maintain a concrete Crcs instance, or 81 // reinterpret_cast the memory-mapped region to Crcs reference. Either 82 // choice is fine as long as PersistMetadataToDisk flushes it to disk 83 // correctly. 84 // - Call either InitializeNewStorage or InitializeExistingStorage when creating 85 // and initializing an instance, depending on initializing new storage or from 86 // existing file(s). 87 class PersistentStorage { 88 public: 89 enum class WorkingPathType { 90 kSingleFile, 91 kDirectory, 92 kDummy, 93 }; 94 95 // Crcs and Info will be written into the metadata section. Info is defined by 96 // the actual implementation of each persistent storage. Usually the Metadata 97 // layout is: <Crcs><Info> 98 struct Crcs { 99 struct ComponentCrcs { 100 uint32_t info_crc; 101 uint32_t storages_crc; 102 103 bool operator==(const ComponentCrcs& other) const { 104 return info_crc == other.info_crc && storages_crc == other.storages_crc; 105 } 106 ComputeChecksumCrcs::ComponentCrcs107 Crc32 ComputeChecksum() const { 108 return Crc32(std::string_view(reinterpret_cast<const char*>(this), 109 sizeof(ComponentCrcs))); 110 } 111 } __attribute__((packed)); 112 113 bool operator==(const Crcs& other) const { 114 return all_crc == other.all_crc && component_crcs == other.component_crcs; 115 } 116 117 uint32_t all_crc; 118 ComponentCrcs component_crcs; 119 } __attribute__((packed)); 120 static_assert(sizeof(Crcs) == 12, ""); 121 122 // Deletes working_path according to its type. 123 // 124 // Returns: 125 // - OK on success 126 // - INTERNAL_ERROR on I/O error 127 // - INVALID_ARGUMENT_ERROR if working_path_type is unknown type 128 static libtextclassifier3::Status Discard(const Filesystem& filesystem, 129 const std::string& working_path, 130 WorkingPathType working_path_type); 131 132 virtual ~PersistentStorage() = default; 133 134 // Initializes new persistent storage. 
It computes the initial checksums and 135 // writes into the metadata file. 136 // 137 // Note: either InitializeNewStorage or InitializeExistingStorage should be 138 // invoked after creating a PersistentStorage instance before using, otherwise 139 // an uninitialized instance will fail to use persistent storage features, 140 // e.g. PersistToDisk, UpdateChecksums. 141 // 142 // Returns: 143 // - OK on success or already initialized 144 // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending 145 // on actual implementation InitializeNewStorage()146 libtextclassifier3::Status InitializeNewStorage() { 147 if (is_initialized_) { 148 return libtextclassifier3::Status::OK; 149 } 150 151 ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(/*force=*/true)); 152 ICING_RETURN_IF_ERROR(PersistStoragesToDisk(/*force=*/true)); 153 ICING_RETURN_IF_ERROR(PersistMetadataToDisk(/*force=*/true)); 154 155 is_initialized_ = true; 156 return libtextclassifier3::Status::OK; 157 } 158 159 // Initializes persistent storage from existing file(s). 160 // 161 // It enforces the following check(s): 162 // - Validate checksums. 163 // 164 // Note: either InitializeNewStorage or InitializeExistingStorage should be 165 // invoked after creating a PersistentStorage instance before using. 166 // 167 // Returns: 168 // - OK on success or already initialized 169 // - FAILED_PRECONDITION_ERROR if checksum validation fails. 170 // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending 171 // on actual implementation InitializeExistingStorage()172 libtextclassifier3::Status InitializeExistingStorage() { 173 if (is_initialized_) { 174 return libtextclassifier3::Status::OK; 175 } 176 177 ICING_RETURN_IF_ERROR(ValidateChecksums()); 178 179 is_initialized_ = true; 180 return libtextclassifier3::Status::OK; 181 } 182 183 // Flushes contents to underlying files. 184 // 1) Flushes storages. 185 // 2) Updates all checksums by new data. 186 // 3) Flushes metadata. 
187 // 188 // Force flag will be passed down to PersistMetadataToDisk, 189 // PersistStoragesToDisk, ComputeInfoChecksum, ComputeStoragesChecksum. 190 // - If force == true, then performs actual persisting operations/recomputes 191 // the checksum. 192 // - Otherwise, the derived class can decide itself whether skipping 193 // persisting operations/doing lazy checksum recomputing if the storage is 194 // not dirty. 195 // 196 // Returns: 197 // - OK on success 198 // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized 199 // - Any errors from PersistStoragesToDisk, UpdateChecksums, 200 // PersistMetadataToDisk, depending on actual implementation 201 libtextclassifier3::Status PersistToDisk(bool force = false) { 202 if (!is_initialized_) { 203 return absl_ports::FailedPreconditionError(absl_ports::StrCat( 204 "PersistentStorage ", working_path_, " not initialized")); 205 } 206 207 ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(force)); 208 ICING_RETURN_IF_ERROR(PersistStoragesToDisk(force)); 209 ICING_RETURN_IF_ERROR(PersistMetadataToDisk(force)); 210 return libtextclassifier3::Status::OK; 211 } 212 213 // Updates checksums of all components and returns the overall crc (all_crc) 214 // of the persistent storage. 215 // 216 // Force flag will be passed down ComputeInfoChecksum, 217 // ComputeStoragesChecksum. 218 // - If force == true, then recomputes the checksum. 219 // - Otherwise, the derived class can decide itself whether doing lazy 220 // checksum recomputing if the storage is not dirty. 
221 // 222 // Returns: 223 // - Overall crc of the persistent storage on success 224 // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized 225 // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending 226 // on actual implementation 227 libtextclassifier3::StatusOr<Crc32> UpdateChecksums(bool force = false) { 228 if (!is_initialized_) { 229 return absl_ports::FailedPreconditionError(absl_ports::StrCat( 230 "PersistentStorage ", working_path_, " not initialized")); 231 } 232 233 return UpdateChecksumsInternal(force); 234 } 235 236 protected: PersistentStorage(const Filesystem & filesystem,std::string working_path,WorkingPathType working_path_type)237 explicit PersistentStorage(const Filesystem& filesystem, 238 std::string working_path, 239 WorkingPathType working_path_type) 240 : filesystem_(filesystem), 241 working_path_(std::move(working_path)), 242 working_path_type_(working_path_type), 243 is_initialized_(false) {} 244 245 // Flushes contents of metadata. The implementation should flush Crcs and Info 246 // correctly, depending on whether they're using memory-mapped regions or 247 // concrete instances in the derived class. 248 // 249 // Returns: 250 // - OK on success 251 // - Any other errors, depending on actual implementation 252 virtual libtextclassifier3::Status PersistMetadataToDisk(bool force) = 0; 253 254 // Flushes contents of all storages to underlying files. 255 // 256 // Returns: 257 // - OK on success 258 // - Any other errors, depending on actual implementation 259 virtual libtextclassifier3::Status PersistStoragesToDisk(bool force) = 0; 260 261 // Computes and returns Info checksum. 262 // - If force = true, then recompute the entire checksum. 263 // - Otherwise, the derived class can decide itself whether doing lazy 264 // checksum computing if the storage is not dirty. 265 // 266 // This function will be mainly called by UpdateChecksums. 
267 // 268 // Returns: 269 // - Crc of the Info on success 270 // - Any other errors, depending on actual implementation 271 virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( 272 bool force) = 0; 273 274 // Computes and returns all storages checksum. If there are multiple storages, 275 // usually we XOR their checksums together to a single checksum. 276 // - If force = true, then recompute the entire checksum. 277 // - Otherwise, the derived class can decide itself whether doing lazy 278 // checksum computing if the storage is not dirty. 279 // 280 // This function will be mainly called by UpdateChecksums. 281 // 282 // Returns: 283 // - Crc of all storages on success 284 // - Any other errors from depending on actual implementation 285 virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( 286 bool force) = 0; 287 288 // Returns the Crcs instance reference. The derived class can either own a 289 // concrete Crcs instance, or reinterpret_cast the memory-mapped region to 290 // Crcs reference. PersistMetadataToDisk should flush it to disk correctly. 291 virtual Crcs& crcs() = 0; 292 virtual const Crcs& crcs() const = 0; 293 294 const Filesystem& filesystem_; // Does not own 295 // Path to the storage. It can be a single file path or a directory path 296 // depending on the implementation of the derived class. 297 // 298 // Note that the derived storage class will take full ownership and of 299 // working_path_, including creation/deletion. It is the caller's 300 // responsibility to specify correct working path and avoid mixing different 301 // persistent storages together under the same path. Also the caller has the 302 // ownership for the parent directory of working_path_, and it is responsible 303 // for parent directory creation/deletion. 
304 std::string working_path_; 305 WorkingPathType working_path_type_; 306 307 bool is_initialized_; 308 309 private: 310 // Updates checksums of all components and returns the overall crc (all_crc) 311 // of the persistent storage. Different from UpdateChecksums, it won't check 312 // if PersistentStorage is initialized or not. 313 // 314 // Returns: 315 // - Overall crc of the persistent storage on success 316 // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending 317 // on actual implementation UpdateChecksumsInternal(bool force)318 libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal(bool force) { 319 Crcs& crcs_ref = crcs(); 320 // Compute and update storages + info checksums. 321 ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(force)); 322 ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum(force)); 323 if (crcs_ref.component_crcs.info_crc == info_crc.Get() && 324 crcs_ref.component_crcs.storages_crc == storages_crc.Get()) { 325 // If info and storages crc haven't changed, then we don't have to update 326 // checksums. 327 return Crc32(crcs_ref.all_crc); 328 } 329 330 crcs_ref.component_crcs.info_crc = info_crc.Get(); 331 crcs_ref.component_crcs.storages_crc = storages_crc.Get(); 332 333 // Finally compute and update overall checksum. 334 crcs_ref.all_crc = crcs_ref.component_crcs.ComputeChecksum().Get(); 335 return Crc32(crcs_ref.all_crc); 336 } 337 338 // Validates all checksums of the persistent storage. 339 // 340 // Returns: 341 // - OK on success 342 // - FAILED_PRECONDITION_ERROR if any checksum is incorrect. 
343 // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending 344 // on actual implementation ValidateChecksums()345 libtextclassifier3::Status ValidateChecksums() { 346 const Crcs& crcs_ref = crcs(); 347 if (crcs_ref.all_crc != crcs_ref.component_crcs.ComputeChecksum().Get()) { 348 return absl_ports::FailedPreconditionError("Invalid all crc"); 349 } 350 351 ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(/*force=*/true)); 352 if (crcs_ref.component_crcs.info_crc != info_crc.Get()) { 353 return absl_ports::FailedPreconditionError("Invalid info crc"); 354 } 355 356 ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, 357 ComputeStoragesChecksum(/*force=*/true)); 358 if (crcs_ref.component_crcs.storages_crc != storages_crc.Get()) { 359 return absl_ports::FailedPreconditionError("Invalid storages crc"); 360 } 361 362 return libtextclassifier3::Status::OK; 363 } 364 }; 365 366 } // namespace lib 367 } // namespace icing 368 369 #endif // ICING_FILE_PERSISTENT_STORAGE_H_ 370