1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_DOCUMENT_STORE_H_ 16 #define ICING_STORE_DOCUMENT_STORE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <optional> 21 #include <string> 22 #include <string_view> 23 #include <unordered_set> 24 #include <vector> 25 26 #include "icing/text_classifier/lib3/utils/base/status.h" 27 #include "icing/text_classifier/lib3/utils/base/statusor.h" 28 #include "icing/feature-flags.h" 29 #include "icing/file/file-backed-vector.h" 30 #include "icing/file/filesystem.h" 31 #include "icing/file/memory-mapped-file-backed-proto-log.h" 32 #include "icing/file/portable-file-backed-proto-log.h" 33 #include "icing/proto/debug.pb.h" 34 #include "icing/proto/document.pb.h" 35 #include "icing/proto/document_wrapper.pb.h" 36 #include "icing/proto/internal/scorable_property_set.pb.h" 37 #include "icing/proto/logging.pb.h" 38 #include "icing/proto/optimize.pb.h" 39 #include "icing/proto/persist.pb.h" 40 #include "icing/proto/search.pb.h" 41 #include "icing/proto/storage.pb.h" 42 #include "icing/proto/usage.pb.h" 43 #include "icing/schema/schema-store.h" 44 #include "icing/store/corpus-associated-scoring-data.h" 45 #include "icing/store/corpus-id.h" 46 #include "icing/store/document-associated-score-data.h" 47 #include "icing/store/document-filter-data.h" 48 #include "icing/store/document-id.h" 49 #include "icing/store/key-mapper.h" 50 #include 
"icing/store/namespace-id-fingerprint.h" 51 #include "icing/store/namespace-id.h" 52 #include "icing/store/usage-store.h" 53 #include "icing/tokenization/language-segmenter.h" 54 #include "icing/util/clock.h" 55 #include "icing/util/crc32.h" 56 #include "icing/util/data-loss.h" 57 #include "icing/util/document-validator.h" 58 #include "icing/util/fingerprint-util.h" 59 #include "icing/util/scorable_property_set.h" 60 61 namespace icing { 62 namespace lib { 63 64 // Provides storage interfaces for documents. 65 class DocumentStore { 66 public: 67 struct Header { 68 // Previously used magic numbers, please avoid reusing those: 69 // [0x1b99c8b0, 0x3e005b5e] 70 static constexpr int32_t kMagic = 0x8a32cd1f; 71 72 // Holds the magic as a quick sanity check against file corruption. 73 int32_t magic; 74 75 // Checksum of the DocumentStore's sub-component's checksums. 76 uint32_t checksum; 77 }; 78 79 struct OptimizeInfo { 80 // The estimated size in bytes of the optimizable docs. We don't track the 81 // size of each document, so we estimate by taking the size of the entire 82 // DocumentStore and dividing that by the total number of documents we have. 83 // So we end up with an average document size. 84 int64_t estimated_optimizable_bytes = 0; 85 86 // Number of total documents the DocumentStore tracks. 87 int32_t total_docs = 0; 88 89 // Number of optimizable (deleted + expired) docs the DocumentStore tracks. 90 int32_t optimizable_docs = 0; 91 }; 92 93 struct DeleteByGroupResult { 94 // Status representing whether or not the operation succeeded. See the 95 // comments above the function that returns this result to determine what 96 // possible statuses could be returned. 97 libtextclassifier3::Status status; 98 99 int num_docs_deleted = 0; 100 }; 101 102 struct CreateResult { 103 // A successfully initialized document store. 104 std::unique_ptr<DocumentStore> document_store; 105 106 // The data status after initializing from a previous state. 
Data loss can 107 // happen if the file is corrupted or some previously added data was 108 // unpersisted. This may be used to signal that any derived data off of the 109 // document store may need to be regenerated. 110 DataLoss data_loss; 111 112 // A boolean flag indicating if derived files of the document store have 113 // been regenerated or not. This is usually a signal for callers to detect 114 // if any id assignment has changed (e.g. NamespaceId). 115 bool derived_files_regenerated; 116 }; 117 118 // Not copyable 119 DocumentStore(const DocumentStore&) = delete; 120 DocumentStore& operator=(const DocumentStore&) = delete; 121 122 // Persists and updates checksum of subcomponents. 123 ~DocumentStore(); 124 125 // Factory method to create, initialize, and return a DocumentStore. The base 126 // directory is used to persist document store files. If document store was 127 // previously initialized with this directory, it will reload the files saved 128 // by the last instance. 129 // 130 // force_recovery_and_revalidate_documents=true will pre-emptively throw out 131 // the derived files and validate each document while recreating them. This 132 // can be used to indicate that the schema (and type ids) may have changed and 133 // those changes might not have been applied to the document store. 134 // 135 // If initialize_stats is present, the fields related to DocumentStore will be 136 // populated. 137 // 138 // Does not take any ownership, and all pointers except initialize_stats must 139 // refer to valid objects that outlive the one constructed. 140 // 141 // TODO(cassiewang): Consider returning a status indicating that derived files 142 // were regenerated. This may be helpful in logs. 
143 // 144 // Returns: 145 // A DocumentStore::CreateResult on success 146 // FAILED_PRECONDITION on any null pointer input 147 // INTERNAL_ERROR on IO error 148 static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( 149 const Filesystem* filesystem, const std::string& base_dir, 150 const Clock* clock, const SchemaStore* schema_store, 151 const FeatureFlags* feature_flags, 152 bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv, 153 bool use_persistent_hash_map, int32_t compression_level, 154 InitializeStatsProto* initialize_stats); 155 156 // Discards all derived data in the document store. 157 // 158 // Returns: 159 // OK on success or nothing to discard 160 // INTERNAL_ERROR on any I/O errors 161 static libtextclassifier3::Status DiscardDerivedFiles( 162 const Filesystem* filesystem, const std::string& base_dir); 163 164 // Returns the maximum DocumentId that the DocumentStore has assigned. If 165 // there has not been any DocumentIds assigned, i.e. the DocumentStore is 166 // empty, then kInvalidDocumentId is returned. This does not filter out 167 // DocumentIds of deleted or expired documents. last_added_document_id()168 DocumentId last_added_document_id() const { 169 if (document_id_mapper_->num_elements() == 0) { 170 return kInvalidDocumentId; 171 } 172 return document_id_mapper_->num_elements() - 1; 173 } 174 175 // Returns the number of documents. The result does not filter out DocumentIds 176 // of deleted or expired documents. num_documents()177 int num_documents() const { return document_id_mapper_->num_elements(); } 178 179 // Puts the document into document store. 180 // 181 // If put_document_stats is present, the fields related to DocumentStore will 182 // be populated. 183 // 184 // Returns: 185 // - On success, a PutResult with the DocumentId of the newly added document 186 // and the old DocumentId before replacement. If this is a new document, 187 // then old DocumentId will be kInvalidDocumentId. 
188 // - RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents 189 // - FAILED_PRECONDITION if schema hasn't been set yet 190 // - NOT_FOUND if the schema_type or a property config of the document 191 // doesn't exist in schema 192 // - INTERNAL_ERROR on IO error 193 struct PutResult { 194 DocumentId old_document_id = kInvalidDocumentId; 195 DocumentId new_document_id = kInvalidDocumentId; 196 was_replacementPutResult197 bool was_replacement() const { 198 return old_document_id != kInvalidDocumentId; 199 } 200 }; 201 libtextclassifier3::StatusOr<PutResult> Put( 202 const DocumentProto& document, int32_t num_tokens = 0, 203 PutDocumentStatsProto* put_document_stats = nullptr); 204 libtextclassifier3::StatusOr<PutResult> Put( 205 DocumentProto&& document, int32_t num_tokens = 0, 206 PutDocumentStatsProto* put_document_stats = nullptr); 207 208 // Finds and returns the document identified by the given key (namespace + 209 // uri). If 'clear_internal_fields' is true, document level data that's 210 // generated internally by DocumentStore is cleared. 211 // 212 // Returns: 213 // The document found on success 214 // NOT_FOUND if the key doesn't exist or document has been deleted 215 // INTERNAL_ERROR on IO error 216 libtextclassifier3::StatusOr<DocumentProto> Get( 217 std::string_view name_space, std::string_view uri, 218 bool clear_internal_fields = true) const; 219 220 // Finds and returns the document identified by the given document id. If 221 // 'clear_internal_fields' is true, document level data that's generated 222 // internally by DocumentStore is cleared. 
223 // 224 // Returns: 225 // The document found on success 226 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 227 // maximum value 228 // NOT_FOUND if the document doesn't exist or has been deleted 229 // INTERNAL_ERROR on IO error 230 libtextclassifier3::StatusOr<DocumentProto> Get( 231 DocumentId document_id, bool clear_internal_fields = true) const; 232 233 // Returns the ScorablePropertySet of the document specified by the 234 // DocumentId. 235 // 236 // Returns: 237 // - ScorablePropertySet on success 238 // - nullptr when the ScorablePropertySet fails to be created, it could be 239 // due to that: 240 // - |document_id| is invalid, or 241 // - no ScorablePropertySetProto is found for the document in the cache 242 // - internal IO error 243 std::unique_ptr<ScorablePropertySet> GetScorablePropertySet( 244 DocumentId document_id, int64_t current_time_ms) const; 245 246 // Returns all namespaces which have at least 1 active document (not deleted 247 // or expired). Order of namespaces is undefined. 248 std::vector<std::string> GetAllNamespaces() const; 249 250 // Deletes the document identified by the given namespace and uri. The 251 // document proto will be erased immediately. 252 // 253 // NOTE: 254 // Space is not reclaimed for deleted documents until Optimize() is 255 // called. 256 // 257 // Returns: 258 // OK on success 259 // NOT_FOUND if no document exists with namespace, uri 260 // INTERNAL_ERROR on IO error 261 libtextclassifier3::Status Delete(std::string_view name_space, 262 std::string_view uri, 263 int64_t current_time_ms); 264 265 // Deletes the document identified by the given document_id. The document 266 // proto will be erased immediately. 267 // 268 // NOTE: 269 // Space is not reclaimed for deleted documents until Optimize() is 270 // called. 271 // 272 // Returns: 273 // OK on success 274 // NOT_FOUND if the document doesn't exist (i.e. 
deleted or expired) 275 // INTERNAL_ERROR on IO error 276 // INVALID_ARGUMENT if document_id is invalid. 277 libtextclassifier3::Status Delete(DocumentId document_id, 278 int64_t current_time_ms); 279 280 // Returns the NamespaceId of the string namespace 281 // 282 // Returns: 283 // NamespaceId on success 284 // NOT_FOUND if the namespace doesn't exist 285 // INTERNAL_ERROR on IO error 286 libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId( 287 std::string_view name_space) const; 288 289 // Helper method to find a DocumentId that is associated with the given 290 // namespace and uri. 291 // 292 // NOTE: if succeeded, it always returns a valid DocumentId, but this 293 // DocumentId may refer to a invalid document (deleted or expired). Callers 294 // can call GetAliveDocumentFilterData(document_id, current_time_ms) and check 295 // the return value to ensure it refers to an alive Document. 296 // 297 // Returns: 298 // A DocumentId on success 299 // NOT_FOUND if the key doesn't exist 300 // INTERNAL_ERROR on IO error 301 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 302 std::string_view name_space, std::string_view uri) const; 303 304 // Helper method to find a DocumentId that is associated with the given 305 // NamespaceIdFingerprint. 306 // 307 // NOTE: if succeeded, it always returns a valid DocumentId, but this 308 // DocumentId may refer to a invalid document (deleted or expired). Callers 309 // can call GetAliveDocumentFilterData(document_id, current_time_ms) and check 310 // the return value to ensure it refers to an alive Document. 311 // 312 // Returns: 313 // A DocumentId on success 314 // NOT_FOUND if the key doesn't exist 315 // INTERNAL_ERROR on IO error 316 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 317 const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const; 318 319 // Returns the CorpusId associated with the given namespace and schema. 
320 // 321 // Returns: 322 // A CorpusId on success 323 // NOT_FOUND if the key doesn't exist 324 // INTERNAL_ERROR on IO error 325 libtextclassifier3::StatusOr<CorpusId> GetCorpusId( 326 const std::string_view name_space, const std::string_view schema) const; 327 328 // Returns the ResultGroupingEntryId associated with the given namespace 329 // and schema. 330 // 331 // NOTE: ResultGroupingEntryIds that are generated by calls with different 332 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 333 // are only guarenteed to be unique within their own ResultGroupingType. 334 // 335 // Returns: 336 // A ResultGroupingEntryId on success 337 // NOT_FOUND if the key doesn't exist 338 // INTERNAL_ERROR on IO error 339 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 340 ResultSpecProto::ResultGroupingType result_group_type, 341 const std::string_view name_space, const std::string_view schema) const; 342 343 // Returns the ResultGrouping Entry Id associated with the given NamespaceId 344 // and SchemaTypeId 345 // 346 // NOTE: ResultGroupingEntryIds that are generated by calls with different 347 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 348 // are only guarenteed to be unique within their own ResultGroupingType. 349 // 350 // Returns: 351 // A ResultGroupingEntryId on success 352 // NOT_FOUND if the key doesn't exist 353 // INTERNAL_ERROR on IO error 354 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 355 ResultSpecProto::ResultGroupingType result_group_type, 356 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const; 357 358 // Returns the DocumentAssociatedScoreData of the document specified by the 359 // DocumentId. 
360 // 361 // Returns: 362 // DocumentAssociatedScoreData on success 363 // NOT_FOUND if the document or the score data is not found 364 libtextclassifier3::StatusOr<DocumentAssociatedScoreData> 365 GetDocumentAssociatedScoreData(DocumentId document_id) const; 366 367 // Returns the CorpusAssociatedScoreData of the corpus specified by the 368 // corpus_id. 369 // 370 // NOTE: This does not check if the corpus exists and will return the 371 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 372 // that corpus have been deleted. 373 // 374 // Returns: 375 // CorpusAssociatedScoreData on success 376 // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen 377 // CorpusIds 378 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 379 GetCorpusAssociatedScoreData(CorpusId corpus_id) const; 380 381 // Gets the document filter data if a document exists and is not expired. 382 // Otherwise, will get a false optional. 383 // 384 // Existence means it hasn't been deleted and it hasn't expired yet. 385 // 386 // Returns: 387 // True:DocumentFilterData if the given document exists. 388 // False if the given document doesn't exist. 389 std::optional<DocumentFilterData> GetAliveDocumentFilterData( 390 DocumentId document_id, int64_t current_time_ms) const; 391 392 // Gets the document filter data if a document has not been deleted. If the 393 // document is expired but not deleted, will still return a valid document 394 // filter data. Otherwise, will get a false optional. 395 // 396 // Returns: 397 // True:DocumentFilterData if the given document exists. 398 // False if the given document has been deleted. 399 std::optional<DocumentFilterData> GetNonDeletedDocumentFilterData( 400 DocumentId document_id) const; 401 402 // Gets the SchemaTypeId of a document. 403 // 404 // Returns: 405 // SchemaTypeId on success 406 // kInvalidSchemaTypeId if the document is deleted or expired. 
GetSchemaTypeId(DocumentId document_id,int64_t current_time_ms)407 SchemaTypeId GetSchemaTypeId(DocumentId document_id, 408 int64_t current_time_ms) const { 409 std::optional<DocumentFilterData> document_filter_data_optional = 410 GetAliveDocumentFilterData(document_id, current_time_ms); 411 if (document_filter_data_optional) { 412 return document_filter_data_optional.value().schema_type_id(); 413 } else { 414 return kInvalidSchemaTypeId; 415 } 416 } 417 418 // Gets the usage scores of a document. 419 // 420 // Returns: 421 // UsageScores on success 422 // nullopt if there are no usage scores stored for the requested docid. 423 std::optional<UsageStore::UsageScores> GetUsageScores( 424 DocumentId document_id, int64_t current_time_ms) const; 425 426 // Reports usage. The corresponding usage scores of the specified document in 427 // the report will be updated. 428 // 429 // Returns: 430 // OK on success 431 // NOT_FOUND if the [namesapce + uri] key in the report doesn't exist 432 // INTERNAL_ERROR on I/O errors. 433 libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); 434 435 // Deletes all documents belonging to the given namespace. The documents will 436 // be erased immediately. 437 // 438 // NOTE: 439 // Space is not reclaimed for deleted documents until Optimize() is 440 // called. 441 // 442 // Returns: 443 // OK on success 444 // NOT_FOUND if namespace doesn't exist 445 // INTERNAL_ERROR on IO error 446 DeleteByGroupResult DeleteByNamespace(std::string_view name_space); 447 448 // Deletes all documents belonging to the given schema type. The documents 449 // will be erased immediately. 450 // 451 // NOTE: 452 // Space is not reclaimed for deleted documents until Optimize() is 453 // called. 
454 // 455 // Returns: 456 // OK on success 457 // NOT_FOUND if schema_type doesn't exist 458 // INTERNAL_ERROR on IO error 459 DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); 460 461 // Syncs all the data and metadata changes to disk. 462 // 463 // Returns: 464 // OK on success 465 // INTERNAL on I/O error 466 libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); 467 468 // Calculates the StorageInfo for the Document Store. 469 // 470 // If an IO error occurs while trying to calculate the value for a field, then 471 // that field will be set to -1. 472 DocumentStorageInfoProto GetStorageInfo() const; 473 474 // Update any derived data off of the SchemaStore with the new SchemaStore. 475 // This may include pointers, SchemaTypeIds, etc. 476 // 477 // NOTE: This function may delete documents. A document may be invalidated by 478 // the new SchemaStore, such as failing validation or having its schema type 479 // deleted from the schema. 480 // 481 // This is best used if the caller is unsure about what's changed in the 482 // SchemaStore, and wants to update all information no matter what. If the 483 // caller does know what has changed, then it's recommended to call 484 // OptimizedUpdateSchemaStore. 485 // 486 // Returns; 487 // OK on success 488 // INTERNAL_ERROR on IO error 489 libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store); 490 491 // Performs the same funtionality as UpdateSchemaStore, but this can be more 492 // optimized in terms of less disk reads and less work if we know exactly 493 // what's changed between the old and new SchemaStore. 494 // 495 // Returns; 496 // OK on success 497 // INTERNAL_ERROR on IO error 498 libtextclassifier3::Status OptimizedUpdateSchemaStore( 499 const SchemaStore* schema_store, 500 const SchemaStore::SetSchemaResult& set_schema_result); 501 502 // Re-generates the scorable property cache for documents with the given 503 // schema types. 
504 // 505 // Returns: 506 // OK on success 507 // INTERNAL_ERROR on IO error 508 libtextclassifier3::Status RegenerateScorablePropertyCache( 509 const std::unordered_set<SchemaTypeId>& schema_type_ids); 510 511 // Reduces internal file sizes by reclaiming space of deleted documents and 512 // regenerating derived files. 513 // 514 // NOTE: The tasks in this method are too expensive to be executed in 515 // real-time. The caller should decide how frequently and when to call this 516 // method based on device usage. 517 // 518 // Returns: 519 // OK on success 520 // INTERNAL_ERROR on IO error 521 libtextclassifier3::Status Optimize(); 522 523 struct OptimizeResult { 524 // A vector that maps old document id to new document id. 525 std::vector<DocumentId> document_id_old_to_new; 526 527 // A vector that maps old namespace id to new namespace id. Will be empty if 528 // should_rebuild_index is set to true. 529 std::vector<NamespaceId> namespace_id_old_to_new; 530 531 // A boolean flag that hints the caller (usually IcingSearchEngine) if it 532 // should rebuild index instead of adopting the id changes via the 2 vectors 533 // above. It will be set to true if finding any id inconsistency. 534 bool should_rebuild_index = false; 535 536 // A set of blob handles that are dead and need to be removed. 537 std::unordered_set<std::string> dead_blob_handles; 538 }; 539 // Copy data from current base directory into a new directory. Any outdated or 540 // deleted data won't be copied. During the process, document/namespace ids 541 // will be reassigned so any files / classes that are based on old 542 // document/namespace ids may be outdated. 543 // 544 // stats will be set if non-null. 545 // 546 // NOTE: The tasks in this method are too expensive to be executed in 547 // real-time. The caller should decide how frequently and when to call this 548 // method based on device usage. 
549 // 550 // Returns: 551 // OptimizeResult which contains a vector mapping from old document id to 552 // new document id and another vector mapping from old namespace id to new 553 // namespace id, on success 554 // INVALID_ARGUMENT if new_directory is same as current base directory 555 // INTERNAL_ERROR on IO error 556 libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto( 557 const std::string& new_directory, const LanguageSegmenter* lang_segmenter, 558 std::unordered_set<std::string>&& expired_blob_handles, 559 OptimizeStatsProto* stats = nullptr) const; 560 561 // Calculates status for a potential Optimize call. Includes how many docs 562 // there are vs how many would be optimized away. And also includes an 563 // estimated size gains, in bytes, if Optimize were called. 564 // 565 // Returns: 566 // OptimizeInfo on success 567 // INTERNAL_ERROR on IO error 568 libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; 569 570 // Update, replace and persist the header file. Creates the header file if it 571 // doesn't exist. 572 // 573 // Returns: 574 // OK on success 575 // INTERNAL on I/O error 576 libtextclassifier3::StatusOr<Crc32> UpdateChecksum(); 577 578 // Calculates and returns the checksum of the document store. 579 // 580 // Returns: 581 // OK on success 582 // INTERNAL on I/O error 583 libtextclassifier3::StatusOr<Crc32> GetChecksum() const; 584 585 // Get debug information for the document store. 586 // verbosity <= 0, simplest debug information 587 // verbosity > 0, also return the total number of documents and tokens in each 588 // (namespace, schema type) pair. 589 // 590 // Returns: 591 // DocumentDebugInfoProto on success 592 // INTERNAL_ERROR on IO errors, crc compute error 593 libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( 594 int verbosity) const; 595 596 private: 597 // Use DocumentStore::Create() to instantiate. 
598 explicit DocumentStore(const Filesystem* filesystem, 599 std::string_view base_dir, const Clock* clock, 600 const SchemaStore* schema_store, 601 const FeatureFlags* feature_flags, 602 bool pre_mapping_fbv, bool use_persistent_hash_map, 603 int32_t compression_level); 604 605 const Filesystem* const filesystem_; 606 const std::string base_dir_; 607 const Clock& clock_; 608 const FeatureFlags& feature_flags_; // Does not own. 609 610 // Handles the ground truth schema and all of the derived data off of the 611 // schema 612 const SchemaStore* schema_store_; 613 614 // Used to validate incoming documents 615 DocumentValidator document_validator_; 616 617 // Flag indicating whether memory map max possible file size for underlying 618 // FileBackedVector before growing the actual file size. 619 bool pre_mapping_fbv_; 620 621 // Flag indicating whether use persistent hash map as the key mapper (if 622 // false, then fall back to dynamic trie key mapper). Note: we only use 623 // persistent hash map for uri mapper if it is true. 624 bool use_persistent_hash_map_; 625 626 const int32_t compression_level_; 627 628 // A log used to store all documents, it serves as a ground truth of doc 629 // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 630 std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; 631 632 // Key (namespace + uri) to DocumentId mapping 633 std::unique_ptr< 634 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>> 635 document_key_mapper_; 636 637 // DocumentId to file offset mapping 638 std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_; 639 640 // A cache of document associated scores. The ground truth of the scores is 641 // DocumentProto stored in document_log_. 
This cache contains: 642 // - CorpusId 643 // - Document score 644 // - Document creation timestamp in seconds 645 // - Document length in number of tokens 646 // - Index of the ScorablePropertySetProto at the scorable_property_cache_ 647 std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; 648 649 // A cache of document scorable properties. The ground truth of the data is 650 // DocumentProto stored in document_log_. 651 std::unique_ptr<MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>> 652 scorable_property_cache_; 653 654 // A cache of data, indexed by DocumentId, used to filter documents. Currently 655 // contains: 656 // - NamespaceId 657 // - SchemaTypeId 658 // - Expiration timestamp in seconds 659 std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; 660 661 // A cache of corpus associated scores. The ground truth of the scores is 662 // DocumentProto stored in document_log_. This cache contains: 663 // - Number of documents belonging to the corpus score 664 // - The sum of the documents' lengths, in number of tokens. 665 std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> 666 corpus_score_cache_; 667 668 // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an 669 // id when the first document belonging to that namespace is added to the 670 // DocumentStore. Namespaces may be removed from the mapper during compaction. 671 std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_; 672 673 // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned 674 // unique id. A coprus is assigned an 675 // id when the first document belonging to that corpus is added to the 676 // DocumentStore. Corpus ids may be removed from the mapper during compaction. 677 std::unique_ptr< 678 KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>> 679 corpus_mapper_; 680 681 // A storage class that caches all usage scores. Usage scores are not 682 // considered as ground truth. 
Usage scores are associated with document ids 683 // so they need to be updated when document ids change. 684 std::unique_ptr<UsageStore> usage_store_; 685 686 // Used internally to indicate whether the class has been initialized. This is 687 // to guard against cases where the object has been created, but Initialize 688 // fails in the constructor. If we have successfully exited the constructor, 689 // then this field can be ignored. Clients of DocumentStore should not need to 690 // worry about this field. 691 bool initialized_ = false; 692 693 struct InitializeResult { 694 DataLoss data_loss; 695 696 // A boolean flag indicating if derived files of the document store have 697 // been regenerated or not. This is usually a signal for callers to detect 698 // if any id assignment has changed (e.g. NamespaceId). 699 bool derived_files_regenerated; 700 }; 701 libtextclassifier3::StatusOr<InitializeResult> Initialize( 702 bool force_recovery_and_revalidate_documents, 703 InitializeStatsProto* initialize_stats); 704 705 // Creates sub-components and verifies the integrity of each sub-component. 706 // This assumes that the the underlying files already exist, and will return 707 // an error if it doesn't find what it's expecting. 708 // 709 // Returns an error if subcomponents failed to initialize successfully. 710 // INTERNAL_ERROR on IO error 711 libtextclassifier3::Status InitializeExistingDerivedFiles(); 712 713 // Re-generates all files derived from the ground truth: the document log. 714 // 715 // revalidate_documents=true will also cause each document to be revalidated 716 // the schema as it is read out of the document log. 717 // 718 // NOTE: if this function fails, the only thing we can do is to retry it until 719 // it succeeds or prevent the initialization of a DocumentStore. The 720 // DocumentStore object wouldn't work reliably if this fails. 721 // 722 // Steps: 723 // 1. Delete all derived files. 724 // 2. 
Iterate through document log, put data into new key mapper and 725 // document_id 726 // mapper. 727 // 3. Create header and store the updated combined checksum 728 libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); 729 730 // Resets the unique_ptr to the document_key_mapper, deletes the underlying 731 // file, and re-creates a new instance of the document_key_mapper . 732 // 733 // Returns OK or any IO errors. 734 libtextclassifier3::Status ResetDocumentKeyMapper(); 735 736 // Resets the unique_ptr to the document_id_mapper, deletes the underlying 737 // file, and re-creates a new instance of the document_id_mapper. 738 // 739 // Returns OK or any IO errors. 740 libtextclassifier3::Status ResetDocumentIdMapper(); 741 742 // Resets the unique_ptr to the score_cache, deletes the underlying file, and 743 // re-creates a new instance of the score_cache. 744 // 745 // Returns OK or any IO errors. 746 libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); 747 748 // Resets the unique_ptr to the |scorable_property_cache_|, deletes the 749 // underlying file, and re-creates a new instance of it. 750 // 751 // Returns OK or any IO errors. 752 libtextclassifier3::Status ResetScorablePropertyCache(); 753 754 // Resets the unique_ptr to the corpus_score_cache, deletes the underlying 755 // file, and re-creates a new instance of the corpus_score_cache. 756 // 757 // Returns OK or any IO errors. 758 libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); 759 760 // Resets the unique_ptr to the filter_cache, deletes the underlying file, and 761 // re-creates a new instance of the filter_cache. 762 // 763 // Returns OK or any IO errors. 764 libtextclassifier3::Status ResetFilterCache(); 765 766 // Resets the unique_ptr to the namespace_mapper, deletes the underlying file, 767 // and re-creates a new instance of the namespace_mapper. 768 // 769 // Returns OK or any IO errors. 
770 libtextclassifier3::Status ResetNamespaceMapper(); 771 772 // Resets the unique_ptr to the corpus_mapper, deletes the underlying file, 773 // and re-creates a new instance of the corpus_mapper. 774 // 775 // Returns OK or any IO errors. 776 libtextclassifier3::Status ResetCorpusMapper(); 777 778 // Checks if the header exists already. This does not create the header file 779 // if it doesn't exist. 780 bool HeaderExists(); 781 782 libtextclassifier3::StatusOr<PutResult> InternalPut( 783 DocumentProto&& document, 784 PutDocumentStatsProto* put_document_stats = nullptr); 785 786 // Helper function to do batch deletes. Documents with the given 787 // "namespace_id" and "schema_type_id" will be deleted. If callers don't need 788 // to specify the namespace or schema type, pass in kInvalidNamespaceId or 789 // kInvalidSchemaTypeId, respectively. The document protos with their derived data will be 790 // erased / cleared immediately. 791 // 792 // NOTE: Space is not reclaimed in the derived files until Optimize() is 793 // called. 794 // 795 // Returns: 796 // Number of documents that were actually updated to be deleted 797 // INTERNAL_ERROR on IO error 798 libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, 799 SchemaTypeId schema_type_id); 800 801 // Returns the CorpusAssociatedScoreData of the corpus specified by the 802 // corpus_id. 803 // 804 // If the corpus_id has never been seen before, it returns a 805 // CorpusAssociatedScoreData with properties set to default values. 806 // 807 // NOTE: This does not check if the corpus exists and will return the 808 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 809 // that corpus have been deleted. 810 // 811 // Returns: 812 // CorpusAssociatedScoreData on success 813 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 814 GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; 815 816 // Checks if a document has been deleted.
817 // 818 // This is for internal-use only because we assume that the document_id is 819 // already valid. 820 bool IsDeleted(DocumentId document_id) const; 821 822 // Checks if a document has expired. 823 // 824 // This is for internal-use only because we assume that the document_id is 825 // already valid. 826 // 827 // Returns: 828 // DocumentFilterData if the given document isn't expired. 829 // std::nullopt if the given document is expired. 830 std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData( 831 DocumentId document_id, int64_t current_time_ms) const; 832 833 // Updates the entry in the score cache for document_id. 834 libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( 835 DocumentId document_id, const DocumentAssociatedScoreData& score_data); 836 837 // Updates the entry in the corpus score cache for corpus_id. 838 libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( 839 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); 840 841 // Updates the entry in the filter cache for document_id. 842 libtextclassifier3::Status UpdateFilterCache( 843 DocumentId document_id, const DocumentFilterData& filter_data); 844 845 // Helper method to clear the derived data of a document 846 libtextclassifier3::Status ClearDerivedData(DocumentId document_id); 847 848 // Sets usage scores for the given document. 849 libtextclassifier3::Status SetUsageScores( 850 DocumentId document_id, const UsageStore::UsageScores& usage_scores); 851 852 // Returns: 853 // - on success, a DocumentStorageInfoProto with the fields relating to the 854 // size of Document Store member variables populated. 855 // - INTERNAL on failure to get file size. NOTE(review): the declared return type is a plain DocumentStorageInfoProto, not a StatusOr, so this error cannot be reported through it - confirm the intended behavior. 856 DocumentStorageInfoProto GetMemberStorageInfo() const; 857 858 // Returns: 859 // - on success, the storage_info that was passed in but with the number of 860 // alive, deleted and expired documents also set. 861 // - OUT_OF_RANGE, this should never happen.
This could only be returned if 862 // the document_id_mapper somehow became larger than the filter cache. NOTE(review): the declared return type is a plain DocumentStorageInfoProto, not a StatusOr, so OUT_OF_RANGE cannot be reported through it - confirm the intended behavior. 863 DocumentStorageInfoProto CalculateDocumentStatusCounts( 864 DocumentStorageInfoProto storage_info) const; 865 866 // Returns: 867 // - on success, a RepeatedPtrField of the CorpusInfo collected. 868 // - OUT_OF_RANGE, this should never happen. 869 libtextclassifier3::StatusOr< 870 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> 871 CollectCorpusInfo() const; 872 873 // Extracts the ScorablePropertySetProto from the |document| and adds it to 874 // the |scorable_property_cache_|. 875 // 876 // Returns: 877 // - Index of the newly inserted ScorablePropertySetProto in the 878 // |scorable_property_cache_|. 879 // - kInvalidScorablePropertyCacheIndex if the schema contains no 880 // scorable properties. 881 // - INVALID_ARGUMENT if |schema_type_id| is invalid, or the converted 882 // ScorablePropertySetProto exceeds the size limit of 16MiB. 883 // - INTERNAL_ERROR on IO error. 884 libtextclassifier3::StatusOr<int> UpdateScorablePropertyCache( 885 const DocumentProto& document, SchemaTypeId schema_type_id); 886 }; 887 888 } // namespace lib 889 } // namespace icing 890 891 #endif // ICING_STORE_DOCUMENT_STORE_H_ 892