1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_DOCUMENT_STORE_H_ 16 #define ICING_STORE_DOCUMENT_STORE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <vector> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/file/file-backed-proto-log.h" 27 #include "icing/file/file-backed-vector.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/file/portable-file-backed-proto-log.h" 30 #include "icing/proto/debug.pb.h" 31 #include "icing/proto/document.pb.h" 32 #include "icing/proto/document_wrapper.pb.h" 33 #include "icing/proto/logging.pb.h" 34 #include "icing/proto/optimize.pb.h" 35 #include "icing/proto/persist.pb.h" 36 #include "icing/proto/search.pb.h" 37 #include "icing/proto/storage.pb.h" 38 #include "icing/proto/usage.pb.h" 39 #include "icing/schema/schema-store.h" 40 #include "icing/store/corpus-associated-scoring-data.h" 41 #include "icing/store/corpus-id.h" 42 #include "icing/store/document-associated-score-data.h" 43 #include "icing/store/document-filter-data.h" 44 #include "icing/store/document-id.h" 45 #include "icing/store/key-mapper.h" 46 #include "icing/store/namespace-fingerprint-identifier.h" 47 #include "icing/store/namespace-id.h" 48 #include "icing/store/usage-store.h" 49 #include "icing/tokenization/language-segmenter.h" 50 #include "icing/util/clock.h" 51 #include "icing/util/crc32.h" 52 #include "icing/util/data-loss.h" 53 #include "icing/util/document-validator.h" 54 #include "icing/util/fingerprint-util.h" 55 56 namespace icing { 57 namespace lib { 58 59 // Provides storage interfaces for documents. 60 class DocumentStore { 61 public: 62 struct Header { GetCurrentMagicHeader63 static int32_t GetCurrentMagic(bool namespace_id_fingerprint) { 64 return namespace_id_fingerprint ? kNewMagic : kOldMagic; 65 } 66 67 // Holds the magic as a quick sanity check against file corruption. 68 int32_t magic; 69 70 // Checksum of the DocumentStore's sub-component's checksums. 71 uint32_t checksum; 72 73 private: 74 static constexpr int32_t kOldMagic = 0x746f7265; 75 static constexpr int32_t kNewMagic = 0x1b99c8b0; 76 }; 77 78 struct OptimizeInfo { 79 // The estimated size in bytes of the optimizable docs. We don't track the 80 // size of each document, so we estimate by taking the size of the entire 81 // DocumentStore and dividing that by the total number of documents we have. 82 // So we end up with an average document size. 83 int64_t estimated_optimizable_bytes = 0; 84 85 // Number of total documents the DocumentStore tracks. 86 int32_t total_docs = 0; 87 88 // Number of optimizable (deleted + expired) docs the DocumentStore tracks. 89 int32_t optimizable_docs = 0; 90 }; 91 92 struct DeleteByGroupResult { 93 // Status representing whether or not the operation succeeded. See the 94 // comments above the function that returns this result to determine what 95 // possible statuses could be returned. 96 libtextclassifier3::Status status; 97 98 int num_docs_deleted = 0; 99 }; 100 101 struct CreateResult { 102 // A successfully initialized document store. 103 std::unique_ptr<DocumentStore> document_store; 104 105 // The data status after initializing from a previous state. Data loss can 106 // happen if the file is corrupted or some previously added data was 107 // unpersisted. This may be used to signal that any derived data off of the 108 // document store may need to be regenerated. 109 DataLoss data_loss; 110 111 // A boolean flag indicating if derived files of the document store have 112 // been regenerated or not. This is usually a signal for callers to detect 113 // if any id assignment has changed (e.g. NamespaceId). 114 bool derived_files_regenerated; 115 }; 116 117 // Not copyable 118 DocumentStore(const DocumentStore&) = delete; 119 DocumentStore& operator=(const DocumentStore&) = delete; 120 121 // Persists and updates checksum of subcomponents. 122 ~DocumentStore(); 123 124 // Factory method to create, initialize, and return a DocumentStore. The base 125 // directory is used to persist document store files. If document store was 126 // previously initialized with this directory, it will reload the files saved 127 // by the last instance. 128 // 129 // force_recovery_and_revalidate_documents=true will pre-emptively throw out 130 // the derived files and validate each document while recreating them. This 131 // can be used to indicate that the schema (and type ids) may have changed and 132 // those changes might not have been applied to the document store. 133 // 134 // If initialize_stats is present, the fields related to DocumentStore will be 135 // populated. 136 // 137 // Does not take any ownership, and all pointers except initialize_stats must 138 // refer to valid objects that outlive the one constructed. 139 // 140 // TODO(cassiewang): Consider returning a status indicating that derived files 141 // were regenerated. This may be helpful in logs. 142 // 143 // Returns: 144 // A DocumentStore::CreateResult on success 145 // FAILED_PRECONDITION on any null pointer input 146 // INTERNAL_ERROR on IO error 147 static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( 148 const Filesystem* filesystem, const std::string& base_dir, 149 const Clock* clock, const SchemaStore* schema_store, 150 bool force_recovery_and_revalidate_documents, 151 bool namespace_id_fingerprint, bool pre_mapping_fbv, 152 bool use_persistent_hash_map, int32_t compression_level, 153 InitializeStatsProto* initialize_stats); 154 155 // Discards all derived data in the document store. 156 // 157 // Returns: 158 // OK on success or nothing to discard 159 // INTERNAL_ERROR on any I/O errors 160 static libtextclassifier3::Status DiscardDerivedFiles( 161 const Filesystem* filesystem, const std::string& base_dir); 162 163 // Returns the maximum DocumentId that the DocumentStore has assigned. If 164 // there has not been any DocumentIds assigned, i.e. the DocumentStore is 165 // empty, then kInvalidDocumentId is returned. This does not filter out 166 // DocumentIds of deleted or expired documents. last_added_document_id()167 DocumentId last_added_document_id() const { 168 if (document_id_mapper_->num_elements() == 0) { 169 return kInvalidDocumentId; 170 } 171 return document_id_mapper_->num_elements() - 1; 172 } 173 174 // Returns the number of documents. The result does not filter out DocumentIds 175 // of deleted or expired documents. num_documents()176 int num_documents() const { return document_id_mapper_->num_elements(); } 177 178 // Puts the document into document store. 179 // 180 // If put_document_stats is present, the fields related to DocumentStore will 181 // be populated. 182 // 183 // Returns: 184 // A newly generated document id on success 185 // RESOURCE_EXHAUSED if exceeds maximum number of allowed documents 186 // FAILED_PRECONDITION if schema hasn't been set yet 187 // NOT_FOUND if the schema_type or a property config of the document doesn't 188 // exist in schema 189 // INTERNAL_ERROR on IO error 190 libtextclassifier3::StatusOr<DocumentId> Put( 191 const DocumentProto& document, int32_t num_tokens = 0, 192 PutDocumentStatsProto* put_document_stats = nullptr); 193 libtextclassifier3::StatusOr<DocumentId> Put( 194 DocumentProto&& document, int32_t num_tokens = 0, 195 PutDocumentStatsProto* put_document_stats = nullptr); 196 197 // Finds and returns the document identified by the given key (namespace + 198 // uri). If 'clear_internal_fields' is true, document level data that's 199 // generated internally by DocumentStore is cleared. 200 // 201 // Returns: 202 // The document found on success 203 // NOT_FOUND if the key doesn't exist or document has been deleted 204 // INTERNAL_ERROR on IO error 205 libtextclassifier3::StatusOr<DocumentProto> Get( 206 std::string_view name_space, std::string_view uri, 207 bool clear_internal_fields = true) const; 208 209 // Finds and returns the document identified by the given document id. If 210 // 'clear_internal_fields' is true, document level data that's generated 211 // internally by DocumentStore is cleared. 212 // 213 // Returns: 214 // The document found on success 215 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 216 // maximum value 217 // NOT_FOUND if the document doesn't exist or has been deleted 218 // INTERNAL_ERROR on IO error 219 libtextclassifier3::StatusOr<DocumentProto> Get( 220 DocumentId document_id, bool clear_internal_fields = true) const; 221 222 // Returns all namespaces which have at least 1 active document (not deleted 223 // or expired). Order of namespaces is undefined. 224 std::vector<std::string> GetAllNamespaces() const; 225 226 // Deletes the document identified by the given namespace and uri. The 227 // document proto will be erased immediately. 228 // 229 // NOTE: 230 // Space is not reclaimed for deleted documents until Optimize() is 231 // called. 232 // 233 // Returns: 234 // OK on success 235 // NOT_FOUND if no document exists with namespace, uri 236 // INTERNAL_ERROR on IO error 237 libtextclassifier3::Status Delete(std::string_view name_space, 238 std::string_view uri, 239 int64_t current_time_ms); 240 241 // Deletes the document identified by the given document_id. The document 242 // proto will be erased immediately. 243 // 244 // NOTE: 245 // Space is not reclaimed for deleted documents until Optimize() is 246 // called. 247 // 248 // Returns: 249 // OK on success 250 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 251 // INTERNAL_ERROR on IO error 252 // INVALID_ARGUMENT if document_id is invalid. 253 libtextclassifier3::Status Delete(DocumentId document_id, 254 int64_t current_time_ms); 255 256 // Returns the NamespaceId of the string namespace 257 // 258 // Returns: 259 // NamespaceId on success 260 // NOT_FOUND if the namespace doesn't exist 261 // INTERNAL_ERROR on IO error 262 libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId( 263 std::string_view name_space) const; 264 265 // Helper method to find a DocumentId that is associated with the given 266 // namespace and uri. 267 // 268 // NOTE: The DocumentId may refer to a invalid document (deleted 269 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 270 // refers to a valid Document. 271 // 272 // Returns: 273 // A DocumentId on success 274 // NOT_FOUND if the key doesn't exist 275 // INTERNAL_ERROR on IO error 276 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 277 std::string_view name_space, std::string_view uri) const; 278 279 // Helper method to find a DocumentId that is associated with the given 280 // NamespaceFingerprintIdentifier. 281 // 282 // NOTE: The DocumentId may refer to a invalid document (deleted 283 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 284 // refers to a valid Document. 285 // 286 // Returns: 287 // A DocumentId on success 288 // NOT_FOUND if the key doesn't exist 289 // INTERNAL_ERROR on IO error 290 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 291 const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier) 292 const; 293 294 // Returns the CorpusId associated with the given namespace and schema. 295 // 296 // Returns: 297 // A CorpusId on success 298 // NOT_FOUND if the key doesn't exist 299 // INTERNAL_ERROR on IO error 300 libtextclassifier3::StatusOr<CorpusId> GetCorpusId( 301 const std::string_view name_space, const std::string_view schema) const; 302 303 // Returns the ResultGroupingEntryId associated with the given namespace 304 // and schema. 305 // 306 // NOTE: ResultGroupingEntryIds that are generated by calls with different 307 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 308 // are only guarenteed to be unique within their own ResultGroupingType. 309 // 310 // Returns: 311 // A ResultGroupingEntryId on success 312 // NOT_FOUND if the key doesn't exist 313 // INTERNAL_ERROR on IO error 314 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 315 ResultSpecProto::ResultGroupingType result_group_type, 316 const std::string_view name_space, const std::string_view schema) const; 317 318 // Returns the ResultGrouping Entry Id associated with the given NamespaceId 319 // and SchemaTypeId 320 // 321 // NOTE: ResultGroupingEntryIds that are generated by calls with different 322 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 323 // are only guarenteed to be unique within their own ResultGroupingType. 324 // 325 // Returns: 326 // A ResultGroupingEntryId on success 327 // NOT_FOUND if the key doesn't exist 328 // INTERNAL_ERROR on IO error 329 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 330 ResultSpecProto::ResultGroupingType result_group_type, 331 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const; 332 333 // Returns the DocumentAssociatedScoreData of the document specified by the 334 // DocumentId. 335 // 336 // Returns: 337 // DocumentAssociatedScoreData on success 338 // NOT_FOUND if the document or the score data is not found 339 libtextclassifier3::StatusOr<DocumentAssociatedScoreData> 340 GetDocumentAssociatedScoreData(DocumentId document_id) const; 341 342 // Returns the CorpusAssociatedScoreData of the corpus specified by the 343 // corpus_id. 344 // 345 // NOTE: This does not check if the corpus exists and will return the 346 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 347 // that corpus have been deleted. 348 // 349 // Returns: 350 // CorpusAssociatedScoreData on success 351 // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen 352 // CorpusIds 353 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 354 GetCorpusAssociatedScoreData(CorpusId corpus_id) const; 355 356 // Gets the document filter data if a document exists. Otherwise, will get a 357 // false optional. 358 // 359 // Existence means it hasn't been deleted and it hasn't expired yet. 360 // 361 // Returns: 362 // True:DocumentFilterData if the given document exists. 363 // False if the given document doesn't exist. 364 std::optional<DocumentFilterData> GetAliveDocumentFilterData( 365 DocumentId document_id, int64_t current_time_ms) const; 366 367 // Gets the usage scores of a document. 368 // 369 // Returns: 370 // UsageScores on success 371 // nullopt if there are no usage scores stored for the requested docid. 372 std::optional<UsageStore::UsageScores> GetUsageScores( 373 DocumentId document_id, int64_t current_time_ms) const; 374 375 // Reports usage. The corresponding usage scores of the specified document in 376 // the report will be updated. 377 // 378 // Returns: 379 // OK on success 380 // NOT_FOUND if the [namesapce + uri] key in the report doesn't exist 381 // INTERNAL_ERROR on I/O errors. 382 libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); 383 384 // Deletes all documents belonging to the given namespace. The documents will 385 // be erased immediately. 386 // 387 // NOTE: 388 // Space is not reclaimed for deleted documents until Optimize() is 389 // called. 390 // 391 // Returns: 392 // OK on success 393 // NOT_FOUND if namespace doesn't exist 394 // INTERNAL_ERROR on IO error 395 DeleteByGroupResult DeleteByNamespace(std::string_view name_space); 396 397 // Deletes all documents belonging to the given schema type. The documents 398 // will be erased immediately. 399 // 400 // NOTE: 401 // Space is not reclaimed for deleted documents until Optimize() is 402 // called. 403 // 404 // Returns: 405 // OK on success 406 // NOT_FOUND if schema_type doesn't exist 407 // INTERNAL_ERROR on IO error 408 DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); 409 410 // Syncs all the data and metadata changes to disk. 411 // 412 // Returns: 413 // OK on success 414 // INTERNAL on I/O error 415 libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); 416 417 // Calculates the StorageInfo for the Document Store. 418 // 419 // If an IO error occurs while trying to calculate the value for a field, then 420 // that field will be set to -1. 421 DocumentStorageInfoProto GetStorageInfo() const; 422 423 // Update any derived data off of the SchemaStore with the new SchemaStore. 424 // This may include pointers, SchemaTypeIds, etc. 425 // 426 // NOTE: This function may delete documents. A document may be invalidated by 427 // the new SchemaStore, such as failing validation or having its schema type 428 // deleted from the schema. 429 // 430 // This is best used if the caller is unsure about what's changed in the 431 // SchemaStore, and wants to update all information no matter what. If the 432 // caller does know what has changed, then it's recommended to call 433 // OptimizedUpdateSchemaStore. 434 // 435 // Returns; 436 // OK on success 437 // INTERNAL_ERROR on IO error 438 libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store); 439 440 // Performs the same funtionality as UpdateSchemaStore, but this can be more 441 // optimized in terms of less disk reads and less work if we know exactly 442 // what's changed between the old and new SchemaStore. 443 // 444 // Returns; 445 // OK on success 446 // INTERNAL_ERROR on IO error 447 libtextclassifier3::Status OptimizedUpdateSchemaStore( 448 const SchemaStore* schema_store, 449 const SchemaStore::SetSchemaResult& set_schema_result); 450 451 // Reduces internal file sizes by reclaiming space of deleted documents and 452 // regenerating derived files. 453 // 454 // NOTE: The tasks in this method are too expensive to be executed in 455 // real-time. The caller should decide how frequently and when to call this 456 // method based on device usage. 457 // 458 // Returns: 459 // OK on success 460 // INTERNAL_ERROR on IO error 461 libtextclassifier3::Status Optimize(); 462 463 struct OptimizeResult { 464 // A vector that maps old document id to new document id. 465 std::vector<DocumentId> document_id_old_to_new; 466 467 // A vector that maps old namespace id to new namespace id. Will be empty if 468 // should_rebuild_index is set to true. 469 std::vector<NamespaceId> namespace_id_old_to_new; 470 471 // A boolean flag that hints the caller (usually IcingSearchEngine) if it 472 // should rebuild index instead of adopting the id changes via the 2 vectors 473 // above. It will be set to true if finding any id inconsistency. 474 bool should_rebuild_index = false; 475 }; 476 // Copy data from current base directory into a new directory. Any outdated or 477 // deleted data won't be copied. During the process, document/namespace ids 478 // will be reassigned so any files / classes that are based on old 479 // document/namespace ids may be outdated. 480 // 481 // stats will be set if non-null. 482 // 483 // NOTE: The tasks in this method are too expensive to be executed in 484 // real-time. The caller should decide how frequently and when to call this 485 // method based on device usage. 486 // 487 // Returns: 488 // OptimizeResult which contains a vector mapping from old document id to 489 // new document id and another vector mapping from old namespace id to new 490 // namespace id, on success 491 // INVALID_ARGUMENT if new_directory is same as current base directory 492 // INTERNAL_ERROR on IO error 493 libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto( 494 const std::string& new_directory, const LanguageSegmenter* lang_segmenter, 495 OptimizeStatsProto* stats = nullptr) const; 496 497 // Calculates status for a potential Optimize call. Includes how many docs 498 // there are vs how many would be optimized away. And also includes an 499 // estimated size gains, in bytes, if Optimize were called. 500 // 501 // Returns: 502 // OptimizeInfo on success 503 // INTERNAL_ERROR on IO error 504 libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; 505 506 // Computes the combined checksum of the document store - includes the ground 507 // truth and all derived files. 508 // 509 // Returns: 510 // Combined checksum on success 511 // INTERNAL_ERROR on compute error 512 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 513 514 // Get debug information for the document store. 515 // verbosity <= 0, simplest debug information 516 // verbosity > 0, also return the total number of documents and tokens in each 517 // (namespace, schema type) pair. 518 // 519 // Returns: 520 // DocumentDebugInfoProto on success 521 // INTERNAL_ERROR on IO errors, crc compute error 522 libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( 523 int verbosity) const; 524 525 private: 526 // Use DocumentStore::Create() to instantiate. 527 explicit DocumentStore(const Filesystem* filesystem, 528 std::string_view base_dir, const Clock* clock, 529 const SchemaStore* schema_store, 530 bool namespace_id_fingerprint, bool pre_mapping_fbv, 531 bool use_persistent_hash_map, 532 int32_t compression_level); 533 534 const Filesystem* const filesystem_; 535 const std::string base_dir_; 536 const Clock& clock_; 537 538 // Handles the ground truth schema and all of the derived data off of the 539 // schema 540 const SchemaStore* schema_store_; 541 542 // Used to validate incoming documents 543 DocumentValidator document_validator_; 544 545 // Whether to use namespace id or namespace name to build up fingerprint for 546 // document_key_mapper_ and corpus_mapper_. 547 bool namespace_id_fingerprint_; 548 549 // Flag indicating whether memory map max possible file size for underlying 550 // FileBackedVector before growing the actual file size. 551 bool pre_mapping_fbv_; 552 553 // Flag indicating whether use persistent hash map as the key mapper (if 554 // false, then fall back to dynamic trie key mapper). Note: we only use 555 // persistent hash map for uri mapper if it is true. 556 bool use_persistent_hash_map_; 557 558 const int32_t compression_level_; 559 560 // A log used to store all documents, it serves as a ground truth of doc 561 // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 562 std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; 563 564 // Key (namespace + uri) to DocumentId mapping 565 std::unique_ptr< 566 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>> 567 document_key_mapper_; 568 569 // DocumentId to file offset mapping 570 std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_; 571 572 // A cache of document associated scores. The ground truth of the scores is 573 // DocumentProto stored in document_log_. This cache contains: 574 // - CorpusId 575 // - Document score 576 // - Document creation timestamp in seconds 577 // - Document length in number of tokens 578 std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; 579 580 // A cache of data, indexed by DocumentId, used to filter documents. Currently 581 // contains: 582 // - NamespaceId 583 // - SchemaTypeId 584 // - Expiration timestamp in seconds 585 std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; 586 587 // A cache of corpus associated scores. The ground truth of the scores is 588 // DocumentProto stored in document_log_. This cache contains: 589 // - Number of documents belonging to the corpus score 590 // - The sum of the documents' lengths, in number of tokens. 591 std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> 592 corpus_score_cache_; 593 594 // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an 595 // id when the first document belonging to that namespace is added to the 596 // DocumentStore. Namespaces may be removed from the mapper during compaction. 597 std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_; 598 599 // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned 600 // unique id. A coprus is assigned an 601 // id when the first document belonging to that corpus is added to the 602 // DocumentStore. Corpus ids may be removed from the mapper during compaction. 603 std::unique_ptr< 604 KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>> 605 corpus_mapper_; 606 607 // A storage class that caches all usage scores. Usage scores are not 608 // considered as ground truth. Usage scores are associated with document ids 609 // so they need to be updated when document ids change. 610 std::unique_ptr<UsageStore> usage_store_; 611 612 // Used internally to indicate whether the class has been initialized. This is 613 // to guard against cases where the object has been created, but Initialize 614 // fails in the constructor. If we have successfully exited the constructor, 615 // then this field can be ignored. Clients of DocumentStore should not need to 616 // worry about this field. 617 bool initialized_ = false; 618 619 struct InitializeResult { 620 DataLoss data_loss; 621 622 // A boolean flag indicating if derived files of the document store have 623 // been regenerated or not. This is usually a signal for callers to detect 624 // if any id assignment has changed (e.g. NamespaceId). 625 bool derived_files_regenerated; 626 }; 627 libtextclassifier3::StatusOr<InitializeResult> Initialize( 628 bool force_recovery_and_revalidate_documents, 629 InitializeStatsProto* initialize_stats); 630 631 // Creates sub-components and verifies the integrity of each sub-component. 632 // This assumes that the the underlying files already exist, and will return 633 // an error if it doesn't find what it's expecting. 634 // 635 // Returns an error if subcomponents failed to initialize successfully. 636 // INTERNAL_ERROR on IO error 637 libtextclassifier3::Status InitializeExistingDerivedFiles(); 638 639 // Re-generates all files derived from the ground truth: the document log. 640 // 641 // revalidate_documents=true will also cause each document to be revalidated 642 // the schema as it is read out of the document log. 643 // 644 // NOTE: if this function fails, the only thing we can do is to retry it until 645 // it succeeds or prevent the initialization of a DocumentStore. The 646 // DocumentStore object wouldn't work reliably if this fails. 647 // 648 // Steps: 649 // 1. Delete all derived files. 650 // 2. Iterate through document log, put data into new key mapper and 651 // document_id 652 // mapper. 653 // 3. Create header and store the updated combined checksum 654 libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); 655 656 // Resets the unique_ptr to the document_key_mapper, deletes the underlying 657 // file, and re-creates a new instance of the document_key_mapper . 658 // 659 // Returns OK or any IO errors. 660 libtextclassifier3::Status ResetDocumentKeyMapper(); 661 662 // Resets the unique_ptr to the document_id_mapper, deletes the underlying 663 // file, and re-creates a new instance of the document_id_mapper. 664 // 665 // Returns OK or any IO errors. 666 libtextclassifier3::Status ResetDocumentIdMapper(); 667 668 // Resets the unique_ptr to the score_cache, deletes the underlying file, and 669 // re-creates a new instance of the score_cache. 670 // 671 // Returns OK or any IO errors. 672 libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); 673 674 // Resets the unique_ptr to the corpus_score_cache, deletes the underlying 675 // file, and re-creates a new instance of the corpus_score_cache. 676 // 677 // Returns OK or any IO errors. 678 libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); 679 680 // Resets the unique_ptr to the filter_cache, deletes the underlying file, and 681 // re-creates a new instance of the filter_cache. 682 // 683 // Returns OK or any IO errors. 684 libtextclassifier3::Status ResetFilterCache(); 685 686 // Resets the unique_ptr to the namespace_mapper, deletes the underlying file, 687 // and re-creates a new instance of the namespace_mapper. 688 // 689 // Returns OK or any IO errors. 690 libtextclassifier3::Status ResetNamespaceMapper(); 691 692 // Resets the unique_ptr to the corpus_mapper, deletes the underlying file, 693 // and re-creates a new instance of the corpus_mapper. 694 // 695 // Returns OK or any IO errors. 696 libtextclassifier3::Status ResetCorpusMapper(); 697 698 // Checks if the header exists already. This does not create the header file 699 // if it doesn't exist. 700 bool HeaderExists(); 701 702 // Update, replace and persist the header file. Creates the header file if it 703 // doesn't exist. 704 // 705 // Returns: 706 // OK on success 707 // INTERNAL on I/O error 708 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 709 710 libtextclassifier3::StatusOr<DocumentId> InternalPut( 711 DocumentProto&& document, 712 PutDocumentStatsProto* put_document_stats = nullptr); 713 714 // Helper function to do batch deletes. Documents with the given 715 // "namespace_id" and "schema_type_id" will be deleted. If callers don't need 716 // to specify the namespace or schema type, pass in kInvalidNamespaceId or 717 // kInvalidSchemaTypeId. The document protos with their derived data will be 718 // erased / cleared immediately. 719 // 720 // NOTE: Space is not reclaimed in the derived files until Optimize() is 721 // called. 722 // 723 // Returns: 724 // Number of documents that were actually updated to be deleted 725 // INTERNAL_ERROR on IO error 726 libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, 727 SchemaTypeId schema_type_id); 728 729 // Returns the CorpusAssociatedScoreData of the corpus specified by the 730 // corpus_id. 731 // 732 // If the corpus_id has never been seen before, it returns a 733 // CorpusAssociatedScoreData with properties set to default values. 734 // 735 // NOTE: This does not check if the corpus exists and will return the 736 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 737 // that corpus have been deleted. 738 // 739 // Returns: 740 // CorpusAssociatedScoreData on success 741 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 742 GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; 743 744 // Check if a document exists. Existence means it hasn't been deleted and it 745 // hasn't expired yet. 746 // 747 // Returns: 748 // OK if the document exists 749 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 750 // maximum value 751 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 752 // INTERNAL_ERROR on IO error 753 libtextclassifier3::Status DoesDocumentExistWithStatus( 754 DocumentId document_id) const; 755 756 // Checks if a document has been deleted 757 // 758 // This is for internal-use only because we assume that the document_id is 759 // already valid. If you're unsure if the document_id is valid, use 760 // DoesDocumentExist(document_id) instead, which will perform those additional 761 // checks. 762 bool IsDeleted(DocumentId document_id) const; 763 764 // Checks if a document has expired. 765 // 766 // This is for internal-use only because we assume that the document_id is 767 // already valid. If you're unsure if the document_id is valid, use 768 // DoesDocumentExist(document_id) instead, which will perform those additional 769 // checks. 770 771 // Returns: 772 // True:DocumentFilterData if the given document isn't expired. 773 // False if the given doesn't document is expired. 774 std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData( 775 DocumentId document_id, int64_t current_time_ms) const; 776 777 // Updates the entry in the score cache for document_id. 778 libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( 779 DocumentId document_id, const DocumentAssociatedScoreData& score_data); 780 781 // Updates the entry in the corpus score cache for corpus_id. 782 libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( 783 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); 784 785 // Updates the entry in the filter cache for document_id. 786 libtextclassifier3::Status UpdateFilterCache( 787 DocumentId document_id, const DocumentFilterData& filter_data); 788 789 // Helper method to clear the derived data of a document 790 libtextclassifier3::Status ClearDerivedData(DocumentId document_id); 791 792 // Sets usage scores for the given document. 793 libtextclassifier3::Status SetUsageScores( 794 DocumentId document_id, const UsageStore::UsageScores& usage_scores); 795 796 // Returns: 797 // - on success, a DocumentStorageInfoProto with the fields relating to the 798 // size of Document Store member variables populated. 799 // - INTERNAL on failure to get file size 800 DocumentStorageInfoProto GetMemberStorageInfo() const; 801 802 // Returns: 803 // - on success, the storage_info that was passed in but with the number of 804 // alive, deleted and expired documents also set. 805 // - OUT_OF_RANGE, this should never happen. This could only be returned if 806 // the document_id_mapper somehow became larger than the filter cache. 807 DocumentStorageInfoProto CalculateDocumentStatusCounts( 808 DocumentStorageInfoProto storage_info) const; 809 810 // Returns: 811 // - on success, a RepeatedPtrField for CorpusInfo collected. 812 // - OUT_OF_RANGE, this should never happen. 813 libtextclassifier3::StatusOr< 814 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> 815 CollectCorpusInfo() const; 816 817 // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_. 818 // Note that namespace_id_fingerprint_ controls the way that a fingerprint is 819 // built. 820 std::string MakeFingerprint(NamespaceId namespace_id, 821 std::string_view namespace_, 822 std::string_view uri_or_schema) const; 823 }; 824 825 } // namespace lib 826 } // namespace icing 827 828 #endif // ICING_STORE_DOCUMENT_STORE_H_ 829