1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_DOCUMENT_STORE_H_ 16 #define ICING_STORE_DOCUMENT_STORE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <vector> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/file/file-backed-proto-log.h" 27 #include "icing/file/file-backed-vector.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/file/portable-file-backed-proto-log.h" 30 #include "icing/proto/document.pb.h" 31 #include "icing/proto/document_wrapper.pb.h" 32 #include "icing/proto/logging.pb.h" 33 #include "icing/proto/optimize.pb.h" 34 #include "icing/proto/persist.pb.h" 35 #include "icing/proto/storage.pb.h" 36 #include "icing/schema/schema-store.h" 37 #include "icing/store/corpus-associated-scoring-data.h" 38 #include "icing/store/corpus-id.h" 39 #include "icing/store/document-associated-score-data.h" 40 #include "icing/store/document-filter-data.h" 41 #include "icing/store/document-id.h" 42 #include "icing/store/key-mapper.h" 43 #include "icing/store/namespace-id.h" 44 #include "icing/store/usage-store.h" 45 #include "icing/tokenization/language-segmenter.h" 46 #include "icing/util/clock.h" 47 #include "icing/util/crc32.h" 48 #include "icing/util/data-loss.h" 49 #include "icing/util/document-validator.h" 50 51 namespace icing { 52 namespace lib { 53 54 // Provides storage interfaces for documents. 55 class DocumentStore { 56 public: 57 struct Header { 58 static constexpr int32_t kMagic = 0x746f7265; 59 60 // Holds the magic as a quick sanity check against file corruption. 61 int32_t magic; 62 63 // Checksum of the DocumentStore's sub-component's checksums. 64 uint32_t checksum; 65 }; 66 67 struct OptimizeInfo { 68 // The estimated size in bytes of the optimizable docs. We don't track the 69 // size of each document, so we estimate by taking the size of the entire 70 // DocumentStore and dividing that by the total number of documents we have. 71 // So we end up with an average document size. 72 int64_t estimated_optimizable_bytes = 0; 73 74 // Number of total documents the DocumentStore tracks. 75 int32_t total_docs = 0; 76 77 // Number of optimizable (deleted + expired) docs the DocumentStore tracks. 78 int32_t optimizable_docs = 0; 79 }; 80 81 struct DeleteByGroupResult { 82 // Status representing whether or not the operation succeeded. See the 83 // comments above the function that returns this result to determine what 84 // possible statuses could be returned. 85 libtextclassifier3::Status status; 86 87 int num_docs_deleted = 0; 88 }; 89 90 struct CreateResult { 91 // A successfully initialized document store. 92 std::unique_ptr<DocumentStore> document_store; 93 94 // The data status after initializing from a previous state. Data loss can 95 // happen if the file is corrupted or some previously added data was 96 // unpersisted. This may be used to signal that any derived data off of the 97 // document store may need to be regenerated. 98 DataLoss data_loss; 99 }; 100 101 // Not copyable 102 DocumentStore(const DocumentStore&) = delete; 103 DocumentStore& operator=(const DocumentStore&) = delete; 104 105 // Persists and updates checksum of subcomponents. 106 ~DocumentStore(); 107 108 // Factory method to create, initialize, and return a DocumentStore. The base 109 // directory is used to persist document store files. If document store was 110 // previously initialized with this directory, it will reload the files saved 111 // by the last instance. 112 // 113 // force_recovery_and_revalidate_documents=true will pre-emptively throw out 114 // the derived files and validate each document while recreating them. This 115 // can be used to indicate that the schema (and type ids) may have changed and 116 // those changes might not have been applied to the document store. 117 // 118 // If initialize_stats is present, the fields related to DocumentStore will be 119 // populated. 120 // 121 // Does not take any ownership, and all pointers except initialize_stats must 122 // refer to valid objects that outlive the one constructed. 123 // 124 // TODO(cassiewang): Consider returning a status indicating that derived files 125 // were regenerated. This may be helpful in logs. 126 // 127 // Returns: 128 // A DocumentStore::CreateResult on success 129 // FAILED_PRECONDITION on any null pointer input 130 // INTERNAL_ERROR on IO error 131 static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( 132 const Filesystem* filesystem, const std::string& base_dir, 133 const Clock* clock, const SchemaStore* schema_store, 134 bool force_recovery_and_revalidate_documents = false, 135 InitializeStatsProto* initialize_stats = nullptr); 136 137 // Returns the maximum DocumentId that the DocumentStore has assigned. If 138 // there has not been any DocumentIds assigned, i.e. the DocumentStore is 139 // empty, then kInvalidDocumentId is returned. This does not filter out 140 // DocumentIds of deleted or expired documents. last_added_document_id()141 DocumentId last_added_document_id() const { 142 if (document_id_mapper_->num_elements() == 0) { 143 return kInvalidDocumentId; 144 } 145 return document_id_mapper_->num_elements() - 1; 146 } 147 148 // Returns the number of documents. The result does not filter out DocumentIds 149 // of deleted or expired documents. num_documents()150 int num_documents() const { return document_id_mapper_->num_elements(); } 151 152 // Puts the document into document store. 153 // 154 // If put_document_stats is present, the fields related to DocumentStore will 155 // be populated. 156 // 157 // Returns: 158 // A newly generated document id on success 159 // RESOURCE_EXHAUSED if exceeds maximum number of allowed documents 160 // FAILED_PRECONDITION if schema hasn't been set yet 161 // NOT_FOUND if the schema_type or a property config of the document doesn't 162 // exist in schema 163 // INTERNAL_ERROR on IO error 164 libtextclassifier3::StatusOr<DocumentId> Put( 165 const DocumentProto& document, int32_t num_tokens = 0, 166 PutDocumentStatsProto* put_document_stats = nullptr); 167 libtextclassifier3::StatusOr<DocumentId> Put( 168 DocumentProto&& document, int32_t num_tokens = 0, 169 PutDocumentStatsProto* put_document_stats = nullptr); 170 171 // Finds and returns the document identified by the given key (namespace + 172 // uri). If 'clear_internal_fields' is true, document level data that's 173 // generated internally by DocumentStore is cleared. 174 // 175 // Returns: 176 // The document found on success 177 // NOT_FOUND if the key doesn't exist or document has been deleted 178 // INTERNAL_ERROR on IO error 179 libtextclassifier3::StatusOr<DocumentProto> Get( 180 std::string_view name_space, std::string_view uri, 181 bool clear_internal_fields = true) const; 182 183 // Finds and returns the document identified by the given document id. If 184 // 'clear_internal_fields' is true, document level data that's generated 185 // internally by DocumentStore is cleared. 186 // 187 // Returns: 188 // The document found on success 189 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 190 // maximum value 191 // NOT_FOUND if the document doesn't exist or has been deleted 192 // INTERNAL_ERROR on IO error 193 libtextclassifier3::StatusOr<DocumentProto> Get( 194 DocumentId document_id, bool clear_internal_fields = true) const; 195 196 // Returns all namespaces which have at least 1 active document (not deleted 197 // or expired). Order of namespaces is undefined. 198 std::vector<std::string> GetAllNamespaces() const; 199 200 // Check if a document exists. Existence means it hasn't been deleted and it 201 // hasn't expired yet. 202 // 203 // NOTE: This should be used when callers don't care about error messages, 204 // expect documents to be deleted/not found, or in frequently called code 205 // paths that could cause performance issues. A signficant amount of CPU 206 // cycles can be saved if we don't construct strings and create new Status 207 // objects on the heap. See b/185822483. 208 // 209 // Returns: 210 // boolean whether a document exists or not 211 bool DoesDocumentExist(DocumentId document_id) const; 212 213 // Deletes the document identified by the given namespace and uri. The 214 // document proto will be erased immediately. 215 // 216 // NOTE: 217 // Space is not reclaimed for deleted documents until Optimize() is 218 // called. 219 // 220 // Returns: 221 // OK on success 222 // NOT_FOUND if no document exists with namespace, uri 223 // INTERNAL_ERROR on IO error 224 libtextclassifier3::Status Delete(std::string_view name_space, 225 std::string_view uri); 226 227 // Deletes the document identified by the given document_id. The document 228 // proto will be erased immediately. 229 // 230 // NOTE: 231 // Space is not reclaimed for deleted documents until Optimize() is 232 // called. 233 // 234 // Returns: 235 // OK on success 236 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 237 // INTERNAL_ERROR on IO error 238 // INVALID_ARGUMENT if document_id is invalid. 239 libtextclassifier3::Status Delete(DocumentId document_id); 240 241 // Returns the NamespaceId of the string namespace 242 // 243 // Returns: 244 // NamespaceId on success 245 // NOT_FOUND if the namespace doesn't exist 246 // INTERNAL_ERROR on IO error 247 libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId( 248 std::string_view name_space) const; 249 250 // Returns the CorpusId associated with the given namespace and schema. 251 // 252 // Returns: 253 // A CorpusId on success 254 // NOT_FOUND if the key doesn't exist 255 // INTERNAL_ERROR on IO error 256 libtextclassifier3::StatusOr<CorpusId> GetCorpusId( 257 const std::string_view name_space, const std::string_view schema) const; 258 259 // Returns the DocumentAssociatedScoreData of the document specified by the 260 // DocumentId. 261 // 262 // Returns: 263 // DocumentAssociatedScoreData on success 264 // NOT_FOUND if the document or the score data is not found 265 libtextclassifier3::StatusOr<DocumentAssociatedScoreData> 266 GetDocumentAssociatedScoreData(DocumentId document_id) const; 267 268 // Returns the CorpusAssociatedScoreData of the corpus specified by the 269 // corpus_id. 270 // 271 // NOTE: This does not check if the corpus exists and will return the 272 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 273 // that corpus have been deleted. 274 // 275 // Returns: 276 // CorpusAssociatedScoreData on success 277 // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen 278 // CorpusIds 279 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 280 GetCorpusAssociatedScoreData(CorpusId corpus_id) const; 281 282 // Returns the DocumentFilterData of the document specified by the DocumentId. 283 // 284 // Returns: 285 // DocumentFilterData on success 286 // OUT_OF_RANGE if document_id is negative or exceeds previously seen 287 // DocumentIds 288 // NOT_FOUND if the document or the filter data is not found 289 libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData( 290 DocumentId document_id) const; 291 292 // Gets the usage scores of a document. 293 // 294 // Returns: 295 // UsageScores on success 296 // NOT_FOUND if document_id no longer exists. 297 // INVALID_ARGUMENT if document_id is invalid 298 libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores( 299 DocumentId document_id) const; 300 301 // Reports usage. The corresponding usage scores of the specified document in 302 // the report will be updated. 303 // 304 // Returns: 305 // OK on success 306 // NOT_FOUND if the [namesapce + uri] key in the report doesn't exist 307 // INTERNAL_ERROR on I/O errors. 308 libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); 309 310 // Deletes all documents belonging to the given namespace. The documents will 311 // be erased immediately. 312 // 313 // NOTE: 314 // Space is not reclaimed for deleted documents until Optimize() is 315 // called. 316 // 317 // Returns: 318 // OK on success 319 // NOT_FOUND if namespace doesn't exist 320 // INTERNAL_ERROR on IO error 321 DeleteByGroupResult DeleteByNamespace(std::string_view name_space); 322 323 // Deletes all documents belonging to the given schema type. The documents 324 // will be erased immediately. 325 // 326 // NOTE: 327 // Space is not reclaimed for deleted documents until Optimize() is 328 // called. 329 // 330 // Returns: 331 // OK on success 332 // NOT_FOUND if schema_type doesn't exist 333 // INTERNAL_ERROR on IO error 334 DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); 335 336 // Syncs all the data and metadata changes to disk. 337 // 338 // Returns: 339 // OK on success 340 // INTERNAL on I/O error 341 libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); 342 343 // Calculates the StorageInfo for the Document Store. 344 // 345 // If an IO error occurs while trying to calculate the value for a field, then 346 // that field will be set to -1. 347 DocumentStorageInfoProto GetStorageInfo() const; 348 349 // Update any derived data off of the SchemaStore with the new SchemaStore. 350 // This may include pointers, SchemaTypeIds, etc. 351 // 352 // NOTE: This function may delete documents. A document may be invalidated by 353 // the new SchemaStore, such as failing validation or having its schema type 354 // deleted from the schema. 355 // 356 // This is best used if the caller is unsure about what's changed in the 357 // SchemaStore, and wants to update all information no matter what. If the 358 // caller does know what has changed, then it's recommended to call 359 // OptimizedUpdateSchemaStore. 360 // 361 // Returns; 362 // OK on success 363 // INTERNAL_ERROR on IO error 364 libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store); 365 366 // Performs the same funtionality as UpdateSchemaStore, but this can be more 367 // optimized in terms of less disk reads and less work if we know exactly 368 // what's changed between the old and new SchemaStore. 369 // 370 // Returns; 371 // OK on success 372 // INTERNAL_ERROR on IO error 373 libtextclassifier3::Status OptimizedUpdateSchemaStore( 374 const SchemaStore* schema_store, 375 const SchemaStore::SetSchemaResult& set_schema_result); 376 377 // Reduces internal file sizes by reclaiming space of deleted documents and 378 // regenerating derived files. 379 // 380 // NOTE: The tasks in this method are too expensive to be executed in 381 // real-time. The caller should decide how frequently and when to call this 382 // method based on device usage. 383 // 384 // Returns: 385 // OK on success 386 // INTERNAL_ERROR on IO error 387 libtextclassifier3::Status Optimize(); 388 389 // Copy data from current base directory into a new directory. Any outdated or 390 // deleted data won't be copied. During the process, document ids will be 391 // reassigned so any files / classes that are based on old document ids may be 392 // outdated. 393 // 394 // stats will be set if non-null. 395 // 396 // NOTE: The tasks in this method are too expensive to be executed in 397 // real-time. The caller should decide how frequently and when to call this 398 // method based on device usage. 399 // 400 // Returns: 401 // OK on success 402 // INVALID_ARGUMENT if new_directory is same as current base directory 403 // INTERNAL_ERROR on IO error 404 libtextclassifier3::Status OptimizeInto( 405 const std::string& new_directory, const LanguageSegmenter* lang_segmenter, 406 OptimizeStatsProto* stats = nullptr); 407 408 // Calculates status for a potential Optimize call. Includes how many docs 409 // there are vs how many would be optimized away. And also includes an 410 // estimated size gains, in bytes, if Optimize were called. 411 // 412 // Returns: 413 // OptimizeInfo on success 414 // INTERNAL_ERROR on IO error 415 libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; 416 417 // Computes the combined checksum of the document store - includes the ground 418 // truth and all derived files. 419 // 420 // Returns: 421 // Combined checksum on success 422 // INTERNAL_ERROR on compute error 423 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 424 425 private: 426 // Use DocumentStore::Create() to instantiate. 427 DocumentStore(const Filesystem* filesystem, std::string_view base_dir, 428 const Clock* clock, const SchemaStore* schema_store); 429 430 const Filesystem* const filesystem_; 431 const std::string base_dir_; 432 const Clock& clock_; 433 434 // Handles the ground truth schema and all of the derived data off of the 435 // schema 436 const SchemaStore* schema_store_; 437 438 // Used to validate incoming documents 439 DocumentValidator document_validator_; 440 441 // A log used to store all documents, it serves as a ground truth of doc 442 // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 443 std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; 444 445 // Key (namespace + uri) to DocumentId mapping 446 std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_; 447 448 // DocumentId to file offset mapping 449 std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_; 450 451 // A cache of document associated scores. The ground truth of the scores is 452 // DocumentProto stored in document_log_. This cache contains: 453 // - CorpusId 454 // - Document score 455 // - Document creation timestamp in seconds 456 // - Document length in number of tokens 457 std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; 458 459 // A cache of data, indexed by DocumentId, used to filter documents. Currently 460 // contains: 461 // - NamespaceId 462 // - SchemaTypeId 463 // - Expiration timestamp in seconds 464 std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; 465 466 // A cache of corpus associated scores. The ground truth of the scores is 467 // DocumentProto stored in document_log_. This cache contains: 468 // - Number of documents belonging to the corpus score 469 // - The sum of the documents' lengths, in number of tokens. 470 std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> 471 corpus_score_cache_; 472 473 // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an 474 // id when the first document belonging to that namespace is added to the 475 // DocumentStore. Namespaces may be removed from the mapper during compaction. 476 std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_; 477 478 // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned 479 // unique id. A coprus is assigned an 480 // id when the first document belonging to that corpus is added to the 481 // DocumentStore. Corpus ids may be removed from the mapper during compaction. 482 std::unique_ptr<KeyMapper<CorpusId>> corpus_mapper_; 483 484 // A storage class that caches all usage scores. Usage scores are not 485 // considered as ground truth. Usage scores are associated with document ids 486 // so they need to be updated when document ids change. 487 std::unique_ptr<UsageStore> usage_store_; 488 489 // Used internally to indicate whether the class has been initialized. This is 490 // to guard against cases where the object has been created, but Initialize 491 // fails in the constructor. If we have successfully exited the constructor, 492 // then this field can be ignored. Clients of DocumentStore should not need to 493 // worry about this field. 494 bool initialized_ = false; 495 496 libtextclassifier3::StatusOr<DataLoss> Initialize( 497 bool force_recovery_and_revalidate_documents, 498 InitializeStatsProto* initialize_stats); 499 500 // Initializes a new DocumentStore and sets up any underlying files. 501 // 502 // Returns: 503 // Data loss status on success, effectively always DataLoss::NONE 504 // INTERNAL on I/O error 505 libtextclassifier3::StatusOr<DataLoss> InitializeNewStore( 506 InitializeStatsProto* initialize_stats); 507 508 // Initializes a DocumentStore over an existing directory of files. 509 // 510 // stats will be set if non-null 511 // 512 // Returns: 513 // Data loss status on success 514 // INTERNAL on I/O error 515 libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore( 516 bool force_recovery_and_revalidate_documents, 517 InitializeStatsProto* initialize_stats); 518 519 libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1( 520 InitializeStatsProto* initialize_stats); 521 522 // Creates sub-components and verifies the integrity of each sub-component. 523 // This assumes that the the underlying files already exist, and will return 524 // an error if it doesn't find what it's expecting. 525 // 526 // Returns an error if subcomponents failed to initialize successfully. 527 // INTERNAL_ERROR on IO error 528 libtextclassifier3::Status InitializeExistingDerivedFiles(); 529 530 // Re-generates all files derived from the ground truth: the document log. 531 // 532 // revalidate_documents=true will also cause each document to be revalidated 533 // the schema as it is read out of the document log. 534 // 535 // NOTE: if this function fails, the only thing we can do is to retry it until 536 // it succeeds or prevent the initialization of a DocumentStore. The 537 // DocumentStore object wouldn't work reliably if this fails. 538 // 539 // Steps: 540 // 1. Delete all derived files. 541 // 2. Iterate through document log, put data into new key mapper and 542 // document_id 543 // mapper. 544 // 3. Create header and store the updated combined checksum 545 libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); 546 547 // Resets the unique_ptr to the document_key_mapper, deletes the underlying 548 // file, and re-creates a new instance of the document_key_mapper . 549 // 550 // Returns OK or any IO errors. 551 libtextclassifier3::Status ResetDocumentKeyMapper(); 552 553 // Resets the unique_ptr to the document_id_mapper, deletes the underlying 554 // file, and re-creates a new instance of the document_id_mapper. 555 // 556 // Returns OK or any IO errors. 557 libtextclassifier3::Status ResetDocumentIdMapper(); 558 559 // Resets the unique_ptr to the score_cache, deletes the underlying file, and 560 // re-creates a new instance of the score_cache. 561 // 562 // Returns OK or any IO errors. 563 libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); 564 565 // Resets the unique_ptr to the corpus_score_cache, deletes the underlying 566 // file, and re-creates a new instance of the corpus_score_cache. 567 // 568 // Returns OK or any IO errors. 569 libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); 570 571 // Resets the unique_ptr to the filter_cache, deletes the underlying file, and 572 // re-creates a new instance of the filter_cache. 573 // 574 // Returns OK or any IO errors. 575 libtextclassifier3::Status ResetFilterCache(); 576 577 // Resets the unique_ptr to the namespace_mapper, deletes the underlying file, 578 // and re-creates a new instance of the namespace_mapper. 579 // 580 // Returns OK or any IO errors. 581 libtextclassifier3::Status ResetNamespaceMapper(); 582 583 // Resets the unique_ptr to the corpus_mapper, deletes the underlying file, 584 // and re-creates a new instance of the corpus_mapper. 585 // 586 // Returns OK or any IO errors. 587 libtextclassifier3::Status ResetCorpusMapper(); 588 589 // Checks if the header exists already. This does not create the header file 590 // if it doesn't exist. 591 bool HeaderExists(); 592 593 // Update, replace and persist the header file. Creates the header file if it 594 // doesn't exist. 595 // 596 // Returns: 597 // OK on success 598 // INTERNAL on I/O error 599 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 600 601 libtextclassifier3::StatusOr<DocumentId> InternalPut( 602 DocumentProto& document, 603 PutDocumentStatsProto* put_document_stats = nullptr); 604 605 // Helper function to do batch deletes. Documents with the given 606 // "namespace_id" and "schema_type_id" will be deleted. If callers don't need 607 // to specify the namespace or schema type, pass in kInvalidNamespaceId or 608 // kInvalidSchemaTypeId. The document protos with their derived data will be 609 // erased / cleared immediately. 610 // 611 // NOTE: Space is not reclaimed in the derived files until Optimize() is 612 // called. 613 // 614 // Returns: 615 // Number of documents that were actually updated to be deleted 616 // INTERNAL_ERROR on IO error 617 libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, 618 SchemaTypeId schema_type_id); 619 620 // Helper method to find a DocumentId that is associated with the given 621 // namespace and uri. 622 // 623 // NOTE: The DocumentId may refer to a invalid document (deleted 624 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 625 // refers to a valid Document. 626 // 627 // Returns: 628 // A DocumentId on success 629 // NOT_FOUND if the key doesn't exist 630 // INTERNAL_ERROR on IO error 631 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 632 std::string_view name_space, std::string_view uri) const; 633 634 // Returns the CorpusAssociatedScoreData of the corpus specified by the 635 // corpus_id. 636 // 637 // If the corpus_id has never been seen before, it returns a 638 // CorpusAssociatedScoreData with properties set to default values. 639 // 640 // NOTE: This does not check if the corpus exists and will return the 641 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 642 // that corpus have been deleted. 643 // 644 // Returns: 645 // CorpusAssociatedScoreData on success 646 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 647 GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; 648 649 // Check if a document exists. Existence means it hasn't been deleted and it 650 // hasn't expired yet. 651 // 652 // Returns: 653 // OK if the document exists 654 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 655 // maximum value 656 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 657 // INTERNAL_ERROR on IO error 658 libtextclassifier3::Status DoesDocumentExistWithStatus( 659 DocumentId document_id) const; 660 661 // Check if a document exists. Existence means it hasn't been deleted and it 662 // hasn't expired yet. 663 // 664 // This is for internal-use only because we assume that the document_id is 665 // already valid. If you're unsure if the document_id is valid, use 666 // DoesDocumentExist(document_id) instead, which will perform those additional 667 // checks. 668 // 669 // Returns: 670 // boolean whether a document exists or not 671 bool InternalDoesDocumentExist(DocumentId document_id) const; 672 673 // Checks if a document has been deleted 674 // 675 // This is for internal-use only because we assume that the document_id is 676 // already valid. If you're unsure if the document_id is valid, use 677 // DoesDocumentExist(document_id) instead, which will perform those additional 678 // checks. 679 bool IsDeleted(DocumentId document_id) const; 680 681 // Checks if a document has expired. 682 // 683 // This is for internal-use only because we assume that the document_id is 684 // already valid. If you're unsure if the document_id is valid, use 685 // DoesDocumentExist(document_id) instead, which will perform those additional 686 // checks. 687 bool IsExpired(DocumentId document_id) const; 688 689 // Updates the entry in the score cache for document_id. 690 libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( 691 DocumentId document_id, const DocumentAssociatedScoreData& score_data); 692 693 // Updates the entry in the corpus score cache for corpus_id. 694 libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( 695 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); 696 697 // Updates the entry in the filter cache for document_id. 698 libtextclassifier3::Status UpdateFilterCache( 699 DocumentId document_id, const DocumentFilterData& filter_data); 700 701 // Helper method to clear the derived data of a document 702 libtextclassifier3::Status ClearDerivedData(DocumentId document_id); 703 704 // Sets usage scores for the given document. 705 libtextclassifier3::Status SetUsageScores( 706 DocumentId document_id, const UsageStore::UsageScores& usage_scores); 707 708 // Returns: 709 // - on success, a DocumentStorageInfoProto with the fields relating to the 710 // size of Document Store member variables populated. 711 // - INTERNAL on failure to get file size 712 DocumentStorageInfoProto GetMemberStorageInfo() const; 713 714 // Returns: 715 // - on success, the storage_info that was passed in but with the number of 716 // alive, deleted and expired documents also set. 717 // - OUT_OF_RANGE, this should never happen. This could only be returned if 718 // the document_id_mapper somehow became larger than the filter cache. 719 DocumentStorageInfoProto CalculateDocumentStatusCounts( 720 DocumentStorageInfoProto storage_info) const; 721 }; 722 723 } // namespace lib 724 } // namespace icing 725 726 #endif // ICING_STORE_DOCUMENT_STORE_H_ 727