1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_DOCUMENT_STORE_H_ 16 #define ICING_STORE_DOCUMENT_STORE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <vector> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/file/file-backed-proto-log.h" 27 #include "icing/file/file-backed-vector.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/file/portable-file-backed-proto-log.h" 30 #include "icing/proto/debug.pb.h" 31 #include "icing/proto/document.pb.h" 32 #include "icing/proto/document_wrapper.pb.h" 33 #include "icing/proto/logging.pb.h" 34 #include "icing/proto/optimize.pb.h" 35 #include "icing/proto/persist.pb.h" 36 #include "icing/proto/search.pb.h" 37 #include "icing/proto/storage.pb.h" 38 #include "icing/proto/usage.pb.h" 39 #include "icing/schema/schema-store.h" 40 #include "icing/store/corpus-associated-scoring-data.h" 41 #include "icing/store/corpus-id.h" 42 #include "icing/store/document-associated-score-data.h" 43 #include "icing/store/document-filter-data.h" 44 #include "icing/store/document-id.h" 45 #include "icing/store/key-mapper.h" 46 #include "icing/store/namespace-id.h" 47 #include "icing/store/usage-store.h" 48 #include "icing/tokenization/language-segmenter.h" 49 #include "icing/util/clock.h" 50 #include "icing/util/crc32.h" 51 #include "icing/util/data-loss.h" 52 #include "icing/util/document-validator.h" 53 #include "icing/util/fingerprint-util.h" 54 55 namespace icing { 56 namespace lib { 57 58 // Provides storage interfaces for documents. 59 class DocumentStore { 60 public: 61 struct Header { GetCurrentMagicHeader62 static int32_t GetCurrentMagic(bool namespace_id_fingerprint) { 63 return namespace_id_fingerprint ? kNewMagic : kOldMagic; 64 } 65 66 // Holds the magic as a quick sanity check against file corruption. 67 int32_t magic; 68 69 // Checksum of the DocumentStore's sub-component's checksums. 70 uint32_t checksum; 71 72 private: 73 static constexpr int32_t kOldMagic = 0x746f7265; 74 static constexpr int32_t kNewMagic = 0x1b99c8b0; 75 }; 76 77 struct OptimizeInfo { 78 // The estimated size in bytes of the optimizable docs. We don't track the 79 // size of each document, so we estimate by taking the size of the entire 80 // DocumentStore and dividing that by the total number of documents we have. 81 // So we end up with an average document size. 82 int64_t estimated_optimizable_bytes = 0; 83 84 // Number of total documents the DocumentStore tracks. 85 int32_t total_docs = 0; 86 87 // Number of optimizable (deleted + expired) docs the DocumentStore tracks. 88 int32_t optimizable_docs = 0; 89 }; 90 91 struct DeleteByGroupResult { 92 // Status representing whether or not the operation succeeded. See the 93 // comments above the function that returns this result to determine what 94 // possible statuses could be returned. 95 libtextclassifier3::Status status; 96 97 int num_docs_deleted = 0; 98 }; 99 100 struct CreateResult { 101 // A successfully initialized document store. 102 std::unique_ptr<DocumentStore> document_store; 103 104 // The data status after initializing from a previous state. Data loss can 105 // happen if the file is corrupted or some previously added data was 106 // unpersisted. This may be used to signal that any derived data off of the 107 // document store may need to be regenerated. 108 DataLoss data_loss; 109 }; 110 111 // Not copyable 112 DocumentStore(const DocumentStore&) = delete; 113 DocumentStore& operator=(const DocumentStore&) = delete; 114 115 // Persists and updates checksum of subcomponents. 116 ~DocumentStore(); 117 118 // Factory method to create, initialize, and return a DocumentStore. The base 119 // directory is used to persist document store files. If document store was 120 // previously initialized with this directory, it will reload the files saved 121 // by the last instance. 122 // 123 // force_recovery_and_revalidate_documents=true will pre-emptively throw out 124 // the derived files and validate each document while recreating them. This 125 // can be used to indicate that the schema (and type ids) may have changed and 126 // those changes might not have been applied to the document store. 127 // 128 // If initialize_stats is present, the fields related to DocumentStore will be 129 // populated. 130 // 131 // Does not take any ownership, and all pointers except initialize_stats must 132 // refer to valid objects that outlive the one constructed. 133 // 134 // TODO(cassiewang): Consider returning a status indicating that derived files 135 // were regenerated. This may be helpful in logs. 136 // 137 // Returns: 138 // A DocumentStore::CreateResult on success 139 // FAILED_PRECONDITION on any null pointer input 140 // INTERNAL_ERROR on IO error 141 static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( 142 const Filesystem* filesystem, const std::string& base_dir, 143 const Clock* clock, const SchemaStore* schema_store, 144 bool force_recovery_and_revalidate_documents, 145 bool namespace_id_fingerprint, 146 int32_t compression_level, 147 InitializeStatsProto* initialize_stats); 148 149 // Discards all derived data in the document store. 150 // 151 // Returns: 152 // OK on success or nothing to discard 153 // INTERNAL_ERROR on any I/O errors 154 static libtextclassifier3::Status DiscardDerivedFiles( 155 const Filesystem* filesystem, const std::string& base_dir); 156 157 // Returns the maximum DocumentId that the DocumentStore has assigned. If 158 // there has not been any DocumentIds assigned, i.e. the DocumentStore is 159 // empty, then kInvalidDocumentId is returned. This does not filter out 160 // DocumentIds of deleted or expired documents. last_added_document_id()161 DocumentId last_added_document_id() const { 162 if (document_id_mapper_->num_elements() == 0) { 163 return kInvalidDocumentId; 164 } 165 return document_id_mapper_->num_elements() - 1; 166 } 167 168 // Returns the number of documents. The result does not filter out DocumentIds 169 // of deleted or expired documents. num_documents()170 int num_documents() const { return document_id_mapper_->num_elements(); } 171 172 // Puts the document into document store. 173 // 174 // If put_document_stats is present, the fields related to DocumentStore will 175 // be populated. 176 // 177 // Returns: 178 // A newly generated document id on success 179 // RESOURCE_EXHAUSED if exceeds maximum number of allowed documents 180 // FAILED_PRECONDITION if schema hasn't been set yet 181 // NOT_FOUND if the schema_type or a property config of the document doesn't 182 // exist in schema 183 // INTERNAL_ERROR on IO error 184 libtextclassifier3::StatusOr<DocumentId> Put( 185 const DocumentProto& document, int32_t num_tokens = 0, 186 PutDocumentStatsProto* put_document_stats = nullptr); 187 libtextclassifier3::StatusOr<DocumentId> Put( 188 DocumentProto&& document, int32_t num_tokens = 0, 189 PutDocumentStatsProto* put_document_stats = nullptr); 190 191 // Finds and returns the document identified by the given key (namespace + 192 // uri). If 'clear_internal_fields' is true, document level data that's 193 // generated internally by DocumentStore is cleared. 194 // 195 // Returns: 196 // The document found on success 197 // NOT_FOUND if the key doesn't exist or document has been deleted 198 // INTERNAL_ERROR on IO error 199 libtextclassifier3::StatusOr<DocumentProto> Get( 200 std::string_view name_space, std::string_view uri, 201 bool clear_internal_fields = true) const; 202 203 // Finds and returns the document identified by the given document id. If 204 // 'clear_internal_fields' is true, document level data that's generated 205 // internally by DocumentStore is cleared. 206 // 207 // Returns: 208 // The document found on success 209 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 210 // maximum value 211 // NOT_FOUND if the document doesn't exist or has been deleted 212 // INTERNAL_ERROR on IO error 213 libtextclassifier3::StatusOr<DocumentProto> Get( 214 DocumentId document_id, bool clear_internal_fields = true) const; 215 216 // Returns all namespaces which have at least 1 active document (not deleted 217 // or expired). Order of namespaces is undefined. 218 std::vector<std::string> GetAllNamespaces() const; 219 220 // Deletes the document identified by the given namespace and uri. The 221 // document proto will be erased immediately. 222 // 223 // NOTE: 224 // Space is not reclaimed for deleted documents until Optimize() is 225 // called. 226 // 227 // Returns: 228 // OK on success 229 // NOT_FOUND if no document exists with namespace, uri 230 // INTERNAL_ERROR on IO error 231 libtextclassifier3::Status Delete(std::string_view name_space, 232 std::string_view uri, 233 int64_t current_time_ms); 234 235 // Deletes the document identified by the given document_id. The document 236 // proto will be erased immediately. 237 // 238 // NOTE: 239 // Space is not reclaimed for deleted documents until Optimize() is 240 // called. 241 // 242 // Returns: 243 // OK on success 244 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 245 // INTERNAL_ERROR on IO error 246 // INVALID_ARGUMENT if document_id is invalid. 247 libtextclassifier3::Status Delete(DocumentId document_id, 248 int64_t current_time_ms); 249 250 // Returns the NamespaceId of the string namespace 251 // 252 // Returns: 253 // NamespaceId on success 254 // NOT_FOUND if the namespace doesn't exist 255 // INTERNAL_ERROR on IO error 256 libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId( 257 std::string_view name_space) const; 258 259 // Helper method to find a DocumentId that is associated with the given 260 // namespace and uri. 261 // 262 // NOTE: The DocumentId may refer to a invalid document (deleted 263 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 264 // refers to a valid Document. 265 // 266 // Returns: 267 // A DocumentId on success 268 // NOT_FOUND if the key doesn't exist 269 // INTERNAL_ERROR on IO error 270 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 271 std::string_view name_space, std::string_view uri) const; 272 273 // Returns the CorpusId associated with the given namespace and schema. 274 // 275 // Returns: 276 // A CorpusId on success 277 // NOT_FOUND if the key doesn't exist 278 // INTERNAL_ERROR on IO error 279 libtextclassifier3::StatusOr<CorpusId> GetCorpusId( 280 const std::string_view name_space, const std::string_view schema) const; 281 282 // Returns the ResultGroupingEntryId associated with the given namespace 283 // and schema. 284 // 285 // NOTE: ResultGroupingEntryIds that are generated by calls with different 286 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 287 // are only guarenteed to be unique within their own ResultGroupingType. 288 // 289 // Returns: 290 // A ResultGroupingEntryId on success 291 // NOT_FOUND if the key doesn't exist 292 // INTERNAL_ERROR on IO error 293 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 294 ResultSpecProto::ResultGroupingType result_group_type, 295 const std::string_view name_space, const std::string_view schema) const; 296 297 // Returns the ResultGrouping Entry Id associated with the given NamespaceId 298 // and SchemaTypeId 299 // 300 // NOTE: ResultGroupingEntryIds that are generated by calls with different 301 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 302 // are only guarenteed to be unique within their own ResultGroupingType. 303 // 304 // Returns: 305 // A ResultGroupingEntryId on success 306 // NOT_FOUND if the key doesn't exist 307 // INTERNAL_ERROR on IO error 308 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 309 ResultSpecProto::ResultGroupingType result_group_type, 310 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const; 311 312 // Returns the DocumentAssociatedScoreData of the document specified by the 313 // DocumentId. 314 // 315 // Returns: 316 // DocumentAssociatedScoreData on success 317 // NOT_FOUND if the document or the score data is not found 318 libtextclassifier3::StatusOr<DocumentAssociatedScoreData> 319 GetDocumentAssociatedScoreData(DocumentId document_id) const; 320 321 // Returns the CorpusAssociatedScoreData of the corpus specified by the 322 // corpus_id. 323 // 324 // NOTE: This does not check if the corpus exists and will return the 325 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 326 // that corpus have been deleted. 327 // 328 // Returns: 329 // CorpusAssociatedScoreData on success 330 // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen 331 // CorpusIds 332 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 333 GetCorpusAssociatedScoreData(CorpusId corpus_id) const; 334 335 // Gets the document filter data if a document exists. Otherwise, will get a 336 // false optional. 337 // 338 // Existence means it hasn't been deleted and it hasn't expired yet. 339 // 340 // Returns: 341 // True:DocumentFilterData if the given document exists. 342 // False if the given document doesn't exist. 343 std::optional<DocumentFilterData> GetAliveDocumentFilterData( 344 DocumentId document_id, int64_t current_time_ms) const; 345 346 // Gets the usage scores of a document. 347 // 348 // Returns: 349 // UsageScores on success 350 // nullopt if there are no usage scores stored for the requested docid. 351 std::optional<UsageStore::UsageScores> GetUsageScores( 352 DocumentId document_id, int64_t current_time_ms) const; 353 354 // Reports usage. The corresponding usage scores of the specified document in 355 // the report will be updated. 356 // 357 // Returns: 358 // OK on success 359 // NOT_FOUND if the [namesapce + uri] key in the report doesn't exist 360 // INTERNAL_ERROR on I/O errors. 361 libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); 362 363 // Deletes all documents belonging to the given namespace. The documents will 364 // be erased immediately. 365 // 366 // NOTE: 367 // Space is not reclaimed for deleted documents until Optimize() is 368 // called. 369 // 370 // Returns: 371 // OK on success 372 // NOT_FOUND if namespace doesn't exist 373 // INTERNAL_ERROR on IO error 374 DeleteByGroupResult DeleteByNamespace(std::string_view name_space); 375 376 // Deletes all documents belonging to the given schema type. The documents 377 // will be erased immediately. 378 // 379 // NOTE: 380 // Space is not reclaimed for deleted documents until Optimize() is 381 // called. 382 // 383 // Returns: 384 // OK on success 385 // NOT_FOUND if schema_type doesn't exist 386 // INTERNAL_ERROR on IO error 387 DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); 388 389 // Syncs all the data and metadata changes to disk. 390 // 391 // Returns: 392 // OK on success 393 // INTERNAL on I/O error 394 libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); 395 396 // Calculates the StorageInfo for the Document Store. 397 // 398 // If an IO error occurs while trying to calculate the value for a field, then 399 // that field will be set to -1. 400 DocumentStorageInfoProto GetStorageInfo() const; 401 402 // Update any derived data off of the SchemaStore with the new SchemaStore. 403 // This may include pointers, SchemaTypeIds, etc. 404 // 405 // NOTE: This function may delete documents. A document may be invalidated by 406 // the new SchemaStore, such as failing validation or having its schema type 407 // deleted from the schema. 408 // 409 // This is best used if the caller is unsure about what's changed in the 410 // SchemaStore, and wants to update all information no matter what. If the 411 // caller does know what has changed, then it's recommended to call 412 // OptimizedUpdateSchemaStore. 413 // 414 // Returns; 415 // OK on success 416 // INTERNAL_ERROR on IO error 417 libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store); 418 419 // Performs the same funtionality as UpdateSchemaStore, but this can be more 420 // optimized in terms of less disk reads and less work if we know exactly 421 // what's changed between the old and new SchemaStore. 422 // 423 // Returns; 424 // OK on success 425 // INTERNAL_ERROR on IO error 426 libtextclassifier3::Status OptimizedUpdateSchemaStore( 427 const SchemaStore* schema_store, 428 const SchemaStore::SetSchemaResult& set_schema_result); 429 430 // Reduces internal file sizes by reclaiming space of deleted documents and 431 // regenerating derived files. 432 // 433 // NOTE: The tasks in this method are too expensive to be executed in 434 // real-time. The caller should decide how frequently and when to call this 435 // method based on device usage. 436 // 437 // Returns: 438 // OK on success 439 // INTERNAL_ERROR on IO error 440 libtextclassifier3::Status Optimize(); 441 442 // Copy data from current base directory into a new directory. Any outdated or 443 // deleted data won't be copied. During the process, document ids will be 444 // reassigned so any files / classes that are based on old document ids may be 445 // outdated. 446 // 447 // stats will be set if non-null. 448 // 449 // NOTE: The tasks in this method are too expensive to be executed in 450 // real-time. The caller should decide how frequently and when to call this 451 // method based on device usage. 452 // 453 // Returns: 454 // A vector that maps from old document id to new document id on success 455 // INVALID_ARGUMENT if new_directory is same as current base directory 456 // INTERNAL_ERROR on IO error 457 libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeInto( 458 const std::string& new_directory, const LanguageSegmenter* lang_segmenter, 459 bool namespace_id_fingerprint, OptimizeStatsProto* stats = nullptr); 460 461 // Calculates status for a potential Optimize call. Includes how many docs 462 // there are vs how many would be optimized away. And also includes an 463 // estimated size gains, in bytes, if Optimize were called. 464 // 465 // Returns: 466 // OptimizeInfo on success 467 // INTERNAL_ERROR on IO error 468 libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; 469 470 // Computes the combined checksum of the document store - includes the ground 471 // truth and all derived files. 472 // 473 // Returns: 474 // Combined checksum on success 475 // INTERNAL_ERROR on compute error 476 libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; 477 478 // Get debug information for the document store. 479 // verbosity <= 0, simplest debug information 480 // verbosity > 0, also return the total number of documents and tokens in each 481 // (namespace, schema type) pair. 482 // 483 // Returns: 484 // DocumentDebugInfoProto on success 485 // INTERNAL_ERROR on IO errors, crc compute error 486 libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( 487 int verbosity) const; 488 489 private: 490 // Use DocumentStore::Create() to instantiate. 491 DocumentStore(const Filesystem* filesystem, std::string_view base_dir, 492 const Clock* clock, const SchemaStore* schema_store, 493 bool namespace_id_fingerprint, int32_t compression_level); 494 495 const Filesystem* const filesystem_; 496 const std::string base_dir_; 497 const Clock& clock_; 498 499 // Handles the ground truth schema and all of the derived data off of the 500 // schema 501 const SchemaStore* schema_store_; 502 503 // Used to validate incoming documents 504 DocumentValidator document_validator_; 505 506 // Whether to use namespace id or namespace name to build up fingerprint for 507 // document_key_mapper_ and corpus_mapper_. 508 bool namespace_id_fingerprint_; 509 510 const int32_t compression_level_; 511 512 // A log used to store all documents, it serves as a ground truth of doc 513 // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 514 std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; 515 516 // Key (namespace + uri) to DocumentId mapping 517 std::unique_ptr< 518 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>> 519 document_key_mapper_; 520 521 // DocumentId to file offset mapping 522 std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_; 523 524 // A cache of document associated scores. The ground truth of the scores is 525 // DocumentProto stored in document_log_. This cache contains: 526 // - CorpusId 527 // - Document score 528 // - Document creation timestamp in seconds 529 // - Document length in number of tokens 530 std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; 531 532 // A cache of data, indexed by DocumentId, used to filter documents. Currently 533 // contains: 534 // - NamespaceId 535 // - SchemaTypeId 536 // - Expiration timestamp in seconds 537 std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; 538 539 // A cache of corpus associated scores. The ground truth of the scores is 540 // DocumentProto stored in document_log_. This cache contains: 541 // - Number of documents belonging to the corpus score 542 // - The sum of the documents' lengths, in number of tokens. 543 std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> 544 corpus_score_cache_; 545 546 // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an 547 // id when the first document belonging to that namespace is added to the 548 // DocumentStore. Namespaces may be removed from the mapper during compaction. 549 std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_; 550 551 // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned 552 // unique id. A coprus is assigned an 553 // id when the first document belonging to that corpus is added to the 554 // DocumentStore. Corpus ids may be removed from the mapper during compaction. 555 std::unique_ptr< 556 KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>> 557 corpus_mapper_; 558 559 // A storage class that caches all usage scores. Usage scores are not 560 // considered as ground truth. Usage scores are associated with document ids 561 // so they need to be updated when document ids change. 562 std::unique_ptr<UsageStore> usage_store_; 563 564 // Used internally to indicate whether the class has been initialized. This is 565 // to guard against cases where the object has been created, but Initialize 566 // fails in the constructor. If we have successfully exited the constructor, 567 // then this field can be ignored. Clients of DocumentStore should not need to 568 // worry about this field. 569 bool initialized_ = false; 570 571 libtextclassifier3::StatusOr<DataLoss> Initialize( 572 bool force_recovery_and_revalidate_documents, 573 InitializeStatsProto* initialize_stats); 574 575 // Creates sub-components and verifies the integrity of each sub-component. 576 // This assumes that the the underlying files already exist, and will return 577 // an error if it doesn't find what it's expecting. 578 // 579 // Returns an error if subcomponents failed to initialize successfully. 580 // INTERNAL_ERROR on IO error 581 libtextclassifier3::Status InitializeExistingDerivedFiles(); 582 583 // Re-generates all files derived from the ground truth: the document log. 584 // 585 // revalidate_documents=true will also cause each document to be revalidated 586 // the schema as it is read out of the document log. 587 // 588 // NOTE: if this function fails, the only thing we can do is to retry it until 589 // it succeeds or prevent the initialization of a DocumentStore. The 590 // DocumentStore object wouldn't work reliably if this fails. 591 // 592 // Steps: 593 // 1. Delete all derived files. 594 // 2. Iterate through document log, put data into new key mapper and 595 // document_id 596 // mapper. 597 // 3. Create header and store the updated combined checksum 598 libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); 599 600 // Resets the unique_ptr to the document_key_mapper, deletes the underlying 601 // file, and re-creates a new instance of the document_key_mapper . 602 // 603 // Returns OK or any IO errors. 604 libtextclassifier3::Status ResetDocumentKeyMapper(); 605 606 // Resets the unique_ptr to the document_id_mapper, deletes the underlying 607 // file, and re-creates a new instance of the document_id_mapper. 608 // 609 // Returns OK or any IO errors. 610 libtextclassifier3::Status ResetDocumentIdMapper(); 611 612 // Resets the unique_ptr to the score_cache, deletes the underlying file, and 613 // re-creates a new instance of the score_cache. 614 // 615 // Returns OK or any IO errors. 616 libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); 617 618 // Resets the unique_ptr to the corpus_score_cache, deletes the underlying 619 // file, and re-creates a new instance of the corpus_score_cache. 620 // 621 // Returns OK or any IO errors. 622 libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); 623 624 // Resets the unique_ptr to the filter_cache, deletes the underlying file, and 625 // re-creates a new instance of the filter_cache. 626 // 627 // Returns OK or any IO errors. 628 libtextclassifier3::Status ResetFilterCache(); 629 630 // Resets the unique_ptr to the namespace_mapper, deletes the underlying file, 631 // and re-creates a new instance of the namespace_mapper. 632 // 633 // Returns OK or any IO errors. 634 libtextclassifier3::Status ResetNamespaceMapper(); 635 636 // Resets the unique_ptr to the corpus_mapper, deletes the underlying file, 637 // and re-creates a new instance of the corpus_mapper. 638 // 639 // Returns OK or any IO errors. 640 libtextclassifier3::Status ResetCorpusMapper(); 641 642 // Checks if the header exists already. This does not create the header file 643 // if it doesn't exist. 644 bool HeaderExists(); 645 646 // Update, replace and persist the header file. Creates the header file if it 647 // doesn't exist. 648 // 649 // Returns: 650 // OK on success 651 // INTERNAL on I/O error 652 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 653 654 libtextclassifier3::StatusOr<DocumentId> InternalPut( 655 DocumentProto&& document, 656 PutDocumentStatsProto* put_document_stats = nullptr); 657 658 // Helper function to do batch deletes. Documents with the given 659 // "namespace_id" and "schema_type_id" will be deleted. If callers don't need 660 // to specify the namespace or schema type, pass in kInvalidNamespaceId or 661 // kInvalidSchemaTypeId. The document protos with their derived data will be 662 // erased / cleared immediately. 663 // 664 // NOTE: Space is not reclaimed in the derived files until Optimize() is 665 // called. 666 // 667 // Returns: 668 // Number of documents that were actually updated to be deleted 669 // INTERNAL_ERROR on IO error 670 libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, 671 SchemaTypeId schema_type_id); 672 673 // Returns the CorpusAssociatedScoreData of the corpus specified by the 674 // corpus_id. 675 // 676 // If the corpus_id has never been seen before, it returns a 677 // CorpusAssociatedScoreData with properties set to default values. 678 // 679 // NOTE: This does not check if the corpus exists and will return the 680 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 681 // that corpus have been deleted. 682 // 683 // Returns: 684 // CorpusAssociatedScoreData on success 685 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 686 GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; 687 688 // Check if a document exists. Existence means it hasn't been deleted and it 689 // hasn't expired yet. 690 // 691 // Returns: 692 // OK if the document exists 693 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 694 // maximum value 695 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 696 // INTERNAL_ERROR on IO error 697 libtextclassifier3::Status DoesDocumentExistWithStatus( 698 DocumentId document_id) const; 699 700 // Checks if a document has been deleted 701 // 702 // This is for internal-use only because we assume that the document_id is 703 // already valid. If you're unsure if the document_id is valid, use 704 // DoesDocumentExist(document_id) instead, which will perform those additional 705 // checks. 706 bool IsDeleted(DocumentId document_id) const; 707 708 // Checks if a document has expired. 709 // 710 // This is for internal-use only because we assume that the document_id is 711 // already valid. If you're unsure if the document_id is valid, use 712 // DoesDocumentExist(document_id) instead, which will perform those additional 713 // checks. 714 715 // Returns: 716 // True:DocumentFilterData if the given document isn't expired. 717 // False if the given doesn't document is expired. 718 std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData( 719 DocumentId document_id, int64_t current_time_ms) const; 720 721 // Updates the entry in the score cache for document_id. 722 libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( 723 DocumentId document_id, const DocumentAssociatedScoreData& score_data); 724 725 // Updates the entry in the corpus score cache for corpus_id. 726 libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( 727 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); 728 729 // Updates the entry in the filter cache for document_id. 730 libtextclassifier3::Status UpdateFilterCache( 731 DocumentId document_id, const DocumentFilterData& filter_data); 732 733 // Helper method to clear the derived data of a document 734 libtextclassifier3::Status ClearDerivedData(DocumentId document_id); 735 736 // Sets usage scores for the given document. 737 libtextclassifier3::Status SetUsageScores( 738 DocumentId document_id, const UsageStore::UsageScores& usage_scores); 739 740 // Returns: 741 // - on success, a DocumentStorageInfoProto with the fields relating to the 742 // size of Document Store member variables populated. 743 // - INTERNAL on failure to get file size 744 DocumentStorageInfoProto GetMemberStorageInfo() const; 745 746 // Returns: 747 // - on success, the storage_info that was passed in but with the number of 748 // alive, deleted and expired documents also set. 749 // - OUT_OF_RANGE, this should never happen. This could only be returned if 750 // the document_id_mapper somehow became larger than the filter cache. 751 DocumentStorageInfoProto CalculateDocumentStatusCounts( 752 DocumentStorageInfoProto storage_info) const; 753 754 // Returns: 755 // - on success, a RepeatedPtrField for CorpusInfo collected. 756 // - OUT_OF_RANGE, this should never happen. 757 libtextclassifier3::StatusOr< 758 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> 759 CollectCorpusInfo() const; 760 761 // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_. 762 // Note that namespace_id_fingerprint_ controls the way that a fingerprint is 763 // built. 764 std::string MakeFingerprint(NamespaceId namespace_id, 765 std::string_view namespace_, 766 std::string_view uri_or_schema) const; 767 }; 768 769 } // namespace lib 770 } // namespace icing 771 772 #endif // ICING_STORE_DOCUMENT_STORE_H_ 773