• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_DOCUMENT_STORE_H_
16 #define ICING_STORE_DOCUMENT_STORE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/file/file-backed-proto-log.h"
27 #include "icing/file/file-backed-vector.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/portable-file-backed-proto-log.h"
30 #include "icing/proto/document.pb.h"
31 #include "icing/proto/document_wrapper.pb.h"
32 #include "icing/proto/logging.pb.h"
33 #include "icing/proto/optimize.pb.h"
34 #include "icing/proto/persist.pb.h"
35 #include "icing/proto/storage.pb.h"
36 #include "icing/schema/schema-store.h"
37 #include "icing/store/corpus-associated-scoring-data.h"
38 #include "icing/store/corpus-id.h"
39 #include "icing/store/document-associated-score-data.h"
40 #include "icing/store/document-filter-data.h"
41 #include "icing/store/document-id.h"
42 #include "icing/store/key-mapper.h"
43 #include "icing/store/namespace-id.h"
44 #include "icing/store/usage-store.h"
45 #include "icing/tokenization/language-segmenter.h"
46 #include "icing/util/clock.h"
47 #include "icing/util/crc32.h"
48 #include "icing/util/data-loss.h"
49 #include "icing/util/document-validator.h"
50 
51 namespace icing {
52 namespace lib {
53 
54 // Provides storage interfaces for documents.
55 class DocumentStore {
56  public:
57   struct Header {
58     static constexpr int32_t kMagic = 0x746f7265;
59 
60     // Holds the magic as a quick sanity check against file corruption.
61     int32_t magic;
62 
63     // Checksum of the DocumentStore's sub-component's checksums.
64     uint32_t checksum;
65   };
66 
67   struct OptimizeInfo {
68     // The estimated size in bytes of the optimizable docs. We don't track the
69     // size of each document, so we estimate by taking the size of the entire
70     // DocumentStore and dividing that by the total number of documents we have.
71     // So we end up with an average document size.
72     int64_t estimated_optimizable_bytes = 0;
73 
74     // Number of total documents the DocumentStore tracks.
75     int32_t total_docs = 0;
76 
77     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
78     int32_t optimizable_docs = 0;
79   };
80 
81   struct DeleteByGroupResult {
82     // Status representing whether or not the operation succeeded. See the
83     // comments above the function that returns this result to determine what
84     // possible statuses could be returned.
85     libtextclassifier3::Status status;
86 
87     int num_docs_deleted = 0;
88   };
89 
90   struct CreateResult {
91     // A successfully initialized document store.
92     std::unique_ptr<DocumentStore> document_store;
93 
94     // The data status after initializing from a previous state. Data loss can
95     // happen if the file is corrupted or some previously added data was
96     // unpersisted. This may be used to signal that any derived data off of the
97     // document store may need to be regenerated.
98     DataLoss data_loss;
99   };
100 
101   // Not copyable
102   DocumentStore(const DocumentStore&) = delete;
103   DocumentStore& operator=(const DocumentStore&) = delete;
104 
105   // Persists and updates checksum of subcomponents.
106   ~DocumentStore();
107 
108   // Factory method to create, initialize, and return a DocumentStore. The base
109   // directory is used to persist document store files. If document store was
110   // previously initialized with this directory, it will reload the files saved
111   // by the last instance.
112   //
113   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
114   // the derived files and validate each document while recreating them. This
115   // can be used to indicate that the schema (and type ids) may have changed and
116   // those changes might not have been applied to the document store.
117   //
118   // If initialize_stats is present, the fields related to DocumentStore will be
119   // populated.
120   //
121   // Does not take any ownership, and all pointers except initialize_stats must
122   // refer to valid objects that outlive the one constructed.
123   //
124   // TODO(cassiewang): Consider returning a status indicating that derived files
125   // were regenerated. This may be helpful in logs.
126   //
127   // Returns:
128   //   A DocumentStore::CreateResult on success
129   //   FAILED_PRECONDITION on any null pointer input
130   //   INTERNAL_ERROR on IO error
131   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
132       const Filesystem* filesystem, const std::string& base_dir,
133       const Clock* clock, const SchemaStore* schema_store,
134       bool force_recovery_and_revalidate_documents = false,
135       InitializeStatsProto* initialize_stats = nullptr);
136 
137   // Returns the maximum DocumentId that the DocumentStore has assigned. If
138   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
139   // empty, then kInvalidDocumentId is returned. This does not filter out
140   // DocumentIds of deleted or expired documents.
last_added_document_id()141   DocumentId last_added_document_id() const {
142     if (document_id_mapper_->num_elements() == 0) {
143       return kInvalidDocumentId;
144     }
145     return document_id_mapper_->num_elements() - 1;
146   }
147 
148   // Returns the number of documents. The result does not filter out DocumentIds
149   // of deleted or expired documents.
num_documents()150   int num_documents() const { return document_id_mapper_->num_elements(); }
151 
152   // Puts the document into document store.
153   //
154   // If put_document_stats is present, the fields related to DocumentStore will
155   // be populated.
156   //
157   // Returns:
158   //   A newly generated document id on success
  //   RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents
160   //   FAILED_PRECONDITION if schema hasn't been set yet
161   //   NOT_FOUND if the schema_type or a property config of the document doesn't
162   //     exist in schema
163   //   INTERNAL_ERROR on IO error
164   libtextclassifier3::StatusOr<DocumentId> Put(
165       const DocumentProto& document, int32_t num_tokens = 0,
166       PutDocumentStatsProto* put_document_stats = nullptr);
167   libtextclassifier3::StatusOr<DocumentId> Put(
168       DocumentProto&& document, int32_t num_tokens = 0,
169       PutDocumentStatsProto* put_document_stats = nullptr);
170 
171   // Finds and returns the document identified by the given key (namespace +
172   // uri). If 'clear_internal_fields' is true, document level data that's
173   // generated internally by DocumentStore is cleared.
174   //
175   // Returns:
176   //   The document found on success
177   //   NOT_FOUND if the key doesn't exist or document has been deleted
178   //   INTERNAL_ERROR on IO error
179   libtextclassifier3::StatusOr<DocumentProto> Get(
180       std::string_view name_space, std::string_view uri,
181       bool clear_internal_fields = true) const;
182 
183   // Finds and returns the document identified by the given document id. If
184   // 'clear_internal_fields' is true, document level data that's generated
185   // internally by DocumentStore is cleared.
186   //
187   // Returns:
188   //   The document found on success
189   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
190   //                    maximum value
191   //   NOT_FOUND if the document doesn't exist or has been deleted
192   //   INTERNAL_ERROR on IO error
193   libtextclassifier3::StatusOr<DocumentProto> Get(
194       DocumentId document_id, bool clear_internal_fields = true) const;
195 
196   // Returns all namespaces which have at least 1 active document (not deleted
197   // or expired). Order of namespaces is undefined.
198   std::vector<std::string> GetAllNamespaces() const;
199 
200   // Check if a document exists. Existence means it hasn't been deleted and it
201   // hasn't expired yet.
202   //
203   // NOTE: This should be used when callers don't care about error messages,
204   // expect documents to be deleted/not found, or in frequently called code
  // paths that could cause performance issues. A significant amount of CPU
206   // cycles can be saved if we don't construct strings and create new Status
207   // objects on the heap. See b/185822483.
208   //
209   // Returns:
210   //   boolean whether a document exists or not
211   bool DoesDocumentExist(DocumentId document_id) const;
212 
213   // Deletes the document identified by the given namespace and uri. The
214   // document proto will be erased immediately.
215   //
216   // NOTE:
217   //    Space is not reclaimed for deleted documents until Optimize() is
218   //    called.
219   //
220   // Returns:
221   //   OK on success
222   //   NOT_FOUND if no document exists with namespace, uri
223   //   INTERNAL_ERROR on IO error
224   libtextclassifier3::Status Delete(std::string_view name_space,
225                                     std::string_view uri);
226 
227   // Deletes the document identified by the given document_id. The document
228   // proto will be erased immediately.
229   //
230   // NOTE:
231   //    Space is not reclaimed for deleted documents until Optimize() is
232   //    called.
233   //
234   // Returns:
235   //   OK on success
236   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
237   //   INTERNAL_ERROR on IO error
238   //   INVALID_ARGUMENT if document_id is invalid.
239   libtextclassifier3::Status Delete(DocumentId document_id);
240 
241   // Returns the NamespaceId of the string namespace
242   //
243   // Returns:
244   //   NamespaceId on success
245   //   NOT_FOUND if the namespace doesn't exist
246   //   INTERNAL_ERROR on IO error
247   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
248       std::string_view name_space) const;
249 
250   // Returns the CorpusId associated with the given namespace and schema.
251   //
252   // Returns:
253   //   A CorpusId on success
254   //   NOT_FOUND if the key doesn't exist
255   //   INTERNAL_ERROR on IO error
256   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
257       const std::string_view name_space, const std::string_view schema) const;
258 
259   // Returns the DocumentAssociatedScoreData of the document specified by the
260   // DocumentId.
261   //
262   // Returns:
263   //   DocumentAssociatedScoreData on success
264   //   NOT_FOUND if the document or the score data is not found
265   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
266   GetDocumentAssociatedScoreData(DocumentId document_id) const;
267 
268   // Returns the CorpusAssociatedScoreData of the corpus specified by the
269   // corpus_id.
270   //
271   // NOTE: This does not check if the corpus exists and will return the
272   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
273   // that corpus have been deleted.
274   //
275   // Returns:
276   //   CorpusAssociatedScoreData on success
277   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
278   //                CorpusIds
279   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
280   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
281 
282   // Returns the DocumentFilterData of the document specified by the DocumentId.
283   //
284   // Returns:
285   //   DocumentFilterData on success
286   //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
287   //                DocumentIds
288   //   NOT_FOUND if the document or the filter data is not found
289   libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
290       DocumentId document_id) const;
291 
292   // Gets the usage scores of a document.
293   //
294   // Returns:
295   //   UsageScores on success
296   //   NOT_FOUND if document_id no longer exists.
297   //   INVALID_ARGUMENT if document_id is invalid
298   libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores(
299       DocumentId document_id) const;
300 
301   // Reports usage. The corresponding usage scores of the specified document in
302   // the report will be updated.
303   //
304   // Returns:
305   //   OK on success
  //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
307   //   INTERNAL_ERROR on I/O errors.
308   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
309 
310   // Deletes all documents belonging to the given namespace. The documents will
311   // be erased immediately.
312   //
313   // NOTE:
314   //    Space is not reclaimed for deleted documents until Optimize() is
315   //    called.
316   //
317   // Returns:
318   //   OK on success
319   //   NOT_FOUND if namespace doesn't exist
320   //   INTERNAL_ERROR on IO error
321   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
322 
323   // Deletes all documents belonging to the given schema type. The documents
324   // will be erased immediately.
325   //
326   // NOTE:
327   //    Space is not reclaimed for deleted documents until Optimize() is
328   //    called.
329   //
330   // Returns:
331   //   OK on success
332   //   NOT_FOUND if schema_type doesn't exist
333   //   INTERNAL_ERROR on IO error
334   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
335 
336   // Syncs all the data and metadata changes to disk.
337   //
338   // Returns:
339   //   OK on success
340   //   INTERNAL on I/O error
341   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
342 
343   // Calculates the StorageInfo for the Document Store.
344   //
345   // If an IO error occurs while trying to calculate the value for a field, then
346   // that field will be set to -1.
347   DocumentStorageInfoProto GetStorageInfo() const;
348 
349   // Update any derived data off of the SchemaStore with the new SchemaStore.
350   // This may include pointers, SchemaTypeIds, etc.
351   //
352   // NOTE: This function may delete documents. A document may be invalidated by
353   // the new SchemaStore, such as failing validation or having its schema type
354   // deleted from the schema.
355   //
356   // This is best used if the caller is unsure about what's changed in the
357   // SchemaStore, and wants to update all information no matter what. If the
358   // caller does know what has changed, then it's recommended to call
359   // OptimizedUpdateSchemaStore.
360   //
  // Returns:
362   //   OK on success
363   //   INTERNAL_ERROR on IO error
364   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
365 
  // Performs the same functionality as UpdateSchemaStore, but this can be more
367   // optimized in terms of less disk reads and less work if we know exactly
368   // what's changed between the old and new SchemaStore.
369   //
  // Returns:
371   //   OK on success
372   //   INTERNAL_ERROR on IO error
373   libtextclassifier3::Status OptimizedUpdateSchemaStore(
374       const SchemaStore* schema_store,
375       const SchemaStore::SetSchemaResult& set_schema_result);
376 
377   // Reduces internal file sizes by reclaiming space of deleted documents and
378   // regenerating derived files.
379   //
380   // NOTE: The tasks in this method are too expensive to be executed in
381   // real-time. The caller should decide how frequently and when to call this
382   // method based on device usage.
383   //
384   // Returns:
385   //   OK on success
386   //   INTERNAL_ERROR on IO error
387   libtextclassifier3::Status Optimize();
388 
389   // Copy data from current base directory into a new directory. Any outdated or
390   // deleted data won't be copied. During the process, document ids will be
391   // reassigned so any files / classes that are based on old document ids may be
392   // outdated.
393   //
394   // stats will be set if non-null.
395   //
396   // NOTE: The tasks in this method are too expensive to be executed in
397   // real-time. The caller should decide how frequently and when to call this
398   // method based on device usage.
399   //
400   // Returns:
401   //   OK on success
402   //   INVALID_ARGUMENT if new_directory is same as current base directory
403   //   INTERNAL_ERROR on IO error
404   libtextclassifier3::Status OptimizeInto(
405       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
406       OptimizeStatsProto* stats = nullptr);
407 
  // Calculates status for a potential Optimize call. Includes how many docs
  // there are vs how many would be optimized away. Also includes the
  // estimated size gain, in bytes, if Optimize were called.
411   //
412   // Returns:
413   //   OptimizeInfo on success
414   //   INTERNAL_ERROR on IO error
415   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
416 
417   // Computes the combined checksum of the document store - includes the ground
418   // truth and all derived files.
419   //
420   // Returns:
421   //   Combined checksum on success
422   //   INTERNAL_ERROR on compute error
423   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
424 
425  private:
426   // Use DocumentStore::Create() to instantiate.
427   DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
428                 const Clock* clock, const SchemaStore* schema_store);
429 
430   const Filesystem* const filesystem_;
431   const std::string base_dir_;
432   const Clock& clock_;
433 
434   // Handles the ground truth schema and all of the derived data off of the
435   // schema
436   const SchemaStore* schema_store_;
437 
438   // Used to validate incoming documents
439   DocumentValidator document_validator_;
440 
441   // A log used to store all documents, it serves as a ground truth of doc
442   // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
443   std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
444 
445   // Key (namespace + uri) to DocumentId mapping
446   std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_;
447 
448   // DocumentId to file offset mapping
449   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
450 
451   // A cache of document associated scores. The ground truth of the scores is
452   // DocumentProto stored in document_log_. This cache contains:
453   //   - CorpusId
454   //   - Document score
455   //   - Document creation timestamp in seconds
456   //   - Document length in number of tokens
457   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
458 
459   // A cache of data, indexed by DocumentId, used to filter documents. Currently
460   // contains:
461   //   - NamespaceId
462   //   - SchemaTypeId
463   //   - Expiration timestamp in seconds
464   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
465 
466   // A cache of corpus associated scores. The ground truth of the scores is
467   // DocumentProto stored in document_log_. This cache contains:
468   //   - Number of documents belonging to the corpus score
469   //   - The sum of the documents' lengths, in number of tokens.
470   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
471       corpus_score_cache_;
472 
473   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
474   // id when the first document belonging to that namespace is added to the
475   // DocumentStore. Namespaces may be removed from the mapper during compaction.
476   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
477 
  // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
  // unique id. A corpus is assigned an id when the first document belonging to
  // that corpus is added to the DocumentStore. Corpus ids may be removed from
  // the mapper during compaction.
482   std::unique_ptr<KeyMapper<CorpusId>> corpus_mapper_;
483 
484   // A storage class that caches all usage scores. Usage scores are not
485   // considered as ground truth. Usage scores are associated with document ids
486   // so they need to be updated when document ids change.
487   std::unique_ptr<UsageStore> usage_store_;
488 
489   // Used internally to indicate whether the class has been initialized. This is
490   // to guard against cases where the object has been created, but Initialize
491   // fails in the constructor. If we have successfully exited the constructor,
492   // then this field can be ignored. Clients of DocumentStore should not need to
493   // worry about this field.
494   bool initialized_ = false;
495 
496   libtextclassifier3::StatusOr<DataLoss> Initialize(
497       bool force_recovery_and_revalidate_documents,
498       InitializeStatsProto* initialize_stats);
499 
500   // Initializes a new DocumentStore and sets up any underlying files.
501   //
502   // Returns:
503   //   Data loss status on success, effectively always DataLoss::NONE
504   //   INTERNAL on I/O error
505   libtextclassifier3::StatusOr<DataLoss> InitializeNewStore(
506       InitializeStatsProto* initialize_stats);
507 
508   // Initializes a DocumentStore over an existing directory of files.
509   //
510   // stats will be set if non-null
511   //
512   // Returns:
513   //   Data loss status on success
514   //   INTERNAL on I/O error
515   libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore(
516       bool force_recovery_and_revalidate_documents,
517       InitializeStatsProto* initialize_stats);
518 
519   libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1(
520       InitializeStatsProto* initialize_stats);
521 
522   // Creates sub-components and verifies the integrity of each sub-component.
  // This assumes that the underlying files already exist, and will return
524   // an error if it doesn't find what it's expecting.
525   //
526   // Returns an error if subcomponents failed to initialize successfully.
527   //   INTERNAL_ERROR on IO error
528   libtextclassifier3::Status InitializeExistingDerivedFiles();
529 
530   // Re-generates all files derived from the ground truth: the document log.
531   //
  // revalidate_documents=true will also cause each document to be revalidated
  // against the schema as it is read out of the document log.
534   //
535   // NOTE: if this function fails, the only thing we can do is to retry it until
536   // it succeeds or prevent the initialization of a DocumentStore. The
537   // DocumentStore object wouldn't work reliably if this fails.
538   //
539   // Steps:
540   //   1. Delete all derived files.
  //   2. Iterate through document log, put data into new key mapper and
  //      document_id mapper.
544   //   3. Create header and store the updated combined checksum
545   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
546 
547   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
  // file, and re-creates a new instance of the document_key_mapper.
549   //
550   // Returns OK or any IO errors.
551   libtextclassifier3::Status ResetDocumentKeyMapper();
552 
553   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
554   // file, and re-creates a new instance of the document_id_mapper.
555   //
556   // Returns OK or any IO errors.
557   libtextclassifier3::Status ResetDocumentIdMapper();
558 
559   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
560   // re-creates a new instance of the score_cache.
561   //
562   // Returns OK or any IO errors.
563   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
564 
565   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
566   // file, and re-creates a new instance of the corpus_score_cache.
567   //
568   // Returns OK or any IO errors.
569   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
570 
571   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
572   // re-creates a new instance of the filter_cache.
573   //
574   // Returns OK or any IO errors.
575   libtextclassifier3::Status ResetFilterCache();
576 
577   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
578   // and re-creates a new instance of the namespace_mapper.
579   //
580   // Returns OK or any IO errors.
581   libtextclassifier3::Status ResetNamespaceMapper();
582 
583   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
584   // and re-creates a new instance of the corpus_mapper.
585   //
586   // Returns OK or any IO errors.
587   libtextclassifier3::Status ResetCorpusMapper();
588 
589   // Checks if the header exists already. This does not create the header file
590   // if it doesn't exist.
591   bool HeaderExists();
592 
593   // Update, replace and persist the header file. Creates the header file if it
594   // doesn't exist.
595   //
596   // Returns:
597   //   OK on success
598   //   INTERNAL on I/O error
599   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
600 
601   libtextclassifier3::StatusOr<DocumentId> InternalPut(
602       DocumentProto& document,
603       PutDocumentStatsProto* put_document_stats = nullptr);
604 
605   // Helper function to do batch deletes. Documents with the given
606   // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
607   // to specify the namespace or schema type, pass in kInvalidNamespaceId or
608   // kInvalidSchemaTypeId. The document protos with their derived data will be
609   // erased / cleared immediately.
610   //
611   // NOTE: Space is not reclaimed in the derived files until Optimize() is
612   // called.
613   //
614   // Returns:
615   //   Number of documents that were actually updated to be deleted
616   //   INTERNAL_ERROR on IO error
617   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
618                                                 SchemaTypeId schema_type_id);
619 
620   // Helper method to find a DocumentId that is associated with the given
621   // namespace and uri.
622   //
  // NOTE: The DocumentId may refer to an invalid document (deleted
624   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
625   // refers to a valid Document.
626   //
627   // Returns:
628   //   A DocumentId on success
629   //   NOT_FOUND if the key doesn't exist
630   //   INTERNAL_ERROR on IO error
631   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
632       std::string_view name_space, std::string_view uri) const;
633 
634   // Returns the CorpusAssociatedScoreData of the corpus specified by the
635   // corpus_id.
636   //
637   // If the corpus_id has never been seen before, it returns a
638   // CorpusAssociatedScoreData with properties set to default values.
639   //
640   // NOTE: This does not check if the corpus exists and will return the
641   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
642   // that corpus have been deleted.
643   //
644   // Returns:
645   //   CorpusAssociatedScoreData on success
646   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
647   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
648 
649   // Check if a document exists. Existence means it hasn't been deleted and it
650   // hasn't expired yet.
651   //
652   // Returns:
653   //   OK if the document exists
654   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
655   //                    maximum value
656   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
657   //   INTERNAL_ERROR on IO error
658   libtextclassifier3::Status DoesDocumentExistWithStatus(
659       DocumentId document_id) const;
660 
661   // Check if a document exists. Existence means it hasn't been deleted and it
662   // hasn't expired yet.
663   //
664   // This is for internal-use only because we assume that the document_id is
665   // already valid. If you're unsure if the document_id is valid, use
666   // DoesDocumentExist(document_id) instead, which will perform those additional
667   // checks.
668   //
669   // Returns:
670   //   boolean whether a document exists or not
671   bool InternalDoesDocumentExist(DocumentId document_id) const;
672 
673   // Checks if a document has been deleted
674   //
675   // This is for internal-use only because we assume that the document_id is
676   // already valid. If you're unsure if the document_id is valid, use
677   // DoesDocumentExist(document_id) instead, which will perform those additional
678   // checks.
679   bool IsDeleted(DocumentId document_id) const;
680 
681   // Checks if a document has expired.
682   //
683   // This is for internal-use only because we assume that the document_id is
684   // already valid. If you're unsure if the document_id is valid, use
685   // DoesDocumentExist(document_id) instead, which will perform those additional
686   // checks.
687   bool IsExpired(DocumentId document_id) const;
688 
689   // Updates the entry in the score cache for document_id.
690   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
691       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
692 
693   // Updates the entry in the corpus score cache for corpus_id.
694   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
695       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
696 
697   // Updates the entry in the filter cache for document_id.
698   libtextclassifier3::Status UpdateFilterCache(
699       DocumentId document_id, const DocumentFilterData& filter_data);
700 
701   // Helper method to clear the derived data of a document
702   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
703 
704   // Sets usage scores for the given document.
705   libtextclassifier3::Status SetUsageScores(
706       DocumentId document_id, const UsageStore::UsageScores& usage_scores);
707 
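  // Returns:
  //   - on success, a DocumentStorageInfoProto with the fields relating to the
  //     size of Document Store member variables populated.
  //   - NOTE(review): the return type is a plain DocumentStorageInfoProto, so
  //     an INTERNAL status cannot be returned on failure to get a file size.
  //     Presumably the affected field is set to -1 instead, as documented for
  //     GetStorageInfo() - confirm against the implementation.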
712   DocumentStorageInfoProto GetMemberStorageInfo() const;
713 
  // Returns:
  //   - on success, the storage_info that was passed in but with the number of
  //     alive, deleted and expired documents also set.
  //   - NOTE(review): OUT_OF_RANGE is listed as a "should never happen" case
  //     (the document_id_mapper somehow becoming larger than the filter cache),
  //     but the return type carries no status - confirm how the implementation
  //     handles that case.
719   DocumentStorageInfoProto CalculateDocumentStatusCounts(
720       DocumentStorageInfoProto storage_info) const;
721 };
722 
723 }  // namespace lib
724 }  // namespace icing
725 
726 #endif  // ICING_STORE_DOCUMENT_STORE_H_
727