// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 #ifndef ICING_STORE_DOCUMENT_STORE_H_
16 #define ICING_STORE_DOCUMENT_STORE_H_
17 
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/file-backed-proto-log.h"
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/usage.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/store/corpus-associated-scoring-data.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/key-mapper.h"
#include "icing/store/namespace-id.h"
#include "icing/store/usage-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/document-validator.h"
#include "icing/util/fingerprint-util.h"

55 namespace icing {
56 namespace lib {
57 
58 // Provides storage interfaces for documents.
59 class DocumentStore {
60  public:
61   struct Header {
GetCurrentMagicHeader62     static int32_t GetCurrentMagic(bool namespace_id_fingerprint) {
63       return namespace_id_fingerprint ? kNewMagic : kOldMagic;
64     }
65 
66     // Holds the magic as a quick sanity check against file corruption.
67     int32_t magic;
68 
69     // Checksum of the DocumentStore's sub-component's checksums.
70     uint32_t checksum;
71 
72    private:
73     static constexpr int32_t kOldMagic = 0x746f7265;
74     static constexpr int32_t kNewMagic = 0x1b99c8b0;
75   };
76 
77   struct OptimizeInfo {
78     // The estimated size in bytes of the optimizable docs. We don't track the
79     // size of each document, so we estimate by taking the size of the entire
80     // DocumentStore and dividing that by the total number of documents we have.
81     // So we end up with an average document size.
82     int64_t estimated_optimizable_bytes = 0;
83 
84     // Number of total documents the DocumentStore tracks.
85     int32_t total_docs = 0;
86 
87     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
88     int32_t optimizable_docs = 0;
89   };
90 
91   struct DeleteByGroupResult {
92     // Status representing whether or not the operation succeeded. See the
93     // comments above the function that returns this result to determine what
94     // possible statuses could be returned.
95     libtextclassifier3::Status status;
96 
97     int num_docs_deleted = 0;
98   };
99 
100   struct CreateResult {
101     // A successfully initialized document store.
102     std::unique_ptr<DocumentStore> document_store;
103 
104     // The data status after initializing from a previous state. Data loss can
105     // happen if the file is corrupted or some previously added data was
106     // unpersisted. This may be used to signal that any derived data off of the
107     // document store may need to be regenerated.
108     DataLoss data_loss;
109   };
110 
111   // Not copyable
112   DocumentStore(const DocumentStore&) = delete;
113   DocumentStore& operator=(const DocumentStore&) = delete;
114 
115   // Persists and updates checksum of subcomponents.
116   ~DocumentStore();
117 
118   // Factory method to create, initialize, and return a DocumentStore. The base
119   // directory is used to persist document store files. If document store was
120   // previously initialized with this directory, it will reload the files saved
121   // by the last instance.
122   //
123   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
124   // the derived files and validate each document while recreating them. This
125   // can be used to indicate that the schema (and type ids) may have changed and
126   // those changes might not have been applied to the document store.
127   //
128   // If initialize_stats is present, the fields related to DocumentStore will be
129   // populated.
130   //
131   // Does not take any ownership, and all pointers except initialize_stats must
132   // refer to valid objects that outlive the one constructed.
133   //
134   // TODO(cassiewang): Consider returning a status indicating that derived files
135   // were regenerated. This may be helpful in logs.
136   //
137   // Returns:
138   //   A DocumentStore::CreateResult on success
139   //   FAILED_PRECONDITION on any null pointer input
140   //   INTERNAL_ERROR on IO error
141   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
142       const Filesystem* filesystem, const std::string& base_dir,
143       const Clock* clock, const SchemaStore* schema_store,
144       bool force_recovery_and_revalidate_documents,
145       bool namespace_id_fingerprint,
146       int32_t compression_level,
147       InitializeStatsProto* initialize_stats);
148 
149   // Discards all derived data in the document store.
150   //
151   // Returns:
152   //   OK on success or nothing to discard
153   //   INTERNAL_ERROR on any I/O errors
154   static libtextclassifier3::Status DiscardDerivedFiles(
155       const Filesystem* filesystem, const std::string& base_dir);
156 
157   // Returns the maximum DocumentId that the DocumentStore has assigned. If
158   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
159   // empty, then kInvalidDocumentId is returned. This does not filter out
160   // DocumentIds of deleted or expired documents.
last_added_document_id()161   DocumentId last_added_document_id() const {
162     if (document_id_mapper_->num_elements() == 0) {
163       return kInvalidDocumentId;
164     }
165     return document_id_mapper_->num_elements() - 1;
166   }
167 
168   // Returns the number of documents. The result does not filter out DocumentIds
169   // of deleted or expired documents.
num_documents()170   int num_documents() const { return document_id_mapper_->num_elements(); }
171 
172   // Puts the document into document store.
173   //
174   // If put_document_stats is present, the fields related to DocumentStore will
175   // be populated.
176   //
177   // Returns:
178   //   A newly generated document id on success
179   //   RESOURCE_EXHAUSED if exceeds maximum number of allowed documents
180   //   FAILED_PRECONDITION if schema hasn't been set yet
181   //   NOT_FOUND if the schema_type or a property config of the document doesn't
182   //     exist in schema
183   //   INTERNAL_ERROR on IO error
184   libtextclassifier3::StatusOr<DocumentId> Put(
185       const DocumentProto& document, int32_t num_tokens = 0,
186       PutDocumentStatsProto* put_document_stats = nullptr);
187   libtextclassifier3::StatusOr<DocumentId> Put(
188       DocumentProto&& document, int32_t num_tokens = 0,
189       PutDocumentStatsProto* put_document_stats = nullptr);
190 
191   // Finds and returns the document identified by the given key (namespace +
192   // uri). If 'clear_internal_fields' is true, document level data that's
193   // generated internally by DocumentStore is cleared.
194   //
195   // Returns:
196   //   The document found on success
197   //   NOT_FOUND if the key doesn't exist or document has been deleted
198   //   INTERNAL_ERROR on IO error
199   libtextclassifier3::StatusOr<DocumentProto> Get(
200       std::string_view name_space, std::string_view uri,
201       bool clear_internal_fields = true) const;
202 
203   // Finds and returns the document identified by the given document id. If
204   // 'clear_internal_fields' is true, document level data that's generated
205   // internally by DocumentStore is cleared.
206   //
207   // Returns:
208   //   The document found on success
209   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
210   //                    maximum value
211   //   NOT_FOUND if the document doesn't exist or has been deleted
212   //   INTERNAL_ERROR on IO error
213   libtextclassifier3::StatusOr<DocumentProto> Get(
214       DocumentId document_id, bool clear_internal_fields = true) const;
215 
216   // Returns all namespaces which have at least 1 active document (not deleted
217   // or expired). Order of namespaces is undefined.
218   std::vector<std::string> GetAllNamespaces() const;
219 
220   // Deletes the document identified by the given namespace and uri. The
221   // document proto will be erased immediately.
222   //
223   // NOTE:
224   //    Space is not reclaimed for deleted documents until Optimize() is
225   //    called.
226   //
227   // Returns:
228   //   OK on success
229   //   NOT_FOUND if no document exists with namespace, uri
230   //   INTERNAL_ERROR on IO error
231   libtextclassifier3::Status Delete(std::string_view name_space,
232                                     std::string_view uri,
233                                     int64_t current_time_ms);
234 
235   // Deletes the document identified by the given document_id. The document
236   // proto will be erased immediately.
237   //
238   // NOTE:
239   //    Space is not reclaimed for deleted documents until Optimize() is
240   //    called.
241   //
242   // Returns:
243   //   OK on success
244   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
245   //   INTERNAL_ERROR on IO error
246   //   INVALID_ARGUMENT if document_id is invalid.
247   libtextclassifier3::Status Delete(DocumentId document_id,
248                                     int64_t current_time_ms);
249 
250   // Returns the NamespaceId of the string namespace
251   //
252   // Returns:
253   //   NamespaceId on success
254   //   NOT_FOUND if the namespace doesn't exist
255   //   INTERNAL_ERROR on IO error
256   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
257       std::string_view name_space) const;
258 
259   // Helper method to find a DocumentId that is associated with the given
260   // namespace and uri.
261   //
262   // NOTE: The DocumentId may refer to a invalid document (deleted
263   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
264   // refers to a valid Document.
265   //
266   // Returns:
267   //   A DocumentId on success
268   //   NOT_FOUND if the key doesn't exist
269   //   INTERNAL_ERROR on IO error
270   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
271       std::string_view name_space, std::string_view uri) const;
272 
273   // Returns the CorpusId associated with the given namespace and schema.
274   //
275   // Returns:
276   //   A CorpusId on success
277   //   NOT_FOUND if the key doesn't exist
278   //   INTERNAL_ERROR on IO error
279   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
280       const std::string_view name_space, const std::string_view schema) const;
281 
282   // Returns the ResultGroupingEntryId associated with the given namespace
283   // and schema.
284   //
285   // NOTE: ResultGroupingEntryIds that are generated by calls with different
286   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
287   // are only guarenteed to be unique within their own ResultGroupingType.
288   //
289   // Returns:
290   //   A ResultGroupingEntryId on success
291   //   NOT_FOUND if the key doesn't exist
292   //   INTERNAL_ERROR on IO error
293   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
294       ResultSpecProto::ResultGroupingType result_group_type,
295       const std::string_view name_space, const std::string_view schema) const;
296 
297   // Returns the ResultGrouping Entry Id associated with the given NamespaceId
298   // and SchemaTypeId
299   //
300   // NOTE: ResultGroupingEntryIds that are generated by calls with different
301   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
302   // are only guarenteed to be unique within their own ResultGroupingType.
303   //
304   // Returns:
305   //   A ResultGroupingEntryId on success
306   //   NOT_FOUND if the key doesn't exist
307   //   INTERNAL_ERROR on IO error
308   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
309       ResultSpecProto::ResultGroupingType result_group_type,
310       const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const;
311 
312   // Returns the DocumentAssociatedScoreData of the document specified by the
313   // DocumentId.
314   //
315   // Returns:
316   //   DocumentAssociatedScoreData on success
317   //   NOT_FOUND if the document or the score data is not found
318   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
319   GetDocumentAssociatedScoreData(DocumentId document_id) const;
320 
321   // Returns the CorpusAssociatedScoreData of the corpus specified by the
322   // corpus_id.
323   //
324   // NOTE: This does not check if the corpus exists and will return the
325   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
326   // that corpus have been deleted.
327   //
328   // Returns:
329   //   CorpusAssociatedScoreData on success
330   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
331   //                CorpusIds
332   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
333   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
334 
335   // Gets the document filter data if a document exists. Otherwise, will get a
336   // false optional.
337   //
338   // Existence means it hasn't been deleted and it hasn't expired yet.
339   //
340   // Returns:
341   //   True:DocumentFilterData  if the given document exists.
342   //   False                    if the given document doesn't exist.
343   std::optional<DocumentFilterData> GetAliveDocumentFilterData(
344       DocumentId document_id, int64_t current_time_ms) const;
345 
346   // Gets the usage scores of a document.
347   //
348   // Returns:
349   //   UsageScores on success
350   //   nullopt if there are no usage scores stored for the requested docid.
351   std::optional<UsageStore::UsageScores> GetUsageScores(
352       DocumentId document_id, int64_t current_time_ms) const;
353 
354   // Reports usage. The corresponding usage scores of the specified document in
355   // the report will be updated.
356   //
357   // Returns:
358   //   OK on success
359   //   NOT_FOUND if the [namesapce + uri] key in the report doesn't exist
360   //   INTERNAL_ERROR on I/O errors.
361   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
362 
363   // Deletes all documents belonging to the given namespace. The documents will
364   // be erased immediately.
365   //
366   // NOTE:
367   //    Space is not reclaimed for deleted documents until Optimize() is
368   //    called.
369   //
370   // Returns:
371   //   OK on success
372   //   NOT_FOUND if namespace doesn't exist
373   //   INTERNAL_ERROR on IO error
374   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
375 
376   // Deletes all documents belonging to the given schema type. The documents
377   // will be erased immediately.
378   //
379   // NOTE:
380   //    Space is not reclaimed for deleted documents until Optimize() is
381   //    called.
382   //
383   // Returns:
384   //   OK on success
385   //   NOT_FOUND if schema_type doesn't exist
386   //   INTERNAL_ERROR on IO error
387   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
388 
389   // Syncs all the data and metadata changes to disk.
390   //
391   // Returns:
392   //   OK on success
393   //   INTERNAL on I/O error
394   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
395 
396   // Calculates the StorageInfo for the Document Store.
397   //
398   // If an IO error occurs while trying to calculate the value for a field, then
399   // that field will be set to -1.
400   DocumentStorageInfoProto GetStorageInfo() const;
401 
402   // Update any derived data off of the SchemaStore with the new SchemaStore.
403   // This may include pointers, SchemaTypeIds, etc.
404   //
405   // NOTE: This function may delete documents. A document may be invalidated by
406   // the new SchemaStore, such as failing validation or having its schema type
407   // deleted from the schema.
408   //
409   // This is best used if the caller is unsure about what's changed in the
410   // SchemaStore, and wants to update all information no matter what. If the
411   // caller does know what has changed, then it's recommended to call
412   // OptimizedUpdateSchemaStore.
413   //
414   // Returns;
415   //   OK on success
416   //   INTERNAL_ERROR on IO error
417   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
418 
419   // Performs the same funtionality as UpdateSchemaStore, but this can be more
420   // optimized in terms of less disk reads and less work if we know exactly
421   // what's changed between the old and new SchemaStore.
422   //
423   // Returns;
424   //   OK on success
425   //   INTERNAL_ERROR on IO error
426   libtextclassifier3::Status OptimizedUpdateSchemaStore(
427       const SchemaStore* schema_store,
428       const SchemaStore::SetSchemaResult& set_schema_result);
429 
430   // Reduces internal file sizes by reclaiming space of deleted documents and
431   // regenerating derived files.
432   //
433   // NOTE: The tasks in this method are too expensive to be executed in
434   // real-time. The caller should decide how frequently and when to call this
435   // method based on device usage.
436   //
437   // Returns:
438   //   OK on success
439   //   INTERNAL_ERROR on IO error
440   libtextclassifier3::Status Optimize();
441 
442   // Copy data from current base directory into a new directory. Any outdated or
443   // deleted data won't be copied. During the process, document ids will be
444   // reassigned so any files / classes that are based on old document ids may be
445   // outdated.
446   //
447   // stats will be set if non-null.
448   //
449   // NOTE: The tasks in this method are too expensive to be executed in
450   // real-time. The caller should decide how frequently and when to call this
451   // method based on device usage.
452   //
453   // Returns:
454   //   A vector that maps from old document id to new document id on success
455   //   INVALID_ARGUMENT if new_directory is same as current base directory
456   //   INTERNAL_ERROR on IO error
457   libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeInto(
458       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
459       bool namespace_id_fingerprint, OptimizeStatsProto* stats = nullptr);
460 
461   // Calculates status for a potential Optimize call. Includes how many docs
462   // there are vs how many would be optimized away. And also includes an
463   // estimated size gains, in bytes, if Optimize were called.
464   //
465   // Returns:
466   //   OptimizeInfo on success
467   //   INTERNAL_ERROR on IO error
468   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
469 
470   // Computes the combined checksum of the document store - includes the ground
471   // truth and all derived files.
472   //
473   // Returns:
474   //   Combined checksum on success
475   //   INTERNAL_ERROR on compute error
476   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
477 
478   // Get debug information for the document store.
479   // verbosity <= 0, simplest debug information
480   // verbosity > 0, also return the total number of documents and tokens in each
481   // (namespace, schema type) pair.
482   //
483   // Returns:
484   //   DocumentDebugInfoProto on success
485   //   INTERNAL_ERROR on IO errors, crc compute error
486   libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
487       int verbosity) const;
488 
489  private:
490   // Use DocumentStore::Create() to instantiate.
491   DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
492                 const Clock* clock, const SchemaStore* schema_store,
493                 bool namespace_id_fingerprint, int32_t compression_level);
494 
495   const Filesystem* const filesystem_;
496   const std::string base_dir_;
497   const Clock& clock_;
498 
499   // Handles the ground truth schema and all of the derived data off of the
500   // schema
501   const SchemaStore* schema_store_;
502 
503   // Used to validate incoming documents
504   DocumentValidator document_validator_;
505 
506   // Whether to use namespace id or namespace name to build up fingerprint for
507   // document_key_mapper_ and corpus_mapper_.
508   bool namespace_id_fingerprint_;
509 
510   const int32_t compression_level_;
511 
512   // A log used to store all documents, it serves as a ground truth of doc
513   // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
514   std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
515 
516   // Key (namespace + uri) to DocumentId mapping
517   std::unique_ptr<
518       KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>
519       document_key_mapper_;
520 
521   // DocumentId to file offset mapping
522   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
523 
524   // A cache of document associated scores. The ground truth of the scores is
525   // DocumentProto stored in document_log_. This cache contains:
526   //   - CorpusId
527   //   - Document score
528   //   - Document creation timestamp in seconds
529   //   - Document length in number of tokens
530   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
531 
532   // A cache of data, indexed by DocumentId, used to filter documents. Currently
533   // contains:
534   //   - NamespaceId
535   //   - SchemaTypeId
536   //   - Expiration timestamp in seconds
537   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
538 
539   // A cache of corpus associated scores. The ground truth of the scores is
540   // DocumentProto stored in document_log_. This cache contains:
541   //   - Number of documents belonging to the corpus score
542   //   - The sum of the documents' lengths, in number of tokens.
543   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
544       corpus_score_cache_;
545 
546   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
547   // id when the first document belonging to that namespace is added to the
548   // DocumentStore. Namespaces may be removed from the mapper during compaction.
549   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
550 
551   // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
552   // unique id. A coprus is assigned an
553   // id when the first document belonging to that corpus is added to the
554   // DocumentStore. Corpus ids may be removed from the mapper during compaction.
555   std::unique_ptr<
556       KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>>
557       corpus_mapper_;
558 
559   // A storage class that caches all usage scores. Usage scores are not
560   // considered as ground truth. Usage scores are associated with document ids
561   // so they need to be updated when document ids change.
562   std::unique_ptr<UsageStore> usage_store_;
563 
564   // Used internally to indicate whether the class has been initialized. This is
565   // to guard against cases where the object has been created, but Initialize
566   // fails in the constructor. If we have successfully exited the constructor,
567   // then this field can be ignored. Clients of DocumentStore should not need to
568   // worry about this field.
569   bool initialized_ = false;
570 
571   libtextclassifier3::StatusOr<DataLoss> Initialize(
572       bool force_recovery_and_revalidate_documents,
573       InitializeStatsProto* initialize_stats);
574 
575   // Creates sub-components and verifies the integrity of each sub-component.
576   // This assumes that the the underlying files already exist, and will return
577   // an error if it doesn't find what it's expecting.
578   //
579   // Returns an error if subcomponents failed to initialize successfully.
580   //   INTERNAL_ERROR on IO error
581   libtextclassifier3::Status InitializeExistingDerivedFiles();
582 
583   // Re-generates all files derived from the ground truth: the document log.
584   //
585   // revalidate_documents=true will also cause each document to be revalidated
586   // the schema as it is read out of the document log.
587   //
588   // NOTE: if this function fails, the only thing we can do is to retry it until
589   // it succeeds or prevent the initialization of a DocumentStore. The
590   // DocumentStore object wouldn't work reliably if this fails.
591   //
592   // Steps:
593   //   1. Delete all derived files.
594   //   2. Iterate through document log, put data into new key mapper and
595   //   document_id
596   //      mapper.
597   //   3. Create header and store the updated combined checksum
598   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
599 
600   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
601   // file, and re-creates a new instance of the document_key_mapper .
602   //
603   // Returns OK or any IO errors.
604   libtextclassifier3::Status ResetDocumentKeyMapper();
605 
606   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
607   // file, and re-creates a new instance of the document_id_mapper.
608   //
609   // Returns OK or any IO errors.
610   libtextclassifier3::Status ResetDocumentIdMapper();
611 
612   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
613   // re-creates a new instance of the score_cache.
614   //
615   // Returns OK or any IO errors.
616   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
617 
618   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
619   // file, and re-creates a new instance of the corpus_score_cache.
620   //
621   // Returns OK or any IO errors.
622   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
623 
624   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
625   // re-creates a new instance of the filter_cache.
626   //
627   // Returns OK or any IO errors.
628   libtextclassifier3::Status ResetFilterCache();
629 
630   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
631   // and re-creates a new instance of the namespace_mapper.
632   //
633   // Returns OK or any IO errors.
634   libtextclassifier3::Status ResetNamespaceMapper();
635 
636   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
637   // and re-creates a new instance of the corpus_mapper.
638   //
639   // Returns OK or any IO errors.
640   libtextclassifier3::Status ResetCorpusMapper();
641 
642   // Checks if the header exists already. This does not create the header file
643   // if it doesn't exist.
644   bool HeaderExists();
645 
646   // Update, replace and persist the header file. Creates the header file if it
647   // doesn't exist.
648   //
649   // Returns:
650   //   OK on success
651   //   INTERNAL on I/O error
652   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
653 
654   libtextclassifier3::StatusOr<DocumentId> InternalPut(
655       DocumentProto&& document,
656       PutDocumentStatsProto* put_document_stats = nullptr);
657 
658   // Helper function to do batch deletes. Documents with the given
659   // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
660   // to specify the namespace or schema type, pass in kInvalidNamespaceId or
661   // kInvalidSchemaTypeId. The document protos with their derived data will be
662   // erased / cleared immediately.
663   //
664   // NOTE: Space is not reclaimed in the derived files until Optimize() is
665   // called.
666   //
667   // Returns:
668   //   Number of documents that were actually updated to be deleted
669   //   INTERNAL_ERROR on IO error
670   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
671                                                 SchemaTypeId schema_type_id);
672 
673   // Returns the CorpusAssociatedScoreData of the corpus specified by the
674   // corpus_id.
675   //
676   // If the corpus_id has never been seen before, it returns a
677   // CorpusAssociatedScoreData with properties set to default values.
678   //
679   // NOTE: This does not check if the corpus exists and will return the
680   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
681   // that corpus have been deleted.
682   //
683   // Returns:
684   //   CorpusAssociatedScoreData on success
685   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
686   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
687 
688   // Check if a document exists. Existence means it hasn't been deleted and it
689   // hasn't expired yet.
690   //
691   // Returns:
692   //   OK if the document exists
693   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
694   //                    maximum value
695   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
696   //   INTERNAL_ERROR on IO error
697   libtextclassifier3::Status DoesDocumentExistWithStatus(
698       DocumentId document_id) const;
699 
700   // Checks if a document has been deleted
701   //
702   // This is for internal-use only because we assume that the document_id is
703   // already valid. If you're unsure if the document_id is valid, use
704   // DoesDocumentExist(document_id) instead, which will perform those additional
705   // checks.
706   bool IsDeleted(DocumentId document_id) const;
707 
708   // Checks if a document has expired.
709   //
710   // This is for internal-use only because we assume that the document_id is
711   // already valid. If you're unsure if the document_id is valid, use
712   // DoesDocumentExist(document_id) instead, which will perform those additional
713   // checks.
714 
715   // Returns:
716   //   True:DocumentFilterData  if the given document isn't expired.
717   //   False                    if the given doesn't document is expired.
718   std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData(
719       DocumentId document_id, int64_t current_time_ms) const;
720 
721   // Updates the entry in the score cache for document_id.
722   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
723       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
724 
725   // Updates the entry in the corpus score cache for corpus_id.
726   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
727       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
728 
729   // Updates the entry in the filter cache for document_id.
730   libtextclassifier3::Status UpdateFilterCache(
731       DocumentId document_id, const DocumentFilterData& filter_data);
732 
733   // Helper method to clear the derived data of a document
734   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
735 
736   // Sets usage scores for the given document.
737   libtextclassifier3::Status SetUsageScores(
738       DocumentId document_id, const UsageStore::UsageScores& usage_scores);
739 
740   // Returns:
741   //   - on success, a DocumentStorageInfoProto with the fields relating to the
742   //     size of Document Store member variables populated.
743   //   - INTERNAL on failure to get file size
744   DocumentStorageInfoProto GetMemberStorageInfo() const;
745 
746   // Returns:
747   //   - on success, the storage_info that was passed in but with the number of
748   //     alive, deleted and expired documents also set.
749   //   - OUT_OF_RANGE, this should never happen. This could only be returned if
750   //     the document_id_mapper somehow became larger than the filter cache.
751   DocumentStorageInfoProto CalculateDocumentStatusCounts(
752       DocumentStorageInfoProto storage_info) const;
753 
754   // Returns:
755   //   - on success, a RepeatedPtrField for CorpusInfo collected.
756   //   - OUT_OF_RANGE, this should never happen.
757   libtextclassifier3::StatusOr<
758       google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
759   CollectCorpusInfo() const;
760 
761   // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_.
762   // Note that namespace_id_fingerprint_ controls the way that a fingerprint is
763   // built.
764   std::string MakeFingerprint(NamespaceId namespace_id,
765                               std::string_view namespace_,
766                               std::string_view uri_or_schema) const;
767 };
768 
769 }  // namespace lib
770 }  // namespace icing
771 
772 #endif  // ICING_STORE_DOCUMENT_STORE_H_
773