• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_DOCUMENT_STORE_H_
16 #define ICING_STORE_DOCUMENT_STORE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/file/file-backed-proto-log.h"
27 #include "icing/file/file-backed-vector.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/portable-file-backed-proto-log.h"
30 #include "icing/proto/debug.pb.h"
31 #include "icing/proto/document.pb.h"
32 #include "icing/proto/document_wrapper.pb.h"
33 #include "icing/proto/logging.pb.h"
34 #include "icing/proto/optimize.pb.h"
35 #include "icing/proto/persist.pb.h"
36 #include "icing/proto/search.pb.h"
37 #include "icing/proto/storage.pb.h"
38 #include "icing/proto/usage.pb.h"
39 #include "icing/schema/schema-store.h"
40 #include "icing/store/corpus-associated-scoring-data.h"
41 #include "icing/store/corpus-id.h"
42 #include "icing/store/document-associated-score-data.h"
43 #include "icing/store/document-filter-data.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/key-mapper.h"
46 #include "icing/store/namespace-fingerprint-identifier.h"
47 #include "icing/store/namespace-id.h"
48 #include "icing/store/usage-store.h"
49 #include "icing/tokenization/language-segmenter.h"
50 #include "icing/util/clock.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/data-loss.h"
53 #include "icing/util/document-validator.h"
54 #include "icing/util/fingerprint-util.h"
55 
56 namespace icing {
57 namespace lib {
58 
59 // Provides storage interfaces for documents.
60 class DocumentStore {
61  public:
62   struct Header {
GetCurrentMagicHeader63     static int32_t GetCurrentMagic(bool namespace_id_fingerprint) {
64       return namespace_id_fingerprint ? kNewMagic : kOldMagic;
65     }
66 
67     // Holds the magic as a quick sanity check against file corruption.
68     int32_t magic;
69 
70     // Checksum of the DocumentStore's sub-component's checksums.
71     uint32_t checksum;
72 
73    private:
74     static constexpr int32_t kOldMagic = 0x746f7265;
75     static constexpr int32_t kNewMagic = 0x1b99c8b0;
76   };
77 
78   struct OptimizeInfo {
79     // The estimated size in bytes of the optimizable docs. We don't track the
80     // size of each document, so we estimate by taking the size of the entire
81     // DocumentStore and dividing that by the total number of documents we have.
82     // So we end up with an average document size.
83     int64_t estimated_optimizable_bytes = 0;
84 
85     // Number of total documents the DocumentStore tracks.
86     int32_t total_docs = 0;
87 
88     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
89     int32_t optimizable_docs = 0;
90   };
91 
92   struct DeleteByGroupResult {
93     // Status representing whether or not the operation succeeded. See the
94     // comments above the function that returns this result to determine what
95     // possible statuses could be returned.
96     libtextclassifier3::Status status;
97 
98     int num_docs_deleted = 0;
99   };
100 
101   struct CreateResult {
102     // A successfully initialized document store.
103     std::unique_ptr<DocumentStore> document_store;
104 
105     // The data status after initializing from a previous state. Data loss can
106     // happen if the file is corrupted or some previously added data was
107     // unpersisted. This may be used to signal that any derived data off of the
108     // document store may need to be regenerated.
109     DataLoss data_loss;
110 
111     // A boolean flag indicating if derived files of the document store have
112     // been regenerated or not. This is usually a signal for callers to detect
113     // if any id assignment has changed (e.g. NamespaceId).
114     bool derived_files_regenerated;
115   };
116 
117   // Not copyable
118   DocumentStore(const DocumentStore&) = delete;
119   DocumentStore& operator=(const DocumentStore&) = delete;
120 
121   // Persists and updates checksum of subcomponents.
122   ~DocumentStore();
123 
124   // Factory method to create, initialize, and return a DocumentStore. The base
125   // directory is used to persist document store files. If document store was
126   // previously initialized with this directory, it will reload the files saved
127   // by the last instance.
128   //
129   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
130   // the derived files and validate each document while recreating them. This
131   // can be used to indicate that the schema (and type ids) may have changed and
132   // those changes might not have been applied to the document store.
133   //
134   // If initialize_stats is present, the fields related to DocumentStore will be
135   // populated.
136   //
137   // Does not take any ownership, and all pointers except initialize_stats must
138   // refer to valid objects that outlive the one constructed.
139   //
140   // TODO(cassiewang): Consider returning a status indicating that derived files
141   // were regenerated. This may be helpful in logs.
142   //
143   // Returns:
144   //   A DocumentStore::CreateResult on success
145   //   FAILED_PRECONDITION on any null pointer input
146   //   INTERNAL_ERROR on IO error
147   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
148       const Filesystem* filesystem, const std::string& base_dir,
149       const Clock* clock, const SchemaStore* schema_store,
150       bool force_recovery_and_revalidate_documents,
151       bool namespace_id_fingerprint, bool pre_mapping_fbv,
152       bool use_persistent_hash_map, int32_t compression_level,
153       InitializeStatsProto* initialize_stats);
154 
155   // Discards all derived data in the document store.
156   //
157   // Returns:
158   //   OK on success or nothing to discard
159   //   INTERNAL_ERROR on any I/O errors
160   static libtextclassifier3::Status DiscardDerivedFiles(
161       const Filesystem* filesystem, const std::string& base_dir);
162 
163   // Returns the maximum DocumentId that the DocumentStore has assigned. If
164   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
165   // empty, then kInvalidDocumentId is returned. This does not filter out
166   // DocumentIds of deleted or expired documents.
last_added_document_id()167   DocumentId last_added_document_id() const {
168     if (document_id_mapper_->num_elements() == 0) {
169       return kInvalidDocumentId;
170     }
171     return document_id_mapper_->num_elements() - 1;
172   }
173 
174   // Returns the number of documents. The result does not filter out DocumentIds
175   // of deleted or expired documents.
num_documents()176   int num_documents() const { return document_id_mapper_->num_elements(); }
177 
178   // Puts the document into document store.
179   //
180   // If put_document_stats is present, the fields related to DocumentStore will
181   // be populated.
182   //
183   // Returns:
184   //   A newly generated document id on success
185   //   RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents
186   //   FAILED_PRECONDITION if schema hasn't been set yet
187   //   NOT_FOUND if the schema_type or a property config of the document doesn't
188   //     exist in schema
189   //   INTERNAL_ERROR on IO error
190   libtextclassifier3::StatusOr<DocumentId> Put(
191       const DocumentProto& document, int32_t num_tokens = 0,
192       PutDocumentStatsProto* put_document_stats = nullptr);
193   libtextclassifier3::StatusOr<DocumentId> Put(
194       DocumentProto&& document, int32_t num_tokens = 0,
195       PutDocumentStatsProto* put_document_stats = nullptr);
196 
197   // Finds and returns the document identified by the given key (namespace +
198   // uri). If 'clear_internal_fields' is true, document level data that's
199   // generated internally by DocumentStore is cleared.
200   //
201   // Returns:
202   //   The document found on success
203   //   NOT_FOUND if the key doesn't exist or document has been deleted
204   //   INTERNAL_ERROR on IO error
205   libtextclassifier3::StatusOr<DocumentProto> Get(
206       std::string_view name_space, std::string_view uri,
207       bool clear_internal_fields = true) const;
208 
209   // Finds and returns the document identified by the given document id. If
210   // 'clear_internal_fields' is true, document level data that's generated
211   // internally by DocumentStore is cleared.
212   //
213   // Returns:
214   //   The document found on success
215   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
216   //                    maximum value
217   //   NOT_FOUND if the document doesn't exist or has been deleted
218   //   INTERNAL_ERROR on IO error
219   libtextclassifier3::StatusOr<DocumentProto> Get(
220       DocumentId document_id, bool clear_internal_fields = true) const;
221 
222   // Returns all namespaces which have at least 1 active document (not deleted
223   // or expired). Order of namespaces is undefined.
224   std::vector<std::string> GetAllNamespaces() const;
225 
226   // Deletes the document identified by the given namespace and uri. The
227   // document proto will be erased immediately.
228   //
229   // NOTE:
230   //    Space is not reclaimed for deleted documents until Optimize() is
231   //    called.
232   //
233   // Returns:
234   //   OK on success
235   //   NOT_FOUND if no document exists with namespace, uri
236   //   INTERNAL_ERROR on IO error
237   libtextclassifier3::Status Delete(std::string_view name_space,
238                                     std::string_view uri,
239                                     int64_t current_time_ms);
240 
241   // Deletes the document identified by the given document_id. The document
242   // proto will be erased immediately.
243   //
244   // NOTE:
245   //    Space is not reclaimed for deleted documents until Optimize() is
246   //    called.
247   //
248   // Returns:
249   //   OK on success
250   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
251   //   INTERNAL_ERROR on IO error
252   //   INVALID_ARGUMENT if document_id is invalid.
253   libtextclassifier3::Status Delete(DocumentId document_id,
254                                     int64_t current_time_ms);
255 
256   // Returns the NamespaceId of the string namespace
257   //
258   // Returns:
259   //   NamespaceId on success
260   //   NOT_FOUND if the namespace doesn't exist
261   //   INTERNAL_ERROR on IO error
262   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
263       std::string_view name_space) const;
264 
265   // Helper method to find a DocumentId that is associated with the given
266   // namespace and uri.
267   //
268   // NOTE: The DocumentId may refer to an invalid document (deleted
269   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
270   // refers to a valid Document.
271   //
272   // Returns:
273   //   A DocumentId on success
274   //   NOT_FOUND if the key doesn't exist
275   //   INTERNAL_ERROR on IO error
276   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
277       std::string_view name_space, std::string_view uri) const;
278 
279   // Helper method to find a DocumentId that is associated with the given
280   // NamespaceFingerprintIdentifier.
281   //
282   // NOTE: The DocumentId may refer to an invalid document (deleted
283   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
284   // refers to a valid Document.
285   //
286   // Returns:
287   //   A DocumentId on success
288   //   NOT_FOUND if the key doesn't exist
289   //   INTERNAL_ERROR on IO error
290   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
291       const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
292       const;
293 
294   // Returns the CorpusId associated with the given namespace and schema.
295   //
296   // Returns:
297   //   A CorpusId on success
298   //   NOT_FOUND if the key doesn't exist
299   //   INTERNAL_ERROR on IO error
300   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
301       const std::string_view name_space, const std::string_view schema) const;
302 
303   // Returns the ResultGroupingEntryId associated with the given namespace
304   // and schema.
305   //
306   // NOTE: ResultGroupingEntryIds that are generated by calls with different
307   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
308   // are only guaranteed to be unique within their own ResultGroupingType.
309   //
310   // Returns:
311   //   A ResultGroupingEntryId on success
312   //   NOT_FOUND if the key doesn't exist
313   //   INTERNAL_ERROR on IO error
314   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
315       ResultSpecProto::ResultGroupingType result_group_type,
316       const std::string_view name_space, const std::string_view schema) const;
317 
318   // Returns the ResultGrouping Entry Id associated with the given NamespaceId
319   // and SchemaTypeId
320   //
321   // NOTE: ResultGroupingEntryIds that are generated by calls with different
322   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
323   // are only guaranteed to be unique within their own ResultGroupingType.
324   //
325   // Returns:
326   //   A ResultGroupingEntryId on success
327   //   NOT_FOUND if the key doesn't exist
328   //   INTERNAL_ERROR on IO error
329   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
330       ResultSpecProto::ResultGroupingType result_group_type,
331       const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const;
332 
333   // Returns the DocumentAssociatedScoreData of the document specified by the
334   // DocumentId.
335   //
336   // Returns:
337   //   DocumentAssociatedScoreData on success
338   //   NOT_FOUND if the document or the score data is not found
339   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
340   GetDocumentAssociatedScoreData(DocumentId document_id) const;
341 
342   // Returns the CorpusAssociatedScoreData of the corpus specified by the
343   // corpus_id.
344   //
345   // NOTE: This does not check if the corpus exists and will return the
346   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
347   // that corpus have been deleted.
348   //
349   // Returns:
350   //   CorpusAssociatedScoreData on success
351   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
352   //                CorpusIds
353   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
354   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
355 
356   // Gets the document filter data if a document exists. Otherwise, will get a
357   // false optional.
358   //
359   // Existence means it hasn't been deleted and it hasn't expired yet.
360   //
361   // Returns:
362   //   True:DocumentFilterData  if the given document exists.
363   //   False                    if the given document doesn't exist.
364   std::optional<DocumentFilterData> GetAliveDocumentFilterData(
365       DocumentId document_id, int64_t current_time_ms) const;
366 
367   // Gets the usage scores of a document.
368   //
369   // Returns:
370   //   UsageScores on success
371   //   nullopt if there are no usage scores stored for the requested docid.
372   std::optional<UsageStore::UsageScores> GetUsageScores(
373       DocumentId document_id, int64_t current_time_ms) const;
374 
375   // Reports usage. The corresponding usage scores of the specified document in
376   // the report will be updated.
377   //
378   // Returns:
379   //   OK on success
380   //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
381   //   INTERNAL_ERROR on I/O errors.
382   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
383 
384   // Deletes all documents belonging to the given namespace. The documents will
385   // be erased immediately.
386   //
387   // NOTE:
388   //    Space is not reclaimed for deleted documents until Optimize() is
389   //    called.
390   //
391   // Returns:
392   //   OK on success
393   //   NOT_FOUND if namespace doesn't exist
394   //   INTERNAL_ERROR on IO error
395   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
396 
397   // Deletes all documents belonging to the given schema type. The documents
398   // will be erased immediately.
399   //
400   // NOTE:
401   //    Space is not reclaimed for deleted documents until Optimize() is
402   //    called.
403   //
404   // Returns:
405   //   OK on success
406   //   NOT_FOUND if schema_type doesn't exist
407   //   INTERNAL_ERROR on IO error
408   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
409 
410   // Syncs all the data and metadata changes to disk.
411   //
412   // Returns:
413   //   OK on success
414   //   INTERNAL on I/O error
415   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
416 
417   // Calculates the StorageInfo for the Document Store.
418   //
419   // If an IO error occurs while trying to calculate the value for a field, then
420   // that field will be set to -1.
421   DocumentStorageInfoProto GetStorageInfo() const;
422 
423   // Update any derived data off of the SchemaStore with the new SchemaStore.
424   // This may include pointers, SchemaTypeIds, etc.
425   //
426   // NOTE: This function may delete documents. A document may be invalidated by
427   // the new SchemaStore, such as failing validation or having its schema type
428   // deleted from the schema.
429   //
430   // This is best used if the caller is unsure about what's changed in the
431   // SchemaStore, and wants to update all information no matter what. If the
432   // caller does know what has changed, then it's recommended to call
433   // OptimizedUpdateSchemaStore.
434   //
435   // Returns:
436   //   OK on success
437   //   INTERNAL_ERROR on IO error
438   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
439 
440   // Performs the same functionality as UpdateSchemaStore, but this can be more
441   // optimized in terms of less disk reads and less work if we know exactly
442   // what's changed between the old and new SchemaStore.
443   //
444   // Returns:
445   //   OK on success
446   //   INTERNAL_ERROR on IO error
447   libtextclassifier3::Status OptimizedUpdateSchemaStore(
448       const SchemaStore* schema_store,
449       const SchemaStore::SetSchemaResult& set_schema_result);
450 
451   // Reduces internal file sizes by reclaiming space of deleted documents and
452   // regenerating derived files.
453   //
454   // NOTE: The tasks in this method are too expensive to be executed in
455   // real-time. The caller should decide how frequently and when to call this
456   // method based on device usage.
457   //
458   // Returns:
459   //   OK on success
460   //   INTERNAL_ERROR on IO error
461   libtextclassifier3::Status Optimize();
462 
463   struct OptimizeResult {
464     // A vector that maps old document id to new document id.
465     std::vector<DocumentId> document_id_old_to_new;
466 
467     // A vector that maps old namespace id to new namespace id. Will be empty if
468     // should_rebuild_index is set to true.
469     std::vector<NamespaceId> namespace_id_old_to_new;
470 
471     // A boolean flag that hints the caller (usually IcingSearchEngine) if it
472     // should rebuild index instead of adopting the id changes via the 2 vectors
473     // above. It will be set to true if finding any id inconsistency.
474     bool should_rebuild_index = false;
475   };
476   // Copy data from current base directory into a new directory. Any outdated or
477   // deleted data won't be copied. During the process, document/namespace ids
478   // will be reassigned so any files / classes that are based on old
479   // document/namespace ids may be outdated.
480   //
481   // stats will be set if non-null.
482   //
483   // NOTE: The tasks in this method are too expensive to be executed in
484   // real-time. The caller should decide how frequently and when to call this
485   // method based on device usage.
486   //
487   // Returns:
488   //   OptimizeResult which contains a vector mapping from old document id to
489   //   new document id and another vector mapping from old namespace id to new
490   //   namespace id, on success
491   //   INVALID_ARGUMENT if new_directory is same as current base directory
492   //   INTERNAL_ERROR on IO error
493   libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto(
494       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
495       OptimizeStatsProto* stats = nullptr) const;
496 
497   // Calculates status for a potential Optimize call. Includes how many docs
498   // there are vs how many would be optimized away. And also includes an
499   // estimated size gains, in bytes, if Optimize were called.
500   //
501   // Returns:
502   //   OptimizeInfo on success
503   //   INTERNAL_ERROR on IO error
504   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
505 
506   // Computes the combined checksum of the document store - includes the ground
507   // truth and all derived files.
508   //
509   // Returns:
510   //   Combined checksum on success
511   //   INTERNAL_ERROR on compute error
512   libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
513 
514   // Get debug information for the document store.
515   // verbosity <= 0, simplest debug information
516   // verbosity > 0, also return the total number of documents and tokens in each
517   // (namespace, schema type) pair.
518   //
519   // Returns:
520   //   DocumentDebugInfoProto on success
521   //   INTERNAL_ERROR on IO errors, crc compute error
522   libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
523       int verbosity) const;
524 
525  private:
526   // Use DocumentStore::Create() to instantiate.
527   explicit DocumentStore(const Filesystem* filesystem,
528                          std::string_view base_dir, const Clock* clock,
529                          const SchemaStore* schema_store,
530                          bool namespace_id_fingerprint, bool pre_mapping_fbv,
531                          bool use_persistent_hash_map,
532                          int32_t compression_level);
533 
534   const Filesystem* const filesystem_;
535   const std::string base_dir_;
536   const Clock& clock_;
537 
538   // Handles the ground truth schema and all of the derived data off of the
539   // schema
540   const SchemaStore* schema_store_;
541 
542   // Used to validate incoming documents
543   DocumentValidator document_validator_;
544 
545   // Whether to use namespace id or namespace name to build up fingerprint for
546   // document_key_mapper_ and corpus_mapper_.
547   bool namespace_id_fingerprint_;
548 
549   // Flag indicating whether to memory-map the max possible file size for the
550   // underlying FileBackedVector before growing the actual file size.
551   bool pre_mapping_fbv_;
552 
553   // Flag indicating whether to use persistent hash map as the key mapper (if
554   // false, then fall back to dynamic trie key mapper). Note: we only use
555   // persistent hash map for uri mapper if it is true.
556   bool use_persistent_hash_map_;
557 
558   const int32_t compression_level_;
559 
560   // A log used to store all documents, it serves as a ground truth of doc
561   // store. document_key_mapper_ and document_id_mapper_ can be rebuilt from it.
562   std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
563 
564   // Key (namespace + uri) to DocumentId mapping
565   std::unique_ptr<
566       KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>
567       document_key_mapper_;
568 
569   // DocumentId to file offset mapping
570   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
571 
572   // A cache of document associated scores. The ground truth of the scores is
573   // DocumentProto stored in document_log_. This cache contains:
574   //   - CorpusId
575   //   - Document score
576   //   - Document creation timestamp in seconds
577   //   - Document length in number of tokens
578   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
579 
580   // A cache of data, indexed by DocumentId, used to filter documents. Currently
581   // contains:
582   //   - NamespaceId
583   //   - SchemaTypeId
584   //   - Expiration timestamp in seconds
585   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
586 
587   // A cache of corpus associated scores. The ground truth of the scores is
588   // DocumentProto stored in document_log_. This cache contains:
589   //   - Number of documents belonging to the corpus score
590   //   - The sum of the documents' lengths, in number of tokens.
591   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
592       corpus_score_cache_;
593 
594   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
595   // id when the first document belonging to that namespace is added to the
596   // DocumentStore. Namespaces may be removed from the mapper during compaction.
597   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
598 
599   // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
600   // unique id. A corpus is assigned an
601   // id when the first document belonging to that corpus is added to the
602   // DocumentStore. Corpus ids may be removed from the mapper during compaction.
603   std::unique_ptr<
604       KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>>
605       corpus_mapper_;
606 
607   // A storage class that caches all usage scores. Usage scores are not
608   // considered as ground truth. Usage scores are associated with document ids
609   // so they need to be updated when document ids change.
610   std::unique_ptr<UsageStore> usage_store_;
611 
612   // Used internally to indicate whether the class has been initialized. This is
613   // to guard against cases where the object has been created, but Initialize
614   // fails in the constructor. If we have successfully exited the constructor,
615   // then this field can be ignored. Clients of DocumentStore should not need to
616   // worry about this field.
617   bool initialized_ = false;
618 
619   struct InitializeResult {
620     DataLoss data_loss;
621 
622     // A boolean flag indicating if derived files of the document store have
623     // been regenerated or not. This is usually a signal for callers to detect
624     // if any id assignment has changed (e.g. NamespaceId).
625     bool derived_files_regenerated;
626   };
627   libtextclassifier3::StatusOr<InitializeResult> Initialize(
628       bool force_recovery_and_revalidate_documents,
629       InitializeStatsProto* initialize_stats);
630 
631   // Creates sub-components and verifies the integrity of each sub-component.
632   // This assumes that the underlying files already exist, and will return
633   // an error if it doesn't find what it's expecting.
634   //
635   // Returns an error if subcomponents failed to initialize successfully.
636   //   INTERNAL_ERROR on IO error
637   libtextclassifier3::Status InitializeExistingDerivedFiles();
638 
639   // Re-generates all files derived from the ground truth: the document log.
640   //
641   // revalidate_documents=true will also cause each document to be revalidated
642   // against the schema as it is read out of the document log.
643   //
644   // NOTE: if this function fails, the only thing we can do is to retry it until
645   // it succeeds or prevent the initialization of a DocumentStore. The
646   // DocumentStore object wouldn't work reliably if this fails.
647   //
648   // Steps:
649   //   1. Delete all derived files.
650   //   2. Iterate through document log, put data into new key mapper and
651   //   document_id
652   //      mapper.
653   //   3. Create header and store the updated combined checksum
654   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
655 
656   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
657   // file, and re-creates a new instance of the document_key_mapper.
658   //
659   // Returns OK or any IO errors.
660   libtextclassifier3::Status ResetDocumentKeyMapper();
661 
662   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
663   // file, and re-creates a new instance of the document_id_mapper.
664   //
665   // Returns OK or any IO errors.
666   libtextclassifier3::Status ResetDocumentIdMapper();
667 
668   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
669   // re-creates a new instance of the score_cache.
670   //
671   // Returns OK or any IO errors.
672   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
673 
674   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
675   // file, and re-creates a new instance of the corpus_score_cache.
676   //
677   // Returns OK or any IO errors.
678   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
679 
680   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
681   // re-creates a new instance of the filter_cache.
682   //
683   // Returns OK or any IO errors.
684   libtextclassifier3::Status ResetFilterCache();
685 
686   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
687   // and re-creates a new instance of the namespace_mapper.
688   //
689   // Returns OK or any IO errors.
690   libtextclassifier3::Status ResetNamespaceMapper();
691 
692   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
693   // and re-creates a new instance of the corpus_mapper.
694   //
695   // Returns OK or any IO errors.
696   libtextclassifier3::Status ResetCorpusMapper();
697 
698   // Checks if the header exists already. This does not create the header file
699   // if it doesn't exist.
700   bool HeaderExists();
701 
702   // Update, replace and persist the header file. Creates the header file if it
703   // doesn't exist.
704   //
705   // Returns:
706   //   OK on success
707   //   INTERNAL on I/O error
708   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
709 
710   libtextclassifier3::StatusOr<DocumentId> InternalPut(
711       DocumentProto&& document,
712       PutDocumentStatsProto* put_document_stats = nullptr);
713 
714   // Helper function to do batch deletes. Documents with the given
715   // "namespace_id" and "schema_type_id" will be deleted. To leave the namespace
716   // or the schema type unspecified, pass in kInvalidNamespaceId or
717   // kInvalidSchemaTypeId respectively. The document protos, together with
718   // their derived data, will be erased / cleared immediately.
719   //
720   // NOTE: Space is not reclaimed in the derived files until Optimize() is
721   // called.
722   //
723   // Returns:
724   //   On success, the number of documents that were actually updated to be deleted
725   //   INTERNAL_ERROR on IO error
726   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
727                                                 SchemaTypeId schema_type_id);
728 
729   // Returns the CorpusAssociatedScoreData of the corpus identified by
730   // corpus_id.
731   //
732   // If the corpus_id has never been seen before, a CorpusAssociatedScoreData
733   // with its properties set to default values is returned instead.
734   //
735   // NOTE: This does not check whether the corpus exists; the
736   // CorpusAssociatedScoreData of the corpus is returned even if all documents
737   // belonging to that corpus have been deleted.
738   //
739   // Returns:
740   //   CorpusAssociatedScoreData on success (no error statuses are listed here)
741   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
742   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
743 
744   // Checks whether a document exists. Existence means that it hasn't been
745   // deleted and that it hasn't expired yet.
746   //
747   // Returns:
748   //   OK if the document exists
749   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
750   //                    maximum value
751   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
752   //   INTERNAL_ERROR on IO error
753   libtextclassifier3::Status DoesDocumentExistWithStatus(
754       DocumentId document_id) const;
755 
756   // Checks if the document identified by document_id has been deleted.
757   //
758   // This is for internal use only because we assume that the document_id is
759   // already valid. If you're unsure whether the document_id is valid, use
760   // DoesDocumentExist(document_id) instead, which will perform those additional
761   // checks.
762   bool IsDeleted(DocumentId document_id) const;
763 
764   // Checks if a document has expired and, if not, returns its filter data.
765   //
766   // This is for internal use only because we assume that the document_id is
767   // already valid. If you're unsure whether the document_id is valid, use
768   // DoesDocumentExist(document_id) instead, which will perform those additional
769   // checks.
770   //
771   // Returns:
772   //   DocumentFilterData  if the given document isn't expired.
773   //   std::nullopt        if the given document is expired.
774   std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData(
775       DocumentId document_id, int64_t current_time_ms) const;
776 
777   // Updates the score cache entry for document_id, presumably to score_data.
778   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
779       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
780 
781   // Updates the corpus score cache entry for corpus_id, presumably to score_data.
782   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
783       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
784 
785   // Updates the filter cache entry for document_id, presumably to filter_data.
786   libtextclassifier3::Status UpdateFilterCache(
787       DocumentId document_id, const DocumentFilterData& filter_data);
788 
789   // Helper method to clear the derived data of the document with document_id.
790   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
791 
792   // Sets usage scores (usage_scores) for the document given by document_id.
793   libtextclassifier3::Status SetUsageScores(
794       DocumentId document_id, const UsageStore::UsageScores& usage_scores);
795 
796   // Returns a DocumentStorageInfoProto with the fields relating to the size of
797   // Document Store member variables populated.
798   // NOTE(review): INTERNAL was documented for a failure to get file size, but
799   // the return type is not a StatusOr and cannot carry a status. TODO confirm.
800   DocumentStorageInfoProto GetMemberStorageInfo() const;
801 
802   // Returns the storage_info that was passed in, with the number of alive,
803   // deleted and expired documents also set.
804   // NOTE(review): OUT_OF_RANGE was documented for the document_id_mapper
805   // somehow becoming larger than the filter cache, but the return type is not
806   // a StatusOr and cannot carry a status. TODO confirm how that is handled.
807   DocumentStorageInfoProto CalculateDocumentStatusCounts(
808       DocumentStorageInfoProto storage_info) const;
809 
810   // Returns:
811   //   - on success, a RepeatedPtrField of the CorpusInfo entries collected.
812   //   - OUT_OF_RANGE; this should never happen.
813   libtextclassifier3::StatusOr<
814       google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
815   CollectCorpusInfo() const;
816 
817   // Builds a fingerprint for the keys of document_key_mapper_ and
818   // corpus_mapper_. Note that namespace_id_fingerprint_ controls the way that
819   // the fingerprint is built.
820   std::string MakeFingerprint(NamespaceId namespace_id,
821                               std::string_view namespace_,
822                               std::string_view uri_or_schema) const;
823 };
824 
825 }  // namespace lib
826 }  // namespace icing
827 
828 #endif  // ICING_STORE_DOCUMENT_STORE_H_
829