• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_DOCUMENT_STORE_H_
16 #define ICING_STORE_DOCUMENT_STORE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/feature-flags.h"
29 #include "icing/file/file-backed-vector.h"
30 #include "icing/file/filesystem.h"
31 #include "icing/file/memory-mapped-file-backed-proto-log.h"
32 #include "icing/file/portable-file-backed-proto-log.h"
33 #include "icing/proto/debug.pb.h"
34 #include "icing/proto/document.pb.h"
35 #include "icing/proto/document_wrapper.pb.h"
36 #include "icing/proto/internal/scorable_property_set.pb.h"
37 #include "icing/proto/logging.pb.h"
38 #include "icing/proto/optimize.pb.h"
39 #include "icing/proto/persist.pb.h"
40 #include "icing/proto/search.pb.h"
41 #include "icing/proto/storage.pb.h"
42 #include "icing/proto/usage.pb.h"
43 #include "icing/schema/schema-store.h"
44 #include "icing/store/corpus-associated-scoring-data.h"
45 #include "icing/store/corpus-id.h"
46 #include "icing/store/document-associated-score-data.h"
47 #include "icing/store/document-filter-data.h"
48 #include "icing/store/document-id.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/store/namespace-id-fingerprint.h"
51 #include "icing/store/namespace-id.h"
52 #include "icing/store/usage-store.h"
53 #include "icing/tokenization/language-segmenter.h"
54 #include "icing/util/clock.h"
55 #include "icing/util/crc32.h"
56 #include "icing/util/data-loss.h"
57 #include "icing/util/document-validator.h"
58 #include "icing/util/fingerprint-util.h"
59 #include "icing/util/scorable_property_set.h"
60 
61 namespace icing {
62 namespace lib {
63 
64 // Provides storage interfaces for documents.
65 class DocumentStore {
66  public:
67   struct Header {
68     // Previously used magic numbers, please avoid reusing those:
69     // [0x1b99c8b0, 0x3e005b5e]
70     static constexpr int32_t kMagic = 0x8a32cd1f;
71 
72     // Holds the magic as a quick sanity check against file corruption.
73     int32_t magic;
74 
75     // Checksum of the DocumentStore's sub-component's checksums.
76     uint32_t checksum;
77   };
78 
79   struct OptimizeInfo {
80     // The estimated size in bytes of the optimizable docs. We don't track the
81     // size of each document, so we estimate by taking the size of the entire
82     // DocumentStore and dividing that by the total number of documents we have.
83     // So we end up with an average document size.
84     int64_t estimated_optimizable_bytes = 0;
85 
86     // Number of total documents the DocumentStore tracks.
87     int32_t total_docs = 0;
88 
89     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
90     int32_t optimizable_docs = 0;
91   };
92 
93   struct DeleteByGroupResult {
94     // Status representing whether or not the operation succeeded. See the
95     // comments above the function that returns this result to determine what
96     // possible statuses could be returned.
97     libtextclassifier3::Status status;
98 
99     int num_docs_deleted = 0;
100   };
101 
102   struct CreateResult {
103     // A successfully initialized document store.
104     std::unique_ptr<DocumentStore> document_store;
105 
106     // The data status after initializing from a previous state. Data loss can
107     // happen if the file is corrupted or some previously added data was
108     // unpersisted. This may be used to signal that any derived data off of the
109     // document store may need to be regenerated.
110     DataLoss data_loss;
111 
112     // A boolean flag indicating if derived files of the document store have
113     // been regenerated or not. This is usually a signal for callers to detect
114     // if any id assignment has changed (e.g. NamespaceId).
115     bool derived_files_regenerated;
116   };
117 
118   // Not copyable
119   DocumentStore(const DocumentStore&) = delete;
120   DocumentStore& operator=(const DocumentStore&) = delete;
121 
122   // Persists and updates checksum of subcomponents.
123   ~DocumentStore();
124 
125   // Factory method to create, initialize, and return a DocumentStore. The base
126   // directory is used to persist document store files. If document store was
127   // previously initialized with this directory, it will reload the files saved
128   // by the last instance.
129   //
130   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
131   // the derived files and validate each document while recreating them. This
132   // can be used to indicate that the schema (and type ids) may have changed and
133   // those changes might not have been applied to the document store.
134   //
135   // If initialize_stats is present, the fields related to DocumentStore will be
136   // populated.
137   //
138   // Does not take any ownership, and all pointers except initialize_stats must
139   // refer to valid objects that outlive the one constructed.
140   //
141   // TODO(cassiewang): Consider returning a status indicating that derived files
142   // were regenerated. This may be helpful in logs.
143   //
144   // Returns:
145   //   A DocumentStore::CreateResult on success
146   //   FAILED_PRECONDITION on any null pointer input
147   //   INTERNAL_ERROR on IO error
148   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
149       const Filesystem* filesystem, const std::string& base_dir,
150       const Clock* clock, const SchemaStore* schema_store,
151       const FeatureFlags* feature_flags,
152       bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv,
153       bool use_persistent_hash_map, int32_t compression_level,
154       InitializeStatsProto* initialize_stats);
155 
156   // Discards all derived data in the document store.
157   //
158   // Returns:
159   //   OK on success or nothing to discard
160   //   INTERNAL_ERROR on any I/O errors
161   static libtextclassifier3::Status DiscardDerivedFiles(
162       const Filesystem* filesystem, const std::string& base_dir);
163 
164   // Returns the maximum DocumentId that the DocumentStore has assigned. If
165   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
166   // empty, then kInvalidDocumentId is returned. This does not filter out
167   // DocumentIds of deleted or expired documents.
last_added_document_id()168   DocumentId last_added_document_id() const {
169     if (document_id_mapper_->num_elements() == 0) {
170       return kInvalidDocumentId;
171     }
172     return document_id_mapper_->num_elements() - 1;
173   }
174 
175   // Returns the number of documents. The result does not filter out DocumentIds
176   // of deleted or expired documents.
num_documents()177   int num_documents() const { return document_id_mapper_->num_elements(); }
178 
179   // Puts the document into document store.
180   //
181   // If put_document_stats is present, the fields related to DocumentStore will
182   // be populated.
183   //
184   //  Returns:
185   //   - On success, a PutResult with the DocumentId of the newly added document
186   //     and the old DocumentId before replacement. If this is a new document,
187   //     then old DocumentId will be kInvalidDocumentId.
188   //   - RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents
189   //   - FAILED_PRECONDITION if schema hasn't been set yet
190   //   - NOT_FOUND if the schema_type or a property config of the document
191   //     doesn't exist in schema
192   //   - INTERNAL_ERROR on IO error
193   struct PutResult {
194     DocumentId old_document_id = kInvalidDocumentId;
195     DocumentId new_document_id = kInvalidDocumentId;
196 
was_replacementPutResult197     bool was_replacement() const {
198       return old_document_id != kInvalidDocumentId;
199     }
200   };
201   libtextclassifier3::StatusOr<PutResult> Put(
202       const DocumentProto& document, int32_t num_tokens = 0,
203       PutDocumentStatsProto* put_document_stats = nullptr);
204   libtextclassifier3::StatusOr<PutResult> Put(
205       DocumentProto&& document, int32_t num_tokens = 0,
206       PutDocumentStatsProto* put_document_stats = nullptr);
207 
208   // Finds and returns the document identified by the given key (namespace +
209   // uri). If 'clear_internal_fields' is true, document level data that's
210   // generated internally by DocumentStore is cleared.
211   //
212   // Returns:
213   //   The document found on success
214   //   NOT_FOUND if the key doesn't exist or document has been deleted
215   //   INTERNAL_ERROR on IO error
216   libtextclassifier3::StatusOr<DocumentProto> Get(
217       std::string_view name_space, std::string_view uri,
218       bool clear_internal_fields = true) const;
219 
220   // Finds and returns the document identified by the given document id. If
221   // 'clear_internal_fields' is true, document level data that's generated
222   // internally by DocumentStore is cleared.
223   //
224   // Returns:
225   //   The document found on success
226   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
227   //                    maximum value
228   //   NOT_FOUND if the document doesn't exist or has been deleted
229   //   INTERNAL_ERROR on IO error
230   libtextclassifier3::StatusOr<DocumentProto> Get(
231       DocumentId document_id, bool clear_internal_fields = true) const;
232 
233   // Returns the ScorablePropertySet of the document specified by the
234   // DocumentId.
235   //
236   // Returns:
237   //   - ScorablePropertySet on success
238   //   - nullptr when the ScorablePropertySet fails to be created, it could be
239   //     due to that:
240   //     - |document_id| is invalid, or
241   //     - no ScorablePropertySetProto is found for the document in the cache
242   //     - internal IO error
243   std::unique_ptr<ScorablePropertySet> GetScorablePropertySet(
244       DocumentId document_id, int64_t current_time_ms) const;
245 
246   // Returns all namespaces which have at least 1 active document (not deleted
247   // or expired). Order of namespaces is undefined.
248   std::vector<std::string> GetAllNamespaces() const;
249 
250   // Deletes the document identified by the given namespace and uri. The
251   // document proto will be erased immediately.
252   //
253   // NOTE:
254   //    Space is not reclaimed for deleted documents until Optimize() is
255   //    called.
256   //
257   // Returns:
258   //   OK on success
259   //   NOT_FOUND if no document exists with namespace, uri
260   //   INTERNAL_ERROR on IO error
261   libtextclassifier3::Status Delete(std::string_view name_space,
262                                     std::string_view uri,
263                                     int64_t current_time_ms);
264 
265   // Deletes the document identified by the given document_id. The document
266   // proto will be erased immediately.
267   //
268   // NOTE:
269   //    Space is not reclaimed for deleted documents until Optimize() is
270   //    called.
271   //
272   // Returns:
273   //   OK on success
274   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
275   //   INTERNAL_ERROR on IO error
276   //   INVALID_ARGUMENT if document_id is invalid.
277   libtextclassifier3::Status Delete(DocumentId document_id,
278                                     int64_t current_time_ms);
279 
280   // Returns the NamespaceId of the string namespace
281   //
282   // Returns:
283   //   NamespaceId on success
284   //   NOT_FOUND if the namespace doesn't exist
285   //   INTERNAL_ERROR on IO error
286   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
287       std::string_view name_space) const;
288 
289   // Helper method to find a DocumentId that is associated with the given
290   // namespace and uri.
291   //
292   // NOTE: if succeeded, it always returns a valid DocumentId, but this
293   // DocumentId may refer to an invalid document (deleted or expired). Callers
294   // can call GetAliveDocumentFilterData(document_id, current_time_ms) and check
295   // the return value to ensure it refers to an alive Document.
296   //
297   // Returns:
298   //   A DocumentId on success
299   //   NOT_FOUND if the key doesn't exist
300   //   INTERNAL_ERROR on IO error
301   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
302       std::string_view name_space, std::string_view uri) const;
303 
304   // Helper method to find a DocumentId that is associated with the given
305   // NamespaceIdFingerprint.
306   //
307   // NOTE: if succeeded, it always returns a valid DocumentId, but this
308   // DocumentId may refer to an invalid document (deleted or expired). Callers
309   // can call GetAliveDocumentFilterData(document_id, current_time_ms) and check
310   // the return value to ensure it refers to an alive Document.
311   //
312   // Returns:
313   //   A DocumentId on success
314   //   NOT_FOUND if the key doesn't exist
315   //   INTERNAL_ERROR on IO error
316   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
317       const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const;
318 
319   // Returns the CorpusId associated with the given namespace and schema.
320   //
321   // Returns:
322   //   A CorpusId on success
323   //   NOT_FOUND if the key doesn't exist
324   //   INTERNAL_ERROR on IO error
325   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
326       const std::string_view name_space, const std::string_view schema) const;
327 
328   // Returns the ResultGroupingEntryId associated with the given namespace
329   // and schema.
330   //
331   // NOTE: ResultGroupingEntryIds that are generated by calls with different
332   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
333   // are only guaranteed to be unique within their own ResultGroupingType.
334   //
335   // Returns:
336   //   A ResultGroupingEntryId on success
337   //   NOT_FOUND if the key doesn't exist
338   //   INTERNAL_ERROR on IO error
339   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
340       ResultSpecProto::ResultGroupingType result_group_type,
341       const std::string_view name_space, const std::string_view schema) const;
342 
343   // Returns the ResultGrouping Entry Id associated with the given NamespaceId
344   // and SchemaTypeId
345   //
346   // NOTE: ResultGroupingEntryIds that are generated by calls with different
347   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
348   // are only guaranteed to be unique within their own ResultGroupingType.
349   //
350   // Returns:
351   //   A ResultGroupingEntryId on success
352   //   NOT_FOUND if the key doesn't exist
353   //   INTERNAL_ERROR on IO error
354   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
355       ResultSpecProto::ResultGroupingType result_group_type,
356       const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const;
357 
358   // Returns the DocumentAssociatedScoreData of the document specified by the
359   // DocumentId.
360   //
361   // Returns:
362   //   DocumentAssociatedScoreData on success
363   //   NOT_FOUND if the document or the score data is not found
364   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
365   GetDocumentAssociatedScoreData(DocumentId document_id) const;
366 
367   // Returns the CorpusAssociatedScoreData of the corpus specified by the
368   // corpus_id.
369   //
370   // NOTE: This does not check if the corpus exists and will return the
371   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
372   // that corpus have been deleted.
373   //
374   // Returns:
375   //   CorpusAssociatedScoreData on success
376   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
377   //                CorpusIds
378   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
379   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
380 
381   // Gets the document filter data if a document exists and is not expired.
382   // Otherwise, will get a false optional.
383   //
384   // Existence means it hasn't been deleted and it hasn't expired yet.
385   //
386   // Returns:
387   //   True:DocumentFilterData  if the given document exists.
388   //   False                    if the given document doesn't exist.
389   std::optional<DocumentFilterData> GetAliveDocumentFilterData(
390       DocumentId document_id, int64_t current_time_ms) const;
391 
392   // Gets the document filter data if a document has not been deleted. If the
393   // document is expired but not deleted, will still return a valid document
394   // filter data. Otherwise, will get a false optional.
395   //
396   // Returns:
397   //   True:DocumentFilterData  if the given document exists.
398   //   False                    if the given document has been deleted.
399   std::optional<DocumentFilterData> GetNonDeletedDocumentFilterData(
400       DocumentId document_id) const;
401 
402   // Gets the SchemaTypeId of a document.
403   //
404   // Returns:
405   //   SchemaTypeId on success
406   //   kInvalidSchemaTypeId if the document is deleted or expired.
GetSchemaTypeId(DocumentId document_id,int64_t current_time_ms)407   SchemaTypeId GetSchemaTypeId(DocumentId document_id,
408                                int64_t current_time_ms) const {
409     std::optional<DocumentFilterData> document_filter_data_optional =
410         GetAliveDocumentFilterData(document_id, current_time_ms);
411     if (document_filter_data_optional) {
412       return document_filter_data_optional.value().schema_type_id();
413     } else {
414       return kInvalidSchemaTypeId;
415     }
416   }
417 
418   // Gets the usage scores of a document.
419   //
420   // Returns:
421   //   UsageScores on success
422   //   nullopt if there are no usage scores stored for the requested docid.
423   std::optional<UsageStore::UsageScores> GetUsageScores(
424       DocumentId document_id, int64_t current_time_ms) const;
425 
426   // Reports usage. The corresponding usage scores of the specified document in
427   // the report will be updated.
428   //
429   // Returns:
430   //   OK on success
431   //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
432   //   INTERNAL_ERROR on I/O errors.
433   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
434 
435   // Deletes all documents belonging to the given namespace. The documents will
436   // be erased immediately.
437   //
438   // NOTE:
439   //    Space is not reclaimed for deleted documents until Optimize() is
440   //    called.
441   //
442   // Returns:
443   //   OK on success
444   //   NOT_FOUND if namespace doesn't exist
445   //   INTERNAL_ERROR on IO error
446   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
447 
448   // Deletes all documents belonging to the given schema type. The documents
449   // will be erased immediately.
450   //
451   // NOTE:
452   //    Space is not reclaimed for deleted documents until Optimize() is
453   //    called.
454   //
455   // Returns:
456   //   OK on success
457   //   NOT_FOUND if schema_type doesn't exist
458   //   INTERNAL_ERROR on IO error
459   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
460 
461   // Syncs all the data and metadata changes to disk.
462   //
463   // Returns:
464   //   OK on success
465   //   INTERNAL on I/O error
466   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
467 
468   // Calculates the StorageInfo for the Document Store.
469   //
470   // If an IO error occurs while trying to calculate the value for a field, then
471   // that field will be set to -1.
472   DocumentStorageInfoProto GetStorageInfo() const;
473 
474   // Update any derived data off of the SchemaStore with the new SchemaStore.
475   // This may include pointers, SchemaTypeIds, etc.
476   //
477   // NOTE: This function may delete documents. A document may be invalidated by
478   // the new SchemaStore, such as failing validation or having its schema type
479   // deleted from the schema.
480   //
481   // This is best used if the caller is unsure about what's changed in the
482   // SchemaStore, and wants to update all information no matter what. If the
483   // caller does know what has changed, then it's recommended to call
484   // OptimizedUpdateSchemaStore.
485   //
486   // Returns:
487   //   OK on success
488   //   INTERNAL_ERROR on IO error
489   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
490 
491   // Performs the same functionality as UpdateSchemaStore, but this can be more
492   // optimized in terms of less disk reads and less work if we know exactly
493   // what's changed between the old and new SchemaStore.
494   //
495   // Returns:
496   //   OK on success
497   //   INTERNAL_ERROR on IO error
498   libtextclassifier3::Status OptimizedUpdateSchemaStore(
499       const SchemaStore* schema_store,
500       const SchemaStore::SetSchemaResult& set_schema_result);
501 
502   // Re-generates the scorable property cache for documents with the given
503   // schema types.
504   //
505   // Returns:
506   //   OK on success
507   //   INTERNAL_ERROR on IO error
508   libtextclassifier3::Status RegenerateScorablePropertyCache(
509       const std::unordered_set<SchemaTypeId>& schema_type_ids);
510 
511   // Reduces internal file sizes by reclaiming space of deleted documents and
512   // regenerating derived files.
513   //
514   // NOTE: The tasks in this method are too expensive to be executed in
515   // real-time. The caller should decide how frequently and when to call this
516   // method based on device usage.
517   //
518   // Returns:
519   //   OK on success
520   //   INTERNAL_ERROR on IO error
521   libtextclassifier3::Status Optimize();
522 
523   struct OptimizeResult {
524     // A vector that maps old document id to new document id.
525     std::vector<DocumentId> document_id_old_to_new;
526 
527     // A vector that maps old namespace id to new namespace id. Will be empty if
528     // should_rebuild_index is set to true.
529     std::vector<NamespaceId> namespace_id_old_to_new;
530 
531     // A boolean flag that hints the caller (usually IcingSearchEngine) if it
532     // should rebuild index instead of adopting the id changes via the 2 vectors
533     // above. It will be set to true if finding any id inconsistency.
534     bool should_rebuild_index = false;
535 
536     // A set of blob handles that are dead and need to be removed.
537     std::unordered_set<std::string> dead_blob_handles;
538   };
539   // Copy data from current base directory into a new directory. Any outdated or
540   // deleted data won't be copied. During the process, document/namespace ids
541   // will be reassigned so any files / classes that are based on old
542   // document/namespace ids may be outdated.
543   //
544   // stats will be set if non-null.
545   //
546   // NOTE: The tasks in this method are too expensive to be executed in
547   // real-time. The caller should decide how frequently and when to call this
548   // method based on device usage.
549   //
550   // Returns:
551   //   OptimizeResult which contains a vector mapping from old document id to
552   //   new document id and another vector mapping from old namespace id to new
553   //   namespace id, on success
554   //   INVALID_ARGUMENT if new_directory is same as current base directory
555   //   INTERNAL_ERROR on IO error
556   libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto(
557       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
558       std::unordered_set<std::string>&& expired_blob_handles,
559       OptimizeStatsProto* stats = nullptr) const;
560 
561   // Calculates status for a potential Optimize call. Includes how many docs
562   // there are vs how many would be optimized away. Also includes the
563   // estimated size gain, in bytes, if Optimize were called.
564   //
565   // Returns:
566   //   OptimizeInfo on success
567   //   INTERNAL_ERROR on IO error
568   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
569 
570   // Update, replace and persist the header file. Creates the header file if it
571   // doesn't exist.
572   //
573   // Returns:
574   //   Crc32 on success
575   //   INTERNAL on I/O error
576   libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
577 
578   // Calculates and returns the checksum of the document store.
579   //
580   // Returns:
581   //   Crc32 on success
582   //   INTERNAL on I/O error
583   libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
584 
585   // Get debug information for the document store.
586   // verbosity <= 0, simplest debug information
587   // verbosity > 0, also return the total number of documents and tokens in each
588   // (namespace, schema type) pair.
589   //
590   // Returns:
591   //   DocumentDebugInfoProto on success
592   //   INTERNAL_ERROR on IO errors, crc compute error
593   libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
594       int verbosity) const;
595 
596  private:
597   // Use DocumentStore::Create() to instantiate.
598   explicit DocumentStore(const Filesystem* filesystem,
599                          std::string_view base_dir, const Clock* clock,
600                          const SchemaStore* schema_store,
601                          const FeatureFlags* feature_flags,
602                          bool pre_mapping_fbv, bool use_persistent_hash_map,
603                          int32_t compression_level);
604 
605   const Filesystem* const filesystem_;
606   const std::string base_dir_;
607   const Clock& clock_;
608   const FeatureFlags& feature_flags_;  // Does not own.
609 
610   // Handles the ground truth schema and all of the derived data off of the
611   // schema
612   const SchemaStore* schema_store_;
613 
614   // Used to validate incoming documents
615   DocumentValidator document_validator_;
616 
617   // Flag indicating whether to memory-map the maximum possible file size for
618   // the underlying FileBackedVector before growing the actual file size.
619   bool pre_mapping_fbv_;
620 
621   // Flag indicating whether to use a persistent hash map as the key mapper (if
622   // false, then fall back to dynamic trie key mapper). Note: we only use
623   // persistent hash map for uri mapper if it is true.
624   bool use_persistent_hash_map_;
625 
626   const int32_t compression_level_;
627 
628   // A log used to store all documents, it serves as a ground truth of doc
629   // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
630   std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
631 
632   // Key (namespace + uri) to DocumentId mapping
633   std::unique_ptr<
634       KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>
635       document_key_mapper_;
636 
637   // DocumentId to file offset mapping
638   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
639 
640   // A cache of document associated scores. The ground truth of the scores is
641   // DocumentProto stored in document_log_. This cache contains:
642   //   - CorpusId
643   //   - Document score
644   //   - Document creation timestamp in seconds
645   //   - Document length in number of tokens
646   //   - Index of the ScorablePropertySetProto at the scorable_property_cache_
647   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
648 
649   // A cache of document scorable properties. The ground truth of the data is
650   // DocumentProto stored in document_log_.
651   std::unique_ptr<MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>>
652       scorable_property_cache_;
653 
654   // A cache of data, indexed by DocumentId, used to filter documents. Currently
655   // contains:
656   //   - NamespaceId
657   //   - SchemaTypeId
658   //   - Expiration timestamp in seconds
659   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
660 
661   // A cache of corpus associated scores. The ground truth of the scores is
662   // DocumentProto stored in document_log_. This cache contains:
663   //   - Number of documents belonging to the corpus score
664   //   - The sum of the documents' lengths, in number of tokens.
665   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
666       corpus_score_cache_;
667 
668   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
669   // id when the first document belonging to that namespace is added to the
670   // DocumentStore. Namespaces may be removed from the mapper during compaction.
671   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
672 
673   // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
674   // unique id. A corpus is assigned an
675   // id when the first document belonging to that corpus is added to the
676   // DocumentStore. Corpus ids may be removed from the mapper during compaction.
677   std::unique_ptr<
678       KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>>
679       corpus_mapper_;
680 
681   // A storage class that caches all usage scores. Usage scores are not
682   // considered as ground truth. Usage scores are associated with document ids
683   // so they need to be updated when document ids change.
684   std::unique_ptr<UsageStore> usage_store_;
685 
686   // Used internally to indicate whether the class has been initialized. This is
687   // to guard against cases where the object has been created, but Initialize
688   // fails in the constructor. If we have successfully exited the constructor,
689   // then this field can be ignored. Clients of DocumentStore should not need to
690   // worry about this field.
691   bool initialized_ = false;
692 
693   struct InitializeResult {
694     DataLoss data_loss;
695 
696     // A boolean flag indicating if derived files of the document store have
697     // been regenerated or not. This is usually a signal for callers to detect
698     // if any id assignment has changed (e.g. NamespaceId).
699     bool derived_files_regenerated;
700   };
701   libtextclassifier3::StatusOr<InitializeResult> Initialize(
702       bool force_recovery_and_revalidate_documents,
703       InitializeStatsProto* initialize_stats);
704 
705   // Creates sub-components and verifies the integrity of each sub-component.
706   // This assumes that the underlying files already exist, and will return
707   // an error if it doesn't find what it's expecting.
708   //
709   // Returns an error if subcomponents failed to initialize successfully.
710   //   INTERNAL_ERROR on IO error
711   libtextclassifier3::Status InitializeExistingDerivedFiles();
712 
713   // Re-generates all files derived from the ground truth: the document log.
714   //
715   // revalidate_documents=true will also cause each document to be revalidated
716   // against the schema as it is read out of the document log.
717   //
718   // NOTE: if this function fails, the only thing we can do is to retry it until
719   // it succeeds or prevent the initialization of a DocumentStore. The
720   // DocumentStore object wouldn't work reliably if this fails.
721   //
722   // Steps:
723   //   1. Delete all derived files.
724   //   2. Iterate through document log, put data into new key mapper and
725   //   document_id
726   //      mapper.
727   //   3. Create header and store the updated combined checksum
728   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
729 
730   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
731   // file, and re-creates a new instance of the document_key_mapper.
732   //
733   // Returns OK on success, or any IO error encountered.
734   libtextclassifier3::Status ResetDocumentKeyMapper();
735 
736   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
737   // file, and re-creates a new instance of the document_id_mapper.
738   //
739   // Returns OK on success, or any IO error encountered.
740   libtextclassifier3::Status ResetDocumentIdMapper();
741 
742   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
743   // re-creates a new instance of the score_cache.
744   //
745   // Returns OK on success, or any IO error encountered.
746   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
747 
748   // Resets the unique_ptr to the |scorable_property_cache_|, deletes the
749   // underlying file, and re-creates a new instance of the cache.
750   //
751   // Returns OK on success, or any IO error encountered.
752   libtextclassifier3::Status ResetScorablePropertyCache();
753 
754   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
755   // file, and re-creates a new instance of the corpus_score_cache.
756   //
757   // Returns OK on success, or any IO error encountered.
758   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
759 
760   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
761   // re-creates a new instance of the filter_cache.
762   //
763   // Returns OK on success, or any IO error encountered.
764   libtextclassifier3::Status ResetFilterCache();
765 
766   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
767   // and re-creates a new instance of the namespace_mapper.
768   //
769   // Returns OK on success, or any IO error encountered.
770   libtextclassifier3::Status ResetNamespaceMapper();
771 
772   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
773   // and re-creates a new instance of the corpus_mapper.
774   //
775   // Returns OK on success, or any IO error encountered.
776   libtextclassifier3::Status ResetCorpusMapper();
777 
778   // Checks whether the header file already exists. This never creates the
779   // header file if it is missing.
780   bool HeaderExists();
781 
      // Internal helper for putting |document| into the store. The document is
      // taken by rvalue reference, so callers hand it over rather than copy it.
      // NOTE(review): presumably the shared implementation behind the public
      // Put() API - confirm against the implementation.
      //
      // put_document_stats defaults to nullptr. NOTE(review): presumably it is
      // only populated when non-null - confirm.
      //
      // Returns:
      //   A PutResult on success
      //   An error status otherwise
782   libtextclassifier3::StatusOr<PutResult> InternalPut(
783       DocumentProto&& document,
784       PutDocumentStatsProto* put_document_stats = nullptr);
785 
786   // Helper function to do batch deletes. Documents with the given
787   // "namespace_id" and "schema_type_id" will be deleted. Callers that don't
788   // need to specify the namespace or schema type pass in kInvalidNamespaceId
789   // or kInvalidSchemaTypeId, respectively. The document protos and their
790   // derived data will be erased / cleared immediately.
791   //
792   // NOTE: Space is not reclaimed in the derived files until Optimize() is
793   // called.
794   //
795   // Returns:
796   //   Number of documents that were actually updated to be deleted
797   //   INTERNAL_ERROR on IO error
798   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
799                                                 SchemaTypeId schema_type_id);
800 
801   // Returns the CorpusAssociatedScoreData of the corpus specified by
802   // corpus_id.
803   //
804   // If the corpus_id has never been seen before, this returns a
805   // CorpusAssociatedScoreData with properties set to default values.
806   //
807   // NOTE: This does not check whether the corpus exists, and will return the
808   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
809   // that corpus have been deleted.
810   //
811   // Returns:
812   //   CorpusAssociatedScoreData on success
813   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
814   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
815 
816   // Checks if a document has been deleted.
817   //
818   // This is for internal use only, because it assumes that the document_id is
819   // already valid.
820   bool IsDeleted(DocumentId document_id) const;
821 
822   // Checks if a document has expired.
823   //
824   // This is for internal use only, because it assumes that the document_id is
825   // already valid.
826   //
827   // Returns:
828   //   DocumentFilterData  if the given document isn't expired.
829   //   std::nullopt        if the given document is expired.
830   std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData(
831       DocumentId document_id, int64_t current_time_ms) const;
832 
833   // Updates the score cache entry for document_id with score_data.
834   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
835       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
836 
837   // Updates the corpus score cache entry for corpus_id with score_data.
838   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
839       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
840 
841   // Updates the filter cache entry for document_id with filter_data.
842   libtextclassifier3::Status UpdateFilterCache(
843       DocumentId document_id, const DocumentFilterData& filter_data);
844 
845   // Helper method to clear the derived data of the document with document_id.
846   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
847 
848   // Sets usage_scores as the usage scores of the document with document_id.
849   libtextclassifier3::Status SetUsageScores(
850       DocumentId document_id, const UsageStore::UsageScores& usage_scores);
851 
852   // Returns a DocumentStorageInfoProto with the fields relating to the size of
853   // Document Store member variables populated.
854   // NOTE(review): this return type cannot carry an error status; confirm how
855   // a failure to get a file size is reported.
856   DocumentStorageInfoProto GetMemberStorageInfo() const;
857 
858   // Returns the storage_info that was passed in, with the number of alive,
859   // deleted and expired documents also set.
860   //
861   // NOTE(review): the return type cannot carry an OUT_OF_RANGE status; confirm
862   // how a document_id_mapper larger than the filter cache is handled.
863   DocumentStorageInfoProto CalculateDocumentStatusCounts(
864       DocumentStorageInfoProto storage_info) const;
865 
866   // Returns:
867   //   - on success, a RepeatedPtrField of the collected CorpusInfo.
868   //   - OUT_OF_RANGE, which should never happen.
869   libtextclassifier3::StatusOr<
870       google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
871   CollectCorpusInfo() const;
872 
873   // Extracts the ScorablePropertySetProto from the |document| and adds it to
874   // the |scorable_property_cache_|.
875   //
876   // Returns:
877   //     - Index of the newly inserted ScorablePropertySetProto in the
878   //       |scorable_property_cache_|.
879   //     - kInvalidScorablePropertyCacheIndex if the schema contains no
880   //       scorable properties.
881   //     - INVALID_ARGUMENT if |schema_type_id| is invalid, or the converted
882   //       ScorablePropertySetProto exceeds the size limit of 16MiB.
883   //     - INTERNAL_ERROR on IO error.
884   libtextclassifier3::StatusOr<int> UpdateScorablePropertyCache(
885       const DocumentProto& document, SchemaTypeId schema_type_id);
886 };
887 
888 }  // namespace lib
889 }  // namespace icing
890 
891 #endif  // ICING_STORE_DOCUMENT_STORE_H_
892