1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/store/document-store.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_map>
#include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/text_classifier/lib3/utils/hash/farmhash.h"
30 #include "icing/absl_ports/annotate.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/file-backed-proto-log.h"
34 #include "icing/file/file-backed-vector.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/memory-mapped-file.h"
37 #include "icing/file/portable-file-backed-proto-log.h"
38 #include "icing/legacy/core/icing-string-util.h"
39 #include "icing/proto/debug.pb.h"
40 #include "icing/proto/document.pb.h"
41 #include "icing/proto/document_wrapper.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/optimize.pb.h"
44 #include "icing/proto/persist.pb.h"
45 #include "icing/proto/schema.pb.h"
46 #include "icing/proto/storage.pb.h"
47 #include "icing/proto/usage.pb.h"
48 #include "icing/schema/schema-store.h"
49 #include "icing/store/corpus-associated-scoring-data.h"
50 #include "icing/store/corpus-id.h"
51 #include "icing/store/document-associated-score-data.h"
52 #include "icing/store/document-filter-data.h"
53 #include "icing/store/document-id.h"
54 #include "icing/store/document-log-creator.h"
55 #include "icing/store/dynamic-trie-key-mapper.h"
56 #include "icing/store/namespace-id.h"
57 #include "icing/store/usage-store.h"
58 #include "icing/tokenization/language-segmenter.h"
59 #include "icing/util/clock.h"
60 #include "icing/util/crc32.h"
61 #include "icing/util/data-loss.h"
62 #include "icing/util/encode-util.h"
63 #include "icing/util/fingerprint-util.h"
64 #include "icing/util/logging.h"
65 #include "icing/util/status-macros.h"
66 #include "icing/util/tokenized-document.h"
67 
68 namespace icing {
69 namespace lib {
70 
71 namespace {
72 
73 // Used in DocumentId mapper to mark a document as deleted
74 constexpr int64_t kDocDeletedFlag = -1;
75 constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
76 constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
77 constexpr char kScoreCacheFilename[] = "score_cache";
78 constexpr char kCorpusScoreCache[] = "corpus_score_cache";
79 constexpr char kFilterCacheFilename[] = "filter_cache";
80 constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
81 constexpr char kUsageStoreDirectoryName[] = "usage_store";
82 constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";
83 
84 // Determined through manual testing to allow for 1 million uris. 1 million
85 // because we allow up to 1 million DocumentIds.
86 constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024;  // 36 MiB
87 
88 // 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
89 // max of 128 KiB for storage.
90 constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
91 constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
92 
93 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
94   DocumentWrapper document_wrapper;
95   *document_wrapper.mutable_document() = std::move(document);
96   return document_wrapper;
97 }
98 
99 std::string MakeHeaderFilename(const std::string& base_dir) {
100   return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
101 }
102 
103 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
104   return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
105 }
106 
107 std::string MakeScoreCacheFilename(const std::string& base_dir) {
108   return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
109 }
110 
111 std::string MakeCorpusScoreCache(const std::string& base_dir) {
112   return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
113 }
114 
115 std::string MakeFilterCacheFilename(const std::string& base_dir) {
116   return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
117 }
118 
119 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
120   return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
121 }
122 
123 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
124   return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
125 }
126 
127 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
128   return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
129 }
130 
131 // This function will encode a namespace id into a fixed 3 bytes string.
132 std::string EncodeNamespaceId(NamespaceId namespace_id) {
133   // encoding should be 1 to 3 bytes based on the value of namespace_id.
134   std::string encoding = encode_util::EncodeIntToCString(namespace_id);
135   // Pad the encoding to exactly 3 bytes.
136   while (encoding.size() < 3) {
137     // DynamicTrie cannot handle keys containing 0 bytes, so we pad with 0x01
138     // bytes instead, just like encode_util::EncodeIntToCString does.
139     //
140     // This works because DecodeIntToString decodes a byte value of 0x01 as
141     // 0x00. When EncodeIntToCString returns a namespace_id encoding that is
142     // shorter than 3 bytes, it means the id contains unencoded leading 0x00
143     // bytes, so here we're explicitly encoding those bytes as
144     // 0x01.
145     encoding.push_back(1);
146   }
147   return encoding;
148 }
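// Illustration (assuming encode_util::EncodeIntToCString emits the minimal
// 1-3 byte encoding for a NamespaceId): an id whose raw encoding is a single
// byte leaves this function padded with two 0x01 bytes, so every namespace
// contributes exactly 3 bytes to the keys built by MakeFingerprint() below.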
149 
150 int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
151                                        int64_t ttl_ms) {
152   if (ttl_ms == 0) {
153     // Special case where a TTL of 0 indicates the document should never
154     // expire. int64_t max, interpreted as seconds since epoch, represents
155     // some point in the year 292,277,026,596. So we're probably ok to use
156     // this as "never reaching this point".
157     return std::numeric_limits<int64_t>::max();
158   }
159 
160   int64_t expiration_timestamp_ms;
161   if (__builtin_add_overflow(creation_timestamp_ms, ttl_ms,
162                              &expiration_timestamp_ms)) {
163     // Overflow detected. Treat overflow as the same behavior of just int64_t
164     // max
165     return std::numeric_limits<int64_t>::max();
166   }
167 
168   return expiration_timestamp_ms;
169 }
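// Example: creation_timestamp_ms=1000 with ttl_ms=500 gives 1500; ttl_ms=0
// gives INT64_MAX ("never expires"); and a creation time plus ttl that
// overflows int64_t is likewise clamped to INT64_MAX.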
170 
171 InitializeStatsProto::RecoveryCause GetRecoveryCause(
172     const DocumentLogCreator::CreateResult& create_result,
173     bool force_recovery_and_revalidate_documents) {
174   if (force_recovery_and_revalidate_documents) {
175     return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
176   } else if (create_result.log_create_result.has_data_loss()) {
177     return InitializeStatsProto::DATA_LOSS;
178   } else if (create_result.preexisting_file_version !=
179              DocumentLogCreator::kCurrentVersion) {
180     return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
181   }
182   return InitializeStatsProto::NONE;
183 }
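// Note the precedence above: a forced recovery is reported as
// SCHEMA_CHANGES_OUT_OF_SYNC even if the log also suffered data loss or uses
// a legacy format, and data loss takes precedence over a legacy format.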
184 
185 InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
186     DataLoss data_loss) {
187   switch (data_loss) {
188     case DataLoss::PARTIAL:
189       return InitializeStatsProto::PARTIAL_LOSS;
190     case DataLoss::COMPLETE:
191       return InitializeStatsProto::COMPLETE_LOSS;
192     case DataLoss::NONE:
193       return InitializeStatsProto::NO_DATA_LOSS;
194   }
195 }
196 
197 std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
198     const KeyMapper<NamespaceId>* key_mapper) {
199   std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
200 
201   std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
202       key_mapper->GetIterator();
203   while (itr->Advance()) {
204     namespace_ids_to_namespaces.insert(
205         {itr->GetValue(), std::string(itr->GetKey())});
206   }
207   return namespace_ids_to_namespaces;
208 }
209 
210 }  // namespace
211 
212 std::string DocumentStore::MakeFingerprint(
213     NamespaceId namespace_id, std::string_view namespace_,
214     std::string_view uri_or_schema) const {
215   if (!namespace_id_fingerprint_) {
216     // Using a 64-bit fingerprint to represent the key could lead to collisions.
217     // But, even with 200K unique keys, the probability of collision is about
218     // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
219     uint64_t fprint = tc3farmhash::Fingerprint64(
220         absl_ports::StrCat(namespace_, uri_or_schema));
221     return fingerprint_util::GetFingerprintString(fprint);
222   } else {
223     return absl_ports::StrCat(EncodeNamespaceId(namespace_id),
224                               encode_util::EncodeIntToCString(
225                                   tc3farmhash::Fingerprint64(uri_or_schema)));
226   }
227 }
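// In other words, the key is either a farmhash fingerprint of
// "<namespace><uri_or_schema>" (legacy format) or, when namespace-id
// fingerprinting is enabled, the fixed 3-byte encoded namespace id followed
// by an encoded fingerprint of just the uri/schema.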
228 
229 DocumentStore::DocumentStore(const Filesystem* filesystem,
230                              const std::string_view base_dir,
231                              const Clock* clock,
232                              const SchemaStore* schema_store,
233                              bool namespace_id_fingerprint,
234                              int32_t compression_level)
235     : filesystem_(filesystem),
236       base_dir_(base_dir),
237       clock_(*clock),
238       schema_store_(schema_store),
239       document_validator_(schema_store),
240       namespace_id_fingerprint_(namespace_id_fingerprint),
241       compression_level_(compression_level) {}
242 
243 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
244     const DocumentProto& document, int32_t num_tokens,
245     PutDocumentStatsProto* put_document_stats) {
246   return Put(DocumentProto(document), num_tokens, put_document_stats);
247 }
248 
249 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
250     DocumentProto&& document, int32_t num_tokens,
251     PutDocumentStatsProto* put_document_stats) {
252   document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
253   return InternalPut(std::move(document), put_document_stats);
254 }
255 
256 DocumentStore::~DocumentStore() {
257   if (initialized_) {
258     if (!PersistToDisk(PersistType::FULL).ok()) {
259       ICING_LOG(ERROR)
260           << "Error persisting to disk in DocumentStore destructor";
261     }
262   }
263 }
264 
265 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
266     const Filesystem* filesystem, const std::string& base_dir,
267     const Clock* clock, const SchemaStore* schema_store,
268     bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint,
269     int32_t compression_level, InitializeStatsProto* initialize_stats) {
270   ICING_RETURN_ERROR_IF_NULL(filesystem);
271   ICING_RETURN_ERROR_IF_NULL(clock);
272   ICING_RETURN_ERROR_IF_NULL(schema_store);
273 
274   auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
275       filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
276       compression_level));
277   ICING_ASSIGN_OR_RETURN(
278       DataLoss data_loss,
279       document_store->Initialize(force_recovery_and_revalidate_documents,
280                                  initialize_stats));
281 
282   CreateResult create_result;
283   create_result.document_store = std::move(document_store);
284   create_result.data_loss = data_loss;
285   return create_result;
286 }
287 
288 /* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
289     const Filesystem* filesystem, const std::string& base_dir) {
290   // Header
291   const std::string header_filename = MakeHeaderFilename(base_dir);
292   if (!filesystem->DeleteFile(header_filename.c_str())) {
293     return absl_ports::InternalError("Couldn't delete header file");
294   }
295 
296   // Document key mapper
297   ICING_RETURN_IF_ERROR(
298       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
299 
300   // Document id mapper
301   ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
302       *filesystem, MakeDocumentIdMapperFilename(base_dir)));
303 
304   // Document associated score cache
305   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
306       *filesystem, MakeScoreCacheFilename(base_dir)));
307 
308   // Filter cache
309   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
310       *filesystem, MakeFilterCacheFilename(base_dir)));
311 
312   // Namespace mapper
313   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
314       *filesystem, MakeNamespaceMapperFilename(base_dir)));
315 
316   // Corpus mapper
317   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
318       *filesystem, MakeCorpusMapperFilename(base_dir)));
319 
320   // Corpus associated score cache
321   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
322       *filesystem, MakeCorpusScoreCache(base_dir)));
323 
324   return libtextclassifier3::Status::OK;
325 }
326 
327 libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
328     bool force_recovery_and_revalidate_documents,
329     InitializeStatsProto* initialize_stats) {
330   auto create_result_or =
331       DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
332 
333   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
334   // that can support error logging.
335   if (!create_result_or.ok()) {
336     ICING_LOG(ERROR) << create_result_or.status().error_message()
337                      << "\nFailed to initialize DocumentLog.";
338     return create_result_or.status();
339   }
340   DocumentLogCreator::CreateResult create_result =
341       std::move(create_result_or).ValueOrDie();
342 
343   document_log_ = std::move(create_result.log_create_result.proto_log);
344   InitializeStatsProto::RecoveryCause recovery_cause =
345       GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
346 
347   if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
348     ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
349                     << recovery_cause << ", and create result { new_file="
350                     << create_result.new_file << ", preexisting_file_version="
351                     << create_result.preexisting_file_version << ", data_loss="
352                     << create_result.log_create_result.data_loss
353                     << "} and kCurrentVersion="
354                     << DocumentLogCreator::kCurrentVersion;
355     // We can't rely on any existing derived files. Recreate them from scratch.
356     // Currently happens if:
357     //   1) This is a new log and we don't have derived files yet
358     //   2) Client wanted us to force a regeneration.
359     //   3) Log has some data loss, can't rely on existing derived data.
360     std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
361     libtextclassifier3::Status status =
362         RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
363     if (initialize_stats != nullptr &&
364         recovery_cause != InitializeStatsProto::NONE) {
365       // Only consider it a recovery if the client forced a recovery or there
366       // was data loss. Otherwise, this could just be the first time we're
367       // initializing and generating derived files.
368       initialize_stats->set_document_store_recovery_latency_ms(
369           document_recovery_timer->GetElapsedMilliseconds());
370       initialize_stats->set_document_store_recovery_cause(recovery_cause);
371       initialize_stats->set_document_store_data_status(
372           GetDataStatus(create_result.log_create_result.data_loss));
373     }
374     if (!status.ok()) {
375       ICING_LOG(ERROR)
376           << "Failed to regenerate derived files for DocumentStore";
377       return status;
378     }
379   } else {
380     if (!InitializeExistingDerivedFiles().ok()) {
381       ICING_LOG(WARNING)
382           << "Couldn't find derived files or failed to initialize them, "
383              "regenerating derived files for DocumentStore.";
384       std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
385       libtextclassifier3::Status status = RegenerateDerivedFiles(
386           /*force_recovery_and_revalidate_documents=*/false);
387       if (initialize_stats != nullptr) {
388         initialize_stats->set_document_store_recovery_cause(
389             InitializeStatsProto::IO_ERROR);
390         initialize_stats->set_document_store_recovery_latency_ms(
391             document_recovery_timer->GetElapsedMilliseconds());
392       }
393       if (!status.ok()) {
394         ICING_LOG(ERROR)
395             << "Failed to regenerate derived files for DocumentStore";
396         return status;
397       }
398     }
399   }
400 
401   initialized_ = true;
402   if (initialize_stats != nullptr) {
403     initialize_stats->set_num_documents(document_id_mapper_->num_elements());
404   }
405 
406   return create_result.log_create_result.data_loss;
407 }
408 
409 libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
410   if (!HeaderExists()) {
411     // Without a header, we don't know if things are consistent between each
412     // other so the caller should just regenerate everything from ground
413     // truth.
414     return absl_ports::InternalError("DocumentStore header doesn't exist");
415   }
416 
417   DocumentStore::Header header;
418   if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
419                          sizeof(header))) {
420     return absl_ports::InternalError(
421         absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
422   }
423 
424   if (header.magic !=
425       DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) {
426     return absl_ports::InternalError(absl_ports::StrCat(
427         "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
428   }
429 
430   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
431   // that can support error logging.
432   auto document_key_mapper_or = DynamicTrieKeyMapper<
433       DocumentId,
434       fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_,
435                                                             base_dir_,
436                                                             kUriMapperMaxSize);
437   if (!document_key_mapper_or.ok()) {
438     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
439                      << "Failed to initialize KeyMapper";
440     return document_key_mapper_or.status();
441   }
442   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
443 
444   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
445   // that can support error logging.
446   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
447       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
448       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
449   if (!document_id_mapper_or.ok()) {
450     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
451                      << "Failed to initialize DocumentIdMapper";
452     return document_id_mapper_or.status();
453   }
454   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
455 
456   ICING_ASSIGN_OR_RETURN(score_cache_,
457                          FileBackedVector<DocumentAssociatedScoreData>::Create(
458                              *filesystem_, MakeScoreCacheFilename(base_dir_),
459                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
460 
461   ICING_ASSIGN_OR_RETURN(filter_cache_,
462                          FileBackedVector<DocumentFilterData>::Create(
463                              *filesystem_, MakeFilterCacheFilename(base_dir_),
464                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
465 
466   ICING_ASSIGN_OR_RETURN(
467       namespace_mapper_,
468       DynamicTrieKeyMapper<NamespaceId>::Create(
469           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
470           kNamespaceMapperMaxSize));
471 
472   ICING_ASSIGN_OR_RETURN(
473       usage_store_,
474       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
475 
476   auto corpus_mapper_or =
477       DynamicTrieKeyMapper<CorpusId,
478                            fingerprint_util::FingerprintStringFormatter>::
479           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
480                  kCorpusMapperMaxSize);
481   if (!corpus_mapper_or.ok()) {
482     return std::move(corpus_mapper_or).status();
483   }
484   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
485 
486   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
487                          FileBackedVector<CorpusAssociatedScoreData>::Create(
488                              *filesystem_, MakeCorpusScoreCache(base_dir_),
489                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
490 
491   // Ensure the usage store is the correct size.
492   ICING_RETURN_IF_ERROR(
493       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
494 
495   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
496   if (checksum.Get() != header.checksum) {
497     return absl_ports::InternalError(
498         "Combined checksum of DocStore was inconsistent");
499   }
500 
501   return libtextclassifier3::Status::OK;
502 }
503 
504 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
505     bool revalidate_documents) {
506   ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
507   ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
508   ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
509   ICING_RETURN_IF_ERROR(ResetFilterCache());
510   ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
511   ICING_RETURN_IF_ERROR(ResetCorpusMapper());
512   ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
513 
514   // Creates a new UsageStore instance. Note that we don't reset the data in
515   // usage store here because we're not able to regenerate the usage scores.
516   ICING_ASSIGN_OR_RETURN(
517       usage_store_,
518       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
519 
520   // Iterates through document log
521   auto iterator = document_log_->GetIterator();
522   auto iterator_status = iterator.Advance();
523   libtextclassifier3::StatusOr<int64_t> element_size =
524       document_log_->GetElementsFileSize();
525   libtextclassifier3::StatusOr<int64_t> disk_usage =
526       document_log_->GetDiskUsage();
527   if (element_size.ok() && disk_usage.ok()) {
528     ICING_VLOG(1) << "Starting recovery of document store. Document store "
529                      "elements file size:"
530                   << element_size.ValueOrDie()
531                   << ", disk usage=" << disk_usage.ValueOrDie();
532   }
533   while (iterator_status.ok()) {
534     ICING_VLOG(2) << "Attempting to read document at offset="
535                   << iterator.GetOffset();
536     libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
537         document_log_->ReadProto(iterator.GetOffset());
538 
539     if (absl_ports::IsNotFound(document_wrapper_or.status())) {
540       // The erased document still occupies 1 document id.
541       DocumentId new_document_id = document_id_mapper_->num_elements();
542       ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
543       iterator_status = iterator.Advance();
544       continue;
545     } else if (!document_wrapper_or.ok()) {
546       return document_wrapper_or.status();
547     }
548 
549     DocumentWrapper document_wrapper =
550         std::move(document_wrapper_or).ValueOrDie();
551     // Revalidate that this document is still compatible if requested.
552     if (revalidate_documents) {
553       if (!document_validator_.Validate(document_wrapper.document()).ok()) {
554         // Document is no longer valid with the current schema. Mark as
555         // deleted
556         DocumentId new_document_id = document_id_mapper_->num_elements();
557         ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
558         ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
            iterator_status = iterator.Advance();
559         continue;
560       }
561     }
562 
563     ICING_ASSIGN_OR_RETURN(
564         NamespaceId namespace_id,
565         namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
566                                     namespace_mapper_->num_keys()));
567 
568     // Updates key mapper and document_id mapper with the new document
569     DocumentId new_document_id = document_id_mapper_->num_elements();
570     ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
571         MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
572                         document_wrapper.document().uri()),
573         new_document_id));
574     ICING_RETURN_IF_ERROR(
575         document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
576 
577     SchemaTypeId schema_type_id;
578     auto schema_type_id_or =
579         schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
580     if (absl_ports::IsNotFound(schema_type_id_or.status())) {
581       // Didn't find a SchemaTypeId. This means that the DocumentStore and
582       // the SchemaStore are out of sync. But DocumentStore can't do
583       // anything about it so just ignore this for now. This should be
584       // detected/handled by the owner of DocumentStore. Set it to some
585       // arbitrary invalid value for now, it'll get updated to the correct
586       // ID later.
587       schema_type_id = -1;
588     } else if (!schema_type_id_or.ok()) {
589       // Real error. Pass it up
590       return schema_type_id_or.status();
591     } else {
592       // We're guaranteed that SchemaTypeId is valid now
593       schema_type_id = schema_type_id_or.ValueOrDie();
594     }
595 
596     // Update corpus maps
597     std::string corpus =
598         MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
599                         document_wrapper.document().schema());
600     ICING_ASSIGN_OR_RETURN(
601         CorpusId corpusId,
602         corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
603 
604     ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
605                            GetCorpusAssociatedScoreDataToUpdate(corpusId));
606     scoring_data.AddDocument(
607         document_wrapper.document().internal_fields().length_in_tokens());
608 
609     ICING_RETURN_IF_ERROR(
610         UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
611 
612     ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
613         new_document_id,
614         DocumentAssociatedScoreData(
615             corpusId, document_wrapper.document().score(),
616             document_wrapper.document().creation_timestamp_ms(),
617             document_wrapper.document().internal_fields().length_in_tokens())));
618 
619     int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
620         document_wrapper.document().creation_timestamp_ms(),
621         document_wrapper.document().ttl_ms());
622 
623     ICING_RETURN_IF_ERROR(UpdateFilterCache(
624         new_document_id, DocumentFilterData(namespace_id, schema_type_id,
625                                             expiration_timestamp_ms)));
626     iterator_status = iterator.Advance();
627   }
628 
629   if (!absl_ports::IsOutOfRange(iterator_status)) {
630     ICING_LOG(WARNING)
631         << "Failed to iterate through proto log while regenerating "
632            "derived files";
633     return absl_ports::Annotate(iterator_status,
634                                 "Failed to iterate through proto log.");
635   }
636 
637   // Shrink usage_store_ to the correct size.
638   ICING_RETURN_IF_ERROR(
639       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
640 
641   // Write the header
642   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
643   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
644 
645   return libtextclassifier3::Status::OK;
646 }
647 
648 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
649   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
650   document_key_mapper_.reset();
651   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
652   // that can support error logging.
653   libtextclassifier3::Status status =
654       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
655   if (!status.ok()) {
656     ICING_LOG(ERROR) << status.error_message()
657                      << "Failed to delete old key mapper";
658     return status;
659   }
660 
661   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
662   // that can support error logging.
663   auto document_key_mapper_or = DynamicTrieKeyMapper<
664       DocumentId,
665       fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_,
666                                                             base_dir_,
667                                                             kUriMapperMaxSize);
668   if (!document_key_mapper_or.ok()) {
669     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
670                      << "Failed to re-init key mapper";
671     return document_key_mapper_or.status();
672   }
673   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
674   return libtextclassifier3::Status::OK;
675 }
676 
677 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
678   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
679   document_id_mapper_.reset();
680   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
681   // that can support error logging.
682   libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
683       *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
684   if (!status.ok()) {
685     ICING_LOG(ERROR) << status.error_message()
686                      << "Failed to delete old document_id mapper";
687     return status;
688   }
689   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
690   // that can support error logging.
691   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
692       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
693       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
694   if (!document_id_mapper_or.ok()) {
695     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
696                      << "Failed to re-init document_id mapper";
697     return document_id_mapper_or.status();
698   }
699   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
700   return libtextclassifier3::Status::OK;
701 }
702 
703 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
704   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
705   score_cache_.reset();
706   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
707       *filesystem_, MakeScoreCacheFilename(base_dir_)));
708   ICING_ASSIGN_OR_RETURN(score_cache_,
709                          FileBackedVector<DocumentAssociatedScoreData>::Create(
710                              *filesystem_, MakeScoreCacheFilename(base_dir_),
711                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
712   return libtextclassifier3::Status::OK;
713 }
714 
715 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
716   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
717   corpus_score_cache_.reset();
718   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
719       *filesystem_, MakeCorpusScoreCache(base_dir_)));
720   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
721                          FileBackedVector<CorpusAssociatedScoreData>::Create(
722                              *filesystem_, MakeCorpusScoreCache(base_dir_),
723                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
724   return libtextclassifier3::Status::OK;
725 }
726 
727 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
728   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
729   filter_cache_.reset();
730   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
731       *filesystem_, MakeFilterCacheFilename(base_dir_)));
732   ICING_ASSIGN_OR_RETURN(filter_cache_,
733                          FileBackedVector<DocumentFilterData>::Create(
734                              *filesystem_, MakeFilterCacheFilename(base_dir_),
735                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
736   return libtextclassifier3::Status::OK;
737 }
738 
739 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
740   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
741   namespace_mapper_.reset();
742   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
743   // that can support error logging.
744   libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
745       *filesystem_, MakeNamespaceMapperFilename(base_dir_));
746   if (!status.ok()) {
747     ICING_LOG(ERROR) << status.error_message()
748                      << "Failed to delete old namespace_id mapper";
749     return status;
750   }
751   ICING_ASSIGN_OR_RETURN(
752       namespace_mapper_,
753       DynamicTrieKeyMapper<NamespaceId>::Create(
754           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
755           kNamespaceMapperMaxSize));
756   return libtextclassifier3::Status::OK;
757 }
758 
759 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
760   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
761   corpus_mapper_.reset();
762   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
763   // that can support error logging.
764   libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
765       *filesystem_, MakeCorpusMapperFilename(base_dir_));
766   if (!status.ok()) {
767     ICING_LOG(ERROR) << status.error_message()
768                      << "Failed to delete old corpus_id mapper";
769     return status;
770   }
771   auto corpus_mapper_or =
772       DynamicTrieKeyMapper<CorpusId,
773                            fingerprint_util::FingerprintStringFormatter>::
774           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
775                  kCorpusMapperMaxSize);
776   if (!corpus_mapper_or.ok()) {
777     return std::move(corpus_mapper_or).status();
778   }
779   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
780   return libtextclassifier3::Status::OK;
781 }
782 
783 libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
784   Crc32 total_checksum;
785 
786   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
787   // that can support error logging.
788   auto checksum_or = document_log_->ComputeChecksum();
789   if (!checksum_or.ok()) {
790     ICING_LOG(ERROR) << checksum_or.status().error_message()
791                      << "Failed to compute checksum of DocumentLog";
792     return checksum_or.status();
793   }
794   Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
795 
796   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
797   // that can support error logging.
798   checksum_or = document_key_mapper_->ComputeChecksum();
799   if (!checksum_or.ok()) {
800     ICING_LOG(ERROR) << checksum_or.status().error_message()
801                      << "Failed to compute checksum of DocumentKeyMapper";
802     return checksum_or.status();
803   }
804   Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
805 
806   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
807   // that can support error logging.
808   checksum_or = document_id_mapper_->ComputeChecksum();
809   if (!checksum_or.ok()) {
810     ICING_LOG(ERROR) << checksum_or.status().error_message()
811                      << "Failed to compute checksum of DocumentIdMapper";
812     return checksum_or.status();
813   }
814   Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
815 
816   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
817   // that can support error logging.
818   checksum_or = score_cache_->ComputeChecksum();
819   if (!checksum_or.ok()) {
820     ICING_LOG(ERROR) << checksum_or.status().error_message()
821                      << "Failed to compute checksum of score cache";
822     return checksum_or.status();
823   }
824   Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
825 
826   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
827   // that can support error logging.
828   checksum_or = filter_cache_->ComputeChecksum();
829   if (!checksum_or.ok()) {
830     ICING_LOG(ERROR) << checksum_or.status().error_message()
831                      << "Failed to compute checksum of filter cache";
832     return checksum_or.status();
833   }
834   Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
835 
836   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
837   // that can support error logging.
838   checksum_or = namespace_mapper_->ComputeChecksum();
839   if (!checksum_or.ok()) {
840     ICING_LOG(ERROR) << checksum_or.status().error_message()
841                      << "Failed to compute checksum of namespace mapper";
842     return checksum_or.status();
843   }
844   Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
845 
846   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
847   // that can support error logging.
848   checksum_or = corpus_mapper_->ComputeChecksum();
849   if (!checksum_or.ok()) {
850     ICING_LOG(ERROR) << checksum_or.status().error_message()
851                      << "Failed to compute checksum of corpus mapper";
852     return checksum_or.status();
853   }
854   Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
855 
856   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
857   // that can support error logging.
858   checksum_or = corpus_score_cache_->ComputeChecksum();
859   if (!checksum_or.ok()) {
860     ICING_LOG(WARNING) << checksum_or.status().error_message()
861                        << "Failed to compute checksum of score cache";
862     return checksum_or.status();
863   }
864   Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
865 
866   // NOTE: We purposely don't include usage_store checksum here because we can't
867   // regenerate it from ground truth documents. If it gets corrupted, we'll just
868   // clear all usage reports, but we shouldn't throw everything else in the
869   // document store out.
870 
871   total_checksum.Append(std::to_string(document_log_checksum.Get()));
872   total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
873   total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
874   total_checksum.Append(std::to_string(score_cache_checksum.Get()));
875   total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
876   total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
877   total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
878   total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
879 
880   return total_checksum;
881 }
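// The combined checksum above is built by appending the decimal string form
// of each component checksum to a single Crc32 in a fixed order, so changing
// any component (or the order in which they are appended) changes the total.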
882 
883 bool DocumentStore::HeaderExists() {
884   if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
885     return false;
886   }
887 
888   int64_t file_size =
889       filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
890 
891   // If it's been truncated to size 0 before, we consider it to be a new file
892   return file_size != 0 && file_size != Filesystem::kBadFileSize;
893 }
894 
895 libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
896   // Write the header
897   DocumentStore::Header header;
898   header.magic =
899       DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_);
900   header.checksum = checksum.Get();
901 
902   // This should overwrite the header.
903   ScopedFd sfd(
904       filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
905   if (!sfd.is_valid() ||
906       !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
907       !filesystem_->DataSync(sfd.get())) {
908     return absl_ports::InternalError(absl_ports::StrCat(
909         "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
910   }
911   return libtextclassifier3::Status::OK;
912 }
913 
914 libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
915     DocumentProto&& document, PutDocumentStatsProto* put_document_stats) {
916   std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
917   ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
918 
919   if (put_document_stats != nullptr) {
920     put_document_stats->set_document_size(document.ByteSizeLong());
921   }
922 
923   // Copy fields needed before they are moved
924   std::string name_space = document.namespace_();
925   std::string uri = document.uri();
926   std::string schema = document.schema();
927   int document_score = document.score();
928   int32_t length_in_tokens = document.internal_fields().length_in_tokens();
929   int64_t creation_timestamp_ms = document.creation_timestamp_ms();
930 
931   // Sets the creation timestamp if caller hasn't specified.
932   if (document.creation_timestamp_ms() == 0) {
933     creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
934     document.set_creation_timestamp_ms(creation_timestamp_ms);
935   }
936 
937   int64_t expiration_timestamp_ms =
938       CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());
939 
940   // Update ground truth first
941   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
942   // that can support error logging.
943   auto offset_or =
944       document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
945   if (!offset_or.ok()) {
946     ICING_LOG(ERROR) << offset_or.status().error_message()
947                      << "Failed to write document";
948     return offset_or.status();
949   }
950   int64_t file_offset = std::move(offset_or).ValueOrDie();
951 
952   // Get existing document id
953   auto old_document_id_or = GetDocumentId(name_space, uri);
954   if (!old_document_id_or.ok() &&
955       !absl_ports::IsNotFound(old_document_id_or.status())) {
956     return absl_ports::InternalError("Failed to read from key mapper");
957   }
958 
959   // Creates a new document id, updates key mapper and document_id mapper
960   DocumentId new_document_id = document_id_mapper_->num_elements();
961   if (!IsDocumentIdValid(new_document_id)) {
962     return absl_ports::ResourceExhaustedError(
963         "Exceeded maximum number of documents. Try calling Optimize to reclaim "
964         "some space.");
965   }
966 
967   // Update namespace maps
968   ICING_ASSIGN_OR_RETURN(
969       NamespaceId namespace_id,
970       namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
971 
972   // Updates key mapper and document_id mapper
973   ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
974       MakeFingerprint(namespace_id, name_space, uri), new_document_id));
975   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
976 
977   // Update corpus maps
978   ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
979                          corpus_mapper_->GetOrPut(
980                              MakeFingerprint(namespace_id, name_space, schema),
981                              corpus_mapper_->num_keys()));
982 
983   ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
984                          GetCorpusAssociatedScoreDataToUpdate(corpusId));
985   scoring_data.AddDocument(length_in_tokens);
986 
987   ICING_RETURN_IF_ERROR(
988       UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
989 
990   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
991       new_document_id,
992       DocumentAssociatedScoreData(corpusId, document_score,
993                                   creation_timestamp_ms, length_in_tokens)));
994 
995   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
996                          schema_store_->GetSchemaTypeId(schema));
997 
998   ICING_RETURN_IF_ERROR(UpdateFilterCache(
999       new_document_id, DocumentFilterData(namespace_id, schema_type_id,
1000                                           expiration_timestamp_ms)));
1001 
1002   if (old_document_id_or.ok()) {
1003     // The old document exists, copy over the usage scores and delete the old
1004     // document.
1005     DocumentId old_document_id = old_document_id_or.ValueOrDie();
1006 
1007     ICING_RETURN_IF_ERROR(
1008         usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
1009                                        /*to_document_id=*/new_document_id));
1010 
1011     // Delete the old document. It's fine if it's not found since it might have
1012     // been deleted previously.
1013     auto delete_status =
1014         Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
1015     if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1016       // Real error, pass it up.
1017       return delete_status;
1018     }
1019   }
1020 
1021   if (put_document_stats != nullptr) {
1022     put_document_stats->set_document_store_latency_ms(
1023         put_timer->GetElapsedMilliseconds());
1024   }
1025 
1026   return new_document_id;
1027 }
1028 
1029 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1030     const std::string_view name_space, const std::string_view uri,
1031     bool clear_internal_fields) const {
1032   // TODO(b/147231617): Make a better way to replace the error message in an
1033   // existing Status.
1034   auto document_id_or = GetDocumentId(name_space, uri);
1035   if (absl_ports::IsNotFound(document_id_or.status())) {
1036     ICING_VLOG(1) << document_id_or.status().error_message();
1037     return libtextclassifier3::Status(
1038         document_id_or.status().CanonicalCode(),
1039         IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
1040                                       name_space.data(), uri.data()));
1041   }
1042   DocumentId document_id = document_id_or.ValueOrDie();
1043 
1044   // TODO(b/147231617): Make a better way to replace the error message in an
1045   // existing Status.
1046   auto status_or = Get(document_id, clear_internal_fields);
1047   if (absl_ports::IsNotFound(status_or.status())) {
1048     ICING_LOG(ERROR) << status_or.status().error_message();
1049     return libtextclassifier3::Status(
1050         status_or.status().CanonicalCode(),
1051         IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
1052                                       name_space.data(), uri.data()));
1053   }
1054   return status_or;
1055 }
1056 
1057 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1058     DocumentId document_id, bool clear_internal_fields) const {
1059   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1060   auto document_filter_data_optional_ =
1061       GetAliveDocumentFilterData(document_id, current_time_ms);
1062   if (!document_filter_data_optional_) {
1063     // The document doesn't exist. Let's check if the document id is invalid, we
1064     // will return InvalidArgumentError. Otherwise we should return NOT_FOUND
1065     // error.
1066     if (!IsDocumentIdValid(document_id)) {
1067       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1068           "Document id '%d' invalid.", document_id));
1069     }
1070     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1071         "Document id '%d' doesn't exist", document_id));
1072   }
1073 
1074   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1075   if (!document_log_offset_or.ok()) {
1076     // Since we've just checked that our document_id is valid a few lines
1077     // above, there's no reason this should fail and an error should never
1078     // happen.
1079     return absl_ports::InternalError("Failed to find document offset.");
1080   }
1081   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1082 
1083   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1084   // that can support error logging.
1085   auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
1086   if (!document_wrapper_or.ok()) {
1087     ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
1088                      << "Failed to read from document log";
1089     return document_wrapper_or.status();
1090   }
1091   DocumentWrapper document_wrapper =
1092       std::move(document_wrapper_or).ValueOrDie();
1093   if (clear_internal_fields) {
1094     document_wrapper.mutable_document()->clear_internal_fields();
1095   }
1096 
1097   return std::move(*document_wrapper.mutable_document());
1098 }
1099 
1100 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1101     const std::string_view name_space, const std::string_view uri) const {
1102   auto namespace_id_or = namespace_mapper_->Get(name_space);
1103   libtextclassifier3::Status status = namespace_id_or.status();
1104   if (status.ok()) {
1105     NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1106     auto document_id_or = document_key_mapper_->Get(
1107         MakeFingerprint(namespace_id, name_space, uri));
1108     status = document_id_or.status();
1109     if (status.ok()) {
1110       // Guaranteed to have a DocumentId now
1111       return document_id_or.ValueOrDie();
1112     }
1113   }
1114   return absl_ports::Annotate(
1115       status, absl_ports::StrCat(
1116                   "Failed to find DocumentId by key: ", name_space, ", ", uri));
1117 }
1118 
1119 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
1120   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1121       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1122 
1123   std::unordered_set<NamespaceId> existing_namespace_ids;
1124   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1125   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1126        ++document_id) {
1127     // filter_cache_->Get can only fail if document_id is < 0
1128     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1129     auto status_or_data = filter_cache_->Get(document_id);
1130     if (!status_or_data.ok()) {
1131       ICING_LOG(ERROR)
1132           << "Error while iterating over filter cache in GetAllNamespaces";
1133       return std::vector<std::string>();
1134     }
1135     const DocumentFilterData* data = status_or_data.ValueOrDie();
1136 
1137     if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
1138       existing_namespace_ids.insert(data->namespace_id());
1139     }
1140   }
1141 
1142   std::vector<std::string> existing_namespaces;
1143   for (auto itr = existing_namespace_ids.begin();
1144        itr != existing_namespace_ids.end(); ++itr) {
1145     existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
1146   }
1147   return existing_namespaces;
1148 }
1149 
1150 std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
1151     DocumentId document_id, int64_t current_time_ms) const {
1152   if (IsDeleted(document_id)) {
1153     return std::nullopt;
1154   }
1155   return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
1156 }
1157 
1158 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1159   auto file_offset_or = document_id_mapper_->Get(document_id);
1160   if (!file_offset_or.ok()) {
1161     // This would only happen if document_id is out of range of the
1162     // document_id_mapper, meaning we got some invalid document_id. Callers
1163     // should already have checked that their document_id is valid or used
1164     // DoesDocumentExist(WithStatus). Regardless, return true since the
1165     // document doesn't exist.
1166     return true;
1167   }
1168   int64_t file_offset = *file_offset_or.ValueOrDie();
1169   return file_offset == kDocDeletedFlag;
1170 }
1171 
1172 // Returns DocumentFilterData if the document is not expired. Otherwise,
1173 // std::nullopt.
1174 std::optional<DocumentFilterData>
1175 DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
1176                                                int64_t current_time_ms) const {
1177   auto filter_data_or = filter_cache_->GetCopy(document_id);
1178   if (!filter_data_or.ok()) {
1179     // This would only happen if document_id is out of range of the
1180     // filter_cache, meaning we got some invalid document_id. Callers should
1181     // already have checked that their document_id is valid or used
1182     // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since
1183     // the document doesn't exist.
1184     return std::nullopt;
1185   }
1186   DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
1187 
1188   // Check if it's past the expiration time
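       // Note that the check below is inclusive: a document whose expiration
       // timestamp equals current_time_ms is already considered expired.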
1189   if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
1190     return std::nullopt;
1191   }
1192   return document_filter_data;
1193 }
1194 
1195 libtextclassifier3::Status DocumentStore::Delete(
1196     const std::string_view name_space, const std::string_view uri,
1197     int64_t current_time_ms) {
1198   // Try to get the DocumentId first
1199   auto document_id_or = GetDocumentId(name_space, uri);
1200   if (!document_id_or.ok()) {
1201     return absl_ports::Annotate(
1202         document_id_or.status(),
1203         absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1204                            ", uri: ", uri));
1205   }
1206   return Delete(document_id_or.ValueOrDie(), current_time_ms);
1207 }
1208 
1209 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1210                                                  int64_t current_time_ms) {
1211   auto document_filter_data_optional =
1212       GetAliveDocumentFilterData(document_id, current_time_ms);
1213   if (!document_filter_data_optional) {
1214     // The document doesn't exist. We should return InvalidArgumentError if the
1215     // document id is invalid. Otherwise we should return NOT_FOUND error.
1216     if (!IsDocumentIdValid(document_id)) {
1217       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1218           "Document id '%d' invalid.", document_id));
1219     }
1220     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1221         "Document id '%d' doesn't exist", document_id));
1222   }
1223 
1224   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1225   if (!document_log_offset_or.ok()) {
1226     return absl_ports::InternalError("Failed to find document offset.");
1227   }
1228   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1229 
1230   // Erases document proto.
1231   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1232   return ClearDerivedData(document_id);
1233 }
1234 
1235 libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
1236     std::string_view name_space) const {
1237   return namespace_mapper_->Get(name_space);
1238 }
1239 
1240 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1241     const std::string_view name_space, const std::string_view schema) const {
1242   ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
1243                          namespace_mapper_->Get(name_space));
1244   return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
1245 }
1246 
1247 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1248     ResultSpecProto::ResultGroupingType result_group_type,
1249     const std::string_view name_space, const std::string_view schema) const {
1250   auto namespace_id = GetNamespaceId(name_space);
1251   auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
1252   switch (result_group_type) {
1253     case ResultSpecProto::NONE:
1254       return absl_ports::InvalidArgumentError(
1255           "Cannot group by ResultSpecProto::NONE");
1256     case ResultSpecProto::SCHEMA_TYPE:
1257       if (schema_type_id.ok()) {
1258         return schema_type_id.ValueOrDie();
1259       }
1260       break;
1261     case ResultSpecProto::NAMESPACE:
1262       if (namespace_id.ok()) {
1263         return namespace_id.ValueOrDie();
1264       }
1265       break;
1266     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1267       if (namespace_id.ok() && schema_type_id.ok()) {
1268         // TODO(b/258715421): Temporary workaround to get a
1269         //                    ResultGroupingEntryId given the Namespace string
1270         //                    and Schema string.
1271         return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
1272       }
1273       break;
1274   }
1275   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1276 }
1277 
1278 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1279     ResultSpecProto::ResultGroupingType result_group_type,
1280     const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
1281   switch (result_group_type) {
1282     case ResultSpecProto::NONE:
1283       return absl_ports::InvalidArgumentError(
1284           "Cannot group by ResultSpecProto::NONE");
1285     case ResultSpecProto::SCHEMA_TYPE:
1286       return schema_type_id;
1287     case ResultSpecProto::NAMESPACE:
1288       return namespace_id;
1289     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1290       // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
1291       //                    given the Namespace Id and SchemaType Id.
1292       return namespace_id << 16 | schema_type_id;
1293   }
1294   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1295 }
1296 
1297 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
1298 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1299   auto score_data_or = score_cache_->GetCopy(document_id);
1300   if (!score_data_or.ok()) {
1301     ICING_LOG(ERROR) << "Failed to access DocumentId " << document_id
1302                      << " from score_cache_";
1303     return absl_ports::NotFoundError(
1304         std::move(score_data_or).status().error_message());
1305   }
1306 
1307   DocumentAssociatedScoreData document_associated_score_data =
1308       std::move(score_data_or).ValueOrDie();
1309   return document_associated_score_data;
1310 }
1311 
1312 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1313 DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
1314   auto score_data_or = corpus_score_cache_->GetCopy(corpus_id);
1315   if (!score_data_or.ok()) {
1316     return score_data_or.status();
1317   }
1318 
1319   CorpusAssociatedScoreData corpus_associated_score_data =
1320       std::move(score_data_or).ValueOrDie();
1321   return corpus_associated_score_data;
1322 }
1323 
1324 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1325 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1326   auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1327   if (corpus_scoring_data_or.ok()) {
1328     return std::move(corpus_scoring_data_or).ValueOrDie();
1329   }
1330   CorpusAssociatedScoreData scoring_data;
1331   // OUT_OF_RANGE is the StatusCode returned when a corpus id has not yet been
1332   // added to corpus_score_cache_, i.e. this is the first time it is seen.
1333   if (corpus_scoring_data_or.status().CanonicalCode() ==
1334       libtextclassifier3::StatusCode::OUT_OF_RANGE) {
1335     return scoring_data;
1336   }
1337   return corpus_scoring_data_or.status();
1338 }
1339 
1340 // TODO(b/273826815): Decide on and adopt a consistent pattern for handling
1341 // NOT_FOUND 'errors' returned by our internal classes.
1342 std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
1343     DocumentId document_id, int64_t current_time_ms) const {
1344   std::optional<DocumentFilterData> opt =
1345       GetAliveDocumentFilterData(document_id, current_time_ms);
1346   if (!opt) {
1347     return std::nullopt;
1348   }
1349   if (document_id >= usage_store_->num_elements()) {
1350     return std::nullopt;
1351   }
1352   auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1353   if (!usage_scores_or.ok()) {
1354     ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
1355                      << usage_scores_or.status().error_message();
1356     return std::nullopt;
1357   }
1358   return std::move(usage_scores_or).ValueOrDie();
1359 }
1360 
1361 libtextclassifier3::Status DocumentStore::ReportUsage(
1362     const UsageReport& usage_report) {
1363   ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1364                          GetDocumentId(usage_report.document_namespace(),
1365                                        usage_report.document_uri()));
1366   // We can use the internal existence check here because we got our
1367   // document_id from our own internal data structures; GetDocumentId would
1368   // already have returned an error if the namespace and/or uri were incorrect.
1369   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1370   if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1371     // Document was probably deleted or expired.
1372     return absl_ports::NotFoundError(absl_ports::StrCat(
1373         "Couldn't report usage on a nonexistent document: (namespace: '",
1374         usage_report.document_namespace(), "', uri: '",
1375         usage_report.document_uri(), "')"));
1376   }
1377 
1378   return usage_store_->AddUsageReport(usage_report, document_id);
1379 }
1380 
1381 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1382     std::string_view name_space) {
1383   DeleteByGroupResult result;
1384   auto namespace_id_or = namespace_mapper_->Get(name_space);
1385   if (!namespace_id_or.ok()) {
1386     result.status = absl_ports::Annotate(
1387         namespace_id_or.status(),
1388         absl_ports::StrCat("Failed to find namespace: ", name_space));
1389     return result;
1390   }
1391   NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1392   auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1393   if (!num_deleted_or.ok()) {
1394     result.status = std::move(num_deleted_or).status();
1395     return result;
1396   }
1397 
1398   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1399   if (result.num_docs_deleted <= 0) {
1400     // Treat the case where no existing documents have this namespace the same
1401     // as the namespace not existing at all.
1402     result.status = absl_ports::NotFoundError(
1403         absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1404     return result;
1405   }
1406 
1407   return result;
1408 }
1409 
1410 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1411     std::string_view schema_type) {
1412   DeleteByGroupResult result;
1413   auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1414   if (!schema_type_id_or.ok()) {
1415     result.status = absl_ports::Annotate(
1416         schema_type_id_or.status(),
1417         absl_ports::StrCat("Failed to find schema type. schema_type: ",
1418                            schema_type));
1419     return result;
1420   }
1421   SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1422   auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1423   if (!num_deleted_or.ok()) {
1424     result.status = std::move(num_deleted_or).status();
1425     return result;
1426   }
1427 
1428   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1429   if (result.num_docs_deleted <= 0) {
1430     result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1431         "No documents found with schema type '", schema_type, "'"));
1432     return result;
1433   }
1434 
1435   return result;
1436 }
1437 
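     // BatchDelete is shared by DeleteByNamespace and DeleteBySchemaType: passing
     // kInvalidNamespaceId (or kInvalidSchemaTypeId) skips filtering on that
     // dimension, so only the other id is matched.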
1438 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1439     NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1440   // Counts the existing documents that match the given namespace and/or
1441   // schema type and that we mark as deleted.
1442   int num_updated_documents = 0;
1443 
1444   // Traverse FilterCache and delete all docs that match namespace_id and
1445   // schema_type_id.
1446   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1447   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1448        ++document_id) {
1449     // filter_cache_->Get can only fail if document_id is < 0
1450     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1451     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1452                            filter_cache_->Get(document_id));
1453 
1454     // Check namespace only when the input namespace id is valid.
1455     if (namespace_id != kInvalidNamespaceId &&
1456         (data->namespace_id() == kInvalidNamespaceId ||
1457          data->namespace_id() != namespace_id)) {
1458       // The document has already been hard-deleted or isn't from the desired
1459       // namespace.
1460       continue;
1461     }
1462 
1463     // Check schema type only when the input schema type id is valid.
1464     if (schema_type_id != kInvalidSchemaTypeId &&
1465         (data->schema_type_id() == kInvalidSchemaTypeId ||
1466          data->schema_type_id() != schema_type_id)) {
1467       // The document has already been hard-deleted or doesn't have the
1468       // desired schema type.
1469       continue;
1470     }
1471 
1472     // The document has the desired namespace and schema type; it either
1473     // exists or has expired.
1474     libtextclassifier3::Status delete_status =
1475         Delete(document_id, current_time_ms);
1476     if (absl_ports::IsNotFound(delete_status)) {
1477       continue;
1478     } else if (!delete_status.ok()) {
1479       // Real error, pass up.
1480       return delete_status;
1481     }
1482     ++num_updated_documents;
1483   }
1484 
1485   return num_updated_documents;
1486 }
1487 
1488 libtextclassifier3::Status DocumentStore::PersistToDisk(
1489     PersistType::Code persist_type) {
1490   if (persist_type == PersistType::LITE) {
1491     // Only persist the document log.
1492     return document_log_->PersistToDisk();
1493   }
1494   ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1495   ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1496   ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1497   ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1498   ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1499   ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1500   ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1501   ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1502   ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1503 
1504   // Update the combined checksum and write to header file.
1505   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1506   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
1507 
1508   return libtextclassifier3::Status::OK;
1509 }
1510 
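     // Helper for GetMemberStorageInfo below: returns the contained value, or
     // default_value when the underlying disk-usage query failed (the call sites
     // pass -1 to signal "unknown").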
1511 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1512                           int64_t default_value) {
1513   return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1514 }
1515 
1516 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1517   DocumentStorageInfoProto storage_info;
1518   storage_info.set_document_log_size(
1519       GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1520   storage_info.set_key_mapper_size(
1521       GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1522   storage_info.set_document_id_mapper_size(
1523       GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1524   storage_info.set_score_cache_size(
1525       GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1526   storage_info.set_filter_cache_size(
1527       GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1528   storage_info.set_namespace_id_mapper_size(
1529       GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1530   storage_info.set_corpus_mapper_size(
1531       GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1532   storage_info.set_corpus_score_cache_size(
1533       GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1534   return storage_info;
1535 }
1536 
1537 DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
1538     DocumentStorageInfoProto storage_info) const {
1539   int total_num_alive = 0;
1540   int total_num_expired = 0;
1541   int total_num_deleted = 0;
1542   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1543       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1544   std::unordered_map<std::string, NamespaceStorageInfoProto>
1545       namespace_to_storage_info;
1546 
1547   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1548   for (DocumentId document_id = 0;
1549        document_id < document_id_mapper_->num_elements(); ++document_id) {
1550     // Check if it's deleted first.
1551     if (IsDeleted(document_id)) {
1552       // We don't have the namespace id of hard deleted documents anymore, so
1553       // we can't add to our namespace storage info.
1554       ++total_num_deleted;
1555       continue;
1556     }
1557 
1558     // At this point, the document is either alive or expired, so we can get
1559     // namespace info for it.
1560     auto filter_data_or = filter_cache_->Get(document_id);
1561     if (!filter_data_or.ok()) {
1562       ICING_VLOG(1) << "Error trying to get filter data for document store "
1563                        "storage info counts.";
1564       continue;
1565     }
1566     const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1567     auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
1568     if (itr == namespace_id_to_namespace.end()) {
1569       ICING_VLOG(1) << "Error trying to find namespace for document store "
1570                        "storage info counts.";
1571       continue;
1572     }
1573     const std::string& name_space = itr->second;
1574 
1575     // Always set the namespace; if the NamespaceStorageInfoProto didn't exist
1576     // before, we'll get back a default instance of it.
1577     NamespaceStorageInfoProto& namespace_storage_info =
1578         namespace_to_storage_info[name_space];
1579     namespace_storage_info.set_namespace_(name_space);
1580 
1581     // Get usage scores
1582     auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1583     if (!usage_scores_or.ok()) {
1584       ICING_VLOG(1) << "Error trying to get usage scores for document store "
1585                        "storage info counts.";
1586       continue;
1587     }
1588     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
1589 
1590     // Update our stats
1591     if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
1592       ++total_num_expired;
1593       namespace_storage_info.set_num_expired_documents(
1594           namespace_storage_info.num_expired_documents() + 1);
1595       if (usage_scores.usage_type1_count > 0) {
1596         namespace_storage_info.set_num_expired_documents_usage_type1(
1597             namespace_storage_info.num_expired_documents_usage_type1() + 1);
1598       }
1599       if (usage_scores.usage_type2_count > 0) {
1600         namespace_storage_info.set_num_expired_documents_usage_type2(
1601             namespace_storage_info.num_expired_documents_usage_type2() + 1);
1602       }
1603       if (usage_scores.usage_type3_count > 0) {
1604         namespace_storage_info.set_num_expired_documents_usage_type3(
1605             namespace_storage_info.num_expired_documents_usage_type3() + 1);
1606       }
1607     } else {
1608       ++total_num_alive;
1609       namespace_storage_info.set_num_alive_documents(
1610           namespace_storage_info.num_alive_documents() + 1);
1611       if (usage_scores.usage_type1_count > 0) {
1612         namespace_storage_info.set_num_alive_documents_usage_type1(
1613             namespace_storage_info.num_alive_documents_usage_type1() + 1);
1614       }
1615       if (usage_scores.usage_type2_count > 0) {
1616         namespace_storage_info.set_num_alive_documents_usage_type2(
1617             namespace_storage_info.num_alive_documents_usage_type2() + 1);
1618       }
1619       if (usage_scores.usage_type3_count > 0) {
1620         namespace_storage_info.set_num_alive_documents_usage_type3(
1621             namespace_storage_info.num_alive_documents_usage_type3() + 1);
1622       }
1623     }
1624   }
1625 
1626   for (auto& itr : namespace_to_storage_info) {
1627     storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
1628   }
1629   storage_info.set_num_alive_documents(total_num_alive);
1630   storage_info.set_num_deleted_documents(total_num_deleted);
1631   storage_info.set_num_expired_documents(total_num_expired);
1632   return storage_info;
1633 }
1634 
1635 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1636   DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1637   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1638   if (directory_size != Filesystem::kBadFileSize) {
1639     storage_info.set_document_store_size(directory_size);
1640   } else {
1641     storage_info.set_document_store_size(-1);
1642   }
1643   storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1644   return CalculateDocumentStatusCounts(std::move(storage_info));
1645 }
1646 
1647 libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
1648     const SchemaStore* schema_store) {
1649   // Update all references to the SchemaStore
1650   schema_store_ = schema_store;
1651   document_validator_.UpdateSchemaStore(schema_store);
1652 
1653   int size = document_id_mapper_->num_elements();
1654   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1655   for (DocumentId document_id = 0; document_id < size; document_id++) {
1656     auto document_or = Get(document_id);
1657     if (absl_ports::IsNotFound(document_or.status())) {
1658       // Skip nonexistent documents
1659       continue;
1660     } else if (!document_or.ok()) {
1661       // Real error, pass up
1662       return absl_ports::Annotate(
1663           document_or.status(),
1664           IcingStringUtil::StringPrintf(
1665               "Failed to retrieve Document for DocumentId %d", document_id));
1666     }
1667 
1668     // Guaranteed to have a document now.
1669     DocumentProto document = document_or.ValueOrDie();
1670 
1671     // Revalidate that this document is still compatible
1672     if (document_validator_.Validate(document).ok()) {
1673       // Update the SchemaTypeId for this entry
1674       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1675                              schema_store_->GetSchemaTypeId(document.schema()));
1676       ICING_ASSIGN_OR_RETURN(
1677           typename FileBackedVector<DocumentFilterData>::MutableView
1678               doc_filter_data_view,
1679           filter_cache_->GetMutable(document_id));
1680       doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
1681     } else {
1682       // Document is no longer valid with the new SchemaStore. Mark as
1683       // deleted
1684       auto delete_status =
1685           Delete(document.namespace_(), document.uri(), current_time_ms);
1686       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1687         // Real error, pass up
1688         return delete_status;
1689       }
1690     }
1691   }
1692 
1693   return libtextclassifier3::Status::OK;
1694 }
1695 
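     // Unlike UpdateSchemaStore, which revalidates every document, this version
     // only revisits documents whose schema types were deleted, assigned new
     // SchemaTypeIds, or marked incompatible by the given SetSchemaResult.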
1696 libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
1697     const SchemaStore* schema_store,
1698     const SchemaStore::SetSchemaResult& set_schema_result) {
1699   if (!set_schema_result.success) {
1700     // No new schema was set, no work to be done
1701     return libtextclassifier3::Status::OK;
1702   }
1703 
1704   // Update all references to the SchemaStore
1705   schema_store_ = schema_store;
1706   document_validator_.UpdateSchemaStore(schema_store);
1707 
1708   int size = document_id_mapper_->num_elements();
1709   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1710   for (DocumentId document_id = 0; document_id < size; document_id++) {
1711     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1712       // Skip nonexistent documents
1713       continue;
1714     }
1715 
1716     // Guaranteed that the document exists now.
1717     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
1718                            filter_cache_->Get(document_id));
1719 
1720     bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
1721                                filter_data->schema_type_id()) != 0;
1722 
1723     // Check if we need to update the FilterCache entry for this document. It
1724     // may have been assigned a different SchemaTypeId in the new SchemaStore.
1725     bool update_filter_cache =
1726         set_schema_result.old_schema_type_ids_changed.count(
1727             filter_data->schema_type_id()) != 0;
1728 
1729     // Check if we need to revalidate this document because its type is now
1730     // incompatible.
1731     bool revalidate_document =
1732         set_schema_result.schema_types_incompatible_by_id.count(
1733             filter_data->schema_type_id()) != 0;
1734 
1735     if (update_filter_cache || revalidate_document) {
1736       ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
1737 
1738       if (update_filter_cache) {
1739         ICING_ASSIGN_OR_RETURN(
1740             SchemaTypeId schema_type_id,
1741             schema_store_->GetSchemaTypeId(document.schema()));
1742         ICING_ASSIGN_OR_RETURN(
1743             typename FileBackedVector<DocumentFilterData>::MutableView
1744                 doc_filter_data_view,
1745             filter_cache_->GetMutable(document_id));
1746         doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
1747       }
1748       if (revalidate_document) {
1749         delete_document = !document_validator_.Validate(document).ok();
1750       }
1751     }
1752 
1753     if (delete_document) {
1754       // Document is no longer valid with the new SchemaStore. Mark as deleted
1755       auto delete_status = Delete(document_id, current_time_ms);
1756       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1757         // Real error, pass up
1758         return delete_status;
1759       }
1760     }
1761   }
1762 
1763   return libtextclassifier3::Status::OK;
1764 }
1765 
1766 // TODO(b/121227117): Implement Optimize()
1767 libtextclassifier3::Status DocumentStore::Optimize() {
1768   return libtextclassifier3::Status::OK;
1769 }
1770 
1771 libtextclassifier3::StatusOr<std::vector<DocumentId>>
1772 DocumentStore::OptimizeInto(const std::string& new_directory,
1773                             const LanguageSegmenter* lang_segmenter,
1774                             bool namespace_id_fingerprint,
1775                             OptimizeStatsProto* stats) {
1776   // Validates directory
1777   if (new_directory == base_dir_) {
1778     return absl_ports::InvalidArgumentError(
1779         "New directory is the same as the current one.");
1780   }
1781 
1782   ICING_ASSIGN_OR_RETURN(
1783       auto doc_store_create_result,
1784       DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_,
1785                             /*force_recovery_and_revalidate_documents=*/false,
1786                             namespace_id_fingerprint, compression_level_,
1787                             /*initialize_stats=*/nullptr));
1788   std::unique_ptr<DocumentStore> new_doc_store =
1789       std::move(doc_store_create_result.document_store);
1790 
1791   // Writes all valid docs into new document store (new directory)
1792   int size = document_id_mapper_->num_elements();
1793   int num_deleted = 0;
1794   int num_expired = 0;
1795   UsageStore::UsageScores default_usage;
1796   std::vector<DocumentId> document_id_old_to_new(size, kInvalidDocumentId);
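       // document_id_old_to_new is indexed by the old DocumentId; entries left as
       // kInvalidDocumentId correspond to documents that were deleted or expired
       // and therefore not copied into the new store.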
1797   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1798   for (DocumentId document_id = 0; document_id < size; document_id++) {
1799     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
1800     if (absl_ports::IsNotFound(document_or.status())) {
1801       if (IsDeleted(document_id)) {
1802         ++num_deleted;
1803       } else if (!GetNonExpiredDocumentFilterData(document_id,
1804                                                   current_time_ms)) {
1805         ++num_expired;
1806       }
1807       continue;
1808     } else if (!document_or.ok()) {
1809       // Real error, pass up
1810       return absl_ports::Annotate(
1811           document_or.status(),
1812           IcingStringUtil::StringPrintf(
1813               "Failed to retrieve Document for DocumentId %d", document_id));
1814     }
1815 
1816     // Guaranteed to have a document now.
1817     DocumentProto document_to_keep = std::move(document_or).ValueOrDie();
1818 
1819     libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
1820     if (document_to_keep.internal_fields().length_in_tokens() == 0) {
1821       auto tokenized_document_or = TokenizedDocument::Create(
1822           schema_store_, lang_segmenter, document_to_keep);
1823       if (!tokenized_document_or.ok()) {
1824         return absl_ports::Annotate(
1825             tokenized_document_or.status(),
1826             IcingStringUtil::StringPrintf(
1827                 "Failed to tokenize Document for DocumentId %d", document_id));
1828       }
1829       TokenizedDocument tokenized_document(
1830           std::move(tokenized_document_or).ValueOrDie());
1831       new_document_id_or = new_doc_store->Put(
1832           std::move(document_to_keep), tokenized_document.num_string_tokens());
1833     } else {
1834       // TODO(b/144458732): Implement a more robust version of
1835       // TC_ASSIGN_OR_RETURN that can support error logging.
1836       new_document_id_or =
1837           new_doc_store->InternalPut(std::move(document_to_keep));
1838     }
1839     if (!new_document_id_or.ok()) {
1840       ICING_LOG(ERROR) << "Failed to write into new document store: "
1841                        << new_document_id_or.status().error_message();
1842       return new_document_id_or.status();
1843     }
1844 
1845     document_id_old_to_new[document_id] = new_document_id_or.ValueOrDie();
1846 
1847     // Copy over usage scores.
1848     ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
1849                            usage_store_->GetUsageScores(document_id));
1850     if (!(usage_scores == default_usage)) {
1851       // Only copy usage scores that differ from the default (no usage). There
1852       // is no need to possibly allocate storage if there's nothing interesting
1853       // to store.
1854       DocumentId new_document_id = new_document_id_or.ValueOrDie();
1855       ICING_RETURN_IF_ERROR(
1856           new_doc_store->SetUsageScores(new_document_id, usage_scores));
1857     }
1858   }
1859   if (stats != nullptr) {
1860     stats->set_num_original_documents(size);
1861     stats->set_num_deleted_documents(num_deleted);
1862     stats->set_num_expired_documents(num_expired);
1863   }
1864   ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
1865   return document_id_old_to_new;
1866 }
1867 
1868 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
1869 DocumentStore::GetOptimizeInfo() const {
1870   OptimizeInfo optimize_info;
1871 
1872   // Figure out our ratio of optimizable/total docs.
1873   int32_t num_documents = document_id_mapper_->num_elements();
1874   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1875   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
1876        ++document_id) {
1877     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1878       ++optimize_info.optimizable_docs;
1879     }
1880 
1881     ++optimize_info.total_docs;
1882   }
1883 
1884   if (optimize_info.total_docs == 0) {
1885     // Can exit early since there's nothing to calculate.
1886     return optimize_info;
1887   }
1888 
1889   // Get the total element size.
1890   //
1891   // We use file size instead of disk usage here because the files are not
1892   // sparse, so it's more accurate. Disk usage rounds up to the nearest block
1893   // size.
1894   ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
1895                          document_log_->GetElementsFileSize());
1896   ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
1897                          document_id_mapper_->GetElementsFileSize());
1898   ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
1899                          score_cache_->GetElementsFileSize());
1900   ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
1901                          filter_cache_->GetElementsFileSize());
1902   ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
1903                          corpus_score_cache_->GetElementsFileSize());
1904 
1905   // Usage store might be sparse, but we'll still use file size for more
1906   // accurate counting.
1907   ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
1908                          usage_store_->GetElementsFileSize());
1909 
1910   // We use a combined disk usage and file size for the DynamicTrieKeyMapper
1911   // because it's backed by a trie, which has some sparse property bitmaps.
1912   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
1913                          document_key_mapper_->GetElementsSize());
1914 
1915   // We don't include the namespace_mapper or the corpus_mapper because it's
1916   // not clear if we could recover any space even if Optimize were called.
1917   // Deleting hundreds of documents could still leave a few documents in a
1918   // namespace, and then there would be no change.
1919 
1920   int64_t total_size = document_log_file_size + document_key_mapper_size +
1921                        document_id_mapper_file_size + score_cache_file_size +
1922                        filter_cache_file_size + corpus_score_cache_file_size +
1923                        usage_store_file_size;
1924 
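       // Prorate the total file size by the fraction of optimizable (deleted or
       // expired) documents. For example, a total_size of 1000 bytes with 25
       // optimizable documents out of 100 yields an estimate of 250 bytes.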
1925   optimize_info.estimated_optimizable_bytes =
1926       total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
1927   return optimize_info;
1928 }
1929 
1930 libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
1931     CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
1932   return corpus_score_cache_->Set(corpus_id, score_data);
1933 }
1934 
1935 libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
1936     DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
1937   return score_cache_->Set(document_id, score_data);
1938 }
1939 
1940 libtextclassifier3::Status DocumentStore::UpdateFilterCache(
1941     DocumentId document_id, const DocumentFilterData& filter_data) {
1942   return filter_cache_->Set(document_id, filter_data);
1943 }
1944 
1945 libtextclassifier3::Status DocumentStore::ClearDerivedData(
1946     DocumentId document_id) {
1947   // We intentionally leave the data in key_mapper_ because locating that data
1948   // requires fetching namespace and uri. Leaving data in key_mapper_ should
1949   // be fine because the data is hashed.
1950 
1951   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
1952 
1953   // Resets the score cache entry
1954   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
1955       document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
1956                                                /*document_score=*/-1,
1957                                                /*creation_timestamp_ms=*/-1,
1958                                                /*length_in_tokens=*/0)));
1959 
1960   // Resets the filter cache entry
1961   ICING_RETURN_IF_ERROR(UpdateFilterCache(
1962       document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
1963                                       /*expiration_timestamp_ms=*/-1)));
1964 
1965   // Clears the usage scores.
1966   return usage_store_->DeleteUsageScores(document_id);
1967 }
1968 
1969 libtextclassifier3::Status DocumentStore::SetUsageScores(
1970     DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
1971   return usage_store_->SetUsageScores(document_id, usage_scores);
1972 }
1973 
1974 libtextclassifier3::StatusOr<
1975     google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
1976 DocumentStore::CollectCorpusInfo() const {
1977   google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
1978   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
1979       schema_store_->GetSchema();
1980   if (!schema_proto_or.ok()) {
1981     return corpus_info;
1982   }
1983   // Maps from CorpusId to the corresponding protocol buffer in the result.
1984   std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
1985   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1986       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1987   const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
1988   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1989   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1990        ++document_id) {
1991     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1992       continue;
1993     }
1994     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
1995                            filter_cache_->Get(document_id));
1996     ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
1997                            score_cache_->Get(document_id));
1998     const std::string& name_space =
1999         namespace_id_to_namespace[filter_data->namespace_id()];
2000     const std::string& schema =
2001         schema_proto->types()[filter_data->schema_type_id()].schema_type();
2002     auto iter = info_map.find(score_data->corpus_id());
2003     if (iter == info_map.end()) {
2004       DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
2005       entry->set_namespace_(name_space);
2006       entry->set_schema(schema);
2007       iter = info_map.insert({score_data->corpus_id(), entry}).first;
2008     }
2009     iter->second->set_total_documents(iter->second->total_documents() + 1);
2010     iter->second->set_total_token(iter->second->total_token() +
2011                                   score_data->length_in_tokens());
2012   }
2013   return corpus_info;
2014 }
2015 
2016 libtextclassifier3::StatusOr<DocumentDebugInfoProto>
2017 DocumentStore::GetDebugInfo(int verbosity) const {
2018   DocumentDebugInfoProto debug_info;
2019   *debug_info.mutable_document_storage_info() = GetStorageInfo();
2020   ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
2021   debug_info.set_crc(crc.Get());
2022   if (verbosity > 0) {
2023     ICING_ASSIGN_OR_RETURN(
2024         google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
2025             corpus_info,
2026         CollectCorpusInfo());
2027     *debug_info.mutable_corpus_info() = std::move(corpus_info);
2028   }
2029   return debug_info;
2030 }
2031 
2032 }  // namespace lib
2033 }  // namespace icing
2034