1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/store/document-store.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_map>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/text_classifier/lib3/utils/hash/farmhash.h"
30 #include "icing/absl_ports/annotate.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/file-backed-proto-log.h"
34 #include "icing/file/file-backed-vector.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/memory-mapped-file.h"
37 #include "icing/file/portable-file-backed-proto-log.h"
38 #include "icing/legacy/core/icing-string-util.h"
39 #include "icing/proto/debug.pb.h"
40 #include "icing/proto/document.pb.h"
41 #include "icing/proto/document_wrapper.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/optimize.pb.h"
44 #include "icing/proto/persist.pb.h"
45 #include "icing/proto/schema.pb.h"
46 #include "icing/proto/storage.pb.h"
47 #include "icing/proto/usage.pb.h"
48 #include "icing/schema/schema-store.h"
49 #include "icing/store/corpus-associated-scoring-data.h"
50 #include "icing/store/corpus-id.h"
51 #include "icing/store/document-associated-score-data.h"
52 #include "icing/store/document-filter-data.h"
53 #include "icing/store/document-id.h"
54 #include "icing/store/document-log-creator.h"
55 #include "icing/store/dynamic-trie-key-mapper.h"
56 #include "icing/store/namespace-fingerprint-identifier.h"
57 #include "icing/store/namespace-id.h"
58 #include "icing/store/persistent-hash-map-key-mapper.h"
59 #include "icing/store/usage-store.h"
60 #include "icing/tokenization/language-segmenter.h"
61 #include "icing/util/clock.h"
62 #include "icing/util/crc32.h"
63 #include "icing/util/data-loss.h"
64 #include "icing/util/encode-util.h"
65 #include "icing/util/fingerprint-util.h"
66 #include "icing/util/logging.h"
67 #include "icing/util/status-macros.h"
68 #include "icing/util/tokenized-document.h"
69 
70 namespace icing {
71 namespace lib {
72 
73 namespace {
74 
75 // Used in DocumentId mapper to mark a document as deleted
76 constexpr int64_t kDocDeletedFlag = -1;
77 constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
78 constexpr char kUriHashMapperWorkingPath[] = "uri_mapper";
79 constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
80 constexpr char kScoreCacheFilename[] = "score_cache";
81 constexpr char kCorpusScoreCache[] = "corpus_score_cache";
82 constexpr char kFilterCacheFilename[] = "filter_cache";
83 constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
84 constexpr char kUsageStoreDirectoryName[] = "usage_store";
85 constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";
86 
87 // Determined through manual testing to allow for 4 million uris. 4 million
88 // because we allow up to 4 million DocumentIds.
89 constexpr int32_t kUriDynamicTrieKeyMapperMaxSize =
90     144 * 1024 * 1024;  // 144 MiB
91 
92 constexpr int32_t kUriHashKeyMapperMaxNumEntries =
93     kMaxDocumentId + 1;  // 1 << 22, 4M
94 // - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1
95 //        byte)
96 // - Value: DocumentId (4 bytes)
97 constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId);
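// A consistency sketch, assuming DocumentId remains a 4-byte integer: the key
// described above is 3 + 10 + 1 = 14 bytes, so each key-value pair is about
// 14 + 4 = 18 bytes.
static_assert(kUriHashKeyMapperKVByteSize == 14 + sizeof(DocumentId),
              "kUriHashKeyMapperKVByteSize should cover a 14-byte key plus a "
              "DocumentId value");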
98 
99 // 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
100 // max of 128 KiB for storage.
101 constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
102 constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
103 
104 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
105   DocumentWrapper document_wrapper;
106   *document_wrapper.mutable_document() = std::move(document);
107   return document_wrapper;
108 }
109 
110 std::string MakeHeaderFilename(const std::string& base_dir) {
111   return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
112 }
113 
114 std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) {
115   return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath);
116 }
117 
118 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
119   return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
120 }
121 
122 std::string MakeScoreCacheFilename(const std::string& base_dir) {
123   return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
124 }
125 
126 std::string MakeCorpusScoreCache(const std::string& base_dir) {
127   return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
128 }
129 
130 std::string MakeFilterCacheFilename(const std::string& base_dir) {
131   return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
132 }
133 
134 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
135   return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
136 }
137 
138 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
139   return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
140 }
141 
142 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
143   return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
144 }
145 
146 int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
147                                        int64_t ttl_ms) {
148   if (ttl_ms == 0) {
149     // Special case where a TTL of 0 indicates the document should never
150     // expire. int64_t max, interpreted as seconds since epoch, represents
151     // some point in the year 292,277,026,596. So we're probably ok to use
152     // this as "never reaching this point".
153     return std::numeric_limits<int64_t>::max();
154   }
155 
156   int64_t expiration_timestamp_ms;
157   if (__builtin_add_overflow(creation_timestamp_ms, ttl_ms,
158                              &expiration_timestamp_ms)) {
159     // Overflow detected. Treat an overflowing expiration the same as
160     // int64_t max, i.e. the document never expires.
161     return std::numeric_limits<int64_t>::max();
162   }
163 
164   return expiration_timestamp_ms;
165 }
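// A quick behavior sketch with illustrative values:
//   CalculateExpirationTimestampMs(/*creation_timestamp_ms=*/1000, /*ttl_ms=*/0)
//       -> std::numeric_limits<int64_t>::max() (TTL of 0 means never expire)
//   CalculateExpirationTimestampMs(1000, 500) -> 1500
//   CalculateExpirationTimestampMs(std::numeric_limits<int64_t>::max(), 1)
//       -> std::numeric_limits<int64_t>::max() (overflow is clamped)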
166 
167 InitializeStatsProto::RecoveryCause GetRecoveryCause(
168     const DocumentLogCreator::CreateResult& create_result,
169     bool force_recovery_and_revalidate_documents) {
170   if (force_recovery_and_revalidate_documents) {
171     return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
172   } else if (create_result.log_create_result.has_data_loss()) {
173     return InitializeStatsProto::DATA_LOSS;
174   } else if (create_result.preexisting_file_version !=
175              DocumentLogCreator::kCurrentVersion) {
176     return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
177   }
178   return InitializeStatsProto::NONE;
179 }
180 
181 InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
182     DataLoss data_loss) {
183   switch (data_loss) {
184     case DataLoss::PARTIAL:
185       return InitializeStatsProto::PARTIAL_LOSS;
186     case DataLoss::COMPLETE:
187       return InitializeStatsProto::COMPLETE_LOSS;
188     case DataLoss::NONE:
189       return InitializeStatsProto::NO_DATA_LOSS;
190   }
191 }
192 
193 std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
194     const KeyMapper<NamespaceId>* key_mapper) {
195   std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
196 
197   std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
198       key_mapper->GetIterator();
199   while (itr->Advance()) {
200     namespace_ids_to_namespaces.insert(
201         {itr->GetValue(), std::string(itr->GetKey())});
202   }
203   return namespace_ids_to_namespaces;
204 }
205 
206 libtextclassifier3::StatusOr<std::unique_ptr<
207     KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>>
208 CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir,
209                 bool use_persistent_hash_map) {
210   std::string uri_hash_mapper_working_path =
211       MakeUriHashMapperWorkingPath(base_dir);
212   // Due to a historic issue, we use the document store's base_dir directly as
213   // DynamicTrieKeyMapper's working directory for uri mapper.
214   // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the
215   // actual files will be put under "<base_dir>/key_mapper_dir/".
216   bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
217       absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str());
218   bool persistent_hash_map_dir_exists =
219       filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str());
220   if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
221       (!use_persistent_hash_map && persistent_hash_map_dir_exists)) {
222     // Return a failure here so that the caller can properly delete and rebuild
223     // this component.
224     return absl_ports::FailedPreconditionError("Key mapper type mismatch");
225   }
226 
227   if (use_persistent_hash_map) {
228     return PersistentHashMapKeyMapper<
229         DocumentId, fingerprint_util::FingerprintStringFormatter>::
230         Create(filesystem, std::move(uri_hash_mapper_working_path),
231                /*pre_mapping_fbv=*/false,
232                /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries,
233                /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize);
234   } else {
235     return DynamicTrieKeyMapper<DocumentId,
236                                 fingerprint_util::FingerprintStringFormatter>::
237         Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize);
238   }
239 }
240 
241 }  // namespace
242 
243 std::string DocumentStore::MakeFingerprint(
244     NamespaceId namespace_id, std::string_view namespace_,
245     std::string_view uri_or_schema) const {
246   if (!namespace_id_fingerprint_) {
247     // Using a 64-bit fingerprint to represent the key could lead to collisions.
248     // But, even with 200K unique keys, the probability of collision is about
249     // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
250     uint64_t fprint = tc3farmhash::Fingerprint64(
251         absl_ports::StrCat(namespace_, uri_or_schema));
252     return fingerprint_util::GetFingerprintString(fprint);
253   } else {
254     return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema)
255         .EncodeToCString();
256   }
257 }
258 
259 DocumentStore::DocumentStore(const Filesystem* filesystem,
260                              const std::string_view base_dir,
261                              const Clock* clock,
262                              const SchemaStore* schema_store,
263                              bool namespace_id_fingerprint,
264                              bool pre_mapping_fbv, bool use_persistent_hash_map,
265                              int32_t compression_level)
266     : filesystem_(filesystem),
267       base_dir_(base_dir),
268       clock_(*clock),
269       schema_store_(schema_store),
270       document_validator_(schema_store),
271       namespace_id_fingerprint_(namespace_id_fingerprint),
272       pre_mapping_fbv_(pre_mapping_fbv),
273       use_persistent_hash_map_(use_persistent_hash_map),
274       compression_level_(compression_level) {}
275 
276 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
277     const DocumentProto& document, int32_t num_tokens,
278     PutDocumentStatsProto* put_document_stats) {
279   return Put(DocumentProto(document), num_tokens, put_document_stats);
280 }
281 
282 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
283     DocumentProto&& document, int32_t num_tokens,
284     PutDocumentStatsProto* put_document_stats) {
285   document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
286   return InternalPut(std::move(document), put_document_stats);
287 }
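// A minimal caller-side sketch (hypothetical variables: `doc_store` is an
// initialized DocumentStore and `document` matches the current schema):
//
//   ICING_ASSIGN_OR_RETURN(
//       DocumentId document_id,
//       doc_store->Put(std::move(document), /*num_tokens=*/42,
//                      /*put_document_stats=*/nullptr));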
288 
289 DocumentStore::~DocumentStore() {
290   if (initialized_) {
291     if (!PersistToDisk(PersistType::FULL).ok()) {
292       ICING_LOG(ERROR)
293           << "Error persisting to disk in DocumentStore destructor";
294     }
295   }
296 }
297 
298 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
299     const Filesystem* filesystem, const std::string& base_dir,
300     const Clock* clock, const SchemaStore* schema_store,
301     bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint,
302     bool pre_mapping_fbv, bool use_persistent_hash_map,
303     int32_t compression_level, InitializeStatsProto* initialize_stats) {
304   ICING_RETURN_ERROR_IF_NULL(filesystem);
305   ICING_RETURN_ERROR_IF_NULL(clock);
306   ICING_RETURN_ERROR_IF_NULL(schema_store);
307 
308   auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
309       filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
310       pre_mapping_fbv, use_persistent_hash_map, compression_level));
311   ICING_ASSIGN_OR_RETURN(
312       InitializeResult initialize_result,
313       document_store->Initialize(force_recovery_and_revalidate_documents,
314                                  initialize_stats));
315 
316   CreateResult create_result;
317   create_result.document_store = std::move(document_store);
318   create_result.data_loss = initialize_result.data_loss;
319   create_result.derived_files_regenerated =
320       initialize_result.derived_files_regenerated;
321   return create_result;
322 }
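// A minimal creation sketch (hypothetical variables; assumes `filesystem`,
// `clock`, and `schema_store` stay valid for the store's lifetime, and the
// flag/compression values below are only examples):
//
//   ICING_ASSIGN_OR_RETURN(
//       DocumentStore::CreateResult create_result,
//       DocumentStore::Create(&filesystem, base_dir, &clock, schema_store,
//                             /*force_recovery_and_revalidate_documents=*/false,
//                             /*namespace_id_fingerprint=*/false,
//                             /*pre_mapping_fbv=*/false,
//                             /*use_persistent_hash_map=*/false,
//                             /*compression_level=*/3,
//                             /*initialize_stats=*/nullptr));
//   std::unique_ptr<DocumentStore> document_store =
//       std::move(create_result.document_store);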
323 
324 /* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
325     const Filesystem* filesystem, const std::string& base_dir) {
326   // Header
327   const std::string header_filename = MakeHeaderFilename(base_dir);
328   if (!filesystem->DeleteFile(MakeHeaderFilename(base_dir).c_str())) {
329     return absl_ports::InternalError("Couldn't delete header file");
330   }
331 
332   // Document key mapper. Doesn't hurt to delete both dynamic trie and
333   // persistent hash map without checking.
334   ICING_RETURN_IF_ERROR(
335       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
336   ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
337       *filesystem, MakeUriHashMapperWorkingPath(base_dir)));
338 
339   // Document id mapper
340   ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
341       *filesystem, MakeDocumentIdMapperFilename(base_dir)));
342 
343   // Document associated score cache
344   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
345       *filesystem, MakeScoreCacheFilename(base_dir)));
346 
347   // Filter cache
348   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
349       *filesystem, MakeFilterCacheFilename(base_dir)));
350 
351   // Namespace mapper
352   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
353       *filesystem, MakeNamespaceMapperFilename(base_dir)));
354 
355   // Corpus mapper
356   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
357       *filesystem, MakeCorpusMapperFilename(base_dir)));
358 
359   // Corpus associated score cache
360   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
361       *filesystem, MakeCorpusScoreCache(base_dir)));
362 
363   return libtextclassifier3::Status::OK;
364 }
365 
366 libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
367 DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
368                           InitializeStatsProto* initialize_stats) {
369   auto create_result_or =
370       DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
371 
372   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
373   // that can support error logging.
374   if (!create_result_or.ok()) {
375     ICING_LOG(ERROR) << create_result_or.status().error_message()
376                      << "\nFailed to initialize DocumentLog.";
377     return create_result_or.status();
378   }
379   DocumentLogCreator::CreateResult create_result =
380       std::move(create_result_or).ValueOrDie();
381 
382   document_log_ = std::move(create_result.log_create_result.proto_log);
383   InitializeStatsProto::RecoveryCause recovery_cause =
384       GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
385 
386   bool derived_files_regenerated = false;
387   if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
388     ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
389                     << recovery_cause << ", and create result { new_file="
390                     << create_result.new_file << ", preexisting_file_version="
391                     << create_result.preexisting_file_version << ", data_loss="
392                     << create_result.log_create_result.data_loss
393                     << "} and kCurrentVersion="
394                     << DocumentLogCreator::kCurrentVersion;
395     // We can't rely on any existing derived files. Recreate them from scratch.
396     // Currently happens if:
397     //   1) This is a new log and we don't have derived files yet
398     //   2) Client wanted us to force a regeneration.
399     //   3) Log has some data loss, can't rely on existing derived data.
400     std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
401     libtextclassifier3::Status status =
402         RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
403     if (recovery_cause != InitializeStatsProto::NONE) {
404       // Only consider it a recovery if the client forced a recovery or there
405       // was data loss. Otherwise, this could just be the first time we're
406       // initializing and generating derived files.
407       derived_files_regenerated = true;
408       if (initialize_stats != nullptr) {
409         initialize_stats->set_document_store_recovery_latency_ms(
410             document_recovery_timer->GetElapsedMilliseconds());
411         initialize_stats->set_document_store_recovery_cause(recovery_cause);
412         initialize_stats->set_document_store_data_status(
413             GetDataStatus(create_result.log_create_result.data_loss));
414       }
415     }
416     if (!status.ok()) {
417       ICING_LOG(ERROR)
418           << "Failed to regenerate derived files for DocumentStore";
419       return status;
420     }
421   } else {
422     if (!InitializeExistingDerivedFiles().ok()) {
423       ICING_LOG(WARNING)
424           << "Couldn't find derived files or failed to initialize them, "
425              "regenerating derived files for DocumentStore.";
426       std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
427       derived_files_regenerated = true;
428       libtextclassifier3::Status status = RegenerateDerivedFiles(
429           /*force_recovery_and_revalidate_documents=*/false);
430       if (initialize_stats != nullptr) {
431         initialize_stats->set_document_store_recovery_cause(
432             InitializeStatsProto::IO_ERROR);
433         initialize_stats->set_document_store_recovery_latency_ms(
434             document_recovery_timer->GetElapsedMilliseconds());
435       }
436       if (!status.ok()) {
437         ICING_LOG(ERROR)
438             << "Failed to regenerate derived files for DocumentStore";
439         return status;
440       }
441     }
442   }
443 
444   initialized_ = true;
445   if (initialize_stats != nullptr) {
446     initialize_stats->set_num_documents(document_id_mapper_->num_elements());
447   }
448 
449   InitializeResult initialize_result = {
450       .data_loss = create_result.log_create_result.data_loss,
451       .derived_files_regenerated = derived_files_regenerated};
452   return initialize_result;
453 }
454 
455 libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
456   if (!HeaderExists()) {
457     // Without a header, we don't know if things are consistent between each
458     // other so the caller should just regenerate everything from ground
459     // truth.
460     return absl_ports::InternalError("DocumentStore header doesn't exist");
461   }
462 
463   DocumentStore::Header header;
464   if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
465                          sizeof(header))) {
466     return absl_ports::InternalError(
467         absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
468   }
469 
470   if (header.magic !=
471       DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) {
472     return absl_ports::InternalError(absl_ports::StrCat(
473         "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
474   }
475 
476   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
477   // that can support error logging.
478   auto document_key_mapper_or =
479       CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
480   if (!document_key_mapper_or.ok()) {
481     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
482                      << "Failed to initialize KeyMapper";
483     return document_key_mapper_or.status();
484   }
485   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
486 
487   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
488   // that can support error logging.
489   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
490       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
491       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
492   if (!document_id_mapper_or.ok()) {
493     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
494                      << "Failed to initialize DocumentIdMapper";
495     return document_id_mapper_or.status();
496   }
497   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
498 
499   ICING_ASSIGN_OR_RETURN(score_cache_,
500                          FileBackedVector<DocumentAssociatedScoreData>::Create(
501                              *filesystem_, MakeScoreCacheFilename(base_dir_),
502                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
503 
504   ICING_ASSIGN_OR_RETURN(filter_cache_,
505                          FileBackedVector<DocumentFilterData>::Create(
506                              *filesystem_, MakeFilterCacheFilename(base_dir_),
507                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
508 
509   ICING_ASSIGN_OR_RETURN(
510       namespace_mapper_,
511       DynamicTrieKeyMapper<NamespaceId>::Create(
512           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
513           kNamespaceMapperMaxSize));
514 
515   ICING_ASSIGN_OR_RETURN(
516       usage_store_,
517       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
518 
519   auto corpus_mapper_or =
520       DynamicTrieKeyMapper<CorpusId,
521                            fingerprint_util::FingerprintStringFormatter>::
522           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
523                  kCorpusMapperMaxSize);
524   if (!corpus_mapper_or.ok()) {
525     return std::move(corpus_mapper_or).status();
526   }
527   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
528 
529   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
530                          FileBackedVector<CorpusAssociatedScoreData>::Create(
531                              *filesystem_, MakeCorpusScoreCache(base_dir_),
532                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
533 
534   // Ensure the usage store is the correct size.
535   ICING_RETURN_IF_ERROR(
536       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
537 
538   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
539   if (checksum.Get() != header.checksum) {
540     return absl_ports::InternalError(
541         "Combined checksum of DocStore was inconsistent");
542   }
543 
544   return libtextclassifier3::Status::OK;
545 }
546 
547 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
548     bool revalidate_documents) {
549   ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
550   ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
551   ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
552   ICING_RETURN_IF_ERROR(ResetFilterCache());
553   ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
554   ICING_RETURN_IF_ERROR(ResetCorpusMapper());
555   ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
556 
557   // Creates a new UsageStore instance. Note that we don't reset the data in
558   // usage store here because we're not able to regenerate the usage scores.
559   ICING_ASSIGN_OR_RETURN(
560       usage_store_,
561       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
562 
563   // Iterates through document log
564   auto iterator = document_log_->GetIterator();
565   auto iterator_status = iterator.Advance();
566   libtextclassifier3::StatusOr<int64_t> element_size =
567       document_log_->GetElementsFileSize();
568   libtextclassifier3::StatusOr<int64_t> disk_usage =
569       document_log_->GetDiskUsage();
570   if (element_size.ok() && disk_usage.ok()) {
571     ICING_VLOG(1) << "Starting recovery of document store. Document store "
572                      "elements file size:"
573                   << element_size.ValueOrDie()
574                   << ", disk usage=" << disk_usage.ValueOrDie();
575   }
576   while (iterator_status.ok()) {
577     ICING_VLOG(2) << "Attempting to read document at offset="
578                   << iterator.GetOffset();
579     libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
580         document_log_->ReadProto(iterator.GetOffset());
581 
582     if (absl_ports::IsNotFound(document_wrapper_or.status())) {
583       // The erased document still occupies 1 document id.
584       DocumentId new_document_id = document_id_mapper_->num_elements();
585       ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
586       iterator_status = iterator.Advance();
587       continue;
588     } else if (!document_wrapper_or.ok()) {
589       return document_wrapper_or.status();
590     }
591 
592     DocumentWrapper document_wrapper =
593         std::move(document_wrapper_or).ValueOrDie();
594     // Revalidate that this document is still compatible if requested.
595     if (revalidate_documents) {
596       if (!document_validator_.Validate(document_wrapper.document()).ok()) {
597         // Document is no longer valid with the current schema. Mark as
598         // deleted
599         DocumentId new_document_id = document_id_mapper_->num_elements();
600         ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
601         ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
602         continue;
603       }
604     }
605 
606     ICING_ASSIGN_OR_RETURN(
607         NamespaceId namespace_id,
608         namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
609                                     namespace_mapper_->num_keys()));
610 
611     // Updates key mapper and document_id mapper with the new document
612     DocumentId new_document_id = document_id_mapper_->num_elements();
613     ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
614         MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
615                         document_wrapper.document().uri()),
616         new_document_id));
617     ICING_RETURN_IF_ERROR(
618         document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
619 
620     SchemaTypeId schema_type_id;
621     auto schema_type_id_or =
622         schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
623     if (absl_ports::IsNotFound(schema_type_id_or.status())) {
624       // Didn't find a SchemaTypeId. This means that the DocumentStore and
625       // the SchemaStore are out of sync. But DocumentStore can't do
626       // anything about it so just ignore this for now. This should be
627       // detected/handled by the owner of DocumentStore. Set it to some
628       // arbitrary invalid value for now, it'll get updated to the correct
629       // ID later.
630       schema_type_id = -1;
631     } else if (!schema_type_id_or.ok()) {
632       // Real error. Pass it up
633       return schema_type_id_or.status();
634     } else {
635       // We're guaranteed that SchemaTypeId is valid now
636       schema_type_id = schema_type_id_or.ValueOrDie();
637     }
638 
639     // Update corpus maps
640     std::string corpus =
641         MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
642                         document_wrapper.document().schema());
643     ICING_ASSIGN_OR_RETURN(
644         CorpusId corpusId,
645         corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
646 
647     ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
648                            GetCorpusAssociatedScoreDataToUpdate(corpusId));
649     scoring_data.AddDocument(
650         document_wrapper.document().internal_fields().length_in_tokens());
651 
652     ICING_RETURN_IF_ERROR(
653         UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
654 
655     ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
656         new_document_id,
657         DocumentAssociatedScoreData(
658             corpusId, document_wrapper.document().score(),
659             document_wrapper.document().creation_timestamp_ms(),
660             document_wrapper.document().internal_fields().length_in_tokens())));
661 
662     int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
663         document_wrapper.document().creation_timestamp_ms(),
664         document_wrapper.document().ttl_ms());
665 
666     ICING_RETURN_IF_ERROR(UpdateFilterCache(
667         new_document_id, DocumentFilterData(namespace_id, schema_type_id,
668                                             expiration_timestamp_ms)));
669     iterator_status = iterator.Advance();
670   }
671 
672   if (!absl_ports::IsOutOfRange(iterator_status)) {
673     ICING_LOG(WARNING)
674         << "Failed to iterate through proto log while regenerating "
675            "derived files";
676     return absl_ports::Annotate(iterator_status,
677                                 "Failed to iterate through proto log.");
678   }
679 
680   // Shrink usage_store_ to the correct size.
681   ICING_RETURN_IF_ERROR(
682       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
683 
684   // Write the header
685   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
686   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
687 
688   return libtextclassifier3::Status::OK;
689 }
690 
691 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
692   // Only one type of KeyMapper (either DynamicTrieKeyMapper or
693   // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok
694   // to call Delete() for both since Delete() returns OK if any of them doesn't
695   // exist.
696   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
697   document_key_mapper_.reset();
698   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
699   // that can support error logging.
700   libtextclassifier3::Status status =
701       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
702   if (!status.ok()) {
703     ICING_LOG(ERROR) << status.error_message()
704                      << "Failed to delete old dynamic trie key mapper";
705     return status;
706   }
707   status = PersistentHashMapKeyMapper<DocumentId>::Delete(
708       *filesystem_, MakeUriHashMapperWorkingPath(base_dir_));
709   if (!status.ok()) {
710     ICING_LOG(ERROR) << status.error_message()
711                      << "Failed to delete old persistent hash map key mapper";
712     return status;
713   }
714 
715   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
716   // that can support error logging.
717   auto document_key_mapper_or =
718       CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
719   if (!document_key_mapper_or.ok()) {
720     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
721                      << "Failed to re-init key mapper";
722     return document_key_mapper_or.status();
723   }
724   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
725   return libtextclassifier3::Status::OK;
726 }
727 
728 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
729   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
730   document_id_mapper_.reset();
731   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
732   // that can support error logging.
733   libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
734       *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
735   if (!status.ok()) {
736     ICING_LOG(ERROR) << status.error_message()
737                      << "Failed to delete old document_id mapper";
738     return status;
739   }
740   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
741   // that can support error logging.
742   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
743       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
744       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
745   if (!document_id_mapper_or.ok()) {
746     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
747                      << "Failed to re-init document_id mapper";
748     return document_id_mapper_or.status();
749   }
750   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
751   return libtextclassifier3::Status::OK;
752 }
753 
754 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
755   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
756   score_cache_.reset();
757   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
758       *filesystem_, MakeScoreCacheFilename(base_dir_)));
759   ICING_ASSIGN_OR_RETURN(score_cache_,
760                          FileBackedVector<DocumentAssociatedScoreData>::Create(
761                              *filesystem_, MakeScoreCacheFilename(base_dir_),
762                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
763   return libtextclassifier3::Status::OK;
764 }
765 
766 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
767   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
768   corpus_score_cache_.reset();
769   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
770       *filesystem_, MakeCorpusScoreCache(base_dir_)));
771   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
772                          FileBackedVector<CorpusAssociatedScoreData>::Create(
773                              *filesystem_, MakeCorpusScoreCache(base_dir_),
774                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
775   return libtextclassifier3::Status::OK;
776 }
777 
778 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
779   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
780   filter_cache_.reset();
781   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
782       *filesystem_, MakeFilterCacheFilename(base_dir_)));
783   ICING_ASSIGN_OR_RETURN(filter_cache_,
784                          FileBackedVector<DocumentFilterData>::Create(
785                              *filesystem_, MakeFilterCacheFilename(base_dir_),
786                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
787   return libtextclassifier3::Status::OK;
788 }
789 
790 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
791   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
792   namespace_mapper_.reset();
793   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
794   // that can support error logging.
795   libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
796       *filesystem_, MakeNamespaceMapperFilename(base_dir_));
797   if (!status.ok()) {
798     ICING_LOG(ERROR) << status.error_message()
799                      << "Failed to delete old namespace_id mapper";
800     return status;
801   }
802   ICING_ASSIGN_OR_RETURN(
803       namespace_mapper_,
804       DynamicTrieKeyMapper<NamespaceId>::Create(
805           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
806           kNamespaceMapperMaxSize));
807   return libtextclassifier3::Status::OK;
808 }
809 
810 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
811   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
812   corpus_mapper_.reset();
813   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
814   // that can support error logging.
815   libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
816       *filesystem_, MakeCorpusMapperFilename(base_dir_));
817   if (!status.ok()) {
818     ICING_LOG(ERROR) << status.error_message()
819                      << "Failed to delete old corpus_id mapper";
820     return status;
821   }
822   auto corpus_mapper_or =
823       DynamicTrieKeyMapper<CorpusId,
824                            fingerprint_util::FingerprintStringFormatter>::
825           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
826                  kCorpusMapperMaxSize);
827   if (!corpus_mapper_or.ok()) {
828     return std::move(corpus_mapper_or).status();
829   }
830   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
831   return libtextclassifier3::Status::OK;
832 }
833 
834 libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
835   Crc32 total_checksum;
836 
837   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
838   // that can support error logging.
839   auto checksum_or = document_log_->ComputeChecksum();
840   if (!checksum_or.ok()) {
841     ICING_LOG(ERROR) << checksum_or.status().error_message()
842                      << "Failed to compute checksum of DocumentLog";
843     return checksum_or.status();
844   }
845   Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
846 
847   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
848   // that can support error logging.
849   checksum_or = document_key_mapper_->ComputeChecksum();
850   if (!checksum_or.ok()) {
851     ICING_LOG(ERROR) << checksum_or.status().error_message()
852                      << "Failed to compute checksum of DocumentKeyMapper";
853     return checksum_or.status();
854   }
855   Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
856 
857   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
858   // that can support error logging.
859   checksum_or = document_id_mapper_->ComputeChecksum();
860   if (!checksum_or.ok()) {
861     ICING_LOG(ERROR) << checksum_or.status().error_message()
862                      << "Failed to compute checksum of DocumentIdMapper";
863     return checksum_or.status();
864   }
865   Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
866 
867   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
868   // that can support error logging.
869   checksum_or = score_cache_->ComputeChecksum();
870   if (!checksum_or.ok()) {
871     ICING_LOG(ERROR) << checksum_or.status().error_message()
872                      << "Failed to compute checksum of score cache";
873     return checksum_or.status();
874   }
875   Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
876 
877   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
878   // that can support error logging.
879   checksum_or = filter_cache_->ComputeChecksum();
880   if (!checksum_or.ok()) {
881     ICING_LOG(ERROR) << checksum_or.status().error_message()
882                      << "Failed to compute checksum of filter cache";
883     return checksum_or.status();
884   }
885   Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
886 
887   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
888   // that can support error logging.
889   checksum_or = namespace_mapper_->ComputeChecksum();
890   if (!checksum_or.ok()) {
891     ICING_LOG(ERROR) << checksum_or.status().error_message()
892                      << "Failed to compute checksum of namespace mapper";
893     return checksum_or.status();
894   }
895   Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
896 
897   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
898   // that can support error logging.
899   checksum_or = corpus_mapper_->ComputeChecksum();
900   if (!checksum_or.ok()) {
901     ICING_LOG(ERROR) << checksum_or.status().error_message()
902                      << "Failed to compute checksum of corpus mapper";
903     return checksum_or.status();
904   }
905   Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
906 
907   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
908   // that can support error logging.
909   checksum_or = corpus_score_cache_->ComputeChecksum();
910   if (!checksum_or.ok()) {
911     ICING_LOG(WARNING) << checksum_or.status().error_message()
912                        << "Failed to compute checksum of score cache";
913     return checksum_or.status();
914   }
915   Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
916 
917   // NOTE: We purposely don't include usage_store checksum here because we can't
918   // regenerate it from ground truth documents. If it gets corrupted, we'll just
919   // clear all usage reports, but we shouldn't throw everything else in the
920   // document store out.
921 
922   total_checksum.Append(std::to_string(document_log_checksum.Get()));
923   total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
924   total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
925   total_checksum.Append(std::to_string(score_cache_checksum.Get()));
926   total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
927   total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
928   total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
929   total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
930 
931   return total_checksum;
932 }
933 
934 bool DocumentStore::HeaderExists() {
935   if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
936     return false;
937   }
938 
939   int64_t file_size =
940       filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
941 
942   // If it's been truncated to size 0 before, we consider it to be a new file
943   return file_size != 0 && file_size != Filesystem::kBadFileSize;
944 }
945 
946 libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
947   // Write the header
948   DocumentStore::Header header;
949   header.magic =
950       DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_);
951   header.checksum = checksum.Get();
952 
953   // This should overwrite the header.
954   ScopedFd sfd(
955       filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
956   if (!sfd.is_valid() ||
957       !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
958       !filesystem_->DataSync(sfd.get())) {
959     return absl_ports::InternalError(absl_ports::StrCat(
960         "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
961   }
962   return libtextclassifier3::Status::OK;
963 }
964 
965 libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
966     DocumentProto&& document, PutDocumentStatsProto* put_document_stats) {
967   std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
968   ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
969 
970   if (put_document_stats != nullptr) {
971     put_document_stats->set_document_size(document.ByteSizeLong());
972   }
973 
974   // Copy fields needed before they are moved
975   std::string name_space = document.namespace_();
976   std::string uri = document.uri();
977   std::string schema = document.schema();
978   int document_score = document.score();
979   int32_t length_in_tokens = document.internal_fields().length_in_tokens();
980   int64_t creation_timestamp_ms = document.creation_timestamp_ms();
981 
982   // Sets the creation timestamp if caller hasn't specified.
983   if (document.creation_timestamp_ms() == 0) {
984     creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
985     document.set_creation_timestamp_ms(creation_timestamp_ms);
986   }
987 
988   int64_t expiration_timestamp_ms =
989       CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());
990 
991   // Update ground truth first
992   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
993   // that can support error logging.
994   auto offset_or =
995       document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
996   if (!offset_or.ok()) {
997     ICING_LOG(ERROR) << offset_or.status().error_message()
998                      << "Failed to write document";
999     return offset_or.status();
1000   }
1001   int64_t file_offset = std::move(offset_or).ValueOrDie();
1002 
1003   // Get existing document id
1004   auto old_document_id_or = GetDocumentId(name_space, uri);
1005   if (!old_document_id_or.ok() &&
1006       !absl_ports::IsNotFound(old_document_id_or.status())) {
1007     return absl_ports::InternalError("Failed to read from key mapper");
1008   }
1009 
1010   // Creates a new document id, updates key mapper and document_id mapper
1011   DocumentId new_document_id = document_id_mapper_->num_elements();
1012   if (!IsDocumentIdValid(new_document_id)) {
1013     return absl_ports::ResourceExhaustedError(
1014         "Exceeded maximum number of documents. Try calling Optimize to reclaim "
1015         "some space.");
1016   }
1017 
1018   // Update namespace maps
1019   ICING_ASSIGN_OR_RETURN(
1020       NamespaceId namespace_id,
1021       namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
1022 
1023   // Updates key mapper and document_id mapper
1024   ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
1025       MakeFingerprint(namespace_id, name_space, uri), new_document_id));
1026   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
1027 
1028   // Update corpus maps
1029   ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
1030                          corpus_mapper_->GetOrPut(
1031                              MakeFingerprint(namespace_id, name_space, schema),
1032                              corpus_mapper_->num_keys()));
1033 
1034   ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
1035                          GetCorpusAssociatedScoreDataToUpdate(corpusId));
1036   scoring_data.AddDocument(length_in_tokens);
1037 
1038   ICING_RETURN_IF_ERROR(
1039       UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
1040 
1041   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
1042       new_document_id,
1043       DocumentAssociatedScoreData(corpusId, document_score,
1044                                   creation_timestamp_ms, length_in_tokens)));
1045 
1046   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1047                          schema_store_->GetSchemaTypeId(schema));
1048 
1049   ICING_RETURN_IF_ERROR(UpdateFilterCache(
1050       new_document_id, DocumentFilterData(namespace_id, schema_type_id,
1051                                           expiration_timestamp_ms)));
1052 
1053   if (old_document_id_or.ok()) {
1054     // The old document exists, copy over the usage scores and delete the old
1055     // document.
1056     DocumentId old_document_id = old_document_id_or.ValueOrDie();
1057 
1058     ICING_RETURN_IF_ERROR(
1059         usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
1060                                        /*to_document_id=*/new_document_id));
1061 
1062     // Delete the old document. It's fine if it's not found since it might have
1063     // been deleted previously.
1064     auto delete_status =
1065         Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
1066     if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1067       // Real error, pass it up.
1068       return delete_status;
1069     }
1070   }
1071 
1072   if (put_document_stats != nullptr) {
1073     put_document_stats->set_document_store_latency_ms(
1074         put_timer->GetElapsedMilliseconds());
1075   }
1076 
1077   return new_document_id;
1078 }
1079 
1080 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1081     const std::string_view name_space, const std::string_view uri,
1082     bool clear_internal_fields) const {
1083   // TODO(b/147231617): Make a better way to replace the error message in an
1084   // existing Status.
1085   auto document_id_or = GetDocumentId(name_space, uri);
1086   if (!document_id_or.ok()) {
1087     if (absl_ports::IsNotFound(document_id_or.status())) {
1088       ICING_VLOG(1) << document_id_or.status().error_message();
1089       return absl_ports::NotFoundError(absl_ports::StrCat(
1090           "Document (", name_space, ", ", uri, ") not found."));
1091     }
1092 
1093     // Real error. Log it in error level and pass it up.
1094     ICING_LOG(ERROR) << document_id_or.status().error_message();
1095     return std::move(document_id_or).status();
1096   }
1097   DocumentId document_id = document_id_or.ValueOrDie();
1098 
1099   // TODO(b/147231617): Make a better way to replace the error message in an
1100   // existing Status.
1101   auto status_or = Get(document_id, clear_internal_fields);
1102   if (!status_or.ok()) {
1103     if (absl_ports::IsNotFound(status_or.status())) {
1104       ICING_VLOG(1) << status_or.status().error_message();
1105       return absl_ports::NotFoundError(absl_ports::StrCat(
1106           "Document (", name_space, ", ", uri, ") not found."));
1107     }
1108 
1109     // Real error. Log it in error level.
1110     ICING_LOG(ERROR) << status_or.status().error_message();
1111   }
1112   return status_or;
1113 }
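// A minimal lookup sketch (hypothetical namespace and uri values):
//
//   libtextclassifier3::StatusOr<DocumentProto> document_or =
//       doc_store->Get("namespace1", "uri1", /*clear_internal_fields=*/true);
//   if (absl_ports::IsNotFound(document_or.status())) {
//     // Never Put, already deleted, or expired.
//   }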
1114 
1115 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1116     DocumentId document_id, bool clear_internal_fields) const {
1117   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1118   auto document_filter_data_optional_ =
1119       GetAliveDocumentFilterData(document_id, current_time_ms);
1120   if (!document_filter_data_optional_) {
1121     // The document doesn't exist. If the document id is invalid, we return an
1122     // InvalidArgumentError; otherwise we return a NOT_FOUND
1123     // error.
1124     if (!IsDocumentIdValid(document_id)) {
1125       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1126           "Document id '%d' invalid.", document_id));
1127     }
1128     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1129         "Document id '%d' doesn't exist", document_id));
1130   }
1131 
1132   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1133   if (!document_log_offset_or.ok()) {
1134     // Since we've just checked that our document_id is valid a few lines
1135     // above, there's no reason this should fail and an error should never
1136     // happen.
1137     return absl_ports::InternalError("Failed to find document offset.");
1138   }
1139   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1140 
1141   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1142   // that can support error logging.
1143   auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
1144   if (!document_wrapper_or.ok()) {
1145     ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
1146                      << "Failed to read from document log";
1147     return document_wrapper_or.status();
1148   }
1149   DocumentWrapper document_wrapper =
1150       std::move(document_wrapper_or).ValueOrDie();
1151   if (clear_internal_fields) {
1152     document_wrapper.mutable_document()->clear_internal_fields();
1153   }
1154 
1155   return std::move(*document_wrapper.mutable_document());
1156 }
1157 
1158 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1159     const std::string_view name_space, const std::string_view uri) const {
1160   auto namespace_id_or = namespace_mapper_->Get(name_space);
1161   libtextclassifier3::Status status = namespace_id_or.status();
1162   if (status.ok()) {
1163     NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1164     auto document_id_or = document_key_mapper_->Get(
1165         MakeFingerprint(namespace_id, name_space, uri));
1166     status = document_id_or.status();
1167     if (status.ok()) {
1168       // Guaranteed to have a DocumentId now
1169       return document_id_or.ValueOrDie();
1170     }
1171   }
1172   return absl_ports::Annotate(
1173       status, absl_ports::StrCat(
1174                   "Failed to find DocumentId by key: ", name_space, ", ", uri));
1175 }
1176 
1177 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1178     const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
1179     const {
1180   if (!namespace_id_fingerprint_) {
1181     return absl_ports::FailedPreconditionError(
1182         "Cannot look up document id by namespace id + fingerprint without "
1183         "enabling it on uri_mapper");
1184   }
1185 
1186   auto document_id_or = document_key_mapper_->Get(
1187       namespace_fingerprint_identifier.EncodeToCString());
1188   if (document_id_or.ok()) {
1189     return document_id_or.ValueOrDie();
1190   }
1191   return absl_ports::Annotate(
1192       std::move(document_id_or).status(),
1193       "Failed to find DocumentId by namespace id + fingerprint");
1194 }
1195 
1196 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
1197   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1198       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1199 
1200   std::unordered_set<NamespaceId> existing_namespace_ids;
1201   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1202   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1203        ++document_id) {
1204     // filter_cache_->Get can only fail if document_id is < 0
1205     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1206     auto status_or_data = filter_cache_->Get(document_id);
1207     if (!status_or_data.ok()) {
1208       ICING_LOG(ERROR)
1209           << "Error while iterating over filter cache in GetAllNamespaces";
1210       return std::vector<std::string>();
1211     }
1212     const DocumentFilterData* data = status_or_data.ValueOrDie();
1213 
1214     if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
1215       existing_namespace_ids.insert(data->namespace_id());
1216     }
1217   }
1218 
1219   std::vector<std::string> existing_namespaces;
1220   for (auto itr = existing_namespace_ids.begin();
1221        itr != existing_namespace_ids.end(); ++itr) {
1222     existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
1223   }
1224   return existing_namespaces;
1225 }
1226 
1227 std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
1228     DocumentId document_id, int64_t current_time_ms) const {
1229   if (IsDeleted(document_id)) {
1230     return std::nullopt;
1231   }
1232   return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
1233 }
1234 
1235 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1236   auto file_offset_or = document_id_mapper_->Get(document_id);
1237   if (!file_offset_or.ok()) {
1238     // This would only happen if document_id is out of range of the
1239     // document_id_mapper, meaning we got some invalid document_id. Callers
1240     // should already have checked that their document_id is valid or used
1241     // DoesDocumentExist(WithStatus). Regardless, return true since the
1242     // document doesn't exist.
1243     return true;
1244   }
1245   int64_t file_offset = *file_offset_or.ValueOrDie();
1246   return file_offset == kDocDeletedFlag;
1247 }
1248 
1249 // Returns DocumentFilterData if the document is not expired. Otherwise,
1250 // std::nullopt.
1251 std::optional<DocumentFilterData>
1252 DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
1253                                                int64_t current_time_ms) const {
1254   auto filter_data_or = filter_cache_->GetCopy(document_id);
1255   if (!filter_data_or.ok()) {
1256     // This would only happen if document_id is out of range of the
1257     // filter_cache, meaning we got some invalid document_id. Callers should
1258     // already have checked that their document_id is valid or used
1259     // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since
1260     // the document doesn't exist.
1261     return std::nullopt;
1262   }
1263   DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
1264 
1265   // Check if it's past the expiration time
1266   if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
1267     return std::nullopt;
1268   }
1269   return document_filter_data;
1270 }
1271 
1272 libtextclassifier3::Status DocumentStore::Delete(
1273     const std::string_view name_space, const std::string_view uri,
1274     int64_t current_time_ms) {
1275   // Try to get the DocumentId first
1276   auto document_id_or = GetDocumentId(name_space, uri);
1277   if (!document_id_or.ok()) {
1278     return absl_ports::Annotate(
1279         document_id_or.status(),
1280         absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1281                            ", uri: ", uri));
1282   }
1283   return Delete(document_id_or.ValueOrDie(), current_time_ms);
1284 }
1285 
1286 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1287                                                  int64_t current_time_ms) {
1288   auto document_filter_data_optional_ =
1289       GetAliveDocumentFilterData(document_id, current_time_ms);
1290   if (!document_filter_data_optional_) {
1291     // The document doesn't exist. We should return InvalidArgumentError if the
1292     // document id is invalid. Otherwise we should return NOT_FOUND error.
1293     if (!IsDocumentIdValid(document_id)) {
1294       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1295           "Document id '%d' invalid.", document_id));
1296     }
1297     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1298         "Document id '%d' doesn't exist", document_id));
1299   }
1300 
1301   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1302   if (!document_log_offset_or.ok()) {
1303     return absl_ports::InternalError("Failed to find document offset.");
1304   }
1305   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1306 
1307   // Erases document proto.
1308   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1309   return ClearDerivedData(document_id);
1310 }
1311 
1312 libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
1313     std::string_view name_space) const {
1314   return namespace_mapper_->Get(name_space);
1315 }
1316 
1317 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1318     const std::string_view name_space, const std::string_view schema) const {
1319   ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
1320                          namespace_mapper_->Get(name_space));
1321   return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
1322 }
1323 
1324 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1325     ResultSpecProto::ResultGroupingType result_group_type,
1326     const std::string_view name_space, const std::string_view schema) const {
1327   auto namespace_id = GetNamespaceId(name_space);
1328   auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
1329   switch (result_group_type) {
1330     case ResultSpecProto::NONE:
1331       return absl_ports::InvalidArgumentError(
1332           "Cannot group by ResultSpecProto::NONE");
1333     case ResultSpecProto::SCHEMA_TYPE:
1334       if (schema_type_id.ok()) {
1335         return schema_type_id.ValueOrDie();
1336       }
1337       break;
1338     case ResultSpecProto::NAMESPACE:
1339       if (namespace_id.ok()) {
1340         return namespace_id.ValueOrDie();
1341       }
1342       break;
1343     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1344       if (namespace_id.ok() && schema_type_id.ok()) {
1345         // TODO(b/258715421): Temporary workaround to get a
1346         //                    ResultGroupingEntryId given the Namespace string
1347         //                    and Schema string.
1348         return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
1349       }
1350       break;
1351   }
1352   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1353 }
1354 
1355 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1356     ResultSpecProto::ResultGroupingType result_group_type,
1357     const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
1358   switch (result_group_type) {
1359     case ResultSpecProto::NONE:
1360       return absl_ports::InvalidArgumentError(
1361           "Cannot group by ResultSpecProto::NONE");
1362     case ResultSpecProto::SCHEMA_TYPE:
1363       return schema_type_id;
1364     case ResultSpecProto::NAMESPACE:
1365       return namespace_id;
1366     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1367       // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
1368       //                    given the Namespace Id and SchemaType Id.
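      // Illustrative packing: with namespace_id = 3 and schema_type_id = 5,
      // this evaluates to (3 << 16) | 5 = 196613, i.e. the namespace id lands
      // in the upper bits and the schema type id in the lower 16 bits.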
1369       return namespace_id << 16 | schema_type_id;
1370   }
1371   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1372 }
1373 
1374 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
1375 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1376   auto score_data_or = score_cache_->GetCopy(document_id);
1377   if (!score_data_or.ok()) {
1378     ICING_LOG(ERROR) << "Failed to access DocumentId " << document_id
1379                      << " from score_cache_: " << score_data_or.status().error_message();
1380     return absl_ports::NotFoundError(
1381         std::move(score_data_or).status().error_message());
1382   }
1383 
1384   DocumentAssociatedScoreData document_associated_score_data =
1385       std::move(score_data_or).ValueOrDie();
1386   return document_associated_score_data;
1387 }
1388 
1389 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1390 DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
1391   return corpus_score_cache_->GetCopy(corpus_id);
1392 }
1393 
1394 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1395 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1396   auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1397   if (!corpus_scoring_data_or.ok() &&
1398       absl_ports::IsOutOfRange(corpus_scoring_data_or.status())) {
1399     // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1400     // corpus_score_cache_ for the first time. Return a default
1401     // CorpusAssociatedScoreData object in this case.
1402     return CorpusAssociatedScoreData();
1403   }
1404 
1405   return corpus_scoring_data_or;
1406 }
1407 
1408 // TODO(b/273826815): Decide on and adopt a consistent pattern for handling
1409 // NOT_FOUND 'errors' returned by our internal classes.
1410 std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
1411     DocumentId document_id, int64_t current_time_ms) const {
1412   std::optional<DocumentFilterData> opt =
1413       GetAliveDocumentFilterData(document_id, current_time_ms);
1414   if (!opt) {
1415     return std::nullopt;
1416   }
1417   if (document_id >= usage_store_->num_elements()) {
1418     return std::nullopt;
1419   }
1420   auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1421   if (!usage_scores_or.ok()) {
1422     ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
1423                      << usage_scores_or.status().error_message();
1424     return std::nullopt;
1425   }
1426   return std::move(usage_scores_or).ValueOrDie();
1427 }
1428 
1429 libtextclassifier3::Status DocumentStore::ReportUsage(
1430     const UsageReport& usage_report) {
1431   ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1432                          GetDocumentId(usage_report.document_namespace(),
1433                                        usage_report.document_uri()));
1434   // We can use the internal version here because we got our document_id from
1435   // our internal data structures. GetDocumentId would already have returned
1436   // an error if the namespace and/or uri were incorrect.
1437   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1438   if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1439     // Document was probably deleted or expired.
1440     return absl_ports::NotFoundError(absl_ports::StrCat(
1441         "Couldn't report usage on a nonexistent document: (namespace: '",
1442         usage_report.document_namespace(), "', uri: '",
1443         usage_report.document_uri(), "')"));
1444   }
1445 
1446   return usage_store_->AddUsageReport(usage_report, document_id);
1447 }
1448 
1449 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1450     std::string_view name_space) {
1451   DeleteByGroupResult result;
1452   auto namespace_id_or = namespace_mapper_->Get(name_space);
1453   if (!namespace_id_or.ok()) {
1454     result.status = absl_ports::Annotate(
1455         namespace_id_or.status(),
1456         absl_ports::StrCat("Failed to find namespace: ", name_space));
1457     return result;
1458   }
1459   NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1460   auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1461   if (!num_deleted_or.ok()) {
1462     result.status = std::move(num_deleted_or).status();
1463     return result;
1464   }
1465 
1466   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1467   if (result.num_docs_deleted <= 0) {
1468     // Treat the case where no existing documents have this namespace the
1469     // same as the namespace not existing at all.
1470     result.status = absl_ports::NotFoundError(
1471         absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1472     return result;
1473   }
1474 
1475   return result;
1476 }
1477 
1478 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1479     std::string_view schema_type) {
1480   DeleteByGroupResult result;
1481   auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1482   if (!schema_type_id_or.ok()) {
1483     result.status = absl_ports::Annotate(
1484         schema_type_id_or.status(),
1485         absl_ports::StrCat("Failed to find schema type. schema_type: ",
1486                            schema_type));
1487     return result;
1488   }
1489   SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1490   auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1491   if (!num_deleted_or.ok()) {
1492     result.status = std::move(num_deleted_or).status();
1493     return result;
1494   }
1495 
1496   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1497   if (result.num_docs_deleted <= 0) {
1498     result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1499         "No documents found with schema type '", schema_type, "'"));
1500     return result;
1501   }
1502 
1503   return result;
1504 }
1505 
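// Note on sentinels: BatchDelete treats kInvalidNamespaceId and
// kInvalidSchemaTypeId as "match anything" for that dimension.
// DeleteByNamespace() above passes kInvalidSchemaTypeId so every schema type
// matches, and DeleteBySchemaType() passes kInvalidNamespaceId so every
// namespace matches; only the valid id participates in the filtering below.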
1506 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1507     NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1508   // Counts the existing documents matching the given namespace and/or schema
1509   // type that we mark as deleted.
1510   int num_updated_documents = 0;
1511 
1512   // Traverse FilterCache and delete all docs that match namespace_id and
1513   // schema_type_id.
1514   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1515   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1516        ++document_id) {
1517     // filter_cache_->Get can only fail if document_id is < 0
1518     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1519     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1520                            filter_cache_->Get(document_id));
1521 
1522     // Check namespace only when the input namespace id is valid.
1523     if (namespace_id != kInvalidNamespaceId &&
1524         (data->namespace_id() == kInvalidNamespaceId ||
1525          data->namespace_id() != namespace_id)) {
1526       // The document has already been hard-deleted or isn't from the desired
1527       // namespace.
1528       continue;
1529     }
1530 
1531     // Check schema type only when the input schema type id is valid.
1532     if (schema_type_id != kInvalidSchemaTypeId &&
1533         (data->schema_type_id() == kInvalidSchemaTypeId ||
1534          data->schema_type_id() != schema_type_id)) {
1535       // The document has already been hard-deleted or doesn't have the
1536       // desired schema type.
1537       continue;
1538     }
1539 
1540     // The document has the desired namespace and schema type; it either
1541     // exists or has expired.
1542     libtextclassifier3::Status delete_status =
1543         Delete(document_id, current_time_ms);
1544     if (absl_ports::IsNotFound(delete_status)) {
1545       continue;
1546     } else if (!delete_status.ok()) {
1547       // Real error, pass up.
1548       return delete_status;
1549     }
1550     ++num_updated_documents;
1551   }
1552 
1553   return num_updated_documents;
1554 }
1555 
1556 libtextclassifier3::Status DocumentStore::PersistToDisk(
1557     PersistType::Code persist_type) {
1558   if (persist_type == PersistType::LITE) {
1559     // Only persist the document log.
1560     return document_log_->PersistToDisk();
1561   }
1562   ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1563   ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1564   ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1565   ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1566   ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1567   ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1568   ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1569   ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1570   ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1571 
1572   // Update the combined checksum and write to header file.
1573   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1574   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
1575 
1576   return libtextclassifier3::Status::OK;
1577 }
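// Usage sketch (illustrative only; `doc_store` is an assumed, already-created
// DocumentStore instance):
//   doc_store->PersistToDisk(PersistType::LITE);  // flush just the doc log
//   doc_store->PersistToDisk(PersistType::FULL);  // flush everything and
//                                                 // rewrite the checksum header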
1578 
1579 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1580                           int64_t default_value) {
1581   return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1582 }
1583 
1584 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1585   DocumentStorageInfoProto storage_info;
1586   storage_info.set_document_log_size(
1587       GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1588   storage_info.set_key_mapper_size(
1589       GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1590   storage_info.set_document_id_mapper_size(
1591       GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1592   storage_info.set_score_cache_size(
1593       GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1594   storage_info.set_filter_cache_size(
1595       GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1596   storage_info.set_namespace_id_mapper_size(
1597       GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1598   storage_info.set_corpus_mapper_size(
1599       GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1600   storage_info.set_corpus_score_cache_size(
1601       GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1602   return storage_info;
1603 }
1604 
1605 DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
1606     DocumentStorageInfoProto storage_info) const {
1607   int total_num_alive = 0;
1608   int total_num_expired = 0;
1609   int total_num_deleted = 0;
1610   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1611       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1612   std::unordered_map<std::string, NamespaceStorageInfoProto>
1613       namespace_to_storage_info;
1614 
1615   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1616   for (DocumentId document_id = 0;
1617        document_id < document_id_mapper_->num_elements(); ++document_id) {
1618     // Check if it's deleted first.
1619     if (IsDeleted(document_id)) {
1620       // We don't have the namespace id of hard deleted documents anymore, so
1621       // we can't add to our namespace storage info.
1622       ++total_num_deleted;
1623       continue;
1624     }
1625 
1626     // At this point, the document is either alive or expired; we can get
1627     // namespace info for it.
1628     auto filter_data_or = filter_cache_->Get(document_id);
1629     if (!filter_data_or.ok()) {
1630       ICING_VLOG(1) << "Error trying to get filter data for document store "
1631                        "storage info counts.";
1632       continue;
1633     }
1634     const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1635     auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
1636     if (itr == namespace_id_to_namespace.end()) {
1637       ICING_VLOG(1) << "Error trying to find namespace for document store "
1638                        "storage info counts.";
1639       continue;
1640     }
1641     const std::string& name_space = itr->second;
1642 
1643     // Always set the namespace. If the NamespaceStorageInfoProto didn't exist
1644     // before, operator[] gives us a default instance of it.
1645     NamespaceStorageInfoProto& namespace_storage_info =
1646         namespace_to_storage_info[name_space];
1647     namespace_storage_info.set_namespace_(name_space);
1648 
1649     // Get usage scores
1650     auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1651     if (!usage_scores_or.ok()) {
1652       ICING_VLOG(1) << "Error trying to get usage scores for document store "
1653                        "storage info counts.";
1654       continue;
1655     }
1656     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
1657 
1658     // Update our stats
1659     if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
1660       ++total_num_expired;
1661       namespace_storage_info.set_num_expired_documents(
1662           namespace_storage_info.num_expired_documents() + 1);
1663       if (usage_scores.usage_type1_count > 0) {
1664         namespace_storage_info.set_num_expired_documents_usage_type1(
1665             namespace_storage_info.num_expired_documents_usage_type1() + 1);
1666       }
1667       if (usage_scores.usage_type2_count > 0) {
1668         namespace_storage_info.set_num_expired_documents_usage_type2(
1669             namespace_storage_info.num_expired_documents_usage_type2() + 1);
1670       }
1671       if (usage_scores.usage_type3_count > 0) {
1672         namespace_storage_info.set_num_expired_documents_usage_type3(
1673             namespace_storage_info.num_expired_documents_usage_type3() + 1);
1674       }
1675     } else {
1676       ++total_num_alive;
1677       namespace_storage_info.set_num_alive_documents(
1678           namespace_storage_info.num_alive_documents() + 1);
1679       if (usage_scores.usage_type1_count > 0) {
1680         namespace_storage_info.set_num_alive_documents_usage_type1(
1681             namespace_storage_info.num_alive_documents_usage_type1() + 1);
1682       }
1683       if (usage_scores.usage_type2_count > 0) {
1684         namespace_storage_info.set_num_alive_documents_usage_type2(
1685             namespace_storage_info.num_alive_documents_usage_type2() + 1);
1686       }
1687       if (usage_scores.usage_type3_count > 0) {
1688         namespace_storage_info.set_num_alive_documents_usage_type3(
1689             namespace_storage_info.num_alive_documents_usage_type3() + 1);
1690       }
1691     }
1692   }
1693 
1694   for (auto& itr : namespace_to_storage_info) {
1695     storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
1696   }
1697   storage_info.set_num_alive_documents(total_num_alive);
1698   storage_info.set_num_deleted_documents(total_num_deleted);
1699   storage_info.set_num_expired_documents(total_num_expired);
1700   return storage_info;
1701 }
1702 
1703 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1704   DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1705   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1706   if (directory_size != Filesystem::kBadFileSize) {
1707     storage_info.set_document_store_size(directory_size);
1708   } else {
1709     storage_info.set_document_store_size(-1);
1710   }
1711   storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1712   return CalculateDocumentStatusCounts(std::move(storage_info));
1713 }
1714 
1715 libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
1716     const SchemaStore* schema_store) {
1717   // Update all references to the SchemaStore
1718   schema_store_ = schema_store;
1719   document_validator_.UpdateSchemaStore(schema_store);
1720 
1721   int size = document_id_mapper_->num_elements();
1722   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1723   for (DocumentId document_id = 0; document_id < size; document_id++) {
1724     auto document_or = Get(document_id);
1725     if (absl_ports::IsNotFound(document_or.status())) {
1726       // Skip nonexistent documents
1727       continue;
1728     } else if (!document_or.ok()) {
1729       // Real error, pass up
1730       return absl_ports::Annotate(
1731           document_or.status(),
1732           IcingStringUtil::StringPrintf(
1733               "Failed to retrieve Document for DocumentId %d", document_id));
1734     }
1735 
1736     // Guaranteed to have a document now.
1737     DocumentProto document = document_or.ValueOrDie();
1738 
1739     // Revalidate that this document is still compatible
1740     if (document_validator_.Validate(document).ok()) {
1741       // Update the SchemaTypeId for this entry
1742       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1743                              schema_store_->GetSchemaTypeId(document.schema()));
1744       ICING_ASSIGN_OR_RETURN(
1745           typename FileBackedVector<DocumentFilterData>::MutableView
1746               doc_filter_data_view,
1747           filter_cache_->GetMutable(document_id));
1748       doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
1749     } else {
1750       // Document is no longer valid with the new SchemaStore. Mark as
1751       // deleted
1752       auto delete_status =
1753           Delete(document.namespace_(), document.uri(), current_time_ms);
1754       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1755         // Real error, pass up
1756         return delete_status;
1757       }
1758     }
1759   }
1760 
1761   return libtextclassifier3::Status::OK;
1762 }
1763 
1764 libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
1765     const SchemaStore* schema_store,
1766     const SchemaStore::SetSchemaResult& set_schema_result) {
1767   if (!set_schema_result.success) {
1768     // No new schema was set, no work to be done
1769     return libtextclassifier3::Status::OK;
1770   }
1771 
1772   // Update all references to the SchemaStore
1773   schema_store_ = schema_store;
1774   document_validator_.UpdateSchemaStore(schema_store);
1775 
1776   int size = document_id_mapper_->num_elements();
1777   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1778   for (DocumentId document_id = 0; document_id < size; document_id++) {
1779     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1780       // Skip nonexistent documents
1781       continue;
1782     }
1783 
1784     // Guaranteed that the document exists now.
1785     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
1786                            filter_cache_->Get(document_id));
1787 
1788     bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
1789                                filter_data->schema_type_id()) != 0;
1790 
1791     // Check if we need to update the FilterCache entry for this document. It
1792     // may have been assigned a different SchemaTypeId in the new SchemaStore.
1793     bool update_filter_cache =
1794         set_schema_result.old_schema_type_ids_changed.count(
1795             filter_data->schema_type_id()) != 0;
1796 
1797     // Check if we need to revalidate this document because its schema type
1798     // was marked incompatible by the schema change.
1799     bool revalidate_document =
1800         set_schema_result.schema_types_incompatible_by_id.count(
1801             filter_data->schema_type_id()) != 0;
1802 
1803     if (update_filter_cache || revalidate_document) {
1804       ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
1805 
1806       if (update_filter_cache) {
1807         ICING_ASSIGN_OR_RETURN(
1808             SchemaTypeId schema_type_id,
1809             schema_store_->GetSchemaTypeId(document.schema()));
1810         ICING_ASSIGN_OR_RETURN(
1811             typename FileBackedVector<DocumentFilterData>::MutableView
1812                 doc_filter_data_view,
1813             filter_cache_->GetMutable(document_id));
1814         doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
1815       }
1816       if (revalidate_document) {
1817         delete_document = !document_validator_.Validate(document).ok();
1818       }
1819     }
1820 
1821     if (delete_document) {
1822       // Document is no longer valid with the new SchemaStore. Mark as deleted
1823       auto delete_status = Delete(document_id, current_time_ms);
1824       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1825         // Real error, pass up
1826         return delete_status;
1827       }
1828     }
1829   }
1830 
1831   return libtextclassifier3::Status::OK;
1832 }
1833 
1834 // TODO(b/121227117): Implement Optimize()
1835 libtextclassifier3::Status DocumentStore::Optimize() {
1836   return libtextclassifier3::Status::OK;
1837 }
1838 
1839 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
1840 DocumentStore::OptimizeInto(const std::string& new_directory,
1841                             const LanguageSegmenter* lang_segmenter,
1842                             OptimizeStatsProto* stats) const {
1843   // Validates directory
1844   if (new_directory == base_dir_) {
1845     return absl_ports::InvalidArgumentError(
1846         "New directory is the same as the current one.");
1847   }
1848 
1849   ICING_ASSIGN_OR_RETURN(
1850       auto doc_store_create_result,
1851       DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_,
1852                             /*force_recovery_and_revalidate_documents=*/false,
1853                             namespace_id_fingerprint_, pre_mapping_fbv_,
1854                             use_persistent_hash_map_, compression_level_,
1855                             /*initialize_stats=*/nullptr));
1856   std::unique_ptr<DocumentStore> new_doc_store =
1857       std::move(doc_store_create_result.document_store);
1858 
1859   // Writes all valid docs into new document store (new directory)
1860   int document_cnt = document_id_mapper_->num_elements();
1861   int num_deleted_documents = 0;
1862   int num_expired_documents = 0;
1863   UsageStore::UsageScores default_usage;
1864 
1865   OptimizeResult result;
1866   result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
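  // Entries that remain kInvalidDocumentId after the loop below correspond to
  // old documents that were deleted or expired and therefore have no
  // counterpart in the new document store.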
1867   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1868   for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
1869     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
1870     if (absl_ports::IsNotFound(document_or.status())) {
1871       if (IsDeleted(document_id)) {
1872         ++num_deleted_documents;
1873       } else if (!GetNonExpiredDocumentFilterData(document_id,
1874                                                   current_time_ms)) {
1875         ++num_expired_documents;
1876       }
1877       continue;
1878     } else if (!document_or.ok()) {
1879       // Real error, pass up
1880       return absl_ports::Annotate(
1881           document_or.status(),
1882           IcingStringUtil::StringPrintf(
1883               "Failed to retrieve Document for DocumentId %d", document_id));
1884     }
1885 
1886     // Guaranteed to have a document now.
1887     DocumentProto document_to_keep = std::move(document_or).ValueOrDie();
1888 
1889     libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
1890     if (document_to_keep.internal_fields().length_in_tokens() == 0) {
1891       auto tokenized_document_or = TokenizedDocument::Create(
1892           schema_store_, lang_segmenter, document_to_keep);
1893       if (!tokenized_document_or.ok()) {
1894         return absl_ports::Annotate(
1895             tokenized_document_or.status(),
1896             IcingStringUtil::StringPrintf(
1897                 "Failed to tokenize Document for DocumentId %d", document_id));
1898       }
1899       TokenizedDocument tokenized_document(
1900           std::move(tokenized_document_or).ValueOrDie());
1901       new_document_id_or = new_doc_store->Put(
1902           std::move(document_to_keep), tokenized_document.num_string_tokens());
1903     } else {
1904       // TODO(b/144458732): Implement a more robust version of
1905       // TC_ASSIGN_OR_RETURN that can support error logging.
1906       new_document_id_or =
1907           new_doc_store->InternalPut(std::move(document_to_keep));
1908     }
1909     if (!new_document_id_or.ok()) {
1910       ICING_LOG(ERROR) << "Failed to write into new document store: "
1911                        << new_document_id_or.status().error_message();
1912       return new_document_id_or.status();
1913     }
1914 
1915     result.document_id_old_to_new[document_id] =
1916         new_document_id_or.ValueOrDie();
1917 
1918     // Copy over usage scores.
1919     ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
1920                            usage_store_->GetUsageScores(document_id));
1921     if (!(usage_scores == default_usage)) {
1922       // Only copy usage scores that differ from the default (no usage).
1923       // Skipping default scores avoids allocating storage in the new store
1924       // when there's nothing interesting to keep.
1925       DocumentId new_document_id = new_document_id_or.ValueOrDie();
1926       ICING_RETURN_IF_ERROR(
1927           new_doc_store->SetUsageScores(new_document_id, usage_scores));
1928     }
1929   }
1930 
1931   // Construct namespace_id_old_to_new
1932   int namespace_cnt = namespace_mapper_->num_keys();
1933   std::unordered_map<NamespaceId, std::string> old_namespaces =
1934       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1935   if (namespace_cnt != old_namespaces.size()) {
1936     // This really shouldn't happen. If it really happens, then:
1937     // - It won't block DocumentStore optimization, so don't return error here.
1938     // - Instead, write a warning log here and hint the caller to rebuild index.
1939     ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
1940                        << " vs " << old_namespaces.size();
1941     result.should_rebuild_index = true;
1942   } else {
1943     result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
1944     for (const auto& [old_namespace_id, ns] : old_namespaces) {
1945       if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
1946         // This really shouldn't happen. If it really happens, then:
1947         // - It won't block DocumentStore optimization, so don't return error
1948         //   here.
1949         // - Instead, write a warning log here and hint the caller to rebuild
1950         //   index.
1951         ICING_LOG(WARNING) << "Found unexpected namespace id "
1952                            << old_namespace_id << ". Should be in range 0 to "
1953                            << result.namespace_id_old_to_new.size()
1954                            << " (exclusive).";
1955         result.namespace_id_old_to_new.clear();
1956         result.should_rebuild_index = true;
1957         break;
1958       }
1959 
1960       auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
1961       if (!new_namespace_id_or.ok()) {
1962         if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
1963           continue;
1964         }
1965         // Real error, return it.
1966         return std::move(new_namespace_id_or).status();
1967       }
1968 
1969       NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
1970       // Safe to assign via operator[] given that we've checked the range above.
1971       result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
1972     }
1973   }
1974 
1975   if (stats != nullptr) {
1976     stats->set_num_original_documents(document_cnt);
1977     stats->set_num_deleted_documents(num_deleted_documents);
1978     stats->set_num_expired_documents(num_expired_documents);
1979     stats->set_num_original_namespaces(namespace_cnt);
1980     stats->set_num_deleted_namespaces(
1981         namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
1982   }
1983   ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
1984   return result;
1985 }
1986 
1987 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
1988 DocumentStore::GetOptimizeInfo() const {
1989   OptimizeInfo optimize_info;
1990 
1991   // Figure out our ratio of optimizable/total docs.
1992   int32_t num_documents = document_id_mapper_->num_elements();
1993   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1994   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
1995        ++document_id) {
1996     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1997       ++optimize_info.optimizable_docs;
1998     }
1999 
2000     ++optimize_info.total_docs;
2001   }
2002 
2003   if (optimize_info.total_docs == 0) {
2004     // Can exit early since there's nothing to calculate.
2005     return optimize_info;
2006   }
2007 
2008   // Get the total element size.
2009   //
2010   // We use file size instead of disk usage here because the files are not
2011   // sparse, so it's more accurate. Disk usage rounds up to the nearest block
2012   // size.
2013   ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
2014                          document_log_->GetElementsFileSize());
2015   ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
2016                          document_id_mapper_->GetElementsFileSize());
2017   ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
2018                          score_cache_->GetElementsFileSize());
2019   ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
2020                          filter_cache_->GetElementsFileSize());
2021   ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
2022                          corpus_score_cache_->GetElementsFileSize());
2023 
2024   // Usage store might be sparse, but we'll still use file size for more
2025   // accurate counting.
2026   ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
2027                          usage_store_->GetElementsFileSize());
2028 
2029   // We use a combined disk usage and file size for the DynamicTrieKeyMapper
2030   // because it's backed by a trie, which has some sparse property bitmaps.
2031   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
2032                          document_key_mapper_->GetElementsSize());
2033 
2034   // We don't include the namespace_mapper or the corpus_mapper because it's
2035   // not clear if we could recover any space even if Optimize were called.
2036   // Deleting 100s of documents could still leave a few documents of a
2037   // namespace, and then there would be no change.
2038 
2039   int64_t total_size = document_log_file_size + document_key_mapper_size +
2040                        document_id_mapper_file_size + score_cache_file_size +
2041                        filter_cache_file_size + corpus_score_cache_file_size +
2042                        usage_store_file_size;
2043 
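  // Rough estimate: scale the combined file size by the fraction of documents
  // that are optimizable. Illustrative numbers: if the files above total 1000
  // bytes and 25 of the 100 documents are deleted or expired, the estimate
  // below is 1000 * 25 / 100 = 250 bytes.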
2044   optimize_info.estimated_optimizable_bytes =
2045       total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
2046   return optimize_info;
2047 }
2048 
2049 libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
2050     CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
2051   return corpus_score_cache_->Set(corpus_id, score_data);
2052 }
2053 
2054 libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
2055     DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
2056   return score_cache_->Set(document_id, score_data);
2057 }
2058 
2059 libtextclassifier3::Status DocumentStore::UpdateFilterCache(
2060     DocumentId document_id, const DocumentFilterData& filter_data) {
2061   return filter_cache_->Set(document_id, filter_data);
2062 }
2063 
2064 libtextclassifier3::Status DocumentStore::ClearDerivedData(
2065     DocumentId document_id) {
2066   // We intentionally leave the data in key_mapper_ because locating that data
2067   // requires fetching namespace and uri. Leaving data in key_mapper_ should
2068   // be fine because the data is hashed.
2069 
2070   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
2071 
2072   // Resets the score cache entry
2073   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
2074       document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
2075                                                /*document_score=*/-1,
2076                                                /*creation_timestamp_ms=*/-1,
2077                                                /*length_in_tokens=*/0)));
2078 
2079   // Resets the filter cache entry
2080   ICING_RETURN_IF_ERROR(UpdateFilterCache(
2081       document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
2082                                       /*expiration_timestamp_ms=*/-1)));
2083 
2084   // Clears the usage scores.
2085   return usage_store_->DeleteUsageScores(document_id);
2086 }
2087 
2088 libtextclassifier3::Status DocumentStore::SetUsageScores(
2089     DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
2090   return usage_store_->SetUsageScores(document_id, usage_scores);
2091 }
2092 
2093 libtextclassifier3::StatusOr<
2094     google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
2095 DocumentStore::CollectCorpusInfo() const {
2096   google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
2097   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
2098       schema_store_->GetSchema();
2099   if (!schema_proto_or.ok()) {
2100     return corpus_info;
2101   }
2102   // Maps from CorpusId to the corresponding protocol buffer in the result.
2103   std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
2104   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
2105       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
2106   const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
2107   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2108   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
2109        ++document_id) {
2110     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2111       continue;
2112     }
2113     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2114                            filter_cache_->Get(document_id));
2115     ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
2116                            score_cache_->Get(document_id));
2117     const std::string& name_space =
2118         namespace_id_to_namespace[filter_data->namespace_id()];
2119     const std::string& schema =
2120         schema_proto->types()[filter_data->schema_type_id()].schema_type();
2121     auto iter = info_map.find(score_data->corpus_id());
2122     if (iter == info_map.end()) {
2123       DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
2124       entry->set_namespace_(name_space);
2125       entry->set_schema(schema);
2126       iter = info_map.insert({score_data->corpus_id(), entry}).first;
2127     }
2128     iter->second->set_total_documents(iter->second->total_documents() + 1);
2129     iter->second->set_total_token(iter->second->total_token() +
2130                                   score_data->length_in_tokens());
2131   }
2132   return corpus_info;
2133 }
2134 
2135 libtextclassifier3::StatusOr<DocumentDebugInfoProto>
2136 DocumentStore::GetDebugInfo(int verbosity) const {
2137   DocumentDebugInfoProto debug_info;
2138   *debug_info.mutable_document_storage_info() = GetStorageInfo();
2139   ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
2140   debug_info.set_crc(crc.Get());
2141   if (verbosity > 0) {
2142     ICING_ASSIGN_OR_RETURN(
2143         google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
2144             corpus_info,
2145         CollectCorpusInfo());
2146     *debug_info.mutable_corpus_info() = std::move(corpus_info);
2147   }
2148   return debug_info;
2149 }
2150 
2151 }  // namespace lib
2152 }  // namespace icing
2153