1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/store/document-store.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
#include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/text_classifier/lib3/utils/hash/farmhash.h"
29 #include "icing/absl_ports/annotate.h"
30 #include "icing/absl_ports/canonical_errors.h"
31 #include "icing/absl_ports/str_cat.h"
32 #include "icing/file/file-backed-proto-log.h"
33 #include "icing/file/file-backed-vector.h"
34 #include "icing/file/filesystem.h"
35 #include "icing/file/memory-mapped-file.h"
36 #include "icing/file/portable-file-backed-proto-log.h"
37 #include "icing/legacy/core/icing-string-util.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/document_wrapper.pb.h"
40 #include "icing/proto/logging.pb.h"
41 #include "icing/proto/storage.pb.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/corpus-associated-scoring-data.h"
44 #include "icing/store/corpus-id.h"
45 #include "icing/store/document-associated-score-data.h"
46 #include "icing/store/document-filter-data.h"
47 #include "icing/store/document-id.h"
48 #include "icing/store/document-log-creator.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/store/namespace-id.h"
51 #include "icing/store/usage-store.h"
52 #include "icing/tokenization/language-segmenter.h"
53 #include "icing/util/clock.h"
54 #include "icing/util/crc32.h"
55 #include "icing/util/data-loss.h"
56 #include "icing/util/logging.h"
57 #include "icing/util/status-macros.h"
58 #include "icing/util/tokenized-document.h"
59 
60 namespace icing {
61 namespace lib {
62 
63 namespace {
64 
65 // Used in DocumentId mapper to mark a document as deleted
66 constexpr int64_t kDocDeletedFlag = -1;
67 constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
68 constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
69 constexpr char kScoreCacheFilename[] = "score_cache";
70 constexpr char kCorpusScoreCache[] = "corpus_score_cache";
71 constexpr char kFilterCacheFilename[] = "filter_cache";
72 constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
73 constexpr char kUsageStoreDirectoryName[] = "usage_store";
74 constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";
75 
76 // Determined through manual testing to allow for 1 million uris. 1 million
77 // because we allow up to 1 million DocumentIds.
78 constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024;  // 36 MiB
79 
80 // 384 KiB for a KeyMapper would allow each internal array to have a max of
81 // 128 KiB for storage.
82 constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
83 constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
84 
85 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
86   DocumentWrapper document_wrapper;
87   *document_wrapper.mutable_document() = std::move(document);
88   return document_wrapper;
89 }
90 
91 std::string MakeHeaderFilename(const std::string& base_dir) {
92   return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
93 }
94 
95 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
96   return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
97 }
98 
99 std::string MakeScoreCacheFilename(const std::string& base_dir) {
100   return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
101 }
102 
103 std::string MakeCorpusScoreCache(const std::string& base_dir) {
104   return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
105 }
106 
107 std::string MakeFilterCacheFilename(const std::string& base_dir) {
108   return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
109 }
110 
111 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
112   return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
113 }
114 
115 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
116   return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
117 }
118 
119 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
120   return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
121 }
122 
123 // TODO(adorokhine): This class internally uses an 8-byte fingerprint of the
124 // Key and stores the key/value in a file-backed-trie that adds an ~80 byte
125 // overhead per key. As we know that these fingerprints are always 8-bytes in
126 // length and that they're random, we might be able to store them more
127 // compactly.
128 std::string MakeFingerprint(std::string_view name_space, std::string_view uri) {
129   // Using a 64-bit fingerprint to represent the key could lead to collisions.
130   // But, even with 200K unique keys, the probability of collision is about
131   // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
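  // (With n keys the collision probability is roughly n^2 / 2^65; for
  // n = 200,000 that works out to about one in a billion.)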
132   uint64_t fprint =
133       tc3farmhash::Fingerprint64(absl_ports::StrCat(name_space, uri));
134 
135   std::string encoded_fprint;
136   // DynamicTrie cannot handle keys with '0' as bytes. So, we encode it in
137   // base128 and add 1 to make sure that no byte is '0'. This increases the
138   // size of the encoded_fprint from 8-bytes to 10-bytes.
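  // Each emitted byte below is ((fprint & 0x7F) + 1), i.e. in [1, 128], so a
  // low 7-bit group of 0x00 is stored as 0x01 and no byte can be '\0'. Since
  // 64 / 7 rounds up to 10, the encoded key is at most 10 bytes long.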
139   while (fprint) {
140     encoded_fprint.push_back((fprint & 0x7F) + 1);
141     fprint >>= 7;
142   }
143   return encoded_fprint;
144 }
145 
146 int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
147                                        int64_t ttl_ms) {
148   if (ttl_ms == 0) {
149     // Special case where a TTL of 0 indicates the document should never
150     // expire. int64_t max, interpreted as seconds since epoch, represents
151     // some point in the year 292,277,026,596. So we're probably ok to use
152     // this as "never reaching this point".
153     return std::numeric_limits<int64_t>::max();
154   }
155 
156   int64_t expiration_timestamp_ms;
157   if (__builtin_add_overflow(creation_timestamp_ms, ttl_ms,
158                              &expiration_timestamp_ms)) {
159     // Overflow detected. Saturate to int64_t max, the same "never expires"
160     // value used for a TTL of 0 above.
161     return std::numeric_limits<int64_t>::max();
162   }
163 
164   return expiration_timestamp_ms;
165 }
166 
167 }  // namespace
168 
169 DocumentStore::DocumentStore(const Filesystem* filesystem,
170                              const std::string_view base_dir,
171                              const Clock* clock,
172                              const SchemaStore* schema_store)
173     : filesystem_(filesystem),
174       base_dir_(base_dir),
175       clock_(*clock),
176       schema_store_(schema_store),
177       document_validator_(schema_store) {}
178 
179 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
180     const DocumentProto& document, int32_t num_tokens,
181     PutDocumentStatsProto* put_document_stats) {
182   return Put(DocumentProto(document), num_tokens, put_document_stats);
183 }
184 
185 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
186     DocumentProto&& document, int32_t num_tokens,
187     PutDocumentStatsProto* put_document_stats) {
188   document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
189   return InternalPut(document, put_document_stats);
190 }
191 
192 DocumentStore::~DocumentStore() {
193   if (initialized_) {
194     if (!PersistToDisk(PersistType::FULL).ok()) {
195       ICING_LOG(ERROR)
196           << "Error persisting to disk in DocumentStore destructor";
197     }
198   }
199 }
200 
201 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
202     const Filesystem* filesystem, const std::string& base_dir,
203     const Clock* clock, const SchemaStore* schema_store,
204     bool force_recovery_and_revalidate_documents,
205     InitializeStatsProto* initialize_stats) {
206   ICING_RETURN_ERROR_IF_NULL(filesystem);
207   ICING_RETURN_ERROR_IF_NULL(clock);
208   ICING_RETURN_ERROR_IF_NULL(schema_store);
209 
210   auto document_store = std::unique_ptr<DocumentStore>(
211       new DocumentStore(filesystem, base_dir, clock, schema_store));
212   ICING_ASSIGN_OR_RETURN(
213       DataLoss data_loss,
214       document_store->Initialize(force_recovery_and_revalidate_documents,
215                                  initialize_stats));
216 
217   CreateResult create_result;
218   create_result.document_store = std::move(document_store);
219   create_result.data_loss = data_loss;
220   return create_result;
221 }
222 
223 libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
224     bool force_recovery_and_revalidate_documents,
225     InitializeStatsProto* initialize_stats) {
226   auto create_result_or = DocumentLogCreator::Create(filesystem_, base_dir_);
227 
228   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
229   // that can support error logging.
230   if (!create_result_or.ok()) {
231     ICING_LOG(ERROR) << create_result_or.status().error_message()
232                      << "\nFailed to initialize DocumentLog.";
233     return create_result_or.status();
234   }
235   DocumentLogCreator::CreateResult create_result =
236       std::move(create_result_or).ValueOrDie();
237 
238   document_log_ = std::move(create_result.log_create_result.proto_log);
239 
240   if (create_result.regen_derived_files ||
241       force_recovery_and_revalidate_documents ||
242       create_result.log_create_result.has_data_loss()) {
243     // We can't rely on any existing derived files. Recreate them from scratch.
244     // Currently happens if:
245     //   1) This is a new log and we don't have derived files yet
246     //   2) Client wanted us to force a regeneration.
247     //   3) Log has some data loss, can't rely on existing derived data.
248     if (create_result.log_create_result.has_data_loss() &&
249         initialize_stats != nullptr) {
250       ICING_LOG(WARNING)
251           << "Data loss in document log, regenerating derived files.";
252       initialize_stats->set_document_store_recovery_cause(
253           InitializeStatsProto::DATA_LOSS);
254 
255       if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) {
256         // Ground truth is partially lost.
257         initialize_stats->set_document_store_data_status(
258             InitializeStatsProto::PARTIAL_LOSS);
259       } else {
260         // Ground truth is completely lost.
261         initialize_stats->set_document_store_data_status(
262             InitializeStatsProto::COMPLETE_LOSS);
263       }
264     }
265 
266     std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
267     libtextclassifier3::Status status =
268         RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
269     if (initialize_stats != nullptr &&
270         (force_recovery_and_revalidate_documents ||
271          create_result.log_create_result.has_data_loss())) {
272       // Only consider it a recovery if the client forced a recovery or there
273       // was data loss. Otherwise, this could just be the first time we're
274       // initializing and generating derived files.
275       initialize_stats->set_document_store_recovery_latency_ms(
276           document_recovery_timer->GetElapsedMilliseconds());
277     }
278     if (!status.ok()) {
279       ICING_LOG(ERROR)
280           << "Failed to regenerate derived files for DocumentStore";
281       return status;
282     }
283   } else {
284     if (!InitializeExistingDerivedFiles().ok()) {
285       ICING_VLOG(1)
286           << "Couldn't find derived files or failed to initialize them, "
287              "regenerating derived files for DocumentStore.";
288       std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
289       libtextclassifier3::Status status = RegenerateDerivedFiles(
290           /*force_recovery_and_revalidate_documents*/ false);
291       if (initialize_stats != nullptr && num_documents() > 0) {
292         initialize_stats->set_document_store_recovery_cause(
293             InitializeStatsProto::IO_ERROR);
294         initialize_stats->set_document_store_recovery_latency_ms(
295             document_recovery_timer->GetElapsedMilliseconds());
296       }
297       if (!status.ok()) {
298         ICING_LOG(ERROR)
299             << "Failed to regenerate derived files for DocumentStore";
300         return status;
301       }
302     }
303   }
304 
305   initialized_ = true;
306   if (initialize_stats != nullptr) {
307     initialize_stats->set_num_documents(document_id_mapper_->num_elements());
308   }
309 
310   return create_result.log_create_result.data_loss;
311 }
312 
313 libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
314   if (!HeaderExists()) {
315     // Without a header, we don't know if things are consistent between each
316     // other so the caller should just regenerate everything from ground
317     // truth.
318     return absl_ports::InternalError("DocumentStore header doesn't exist");
319   }
320 
321   DocumentStore::Header header;
322   if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
323                          sizeof(header))) {
324     return absl_ports::InternalError(
325         absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
326   }
327 
328   if (header.magic != DocumentStore::Header::kMagic) {
329     return absl_ports::InternalError(absl_ports::StrCat(
330         "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
331   }
332 
333   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
334   // that can support error logging.
335   auto document_key_mapper_or =
336       KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
337   if (!document_key_mapper_or.ok()) {
338     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
339                      << "Failed to initialize KeyMapper";
340     return document_key_mapper_or.status();
341   }
342   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
343 
344   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
345   // that can support error logging.
346   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
347       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
348       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
349   if (!document_id_mapper_or.ok()) {
350     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
351                      << "Failed to initialize DocumentIdMapper";
352     return document_id_mapper_or.status();
353   }
354   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
355 
356   ICING_ASSIGN_OR_RETURN(score_cache_,
357                          FileBackedVector<DocumentAssociatedScoreData>::Create(
358                              *filesystem_, MakeScoreCacheFilename(base_dir_),
359                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
360 
361   ICING_ASSIGN_OR_RETURN(filter_cache_,
362                          FileBackedVector<DocumentFilterData>::Create(
363                              *filesystem_, MakeFilterCacheFilename(base_dir_),
364                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
365 
366   ICING_ASSIGN_OR_RETURN(
367       namespace_mapper_,
368       KeyMapper<NamespaceId>::Create(*filesystem_,
369                                      MakeNamespaceMapperFilename(base_dir_),
370                                      kNamespaceMapperMaxSize));
371 
372   ICING_ASSIGN_OR_RETURN(
373       usage_store_,
374       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
375 
376   ICING_ASSIGN_OR_RETURN(corpus_mapper_,
377                          KeyMapper<CorpusId>::Create(
378                              *filesystem_, MakeCorpusMapperFilename(base_dir_),
379                              kCorpusMapperMaxSize));
380 
381   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
382                          FileBackedVector<CorpusAssociatedScoreData>::Create(
383                              *filesystem_, MakeCorpusScoreCache(base_dir_),
384                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
385 
386   // Ensure the usage store is the correct size.
387   ICING_RETURN_IF_ERROR(
388       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
389 
390   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
391   if (checksum.Get() != header.checksum) {
392     return absl_ports::InternalError(
393         "Combined checksum of DocStore was inconsistent");
394   }
395 
396   return libtextclassifier3::Status::OK;
397 }
398 
399 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
400     bool revalidate_documents) {
401   ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
402   ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
403   ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
404   ICING_RETURN_IF_ERROR(ResetFilterCache());
405   ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
406   ICING_RETURN_IF_ERROR(ResetCorpusMapper());
407   ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
408 
409   // Creates a new UsageStore instance. Note that we don't reset the data in
410   // usage store here because we're not able to regenerate the usage scores.
411   ICING_ASSIGN_OR_RETURN(
412       usage_store_,
413       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
414 
415   // Iterates through document log
416   auto iterator = document_log_->GetIterator();
417   auto iterator_status = iterator.Advance();
418   while (iterator_status.ok()) {
419     libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
420         document_log_->ReadProto(iterator.GetOffset());
421 
422     if (absl_ports::IsNotFound(document_wrapper_or.status())) {
423       // The erased document still occupies 1 document id.
424       DocumentId new_document_id = document_id_mapper_->num_elements();
425       ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
426       iterator_status = iterator.Advance();
427       continue;
428     } else if (!document_wrapper_or.ok()) {
429       return document_wrapper_or.status();
430     }
431 
432     DocumentWrapper document_wrapper =
433         std::move(document_wrapper_or).ValueOrDie();
434     // Revalidate that this document is still compatible if requested.
435     if (revalidate_documents) {
436       if (!document_validator_.Validate(document_wrapper.document()).ok()) {
437         // Document is no longer valid with the current schema. Mark as
438         // deleted
439         DocumentId new_document_id = document_id_mapper_->num_elements();
440         ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
441         ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
442         continue;
443       }
444     }
445     // Updates key mapper and document_id mapper with the new document
446     DocumentId new_document_id = document_id_mapper_->num_elements();
447     ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
448         MakeFingerprint(document_wrapper.document().namespace_(),
449                         document_wrapper.document().uri()),
450         new_document_id));
451     ICING_RETURN_IF_ERROR(
452         document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
453 
454     SchemaTypeId schema_type_id;
455     auto schema_type_id_or =
456         schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
457     if (absl_ports::IsNotFound(schema_type_id_or.status())) {
458       // Didn't find a SchemaTypeId. This means that the DocumentStore and
459       // the SchemaStore are out of sync. But DocumentStore can't do
460       // anything about it so just ignore this for now. This should be
461       // detected/handled by the owner of DocumentStore. Set it to some
462       // arbitrary invalid value for now, it'll get updated to the correct
463       // ID later.
464       schema_type_id = -1;
465     } else if (!schema_type_id_or.ok()) {
466       // Real error. Pass it up
467       return schema_type_id_or.status();
468     } else {
469       // We're guaranteed that SchemaTypeId is valid now
470       schema_type_id = schema_type_id_or.ValueOrDie();
471     }
472 
473     ICING_ASSIGN_OR_RETURN(
474         NamespaceId namespace_id,
475         namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
476                                     namespace_mapper_->num_keys()));
477 
478     // Update corpus maps
479     std::string corpus =
480         MakeFingerprint(document_wrapper.document().namespace_(),
481                         document_wrapper.document().schema());
482     ICING_ASSIGN_OR_RETURN(
483         CorpusId corpusId,
484         corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
485 
486     ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
487                            GetCorpusAssociatedScoreDataToUpdate(corpusId));
488     scoring_data.AddDocument(
489         document_wrapper.document().internal_fields().length_in_tokens());
490 
491     ICING_RETURN_IF_ERROR(
492         UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
493 
494     ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
495         new_document_id,
496         DocumentAssociatedScoreData(
497             corpusId, document_wrapper.document().score(),
498             document_wrapper.document().creation_timestamp_ms(),
499             document_wrapper.document().internal_fields().length_in_tokens())));
500 
501     int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
502         document_wrapper.document().creation_timestamp_ms(),
503         document_wrapper.document().ttl_ms());
504 
505     ICING_RETURN_IF_ERROR(UpdateFilterCache(
506         new_document_id, DocumentFilterData(namespace_id, schema_type_id,
507                                             expiration_timestamp_ms)));
508     iterator_status = iterator.Advance();
509   }
510 
511   if (!absl_ports::IsOutOfRange(iterator_status)) {
512     ICING_LOG(WARNING)
513         << "Failed to iterate through proto log while regenerating "
514            "derived files";
515     return absl_ports::Annotate(iterator_status,
516                                 "Failed to iterate through proto log.");
517   }
518 
519   // Shrink usage_store_ to the correct size.
520   ICING_RETURN_IF_ERROR(
521       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
522 
523   // Write the header
524   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
525   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
526 
527   return libtextclassifier3::Status::OK;
528 }
529 
530 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
531   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
532   document_key_mapper_.reset();
533   // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
534   // that can support error logging.
535   libtextclassifier3::Status status =
536       KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
537   if (!status.ok()) {
538     ICING_LOG(ERROR) << status.error_message()
539                      << "Failed to delete old key mapper";
540     return status;
541   }
542 
543   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
544   // that can support error logging.
545   auto document_key_mapper_or =
546       KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
547   if (!document_key_mapper_or.ok()) {
548     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
549                      << "Failed to re-init key mapper";
550     return document_key_mapper_or.status();
551   }
552   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
553   return libtextclassifier3::Status::OK;
554 }
555 
556 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
557   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
558   document_id_mapper_.reset();
559   // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
560   // that can support error logging.
561   libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
562       *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
563   if (!status.ok()) {
564     ICING_LOG(ERROR) << status.error_message()
565                      << "Failed to delete old document_id mapper";
566     return status;
567   }
568   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
569   // that can support error logging.
570   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
571       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
572       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
573   if (!document_id_mapper_or.ok()) {
574     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
575                      << "Failed to re-init document_id mapper";
576     return document_id_mapper_or.status();
577   }
578   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
579   return libtextclassifier3::Status::OK;
580 }
581 
582 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
583   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
584   score_cache_.reset();
585   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
586       *filesystem_, MakeScoreCacheFilename(base_dir_)));
587   ICING_ASSIGN_OR_RETURN(score_cache_,
588                          FileBackedVector<DocumentAssociatedScoreData>::Create(
589                              *filesystem_, MakeScoreCacheFilename(base_dir_),
590                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
591   return libtextclassifier3::Status::OK;
592 }
593 
594 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
595   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
596   corpus_score_cache_.reset();
597   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
598       *filesystem_, MakeCorpusScoreCache(base_dir_)));
599   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
600                          FileBackedVector<CorpusAssociatedScoreData>::Create(
601                              *filesystem_, MakeCorpusScoreCache(base_dir_),
602                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
603   return libtextclassifier3::Status::OK;
604 }
605 
606 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
607   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
608   filter_cache_.reset();
609   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
610       *filesystem_, MakeFilterCacheFilename(base_dir_)));
611   ICING_ASSIGN_OR_RETURN(filter_cache_,
612                          FileBackedVector<DocumentFilterData>::Create(
613                              *filesystem_, MakeFilterCacheFilename(base_dir_),
614                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
615   return libtextclassifier3::Status::OK;
616 }
617 
618 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
619   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
620   namespace_mapper_.reset();
621   // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
622   // that can support error logging.
623   libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
624       *filesystem_, MakeNamespaceMapperFilename(base_dir_));
625   if (!status.ok()) {
626     ICING_LOG(ERROR) << status.error_message()
627                      << "Failed to delete old namespace_id mapper";
628     return status;
629   }
630   ICING_ASSIGN_OR_RETURN(
631       namespace_mapper_,
632       KeyMapper<NamespaceId>::Create(*filesystem_,
633                                      MakeNamespaceMapperFilename(base_dir_),
634                                      kNamespaceMapperMaxSize));
635   return libtextclassifier3::Status::OK;
636 }
637 
638 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
639   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
640   corpus_mapper_.reset();
641   // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
642   // that can support error logging.
643   libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
644       *filesystem_, MakeCorpusMapperFilename(base_dir_));
645   if (!status.ok()) {
646     ICING_LOG(ERROR) << status.error_message()
647                      << "Failed to delete old corpus_id mapper";
648     return status;
649   }
650   ICING_ASSIGN_OR_RETURN(corpus_mapper_,
651                          KeyMapper<CorpusId>::Create(
652                              *filesystem_, MakeCorpusMapperFilename(base_dir_),
653                              kCorpusMapperMaxSize));
654   return libtextclassifier3::Status::OK;
655 }
656 
657 libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
658   Crc32 total_checksum;
659 
660   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
661   // that can support error logging.
662   auto checksum_or = document_log_->ComputeChecksum();
663   if (!checksum_or.ok()) {
664     ICING_LOG(ERROR) << checksum_or.status().error_message()
665                      << "Failed to compute checksum of DocumentLog";
666     return checksum_or.status();
667   }
668   Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
669 
670   Crc32 document_key_mapper_checksum = document_key_mapper_->ComputeChecksum();
671 
672   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
673   // that can support error logging.
674   checksum_or = document_id_mapper_->ComputeChecksum();
675   if (!checksum_or.ok()) {
676     ICING_LOG(ERROR) << checksum_or.status().error_message()
677                      << "Failed to compute checksum of DocumentIdMapper";
678     return checksum_or.status();
679   }
680   Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
681 
682   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
683   // that can support error logging.
684   checksum_or = score_cache_->ComputeChecksum();
685   if (!checksum_or.ok()) {
686     ICING_LOG(ERROR) << checksum_or.status().error_message()
687                      << "Failed to compute checksum of score cache";
688     return checksum_or.status();
689   }
690   Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
691 
692   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
693   // that can support error logging.
694   checksum_or = filter_cache_->ComputeChecksum();
695   if (!checksum_or.ok()) {
696     ICING_LOG(ERROR) << checksum_or.status().error_message()
697                      << "Failed to compute checksum of filter cache";
698     return checksum_or.status();
699   }
700   Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
701 
702   Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum();
703 
704   Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum();
705 
706   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
707   // that can support error logging.
708   checksum_or = corpus_score_cache_->ComputeChecksum();
709   if (!checksum_or.ok()) {
710     ICING_LOG(WARNING) << checksum_or.status().error_message()
711                        << "Failed to compute checksum of score cache";
712     return checksum_or.status();
713   }
714   Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
715 
716   // NOTE: We purposely don't include usage_store checksum here because we can't
717   // regenerate it from ground truth documents. If it gets corrupted, we'll just
718   // clear all usage reports, but we shouldn't throw everything else in the
719   // document store out.
720 
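  // Combine the components by appending each CRC's decimal string to the
  // running total, so a change in any single component changes the overall
  // checksum.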
721   total_checksum.Append(std::to_string(document_log_checksum.Get()));
722   total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
723   total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
724   total_checksum.Append(std::to_string(score_cache_checksum.Get()));
725   total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
726   total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
727   total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
728   total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
729 
730   return total_checksum;
731 }
732 
733 bool DocumentStore::HeaderExists() {
734   if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
735     return false;
736   }
737 
738   int64_t file_size =
739       filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
740 
741   // If it's been truncated to size 0 before, we consider it to be a new file
742   return file_size != 0 && file_size != Filesystem::kBadFileSize;
743 }
744 
745 libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
746   // Write the header
747   DocumentStore::Header header;
748   header.magic = DocumentStore::Header::kMagic;
749   header.checksum = checksum.Get();
750 
751   // This should overwrite the header.
752   ScopedFd sfd(
753       filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
754   if (!sfd.is_valid() ||
755       !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
756       !filesystem_->DataSync(sfd.get())) {
757     return absl_ports::InternalError(absl_ports::StrCat(
758         "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
759   }
760   return libtextclassifier3::Status::OK;
761 }
762 
763 libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
764     DocumentProto& document, PutDocumentStatsProto* put_document_stats) {
765   std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
766   ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
767 
768   if (put_document_stats != nullptr) {
769     put_document_stats->set_document_size(document.ByteSizeLong());
770   }
771 
772   // Copy fields needed before they are moved
773   std::string name_space = document.namespace_();
774   std::string uri = document.uri();
775   std::string schema = document.schema();
776   int document_score = document.score();
777   int32_t length_in_tokens = document.internal_fields().length_in_tokens();
778   int64_t creation_timestamp_ms = document.creation_timestamp_ms();
779 
780   // Sets the creation timestamp if caller hasn't specified.
781   if (document.creation_timestamp_ms() == 0) {
782     creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
783     document.set_creation_timestamp_ms(creation_timestamp_ms);
784   }
785 
786   int64_t expiration_timestamp_ms =
787       CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());
788 
789   // Update ground truth first
790   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
791   // that can support error logging.
792   auto offset_or =
793       document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
794   if (!offset_or.ok()) {
795     ICING_LOG(ERROR) << offset_or.status().error_message()
796                      << "Failed to write document";
797     return offset_or.status();
798   }
799   int64_t file_offset = std::move(offset_or).ValueOrDie();
800 
801   // Get existing document id
802   auto old_document_id_or = GetDocumentId(name_space, uri);
803   if (!old_document_id_or.ok() &&
804       !absl_ports::IsNotFound(old_document_id_or.status())) {
805     return absl_ports::InternalError("Failed to read from key mapper");
806   }
807 
808   // Creates a new document id, updates key mapper and document_id mapper
809   DocumentId new_document_id = document_id_mapper_->num_elements();
810   if (!IsDocumentIdValid(new_document_id)) {
811     return absl_ports::ResourceExhaustedError(
812         "Exceeded maximum number of documents. Try calling Optimize to reclaim "
813         "some space.");
814   }
815 
816   ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
817       MakeFingerprint(name_space, uri), new_document_id));
818   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
819 
820   // Update namespace maps
821   ICING_ASSIGN_OR_RETURN(
822       NamespaceId namespace_id,
823       namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
824 
825   // Update corpus maps
826   ICING_ASSIGN_OR_RETURN(
827       CorpusId corpusId,
828       corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema),
829                                corpus_mapper_->num_keys()));
830 
831   ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
832                          GetCorpusAssociatedScoreDataToUpdate(corpusId));
833   scoring_data.AddDocument(length_in_tokens);
834 
835   ICING_RETURN_IF_ERROR(
836       UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
837 
838   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
839       new_document_id,
840       DocumentAssociatedScoreData(corpusId, document_score,
841                                   creation_timestamp_ms, length_in_tokens)));
842 
843   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
844                          schema_store_->GetSchemaTypeId(schema));
845 
846   ICING_RETURN_IF_ERROR(UpdateFilterCache(
847       new_document_id, DocumentFilterData(namespace_id, schema_type_id,
848                                           expiration_timestamp_ms)));
849 
850   if (old_document_id_or.ok()) {
851     // The old document exists, copy over the usage scores and delete the old
852     // document.
853     DocumentId old_document_id = old_document_id_or.ValueOrDie();
854 
855     ICING_RETURN_IF_ERROR(
856         usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
857                                        /*to_document_id=*/new_document_id));
858 
859     // Delete the old document. It's fine if it's not found since it might have
860     // been deleted previously.
861     auto delete_status = Delete(old_document_id);
862     if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
863       // Real error, pass it up.
864       return delete_status;
865     }
866   }
867 
868   if (put_document_stats != nullptr) {
869     put_document_stats->set_document_store_latency_ms(
870         put_timer->GetElapsedMilliseconds());
871   }
872 
873   return new_document_id;
874 }
875 
876 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
877     const std::string_view name_space, const std::string_view uri,
878     bool clear_internal_fields) const {
879   // TODO(b/147231617): Make a better way to replace the error message in an
880   // existing Status.
881   auto document_id_or = GetDocumentId(name_space, uri);
882   if (absl_ports::IsNotFound(document_id_or.status())) {
883     ICING_VLOG(1) << document_id_or.status().error_message();
884     return libtextclassifier3::Status(
885         document_id_or.status().CanonicalCode(),
886         IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
887                                       name_space.data(), uri.data()));
888   }
889   DocumentId document_id = document_id_or.ValueOrDie();
890 
891   // TODO(b/147231617): Make a better way to replace the error message in an
892   // existing Status.
893   auto status_or = Get(document_id, clear_internal_fields);
894   if (absl_ports::IsNotFound(status_or.status())) {
895     ICING_LOG(ERROR) << status_or.status().error_message();
896     return libtextclassifier3::Status(
897         status_or.status().CanonicalCode(),
898         IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
899                                       name_space.data(), uri.data()));
900   }
901   return status_or;
902 }
903 
904 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
905     DocumentId document_id, bool clear_internal_fields) const {
906   ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
907 
908   auto document_log_offset_or = document_id_mapper_->Get(document_id);
909   if (!document_log_offset_or.ok()) {
910     // Since we've just checked that our document_id is valid a few lines
911     // above, there's no reason this should fail and an error should never
912     // happen.
913     return absl_ports::InternalError("Failed to find document offset.");
914   }
915   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
916 
917   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
918   // that can support error logging.
919   auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
920   if (!document_wrapper_or.ok()) {
921     ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
922                      << "Failed to read from document log";
923     return document_wrapper_or.status();
924   }
925   DocumentWrapper document_wrapper =
926       std::move(document_wrapper_or).ValueOrDie();
927   if (clear_internal_fields) {
928     document_wrapper.mutable_document()->clear_internal_fields();
929   }
930 
931   return std::move(*document_wrapper.mutable_document());
932 }
933 
934 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
935     const std::string_view name_space, const std::string_view uri) const {
936   auto document_id_or =
937       document_key_mapper_->Get(MakeFingerprint(name_space, uri));
938   if (!document_id_or.ok()) {
939     return absl_ports::Annotate(
940         document_id_or.status(),
941         absl_ports::StrCat("Failed to find DocumentId by key: ", name_space,
942                            ", ", uri));
943   }
944 
945   // Guaranteed to have a DocumentId now
946   return document_id_or.ValueOrDie();
947 }
948 
949 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
950   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
951       namespace_mapper_->GetValuesToKeys();
952 
953   std::unordered_set<NamespaceId> existing_namespace_ids;
954   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
955        ++document_id) {
956     // filter_cache_->Get can only fail if document_id is < 0
957     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
958     auto status_or_data = filter_cache_->Get(document_id);
959     if (!status_or_data.ok()) {
960       ICING_LOG(ERROR)
961           << "Error while iterating over filter cache in GetAllNamespaces";
962       return std::vector<std::string>();
963     }
964     const DocumentFilterData* data = status_or_data.ValueOrDie();
965 
966     if (InternalDoesDocumentExist(document_id)) {
967       existing_namespace_ids.insert(data->namespace_id());
968     }
969   }
970 
971   std::vector<std::string> existing_namespaces;
972   for (auto itr = existing_namespace_ids.begin();
973        itr != existing_namespace_ids.end(); ++itr) {
974     existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
975   }
976   return existing_namespaces;
977 }
978 
979 bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
980   if (!IsDocumentIdValid(document_id)) {
981     return false;
982   }
983 
984   if (document_id >= document_id_mapper_->num_elements()) {
985     // Somehow got a validly constructed document_id that the document store
986     // doesn't know about
987     return false;
988   }
989 
990   return InternalDoesDocumentExist(document_id);
991 }
992 
993 libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus(
994     DocumentId document_id) const {
995   if (!IsDocumentIdValid(document_id)) {
996     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
997         "Document id '%d' invalid.", document_id));
998   }
999 
1000   if (document_id >= document_id_mapper_->num_elements()) {
1001     // Somehow got a validly constructed document_id that the document store
1002     // doesn't know about.
1003     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1004         "Unknown document id '%d'.", document_id));
1005   }
1006 
1007   if (!InternalDoesDocumentExist(document_id)) {
1008     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1009         "Document id '%d' doesn't exist", document_id));
1010   }
1011   return libtextclassifier3::Status::OK;
1012 }
1013 
1014 bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const {
1015   return !IsDeleted(document_id) && !IsExpired(document_id);
1016 }
1017 
1018 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1019   auto file_offset_or = document_id_mapper_->Get(document_id);
1020   if (!file_offset_or.ok()) {
1021     // This would only happen if document_id is out of range of the
1022     // document_id_mapper, meaning we got some invalid document_id. Callers
1023     // should already have checked that their document_id is valid or used
1024     // DoesDocumentExist(WithStatus). Regardless, return true since the
1025     // document doesn't exist.
1026     return true;
1027   }
1028   int64_t file_offset = *file_offset_or.ValueOrDie();
1029   return file_offset == kDocDeletedFlag;
1030 }
1031 
1032 bool DocumentStore::IsExpired(DocumentId document_id) const {
1033   auto filter_data_or = filter_cache_->Get(document_id);
1034   if (!filter_data_or.ok()) {
1035     // This would only happen if document_id is out of range of the
1036     // filter_cache, meaning we got some invalid document_id. Callers should
1037     // already have checked that their document_id is valid or used
1038     // DoesDocumentExist(WithStatus). Regardless, return true since the
1039     // document doesn't exist.
1040     return true;
1041   }
1042   const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1043 
1044   // Check if it's past the expiration time
1045   return clock_.GetSystemTimeMilliseconds() >=
1046          filter_data->expiration_timestamp_ms();
1047 }
1048 
1049 libtextclassifier3::Status DocumentStore::Delete(
1050     const std::string_view name_space, const std::string_view uri) {
1051   // Try to get the DocumentId first
1052   auto document_id_or = GetDocumentId(name_space, uri);
1053   if (!document_id_or.ok()) {
1054     return absl_ports::Annotate(
1055         document_id_or.status(),
1056         absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1057                            ", uri: ", uri));
1058   }
1059   return Delete(document_id_or.ValueOrDie());
1060 }
1061 
1062 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) {
1063   ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
1064 
1065   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1066   if (!document_log_offset_or.ok()) {
1067     return absl_ports::InternalError("Failed to find document offset.");
1068   }
1069   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1070 
1071   // Erases document proto.
1072   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1073   return ClearDerivedData(document_id);
1074 }
1075 
1076 libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
1077     std::string_view name_space) const {
1078   return namespace_mapper_->Get(name_space);
1079 }
1080 
1081 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1082     const std::string_view name_space, const std::string_view schema) const {
1083   return corpus_mapper_->Get(MakeFingerprint(name_space, schema));
1084 }
1085 
1086 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
1087 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1088   if (!DoesDocumentExist(document_id)) {
1089     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1090         "Can't get usage scores, document id '%d' doesn't exist", document_id));
1091   }
1092 
1093   auto score_data_or = score_cache_->GetCopy(document_id);
1094   if (!score_data_or.ok()) {
1095     ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1096                      << " from score_cache_";
1097     return score_data_or.status();
1098   }
1099 
1100   DocumentAssociatedScoreData document_associated_score_data =
1101       std::move(score_data_or).ValueOrDie();
1102   if (document_associated_score_data.document_score() < 0) {
1103     // A negative / invalid score means that the score data has been deleted.
1104     return absl_ports::NotFoundError("Document score data not found.");
1105   }
1106   return document_associated_score_data;
1107 }
1108 
1109 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1110 DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
1111   auto score_data_or = corpus_score_cache_->GetCopy(corpus_id);
1112   if (!score_data_or.ok()) {
1113     return score_data_or.status();
1114   }
1115 
1116   CorpusAssociatedScoreData corpus_associated_score_data =
1117       std::move(score_data_or).ValueOrDie();
1118   return corpus_associated_score_data;
1119 }
1120 
1121 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
1122 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1123   auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1124   if (corpus_scoring_data_or.ok()) {
1125     return std::move(corpus_scoring_data_or).ValueOrDie();
1126   }
1127   CorpusAssociatedScoreData scoringData;
1128   // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1129   // corpus_score_cache_ for the first time.
1130   if (corpus_scoring_data_or.status().CanonicalCode() ==
1131       libtextclassifier3::StatusCode::OUT_OF_RANGE) {
1132     return scoringData;
1133   }
1134   return corpus_scoring_data_or.status();
1135 }
1136 
1137 libtextclassifier3::StatusOr<DocumentFilterData>
1138 DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
1139   if (!DoesDocumentExist(document_id)) {
1140     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1141         "Can't get filter data, document id '%d' doesn't exist", document_id));
1142   }
1143 
1144   auto filter_data_or = filter_cache_->GetCopy(document_id);
1145   if (!filter_data_or.ok()) {
1146     ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1147                      << " from filter_cache_";
1148     return filter_data_or.status();
1149   }
1150   DocumentFilterData document_filter_data =
1151       std::move(filter_data_or).ValueOrDie();
1152   return document_filter_data;
1153 }
1154 
1155 libtextclassifier3::StatusOr<UsageStore::UsageScores>
1156 DocumentStore::GetUsageScores(DocumentId document_id) const {
1157   if (!DoesDocumentExist(document_id)) {
1158     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1159         "Can't get usage scores, document id '%d' doesn't exist", document_id));
1160   }
1161   return usage_store_->GetUsageScores(document_id);
1162 }
1163 
1164 libtextclassifier3::Status DocumentStore::ReportUsage(
1165     const UsageReport& usage_report) {
1166   ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1167                          GetDocumentId(usage_report.document_namespace(),
1168                                        usage_report.document_uri()));
1169   // We can use the internal version here because we got our document_id from
1170   // our internal data structures. We would have thrown some error if the
1171   // namespace and/or uri were incorrect.
1172   if (!InternalDoesDocumentExist(document_id)) {
1173     // Document was probably deleted or expired.
1174     return absl_ports::NotFoundError(absl_ports::StrCat(
1175         "Couldn't report usage on a nonexistent document: (namespace: '",
1176         usage_report.document_namespace(), "', uri: '",
1177         usage_report.document_uri(), "')"));
1178   }
1179 
1180   return usage_store_->AddUsageReport(usage_report, document_id);
1181 }
1182 
1183 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1184     std::string_view name_space) {
1185   DeleteByGroupResult result;
1186   auto namespace_id_or = namespace_mapper_->Get(name_space);
1187   if (!namespace_id_or.ok()) {
1188     result.status = absl_ports::Annotate(
1189         namespace_id_or.status(),
1190         absl_ports::StrCat("Failed to find namespace: ", name_space));
1191     return result;
1192   }
1193   NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1194   auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1195   if (!num_deleted_or.ok()) {
1196     result.status = std::move(num_deleted_or).status();
1197     return result;
1198   }
1199 
1200   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1201   if (result.num_docs_deleted <= 0) {
1202     // Treat the case where no existing documents have this namespace the same
1203     // as the namespace not existing at all.
1204     result.status = absl_ports::NotFoundError(
1205         absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1206     return result;
1207   }
1208 
1209   return result;
1210 }
1211 
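// Marks every existing document of the given schema type as deleted and
// reports the count in the result. The result status is an error if the schema
// type is unknown or no documents were deleted.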
1212 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1213     std::string_view schema_type) {
1214   DeleteByGroupResult result;
1215   auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1216   if (!schema_type_id_or.ok()) {
1217     result.status = absl_ports::Annotate(
1218         schema_type_id_or.status(),
1219         absl_ports::StrCat("Failed to find schema type. schema_type: ",
1220                            schema_type));
1221     return result;
1222   }
1223   SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1224   auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1225   if (!num_deleted_or.ok()) {
1226     result.status = std::move(num_deleted_or).status();
1227     return result;
1228   }
1229 
1230   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1231   if (result.num_docs_deleted <= 0) {
1232     result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1233         "No documents found with schema type '", schema_type, "'"));
1234     return result;
1235   }
1236 
1237   return result;
1238 }
1239 
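// Walks the filter cache and deletes every document that matches the given
// namespace_id and/or schema_type_id. Passing kInvalidNamespaceId or
// kInvalidSchemaTypeId skips the corresponding check. Returns the number of
// documents that were marked as deleted.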
1240 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1241     NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1242   // Tracks how many existing documents matched the given namespace and/or
1243   // schema type and were marked as deleted.
1244   int num_updated_documents = 0;
1245 
1246   // Traverse FilterCache and delete all docs that match namespace_id and
1247   // schema_type_id.
1248   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1249        ++document_id) {
1250     // filter_cache_->Get can only fail if document_id is < 0
1251     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1252     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1253                            filter_cache_->Get(document_id));
1254 
1255     // Check namespace only when the input namespace id is valid.
1256     if (namespace_id != kInvalidNamespaceId &&
1257         (data->namespace_id() == kInvalidNamespaceId ||
1258          data->namespace_id() != namespace_id)) {
1259       // The document has already been hard-deleted or isn't from the desired
1260       // namespace.
1261       continue;
1262     }
1263 
1264     // Check schema type only when the input schema type id is valid.
1265     if (schema_type_id != kInvalidSchemaTypeId &&
1266         (data->schema_type_id() == kInvalidSchemaTypeId ||
1267          data->schema_type_id() != schema_type_id)) {
1268       // The document has already been hard-deleted or doesn't have the
1269       // desired schema type.
1270       continue;
1271     }
1272 
1273     // The document has the desired namespace and schema type, it either
1274     // exists or has expired.
1275     libtextclassifier3::Status delete_status = Delete(document_id);
1276     if (absl_ports::IsNotFound(delete_status)) {
1277       continue;
1278     } else if (!delete_status.ok()) {
1279       // Real error, pass up.
1280       return delete_status;
1281     }
1282     ++num_updated_documents;
1283   }
1284 
1285   return num_updated_documents;
1286 }
1287 
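// Flushes in-memory state to disk. PersistType::LITE only syncs the document
// log; any other persist type syncs every member file and then rewrites the
// header with the updated combined checksum.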
1288 libtextclassifier3::Status DocumentStore::PersistToDisk(
1289     PersistType::Code persist_type) {
1290   if (persist_type == PersistType::LITE) {
1291     // only persist the document log.
1292     return document_log_->PersistToDisk();
1293   }
1294   ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1295   ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1296   ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1297   ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1298   ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1299   ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1300   ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1301   ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1302   ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1303 
1304   // Update the combined checksum and write to header file.
1305   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1306   ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
1307 
1308   return libtextclassifier3::Status::OK;
1309 }
1310 
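// Returns the value held by value_or if it is OK; otherwise returns
// default_value.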
1311 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1312                           int64_t default_value) {
1313   return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1314 }
1315 
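// Collects the on-disk size of each member file into a
// DocumentStorageInfoProto, using -1 for any size that couldn't be retrieved.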
1316 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1317   DocumentStorageInfoProto storage_info;
1318   storage_info.set_document_log_size(
1319       GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1320   storage_info.set_key_mapper_size(
1321       GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1322   storage_info.set_document_id_mapper_size(
1323       GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1324   storage_info.set_score_cache_size(
1325       GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1326   storage_info.set_filter_cache_size(
1327       GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1328   storage_info.set_namespace_id_mapper_size(
1329       GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1330   storage_info.set_corpus_mapper_size(
1331       GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1332   storage_info.set_corpus_score_cache_size(
1333       GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1334   return storage_info;
1335 }
1336 
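// Scans every document id and fills in the alive/deleted/expired counts on the
// given storage_info, both overall and per namespace (including per-namespace
// usage-type breakdowns).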
1337 DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
1338     DocumentStorageInfoProto storage_info) const {
1339   int total_num_alive = 0;
1340   int total_num_expired = 0;
1341   int total_num_deleted = 0;
1342   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1343       namespace_mapper_->GetValuesToKeys();
1344   std::unordered_map<std::string, NamespaceStorageInfoProto>
1345       namespace_to_storage_info;
1346 
1347   for (DocumentId document_id = 0;
1348        document_id < document_id_mapper_->num_elements(); ++document_id) {
1349     // Check if it's deleted first.
1350     if (IsDeleted(document_id)) {
1351       // We don't have the namespace id of hard deleted documents anymore, so
1352       // we can't add to our namespace storage info.
1353       ++total_num_deleted;
1354       continue;
1355     }
1356 
1357     // At this point, the document is either alive or expired, we can get
1358     // namespace info for it.
1359     auto filter_data_or = filter_cache_->Get(document_id);
1360     if (!filter_data_or.ok()) {
1361       ICING_VLOG(1) << "Error trying to get filter data for document store "
1362                        "storage info counts.";
1363       continue;
1364     }
1365     const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1366     auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
1367     if (itr == namespace_id_to_namespace.end()) {
1368       ICING_VLOG(1) << "Error trying to find namespace for document store "
1369                        "storage info counts.";
1370       continue;
1371     }
1372     const std::string& name_space = itr->second;
1373 
1374     // Always set the namespace. If the NamespaceStorageInfoProto didn't exist
1375     // before, we'll get back a default instance of it.
1376     NamespaceStorageInfoProto& namespace_storage_info =
1377         namespace_to_storage_info[name_space];
1378     namespace_storage_info.set_namespace_(name_space);
1379 
1380     // Get usage scores
1381     auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1382     if (!usage_scores_or.ok()) {
1383       ICING_VLOG(1) << "Error trying to get usage scores for document store "
1384                        "storage info counts.";
1385       continue;
1386     }
1387     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
1388 
1389     // Update our stats
1390     if (IsExpired(document_id)) {
1391       ++total_num_expired;
1392       namespace_storage_info.set_num_expired_documents(
1393           namespace_storage_info.num_expired_documents() + 1);
1394       if (usage_scores.usage_type1_count > 0) {
1395         namespace_storage_info.set_num_expired_documents_usage_type1(
1396             namespace_storage_info.num_expired_documents_usage_type1() + 1);
1397       }
1398       if (usage_scores.usage_type2_count > 0) {
1399         namespace_storage_info.set_num_expired_documents_usage_type2(
1400             namespace_storage_info.num_expired_documents_usage_type2() + 1);
1401       }
1402       if (usage_scores.usage_type3_count > 0) {
1403         namespace_storage_info.set_num_expired_documents_usage_type3(
1404             namespace_storage_info.num_expired_documents_usage_type3() + 1);
1405       }
1406     } else {
1407       ++total_num_alive;
1408       namespace_storage_info.set_num_alive_documents(
1409           namespace_storage_info.num_alive_documents() + 1);
1410       if (usage_scores.usage_type1_count > 0) {
1411         namespace_storage_info.set_num_alive_documents_usage_type1(
1412             namespace_storage_info.num_alive_documents_usage_type1() + 1);
1413       }
1414       if (usage_scores.usage_type2_count > 0) {
1415         namespace_storage_info.set_num_alive_documents_usage_type2(
1416             namespace_storage_info.num_alive_documents_usage_type2() + 1);
1417       }
1418       if (usage_scores.usage_type3_count > 0) {
1419         namespace_storage_info.set_num_alive_documents_usage_type3(
1420             namespace_storage_info.num_alive_documents_usage_type3() + 1);
1421       }
1422     }
1423   }
1424 
1425   for (auto& itr : namespace_to_storage_info) {
1426     storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
1427   }
1428   storage_info.set_num_alive_documents(total_num_alive);
1429   storage_info.set_num_deleted_documents(total_num_deleted);
1430   storage_info.set_num_expired_documents(total_num_expired);
1431   return storage_info;
1432 }
1433 
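// Builds the full DocumentStorageInfoProto: member file sizes, total directory
// size, number of namespaces, and document status counts.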
1434 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1435   DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1436   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1437   if (directory_size != Filesystem::kBadFileSize) {
1438     storage_info.set_document_store_size(directory_size);
1439   } else {
1440     storage_info.set_document_store_size(-1);
1441   }
1442   storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1443   return CalculateDocumentStatusCounts(std::move(storage_info));
1444 }
1445 
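// Points this store at a new SchemaStore and revalidates every existing
// document against it: compatible documents get their SchemaTypeId refreshed
// in the filter cache, incompatible ones are deleted.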
1446 libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
1447     const SchemaStore* schema_store) {
1448   // Update all references to the SchemaStore
1449   schema_store_ = schema_store;
1450   document_validator_.UpdateSchemaStore(schema_store);
1451 
1452   int size = document_id_mapper_->num_elements();
1453   for (DocumentId document_id = 0; document_id < size; document_id++) {
1454     auto document_or = Get(document_id);
1455     if (absl_ports::IsNotFound(document_or.status())) {
1456       // Skip nonexistent documents
1457       continue;
1458     } else if (!document_or.ok()) {
1459       // Real error, pass up
1460       return absl_ports::Annotate(
1461           document_or.status(),
1462           IcingStringUtil::StringPrintf(
1463               "Failed to retrieve Document for DocumentId %d", document_id));
1464     }
1465 
1466     // Guaranteed to have a document now.
1467     DocumentProto document = document_or.ValueOrDie();
1468 
1469     // Revalidate that this document is still compatible
1470     if (document_validator_.Validate(document).ok()) {
1471       // Update the SchemaTypeId for this entry
1472       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1473                              schema_store_->GetSchemaTypeId(document.schema()));
1474       filter_cache_->mutable_array()[document_id].set_schema_type_id(
1475           schema_type_id);
1476     } else {
1477       // Document is no longer valid with the new SchemaStore. Mark as
1478       // deleted
1479       auto delete_status = Delete(document.namespace_(), document.uri());
1480       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1481         // Real error, pass up
1482         return delete_status;
1483       }
1484     }
1485   }
1486 
1487   return libtextclassifier3::Status::OK;
1488 }
1489 
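// Like UpdateSchemaStore, but uses set_schema_result to limit work to
// documents whose schema types were deleted, assigned new SchemaTypeIds, or
// made incompatible by the schema change. Returns OK immediately if no new
// schema was actually set.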
1490 libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
1491     const SchemaStore* schema_store,
1492     const SchemaStore::SetSchemaResult& set_schema_result) {
1493   if (!set_schema_result.success) {
1494     // No new schema was set, no work to be done
1495     return libtextclassifier3::Status::OK;
1496   }
1497 
1498   // Update all references to the SchemaStore
1499   schema_store_ = schema_store;
1500   document_validator_.UpdateSchemaStore(schema_store);
1501 
1502   int size = document_id_mapper_->num_elements();
1503   for (DocumentId document_id = 0; document_id < size; document_id++) {
1504     if (!InternalDoesDocumentExist(document_id)) {
1505       // Skip nonexistent documents
1506       continue;
1507     }
1508 
1509     // Guaranteed that the document exists now.
1510     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
1511                            filter_cache_->Get(document_id));
1512 
1513     bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
1514                                filter_data->schema_type_id()) != 0;
1515 
1516     // Check if we need to update the FilterCache entry for this document. It
1517     // may have been assigned a different SchemaTypeId in the new SchemaStore.
1518     bool update_filter_cache =
1519         set_schema_result.old_schema_type_ids_changed.count(
1520             filter_data->schema_type_id()) != 0;
1521 
1522     // Check if we need to revalidate this document if the type is now
1523     // incompatible
1524     bool revalidate_document =
1525         set_schema_result.schema_types_incompatible_by_id.count(
1526             filter_data->schema_type_id()) != 0;
1527 
1528     if (update_filter_cache || revalidate_document) {
1529       ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
1530 
1531       if (update_filter_cache) {
1532         ICING_ASSIGN_OR_RETURN(
1533             SchemaTypeId schema_type_id,
1534             schema_store_->GetSchemaTypeId(document.schema()));
1535         filter_cache_->mutable_array()[document_id].set_schema_type_id(
1536             schema_type_id);
1537       }
1538       if (revalidate_document) {
1539         delete_document = !document_validator_.Validate(document).ok();
1540       }
1541     }
1542 
1543     if (delete_document) {
1544       // Document is no longer valid with the new SchemaStore. Mark as deleted
1545       auto delete_status = Delete(document_id);
1546       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1547         // Real error, pass up
1548         return delete_status;
1549       }
1550     }
1551   }
1552 
1553   return libtextclassifier3::Status::OK;
1554 }
1555 
1556 // TODO(b/121227117): Implement Optimize()
1557 libtextclassifier3::Status DocumentStore::Optimize() {
1558   return libtextclassifier3::Status::OK;
1559 }
1560 
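// Copies every live document (and any non-default usage scores) into a fresh
// DocumentStore rooted at new_directory, re-tokenizing documents that don't
// yet have a cached token length. Deleted and expired documents are skipped
// and counted in stats if provided.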
1561 libtextclassifier3::Status DocumentStore::OptimizeInto(
1562     const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
1563     OptimizeStatsProto* stats) {
1564   // Validates directory
1565   if (new_directory == base_dir_) {
1566     return absl_ports::InvalidArgumentError(
1567         "New directory is the same as the current one.");
1568   }
1569 
1570   ICING_ASSIGN_OR_RETURN(auto doc_store_create_result,
1571                          DocumentStore::Create(filesystem_, new_directory,
1572                                                &clock_, schema_store_));
1573   std::unique_ptr<DocumentStore> new_doc_store =
1574       std::move(doc_store_create_result.document_store);
1575 
1576   // Writes all valid docs into new document store (new directory)
1577   int size = document_id_mapper_->num_elements();
1578   int num_deleted = 0;
1579   int num_expired = 0;
1580   UsageStore::UsageScores default_usage;
1581   for (DocumentId document_id = 0; document_id < size; document_id++) {
1582     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
1583     if (absl_ports::IsNotFound(document_or.status())) {
1584       if (IsDeleted(document_id)) {
1585         ++num_deleted;
1586       } else if (IsExpired(document_id)) {
1587         ++num_expired;
1588       }
1589       continue;
1590     } else if (!document_or.ok()) {
1591       // Real error, pass up
1592       return absl_ports::Annotate(
1593           document_or.status(),
1594           IcingStringUtil::StringPrintf(
1595               "Failed to retrieve Document for DocumentId %d", document_id));
1596     }
1597 
1598     // Guaranteed to have a document now.
1599     DocumentProto document_to_keep = document_or.ValueOrDie();
1600 
1601     libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
1602     if (document_to_keep.internal_fields().length_in_tokens() == 0) {
1603       auto tokenized_document_or = TokenizedDocument::Create(
1604           schema_store_, lang_segmenter, document_to_keep);
1605       if (!tokenized_document_or.ok()) {
1606         return absl_ports::Annotate(
1607             tokenized_document_or.status(),
1608             IcingStringUtil::StringPrintf(
1609                 "Failed to tokenize Document for DocumentId %d", document_id));
1610       }
1611       TokenizedDocument tokenized_document(
1612           std::move(tokenized_document_or).ValueOrDie());
1613       new_document_id_or =
1614           new_doc_store->Put(document_to_keep, tokenized_document.num_tokens());
1615     } else {
1616       // TODO(b/144458732): Implement a more robust version of
1617       // TC_ASSIGN_OR_RETURN that can support error logging.
1618       new_document_id_or = new_doc_store->InternalPut(document_to_keep);
1619     }
1620     if (!new_document_id_or.ok()) {
1621       ICING_LOG(ERROR) << "Failed to write into new document store: "
1622                        << new_document_id_or.status().error_message();
1623       return new_document_id_or.status();
1624     }
1625 
1626     // Copy over usage scores.
1627     ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
1628                            usage_store_->GetUsageScores(document_id));
1629     // If the usage scores for this document are the default (no usage), don't
1630     // bother setting them; there's no need to possibly allocate storage when
1631     // there's nothing interesting to store.
1632     if (!(usage_scores == default_usage)) {
1633       DocumentId new_document_id = new_document_id_or.ValueOrDie();
1634       ICING_RETURN_IF_ERROR(
1635           new_doc_store->SetUsageScores(new_document_id, usage_scores));
1636     }
1637   }
1638   if (stats != nullptr) {
1639     stats->set_num_original_documents(size);
1640     stats->set_num_deleted_documents(num_deleted);
1641     stats->set_num_expired_documents(num_expired);
1642   }
1643   ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
1644   return libtextclassifier3::Status::OK;
1645 }
1646 
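// Estimates how many documents and how many bytes could be reclaimed by
// Optimize, by prorating the total file size of the store by the fraction of
// documents that no longer exist.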
1647 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
1648 DocumentStore::GetOptimizeInfo() const {
1649   OptimizeInfo optimize_info;
1650 
1651   // Figure out our ratio of optimizable/total docs.
1652   int32_t num_documents = document_id_mapper_->num_elements();
1653   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
1654        ++document_id) {
1655     if (!InternalDoesDocumentExist(document_id)) {
1656       ++optimize_info.optimizable_docs;
1657     }
1658 
1659     ++optimize_info.total_docs;
1660   }
1661 
1662   if (optimize_info.total_docs == 0) {
1663     // Can exit early since there's nothing to calculate.
1664     return optimize_info;
1665   }
1666 
1667   // Get the total element size.
1668   //
1669   // We use file size instead of disk usage here because the files are not
1670   // sparse, so it's more accurate. Disk usage rounds up to the nearest block
1671   // size.
1672   ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
1673                          document_log_->GetElementsFileSize());
1674   ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
1675                          document_id_mapper_->GetElementsFileSize());
1676   ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
1677                          score_cache_->GetElementsFileSize());
1678   ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
1679                          filter_cache_->GetElementsFileSize());
1680   ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
1681                          corpus_score_cache_->GetElementsFileSize());
1682 
1683   // Usage store might be sparse, but we'll still use file size for more
1684   // accurate counting.
1685   ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
1686                          usage_store_->GetElementsFileSize());
1687 
1688   // We use a combined disk usage and file size for the KeyMapper because it's
1689   // backed by a trie, which has some sparse property bitmaps.
1690   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
1691                          document_key_mapper_->GetElementsSize());
1692 
1693   // We don't include the namespace_mapper or the corpus_mapper because it's
1694   // not clear if we could recover any space even if Optimize were called.
1695   // Deleting 100s of documents could still leave a few documents of a
1696   // namespace, and then there would be no change.
1697 
1698   int64_t total_size = document_log_file_size + document_key_mapper_size +
1699                        document_id_mapper_file_size + score_cache_file_size +
1700                        filter_cache_file_size + corpus_score_cache_file_size +
1701                        usage_store_file_size;
1702 
1703   optimize_info.estimated_optimizable_bytes =
1704       total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
1705   return optimize_info;
1706 }
1707 
1708 libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
1709     CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
1710   return corpus_score_cache_->Set(corpus_id, score_data);
1711 }
1712 
1713 libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
1714     DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
1715   return score_cache_->Set(document_id, score_data);
1716 }
1717 
1718 libtextclassifier3::Status DocumentStore::UpdateFilterCache(
1719     DocumentId document_id, const DocumentFilterData& filter_data) {
1720   return filter_cache_->Set(document_id, filter_data);
1721 }
1722 
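// Resets all derived data for document_id: marks it deleted in the id mapper,
// clears its score and filter cache entries, and removes its usage scores.
// The uri entry in key_mapper_ is intentionally left behind.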
1723 libtextclassifier3::Status DocumentStore::ClearDerivedData(
1724     DocumentId document_id) {
1725   // We intentionally leave the data in key_mapper_ because locating that data
1726   // requires fetching namespace and uri. Leaving data in key_mapper_ should
1727   // be fine because the data is hashed.
1728 
1729   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
1730 
1731   // Resets the score cache entry
1732   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
1733       document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
1734                                                /*document_score=*/-1,
1735                                                /*creation_timestamp_ms=*/-1,
1736                                                /*length_in_tokens=*/0)));
1737 
1738   // Resets the filter cache entry
1739   ICING_RETURN_IF_ERROR(UpdateFilterCache(
1740       document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
1741                                       /*expiration_timestamp_ms=*/-1)));
1742 
1743   // Clears the usage scores.
1744   return usage_store_->DeleteUsageScores(document_id);
1745 }
1746 
1747 libtextclassifier3::Status DocumentStore::SetUsageScores(
1748     DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
1749   return usage_store_->SetUsageScores(document_id, usage_scores);
1750 }
1751 
1752 }  // namespace lib
1753 }  // namespace icing
1754