1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/store/document-store.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/text_classifier/lib3/utils/hash/farmhash.h"
29 #include "icing/absl_ports/annotate.h"
30 #include "icing/absl_ports/canonical_errors.h"
31 #include "icing/absl_ports/str_cat.h"
32 #include "icing/file/file-backed-proto-log.h"
33 #include "icing/file/file-backed-vector.h"
34 #include "icing/file/filesystem.h"
35 #include "icing/file/memory-mapped-file.h"
36 #include "icing/file/portable-file-backed-proto-log.h"
37 #include "icing/legacy/core/icing-string-util.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/document_wrapper.pb.h"
40 #include "icing/proto/logging.pb.h"
41 #include "icing/proto/storage.pb.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/corpus-associated-scoring-data.h"
44 #include "icing/store/corpus-id.h"
45 #include "icing/store/document-associated-score-data.h"
46 #include "icing/store/document-filter-data.h"
47 #include "icing/store/document-id.h"
48 #include "icing/store/document-log-creator.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/store/namespace-id.h"
51 #include "icing/store/usage-store.h"
52 #include "icing/tokenization/language-segmenter.h"
53 #include "icing/util/clock.h"
54 #include "icing/util/crc32.h"
55 #include "icing/util/data-loss.h"
56 #include "icing/util/logging.h"
57 #include "icing/util/status-macros.h"
58 #include "icing/util/tokenized-document.h"
59
60 namespace icing {
61 namespace lib {
62
63 namespace {
64
// Sentinel stored in the DocumentId mapper to mark a document as deleted.
// Valid log offsets are non-negative, so -1 can never collide with one.
constexpr int64_t kDocDeletedFlag = -1;
// Filenames / directory names of the derived files kept under the
// DocumentStore's base directory.
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
constexpr char kUsageStoreDirectoryName[] = "usage_store";
constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";

// Determined through manual testing to allow for 1 million uris. 1 million
// because we allow up to 1 million DocumentIds.
constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024;  // 36 MiB

// 384 KiB for a KeyMapper would allow each internal array to have a max of
// 128 KiB for storage.
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
84
CreateDocumentWrapper(DocumentProto && document)85 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
86 DocumentWrapper document_wrapper;
87 *document_wrapper.mutable_document() = std::move(document);
88 return document_wrapper;
89 }
90
MakeHeaderFilename(const std::string & base_dir)91 std::string MakeHeaderFilename(const std::string& base_dir) {
92 return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
93 }
94
MakeDocumentIdMapperFilename(const std::string & base_dir)95 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
96 return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
97 }
98
MakeScoreCacheFilename(const std::string & base_dir)99 std::string MakeScoreCacheFilename(const std::string& base_dir) {
100 return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
101 }
102
MakeCorpusScoreCache(const std::string & base_dir)103 std::string MakeCorpusScoreCache(const std::string& base_dir) {
104 return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
105 }
106
MakeFilterCacheFilename(const std::string & base_dir)107 std::string MakeFilterCacheFilename(const std::string& base_dir) {
108 return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
109 }
110
MakeNamespaceMapperFilename(const std::string & base_dir)111 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
112 return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
113 }
114
MakeUsageStoreDirectoryName(const std::string & base_dir)115 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
116 return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
117 }
118
MakeCorpusMapperFilename(const std::string & base_dir)119 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
120 return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
121 }
122
123 // TODO(adorokhine): This class internally uses an 8-byte fingerprint of the
124 // Key and stores the key/value in a file-backed-trie that adds an ~80 byte
125 // overhead per key. As we know that these fingerprints are always 8-bytes in
126 // length and that they're random, we might be able to store them more
127 // compactly.
MakeFingerprint(std::string_view name_space,std::string_view uri)128 std::string MakeFingerprint(std::string_view name_space, std::string_view uri) {
129 // Using a 64-bit fingerprint to represent the key could lead to collisions.
130 // But, even with 200K unique keys, the probability of collision is about
131 // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
132 uint64_t fprint =
133 tc3farmhash::Fingerprint64(absl_ports::StrCat(name_space, uri));
134
135 std::string encoded_fprint;
136 // DynamicTrie cannot handle keys with '0' as bytes. So, we encode it in
137 // base128 and add 1 to make sure that no byte is '0'. This increases the
138 // size of the encoded_fprint from 8-bytes to 10-bytes.
139 while (fprint) {
140 encoded_fprint.push_back((fprint & 0x7F) + 1);
141 fprint >>= 7;
142 }
143 return encoded_fprint;
144 }
145
// Computes the absolute expiration time (ms since epoch) for a document
// created at `creation_timestamp_ms` with the given `ttl_ms`.
//
// A TTL of 0 means "never expires" and maps to int64_t max; interpreted as
// milliseconds since epoch, that is far enough in the future to be
// effectively unreachable. Additions that would overflow saturate to the
// same int64_t max.
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
                                       int64_t ttl_ms) {
  if (ttl_ms == 0) {
    return std::numeric_limits<int64_t>::max();
  }

  int64_t expiration_timestamp_ms;
  const bool overflowed = __builtin_add_overflow(
      creation_timestamp_ms, ttl_ms, &expiration_timestamp_ms);
  return overflowed ? std::numeric_limits<int64_t>::max()
                    : expiration_timestamp_ms;
}
166
167 } // namespace
168
// Private constructor; use DocumentStore::Create() instead, which
// null-checks the arguments and runs Initialize(). `clock` is dereferenced
// here to bind the clock_ reference, so it must be non-null (Create()
// guarantees this). All pointer arguments must outlive this instance.
DocumentStore::DocumentStore(const Filesystem* filesystem,
                             const std::string_view base_dir,
                             const Clock* clock,
                             const SchemaStore* schema_store)
    : filesystem_(filesystem),
      base_dir_(base_dir),
      clock_(*clock),
      schema_store_(schema_store),
      document_validator_(schema_store) {}
178
// Copying overload of Put(): makes a copy of `document` and delegates to the
// rvalue overload below.
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
    const DocumentProto& document, int32_t num_tokens,
    PutDocumentStatsProto* put_document_stats) {
  return Put(DocumentProto(document), num_tokens, put_document_stats);
}
184
// Stores `document`, recording `num_tokens` on its internal fields first so
// the token count is persisted alongside the document. Returns the assigned
// DocumentId via InternalPut().
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
    DocumentProto&& document, int32_t num_tokens,
    PutDocumentStatsProto* put_document_stats) {
  document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
  return InternalPut(document, put_document_stats);
}
191
// Best-effort full persist on destruction. A failure here can only be
// logged, not surfaced; callers that need to observe persist errors should
// call PersistToDisk(PersistType::FULL) explicitly before destroying the
// store. Skipped entirely if initialization never completed.
DocumentStore::~DocumentStore() {
  if (initialized_) {
    if (!PersistToDisk(PersistType::FULL).ok()) {
      ICING_LOG(ERROR)
          << "Error persisting to disk in DocumentStore destructor";
    }
  }
}
200
// Factory for DocumentStore. Null-checks the inputs, constructs the store
// (private constructor) and runs Initialize(). On success returns both the
// store and the DataLoss state observed during initialization so the caller
// can react to ground-truth loss.
libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
    const Filesystem* filesystem, const std::string& base_dir,
    const Clock* clock, const SchemaStore* schema_store,
    bool force_recovery_and_revalidate_documents,
    InitializeStatsProto* initialize_stats) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);
  ICING_RETURN_ERROR_IF_NULL(clock);
  ICING_RETURN_ERROR_IF_NULL(schema_store);

  // The constructor is private, so make_unique cannot be used here.
  auto document_store = std::unique_ptr<DocumentStore>(
      new DocumentStore(filesystem, base_dir, clock, schema_store));
  ICING_ASSIGN_OR_RETURN(
      DataLoss data_loss,
      document_store->Initialize(force_recovery_and_revalidate_documents,
                                 initialize_stats));

  CreateResult create_result;
  create_result.document_store = std::move(document_store);
  create_result.data_loss = data_loss;
  return create_result;
}
222
// Opens/creates the document log and brings all derived files (key mapper,
// id mapper, caches, namespace/corpus mappers, usage store) into a usable
// state. Returns the DataLoss state reported by the log on success.
//
// Derived files are regenerated from the ground-truth log when (1) this is a
// new log with no derived files yet, (2) the caller forced recovery, or
// (3) the log reports data loss. Otherwise the existing derived files are
// loaded, falling back to regeneration if that fails.
libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
    bool force_recovery_and_revalidate_documents,
    InitializeStatsProto* initialize_stats) {
  auto create_result_or = DocumentLogCreator::Create(filesystem_, base_dir_);

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  if (!create_result_or.ok()) {
    ICING_LOG(ERROR) << create_result_or.status().error_message()
                     << "\nFailed to initialize DocumentLog.";
    return create_result_or.status();
  }
  DocumentLogCreator::CreateResult create_result =
      std::move(create_result_or).ValueOrDie();

  document_log_ = std::move(create_result.log_create_result.proto_log);

  if (create_result.regen_derived_files ||
      force_recovery_and_revalidate_documents ||
      create_result.log_create_result.has_data_loss()) {
    // We can't rely on any existing derived files. Recreate them from scratch.
    // Currently happens if:
    //   1) This is a new log and we don't have derived files yet
    //   2) Client wanted us to force a regeneration.
    //   3) Log has some data loss, can't rely on existing derived data.
    if (create_result.log_create_result.has_data_loss() &&
        initialize_stats != nullptr) {
      ICING_LOG(WARNING)
          << "Data loss in document log, regenerating derived files.";
      initialize_stats->set_document_store_recovery_cause(
          InitializeStatsProto::DATA_LOSS);

      if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) {
        // Ground truth is partially lost.
        initialize_stats->set_document_store_data_status(
            InitializeStatsProto::PARTIAL_LOSS);
      } else {
        // Ground truth is completely lost.
        initialize_stats->set_document_store_data_status(
            InitializeStatsProto::COMPLETE_LOSS);
      }
    }

    std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
    libtextclassifier3::Status status =
        RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
    if (initialize_stats != nullptr &&
        (force_recovery_and_revalidate_documents ||
         create_result.log_create_result.has_data_loss())) {
      // Only consider it a recovery if the client forced a recovery or there
      // was data loss. Otherwise, this could just be the first time we're
      // initializing and generating derived files.
      initialize_stats->set_document_store_recovery_latency_ms(
          document_recovery_timer->GetElapsedMilliseconds());
    }
    if (!status.ok()) {
      ICING_LOG(ERROR)
          << "Failed to regenerate derived files for DocumentStore";
      return status;
    }
  } else {
    // Derived files should be usable; try to load them, and regenerate from
    // the log as a fallback if loading fails for any reason.
    if (!InitializeExistingDerivedFiles().ok()) {
      ICING_VLOG(1)
          << "Couldn't find derived files or failed to initialize them, "
             "regenerating derived files for DocumentStore.";
      std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
      libtextclassifier3::Status status = RegenerateDerivedFiles(
          /*force_recovery_and_revalidate_documents*/ false);
      if (initialize_stats != nullptr && num_documents() > 0) {
        initialize_stats->set_document_store_recovery_cause(
            InitializeStatsProto::IO_ERROR);
        initialize_stats->set_document_store_recovery_latency_ms(
            document_recovery_timer->GetElapsedMilliseconds());
      }
      if (!status.ok()) {
        ICING_LOG(ERROR)
            << "Failed to regenerate derived files for DocumentStore";
        return status;
      }
    }
  }

  // Mark success before reading counters so the destructor knows it may
  // persist state.
  initialized_ = true;
  if (initialize_stats != nullptr) {
    initialize_stats->set_num_documents(document_id_mapper_->num_elements());
  }

  return create_result.log_create_result.data_loss;
}
312
// Loads all derived files that already exist on disk. Any failure (missing
// header, bad magic, unreadable file, or a combined-checksum mismatch
// against the header) returns an error, and the caller is expected to fall
// back to RegenerateDerivedFiles().
libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
  if (!HeaderExists()) {
    // Without a header, we don't know if things are consistent between each
    // other so the caller should just regenerate everything from ground
    // truth.
    return absl_ports::InternalError("DocumentStore header doesn't exist");
  }

  DocumentStore::Header header;
  if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
                         sizeof(header))) {
    return absl_ports::InternalError(
        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
  }

  if (header.magic != DocumentStore::Header::kMagic) {
    return absl_ports::InternalError(absl_ports::StrCat(
        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
  }

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_key_mapper_or =
      KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
  if (!document_key_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
                     << "Failed to initialize KeyMapper";
    return document_key_mapper_or.status();
  }
  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
  if (!document_id_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
                     << "Failed to initialize DocumentIdMapper";
    return document_id_mapper_or.status();
  }
  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(
      namespace_mapper_,
      KeyMapper<NamespaceId>::Create(*filesystem_,
                                     MakeNamespaceMapperFilename(base_dir_),
                                     kNamespaceMapperMaxSize));

  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  ICING_ASSIGN_OR_RETURN(corpus_mapper_,
                         KeyMapper<CorpusId>::Create(
                             *filesystem_, MakeCorpusMapperFilename(base_dir_),
                             kCorpusMapperMaxSize));

  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  // Ensure the usage store is the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  // Verify that the loaded derived files are the ones the header was written
  // for; a mismatch means they are stale or corrupted.
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  if (checksum.Get() != header.checksum) {
    return absl_ports::InternalError(
        "Combined checksum of DocStore was inconsistent");
  }

  return libtextclassifier3::Status::OK;
}
398
// Rebuilds every derived file from the ground-truth document log. All
// derived structures except the usage store are wiped and repopulated by
// replaying the log; usage scores cannot be regenerated, so the usage store
// is only re-opened and truncated to the final document count. If
// `revalidate_documents` is true, each logged document is re-checked against
// the current schema and erased from the log if no longer valid.
libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
    bool revalidate_documents) {
  ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
  ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
  ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
  ICING_RETURN_IF_ERROR(ResetFilterCache());
  ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
  ICING_RETURN_IF_ERROR(ResetCorpusMapper());
  ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());

  // Creates a new UsageStore instance. Note that we don't reset the data in
  // usage store here because we're not able to regenerate the usage scores.
  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  // Iterates through document log
  auto iterator = document_log_->GetIterator();
  auto iterator_status = iterator.Advance();
  while (iterator_status.ok()) {
    libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
        document_log_->ReadProto(iterator.GetOffset());

    if (absl_ports::IsNotFound(document_wrapper_or.status())) {
      // The erased document still occupies 1 document id.
      DocumentId new_document_id = document_id_mapper_->num_elements();
      ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
      iterator_status = iterator.Advance();
      continue;
    } else if (!document_wrapper_or.ok()) {
      return document_wrapper_or.status();
    }

    DocumentWrapper document_wrapper =
        std::move(document_wrapper_or).ValueOrDie();
    // Revalidate that this document is still compatible if requested.
    if (revalidate_documents) {
      if (!document_validator_.Validate(document_wrapper.document()).ok()) {
        // Document is no longer valid with the current schema. Mark as
        // deleted
        DocumentId new_document_id = document_id_mapper_->num_elements();
        ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
        ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
        // NOTE(review): unlike the NOT_FOUND branch above, this branch does
        // not call iterator.Advance() before continuing, so the next loop
        // iteration re-reads the same (now erased) offset and takes the
        // NOT_FOUND path, appearing to consume a second document id for this
        // document — confirm this double-accounting is intended.
        continue;
      }
    }
    // Updates key mapper and document_id mapper with the new document
    DocumentId new_document_id = document_id_mapper_->num_elements();
    ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
        MakeFingerprint(document_wrapper.document().namespace_(),
                        document_wrapper.document().uri()),
        new_document_id));
    ICING_RETURN_IF_ERROR(
        document_id_mapper_->Set(new_document_id, iterator.GetOffset()));

    SchemaTypeId schema_type_id;
    auto schema_type_id_or =
        schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
    if (absl_ports::IsNotFound(schema_type_id_or.status())) {
      // Didn't find a SchemaTypeId. This means that the DocumentStore and
      // the SchemaStore are out of sync. But DocumentStore can't do
      // anything about it so just ignore this for now. This should be
      // detected/handled by the owner of DocumentStore. Set it to some
      // arbitrary invalid value for now, it'll get updated to the correct
      // ID later.
      schema_type_id = -1;
    } else if (!schema_type_id_or.ok()) {
      // Real error. Pass it up
      return schema_type_id_or.status();
    } else {
      // We're guaranteed that SchemaTypeId is valid now
      schema_type_id = schema_type_id_or.ValueOrDie();
    }

    // Assign namespace ids in first-seen order.
    ICING_ASSIGN_OR_RETURN(
        NamespaceId namespace_id,
        namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
                                    namespace_mapper_->num_keys()));

    // Update corpus maps
    std::string corpus =
        MakeFingerprint(document_wrapper.document().namespace_(),
                        document_wrapper.document().schema());
    ICING_ASSIGN_OR_RETURN(
        CorpusId corpusId,
        corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));

    ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                           GetCorpusAssociatedScoreDataToUpdate(corpusId));
    scoring_data.AddDocument(
        document_wrapper.document().internal_fields().length_in_tokens());

    ICING_RETURN_IF_ERROR(
        UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));

    ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
        new_document_id,
        DocumentAssociatedScoreData(
            corpusId, document_wrapper.document().score(),
            document_wrapper.document().creation_timestamp_ms(),
            document_wrapper.document().internal_fields().length_in_tokens())));

    int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
        document_wrapper.document().creation_timestamp_ms(),
        document_wrapper.document().ttl_ms());

    ICING_RETURN_IF_ERROR(UpdateFilterCache(
        new_document_id, DocumentFilterData(namespace_id, schema_type_id,
                                            expiration_timestamp_ms)));
    iterator_status = iterator.Advance();
  }

  // OUT_OF_RANGE is the expected end-of-log signal; anything else is a real
  // iteration failure.
  if (!absl_ports::IsOutOfRange(iterator_status)) {
    ICING_LOG(WARNING)
        << "Failed to iterate through proto log while regenerating "
           "derived files";
    return absl_ports::Annotate(iterator_status,
                                "Failed to iterate through proto log.");
  }

  // Shrink usage_store_ to the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  // Write the header
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));

  return libtextclassifier3::Status::OK;
}
529
ResetDocumentKeyMapper()530 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
531 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
532 document_key_mapper_.reset();
533 // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
534 // that can support error logging.
535 libtextclassifier3::Status status =
536 KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
537 if (!status.ok()) {
538 ICING_LOG(ERROR) << status.error_message()
539 << "Failed to delete old key mapper";
540 return status;
541 }
542
543 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
544 // that can support error logging.
545 auto document_key_mapper_or =
546 KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
547 if (!document_key_mapper_or.ok()) {
548 ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
549 << "Failed to re-init key mapper";
550 return document_key_mapper_or.status();
551 }
552 document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
553 return libtextclassifier3::Status::OK;
554 }
555
ResetDocumentIdMapper()556 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
557 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
558 document_id_mapper_.reset();
559 // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
560 // that can support error logging.
561 libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
562 *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
563 if (!status.ok()) {
564 ICING_LOG(ERROR) << status.error_message()
565 << "Failed to delete old document_id mapper";
566 return status;
567 }
568 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
569 // that can support error logging.
570 auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
571 *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
572 MemoryMappedFile::READ_WRITE_AUTO_SYNC);
573 if (!document_id_mapper_or.ok()) {
574 ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
575 << "Failed to re-init document_id mapper";
576 return document_id_mapper_or.status();
577 }
578 document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
579 return libtextclassifier3::Status::OK;
580 }
581
// Discards the per-document score cache and recreates it empty
// (release -> delete on disk -> create fresh).
libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  score_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
      *filesystem_, MakeScoreCacheFilename(base_dir_)));
  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}
593
// Discards the per-corpus score cache and recreates it empty
// (release -> delete on disk -> create fresh).
libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  corpus_score_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
      *filesystem_, MakeCorpusScoreCache(base_dir_)));
  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}
605
// Discards the document filter cache and recreates it empty
// (release -> delete on disk -> create fresh).
libtextclassifier3::Status DocumentStore::ResetFilterCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  filter_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
      *filesystem_, MakeFilterCacheFilename(base_dir_)));
  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}
617
ResetNamespaceMapper()618 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
619 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
620 namespace_mapper_.reset();
621 // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
622 // that can support error logging.
623 libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
624 *filesystem_, MakeNamespaceMapperFilename(base_dir_));
625 if (!status.ok()) {
626 ICING_LOG(ERROR) << status.error_message()
627 << "Failed to delete old namespace_id mapper";
628 return status;
629 }
630 ICING_ASSIGN_OR_RETURN(
631 namespace_mapper_,
632 KeyMapper<NamespaceId>::Create(*filesystem_,
633 MakeNamespaceMapperFilename(base_dir_),
634 kNamespaceMapperMaxSize));
635 return libtextclassifier3::Status::OK;
636 }
637
ResetCorpusMapper()638 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
639 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
640 corpus_mapper_.reset();
641 // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
642 // that can support error logging.
643 libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
644 *filesystem_, MakeCorpusMapperFilename(base_dir_));
645 if (!status.ok()) {
646 ICING_LOG(ERROR) << status.error_message()
647 << "Failed to delete old corpus_id mapper";
648 return status;
649 }
650 ICING_ASSIGN_OR_RETURN(corpus_mapper_,
651 KeyMapper<CorpusId>::Create(
652 *filesystem_, MakeCorpusMapperFilename(base_dir_),
653 kCorpusMapperMaxSize));
654 return libtextclassifier3::Status::OK;
655 }
656
ComputeChecksum() const657 libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
658 Crc32 total_checksum;
659
660 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
661 // that can support error logging.
662 auto checksum_or = document_log_->ComputeChecksum();
663 if (!checksum_or.ok()) {
664 ICING_LOG(ERROR) << checksum_or.status().error_message()
665 << "Failed to compute checksum of DocumentLog";
666 return checksum_or.status();
667 }
668 Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
669
670 Crc32 document_key_mapper_checksum = document_key_mapper_->ComputeChecksum();
671
672 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
673 // that can support error logging.
674 checksum_or = document_id_mapper_->ComputeChecksum();
675 if (!checksum_or.ok()) {
676 ICING_LOG(ERROR) << checksum_or.status().error_message()
677 << "Failed to compute checksum of DocumentIdMapper";
678 return checksum_or.status();
679 }
680 Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
681
682 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
683 // that can support error logging.
684 checksum_or = score_cache_->ComputeChecksum();
685 if (!checksum_or.ok()) {
686 ICING_LOG(ERROR) << checksum_or.status().error_message()
687 << "Failed to compute checksum of score cache";
688 return checksum_or.status();
689 }
690 Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
691
692 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
693 // that can support error logging.
694 checksum_or = filter_cache_->ComputeChecksum();
695 if (!checksum_or.ok()) {
696 ICING_LOG(ERROR) << checksum_or.status().error_message()
697 << "Failed to compute checksum of filter cache";
698 return checksum_or.status();
699 }
700 Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
701
702 Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum();
703
704 Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum();
705
706 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
707 // that can support error logging.
708 checksum_or = corpus_score_cache_->ComputeChecksum();
709 if (!checksum_or.ok()) {
710 ICING_LOG(WARNING) << checksum_or.status().error_message()
711 << "Failed to compute checksum of score cache";
712 return checksum_or.status();
713 }
714 Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
715
716 // NOTE: We purposely don't include usage_store checksum here because we can't
717 // regenerate it from ground truth documents. If it gets corrupted, we'll just
718 // clear all usage reports, but we shouldn't throw everything else in the
719 // document store out.
720
721 total_checksum.Append(std::to_string(document_log_checksum.Get()));
722 total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
723 total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
724 total_checksum.Append(std::to_string(score_cache_checksum.Get()));
725 total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
726 total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
727 total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
728 total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
729
730 return total_checksum;
731 }
732
HeaderExists()733 bool DocumentStore::HeaderExists() {
734 if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
735 return false;
736 }
737
738 int64_t file_size =
739 filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
740
741 // If it's been truncated to size 0 before, we consider it to be a new file
742 return file_size != 0 && file_size != Filesystem::kBadFileSize;
743 }
744
UpdateHeader(const Crc32 & checksum)745 libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
746 // Write the header
747 DocumentStore::Header header;
748 header.magic = DocumentStore::Header::kMagic;
749 header.checksum = checksum.Get();
750
751 // This should overwrite the header.
752 ScopedFd sfd(
753 filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
754 if (!sfd.is_valid() ||
755 !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
756 !filesystem_->DataSync(sfd.get())) {
757 return absl_ports::InternalError(absl_ports::StrCat(
758 "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
759 }
760 return libtextclassifier3::Status::OK;
761 }
762
// Validates `document`, appends it to the ground-truth document log, and
// updates every derived structure (key mapper, id mapper, score and filter
// caches, namespace and corpus mappers). `document` is consumed — its
// contents are moved into the log wrapper. If a document with the same
// (namespace, uri) already exists, its usage scores are cloned onto the new
// document and the old one is deleted. Returns the newly assigned
// DocumentId, or an error if validation, logging, or any derived update
// fails. NOTE(review): if a derived update fails after the log write, the
// log and derived files may disagree until recovery — presumably handled by
// checksum-based regeneration; confirm.
libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
    DocumentProto& document, PutDocumentStatsProto* put_document_stats) {
  std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
  ICING_RETURN_IF_ERROR(document_validator_.Validate(document));

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_size(document.ByteSizeLong());
  }

  // Copy fields needed before they are moved (the document itself is moved
  // into the log below, after which these accessors would be invalid).
  std::string name_space = document.namespace_();
  std::string uri = document.uri();
  std::string schema = document.schema();
  int document_score = document.score();
  int32_t length_in_tokens = document.internal_fields().length_in_tokens();
  int64_t creation_timestamp_ms = document.creation_timestamp_ms();

  // Sets the creation timestamp if caller hasn't specified.
  if (document.creation_timestamp_ms() == 0) {
    creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
    document.set_creation_timestamp_ms(creation_timestamp_ms);
  }

  int64_t expiration_timestamp_ms =
      CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());

  // Update ground truth first: the log is the source of truth that derived
  // files are regenerated from, so it must be written before any of them.
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto offset_or =
      document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
  if (!offset_or.ok()) {
    ICING_LOG(ERROR) << offset_or.status().error_message()
                     << "Failed to write document";
    return offset_or.status();
  }
  int64_t file_offset = std::move(offset_or).ValueOrDie();

  // Get existing document id; NOT_FOUND simply means this is a fresh insert.
  auto old_document_id_or = GetDocumentId(name_space, uri);
  if (!old_document_id_or.ok() &&
      !absl_ports::IsNotFound(old_document_id_or.status())) {
    return absl_ports::InternalError("Failed to read from key mapper");
  }

  // Creates a new document id, updates key mapper and document_id mapper.
  // Ids are assigned densely: the next id is the current element count.
  DocumentId new_document_id = document_id_mapper_->num_elements();
  if (!IsDocumentIdValid(new_document_id)) {
    return absl_ports::ResourceExhaustedError(
        "Exceeded maximum number of documents. Try calling Optimize to reclaim "
        "some space.");
  }

  ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
      MakeFingerprint(name_space, uri), new_document_id));
  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));

  // Update namespace maps
  ICING_ASSIGN_OR_RETURN(
      NamespaceId namespace_id,
      namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));

  // Update corpus maps; a corpus is identified by (namespace, schema).
  ICING_ASSIGN_OR_RETURN(
      CorpusId corpusId,
      corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema),
                               corpus_mapper_->num_keys()));

  ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                         GetCorpusAssociatedScoreDataToUpdate(corpusId));
  scoring_data.AddDocument(length_in_tokens);

  ICING_RETURN_IF_ERROR(
      UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));

  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
      new_document_id,
      DocumentAssociatedScoreData(corpusId, document_score,
                                  creation_timestamp_ms, length_in_tokens)));

  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                         schema_store_->GetSchemaTypeId(schema));

  ICING_RETURN_IF_ERROR(UpdateFilterCache(
      new_document_id, DocumentFilterData(namespace_id, schema_type_id,
                                          expiration_timestamp_ms)));

  if (old_document_id_or.ok()) {
    // The old document exists, copy over the usage scores and delete the old
    // document.
    DocumentId old_document_id = old_document_id_or.ValueOrDie();

    ICING_RETURN_IF_ERROR(
        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
                                       /*to_document_id=*/new_document_id));

    // Delete the old document. It's fine if it's not found since it might have
    // been deleted previously.
    auto delete_status = Delete(old_document_id);
    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
      // Real error, pass it up.
      return delete_status;
    }
  }

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_store_latency_ms(
        put_timer->GetElapsedMilliseconds());
  }

  return new_document_id;
}
875
Get(const std::string_view name_space,const std::string_view uri,bool clear_internal_fields) const876 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
877 const std::string_view name_space, const std::string_view uri,
878 bool clear_internal_fields) const {
879 // TODO(b/147231617): Make a better way to replace the error message in an
880 // existing Status.
881 auto document_id_or = GetDocumentId(name_space, uri);
882 if (absl_ports::IsNotFound(document_id_or.status())) {
883 ICING_VLOG(1) << document_id_or.status().error_message();
884 return libtextclassifier3::Status(
885 document_id_or.status().CanonicalCode(),
886 IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
887 name_space.data(), uri.data()));
888 }
889 DocumentId document_id = document_id_or.ValueOrDie();
890
891 // TODO(b/147231617): Make a better way to replace the error message in an
892 // existing Status.
893 auto status_or = Get(document_id);
894 if (absl_ports::IsNotFound(status_or.status())) {
895 ICING_LOG(ERROR) << document_id_or.status().error_message();
896 return libtextclassifier3::Status(
897 status_or.status().CanonicalCode(),
898 IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
899 name_space.data(), uri.data()));
900 }
901 return status_or;
902 }
903
Get(DocumentId document_id,bool clear_internal_fields) const904 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
905 DocumentId document_id, bool clear_internal_fields) const {
906 ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
907
908 auto document_log_offset_or = document_id_mapper_->Get(document_id);
909 if (!document_log_offset_or.ok()) {
910 // Since we've just checked that our document_id is valid a few lines
911 // above, there's no reason this should fail and an error should never
912 // happen.
913 return absl_ports::InternalError("Failed to find document offset.");
914 }
915 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
916
917 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
918 // that can support error logging.
919 auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
920 if (!document_wrapper_or.ok()) {
921 ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
922 << "Failed to read from document log";
923 return document_wrapper_or.status();
924 }
925 DocumentWrapper document_wrapper =
926 std::move(document_wrapper_or).ValueOrDie();
927 if (clear_internal_fields) {
928 document_wrapper.mutable_document()->clear_internal_fields();
929 }
930
931 return std::move(*document_wrapper.mutable_document());
932 }
933
GetDocumentId(const std::string_view name_space,const std::string_view uri) const934 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
935 const std::string_view name_space, const std::string_view uri) const {
936 auto document_id_or =
937 document_key_mapper_->Get(MakeFingerprint(name_space, uri));
938 if (!document_id_or.ok()) {
939 return absl_ports::Annotate(
940 document_id_or.status(),
941 absl_ports::StrCat("Failed to find DocumentId by key: ", name_space,
942 ", ", uri));
943 }
944
945 // Guaranteed to have a DocumentId now
946 return document_id_or.ValueOrDie();
947 }
948
GetAllNamespaces() const949 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
950 std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
951 namespace_mapper_->GetValuesToKeys();
952
953 std::unordered_set<NamespaceId> existing_namespace_ids;
954 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
955 ++document_id) {
956 // filter_cache_->Get can only fail if document_id is < 0
957 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
958 auto status_or_data = filter_cache_->Get(document_id);
959 if (!status_or_data.ok()) {
960 ICING_LOG(ERROR)
961 << "Error while iterating over filter cache in GetAllNamespaces";
962 return std::vector<std::string>();
963 }
964 const DocumentFilterData* data = status_or_data.ValueOrDie();
965
966 if (InternalDoesDocumentExist(document_id)) {
967 existing_namespace_ids.insert(data->namespace_id());
968 }
969 }
970
971 std::vector<std::string> existing_namespaces;
972 for (auto itr = existing_namespace_ids.begin();
973 itr != existing_namespace_ids.end(); ++itr) {
974 existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
975 }
976 return existing_namespaces;
977 }
978
DoesDocumentExist(DocumentId document_id) const979 bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
980 if (!IsDocumentIdValid(document_id)) {
981 return false;
982 }
983
984 if (document_id >= document_id_mapper_->num_elements()) {
985 // Somehow got an validly constructed document_id that the document store
986 // doesn't know about
987 return false;
988 }
989
990 return InternalDoesDocumentExist(document_id);
991 }
992
DoesDocumentExistWithStatus(DocumentId document_id) const993 libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus(
994 DocumentId document_id) const {
995 if (!IsDocumentIdValid(document_id)) {
996 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
997 "Document id '%d' invalid.", document_id));
998 }
999
1000 if (document_id >= document_id_mapper_->num_elements()) {
1001 // Somehow got a validly constructed document_id that the document store
1002 // doesn't know about.
1003 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1004 "Unknown document id '%d'.", document_id));
1005 }
1006
1007 if (!InternalDoesDocumentExist(document_id)) {
1008 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1009 "Document id '%d' doesn't exist", document_id));
1010 };
1011 return libtextclassifier3::Status::OK;
1012 }
1013
InternalDoesDocumentExist(DocumentId document_id) const1014 bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const {
1015 return !IsDeleted(document_id) && !IsExpired(document_id);
1016 }
1017
IsDeleted(DocumentId document_id) const1018 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1019 auto file_offset_or = document_id_mapper_->Get(document_id);
1020 if (!file_offset_or.ok()) {
1021 // This would only happen if document_id is out of range of the
1022 // document_id_mapper, meaning we got some invalid document_id. Callers
1023 // should already have checked that their document_id is valid or used
1024 // DoesDocumentExist(WithStatus). Regardless, return true since the
1025 // document doesn't exist.
1026 return true;
1027 }
1028 int64_t file_offset = *file_offset_or.ValueOrDie();
1029 return file_offset == kDocDeletedFlag;
1030 }
1031
IsExpired(DocumentId document_id) const1032 bool DocumentStore::IsExpired(DocumentId document_id) const {
1033 auto filter_data_or = filter_cache_->Get(document_id);
1034 if (!filter_data_or.ok()) {
1035 // This would only happen if document_id is out of range of the
1036 // filter_cache, meaning we got some invalid document_id. Callers should
1037 // already have checked that their document_id is valid or used
1038 // DoesDocumentExist(WithStatus). Regardless, return true since the
1039 // document doesn't exist.
1040 return true;
1041 }
1042 const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1043
1044 // Check if it's past the expiration time
1045 return clock_.GetSystemTimeMilliseconds() >=
1046 filter_data->expiration_timestamp_ms();
1047 }
1048
Delete(const std::string_view name_space,const std::string_view uri)1049 libtextclassifier3::Status DocumentStore::Delete(
1050 const std::string_view name_space, const std::string_view uri) {
1051 // Try to get the DocumentId first
1052 auto document_id_or = GetDocumentId(name_space, uri);
1053 if (!document_id_or.ok()) {
1054 return absl_ports::Annotate(
1055 document_id_or.status(),
1056 absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1057 ", uri: ", uri));
1058 }
1059 return Delete(document_id_or.ValueOrDie());
1060 }
1061
Delete(DocumentId document_id)1062 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) {
1063 ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
1064
1065 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1066 if (!document_log_offset_or.ok()) {
1067 return absl_ports::InternalError("Failed to find document offset.");
1068 }
1069 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1070
1071 // Erases document proto.
1072 ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1073 return ClearDerivedData(document_id);
1074 }
1075
GetNamespaceId(std::string_view name_space) const1076 libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
1077 std::string_view name_space) const {
1078 return namespace_mapper_->Get(name_space);
1079 }
1080
GetCorpusId(const std::string_view name_space,const std::string_view schema) const1081 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1082 const std::string_view name_space, const std::string_view schema) const {
1083 return corpus_mapper_->Get(MakeFingerprint(name_space, schema));
1084 }
1085
1086 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const1087 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1088 if (!DoesDocumentExist(document_id)) {
1089 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1090 "Can't get usage scores, document id '%d' doesn't exist", document_id));
1091 }
1092
1093 auto score_data_or = score_cache_->GetCopy(document_id);
1094 if (!score_data_or.ok()) {
1095 ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1096 << " from score_cache_";
1097 return score_data_or.status();
1098 }
1099
1100 DocumentAssociatedScoreData document_associated_score_data =
1101 std::move(score_data_or).ValueOrDie();
1102 if (document_associated_score_data.document_score() < 0) {
1103 // An negative / invalid score means that the score data has been deleted.
1104 return absl_ports::NotFoundError("Document score data not found.");
1105 }
1106 return document_associated_score_data;
1107 }
1108
1109 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreData(CorpusId corpus_id) const1110 DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
1111 auto score_data_or = corpus_score_cache_->GetCopy(corpus_id);
1112 if (!score_data_or.ok()) {
1113 return score_data_or.status();
1114 }
1115
1116 CorpusAssociatedScoreData corpus_associated_score_data =
1117 std::move(score_data_or).ValueOrDie();
1118 return corpus_associated_score_data;
1119 }
1120
1121 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const1122 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1123 auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1124 if (corpus_scoring_data_or.ok()) {
1125 return std::move(corpus_scoring_data_or).ValueOrDie();
1126 }
1127 CorpusAssociatedScoreData scoringData;
1128 // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1129 // corpus_score_cache_ for the first time.
1130 if (corpus_scoring_data_or.status().CanonicalCode() ==
1131 libtextclassifier3::StatusCode::OUT_OF_RANGE) {
1132 return scoringData;
1133 }
1134 return corpus_scoring_data_or.status();
1135 }
1136
1137 libtextclassifier3::StatusOr<DocumentFilterData>
GetDocumentFilterData(DocumentId document_id) const1138 DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
1139 if (!DoesDocumentExist(document_id)) {
1140 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1141 "Can't get filter data, document id '%d' doesn't exist", document_id));
1142 }
1143
1144 auto filter_data_or = filter_cache_->GetCopy(document_id);
1145 if (!filter_data_or.ok()) {
1146 ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1147 << " from filter_cache_";
1148 return filter_data_or.status();
1149 }
1150 DocumentFilterData document_filter_data =
1151 std::move(filter_data_or).ValueOrDie();
1152 return document_filter_data;
1153 }
1154
1155 libtextclassifier3::StatusOr<UsageStore::UsageScores>
GetUsageScores(DocumentId document_id) const1156 DocumentStore::GetUsageScores(DocumentId document_id) const {
1157 if (!DoesDocumentExist(document_id)) {
1158 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1159 "Can't get usage scores, document id '%d' doesn't exist", document_id));
1160 }
1161 return usage_store_->GetUsageScores(document_id);
1162 }
1163
ReportUsage(const UsageReport & usage_report)1164 libtextclassifier3::Status DocumentStore::ReportUsage(
1165 const UsageReport& usage_report) {
1166 ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1167 GetDocumentId(usage_report.document_namespace(),
1168 usage_report.document_uri()));
1169 // We can use the internal version here because we got our document_id from
1170 // our internal data structures. We would have thrown some error if the
1171 // namespace and/or uri were incorrect.
1172 if (!InternalDoesDocumentExist(document_id)) {
1173 // Document was probably deleted or expired.
1174 return absl_ports::NotFoundError(absl_ports::StrCat(
1175 "Couldn't report usage on a nonexistent document: (namespace: '",
1176 usage_report.document_namespace(), "', uri: '",
1177 usage_report.document_uri(), "')"));
1178 }
1179
1180 return usage_store_->AddUsageReport(usage_report, document_id);
1181 }
1182
DeleteByNamespace(std::string_view name_space)1183 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1184 std::string_view name_space) {
1185 DeleteByGroupResult result;
1186 auto namespace_id_or = namespace_mapper_->Get(name_space);
1187 if (!namespace_id_or.ok()) {
1188 result.status = absl_ports::Annotate(
1189 namespace_id_or.status(),
1190 absl_ports::StrCat("Failed to find namespace: ", name_space));
1191 return result;
1192 }
1193 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1194 auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1195 if (!num_deleted_or.ok()) {
1196 result.status = std::move(num_deleted_or).status();
1197 return result;
1198 }
1199
1200 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1201 if (result.num_docs_deleted <= 0) {
1202 // Treat the fact that no existing documents had this namespace to be the
1203 // same as this namespace not existing at all.
1204 result.status = absl_ports::NotFoundError(
1205 absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1206 return result;
1207 }
1208
1209 return result;
1210 }
1211
DeleteBySchemaType(std::string_view schema_type)1212 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1213 std::string_view schema_type) {
1214 DeleteByGroupResult result;
1215 auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1216 if (!schema_type_id_or.ok()) {
1217 result.status = absl_ports::Annotate(
1218 schema_type_id_or.status(),
1219 absl_ports::StrCat("Failed to find schema type. schema_type: ",
1220 schema_type));
1221 return result;
1222 }
1223 SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1224 auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1225 if (!num_deleted_or.ok()) {
1226 result.status = std::move(num_deleted_or).status();
1227 return result;
1228 }
1229
1230 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1231 if (result.num_docs_deleted <= 0) {
1232 result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1233 "No documents found with schema type '", schema_type, "'"));
1234 return result;
1235 }
1236
1237 return result;
1238 }
1239
BatchDelete(NamespaceId namespace_id,SchemaTypeId schema_type_id)1240 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1241 NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1242 // Tracks if there were any existing documents with this namespace that we
1243 // will mark as deleted.
1244 int num_updated_documents = 0;
1245
1246 // Traverse FilterCache and delete all docs that match namespace_id and
1247 // schema_type_id.
1248 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1249 ++document_id) {
1250 // filter_cache_->Get can only fail if document_id is < 0
1251 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1252 ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1253 filter_cache_->Get(document_id));
1254
1255 // Check namespace only when the input namespace id is valid.
1256 if (namespace_id != kInvalidNamespaceId &&
1257 (data->namespace_id() == kInvalidNamespaceId ||
1258 data->namespace_id() != namespace_id)) {
1259 // The document has already been hard-deleted or isn't from the desired
1260 // namespace.
1261 continue;
1262 }
1263
1264 // Check schema type only when the input schema type id is valid.
1265 if (schema_type_id != kInvalidSchemaTypeId &&
1266 (data->schema_type_id() == kInvalidSchemaTypeId ||
1267 data->schema_type_id() != schema_type_id)) {
1268 // The document has already been hard-deleted or doesn't have the
1269 // desired schema type.
1270 continue;
1271 }
1272
1273 // The document has the desired namespace and schema type, it either
1274 // exists or has expired.
1275 libtextclassifier3::Status delete_status = Delete(document_id);
1276 if (absl_ports::IsNotFound(delete_status)) {
1277 continue;
1278 } else if (!delete_status.ok()) {
1279 // Real error, pass up.
1280 return delete_status;
1281 }
1282 ++num_updated_documents;
1283 }
1284
1285 return num_updated_documents;
1286 }
1287
PersistToDisk(PersistType::Code persist_type)1288 libtextclassifier3::Status DocumentStore::PersistToDisk(
1289 PersistType::Code persist_type) {
1290 if (persist_type == PersistType::LITE) {
1291 // only persist the document log.
1292 return document_log_->PersistToDisk();
1293 }
1294 ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1295 ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1296 ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1297 ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1298 ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1299 ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1300 ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1301 ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1302 ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1303
1304 // Update the combined checksum and write to header file.
1305 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1306 ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
1307
1308 return libtextclassifier3::Status::OK;
1309 }
1310
GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t> & value_or,int64_t default_value)1311 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1312 int64_t default_value) {
1313 return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1314 }
1315
GetMemberStorageInfo() const1316 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1317 DocumentStorageInfoProto storage_info;
1318 storage_info.set_document_log_size(
1319 GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1320 storage_info.set_key_mapper_size(
1321 GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1322 storage_info.set_document_id_mapper_size(
1323 GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1324 storage_info.set_score_cache_size(
1325 GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1326 storage_info.set_filter_cache_size(
1327 GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1328 storage_info.set_namespace_id_mapper_size(
1329 GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1330 storage_info.set_corpus_mapper_size(
1331 GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1332 storage_info.set_corpus_score_cache_size(
1333 GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1334 return storage_info;
1335 }
1336
// Walks every document id ever assigned and tallies alive / expired /
// deleted counts, both in total and per namespace (with per-usage-type
// breakdowns for alive and expired documents). The tallies are added onto
// `storage_info`, which is then returned. Documents whose filter data,
// namespace, or usage scores can't be read are logged and skipped rather
// than failing the whole computation.
DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
    DocumentStorageInfoProto storage_info) const {
  int total_num_alive = 0;
  int total_num_expired = 0;
  int total_num_deleted = 0;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      namespace_mapper_->GetValuesToKeys();
  std::unordered_map<std::string, NamespaceStorageInfoProto>
      namespace_to_storage_info;

  for (DocumentId document_id = 0;
       document_id < document_id_mapper_->num_elements(); ++document_id) {
    // Check if it's deleted first.
    if (IsDeleted(document_id)) {
      // We don't have the namespace id of hard deleted documents anymore, so
      // we can't add to our namespace storage info.
      ++total_num_deleted;
      continue;
    }

    // At this point, the document is either alive or expired, we can get
    // namespace info for it.
    auto filter_data_or = filter_cache_->Get(document_id);
    if (!filter_data_or.ok()) {
      ICING_VLOG(1) << "Error trying to get filter data for document store "
                       "storage info counts.";
      continue;
    }
    const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
    auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
    if (itr == namespace_id_to_namespace.end()) {
      ICING_VLOG(1) << "Error trying to find namespace for document store "
                       "storage info counts.";
      continue;
    }
    const std::string& name_space = itr->second;

    // Always set the namespace, if the NamespaceStorageInfoProto didn't exist
    // before, we'll get back a default instance of it.
    NamespaceStorageInfoProto& namespace_storage_info =
        namespace_to_storage_info[name_space];
    namespace_storage_info.set_namespace_(name_space);

    // Get usage scores
    auto usage_scores_or = usage_store_->GetUsageScores(document_id);
    if (!usage_scores_or.ok()) {
      ICING_VLOG(1) << "Error trying to get usage scores for document store "
                       "storage info counts.";
      continue;
    }
    UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();

    // Update our stats. Each usage-type count contributes at most 1 per
    // document: we count documents *with* usage, not total usage events.
    if (IsExpired(document_id)) {
      ++total_num_expired;
      namespace_storage_info.set_num_expired_documents(
          namespace_storage_info.num_expired_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type1(
            namespace_storage_info.num_expired_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type2(
            namespace_storage_info.num_expired_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type3(
            namespace_storage_info.num_expired_documents_usage_type3() + 1);
      }
    } else {
      ++total_num_alive;
      namespace_storage_info.set_num_alive_documents(
          namespace_storage_info.num_alive_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type1(
            namespace_storage_info.num_alive_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type2(
            namespace_storage_info.num_alive_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type3(
            namespace_storage_info.num_alive_documents_usage_type3() + 1);
      }
    }
  }

  // Move the accumulated per-namespace protos into the output.
  for (auto& itr : namespace_to_storage_info) {
    storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
  }
  storage_info.set_num_alive_documents(total_num_alive);
  storage_info.set_num_deleted_documents(total_num_deleted);
  storage_info.set_num_expired_documents(total_num_expired);
  return storage_info;
}
1433
GetStorageInfo() const1434 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1435 DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1436 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1437 if (directory_size != Filesystem::kBadFileSize) {
1438 storage_info.set_document_store_size(directory_size);
1439 } else {
1440 storage_info.set_document_store_size(-1);
1441 }
1442 storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1443 return CalculateDocumentStatusCounts(std::move(storage_info));
1444 }
1445
// Points this store at a new SchemaStore and reconciles every existing
// document with it: documents that still validate get their cached
// SchemaTypeId refreshed (ids may differ in the new store); documents that
// no longer validate are deleted. Skips ids that no longer refer to live
// documents.
libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
    const SchemaStore* schema_store) {
  // Update all references to the SchemaStore
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    auto document_or = Get(document_id);
    if (absl_ports::IsNotFound(document_or.status())) {
      // Skip nonexistent documents
      continue;
    } else if (!document_or.ok()) {
      // Real error, pass up
      return absl_ports::Annotate(
          document_or.status(),
          IcingStringUtil::StringPrintf(
              "Failed to retrieve Document for DocumentId %d", document_id));
    }

    // Guaranteed to have a document now.
    DocumentProto document = document_or.ValueOrDie();

    // Revalidate that this document is still compatible
    if (document_validator_.Validate(document).ok()) {
      // Update the SchemaTypeId for this entry: the same schema name may map
      // to a different id in the new SchemaStore.
      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                             schema_store_->GetSchemaTypeId(document.schema()));
      filter_cache_->mutable_array()[document_id].set_schema_type_id(
          schema_type_id);
    } else {
      // Document is no longer valid with the new SchemaStore. Mark as
      // deleted
      auto delete_status = Delete(document.namespace_(), document.uri());
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
1489
// Like UpdateSchemaStore, but uses |set_schema_result| to restrict work to
// the schema types the schema change actually touched: documents of a
// deleted type are removed, documents whose type was assigned a different
// SchemaTypeId get their filter-cache entry refreshed, and documents of an
// incompatible type are revalidated (and deleted if they no longer
// validate). If the schema set did not succeed, this is a no-op.
libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
    const SchemaStore* schema_store,
    const SchemaStore::SetSchemaResult& set_schema_result) {
  if (!set_schema_result.success) {
    // No new schema was set, no work to be done
    return libtextclassifier3::Status::OK;
  }

  // Update all references to the SchemaStore
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    if (!InternalDoesDocumentExist(document_id)) {
      // Skip nonexistent documents
      continue;
    }

    // Guaranteed that the document exists now.
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));

    // The document's type was removed from the schema entirely, so the
    // document must go. NOTE: revalidation below can overwrite this flag.
    bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
                               filter_data->schema_type_id()) != 0;

    // Check if we need to update the FilterCache entry for this document. It
    // may have been assigned a different SchemaTypeId in the new SchemaStore.
    bool update_filter_cache =
        set_schema_result.old_schema_type_ids_changed.count(
            filter_data->schema_type_id()) != 0;

    // Check if we need to revalidate this document if the type is now
    // incompatible
    bool revalidate_document =
        set_schema_result.schema_types_incompatible_by_id.count(
            filter_data->schema_type_id()) != 0;

    if (update_filter_cache || revalidate_document) {
      // Only fetch the (potentially large) document when one of the two
      // per-document actions actually needs its contents.
      ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));

      if (update_filter_cache) {
        ICING_ASSIGN_OR_RETURN(
            SchemaTypeId schema_type_id,
            schema_store_->GetSchemaTypeId(document.schema()));
        filter_cache_->mutable_array()[document_id].set_schema_type_id(
            schema_type_id);
      }
      if (revalidate_document) {
        delete_document = !document_validator_.Validate(document).ok();
      }
    }

    if (delete_document) {
      // Document is no longer valid with the new SchemaStore. Mark as deleted
      auto delete_status = Delete(document_id);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
1555
// TODO(b/121227117): Implement Optimize()
// Currently a placeholder: always reports success without reclaiming any
// space. See OptimizeInto for the copy-into-new-directory optimization path.
libtextclassifier3::Status DocumentStore::Optimize() {
  return libtextclassifier3::Status::OK;
}
1560
OptimizeInto(const std::string & new_directory,const LanguageSegmenter * lang_segmenter,OptimizeStatsProto * stats)1561 libtextclassifier3::Status DocumentStore::OptimizeInto(
1562 const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
1563 OptimizeStatsProto* stats) {
1564 // Validates directory
1565 if (new_directory == base_dir_) {
1566 return absl_ports::InvalidArgumentError(
1567 "New directory is the same as the current one.");
1568 }
1569
1570 ICING_ASSIGN_OR_RETURN(auto doc_store_create_result,
1571 DocumentStore::Create(filesystem_, new_directory,
1572 &clock_, schema_store_));
1573 std::unique_ptr<DocumentStore> new_doc_store =
1574 std::move(doc_store_create_result.document_store);
1575
1576 // Writes all valid docs into new document store (new directory)
1577 int size = document_id_mapper_->num_elements();
1578 int num_deleted = 0;
1579 int num_expired = 0;
1580 UsageStore::UsageScores default_usage;
1581 for (DocumentId document_id = 0; document_id < size; document_id++) {
1582 auto document_or = Get(document_id, /*clear_internal_fields=*/false);
1583 if (absl_ports::IsNotFound(document_or.status())) {
1584 if (IsDeleted(document_id)) {
1585 ++num_deleted;
1586 } else if (IsExpired(document_id)) {
1587 ++num_expired;
1588 }
1589 continue;
1590 } else if (!document_or.ok()) {
1591 // Real error, pass up
1592 return absl_ports::Annotate(
1593 document_or.status(),
1594 IcingStringUtil::StringPrintf(
1595 "Failed to retrieve Document for DocumentId %d", document_id));
1596 }
1597
1598 // Guaranteed to have a document now.
1599 DocumentProto document_to_keep = document_or.ValueOrDie();
1600
1601 libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
1602 if (document_to_keep.internal_fields().length_in_tokens() == 0) {
1603 auto tokenized_document_or = TokenizedDocument::Create(
1604 schema_store_, lang_segmenter, document_to_keep);
1605 if (!tokenized_document_or.ok()) {
1606 return absl_ports::Annotate(
1607 tokenized_document_or.status(),
1608 IcingStringUtil::StringPrintf(
1609 "Failed to tokenize Document for DocumentId %d", document_id));
1610 }
1611 TokenizedDocument tokenized_document(
1612 std::move(tokenized_document_or).ValueOrDie());
1613 new_document_id_or =
1614 new_doc_store->Put(document_to_keep, tokenized_document.num_tokens());
1615 } else {
1616 // TODO(b/144458732): Implement a more robust version of
1617 // TC_ASSIGN_OR_RETURN that can support error logging.
1618 new_document_id_or = new_doc_store->InternalPut(document_to_keep);
1619 }
1620 if (!new_document_id_or.ok()) {
1621 ICING_LOG(ERROR) << new_document_id_or.status().error_message()
1622 << "Failed to write into new document store";
1623 return new_document_id_or.status();
1624 }
1625
1626 // Copy over usage scores.
1627 ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
1628 usage_store_->GetUsageScores(document_id));
1629 if (!(usage_scores == default_usage)) {
1630 // If the usage scores for this document are the default (no usage), then
1631 // don't bother setting it. No need to possibly allocate storage if
1632 // there's nothing interesting to store.
1633 DocumentId new_document_id = new_document_id_or.ValueOrDie();
1634 ICING_RETURN_IF_ERROR(
1635 new_doc_store->SetUsageScores(new_document_id, usage_scores));
1636 }
1637 }
1638 if (stats != nullptr) {
1639 stats->set_num_original_documents(size);
1640 stats->set_num_deleted_documents(num_deleted);
1641 stats->set_num_expired_documents(num_expired);
1642 }
1643 ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
1644 return libtextclassifier3::Status::OK;
1645 }
1646
1647 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
GetOptimizeInfo() const1648 DocumentStore::GetOptimizeInfo() const {
1649 OptimizeInfo optimize_info;
1650
1651 // Figure out our ratio of optimizable/total docs.
1652 int32_t num_documents = document_id_mapper_->num_elements();
1653 for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
1654 ++document_id) {
1655 if (!InternalDoesDocumentExist(document_id)) {
1656 ++optimize_info.optimizable_docs;
1657 }
1658
1659 ++optimize_info.total_docs;
1660 }
1661
1662 if (optimize_info.total_docs == 0) {
1663 // Can exit early since there's nothing to calculate.
1664 return optimize_info;
1665 }
1666
1667 // Get the total element size.
1668 //
1669 // We use file size instead of disk usage here because the files are not
1670 // sparse, so it's more accurate. Disk usage rounds up to the nearest block
1671 // size.
1672 ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
1673 document_log_->GetElementsFileSize());
1674 ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
1675 document_id_mapper_->GetElementsFileSize());
1676 ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
1677 score_cache_->GetElementsFileSize());
1678 ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
1679 filter_cache_->GetElementsFileSize());
1680 ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
1681 corpus_score_cache_->GetElementsFileSize());
1682
1683 // Usage store might be sparse, but we'll still use file size for more
1684 // accurate counting.
1685 ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
1686 usage_store_->GetElementsFileSize());
1687
1688 // We use a combined disk usage and file size for the KeyMapper because it's
1689 // backed by a trie, which has some sparse property bitmaps.
1690 ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
1691 document_key_mapper_->GetElementsSize());
1692
1693 // We don't include the namespace_mapper or the corpus_mapper because it's
1694 // not clear if we could recover any space even if Optimize were called.
1695 // Deleting 100s of documents could still leave a few documents of a
1696 // namespace, and then there would be no change.
1697
1698 int64_t total_size = document_log_file_size + document_key_mapper_size +
1699 document_id_mapper_file_size + score_cache_file_size +
1700 filter_cache_file_size + corpus_score_cache_file_size +
1701 usage_store_file_size;
1702
1703 optimize_info.estimated_optimizable_bytes =
1704 total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
1705 return optimize_info;
1706 }
1707
// Stores |score_data| as the corpus-scoring entry for |corpus_id|,
// overwriting whatever was previously at that slot. Returns the status of
// the underlying cache write.
libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
    CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
  return corpus_score_cache_->Set(corpus_id, score_data);
}
1712
// Stores |score_data| as the scoring entry for |document_id|, overwriting
// whatever was previously at that slot. Returns the status of the
// underlying cache write.
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
    DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
  return score_cache_->Set(document_id, score_data);
}
1717
// Stores |filter_data| as the filter entry for |document_id|, overwriting
// whatever was previously at that slot. Returns the status of the
// underlying cache write.
libtextclassifier3::Status DocumentStore::UpdateFilterCache(
    DocumentId document_id, const DocumentFilterData& filter_data) {
  return filter_cache_->Set(document_id, filter_data);
}
1722
ClearDerivedData(DocumentId document_id)1723 libtextclassifier3::Status DocumentStore::ClearDerivedData(
1724 DocumentId document_id) {
1725 // We intentionally leave the data in key_mapper_ because locating that data
1726 // requires fetching namespace and uri. Leaving data in key_mapper_ should
1727 // be fine because the data is hashed.
1728
1729 ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
1730
1731 // Resets the score cache entry
1732 ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
1733 document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
1734 /*document_score=*/-1,
1735 /*creation_timestamp_ms=*/-1,
1736 /*length_in_tokens=*/0)));
1737
1738 // Resets the filter cache entry
1739 ICING_RETURN_IF_ERROR(UpdateFilterCache(
1740 document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
1741 /*expiration_timestamp_ms=*/-1)));
1742
1743 // Clears the usage scores.
1744 return usage_store_->DeleteUsageScores(document_id);
1745 }
1746
// Forwards |usage_scores| to the usage store, overwriting the scores
// currently recorded for |document_id|.
libtextclassifier3::Status DocumentStore::SetUsageScores(
    DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
  return usage_store_->SetUsageScores(document_id, usage_scores);
}
1751
1752 } // namespace lib
1753 } // namespace icing
1754