// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/store/document-store.h"

#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/text_classifier/lib3/utils/hash/farmhash.h"
#include "icing/absl_ports/annotate.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/file-backed-proto-log.h"
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/file/portable-file-backed-proto-log.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/usage.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/store/corpus-associated-scoring-data.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/document-log-creator.h"
#include "icing/store/dynamic-trie-key-mapper.h"
#include "icing/store/namespace-id.h"
#include "icing/store/usage-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/encode-util.h"
#include "icing/util/fingerprint-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include "icing/util/tokenized-document.h"

namespace icing {
namespace lib {

namespace {

// Used in DocumentId mapper to mark a document as deleted
constexpr int64_t kDocDeletedFlag = -1;
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
constexpr char kUsageStoreDirectoryName[] = "usage_store";
constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";

// Determined through manual testing to allow for 1 million uris. 1 million
// because we allow up to 1 million DocumentIds.
constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024;  // 36 MiB

// 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
// max of 128 KiB for storage.
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB

DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
  DocumentWrapper document_wrapper;
  *document_wrapper.mutable_document() = std::move(document);
  return document_wrapper;
}

std::string MakeHeaderFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
}

std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
}

std::string MakeScoreCacheFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
}

std::string MakeCorpusScoreCache(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
}

std::string MakeFilterCacheFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
}

std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
}

std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
}

std::string MakeCorpusMapperFilename(const std::string& base_dir) {
  return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
}

// This function encodes a namespace id into a fixed 3-byte string.
std::string EncodeNamespaceId(NamespaceId namespace_id) {
  // The encoding is 1 to 3 bytes long, depending on the value of namespace_id.
  std::string encoding = encode_util::EncodeIntToCString(namespace_id);
  // Pad the encoding to exactly 3 bytes.
  while (encoding.size() < 3) {
    // DynamicTrie cannot handle keys with 0 as bytes, so we append it using 1,
    // just like what we do in encode_util::EncodeIntToCString.
    //
    // The reason that this works is because DecodeIntToString decodes a byte
    // value of 0x01 as 0x00. When EncodeIntToCString returns a namespace id
    // encoding that is less than 3 bytes, it means that the id contains
    // unencoded leading 0x00. So here we're explicitly encoding those bytes as
    // 0x01.
    encoding.push_back(1);
  }
  return encoding;
}
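
// For illustration: a small namespace id whose EncodeIntToCString encoding is
// only a single byte gets padded by the loop above with two 0x01 bytes, so
// every namespace-id prefix used by MakeFingerprint below is exactly 3 bytes
// long.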

int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
                                       int64_t ttl_ms) {
  if (ttl_ms == 0) {
    // Special case where a TTL of 0 indicates the document should never
    // expire. int64_t max, interpreted as seconds since epoch, represents
    // some point in the year 292,277,026,596. So we're probably ok to use
    // this as "never reaching this point".
    return std::numeric_limits<int64_t>::max();
  }

  int64_t expiration_timestamp_ms;
  if (__builtin_add_overflow(creation_timestamp_ms, ttl_ms,
                             &expiration_timestamp_ms)) {
    // Overflow detected. Treat overflow as the same behavior of just int64_t
    // max
    return std::numeric_limits<int64_t>::max();
  }

  return expiration_timestamp_ms;
}
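
// Example (illustrative values): a document created at t = 1000 ms with a TTL
// of 500 ms expires at t = 1500 ms; a TTL of 0, or any creation + TTL sum that
// would overflow int64_t, yields std::numeric_limits<int64_t>::max(), i.e. the
// document never expires.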

InitializeStatsProto::RecoveryCause GetRecoveryCause(
    const DocumentLogCreator::CreateResult& create_result,
    bool force_recovery_and_revalidate_documents) {
  if (force_recovery_and_revalidate_documents) {
    return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
  } else if (create_result.log_create_result.has_data_loss()) {
    return InitializeStatsProto::DATA_LOSS;
  } else if (create_result.preexisting_file_version !=
             DocumentLogCreator::kCurrentVersion) {
    return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
  }
  return InitializeStatsProto::NONE;
}

InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
    DataLoss data_loss) {
  switch (data_loss) {
    case DataLoss::PARTIAL:
      return InitializeStatsProto::PARTIAL_LOSS;
    case DataLoss::COMPLETE:
      return InitializeStatsProto::COMPLETE_LOSS;
    case DataLoss::NONE:
      return InitializeStatsProto::NO_DATA_LOSS;
  }
}

std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
    const KeyMapper<NamespaceId>* key_mapper) {
  std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;

  std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
      key_mapper->GetIterator();
  while (itr->Advance()) {
    namespace_ids_to_namespaces.insert(
        {itr->GetValue(), std::string(itr->GetKey())});
  }
  return namespace_ids_to_namespaces;
}

}  // namespace

std::string DocumentStore::MakeFingerprint(
    NamespaceId namespace_id, std::string_view namespace_,
    std::string_view uri_or_schema) const {
  if (!namespace_id_fingerprint_) {
    // Using a 64-bit fingerprint to represent the key could lead to
    // collisions. But, even with 200K unique keys, the probability of
    // collision is about one-in-a-billion
    // (https://en.wikipedia.org/wiki/Birthday_attack).
    uint64_t fprint = tc3farmhash::Fingerprint64(
        absl_ports::StrCat(namespace_, uri_or_schema));
    return fingerprint_util::GetFingerprintString(fprint);
  } else {
    return absl_ports::StrCat(EncodeNamespaceId(namespace_id),
                              encode_util::EncodeIntToCString(
                                  tc3farmhash::Fingerprint64(uri_or_schema)));
  }
}
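
// Note: the two branches above produce different key formats. Without
// namespace-id fingerprints, the key is a single 64-bit farmhash of
// namespace + uri_or_schema rendered as a fingerprint string. With
// namespace-id fingerprints, the key is the fixed 3-byte encoded namespace id
// followed by an encoded 64-bit fingerprint of uri_or_schema alone.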

DocumentStore::DocumentStore(const Filesystem* filesystem,
                             const std::string_view base_dir,
                             const Clock* clock,
                             const SchemaStore* schema_store,
                             bool namespace_id_fingerprint,
                             int32_t compression_level)
    : filesystem_(filesystem),
      base_dir_(base_dir),
      clock_(*clock),
      schema_store_(schema_store),
      document_validator_(schema_store),
      namespace_id_fingerprint_(namespace_id_fingerprint),
      compression_level_(compression_level) {}

libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
    const DocumentProto& document, int32_t num_tokens,
    PutDocumentStatsProto* put_document_stats) {
  return Put(DocumentProto(document), num_tokens, put_document_stats);
}

libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
    DocumentProto&& document, int32_t num_tokens,
    PutDocumentStatsProto* put_document_stats) {
  document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
  return InternalPut(std::move(document), put_document_stats);
}

DocumentStore::~DocumentStore() {
  if (initialized_) {
    if (!PersistToDisk(PersistType::FULL).ok()) {
      ICING_LOG(ERROR)
          << "Error persisting to disk in DocumentStore destructor";
    }
  }
}

libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
    const Filesystem* filesystem, const std::string& base_dir,
    const Clock* clock, const SchemaStore* schema_store,
    bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint,
    int32_t compression_level, InitializeStatsProto* initialize_stats) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);
  ICING_RETURN_ERROR_IF_NULL(clock);
  ICING_RETURN_ERROR_IF_NULL(schema_store);

  auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
      filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
      compression_level));
  ICING_ASSIGN_OR_RETURN(
      DataLoss data_loss,
      document_store->Initialize(force_recovery_and_revalidate_documents,
                                 initialize_stats));

  CreateResult create_result;
  create_result.document_store = std::move(document_store);
  create_result.data_loss = data_loss;
  return create_result;
}
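
// Illustrative usage of the factory above; the concrete argument values
// (directory, compression level) are hypothetical and not taken from this
// file:
//
//   auto create_result_or = DocumentStore::Create(
//       &filesystem, base_dir, &clock, schema_store,
//       /*force_recovery_and_revalidate_documents=*/false,
//       /*namespace_id_fingerprint=*/false, /*compression_level=*/3,
//       /*initialize_stats=*/nullptr);
//   if (create_result_or.ok()) {
//     std::unique_ptr<DocumentStore> store =
//         std::move(create_result_or.ValueOrDie().document_store);
//   }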

/* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
    const Filesystem* filesystem, const std::string& base_dir) {
  // Header
  const std::string header_filename = MakeHeaderFilename(base_dir);
  if (!filesystem->DeleteFile(MakeHeaderFilename(base_dir).c_str())) {
    return absl_ports::InternalError("Couldn't delete header file");
  }

  // Document key mapper
  ICING_RETURN_IF_ERROR(
      DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));

  // Document id mapper
  ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
      *filesystem, MakeDocumentIdMapperFilename(base_dir)));

  // Document associated score cache
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
      *filesystem, MakeScoreCacheFilename(base_dir)));

  // Filter cache
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
      *filesystem, MakeFilterCacheFilename(base_dir)));

  // Namespace mapper
  ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
      *filesystem, MakeNamespaceMapperFilename(base_dir)));

  // Corpus mapper
  ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
      *filesystem, MakeCorpusMapperFilename(base_dir)));

  // Corpus associated score cache
  ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
      *filesystem, MakeCorpusScoreCache(base_dir)));

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
    bool force_recovery_and_revalidate_documents,
    InitializeStatsProto* initialize_stats) {
  auto create_result_or =
      DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  if (!create_result_or.ok()) {
    ICING_LOG(ERROR) << create_result_or.status().error_message()
                     << "\nFailed to initialize DocumentLog.";
    return create_result_or.status();
  }
  DocumentLogCreator::CreateResult create_result =
      std::move(create_result_or).ValueOrDie();

  document_log_ = std::move(create_result.log_create_result.proto_log);
  InitializeStatsProto::RecoveryCause recovery_cause =
      GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);

  if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
    ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
                    << recovery_cause << ", and create result { new_file="
                    << create_result.new_file << ", preexisting_file_version="
                    << create_result.preexisting_file_version << ", data_loss="
                    << create_result.log_create_result.data_loss
                    << "} and kCurrentVersion="
                    << DocumentLogCreator::kCurrentVersion;
    // We can't rely on any existing derived files. Recreate them from scratch.
    // Currently happens if:
    // 1) This is a new log and we don't have derived files yet
    // 2) Client wanted us to force a regeneration.
    // 3) Log has some data loss, can't rely on existing derived data.
    std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
    libtextclassifier3::Status status =
        RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
    if (initialize_stats != nullptr &&
        recovery_cause != InitializeStatsProto::NONE) {
      // Only consider it a recovery if the client forced a recovery or there
      // was data loss. Otherwise, this could just be the first time we're
      // initializing and generating derived files.
      initialize_stats->set_document_store_recovery_latency_ms(
          document_recovery_timer->GetElapsedMilliseconds());
      initialize_stats->set_document_store_recovery_cause(recovery_cause);
      initialize_stats->set_document_store_data_status(
          GetDataStatus(create_result.log_create_result.data_loss));
    }
    if (!status.ok()) {
      ICING_LOG(ERROR)
          << "Failed to regenerate derived files for DocumentStore";
      return status;
    }
  } else {
    if (!InitializeExistingDerivedFiles().ok()) {
      ICING_LOG(WARNING)
          << "Couldn't find derived files or failed to initialize them, "
             "regenerating derived files for DocumentStore.";
      std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
      libtextclassifier3::Status status = RegenerateDerivedFiles(
          /*force_recovery_and_revalidate_documents=*/false);
      if (initialize_stats != nullptr) {
        initialize_stats->set_document_store_recovery_cause(
            InitializeStatsProto::IO_ERROR);
        initialize_stats->set_document_store_recovery_latency_ms(
            document_recovery_timer->GetElapsedMilliseconds());
      }
      if (!status.ok()) {
        ICING_LOG(ERROR)
            << "Failed to regenerate derived files for DocumentStore";
        return status;
      }
    }
  }

  initialized_ = true;
  if (initialize_stats != nullptr) {
    initialize_stats->set_num_documents(document_id_mapper_->num_elements());
  }

  return create_result.log_create_result.data_loss;
}

libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
  if (!HeaderExists()) {
    // Without a header, we don't know if things are consistent between each
    // other so the caller should just regenerate everything from ground
    // truth.
    return absl_ports::InternalError("DocumentStore header doesn't exist");
  }

  DocumentStore::Header header;
  if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
                         sizeof(header))) {
    return absl_ports::InternalError(
        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
  }

  if (header.magic !=
      DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) {
    return absl_ports::InternalError(absl_ports::StrCat(
        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
  }

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_key_mapper_or = DynamicTrieKeyMapper<
      DocumentId,
      fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_,
                                                            base_dir_,
                                                            kUriMapperMaxSize);
  if (!document_key_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
                     << "Failed to initialize KeyMapper";
    return document_key_mapper_or.status();
  }
  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
  if (!document_id_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
                     << "Failed to initialize DocumentIdMapper";
    return document_id_mapper_or.status();
  }
  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(
      namespace_mapper_,
      DynamicTrieKeyMapper<NamespaceId>::Create(
          *filesystem_, MakeNamespaceMapperFilename(base_dir_),
          kNamespaceMapperMaxSize));

  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  auto corpus_mapper_or =
      DynamicTrieKeyMapper<CorpusId,
                           fingerprint_util::FingerprintStringFormatter>::
          Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
                 kCorpusMapperMaxSize);
  if (!corpus_mapper_or.ok()) {
    return std::move(corpus_mapper_or).status();
  }
  corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  // Ensure the usage store is the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  if (checksum.Get() != header.checksum) {
    return absl_ports::InternalError(
        "Combined checksum of DocStore was inconsistent");
  }

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
    bool revalidate_documents) {
  ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
  ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
  ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
  ICING_RETURN_IF_ERROR(ResetFilterCache());
  ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
  ICING_RETURN_IF_ERROR(ResetCorpusMapper());
  ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());

  // Creates a new UsageStore instance. Note that we don't reset the data in
  // usage store here because we're not able to regenerate the usage scores.
  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  // Iterates through document log
  auto iterator = document_log_->GetIterator();
  auto iterator_status = iterator.Advance();
  libtextclassifier3::StatusOr<int64_t> element_size =
      document_log_->GetElementsFileSize();
  libtextclassifier3::StatusOr<int64_t> disk_usage =
      document_log_->GetDiskUsage();
  if (element_size.ok() && disk_usage.ok()) {
    ICING_VLOG(1) << "Starting recovery of document store. Document store "
                     "elements file size:"
                  << element_size.ValueOrDie()
                  << ", disk usage=" << disk_usage.ValueOrDie();
  }
  while (iterator_status.ok()) {
    ICING_VLOG(2) << "Attempting to read document at offset="
                  << iterator.GetOffset();
    libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
        document_log_->ReadProto(iterator.GetOffset());

    if (absl_ports::IsNotFound(document_wrapper_or.status())) {
      // The erased document still occupies 1 document id.
      DocumentId new_document_id = document_id_mapper_->num_elements();
      ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
      iterator_status = iterator.Advance();
      continue;
    } else if (!document_wrapper_or.ok()) {
      return document_wrapper_or.status();
    }

    DocumentWrapper document_wrapper =
        std::move(document_wrapper_or).ValueOrDie();
    // Revalidate that this document is still compatible if requested.
    if (revalidate_documents) {
      if (!document_validator_.Validate(document_wrapper.document()).ok()) {
        // Document is no longer valid with the current schema. Mark as
        // deleted
        DocumentId new_document_id = document_id_mapper_->num_elements();
        ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
        ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
        continue;
      }
    }

    ICING_ASSIGN_OR_RETURN(
        NamespaceId namespace_id,
        namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
                                    namespace_mapper_->num_keys()));

    // Updates key mapper and document_id mapper with the new document
    DocumentId new_document_id = document_id_mapper_->num_elements();
    ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
        MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
                        document_wrapper.document().uri()),
        new_document_id));
    ICING_RETURN_IF_ERROR(
        document_id_mapper_->Set(new_document_id, iterator.GetOffset()));

    SchemaTypeId schema_type_id;
    auto schema_type_id_or =
        schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
    if (absl_ports::IsNotFound(schema_type_id_or.status())) {
      // Didn't find a SchemaTypeId. This means that the DocumentStore and
      // the SchemaStore are out of sync. But DocumentStore can't do
      // anything about it so just ignore this for now. This should be
      // detected/handled by the owner of DocumentStore. Set it to some
      // arbitrary invalid value for now, it'll get updated to the correct
      // ID later.
      schema_type_id = -1;
    } else if (!schema_type_id_or.ok()) {
      // Real error. Pass it up
      return schema_type_id_or.status();
    } else {
      // We're guaranteed that SchemaTypeId is valid now
      schema_type_id = schema_type_id_or.ValueOrDie();
    }

    // Update corpus maps
    std::string corpus =
        MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
                        document_wrapper.document().schema());
    ICING_ASSIGN_OR_RETURN(
        CorpusId corpusId,
        corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));

    ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                           GetCorpusAssociatedScoreDataToUpdate(corpusId));
    scoring_data.AddDocument(
        document_wrapper.document().internal_fields().length_in_tokens());

    ICING_RETURN_IF_ERROR(
        UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));

    ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
        new_document_id,
        DocumentAssociatedScoreData(
            corpusId, document_wrapper.document().score(),
            document_wrapper.document().creation_timestamp_ms(),
            document_wrapper.document().internal_fields().length_in_tokens())));

    int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
        document_wrapper.document().creation_timestamp_ms(),
        document_wrapper.document().ttl_ms());

    ICING_RETURN_IF_ERROR(UpdateFilterCache(
        new_document_id, DocumentFilterData(namespace_id, schema_type_id,
                                            expiration_timestamp_ms)));
    iterator_status = iterator.Advance();
  }

  if (!absl_ports::IsOutOfRange(iterator_status)) {
    ICING_LOG(WARNING)
        << "Failed to iterate through proto log while regenerating "
           "derived files";
    return absl_ports::Annotate(iterator_status,
                                "Failed to iterate through proto log.");
  }

  // Shrink usage_store_ to the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  // Write the header
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  document_key_mapper_.reset();
  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
  // that can support error logging.
  libtextclassifier3::Status status =
      DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
  if (!status.ok()) {
    ICING_LOG(ERROR) << status.error_message()
                     << "Failed to delete old key mapper";
    return status;
  }

  // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_key_mapper_or = DynamicTrieKeyMapper<
      DocumentId,
      fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_,
                                                            base_dir_,
                                                            kUriMapperMaxSize);
  if (!document_key_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
                     << "Failed to re-init key mapper";
    return document_key_mapper_or.status();
  }
  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  document_id_mapper_.reset();
  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
  // that can support error logging.
  libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
  if (!status.ok()) {
    ICING_LOG(ERROR) << status.error_message()
                     << "Failed to delete old document_id mapper";
    return status;
  }
  // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
  if (!document_id_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
                     << "Failed to re-init document_id mapper";
    return document_id_mapper_or.status();
  }
  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  score_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
      *filesystem_, MakeScoreCacheFilename(base_dir_)));
  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  corpus_score_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
      *filesystem_, MakeCorpusScoreCache(base_dir_)));
  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetFilterCache() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  filter_cache_.reset();
  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
      *filesystem_, MakeFilterCacheFilename(base_dir_)));
  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  namespace_mapper_.reset();
  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
  // that can support error logging.
  libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
      *filesystem_, MakeNamespaceMapperFilename(base_dir_));
  if (!status.ok()) {
    ICING_LOG(ERROR) << status.error_message()
                     << "Failed to delete old namespace_id mapper";
    return status;
  }
  ICING_ASSIGN_OR_RETURN(
      namespace_mapper_,
      DynamicTrieKeyMapper<NamespaceId>::Create(
          *filesystem_, MakeNamespaceMapperFilename(base_dir_),
          kNamespaceMapperMaxSize));
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
  corpus_mapper_.reset();
  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
  // that can support error logging.
  libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
      *filesystem_, MakeCorpusMapperFilename(base_dir_));
  if (!status.ok()) {
    ICING_LOG(ERROR) << status.error_message()
                     << "Failed to delete old corpus_id mapper";
    return status;
  }
  auto corpus_mapper_or =
      DynamicTrieKeyMapper<CorpusId,
                           fingerprint_util::FingerprintStringFormatter>::
          Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
                 kCorpusMapperMaxSize);
  if (!corpus_mapper_or.ok()) {
    return std::move(corpus_mapper_or).status();
  }
  corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
  Crc32 total_checksum;

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto checksum_or = document_log_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of DocumentLog";
    return checksum_or.status();
  }
  Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = document_key_mapper_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of DocumentKeyMapper";
    return checksum_or.status();
  }
  Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = document_id_mapper_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of DocumentIdMapper";
    return checksum_or.status();
  }
  Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = score_cache_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of score cache";
    return checksum_or.status();
  }
  Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = filter_cache_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of filter cache";
    return checksum_or.status();
  }
  Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = namespace_mapper_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of namespace mapper";
    return checksum_or.status();
  }
  Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = corpus_mapper_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(ERROR) << checksum_or.status().error_message()
                     << "Failed to compute checksum of corpus mapper";
    return checksum_or.status();
  }
  Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  checksum_or = corpus_score_cache_->ComputeChecksum();
  if (!checksum_or.ok()) {
    ICING_LOG(WARNING) << checksum_or.status().error_message()
                       << "Failed to compute checksum of corpus score cache";
    return checksum_or.status();
  }
  Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();

  // NOTE: We purposely don't include usage_store checksum here because we
  // can't regenerate it from ground truth documents. If it gets corrupted,
  // we'll just clear all usage reports, but we shouldn't throw everything
  // else in the document store out.

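  // The combined checksum below is the CRC32 of the decimal string form of
  // each component checksum, appended in this fixed order; the same order is
  // therefore what the header checksum is verified against at initialization.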
  total_checksum.Append(std::to_string(document_log_checksum.Get()));
  total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
  total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
  total_checksum.Append(std::to_string(score_cache_checksum.Get()));
  total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
  total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
  total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
  total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));

  return total_checksum;
}

bool DocumentStore::HeaderExists() {
  if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
    return false;
  }

  int64_t file_size =
      filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());

  // If it's been truncated to size 0 before, we consider it to be a new file
  return file_size != 0 && file_size != Filesystem::kBadFileSize;
}

libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
  // Write the header
  DocumentStore::Header header;
  header.magic =
      DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_);
  header.checksum = checksum.Get();

  // This should overwrite the header.
  ScopedFd sfd(
      filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
  if (!sfd.is_valid() ||
      !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
      !filesystem_->DataSync(sfd.get())) {
    return absl_ports::InternalError(absl_ports::StrCat(
        "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
  }
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
    DocumentProto&& document, PutDocumentStatsProto* put_document_stats) {
  std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
  ICING_RETURN_IF_ERROR(document_validator_.Validate(document));

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_size(document.ByteSizeLong());
  }

  // Copy fields needed before they are moved
  std::string name_space = document.namespace_();
  std::string uri = document.uri();
  std::string schema = document.schema();
  int document_score = document.score();
  int32_t length_in_tokens = document.internal_fields().length_in_tokens();
  int64_t creation_timestamp_ms = document.creation_timestamp_ms();

  // Sets the creation timestamp if caller hasn't specified.
  if (document.creation_timestamp_ms() == 0) {
    creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
    document.set_creation_timestamp_ms(creation_timestamp_ms);
  }

  int64_t expiration_timestamp_ms =
      CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());

  // Update ground truth first
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto offset_or =
      document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
  if (!offset_or.ok()) {
    ICING_LOG(ERROR) << offset_or.status().error_message()
                     << "Failed to write document";
    return offset_or.status();
  }
  int64_t file_offset = std::move(offset_or).ValueOrDie();

  // Get existing document id
  auto old_document_id_or = GetDocumentId(name_space, uri);
  if (!old_document_id_or.ok() &&
      !absl_ports::IsNotFound(old_document_id_or.status())) {
    return absl_ports::InternalError("Failed to read from key mapper");
  }

  // Creates a new document id, updates key mapper and document_id mapper
  DocumentId new_document_id = document_id_mapper_->num_elements();
  if (!IsDocumentIdValid(new_document_id)) {
    return absl_ports::ResourceExhaustedError(
        "Exceeded maximum number of documents. Try calling Optimize to reclaim "
        "some space.");
  }

  // Update namespace maps
  ICING_ASSIGN_OR_RETURN(
      NamespaceId namespace_id,
      namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));

  // Updates key mapper and document_id mapper
  ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
      MakeFingerprint(namespace_id, name_space, uri), new_document_id));
  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));

  // Update corpus maps
  ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
                         corpus_mapper_->GetOrPut(
                             MakeFingerprint(namespace_id, name_space, schema),
                             corpus_mapper_->num_keys()));

  ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                         GetCorpusAssociatedScoreDataToUpdate(corpusId));
  scoring_data.AddDocument(length_in_tokens);

  ICING_RETURN_IF_ERROR(
      UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));

  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
      new_document_id,
      DocumentAssociatedScoreData(corpusId, document_score,
                                  creation_timestamp_ms, length_in_tokens)));

  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                         schema_store_->GetSchemaTypeId(schema));

  ICING_RETURN_IF_ERROR(UpdateFilterCache(
      new_document_id, DocumentFilterData(namespace_id, schema_type_id,
                                          expiration_timestamp_ms)));

  if (old_document_id_or.ok()) {
    // The old document exists, copy over the usage scores and delete the old
    // document.
    DocumentId old_document_id = old_document_id_or.ValueOrDie();

    ICING_RETURN_IF_ERROR(
        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
                                       /*to_document_id=*/new_document_id));

    // Delete the old document. It's fine if it's not found since it might have
    // been deleted previously.
    auto delete_status =
        Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
      // Real error, pass it up.
      return delete_status;
    }
  }

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_store_latency_ms(
        put_timer->GetElapsedMilliseconds());
  }

  return new_document_id;
}

libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
    const std::string_view name_space, const std::string_view uri,
    bool clear_internal_fields) const {
  // TODO(b/147231617): Make a better way to replace the error message in an
  // existing Status.
  auto document_id_or = GetDocumentId(name_space, uri);
  if (absl_ports::IsNotFound(document_id_or.status())) {
    ICING_VLOG(1) << document_id_or.status().error_message();
    return libtextclassifier3::Status(
        document_id_or.status().CanonicalCode(),
        IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
                                      name_space.data(), uri.data()));
  }
  DocumentId document_id = document_id_or.ValueOrDie();

  // TODO(b/147231617): Make a better way to replace the error message in an
  // existing Status.
  auto status_or = Get(document_id);
  if (absl_ports::IsNotFound(status_or.status())) {
    ICING_LOG(ERROR) << document_id_or.status().error_message();
    return libtextclassifier3::Status(
        status_or.status().CanonicalCode(),
        IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
                                      name_space.data(), uri.data()));
  }
  return status_or;
}

libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
    DocumentId document_id, bool clear_internal_fields) const {
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  auto document_filter_data_optional_ =
      GetAliveDocumentFilterData(document_id, current_time_ms);
  if (!document_filter_data_optional_) {
    // The document doesn't exist. Return InvalidArgumentError if the document
    // id itself is invalid; otherwise return a NOT_FOUND error.
    if (!IsDocumentIdValid(document_id)) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "Document id '%d' invalid.", document_id));
    }
    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
        "Document id '%d' doesn't exist", document_id));
  }

  auto document_log_offset_or = document_id_mapper_->Get(document_id);
  if (!document_log_offset_or.ok()) {
    // Since we've just checked that our document_id is valid a few lines
    // above, there's no reason this should fail and an error should never
    // happen.
    return absl_ports::InternalError("Failed to find document offset.");
  }
  int64_t document_log_offset = *document_log_offset_or.ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
  if (!document_wrapper_or.ok()) {
    ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
                     << "Failed to read from document log";
    return document_wrapper_or.status();
  }
  DocumentWrapper document_wrapper =
      std::move(document_wrapper_or).ValueOrDie();
  if (clear_internal_fields) {
    document_wrapper.mutable_document()->clear_internal_fields();
  }

  return std::move(*document_wrapper.mutable_document());
}

libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
    const std::string_view name_space, const std::string_view uri) const {
  auto namespace_id_or = namespace_mapper_->Get(name_space);
  libtextclassifier3::Status status = namespace_id_or.status();
  if (status.ok()) {
    NamespaceId namespace_id = namespace_id_or.ValueOrDie();
    auto document_id_or = document_key_mapper_->Get(
        MakeFingerprint(namespace_id, name_space, uri));
    status = document_id_or.status();
    if (status.ok()) {
      // Guaranteed to have a DocumentId now
      return document_id_or.ValueOrDie();
    }
  }
  return absl_ports::Annotate(
      status, absl_ports::StrCat(
                  "Failed to find DocumentId by key: ", name_space, ", ", uri));
}

std::vector<std::string> DocumentStore::GetAllNamespaces() const {
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());

  std::unordered_set<NamespaceId> existing_namespace_ids;
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
       ++document_id) {
    // filter_cache_->Get can only fail if document_id is < 0
    // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
    auto status_or_data = filter_cache_->Get(document_id);
    if (!status_or_data.ok()) {
      ICING_LOG(ERROR)
          << "Error while iterating over filter cache in GetAllNamespaces";
      return std::vector<std::string>();
    }
    const DocumentFilterData* data = status_or_data.ValueOrDie();

    if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
      existing_namespace_ids.insert(data->namespace_id());
    }
  }

  std::vector<std::string> existing_namespaces;
  for (auto itr = existing_namespace_ids.begin();
       itr != existing_namespace_ids.end(); ++itr) {
    existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
  }
  return existing_namespaces;
}

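// A document is considered "alive" if it has not been marked deleted in the
// document id mapper and its expiration timestamp has not yet passed.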
std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
    DocumentId document_id, int64_t current_time_ms) const {
  if (IsDeleted(document_id)) {
    return std::nullopt;
  }
  return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
}

bool DocumentStore::IsDeleted(DocumentId document_id) const {
  auto file_offset_or = document_id_mapper_->Get(document_id);
  if (!file_offset_or.ok()) {
    // This would only happen if document_id is out of range of the
    // document_id_mapper, meaning we got some invalid document_id. Callers
    // should already have checked that their document_id is valid or used
    // DoesDocumentExist(WithStatus). Regardless, return true since the
    // document doesn't exist.
    return true;
  }
  int64_t file_offset = *file_offset_or.ValueOrDie();
  return file_offset == kDocDeletedFlag;
}

// Returns DocumentFilterData if the document is not expired. Otherwise,
// std::nullopt.
std::optional<DocumentFilterData>
DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
                                               int64_t current_time_ms) const {
  auto filter_data_or = filter_cache_->GetCopy(document_id);
  if (!filter_data_or.ok()) {
    // This would only happen if document_id is out of range of the
    // filter_cache, meaning we got some invalid document_id. Callers should
    // already have checked that their document_id is valid or used
1183 // document doesn't exist.
1184 return std::nullopt;
1185 }
1186 DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
1187
1188 // Check if it's past the expiration time
1189 if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
1190 return std::nullopt;
1191 }
1192 return document_filter_data;
1193 }
1194
Delete(const std::string_view name_space,const std::string_view uri,int64_t current_time_ms)1195 libtextclassifier3::Status DocumentStore::Delete(
1196 const std::string_view name_space, const std::string_view uri,
1197 int64_t current_time_ms) {
1198 // Try to get the DocumentId first
1199 auto document_id_or = GetDocumentId(name_space, uri);
1200 if (!document_id_or.ok()) {
1201 return absl_ports::Annotate(
1202 document_id_or.status(),
1203 absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1204 ", uri: ", uri));
1205 }
1206 return Delete(document_id_or.ValueOrDie(), current_time_ms);
1207 }
1208
Delete(DocumentId document_id,int64_t current_time_ms)1209 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1210 int64_t current_time_ms) {
1211 auto document_filter_data_optional_ =
1212 GetAliveDocumentFilterData(document_id, current_time_ms);
1213 if (!document_filter_data_optional_) {
1214 // The document doesn't exist. We should return InvalidArgumentError if the
1215 // document id is invalid. Otherwise we should return NOT_FOUND error.
1216 if (!IsDocumentIdValid(document_id)) {
1217 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1218 "Document id '%d' invalid.", document_id));
1219 }
1220 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1221 "Document id '%d' doesn't exist", document_id));
1222 }
1223
1224 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1225 if (!document_log_offset_or.ok()) {
1226 return absl_ports::InternalError("Failed to find document offset.");
1227 }
1228 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1229
1230 // Erases document proto.
1231 ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1232 return ClearDerivedData(document_id);
1233 }

libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
    std::string_view name_space) const {
  return namespace_mapper_->Get(name_space);
}

libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
    const std::string_view name_space, const std::string_view schema) const {
  ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
                         namespace_mapper_->Get(name_space));
  return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
}

libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
    ResultSpecProto::ResultGroupingType result_group_type,
    const std::string_view name_space, const std::string_view schema) const {
  auto namespace_id = GetNamespaceId(name_space);
  auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
  switch (result_group_type) {
    case ResultSpecProto::NONE:
      return absl_ports::InvalidArgumentError(
          "Cannot group by ResultSpecProto::NONE");
    case ResultSpecProto::SCHEMA_TYPE:
      if (schema_type_id.ok()) {
        return schema_type_id.ValueOrDie();
      }
      break;
    case ResultSpecProto::NAMESPACE:
      if (namespace_id.ok()) {
        return namespace_id.ValueOrDie();
      }
      break;
    case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
      if (namespace_id.ok() && schema_type_id.ok()) {
        // TODO(b/258715421): Temporary workaround to get a
        // ResultGroupingEntryId given the Namespace string
        // and Schema string.
        return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
      }
      break;
  }
  return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
}

libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
    ResultSpecProto::ResultGroupingType result_group_type,
    const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
  switch (result_group_type) {
    case ResultSpecProto::NONE:
      return absl_ports::InvalidArgumentError(
          "Cannot group by ResultSpecProto::NONE");
    case ResultSpecProto::SCHEMA_TYPE:
      return schema_type_id;
    case ResultSpecProto::NAMESPACE:
      return namespace_id;
    case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
      // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
      // given the Namespace Id and SchemaType Id.
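      // The entry id packs the NamespaceId into the upper 16 bits and the
      // SchemaTypeId into the lower 16 bits of the returned int32_t, which
      // assumes both ids fit within 16 bits.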
1292 return namespace_id << 16 | schema_type_id;
1293 }
1294 return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1295 }

libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
  auto score_data_or = score_cache_->GetCopy(document_id);
  if (!score_data_or.ok()) {
    ICING_LOG(ERROR) << "Failed to access DocumentId " << document_id
                     << " from score_cache_";
    return absl_ports::NotFoundError(
        std::move(score_data_or).status().error_message());
  }

  DocumentAssociatedScoreData document_associated_score_data =
      std::move(score_data_or).ValueOrDie();
  return document_associated_score_data;
}

libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
  auto score_data_or = corpus_score_cache_->GetCopy(corpus_id);
  if (!score_data_or.ok()) {
    return score_data_or.status();
  }

  CorpusAssociatedScoreData corpus_associated_score_data =
      std::move(score_data_or).ValueOrDie();
  return corpus_associated_score_data;
}

libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
  auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
  if (corpus_scoring_data_or.ok()) {
    return std::move(corpus_scoring_data_or).ValueOrDie();
  }
  CorpusAssociatedScoreData scoring_data;
  // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
  // corpus_score_cache_ for the first time.
  if (corpus_scoring_data_or.status().CanonicalCode() ==
      libtextclassifier3::StatusCode::OUT_OF_RANGE) {
    return scoring_data;
  }
  return corpus_scoring_data_or.status();
}
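
// In other words: an OUT_OF_RANGE lookup is treated as "this corpus has no
// scoring data yet" and a default-constructed CorpusAssociatedScoreData is
// handed back for the caller to populate; any other error is propagated
// unchanged.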

// TODO(b/273826815): Decide on and adopt a consistent pattern for handling
// NOT_FOUND 'errors' returned by our internal classes.
std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
    DocumentId document_id, int64_t current_time_ms) const {
  std::optional<DocumentFilterData> opt =
      GetAliveDocumentFilterData(document_id, current_time_ms);
  if (!opt) {
    return std::nullopt;
  }
  if (document_id >= usage_store_->num_elements()) {
    return std::nullopt;
  }
  auto usage_scores_or = usage_store_->GetUsageScores(document_id);
  if (!usage_scores_or.ok()) {
    ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
                     << usage_scores_or.status().error_message();
    return std::nullopt;
  }
  return std::move(usage_scores_or).ValueOrDie();
}

libtextclassifier3::Status DocumentStore::ReportUsage(
    const UsageReport& usage_report) {
  ICING_ASSIGN_OR_RETURN(DocumentId document_id,
                         GetDocumentId(usage_report.document_namespace(),
                                       usage_report.document_uri()));
  // We can use the internal version here because we got our document_id from
  // our internal data structures. We would have thrown some error if the
  // namespace and/or uri were incorrect.
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
    // Document was probably deleted or expired.
    return absl_ports::NotFoundError(absl_ports::StrCat(
        "Couldn't report usage on a nonexistent document: (namespace: '",
        usage_report.document_namespace(), "', uri: '",
        usage_report.document_uri(), "')"));
  }

  return usage_store_->AddUsageReport(usage_report, document_id);
}
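
// Example (a sketch, not part of the production flow): a caller reporting a
// usage event fills out a UsageReport and hands it to this method, e.g.
//
//   UsageReport report;
//   report.set_document_namespace("namespace1");
//   report.set_document_uri("uri1");
//   report.set_usage_type(UsageReport::USAGE_TYPE1);
//   ICING_RETURN_IF_ERROR(document_store->ReportUsage(report));
//
// Only document_namespace()/document_uri() are read here directly; the
// usage_type setter above is an assumption about the UsageReport proto and the
// exact field name may differ.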

DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
    std::string_view name_space) {
  DeleteByGroupResult result;
  auto namespace_id_or = namespace_mapper_->Get(name_space);
  if (!namespace_id_or.ok()) {
    result.status = absl_ports::Annotate(
        namespace_id_or.status(),
        absl_ports::StrCat("Failed to find namespace: ", name_space));
    return result;
  }
  NamespaceId namespace_id = namespace_id_or.ValueOrDie();
  auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
  if (!num_deleted_or.ok()) {
    result.status = std::move(num_deleted_or).status();
    return result;
  }

  result.num_docs_deleted = num_deleted_or.ValueOrDie();
  if (result.num_docs_deleted <= 0) {
    // Treat the case where no existing documents have this namespace the same
    // as the namespace not existing at all.
    result.status = absl_ports::NotFoundError(
        absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
    return result;
  }

  return result;
}

DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
    std::string_view schema_type) {
  DeleteByGroupResult result;
  auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
  if (!schema_type_id_or.ok()) {
    result.status = absl_ports::Annotate(
        schema_type_id_or.status(),
        absl_ports::StrCat("Failed to find schema type. schema_type: ",
                           schema_type));
    return result;
  }
  SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
  auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
  if (!num_deleted_or.ok()) {
    result.status = std::move(num_deleted_or).status();
    return result;
  }

  result.num_docs_deleted = num_deleted_or.ValueOrDie();
  if (result.num_docs_deleted <= 0) {
    result.status = absl_ports::NotFoundError(absl_ports::StrCat(
        "No documents found with schema type '", schema_type, "'"));
    return result;
  }

  return result;
}

libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
    NamespaceId namespace_id, SchemaTypeId schema_type_id) {
  // Tracks how many existing documents matched the given namespace and/or
  // schema type and were marked as deleted.
  int num_updated_documents = 0;

  // Traverse FilterCache and delete all docs that match namespace_id and
  // schema_type_id.
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
       ++document_id) {
    // filter_cache_->Get can only fail if document_id is < 0
    // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
                           filter_cache_->Get(document_id));

    // Check namespace only when the input namespace id is valid.
    if (namespace_id != kInvalidNamespaceId &&
        (data->namespace_id() == kInvalidNamespaceId ||
         data->namespace_id() != namespace_id)) {
      // The document has already been hard-deleted or isn't from the desired
      // namespace.
      continue;
    }

    // Check schema type only when the input schema type id is valid.
    if (schema_type_id != kInvalidSchemaTypeId &&
        (data->schema_type_id() == kInvalidSchemaTypeId ||
         data->schema_type_id() != schema_type_id)) {
      // The document has already been hard-deleted or doesn't have the
      // desired schema type.
      continue;
    }

    // The document has the desired namespace and schema type; it either
    // exists or has expired.
    libtextclassifier3::Status delete_status =
        Delete(document_id, current_time_ms);
    if (absl_ports::IsNotFound(delete_status)) {
      continue;
    } else if (!delete_status.ok()) {
      // Real error, pass up.
      return delete_status;
    }
    ++num_updated_documents;
  }

  return num_updated_documents;
}
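
// Note: BatchDelete is a linear scan over the filter cache, so a delete-by-
// namespace or delete-by-schema-type costs O(total documents) regardless of
// how many documents actually match. Already-expired documents surface as
// NOT_FOUND from Delete() and are skipped without counting toward the total.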

libtextclassifier3::Status DocumentStore::PersistToDisk(
    PersistType::Code persist_type) {
  if (persist_type == PersistType::LITE) {
    // Only persist the document log.
    return document_log_->PersistToDisk();
  }
  ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
  ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
  ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
  ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
  ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());

  // Update the combined checksum and write to header file.
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));

  return libtextclassifier3::Status::OK;
}
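
// Note: with PersistType::LITE only the document log is flushed; the derived
// files and the header checksum are left untouched. The expectation (an
// assumption based on how the header checksum is used during initialization)
// is that derived data can be regenerated from the log on the next startup if
// the process dies before a FULL persist.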

int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
                          int64_t default_value) {
  return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
}

DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
  DocumentStorageInfoProto storage_info;
  storage_info.set_document_log_size(
      GetValueOrDefault(document_log_->GetDiskUsage(), -1));
  storage_info.set_key_mapper_size(
      GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
  storage_info.set_document_id_mapper_size(
      GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
  storage_info.set_score_cache_size(
      GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
  storage_info.set_filter_cache_size(
      GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
  storage_info.set_namespace_id_mapper_size(
      GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
  storage_info.set_corpus_mapper_size(
      GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
  storage_info.set_corpus_score_cache_size(
      GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
  return storage_info;
}
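
// Each size above falls back to -1 when the underlying GetDiskUsage() call
// fails, so a -1 in DocumentStorageInfoProto means "size unknown" rather than
// "empty".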

DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
    DocumentStorageInfoProto storage_info) const {
  int total_num_alive = 0;
  int total_num_expired = 0;
  int total_num_deleted = 0;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  std::unordered_map<std::string, NamespaceStorageInfoProto>
      namespace_to_storage_info;

  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0;
       document_id < document_id_mapper_->num_elements(); ++document_id) {
    // Check if it's deleted first.
    if (IsDeleted(document_id)) {
      // We don't have the namespace id of hard deleted documents anymore, so
      // we can't add to our namespace storage info.
      ++total_num_deleted;
      continue;
    }

    // At this point, the document is either alive or expired, so we can get
    // namespace info for it.
    auto filter_data_or = filter_cache_->Get(document_id);
    if (!filter_data_or.ok()) {
      ICING_VLOG(1) << "Error trying to get filter data for document store "
                       "storage info counts.";
      continue;
    }
    const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
    auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
    if (itr == namespace_id_to_namespace.end()) {
      ICING_VLOG(1) << "Error trying to find namespace for document store "
                       "storage info counts.";
      continue;
    }
    const std::string& name_space = itr->second;

    // Always set the namespace. If the NamespaceStorageInfoProto didn't exist
    // before, we'll get back a default instance of it.
    NamespaceStorageInfoProto& namespace_storage_info =
        namespace_to_storage_info[name_space];
    namespace_storage_info.set_namespace_(name_space);

    // Get usage scores.
    auto usage_scores_or = usage_store_->GetUsageScores(document_id);
    if (!usage_scores_or.ok()) {
      ICING_VLOG(1) << "Error trying to get usage scores for document store "
                       "storage info counts.";
      continue;
    }
    UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();

    // Update our stats.
    if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
      ++total_num_expired;
      namespace_storage_info.set_num_expired_documents(
          namespace_storage_info.num_expired_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type1(
            namespace_storage_info.num_expired_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type2(
            namespace_storage_info.num_expired_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type3(
            namespace_storage_info.num_expired_documents_usage_type3() + 1);
      }
    } else {
      ++total_num_alive;
      namespace_storage_info.set_num_alive_documents(
          namespace_storage_info.num_alive_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type1(
            namespace_storage_info.num_alive_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type2(
            namespace_storage_info.num_alive_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type3(
            namespace_storage_info.num_alive_documents_usage_type3() + 1);
      }
    }
  }

  for (auto& itr : namespace_to_storage_info) {
    storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
  }
  storage_info.set_num_alive_documents(total_num_alive);
  storage_info.set_num_deleted_documents(total_num_deleted);
  storage_info.set_num_expired_documents(total_num_expired);
  return storage_info;
}

DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
  DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
  int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
  if (directory_size != Filesystem::kBadFileSize) {
    storage_info.set_document_store_size(directory_size);
  } else {
    storage_info.set_document_store_size(-1);
  }
  storage_info.set_num_namespaces(namespace_mapper_->num_keys());
  return CalculateDocumentStatusCounts(std::move(storage_info));
}

libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
    const SchemaStore* schema_store) {
  // Update all references to the SchemaStore.
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    auto document_or = Get(document_id);
    if (absl_ports::IsNotFound(document_or.status())) {
      // Skip nonexistent documents.
      continue;
    } else if (!document_or.ok()) {
      // Real error, pass up.
      return absl_ports::Annotate(
          document_or.status(),
          IcingStringUtil::StringPrintf(
              "Failed to retrieve Document for DocumentId %d", document_id));
    }

    // Guaranteed to have a document now.
    DocumentProto document = document_or.ValueOrDie();

    // Revalidate that this document is still compatible.
    if (document_validator_.Validate(document).ok()) {
      // Update the SchemaTypeId for this entry.
      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                             schema_store_->GetSchemaTypeId(document.schema()));
      ICING_ASSIGN_OR_RETURN(
          typename FileBackedVector<DocumentFilterData>::MutableView
              doc_filter_data_view,
          filter_cache_->GetMutable(document_id));
      doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
    } else {
      // Document is no longer valid with the new SchemaStore. Mark as
      // deleted.
      auto delete_status =
          Delete(document.namespace_(), document.uri(), current_time_ms);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up.
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
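
// UpdateSchemaStore revalidates every document against the new schema, while
// OptimizedUpdateSchemaStore below uses the SetSchemaResult to only touch
// documents whose schema types were deleted, remapped, or made incompatible
// by the schema change.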

libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
    const SchemaStore* schema_store,
    const SchemaStore::SetSchemaResult& set_schema_result) {
  if (!set_schema_result.success) {
    // No new schema was set; there is no work to be done.
    return libtextclassifier3::Status::OK;
  }

  // Update all references to the SchemaStore.
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      // Skip nonexistent documents.
      continue;
    }

    // Guaranteed that the document exists now.
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));

    bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
                               filter_data->schema_type_id()) != 0;

    // Check if we need to update the FilterCache entry for this document. It
    // may have been assigned a different SchemaTypeId in the new SchemaStore.
    bool update_filter_cache =
        set_schema_result.old_schema_type_ids_changed.count(
            filter_data->schema_type_id()) != 0;

    // Check if we need to revalidate this document because its type is now
    // incompatible.
    bool revalidate_document =
        set_schema_result.schema_types_incompatible_by_id.count(
            filter_data->schema_type_id()) != 0;

    if (update_filter_cache || revalidate_document) {
      ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));

      if (update_filter_cache) {
        ICING_ASSIGN_OR_RETURN(
            SchemaTypeId schema_type_id,
            schema_store_->GetSchemaTypeId(document.schema()));
        ICING_ASSIGN_OR_RETURN(
            typename FileBackedVector<DocumentFilterData>::MutableView
                doc_filter_data_view,
            filter_cache_->GetMutable(document_id));
        doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
      }
      if (revalidate_document) {
        delete_document = !document_validator_.Validate(document).ok();
      }
    }

    if (delete_document) {
      // Document is no longer valid with the new SchemaStore. Mark as deleted.
      auto delete_status = Delete(document_id, current_time_ms);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up.
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}

// TODO(b/121227117): Implement Optimize()
libtextclassifier3::Status DocumentStore::Optimize() {
  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<std::vector<DocumentId>>
DocumentStore::OptimizeInto(const std::string& new_directory,
                            const LanguageSegmenter* lang_segmenter,
                            bool namespace_id_fingerprint,
                            OptimizeStatsProto* stats) {
  // Validates directory.
  if (new_directory == base_dir_) {
    return absl_ports::InvalidArgumentError(
        "New directory is the same as the current one.");
  }

  ICING_ASSIGN_OR_RETURN(
      auto doc_store_create_result,
      DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_,
                            /*force_recovery_and_revalidate_documents=*/false,
                            namespace_id_fingerprint, compression_level_,
                            /*initialize_stats=*/nullptr));
  std::unique_ptr<DocumentStore> new_doc_store =
      std::move(doc_store_create_result.document_store);

  // Writes all valid docs into the new document store (new directory).
  int size = document_id_mapper_->num_elements();
  int num_deleted = 0;
  int num_expired = 0;
  UsageStore::UsageScores default_usage;
  std::vector<DocumentId> document_id_old_to_new(size, kInvalidDocumentId);
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    auto document_or = Get(document_id, /*clear_internal_fields=*/false);
    if (absl_ports::IsNotFound(document_or.status())) {
      if (IsDeleted(document_id)) {
        ++num_deleted;
      } else if (!GetNonExpiredDocumentFilterData(document_id,
                                                  current_time_ms)) {
        ++num_expired;
      }
      continue;
    } else if (!document_or.ok()) {
      // Real error, pass up.
      return absl_ports::Annotate(
          document_or.status(),
          IcingStringUtil::StringPrintf(
              "Failed to retrieve Document for DocumentId %d", document_id));
    }

    // Guaranteed to have a document now.
    DocumentProto document_to_keep = std::move(document_or).ValueOrDie();

    libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
    if (document_to_keep.internal_fields().length_in_tokens() == 0) {
      auto tokenized_document_or = TokenizedDocument::Create(
          schema_store_, lang_segmenter, document_to_keep);
      if (!tokenized_document_or.ok()) {
        return absl_ports::Annotate(
            tokenized_document_or.status(),
            IcingStringUtil::StringPrintf(
                "Failed to tokenize Document for DocumentId %d", document_id));
      }
      TokenizedDocument tokenized_document(
          std::move(tokenized_document_or).ValueOrDie());
      new_document_id_or = new_doc_store->Put(
          std::move(document_to_keep), tokenized_document.num_string_tokens());
    } else {
      // TODO(b/144458732): Implement a more robust version of
      // TC_ASSIGN_OR_RETURN that can support error logging.
      new_document_id_or =
          new_doc_store->InternalPut(std::move(document_to_keep));
    }
    if (!new_document_id_or.ok()) {
      ICING_LOG(ERROR) << "Failed to write into new document store: "
                       << new_document_id_or.status().error_message();
      return new_document_id_or.status();
    }

    document_id_old_to_new[document_id] = new_document_id_or.ValueOrDie();

    // Copy over usage scores, skipping documents whose scores are still the
    // default (no usage). There's no need to possibly allocate storage when
    // there's nothing interesting to store.
    ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
                           usage_store_->GetUsageScores(document_id));
    if (!(usage_scores == default_usage)) {
      DocumentId new_document_id = new_document_id_or.ValueOrDie();
      ICING_RETURN_IF_ERROR(
          new_doc_store->SetUsageScores(new_document_id, usage_scores));
    }
  }
  if (stats != nullptr) {
    stats->set_num_original_documents(size);
    stats->set_num_deleted_documents(num_deleted);
    stats->set_num_expired_documents(num_expired);
  }
  ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
  return document_id_old_to_new;
}
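
// The returned vector is indexed by old DocumentId: entries for deleted or
// expired documents stay kInvalidDocumentId, and every surviving document maps
// to its id in the new store. Callers can use this mapping to translate ids
// held by other components (e.g. the index) after an optimize.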

libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
DocumentStore::GetOptimizeInfo() const {
  OptimizeInfo optimize_info;

  // Figure out our ratio of optimizable/total docs.
  int32_t num_documents = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
       ++document_id) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      ++optimize_info.optimizable_docs;
    }

    ++optimize_info.total_docs;
  }

  if (optimize_info.total_docs == 0) {
    // Can exit early since there's nothing to calculate.
    return optimize_info;
  }

  // Get the total element size.
  //
  // We use file size instead of disk usage here because the files are not
  // sparse, so it's more accurate. Disk usage rounds up to the nearest block
  // size.
  ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
                         document_log_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
                         document_id_mapper_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
                         score_cache_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
                         filter_cache_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
                         corpus_score_cache_->GetElementsFileSize());

  // The usage store might be sparse, but we'll still use file size for more
  // accurate counting.
  ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
                         usage_store_->GetElementsFileSize());

  // We use a combined disk usage and file size for the DynamicTrieKeyMapper
  // because it's backed by a trie, which has some sparse property bitmaps.
  ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
                         document_key_mapper_->GetElementsSize());

  // We don't include the namespace_mapper or the corpus_mapper because it's
  // not clear whether we could recover any space even if Optimize were called.
  // Deleting hundreds of documents could still leave a few documents of a
  // namespace, and then there would be no change.

  int64_t total_size = document_log_file_size + document_key_mapper_size +
                       document_id_mapper_file_size + score_cache_file_size +
                       filter_cache_file_size + corpus_score_cache_file_size +
                       usage_store_file_size;

  optimize_info.estimated_optimizable_bytes =
      total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
  return optimize_info;
}
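
// The estimate is a simple proration: if, say, 250 of 1000 documents are
// optimizable and the summed element files take 4 MiB, then
// estimated_optimizable_bytes is 4 MiB * 250 / 1000 = 1 MiB. It assumes space
// usage is roughly uniform across documents, so it is a heuristic rather than
// an exact reclaimable-bytes figure.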

libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
    CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
  return corpus_score_cache_->Set(corpus_id, score_data);
}

libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
    DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
  return score_cache_->Set(document_id, score_data);
}

libtextclassifier3::Status DocumentStore::UpdateFilterCache(
    DocumentId document_id, const DocumentFilterData& filter_data) {
  return filter_cache_->Set(document_id, filter_data);
}

libtextclassifier3::Status DocumentStore::ClearDerivedData(
    DocumentId document_id) {
  // We intentionally leave the data in key_mapper_ because locating that data
  // requires fetching namespace and uri. Leaving data in key_mapper_ should
  // be fine because the data is hashed.

  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));

  // Resets the score cache entry.
  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
      document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
                                               /*document_score=*/-1,
                                               /*creation_timestamp_ms=*/-1,
                                               /*length_in_tokens=*/0)));

  // Resets the filter cache entry.
  ICING_RETURN_IF_ERROR(UpdateFilterCache(
      document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
                                      /*expiration_timestamp_ms=*/-1)));

  // Clears the usage scores.
  return usage_store_->DeleteUsageScores(document_id);
}
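
// Together with writing kDocDeletedFlag into document_id_mapper_, resetting
// the score and filter cache entries to invalid values is what makes a deleted
// document invisible to scoring and filtering; the underlying log space is
// only reclaimed when the store is rewritten via OptimizeInto().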

libtextclassifier3::Status DocumentStore::SetUsageScores(
    DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
  return usage_store_->SetUsageScores(document_id, usage_scores);
}

libtextclassifier3::StatusOr<
    google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
DocumentStore::CollectCorpusInfo() const {
  google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
  libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
      schema_store_->GetSchema();
  if (!schema_proto_or.ok()) {
    return corpus_info;
  }
  // Maps from CorpusId to the corresponding protocol buffer in the result.
  std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
       ++document_id) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      continue;
    }
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));
    ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
                           score_cache_->Get(document_id));
    const std::string& name_space =
        namespace_id_to_namespace[filter_data->namespace_id()];
    const std::string& schema =
        schema_proto->types()[filter_data->schema_type_id()].schema_type();
    auto iter = info_map.find(score_data->corpus_id());
    if (iter == info_map.end()) {
      DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
      entry->set_namespace_(name_space);
      entry->set_schema(schema);
      iter = info_map.insert({score_data->corpus_id(), entry}).first;
    }
    iter->second->set_total_documents(iter->second->total_documents() + 1);
    iter->second->set_total_token(iter->second->total_token() +
                                  score_data->length_in_tokens());
  }
  return corpus_info;
}

libtextclassifier3::StatusOr<DocumentDebugInfoProto>
DocumentStore::GetDebugInfo(int verbosity) const {
  DocumentDebugInfoProto debug_info;
  *debug_info.mutable_document_storage_info() = GetStorageInfo();
  ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
  debug_info.set_crc(crc.Get());
  if (verbosity > 0) {
    ICING_ASSIGN_OR_RETURN(
        google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
            corpus_info,
        CollectCorpusInfo());
    *debug_info.mutable_corpus_info() = std::move(corpus_info);
  }
  return debug_info;
}

}  // namespace lib
}  // namespace icing