1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/store/document-store.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_map>
24 #include <utility>
25 #include <vector>
26
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/text_classifier/lib3/utils/hash/farmhash.h"
30 #include "icing/absl_ports/annotate.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/file-backed-proto-log.h"
34 #include "icing/file/file-backed-vector.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/memory-mapped-file.h"
37 #include "icing/file/portable-file-backed-proto-log.h"
38 #include "icing/legacy/core/icing-string-util.h"
39 #include "icing/proto/debug.pb.h"
40 #include "icing/proto/document.pb.h"
41 #include "icing/proto/document_wrapper.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/optimize.pb.h"
44 #include "icing/proto/persist.pb.h"
45 #include "icing/proto/schema.pb.h"
46 #include "icing/proto/storage.pb.h"
47 #include "icing/proto/usage.pb.h"
48 #include "icing/schema/schema-store.h"
49 #include "icing/store/corpus-associated-scoring-data.h"
50 #include "icing/store/corpus-id.h"
51 #include "icing/store/document-associated-score-data.h"
52 #include "icing/store/document-filter-data.h"
53 #include "icing/store/document-id.h"
54 #include "icing/store/document-log-creator.h"
55 #include "icing/store/dynamic-trie-key-mapper.h"
56 #include "icing/store/namespace-fingerprint-identifier.h"
57 #include "icing/store/namespace-id.h"
58 #include "icing/store/persistent-hash-map-key-mapper.h"
59 #include "icing/store/usage-store.h"
60 #include "icing/tokenization/language-segmenter.h"
61 #include "icing/util/clock.h"
62 #include "icing/util/crc32.h"
63 #include "icing/util/data-loss.h"
64 #include "icing/util/encode-util.h"
65 #include "icing/util/fingerprint-util.h"
66 #include "icing/util/logging.h"
67 #include "icing/util/status-macros.h"
68 #include "icing/util/tokenized-document.h"
69
70 namespace icing {
71 namespace lib {
72
73 namespace {
74
// Used in DocumentId mapper to mark a document as deleted
constexpr int64_t kDocDeletedFlag = -1;

// Filenames/directory names (relative to the document store's base directory)
// for each derived file managed by DocumentStore.
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
constexpr char kUriHashMapperWorkingPath[] = "uri_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
constexpr char kUsageStoreDirectoryName[] = "usage_store";
constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";

// Determined through manual testing to allow for 4 million uris. 4 million
// because we allow up to 4 million DocumentIds.
constexpr int32_t kUriDynamicTrieKeyMapperMaxSize =
    144 * 1024 * 1024;  // 144 MiB

// Capacity of the persistent-hash-map-backed uri mapper: one entry per
// possible DocumentId.
constexpr int32_t kUriHashKeyMapperMaxNumEntries =
    kMaxDocumentId + 1;  // 1 << 22, 4M
// Average key-value pair size used to size the uri hash mapper:
// - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1
//   byte)
// - Value: DocumentId (4 bytes)
constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId);

// 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
// max of 128 KiB for storage.
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
103
CreateDocumentWrapper(DocumentProto && document)104 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
105 DocumentWrapper document_wrapper;
106 *document_wrapper.mutable_document() = std::move(document);
107 return document_wrapper;
108 }
109
MakeHeaderFilename(const std::string & base_dir)110 std::string MakeHeaderFilename(const std::string& base_dir) {
111 return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
112 }
113
MakeUriHashMapperWorkingPath(const std::string & base_dir)114 std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) {
115 return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath);
116 }
117
MakeDocumentIdMapperFilename(const std::string & base_dir)118 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
119 return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
120 }
121
MakeScoreCacheFilename(const std::string & base_dir)122 std::string MakeScoreCacheFilename(const std::string& base_dir) {
123 return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
124 }
125
MakeCorpusScoreCache(const std::string & base_dir)126 std::string MakeCorpusScoreCache(const std::string& base_dir) {
127 return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
128 }
129
MakeFilterCacheFilename(const std::string & base_dir)130 std::string MakeFilterCacheFilename(const std::string& base_dir) {
131 return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
132 }
133
MakeNamespaceMapperFilename(const std::string & base_dir)134 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
135 return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
136 }
137
MakeUsageStoreDirectoryName(const std::string & base_dir)138 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
139 return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
140 }
141
MakeCorpusMapperFilename(const std::string & base_dir)142 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
143 return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
144 }
145
// Computes the expiration timestamp for a document created at
// `creation_timestamp_ms` with time-to-live `ttl_ms`.
//
// A TTL of 0 means the document never expires; int64_t max, interpreted as
// milliseconds since epoch, is so far in the future that it serves as
// "never". Addition that would overflow is clamped to the same "never" value.
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
                                       int64_t ttl_ms) {
  if (ttl_ms == 0) {
    return std::numeric_limits<int64_t>::max();
  }

  int64_t expiration_ms = 0;
  const bool overflowed =
      __builtin_add_overflow(creation_timestamp_ms, ttl_ms, &expiration_ms);
  return overflowed ? std::numeric_limits<int64_t>::max() : expiration_ms;
}
166
GetRecoveryCause(const DocumentLogCreator::CreateResult & create_result,bool force_recovery_and_revalidate_documents)167 InitializeStatsProto::RecoveryCause GetRecoveryCause(
168 const DocumentLogCreator::CreateResult& create_result,
169 bool force_recovery_and_revalidate_documents) {
170 if (force_recovery_and_revalidate_documents) {
171 return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
172 } else if (create_result.log_create_result.has_data_loss()) {
173 return InitializeStatsProto::DATA_LOSS;
174 } else if (create_result.preexisting_file_version !=
175 DocumentLogCreator::kCurrentVersion) {
176 return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
177 }
178 return InitializeStatsProto::NONE;
179 }
180
GetDataStatus(DataLoss data_loss)181 InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
182 DataLoss data_loss) {
183 switch (data_loss) {
184 case DataLoss::PARTIAL:
185 return InitializeStatsProto::PARTIAL_LOSS;
186 case DataLoss::COMPLETE:
187 return InitializeStatsProto::COMPLETE_LOSS;
188 case DataLoss::NONE:
189 return InitializeStatsProto::NO_DATA_LOSS;
190 }
191 }
192
GetNamespaceIdsToNamespaces(const KeyMapper<NamespaceId> * key_mapper)193 std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
194 const KeyMapper<NamespaceId>* key_mapper) {
195 std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
196
197 std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
198 key_mapper->GetIterator();
199 while (itr->Advance()) {
200 namespace_ids_to_namespaces.insert(
201 {itr->GetValue(), std::string(itr->GetKey())});
202 }
203 return namespace_ids_to_namespaces;
204 }
205
206 libtextclassifier3::StatusOr<std::unique_ptr<
207 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>>
CreateUriMapper(const Filesystem & filesystem,const std::string & base_dir,bool use_persistent_hash_map)208 CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir,
209 bool use_persistent_hash_map) {
210 std::string uri_hash_mapper_working_path =
211 MakeUriHashMapperWorkingPath(base_dir);
212 // Due to historic issue, we use document store's base_dir directly as
213 // DynamicTrieKeyMapper's working directory for uri mapper.
214 // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the
215 // actual files will be put under "<base_dir>/key_mapper_dir/".
216 bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
217 absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str());
218 bool persistent_hash_map_dir_exists =
219 filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str());
220 if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
221 (!use_persistent_hash_map && persistent_hash_map_dir_exists)) {
222 // Return a failure here so that the caller can properly delete and rebuild
223 // this component.
224 return absl_ports::FailedPreconditionError("Key mapper type mismatch");
225 }
226
227 if (use_persistent_hash_map) {
228 return PersistentHashMapKeyMapper<
229 DocumentId, fingerprint_util::FingerprintStringFormatter>::
230 Create(filesystem, std::move(uri_hash_mapper_working_path),
231 /*pre_mapping_fbv=*/false,
232 /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries,
233 /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize);
234 } else {
235 return DynamicTrieKeyMapper<DocumentId,
236 fingerprint_util::FingerprintStringFormatter>::
237 Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize);
238 }
239 }
240
241 } // namespace
242
MakeFingerprint(NamespaceId namespace_id,std::string_view namespace_,std::string_view uri_or_schema) const243 std::string DocumentStore::MakeFingerprint(
244 NamespaceId namespace_id, std::string_view namespace_,
245 std::string_view uri_or_schema) const {
246 if (!namespace_id_fingerprint_) {
247 // Using a 64-bit fingerprint to represent the key could lead to collisions.
248 // But, even with 200K unique keys, the probability of collision is about
249 // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
250 uint64_t fprint = tc3farmhash::Fingerprint64(
251 absl_ports::StrCat(namespace_, uri_or_schema));
252 return fingerprint_util::GetFingerprintString(fprint);
253 } else {
254 return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema)
255 .EncodeToCString();
256 }
257 }
258
// Constructor used by DocumentStore::Create(). Stores pointers/references to
// caller-owned components (filesystem, clock, schema store), which must
// outlive this instance. Note that `clock` is dereferenced here, so it must
// be non-null (Create() validates this before construction).
DocumentStore::DocumentStore(const Filesystem* filesystem,
                             const std::string_view base_dir,
                             const Clock* clock,
                             const SchemaStore* schema_store,
                             bool namespace_id_fingerprint,
                             bool pre_mapping_fbv, bool use_persistent_hash_map,
                             int32_t compression_level)
    : filesystem_(filesystem),
      base_dir_(base_dir),
      clock_(*clock),
      schema_store_(schema_store),
      document_validator_(schema_store),
      namespace_id_fingerprint_(namespace_id_fingerprint),
      pre_mapping_fbv_(pre_mapping_fbv),
      use_persistent_hash_map_(use_persistent_hash_map),
      compression_level_(compression_level) {}
275
Put(const DocumentProto & document,int32_t num_tokens,PutDocumentStatsProto * put_document_stats)276 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
277 const DocumentProto& document, int32_t num_tokens,
278 PutDocumentStatsProto* put_document_stats) {
279 return Put(DocumentProto(document), num_tokens, put_document_stats);
280 }
281
Put(DocumentProto && document,int32_t num_tokens,PutDocumentStatsProto * put_document_stats)282 libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
283 DocumentProto&& document, int32_t num_tokens,
284 PutDocumentStatsProto* put_document_stats) {
285 document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
286 return InternalPut(std::move(document), put_document_stats);
287 }
288
~DocumentStore()289 DocumentStore::~DocumentStore() {
290 if (initialized_) {
291 if (!PersistToDisk(PersistType::FULL).ok()) {
292 ICING_LOG(ERROR)
293 << "Error persisting to disk in DocumentStore destructor";
294 }
295 }
296 }
297
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const SchemaStore * schema_store,bool force_recovery_and_revalidate_documents,bool namespace_id_fingerprint,bool pre_mapping_fbv,bool use_persistent_hash_map,int32_t compression_level,InitializeStatsProto * initialize_stats)298 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
299 const Filesystem* filesystem, const std::string& base_dir,
300 const Clock* clock, const SchemaStore* schema_store,
301 bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint,
302 bool pre_mapping_fbv, bool use_persistent_hash_map,
303 int32_t compression_level, InitializeStatsProto* initialize_stats) {
304 ICING_RETURN_ERROR_IF_NULL(filesystem);
305 ICING_RETURN_ERROR_IF_NULL(clock);
306 ICING_RETURN_ERROR_IF_NULL(schema_store);
307
308 auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
309 filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
310 pre_mapping_fbv, use_persistent_hash_map, compression_level));
311 ICING_ASSIGN_OR_RETURN(
312 InitializeResult initialize_result,
313 document_store->Initialize(force_recovery_and_revalidate_documents,
314 initialize_stats));
315
316 CreateResult create_result;
317 create_result.document_store = std::move(document_store);
318 create_result.data_loss = initialize_result.data_loss;
319 create_result.derived_files_regenerated =
320 initialize_result.derived_files_regenerated;
321 return create_result;
322 }
323
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)324 /* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
325 const Filesystem* filesystem, const std::string& base_dir) {
326 // Header
327 const std::string header_filename = MakeHeaderFilename(base_dir);
328 if (!filesystem->DeleteFile(MakeHeaderFilename(base_dir).c_str())) {
329 return absl_ports::InternalError("Couldn't delete header file");
330 }
331
332 // Document key mapper. Doesn't hurt to delete both dynamic trie and
333 // persistent hash map without checking.
334 ICING_RETURN_IF_ERROR(
335 DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
336 ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
337 *filesystem, MakeUriHashMapperWorkingPath(base_dir)));
338
339 // Document id mapper
340 ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
341 *filesystem, MakeDocumentIdMapperFilename(base_dir)));
342
343 // Document associated score cache
344 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
345 *filesystem, MakeScoreCacheFilename(base_dir)));
346
347 // Filter cache
348 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
349 *filesystem, MakeFilterCacheFilename(base_dir)));
350
351 // Namespace mapper
352 ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
353 *filesystem, MakeNamespaceMapperFilename(base_dir)));
354
355 // Corpus mapper
356 ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
357 *filesystem, MakeCorpusMapperFilename(base_dir)));
358
359 // Corpus associated score cache
360 ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
361 *filesystem, MakeCorpusScoreCache(base_dir)));
362
363 return libtextclassifier3::Status::OK;
364 }
365
366 libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
Initialize(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)367 DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
368 InitializeStatsProto* initialize_stats) {
369 auto create_result_or =
370 DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
371
372 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
373 // that can support error logging.
374 if (!create_result_or.ok()) {
375 ICING_LOG(ERROR) << create_result_or.status().error_message()
376 << "\nFailed to initialize DocumentLog.";
377 return create_result_or.status();
378 }
379 DocumentLogCreator::CreateResult create_result =
380 std::move(create_result_or).ValueOrDie();
381
382 document_log_ = std::move(create_result.log_create_result.proto_log);
383 InitializeStatsProto::RecoveryCause recovery_cause =
384 GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
385
386 bool derived_files_regenerated = false;
387 if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
388 ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
389 << recovery_cause << ", and create result { new_file="
390 << create_result.new_file << ", preeisting_file_version="
391 << create_result.preexisting_file_version << ", data_loss="
392 << create_result.log_create_result.data_loss
393 << "} and kCurrentVersion="
394 << DocumentLogCreator::kCurrentVersion;
395 // We can't rely on any existing derived files. Recreate them from scratch.
396 // Currently happens if:
397 // 1) This is a new log and we don't have derived files yet
398 // 2) Client wanted us to force a regeneration.
399 // 3) Log has some data loss, can't rely on existing derived data.
400 std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
401 libtextclassifier3::Status status =
402 RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
403 if (recovery_cause != InitializeStatsProto::NONE) {
404 // Only consider it a recovery if the client forced a recovery or there
405 // was data loss. Otherwise, this could just be the first time we're
406 // initializing and generating derived files.
407 derived_files_regenerated = true;
408 if (initialize_stats != nullptr) {
409 initialize_stats->set_document_store_recovery_latency_ms(
410 document_recovery_timer->GetElapsedMilliseconds());
411 initialize_stats->set_document_store_recovery_cause(recovery_cause);
412 initialize_stats->set_document_store_data_status(
413 GetDataStatus(create_result.log_create_result.data_loss));
414 }
415 }
416 if (!status.ok()) {
417 ICING_LOG(ERROR)
418 << "Failed to regenerate derived files for DocumentStore";
419 return status;
420 }
421 } else {
422 if (!InitializeExistingDerivedFiles().ok()) {
423 ICING_LOG(WARNING)
424 << "Couldn't find derived files or failed to initialize them, "
425 "regenerating derived files for DocumentStore.";
426 std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
427 derived_files_regenerated = true;
428 libtextclassifier3::Status status = RegenerateDerivedFiles(
429 /*force_recovery_and_revalidate_documents=*/false);
430 if (initialize_stats != nullptr) {
431 initialize_stats->set_document_store_recovery_cause(
432 InitializeStatsProto::IO_ERROR);
433 initialize_stats->set_document_store_recovery_latency_ms(
434 document_recovery_timer->GetElapsedMilliseconds());
435 }
436 if (!status.ok()) {
437 ICING_LOG(ERROR)
438 << "Failed to regenerate derived files for DocumentStore";
439 return status;
440 }
441 }
442 }
443
444 initialized_ = true;
445 if (initialize_stats != nullptr) {
446 initialize_stats->set_num_documents(document_id_mapper_->num_elements());
447 }
448
449 InitializeResult initialize_result = {
450 .data_loss = create_result.log_create_result.data_loss,
451 .derived_files_regenerated = derived_files_regenerated};
452 return initialize_result;
453 }
454
// Loads every derived file from disk and verifies overall consistency against
// the header's combined checksum. Returns an INTERNAL error if anything is
// missing, mismatched, or inconsistent; the caller (Initialize) falls back to
// RegenerateDerivedFiles() in that case.
libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
  if (!HeaderExists()) {
    // Without a header, we don't know if things are consistent between each
    // other so the caller should just regenerate everything from ground
    // truth.
    return absl_ports::InternalError("DocumentStore header doesn't exist");
  }

  DocumentStore::Header header;
  if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
                         sizeof(header))) {
    return absl_ports::InternalError(
        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
  }

  // The expected magic depends on the namespace_id_fingerprint_ flag, so a
  // mismatch also catches derived files written under a different fingerprint
  // configuration.
  if (header.magic !=
      DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) {
    return absl_ports::InternalError(absl_ports::StrCat(
        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
  }

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_key_mapper_or =
      CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
  if (!document_key_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
                     << "Failed to initialize KeyMapper";
    return document_key_mapper_or.status();
  }
  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();

  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
  if (!document_id_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
                     << "Failed to initialize DocumentIdMapper";
    return document_id_mapper_or.status();
  }
  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(
      namespace_mapper_,
      DynamicTrieKeyMapper<NamespaceId>::Create(
          *filesystem_, MakeNamespaceMapperFilename(base_dir_),
          kNamespaceMapperMaxSize));

  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  auto corpus_mapper_or =
      DynamicTrieKeyMapper<CorpusId,
                           fingerprint_util::FingerprintStringFormatter>::
          Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
                 kCorpusMapperMaxSize);
  if (!corpus_mapper_or.ok()) {
    return std::move(corpus_mapper_or).status();
  }
  corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  // Ensure the usage store is the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  // Final cross-component consistency check against the persisted header.
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
  if (checksum.Get() != header.checksum) {
    return absl_ports::InternalError(
        "Combined checksum of DocStore was inconsistent");
  }

  return libtextclassifier3::Status::OK;
}
546
RegenerateDerivedFiles(bool revalidate_documents)547 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
548 bool revalidate_documents) {
549 ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
550 ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
551 ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
552 ICING_RETURN_IF_ERROR(ResetFilterCache());
553 ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
554 ICING_RETURN_IF_ERROR(ResetCorpusMapper());
555 ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
556
557 // Creates a new UsageStore instance. Note that we don't reset the data in
558 // usage store here because we're not able to regenerate the usage scores.
559 ICING_ASSIGN_OR_RETURN(
560 usage_store_,
561 UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
562
563 // Iterates through document log
564 auto iterator = document_log_->GetIterator();
565 auto iterator_status = iterator.Advance();
566 libtextclassifier3::StatusOr<int64_t> element_size =
567 document_log_->GetElementsFileSize();
568 libtextclassifier3::StatusOr<int64_t> disk_usage =
569 document_log_->GetDiskUsage();
570 if (element_size.ok() && disk_usage.ok()) {
571 ICING_VLOG(1) << "Starting recovery of document store. Document store "
572 "elements file size:"
573 << element_size.ValueOrDie()
574 << ", disk usage=" << disk_usage.ValueOrDie();
575 }
576 while (iterator_status.ok()) {
577 ICING_VLOG(2) << "Attempting to read document at offset="
578 << iterator.GetOffset();
579 libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
580 document_log_->ReadProto(iterator.GetOffset());
581
582 if (absl_ports::IsNotFound(document_wrapper_or.status())) {
583 // The erased document still occupies 1 document id.
584 DocumentId new_document_id = document_id_mapper_->num_elements();
585 ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
586 iterator_status = iterator.Advance();
587 continue;
588 } else if (!document_wrapper_or.ok()) {
589 return document_wrapper_or.status();
590 }
591
592 DocumentWrapper document_wrapper =
593 std::move(document_wrapper_or).ValueOrDie();
594 // Revalidate that this document is still compatible if requested.
595 if (revalidate_documents) {
596 if (!document_validator_.Validate(document_wrapper.document()).ok()) {
597 // Document is no longer valid with the current schema. Mark as
598 // deleted
599 DocumentId new_document_id = document_id_mapper_->num_elements();
600 ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
601 ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
602 continue;
603 }
604 }
605
606 ICING_ASSIGN_OR_RETURN(
607 NamespaceId namespace_id,
608 namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
609 namespace_mapper_->num_keys()));
610
611 // Updates key mapper and document_id mapper with the new document
612 DocumentId new_document_id = document_id_mapper_->num_elements();
613 ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
614 MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
615 document_wrapper.document().uri()),
616 new_document_id));
617 ICING_RETURN_IF_ERROR(
618 document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
619
620 SchemaTypeId schema_type_id;
621 auto schema_type_id_or =
622 schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
623 if (absl_ports::IsNotFound(schema_type_id_or.status())) {
624 // Didn't find a SchemaTypeId. This means that the DocumentStore and
625 // the SchemaStore are out of sync. But DocumentStore can't do
626 // anything about it so just ignore this for now. This should be
627 // detected/handled by the owner of DocumentStore. Set it to some
628 // arbitrary invalid value for now, it'll get updated to the correct
629 // ID later.
630 schema_type_id = -1;
631 } else if (!schema_type_id_or.ok()) {
632 // Real error. Pass it up
633 return schema_type_id_or.status();
634 } else {
635 // We're guaranteed that SchemaTypeId is valid now
636 schema_type_id = schema_type_id_or.ValueOrDie();
637 }
638
639 // Update corpus maps
640 std::string corpus =
641 MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
642 document_wrapper.document().schema());
643 ICING_ASSIGN_OR_RETURN(
644 CorpusId corpusId,
645 corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
646
647 ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
648 GetCorpusAssociatedScoreDataToUpdate(corpusId));
649 scoring_data.AddDocument(
650 document_wrapper.document().internal_fields().length_in_tokens());
651
652 ICING_RETURN_IF_ERROR(
653 UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
654
655 ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
656 new_document_id,
657 DocumentAssociatedScoreData(
658 corpusId, document_wrapper.document().score(),
659 document_wrapper.document().creation_timestamp_ms(),
660 document_wrapper.document().internal_fields().length_in_tokens())));
661
662 int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
663 document_wrapper.document().creation_timestamp_ms(),
664 document_wrapper.document().ttl_ms());
665
666 ICING_RETURN_IF_ERROR(UpdateFilterCache(
667 new_document_id, DocumentFilterData(namespace_id, schema_type_id,
668 expiration_timestamp_ms)));
669 iterator_status = iterator.Advance();
670 }
671
672 if (!absl_ports::IsOutOfRange(iterator_status)) {
673 ICING_LOG(WARNING)
674 << "Failed to iterate through proto log while regenerating "
675 "derived files";
676 return absl_ports::Annotate(iterator_status,
677 "Failed to iterate through proto log.");
678 }
679
680 // Shrink usage_store_ to the correct size.
681 ICING_RETURN_IF_ERROR(
682 usage_store_->TruncateTo(document_id_mapper_->num_elements()));
683
684 // Write the header
685 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
686 ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
687
688 return libtextclassifier3::Status::OK;
689 }
690
ResetDocumentKeyMapper()691 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
692 // Only one type of KeyMapper (either DynamicTrieKeyMapper or
693 // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok
694 // to call Delete() for both since Delete() returns OK if any of them doesn't
695 // exist.
696 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
697 document_key_mapper_.reset();
698 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
699 // that can support error logging.
700 libtextclassifier3::Status status =
701 DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
702 if (!status.ok()) {
703 ICING_LOG(ERROR) << status.error_message()
704 << "Failed to delete old dynamic trie key mapper";
705 return status;
706 }
707 status = PersistentHashMapKeyMapper<DocumentId>::Delete(
708 *filesystem_, MakeUriHashMapperWorkingPath(base_dir_));
709 if (!status.ok()) {
710 ICING_LOG(ERROR) << status.error_message()
711 << "Failed to delete old persistent hash map key mapper";
712 return status;
713 }
714
715 // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
716 // that can support error logging.
717 auto document_key_mapper_or =
718 CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
719 if (!document_key_mapper_or.ok()) {
720 ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
721 << "Failed to re-init key mapper";
722 return document_key_mapper_or.status();
723 }
724 document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
725 return libtextclassifier3::Status::OK;
726 }
727
ResetDocumentIdMapper()728 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
729 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
730 document_id_mapper_.reset();
731 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
732 // that can support error logging.
733 libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
734 *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
735 if (!status.ok()) {
736 ICING_LOG(ERROR) << status.error_message()
737 << "Failed to delete old document_id mapper";
738 return status;
739 }
740 // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
741 // that can support error logging.
742 auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
743 *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
744 MemoryMappedFile::READ_WRITE_AUTO_SYNC);
745 if (!document_id_mapper_or.ok()) {
746 ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
747 << "Failed to re-init document_id mapper";
748 return document_id_mapper_or.status();
749 }
750 document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
751 return libtextclassifier3::Status::OK;
752 }
753
ResetDocumentAssociatedScoreCache()754 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
755 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
756 score_cache_.reset();
757 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
758 *filesystem_, MakeScoreCacheFilename(base_dir_)));
759 ICING_ASSIGN_OR_RETURN(score_cache_,
760 FileBackedVector<DocumentAssociatedScoreData>::Create(
761 *filesystem_, MakeScoreCacheFilename(base_dir_),
762 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
763 return libtextclassifier3::Status::OK;
764 }
765
ResetCorpusAssociatedScoreCache()766 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
767 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
768 corpus_score_cache_.reset();
769 ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
770 *filesystem_, MakeCorpusScoreCache(base_dir_)));
771 ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
772 FileBackedVector<CorpusAssociatedScoreData>::Create(
773 *filesystem_, MakeCorpusScoreCache(base_dir_),
774 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
775 return libtextclassifier3::Status::OK;
776 }
777
ResetFilterCache()778 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
779 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
780 filter_cache_.reset();
781 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
782 *filesystem_, MakeFilterCacheFilename(base_dir_)));
783 ICING_ASSIGN_OR_RETURN(filter_cache_,
784 FileBackedVector<DocumentFilterData>::Create(
785 *filesystem_, MakeFilterCacheFilename(base_dir_),
786 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
787 return libtextclassifier3::Status::OK;
788 }
789
ResetNamespaceMapper()790 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
791 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
792 namespace_mapper_.reset();
793 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
794 // that can support error logging.
795 libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
796 *filesystem_, MakeNamespaceMapperFilename(base_dir_));
797 if (!status.ok()) {
798 ICING_LOG(ERROR) << status.error_message()
799 << "Failed to delete old namespace_id mapper";
800 return status;
801 }
802 ICING_ASSIGN_OR_RETURN(
803 namespace_mapper_,
804 DynamicTrieKeyMapper<NamespaceId>::Create(
805 *filesystem_, MakeNamespaceMapperFilename(base_dir_),
806 kNamespaceMapperMaxSize));
807 return libtextclassifier3::Status::OK;
808 }
809
ResetCorpusMapper()810 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
811 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
812 corpus_mapper_.reset();
813 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
814 // that can support error logging.
815 libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
816 *filesystem_, MakeCorpusMapperFilename(base_dir_));
817 if (!status.ok()) {
818 ICING_LOG(ERROR) << status.error_message()
819 << "Failed to delete old corpus_id mapper";
820 return status;
821 }
822 auto corpus_mapper_or =
823 DynamicTrieKeyMapper<CorpusId,
824 fingerprint_util::FingerprintStringFormatter>::
825 Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
826 kCorpusMapperMaxSize);
827 if (!corpus_mapper_or.ok()) {
828 return std::move(corpus_mapper_or).status();
829 }
830 corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
831 return libtextclassifier3::Status::OK;
832 }
833
ComputeChecksum() const834 libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
835 Crc32 total_checksum;
836
837 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
838 // that can support error logging.
839 auto checksum_or = document_log_->ComputeChecksum();
840 if (!checksum_or.ok()) {
841 ICING_LOG(ERROR) << checksum_or.status().error_message()
842 << "Failed to compute checksum of DocumentLog";
843 return checksum_or.status();
844 }
845 Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
846
847 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
848 // that can support error logging.
849 checksum_or = document_key_mapper_->ComputeChecksum();
850 if (!checksum_or.ok()) {
851 ICING_LOG(ERROR) << checksum_or.status().error_message()
852 << "Failed to compute checksum of DocumentKeyMapper";
853 return checksum_or.status();
854 }
855 Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
856
857 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
858 // that can support error logging.
859 checksum_or = document_id_mapper_->ComputeChecksum();
860 if (!checksum_or.ok()) {
861 ICING_LOG(ERROR) << checksum_or.status().error_message()
862 << "Failed to compute checksum of DocumentIdMapper";
863 return checksum_or.status();
864 }
865 Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
866
867 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
868 // that can support error logging.
869 checksum_or = score_cache_->ComputeChecksum();
870 if (!checksum_or.ok()) {
871 ICING_LOG(ERROR) << checksum_or.status().error_message()
872 << "Failed to compute checksum of score cache";
873 return checksum_or.status();
874 }
875 Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
876
877 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
878 // that can support error logging.
879 checksum_or = filter_cache_->ComputeChecksum();
880 if (!checksum_or.ok()) {
881 ICING_LOG(ERROR) << checksum_or.status().error_message()
882 << "Failed to compute checksum of filter cache";
883 return checksum_or.status();
884 }
885 Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
886
887 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
888 // that can support error logging.
889 checksum_or = namespace_mapper_->ComputeChecksum();
890 if (!checksum_or.ok()) {
891 ICING_LOG(ERROR) << checksum_or.status().error_message()
892 << "Failed to compute checksum of namespace mapper";
893 return checksum_or.status();
894 }
895 Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
896
897 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
898 // that can support error logging.
899 checksum_or = corpus_mapper_->ComputeChecksum();
900 if (!checksum_or.ok()) {
901 ICING_LOG(ERROR) << checksum_or.status().error_message()
902 << "Failed to compute checksum of corpus mapper";
903 return checksum_or.status();
904 }
905 Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
906
907 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
908 // that can support error logging.
909 checksum_or = corpus_score_cache_->ComputeChecksum();
910 if (!checksum_or.ok()) {
911 ICING_LOG(WARNING) << checksum_or.status().error_message()
912 << "Failed to compute checksum of score cache";
913 return checksum_or.status();
914 }
915 Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
916
917 // NOTE: We purposely don't include usage_store checksum here because we can't
918 // regenerate it from ground truth documents. If it gets corrupted, we'll just
919 // clear all usage reports, but we shouldn't throw everything else in the
920 // document store out.
921
922 total_checksum.Append(std::to_string(document_log_checksum.Get()));
923 total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
924 total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
925 total_checksum.Append(std::to_string(score_cache_checksum.Get()));
926 total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
927 total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
928 total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
929 total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
930
931 return total_checksum;
932 }
933
HeaderExists()934 bool DocumentStore::HeaderExists() {
935 if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
936 return false;
937 }
938
939 int64_t file_size =
940 filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
941
942 // If it's been truncated to size 0 before, we consider it to be a new file
943 return file_size != 0 && file_size != Filesystem::kBadFileSize;
944 }
945
UpdateHeader(const Crc32 & checksum)946 libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
947 // Write the header
948 DocumentStore::Header header;
949 header.magic =
950 DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_);
951 header.checksum = checksum.Get();
952
953 // This should overwrite the header.
954 ScopedFd sfd(
955 filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
956 if (!sfd.is_valid() ||
957 !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
958 !filesystem_->DataSync(sfd.get())) {
959 return absl_ports::InternalError(absl_ports::StrCat(
960 "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
961 }
962 return libtextclassifier3::Status::OK;
963 }
964
// Writes `document` to the ground-truth document log and then updates every
// derived structure (uri key mapper, id mapper, namespace/corpus mappers,
// score and filter caches). Returns the DocumentId assigned to the new
// document.
//
// If a document with the same (namespace, uri) already exists, its usage
// scores are cloned to the new document and the old document is deleted.
// If `put_document_stats` is non-null, document size and store latency are
// recorded into it.
libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
    DocumentProto&& document, PutDocumentStatsProto* put_document_stats) {
  std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
  ICING_RETURN_IF_ERROR(document_validator_.Validate(document));

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_size(document.ByteSizeLong());
  }

  // Copy fields needed before they are moved (the document proto is consumed
  // by the log write below).
  std::string name_space = document.namespace_();
  std::string uri = document.uri();
  std::string schema = document.schema();
  int document_score = document.score();
  int32_t length_in_tokens = document.internal_fields().length_in_tokens();
  int64_t creation_timestamp_ms = document.creation_timestamp_ms();

  // Sets the creation timestamp if caller hasn't specified.
  if (document.creation_timestamp_ms() == 0) {
    creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
    document.set_creation_timestamp_ms(creation_timestamp_ms);
  }

  int64_t expiration_timestamp_ms =
      CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());

  // Update ground truth first, before any derived structure, so derived files
  // can be regenerated from the log if we crash partway through.
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto offset_or =
      document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
  if (!offset_or.ok()) {
    ICING_LOG(ERROR) << offset_or.status().error_message()
                     << "Failed to write document";
    return offset_or.status();
  }
  int64_t file_offset = std::move(offset_or).ValueOrDie();

  // Get existing document id. NOT_FOUND just means this is a brand-new
  // (namespace, uri); any other failure is a real error.
  auto old_document_id_or = GetDocumentId(name_space, uri);
  if (!old_document_id_or.ok() &&
      !absl_ports::IsNotFound(old_document_id_or.status())) {
    return absl_ports::InternalError("Failed to read from key mapper");
  }

  // Creates a new document id: ids are dense, so the next id is the current
  // element count of the id mapper.
  DocumentId new_document_id = document_id_mapper_->num_elements();
  if (!IsDocumentIdValid(new_document_id)) {
    return absl_ports::ResourceExhaustedError(
        "Exceeded maximum number of documents. Try calling Optimize to reclaim "
        "some space.");
  }

  // Update namespace maps
  ICING_ASSIGN_OR_RETURN(
      NamespaceId namespace_id,
      namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));

  // Updates key mapper and document_id mapper
  ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
      MakeFingerprint(namespace_id, name_space, uri), new_document_id));
  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));

  // Update corpus maps
  ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
                         corpus_mapper_->GetOrPut(
                             MakeFingerprint(namespace_id, name_space, schema),
                             corpus_mapper_->num_keys()));

  // Fold this document's token length into the corpus-level scoring data.
  ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                         GetCorpusAssociatedScoreDataToUpdate(corpusId));
  scoring_data.AddDocument(length_in_tokens);

  ICING_RETURN_IF_ERROR(
      UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));

  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
      new_document_id,
      DocumentAssociatedScoreData(corpusId, document_score,
                                  creation_timestamp_ms, length_in_tokens)));

  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                         schema_store_->GetSchemaTypeId(schema));

  ICING_RETURN_IF_ERROR(UpdateFilterCache(
      new_document_id, DocumentFilterData(namespace_id, schema_type_id,
                                          expiration_timestamp_ms)));

  if (old_document_id_or.ok()) {
    // The old document exists, copy over the usage scores and delete the old
    // document.
    DocumentId old_document_id = old_document_id_or.ValueOrDie();

    ICING_RETURN_IF_ERROR(
        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
                                       /*to_document_id=*/new_document_id));

    // Delete the old document. It's fine if it's not found since it might have
    // been deleted previously.
    auto delete_status =
        Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
      // Real error, pass it up.
      return delete_status;
    }
  }

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_store_latency_ms(
        put_timer->GetElapsedMilliseconds());
  }

  return new_document_id;
}
1079
Get(const std::string_view name_space,const std::string_view uri,bool clear_internal_fields) const1080 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1081 const std::string_view name_space, const std::string_view uri,
1082 bool clear_internal_fields) const {
1083 // TODO(b/147231617): Make a better way to replace the error message in an
1084 // existing Status.
1085 auto document_id_or = GetDocumentId(name_space, uri);
1086 if (!document_id_or.ok()) {
1087 if (absl_ports::IsNotFound(document_id_or.status())) {
1088 ICING_VLOG(1) << document_id_or.status().error_message();
1089 return absl_ports::NotFoundError(absl_ports::StrCat(
1090 "Document (", name_space, ", ", uri, ") not found."));
1091 }
1092
1093 // Real error. Log it in error level and pass it up.
1094 ICING_LOG(ERROR) << document_id_or.status().error_message();
1095 return std::move(document_id_or).status();
1096 }
1097 DocumentId document_id = document_id_or.ValueOrDie();
1098
1099 // TODO(b/147231617): Make a better way to replace the error message in an
1100 // existing Status.
1101 auto status_or = Get(document_id, clear_internal_fields);
1102 if (!status_or.ok()) {
1103 if (absl_ports::IsNotFound(status_or.status())) {
1104 ICING_VLOG(1) << status_or.status().error_message();
1105 return absl_ports::NotFoundError(absl_ports::StrCat(
1106 "Document (", name_space, ", ", uri, ") not found."));
1107 }
1108
1109 // Real error. Log it in error level.
1110 ICING_LOG(ERROR) << status_or.status().error_message();
1111 }
1112 return status_or;
1113 }
1114
Get(DocumentId document_id,bool clear_internal_fields) const1115 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1116 DocumentId document_id, bool clear_internal_fields) const {
1117 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1118 auto document_filter_data_optional_ =
1119 GetAliveDocumentFilterData(document_id, current_time_ms);
1120 if (!document_filter_data_optional_) {
1121 // The document doesn't exist. Let's check if the document id is invalid, we
1122 // will return InvalidArgumentError. Otherwise we should return NOT_FOUND
1123 // error.
1124 if (!IsDocumentIdValid(document_id)) {
1125 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1126 "Document id '%d' invalid.", document_id));
1127 }
1128 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1129 "Document id '%d' doesn't exist", document_id));
1130 }
1131
1132 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1133 if (!document_log_offset_or.ok()) {
1134 // Since we've just checked that our document_id is valid a few lines
1135 // above, there's no reason this should fail and an error should never
1136 // happen.
1137 return absl_ports::InternalError("Failed to find document offset.");
1138 }
1139 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1140
1141 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1142 // that can support error logging.
1143 auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
1144 if (!document_wrapper_or.ok()) {
1145 ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
1146 << "Failed to read from document log";
1147 return document_wrapper_or.status();
1148 }
1149 DocumentWrapper document_wrapper =
1150 std::move(document_wrapper_or).ValueOrDie();
1151 if (clear_internal_fields) {
1152 document_wrapper.mutable_document()->clear_internal_fields();
1153 }
1154
1155 return std::move(*document_wrapper.mutable_document());
1156 }
1157
GetDocumentId(const std::string_view name_space,const std::string_view uri) const1158 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1159 const std::string_view name_space, const std::string_view uri) const {
1160 auto namespace_id_or = namespace_mapper_->Get(name_space);
1161 libtextclassifier3::Status status = namespace_id_or.status();
1162 if (status.ok()) {
1163 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1164 auto document_id_or = document_key_mapper_->Get(
1165 MakeFingerprint(namespace_id, name_space, uri));
1166 status = document_id_or.status();
1167 if (status.ok()) {
1168 // Guaranteed to have a DocumentId now
1169 return document_id_or.ValueOrDie();
1170 }
1171 }
1172 return absl_ports::Annotate(
1173 status, absl_ports::StrCat(
1174 "Failed to find DocumentId by key: ", name_space, ", ", uri));
1175 }
1176
GetDocumentId(const NamespaceFingerprintIdentifier & namespace_fingerprint_identifier) const1177 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1178 const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
1179 const {
1180 if (!namespace_id_fingerprint_) {
1181 return absl_ports::FailedPreconditionError(
1182 "Cannot lookup document id by namespace id + fingerprint without "
1183 "enabling it on uri_mapper");
1184 }
1185
1186 auto document_id_or = document_key_mapper_->Get(
1187 namespace_fingerprint_identifier.EncodeToCString());
1188 if (document_id_or.ok()) {
1189 return document_id_or.ValueOrDie();
1190 }
1191 return absl_ports::Annotate(
1192 std::move(document_id_or).status(),
1193 "Failed to find DocumentId by namespace id + fingerprint");
1194 }
1195
GetAllNamespaces() const1196 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
1197 std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1198 GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1199
1200 std::unordered_set<NamespaceId> existing_namespace_ids;
1201 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1202 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1203 ++document_id) {
1204 // filter_cache_->Get can only fail if document_id is < 0
1205 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1206 auto status_or_data = filter_cache_->Get(document_id);
1207 if (!status_or_data.ok()) {
1208 ICING_LOG(ERROR)
1209 << "Error while iterating over filter cache in GetAllNamespaces";
1210 return std::vector<std::string>();
1211 }
1212 const DocumentFilterData* data = status_or_data.ValueOrDie();
1213
1214 if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
1215 existing_namespace_ids.insert(data->namespace_id());
1216 }
1217 }
1218
1219 std::vector<std::string> existing_namespaces;
1220 for (auto itr = existing_namespace_ids.begin();
1221 itr != existing_namespace_ids.end(); ++itr) {
1222 existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
1223 }
1224 return existing_namespaces;
1225 }
1226
GetAliveDocumentFilterData(DocumentId document_id,int64_t current_time_ms) const1227 std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
1228 DocumentId document_id, int64_t current_time_ms) const {
1229 if (IsDeleted(document_id)) {
1230 return std::nullopt;
1231 }
1232 return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
1233 }
1234
IsDeleted(DocumentId document_id) const1235 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1236 auto file_offset_or = document_id_mapper_->Get(document_id);
1237 if (!file_offset_or.ok()) {
1238 // This would only happen if document_id is out of range of the
1239 // document_id_mapper, meaning we got some invalid document_id. Callers
1240 // should already have checked that their document_id is valid or used
1241 // DoesDocumentExist(WithStatus). Regardless, return true since the
1242 // document doesn't exist.
1243 return true;
1244 }
1245 int64_t file_offset = *file_offset_or.ValueOrDie();
1246 return file_offset == kDocDeletedFlag;
1247 }
1248
// Returns DocumentFilterData if the document is not expired. Otherwise,
// std::nullopt.
std::optional<DocumentFilterData>
DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
                                               int64_t current_time_ms) const {
  auto filter_data_or = filter_cache_->GetCopy(document_id);
  if (!filter_data_or.ok()) {
    // This would only happen if document_id is out of range of the
    // filter_cache, meaning we got some invalid document_id. Callers should
    // already have checked that their document_id is valid or used
    // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since the
    // document doesn't exist.
    return std::nullopt;
  }
  DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();

  // Check if it's past the expiration time. A document expiring exactly at
  // current_time_ms counts as expired (>=).
  if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
    return std::nullopt;
  }
  return document_filter_data;
}
1271
Delete(const std::string_view name_space,const std::string_view uri,int64_t current_time_ms)1272 libtextclassifier3::Status DocumentStore::Delete(
1273 const std::string_view name_space, const std::string_view uri,
1274 int64_t current_time_ms) {
1275 // Try to get the DocumentId first
1276 auto document_id_or = GetDocumentId(name_space, uri);
1277 if (!document_id_or.ok()) {
1278 return absl_ports::Annotate(
1279 document_id_or.status(),
1280 absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1281 ", uri: ", uri));
1282 }
1283 return Delete(document_id_or.ValueOrDie(), current_time_ms);
1284 }
1285
Delete(DocumentId document_id,int64_t current_time_ms)1286 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1287 int64_t current_time_ms) {
1288 auto document_filter_data_optional_ =
1289 GetAliveDocumentFilterData(document_id, current_time_ms);
1290 if (!document_filter_data_optional_) {
1291 // The document doesn't exist. We should return InvalidArgumentError if the
1292 // document id is invalid. Otherwise we should return NOT_FOUND error.
1293 if (!IsDocumentIdValid(document_id)) {
1294 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1295 "Document id '%d' invalid.", document_id));
1296 }
1297 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1298 "Document id '%d' doesn't exist", document_id));
1299 }
1300
1301 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1302 if (!document_log_offset_or.ok()) {
1303 return absl_ports::InternalError("Failed to find document offset.");
1304 }
1305 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1306
1307 // Erases document proto.
1308 ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1309 return ClearDerivedData(document_id);
1310 }
1311
// Returns the NamespaceId mapped to `name_space`, or the mapper's lookup
// error if the namespace has never been stored.
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
    std::string_view name_space) const {
  return namespace_mapper_->Get(name_space);
}
1316
GetCorpusId(const std::string_view name_space,const std::string_view schema) const1317 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1318 const std::string_view name_space, const std::string_view schema) const {
1319 ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
1320 namespace_mapper_->Get(name_space));
1321 return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
1322 }
1323
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const std::string_view name_space,const std::string_view schema) const1324 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1325 ResultSpecProto::ResultGroupingType result_group_type,
1326 const std::string_view name_space, const std::string_view schema) const {
1327 auto namespace_id = GetNamespaceId(name_space);
1328 auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
1329 switch (result_group_type) {
1330 case ResultSpecProto::NONE:
1331 return absl_ports::InvalidArgumentError(
1332 "Cannot group by ResultSpecProto::NONE");
1333 case ResultSpecProto::SCHEMA_TYPE:
1334 if (schema_type_id.ok()) {
1335 return schema_type_id.ValueOrDie();
1336 }
1337 break;
1338 case ResultSpecProto::NAMESPACE:
1339 if (namespace_id.ok()) {
1340 return namespace_id.ValueOrDie();
1341 }
1342 break;
1343 case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1344 if (namespace_id.ok() && schema_type_id.ok()) {
1345 // TODO(b/258715421): Temporary workaround to get a
1346 // ResultGroupingEntryId given the Namespace string
1347 // and Schema string.
1348 return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
1349 }
1350 break;
1351 }
1352 return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1353 }
1354
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const NamespaceId namespace_id,const SchemaTypeId schema_type_id) const1355 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1356 ResultSpecProto::ResultGroupingType result_group_type,
1357 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
1358 switch (result_group_type) {
1359 case ResultSpecProto::NONE:
1360 return absl_ports::InvalidArgumentError(
1361 "Cannot group by ResultSpecProto::NONE");
1362 case ResultSpecProto::SCHEMA_TYPE:
1363 return schema_type_id;
1364 case ResultSpecProto::NAMESPACE:
1365 return namespace_id;
1366 case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1367 // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
1368 // given the Namespace Id and SchemaType Id.
1369 return namespace_id << 16 | schema_type_id;
1370 }
1371 return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1372 }
1373
1374 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const1375 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1376 auto score_data_or = score_cache_->GetCopy(document_id);
1377 if (!score_data_or.ok()) {
1378 ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1379 << " from score_cache_";
1380 return absl_ports::NotFoundError(
1381 std::move(score_data_or).status().error_message());
1382 }
1383
1384 DocumentAssociatedScoreData document_associated_score_data =
1385 std::move(score_data_or).ValueOrDie();
1386 return document_associated_score_data;
1387 }
1388
// Returns a copy of the scoring data for `corpus_id`, or the underlying
// vector's lookup error (OUT_OF_RANGE for a corpus id not yet in the cache).
libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
  return corpus_score_cache_->GetCopy(corpus_id);
}
1393
1394 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const1395 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1396 auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1397 if (!corpus_scoring_data_or.ok() &&
1398 absl_ports::IsOutOfRange(corpus_scoring_data_or.status())) {
1399 // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1400 // corpus_score_cache_ for the first time. Return a default
1401 // CorpusAssociatedScoreData object in this case.
1402 return CorpusAssociatedScoreData();
1403 }
1404
1405 return corpus_scoring_data_or;
1406 }
1407
1408 // TODO(b/273826815): Decide on and adopt a consistent pattern for handling
1409 // NOT_FOUND 'errors' returned by our internal classes.
GetUsageScores(DocumentId document_id,int64_t current_time_ms) const1410 std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
1411 DocumentId document_id, int64_t current_time_ms) const {
1412 std::optional<DocumentFilterData> opt =
1413 GetAliveDocumentFilterData(document_id, current_time_ms);
1414 if (!opt) {
1415 return std::nullopt;
1416 }
1417 if (document_id >= usage_store_->num_elements()) {
1418 return std::nullopt;
1419 }
1420 auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1421 if (!usage_scores_or.ok()) {
1422 ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
1423 << usage_scores_or.status().error_message();
1424 return std::nullopt;
1425 }
1426 return std::move(usage_scores_or).ValueOrDie();
1427 }
1428
ReportUsage(const UsageReport & usage_report)1429 libtextclassifier3::Status DocumentStore::ReportUsage(
1430 const UsageReport& usage_report) {
1431 ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1432 GetDocumentId(usage_report.document_namespace(),
1433 usage_report.document_uri()));
1434 // We can use the internal version here because we got our document_id from
1435 // our internal data structures. We would have thrown some error if the
1436 // namespace and/or uri were incorrect.
1437 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1438 if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1439 // Document was probably deleted or expired.
1440 return absl_ports::NotFoundError(absl_ports::StrCat(
1441 "Couldn't report usage on a nonexistent document: (namespace: '",
1442 usage_report.document_namespace(), "', uri: '",
1443 usage_report.document_uri(), "')"));
1444 }
1445
1446 return usage_store_->AddUsageReport(usage_report, document_id);
1447 }
1448
DeleteByNamespace(std::string_view name_space)1449 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1450 std::string_view name_space) {
1451 DeleteByGroupResult result;
1452 auto namespace_id_or = namespace_mapper_->Get(name_space);
1453 if (!namespace_id_or.ok()) {
1454 result.status = absl_ports::Annotate(
1455 namespace_id_or.status(),
1456 absl_ports::StrCat("Failed to find namespace: ", name_space));
1457 return result;
1458 }
1459 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1460 auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1461 if (!num_deleted_or.ok()) {
1462 result.status = std::move(num_deleted_or).status();
1463 return result;
1464 }
1465
1466 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1467 if (result.num_docs_deleted <= 0) {
1468 // Treat the fact that no existing documents had this namespace to be the
1469 // same as this namespace not existing at all.
1470 result.status = absl_ports::NotFoundError(
1471 absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1472 return result;
1473 }
1474
1475 return result;
1476 }
1477
DeleteBySchemaType(std::string_view schema_type)1478 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1479 std::string_view schema_type) {
1480 DeleteByGroupResult result;
1481 auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1482 if (!schema_type_id_or.ok()) {
1483 result.status = absl_ports::Annotate(
1484 schema_type_id_or.status(),
1485 absl_ports::StrCat("Failed to find schema type. schema_type: ",
1486 schema_type));
1487 return result;
1488 }
1489 SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1490 auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1491 if (!num_deleted_or.ok()) {
1492 result.status = std::move(num_deleted_or).status();
1493 return result;
1494 }
1495
1496 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1497 if (result.num_docs_deleted <= 0) {
1498 result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1499 "No documents found with schema type '", schema_type, "'"));
1500 return result;
1501 }
1502
1503 return result;
1504 }
1505
BatchDelete(NamespaceId namespace_id,SchemaTypeId schema_type_id)1506 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1507 NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1508 // Tracks if there were any existing documents with this namespace that we
1509 // will mark as deleted.
1510 int num_updated_documents = 0;
1511
1512 // Traverse FilterCache and delete all docs that match namespace_id and
1513 // schema_type_id.
1514 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1515 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1516 ++document_id) {
1517 // filter_cache_->Get can only fail if document_id is < 0
1518 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1519 ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1520 filter_cache_->Get(document_id));
1521
1522 // Check namespace only when the input namespace id is valid.
1523 if (namespace_id != kInvalidNamespaceId &&
1524 (data->namespace_id() == kInvalidNamespaceId ||
1525 data->namespace_id() != namespace_id)) {
1526 // The document has already been hard-deleted or isn't from the desired
1527 // namespace.
1528 continue;
1529 }
1530
1531 // Check schema type only when the input schema type id is valid.
1532 if (schema_type_id != kInvalidSchemaTypeId &&
1533 (data->schema_type_id() == kInvalidSchemaTypeId ||
1534 data->schema_type_id() != schema_type_id)) {
1535 // The document has already been hard-deleted or doesn't have the
1536 // desired schema type.
1537 continue;
1538 }
1539
1540 // The document has the desired namespace and schema type, it either
1541 // exists or has expired.
1542 libtextclassifier3::Status delete_status =
1543 Delete(document_id, current_time_ms);
1544 if (absl_ports::IsNotFound(delete_status)) {
1545 continue;
1546 } else if (!delete_status.ok()) {
1547 // Real error, pass up.
1548 return delete_status;
1549 }
1550 ++num_updated_documents;
1551 }
1552
1553 return num_updated_documents;
1554 }
1555
PersistToDisk(PersistType::Code persist_type)1556 libtextclassifier3::Status DocumentStore::PersistToDisk(
1557 PersistType::Code persist_type) {
1558 if (persist_type == PersistType::LITE) {
1559 // only persist the document log.
1560 return document_log_->PersistToDisk();
1561 }
1562 ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1563 ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1564 ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1565 ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1566 ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1567 ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1568 ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1569 ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1570 ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1571
1572 // Update the combined checksum and write to header file.
1573 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1574 ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
1575
1576 return libtextclassifier3::Status::OK;
1577 }
1578
GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t> & value_or,int64_t default_value)1579 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1580 int64_t default_value) {
1581 return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1582 }
1583
GetMemberStorageInfo() const1584 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1585 DocumentStorageInfoProto storage_info;
1586 storage_info.set_document_log_size(
1587 GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1588 storage_info.set_key_mapper_size(
1589 GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1590 storage_info.set_document_id_mapper_size(
1591 GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1592 storage_info.set_score_cache_size(
1593 GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1594 storage_info.set_filter_cache_size(
1595 GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1596 storage_info.set_namespace_id_mapper_size(
1597 GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1598 storage_info.set_corpus_mapper_size(
1599 GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1600 storage_info.set_corpus_score_cache_size(
1601 GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1602 return storage_info;
1603 }
1604
// Fills in the alive/deleted/expired document counts on `storage_info`:
// both the file-level totals and a per-namespace breakdown that also tracks
// how many alive/expired documents have each usage type. Returns the
// updated proto.
//
// Documents whose filter data, namespace, or usage scores can't be fetched
// are logged at VLOG(1) and excluded from the per-namespace breakdown.
DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
    DocumentStorageInfoProto storage_info) const {
  int total_num_alive = 0;
  int total_num_expired = 0;
  int total_num_deleted = 0;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  // Keyed by namespace string; entries are created lazily as documents from
  // each namespace are encountered.
  std::unordered_map<std::string, NamespaceStorageInfoProto>
      namespace_to_storage_info;

  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0;
       document_id < document_id_mapper_->num_elements(); ++document_id) {
    // Check if it's deleted first.
    if (IsDeleted(document_id)) {
      // We don't have the namespace id of hard deleted documents anymore, so
      // we can't add to our namespace storage info.
      ++total_num_deleted;
      continue;
    }

    // At this point, the document is either alive or expired, we can get
    // namespace info for it.
    auto filter_data_or = filter_cache_->Get(document_id);
    if (!filter_data_or.ok()) {
      ICING_VLOG(1) << "Error trying to get filter data for document store "
                       "storage info counts.";
      continue;
    }
    const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
    auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
    if (itr == namespace_id_to_namespace.end()) {
      ICING_VLOG(1) << "Error trying to find namespace for document store "
                       "storage info counts.";
      continue;
    }
    const std::string& name_space = itr->second;

    // Always set the namespace, if the NamespaceStorageInfoProto didn't exist
    // before, we'll get back a default instance of it.
    NamespaceStorageInfoProto& namespace_storage_info =
        namespace_to_storage_info[name_space];
    namespace_storage_info.set_namespace_(name_space);

    // Get usage scores
    auto usage_scores_or = usage_store_->GetUsageScores(document_id);
    if (!usage_scores_or.ok()) {
      ICING_VLOG(1) << "Error trying to get usage scores for document store "
                       "storage info counts.";
      continue;
    }
    UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();

    // Update our stats: a non-deleted document that is past its expiration
    // counts as expired, otherwise as alive.
    if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
      ++total_num_expired;
      namespace_storage_info.set_num_expired_documents(
          namespace_storage_info.num_expired_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type1(
            namespace_storage_info.num_expired_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type2(
            namespace_storage_info.num_expired_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type3(
            namespace_storage_info.num_expired_documents_usage_type3() + 1);
      }
    } else {
      ++total_num_alive;
      namespace_storage_info.set_num_alive_documents(
          namespace_storage_info.num_alive_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type1(
            namespace_storage_info.num_alive_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type2(
            namespace_storage_info.num_alive_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type3(
            namespace_storage_info.num_alive_documents_usage_type3() + 1);
      }
    }
  }

  // Move the accumulated per-namespace protos into the result.
  for (auto& itr : namespace_to_storage_info) {
    storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
  }
  storage_info.set_num_alive_documents(total_num_alive);
  storage_info.set_num_deleted_documents(total_num_deleted);
  storage_info.set_num_expired_documents(total_num_expired);
  return storage_info;
}
1702
GetStorageInfo() const1703 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1704 DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1705 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1706 if (directory_size != Filesystem::kBadFileSize) {
1707 storage_info.set_document_store_size(directory_size);
1708 } else {
1709 storage_info.set_document_store_size(-1);
1710 }
1711 storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1712 return CalculateDocumentStatusCounts(std::move(storage_info));
1713 }
1714
UpdateSchemaStore(const SchemaStore * schema_store)1715 libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
1716 const SchemaStore* schema_store) {
1717 // Update all references to the SchemaStore
1718 schema_store_ = schema_store;
1719 document_validator_.UpdateSchemaStore(schema_store);
1720
1721 int size = document_id_mapper_->num_elements();
1722 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1723 for (DocumentId document_id = 0; document_id < size; document_id++) {
1724 auto document_or = Get(document_id);
1725 if (absl_ports::IsNotFound(document_or.status())) {
1726 // Skip nonexistent documents
1727 continue;
1728 } else if (!document_or.ok()) {
1729 // Real error, pass up
1730 return absl_ports::Annotate(
1731 document_or.status(),
1732 IcingStringUtil::StringPrintf(
1733 "Failed to retrieve Document for DocumentId %d", document_id));
1734 }
1735
1736 // Guaranteed to have a document now.
1737 DocumentProto document = document_or.ValueOrDie();
1738
1739 // Revalidate that this document is still compatible
1740 if (document_validator_.Validate(document).ok()) {
1741 // Update the SchemaTypeId for this entry
1742 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1743 schema_store_->GetSchemaTypeId(document.schema()));
1744 ICING_ASSIGN_OR_RETURN(
1745 typename FileBackedVector<DocumentFilterData>::MutableView
1746 doc_filter_data_view,
1747 filter_cache_->GetMutable(document_id));
1748 doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
1749 } else {
1750 // Document is no longer valid with the new SchemaStore. Mark as
1751 // deleted
1752 auto delete_status =
1753 Delete(document.namespace_(), document.uri(), current_time_ms);
1754 if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1755 // Real error, pass up
1756 return delete_status;
1757 }
1758 }
1759 }
1760
1761 return libtextclassifier3::Status::OK;
1762 }
1763
// Like UpdateSchemaStore, but uses `set_schema_result` to limit work to the
// schema types that actually changed instead of re-examining every document.
//
// For each alive document:
//   - if its type was deleted by the schema change, the document is deleted;
//   - if its type was assigned a different SchemaTypeId, the cached id in
//     the filter cache is refreshed;
//   - if its type became incompatible, the document is revalidated and
//     deleted when validation fails.
//
// No-op when `set_schema_result.success` is false. Returns OK on success or
// the first real error encountered.
libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
    const SchemaStore* schema_store,
    const SchemaStore::SetSchemaResult& set_schema_result) {
  if (!set_schema_result.success) {
    // No new schema was set, no work to be done
    return libtextclassifier3::Status::OK;
  }

  // Update all references to the SchemaStore
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      // Skip nonexistent documents
      continue;
    }

    // Guaranteed that the document exists now.
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));

    bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
                               filter_data->schema_type_id()) != 0;

    // Check if we need to update the FilterCache entry for this document. It
    // may have been assigned a different SchemaTypeId in the new SchemaStore.
    bool update_filter_cache =
        set_schema_result.old_schema_type_ids_changed.count(
            filter_data->schema_type_id()) != 0;

    // Check if we need to revalidate this document if the type is now
    // incompatible
    bool revalidate_document =
        set_schema_result.schema_types_incompatible_by_id.count(
            filter_data->schema_type_id()) != 0;

    // The document proto is only fetched when one of the two per-document
    // actions actually needs it.
    if (update_filter_cache || revalidate_document) {
      ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));

      if (update_filter_cache) {
        ICING_ASSIGN_OR_RETURN(
            SchemaTypeId schema_type_id,
            schema_store_->GetSchemaTypeId(document.schema()));
        ICING_ASSIGN_OR_RETURN(
            typename FileBackedVector<DocumentFilterData>::MutableView
                doc_filter_data_view,
            filter_cache_->GetMutable(document_id));
        doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
      }
      if (revalidate_document) {
        delete_document = !document_validator_.Validate(document).ok();
      }
    }

    if (delete_document) {
      // Document is no longer valid with the new SchemaStore. Mark as deleted
      auto delete_status = Delete(document_id, current_time_ms);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
1833
// TODO(b/121227117): Implement Optimize()
// Currently a no-op placeholder: always returns OK without touching any
// state. Callers wanting actual compaction should use OptimizeInto().
libtextclassifier3::Status DocumentStore::Optimize() {
  return libtextclassifier3::Status::OK;
}
1838
// Compacts this store by copying every alive document (and its non-default
// usage scores) into a brand-new DocumentStore at `new_directory`, then
// persisting the new store. Returns an OptimizeResult holding the old->new
// DocumentId and NamespaceId mappings the caller needs to remap its indices.
//
// Returns INVALID_ARGUMENT if `new_directory` equals the current directory.
// If the namespace mapping can't be reconstructed consistently, the result's
// should_rebuild_index flag is set instead of returning an error. When
// `stats` is non-null, original/deleted/expired document and namespace
// counts are recorded on it.
libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
DocumentStore::OptimizeInto(const std::string& new_directory,
                            const LanguageSegmenter* lang_segmenter,
                            OptimizeStatsProto* stats) const {
  // Validates directory
  if (new_directory == base_dir_) {
    return absl_ports::InvalidArgumentError(
        "New directory is the same as the current one.");
  }

  // The destination store inherits this store's configuration flags.
  ICING_ASSIGN_OR_RETURN(
      auto doc_store_create_result,
      DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_,
                            /*force_recovery_and_revalidate_documents=*/false,
                            namespace_id_fingerprint_, pre_mapping_fbv_,
                            use_persistent_hash_map_, compression_level_,
                            /*initialize_stats=*/nullptr));
  std::unique_ptr<DocumentStore> new_doc_store =
      std::move(doc_store_create_result.document_store);

  // Writes all valid docs into new document store (new directory)
  int document_cnt = document_id_mapper_->num_elements();
  int num_deleted_documents = 0;
  int num_expired_documents = 0;
  UsageStore::UsageScores default_usage;

  OptimizeResult result;
  // Ids not overwritten below stay kInvalidDocumentId, marking documents
  // that were dropped during optimization.
  result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
    auto document_or = Get(document_id, /*clear_internal_fields=*/false);
    if (absl_ports::IsNotFound(document_or.status())) {
      // Not retrievable: classify it as deleted or expired for the stats.
      if (IsDeleted(document_id)) {
        ++num_deleted_documents;
      } else if (!GetNonExpiredDocumentFilterData(document_id,
                                                  current_time_ms)) {
        ++num_expired_documents;
      }
      continue;
    } else if (!document_or.ok()) {
      // Real error, pass up
      return absl_ports::Annotate(
          document_or.status(),
          IcingStringUtil::StringPrintf(
              "Failed to retrieve Document for DocumentId %d", document_id));
    }

    // Guaranteed to have a document now.
    DocumentProto document_to_keep = std::move(document_or).ValueOrDie();

    libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
    // A zero token count means the document predates token-length tracking;
    // re-tokenize it so the new store records an accurate count.
    if (document_to_keep.internal_fields().length_in_tokens() == 0) {
      auto tokenized_document_or = TokenizedDocument::Create(
          schema_store_, lang_segmenter, document_to_keep);
      if (!tokenized_document_or.ok()) {
        return absl_ports::Annotate(
            tokenized_document_or.status(),
            IcingStringUtil::StringPrintf(
                "Failed to tokenize Document for DocumentId %d", document_id));
      }
      TokenizedDocument tokenized_document(
          std::move(tokenized_document_or).ValueOrDie());
      new_document_id_or = new_doc_store->Put(
          std::move(document_to_keep), tokenized_document.num_string_tokens());
    } else {
      // TODO(b/144458732): Implement a more robust version of
      // TC_ASSIGN_OR_RETURN that can support error logging.
      new_document_id_or =
          new_doc_store->InternalPut(std::move(document_to_keep));
    }
    if (!new_document_id_or.ok()) {
      // NOTE(review): the error message is streamed before the description
      // with no separator, so the log reads "<error>Failed to write..." —
      // confirm whether this ordering is intentional.
      ICING_LOG(ERROR) << new_document_id_or.status().error_message()
                       << "Failed to write into new document store";
      return new_document_id_or.status();
    }

    result.document_id_old_to_new[document_id] =
        new_document_id_or.ValueOrDie();

    // Copy over usage scores.
    ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
                           usage_store_->GetUsageScores(document_id));
    if (!(usage_scores == default_usage)) {
      // If the usage scores for this document are the default (no usage), then
      // don't bother setting it. No need to possibly allocate storage if
      // there's nothing interesting to store.
      DocumentId new_document_id = new_document_id_or.ValueOrDie();
      ICING_RETURN_IF_ERROR(
          new_doc_store->SetUsageScores(new_document_id, usage_scores));
    }
  }

  // Construct namespace_id_old_to_new
  int namespace_cnt = namespace_mapper_->num_keys();
  std::unordered_map<NamespaceId, std::string> old_namespaces =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  if (namespace_cnt != old_namespaces.size()) {
    // This really shouldn't happen. If it really happens, then:
    // - It won't block DocumentStore optimization, so don't return error here.
    // - Instead, write a warning log here and hint the caller to rebuild index.
    ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
                       << " vs " << old_namespaces.size();
    result.should_rebuild_index = true;
  } else {
    // Ids not overwritten below stay kInvalidNamespaceId (namespace no
    // longer present in the new store).
    result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
    for (const auto& [old_namespace_id, ns] : old_namespaces) {
      if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
        // This really shouldn't happen. If it really happens, then:
        // - It won't block DocumentStore optimization, so don't return error
        //   here.
        // - Instead, write a warning log here and hint the caller to rebuild
        //   index.
        ICING_LOG(WARNING) << "Found unexpected namespace id "
                           << old_namespace_id << ". Should be in range 0 to "
                           << result.namespace_id_old_to_new.size()
                           << " (exclusive).";
        result.namespace_id_old_to_new.clear();
        result.should_rebuild_index = true;
        break;
      }

      // A NOT_FOUND here just means no alive document carried this namespace
      // into the new store; leave the mapping as invalid.
      auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
      if (!new_namespace_id_or.ok()) {
        if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
          continue;
        }
        // Real error, return it.
        return std::move(new_namespace_id_or).status();
      }

      NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
      // Safe to use bracket to assign given that we've checked the range above.
      result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
    }
  }

  if (stats != nullptr) {
    stats->set_num_original_documents(document_cnt);
    stats->set_num_deleted_documents(num_deleted_documents);
    stats->set_num_expired_documents(num_expired_documents);
    stats->set_num_original_namespaces(namespace_cnt);
    stats->set_num_deleted_namespaces(
        namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
  }
  // Fully persist the new store before handing it back to the caller.
  ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
  return result;
}
1986
// Estimates how much disk space an optimization pass could reclaim.
//
// Counts optimizable (non-alive: deleted or expired) vs. total documents,
// sums the file sizes of the store's members, and prorates that total by
// the optimizable/total ratio to produce estimated_optimizable_bytes.
libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
DocumentStore::GetOptimizeInfo() const {
  OptimizeInfo optimize_info;

  // Figure out our ratio of optimizable/total docs.
  int32_t num_documents = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
       ++document_id) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      ++optimize_info.optimizable_docs;
    }

    ++optimize_info.total_docs;
  }

  if (optimize_info.total_docs == 0) {
    // Can exit early since there's nothing to calculate.
    return optimize_info;
  }

  // Get the total element size.
  //
  // We use file size instead of disk usage here because the files are not
  // sparse, so it's more accurate. Disk usage rounds up to the nearest block
  // size.
  ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
                         document_log_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
                         document_id_mapper_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
                         score_cache_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
                         filter_cache_->GetElementsFileSize());
  ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
                         corpus_score_cache_->GetElementsFileSize());

  // Usage store might be sparse, but we'll still use file size for more
  // accurate counting.
  ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
                         usage_store_->GetElementsFileSize());

  // We use a combined disk usage and file size for the DynamicTrieKeyMapper
  // because it's backed by a trie, which has some sparse property bitmaps.
  ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
                         document_key_mapper_->GetElementsSize());

  // We don't include the namespace_mapper or the corpus_mapper because it's
  // not clear if we could recover any space even if Optimize were called.
  // Deleting 100s of documents could still leave a few documents of a
  // namespace, and then there would be no change.

  int64_t total_size = document_log_file_size + document_key_mapper_size +
                       document_id_mapper_file_size + score_cache_file_size +
                       filter_cache_file_size + corpus_score_cache_file_size +
                       usage_store_file_size;

  // Linear proration: assumes reclaimable bytes scale with the fraction of
  // optimizable documents.
  optimize_info.estimated_optimizable_bytes =
      total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
  return optimize_info;
}
2048
// Thin wrapper: overwrites the cached CorpusAssociatedScoreData entry for
// `corpus_id`, forwarding the underlying Set() status.
libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
    CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
  return corpus_score_cache_->Set(corpus_id, score_data);
}
2053
// Thin wrapper: overwrites the cached DocumentAssociatedScoreData entry for
// `document_id`, forwarding the underlying Set() status.
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
    DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
  return score_cache_->Set(document_id, score_data);
}
2058
// Thin wrapper: overwrites the cached DocumentFilterData entry for
// `document_id`, forwarding the underlying Set() status.
libtextclassifier3::Status DocumentStore::UpdateFilterCache(
    DocumentId document_id, const DocumentFilterData& filter_data) {
  return filter_cache_->Set(document_id, filter_data);
}
2063
ClearDerivedData(DocumentId document_id)2064 libtextclassifier3::Status DocumentStore::ClearDerivedData(
2065 DocumentId document_id) {
2066 // We intentionally leave the data in key_mapper_ because locating that data
2067 // requires fetching namespace and uri. Leaving data in key_mapper_ should
2068 // be fine because the data is hashed.
2069
2070 ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
2071
2072 // Resets the score cache entry
2073 ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
2074 document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
2075 /*document_score=*/-1,
2076 /*creation_timestamp_ms=*/-1,
2077 /*length_in_tokens=*/0)));
2078
2079 // Resets the filter cache entry
2080 ICING_RETURN_IF_ERROR(UpdateFilterCache(
2081 document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
2082 /*expiration_timestamp_ms=*/-1)));
2083
2084 // Clears the usage scores.
2085 return usage_store_->DeleteUsageScores(document_id);
2086 }
2087
// Thin wrapper: overwrites the usage scores stored for `document_id`,
// forwarding the underlying SetUsageScores() status.
libtextclassifier3::Status DocumentStore::SetUsageScores(
    DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
  return usage_store_->SetUsageScores(document_id, usage_scores);
}
2092
// Aggregates per-corpus debug statistics (document count and total token
// count) over all alive documents. Returns an empty list when no schema is
// set. The namespace and schema recorded for each corpus come from the
// first alive document seen for that corpus.
libtextclassifier3::StatusOr<
    google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
DocumentStore::CollectCorpusInfo() const {
  google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
  libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
      schema_store_->GetSchema();
  if (!schema_proto_or.ok()) {
    return corpus_info;
  }
  // Maps from CorpusId to the corresponding protocol buffer in the result.
  std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
       ++document_id) {
    // Only alive documents contribute to the statistics.
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      continue;
    }
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));
    ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
                           score_cache_->Get(document_id));
    // NOTE(review): operator[] default-inserts an empty string for a
    // namespace id missing from the map — presumably alive documents always
    // have a mapped namespace; confirm.
    const std::string& name_space =
        namespace_id_to_namespace[filter_data->namespace_id()];
    const std::string& schema =
        schema_proto->types()[filter_data->schema_type_id()].schema_type();
    // First document seen for a corpus creates (and labels) its entry.
    auto iter = info_map.find(score_data->corpus_id());
    if (iter == info_map.end()) {
      DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
      entry->set_namespace_(name_space);
      entry->set_schema(schema);
      iter = info_map.insert({score_data->corpus_id(), entry}).first;
    }
    iter->second->set_total_documents(iter->second->total_documents() + 1);
    iter->second->set_total_token(iter->second->total_token() +
                                  score_data->length_in_tokens());
  }
  return corpus_info;
}
2134
2135 libtextclassifier3::StatusOr<DocumentDebugInfoProto>
GetDebugInfo(int verbosity) const2136 DocumentStore::GetDebugInfo(int verbosity) const {
2137 DocumentDebugInfoProto debug_info;
2138 *debug_info.mutable_document_storage_info() = GetStorageInfo();
2139 ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
2140 debug_info.set_crc(crc.Get());
2141 if (verbosity > 0) {
2142 ICING_ASSIGN_OR_RETURN(
2143 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
2144 corpus_info,
2145 CollectCorpusInfo());
2146 *debug_info.mutable_corpus_info() = std::move(corpus_info);
2147 }
2148 return debug_info;
2149 }
2150
2151 } // namespace lib
2152 } // namespace icing
2153