1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/file/persistent-hash-map.h"
16
17 #include <cstdint>
18 #include <cstring>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/str_cat.h"
28 #include "icing/file/file-backed-vector.h"
29 #include "icing/file/memory-mapped-file.h"
30 #include "icing/file/persistent-storage.h"
31 #include "icing/util/crc32.h"
32 #include "icing/util/status-macros.h"
33
34 namespace icing {
35 namespace lib {
36
37 namespace {
38
39 // Helper function to check if there is no termination character '\0' in the
40 // key.
ValidateKey(std::string_view key)41 libtextclassifier3::Status ValidateKey(std::string_view key) {
42 if (key.find('\0') != std::string_view::npos) { // NOLINT
43 return absl_ports::InvalidArgumentError(
44 "Key cannot contain termination character '\\0'");
45 }
46 return libtextclassifier3::Status::OK;
47 }
48
49 // Helper function to convert the key to bucket index by hash.
50 //
51 // Returns:
52 // int32_t: A valid bucket index with range [0, num_buckets - 1].
53 // INTERNAL_ERROR if num_buckets == 0
HashKeyToBucketIndex(std::string_view key,int32_t num_buckets)54 libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
55 std::string_view key, int32_t num_buckets) {
56 if (num_buckets == 0) {
57 return absl_ports::InternalError("Should not have empty bucket");
58 }
59 return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
60 }
61
62 // The following 4 methods are helper functions to get the correct path of
63 // metadata/bucket/entry/key-value storages, according to the given working
64 // directory path.
GetMetadataFilePath(std::string_view working_path)65 std::string GetMetadataFilePath(std::string_view working_path) {
66 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
67 ".m");
68 }
69
GetBucketStorageFilePath(std::string_view working_path)70 std::string GetBucketStorageFilePath(std::string_view working_path) {
71 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
72 ".b");
73 }
74
GetEntryStorageFilePath(std::string_view working_path)75 std::string GetEntryStorageFilePath(std::string_view working_path) {
76 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
77 ".e");
78 }
79
GetKeyValueStorageFilePath(std::string_view working_path)80 std::string GetKeyValueStorageFilePath(std::string_view working_path) {
81 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
82 ".k");
83 }
84
85 // Calculates how many buckets we need given num_entries and
86 // max_load_factor_percent. Round it up to 2's power.
87 //
88 // REQUIRES: 0 < num_entries <= Entry::kMaxNumEntries &&
89 // max_load_factor_percent > 0
CalculateNumBucketsRequired(int32_t num_entries,int32_t max_load_factor_percent)90 int32_t CalculateNumBucketsRequired(int32_t num_entries,
91 int32_t max_load_factor_percent) {
92 // Calculate ceil(num_entries * 100 / max_load_factor_percent)
93 int32_t num_entries_100 = num_entries * 100;
94 int32_t num_buckets_required =
95 num_entries_100 / max_load_factor_percent +
96 (num_entries_100 % max_load_factor_percent == 0 ? 0 : 1);
97 if ((num_buckets_required & (num_buckets_required - 1)) != 0) {
98 // not 2's power
99 return 1 << (32 - __builtin_clz(num_buckets_required));
100 }
101 return num_buckets_required;
102 }
103
104 } // namespace
105
IsValid() const106 bool PersistentHashMap::Options::IsValid() const {
107 if (!(value_type_size > 0 && value_type_size <= kMaxValueTypeSize &&
108 max_num_entries > 0 && max_num_entries <= Entry::kMaxNumEntries &&
109 max_load_factor_percent > 0 && average_kv_byte_size > 0 &&
110 init_num_buckets > 0 && init_num_buckets <= Bucket::kMaxNumBuckets)) {
111 return false;
112 }
113
114 // We've ensured (static_assert) that storing kMaxNumBuckets buckets won't
115 // exceed FileBackedVector::kMaxFileSize, so only need to verify # of buckets
116 // required won't exceed kMaxNumBuckets.
117 if (CalculateNumBucketsRequired(max_num_entries, max_load_factor_percent) >
118 Bucket::kMaxNumBuckets) {
119 return false;
120 }
121
122 // Verify # of key value pairs can fit into kv_storage.
123 if (average_kv_byte_size > kMaxKVTotalByteSize / max_num_entries) {
124 return false;
125 }
126
127 // Verify init_num_buckets is 2's power. Requiring init_num_buckets to be 2^n
128 // guarantees that num_buckets will eventually grow to be exactly
129 // max_num_buckets since CalculateNumBucketsRequired rounds it up to 2^n.
130 if ((init_num_buckets & (init_num_buckets - 1)) != 0) {
131 return false;
132 }
133
134 return true;
135 }
136
137 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
Create(const Filesystem & filesystem,std::string working_path,Options options)138 PersistentHashMap::Create(const Filesystem& filesystem,
139 std::string working_path, Options options) {
140 if (!options.IsValid()) {
141 return absl_ports::InvalidArgumentError(
142 "Invalid PersistentHashMap options");
143 }
144
145 if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
146 !filesystem.FileExists(GetBucketStorageFilePath(working_path).c_str()) ||
147 !filesystem.FileExists(GetEntryStorageFilePath(working_path).c_str()) ||
148 !filesystem.FileExists(
149 GetKeyValueStorageFilePath(working_path).c_str())) {
150 // Discard working_path if any of them is missing, and reinitialize.
151 if (filesystem.DirectoryExists(working_path.c_str())) {
152 ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
153 }
154 return InitializeNewFiles(filesystem, std::move(working_path),
155 std::move(options));
156 }
157 return InitializeExistingFiles(filesystem, std::move(working_path),
158 std::move(options));
159 }
160
~PersistentHashMap()161 PersistentHashMap::~PersistentHashMap() {
162 if (!PersistToDisk().ok()) {
163 ICING_LOG(WARNING)
164 << "Failed to persist hash map to disk while destructing "
165 << working_path_;
166 }
167 }
168
Put(std::string_view key,const void * value)169 libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
170 const void* value) {
171 SetDirty();
172
173 ICING_RETURN_IF_ERROR(ValidateKey(key));
174 ICING_ASSIGN_OR_RETURN(
175 int32_t bucket_idx,
176 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
177
178 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
179 FindEntryIndexByKey(bucket_idx, key));
180 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
181 // If not found, then insert new key value pair.
182 return Insert(bucket_idx, key, value);
183 }
184
185 // Otherwise, overwrite the value.
186 ICING_ASSIGN_OR_RETURN(const Entry* entry,
187 entry_storage_->Get(idx_pair.target_entry_index));
188
189 int32_t kv_len = key.length() + 1 + info().value_type_size;
190 int32_t value_offset = key.length() + 1;
191 ICING_ASSIGN_OR_RETURN(
192 typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
193 kv_storage_->GetMutable(entry->key_value_index(), kv_len));
194 // It is the same key and value_size is fixed, so we can directly overwrite
195 // serialized value.
196 mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
197 info().value_type_size);
198
199 return libtextclassifier3::Status::OK;
200 }
201
GetOrPut(std::string_view key,void * next_value)202 libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
203 void* next_value) {
204 ICING_RETURN_IF_ERROR(ValidateKey(key));
205 ICING_ASSIGN_OR_RETURN(
206 int32_t bucket_idx,
207 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
208
209 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
210 FindEntryIndexByKey(bucket_idx, key));
211 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
212 // If not found, then insert new key value pair.
213 SetDirty();
214 return Insert(bucket_idx, key, next_value);
215 }
216
217 // Otherwise, copy the hash map value into next_value.
218 return CopyEntryValue(idx_pair.target_entry_index, next_value);
219 }
220
Get(std::string_view key,void * value) const221 libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
222 void* value) const {
223 ICING_RETURN_IF_ERROR(ValidateKey(key));
224 ICING_ASSIGN_OR_RETURN(
225 int32_t bucket_idx,
226 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
227
228 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
229 FindEntryIndexByKey(bucket_idx, key));
230 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
231 return absl_ports::NotFoundError(absl_ports::StrCat(
232 "Key not found in PersistentHashMap ", working_path_));
233 }
234
235 return CopyEntryValue(idx_pair.target_entry_index, value);
236 }
237
Delete(std::string_view key)238 libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) {
239 SetDirty();
240
241 ICING_RETURN_IF_ERROR(ValidateKey(key));
242 ICING_ASSIGN_OR_RETURN(
243 int32_t bucket_idx,
244 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
245
246 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
247 FindEntryIndexByKey(bucket_idx, key));
248 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
249 return absl_ports::NotFoundError(absl_ports::StrCat(
250 "Key not found in PersistentHashMap ", working_path_));
251 }
252
253 ICING_ASSIGN_OR_RETURN(
254 typename FileBackedVector<Entry>::MutableView mutable_target_entry,
255 entry_storage_->GetMutable(idx_pair.target_entry_index));
256 if (idx_pair.prev_entry_index == Entry::kInvalidIndex) {
257 // If prev_entry_idx is Entry::kInvalidIndex, then target_entry must be the
258 // head element of the entry linked list, and we have to update
259 // bucket->head_entry_index_.
260 //
261 // Before: target_entry (head) -> next_entry -> ...
262 // After: next_entry (head) -> ...
263 ICING_ASSIGN_OR_RETURN(
264 typename FileBackedVector<Bucket>::MutableView mutable_bucket,
265 bucket_storage_->GetMutable(bucket_idx));
266 if (mutable_bucket.Get().head_entry_index() !=
267 idx_pair.target_entry_index) {
268 return absl_ports::InternalError(
269 "Bucket head entry index is inconsistent with the actual entry linked"
270 "list head. This shouldn't happen");
271 }
272 mutable_bucket.Get().set_head_entry_index(
273 mutable_target_entry.Get().next_entry_index());
274 } else {
275 // Otherwise, connect prev_entry and next_entry, to remove target_entry from
276 // the entry linked list.
277 //
278 // Before: ... -> prev_entry -> target_entry -> next_entry -> ...
279 // After: ... -> prev_entry -> next_entry -> ...
280 ICING_ASSIGN_OR_RETURN(
281 typename FileBackedVector<Entry>::MutableView mutable_prev_entry,
282 entry_storage_->GetMutable(idx_pair.prev_entry_index));
283 mutable_prev_entry.Get().set_next_entry_index(
284 mutable_target_entry.Get().next_entry_index());
285 }
286
287 // Zero out the key value bytes. It is necessary for iterator to iterate
288 // through kv_storage and handle deleted keys properly.
289 int32_t kv_len = key.length() + 1 + info().value_type_size;
290 ICING_RETURN_IF_ERROR(kv_storage_->Set(
291 mutable_target_entry.Get().key_value_index(), kv_len, '\0'));
292
293 // Invalidate target_entry
294 mutable_target_entry.Get().set_key_value_index(kInvalidKVIndex);
295 mutable_target_entry.Get().set_next_entry_index(Entry::kInvalidIndex);
296
297 ++(info().num_deleted_entries);
298
299 return libtextclassifier3::Status::OK;
300 }
301
GetDiskUsage() const302 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
303 ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
304 bucket_storage_->GetDiskUsage());
305 ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
306 entry_storage_->GetDiskUsage());
307 ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
308 kv_storage_->GetDiskUsage());
309
310 int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
311 kv_storage_disk_usage;
312 Filesystem::IncrementByOrSetInvalid(
313 filesystem_.GetDiskUsage(GetMetadataFilePath(working_path_).c_str()),
314 &total);
315
316 if (total < 0 || total == Filesystem::kBadFileSize) {
317 return absl_ports::InternalError(
318 "Failed to get disk usage of PersistentHashMap");
319 }
320 return total;
321 }
322
GetElementsSize() const323 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
324 const {
325 ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
326 bucket_storage_->GetElementsFileSize());
327 ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
328 entry_storage_->GetElementsFileSize());
329 ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
330 kv_storage_->GetElementsFileSize());
331 return bucket_storage_elements_size + entry_storage_elements_size +
332 kv_storage_elements_size;
333 }
334
335 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeNewFiles(const Filesystem & filesystem,std::string && working_path,Options && options)336 PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
337 std::string&& working_path,
338 Options&& options) {
339 // PersistentHashMap uses working_path as working directory path.
340 // Create working directory.
341 if (!filesystem.CreateDirectory(working_path.c_str())) {
342 return absl_ports::InternalError(
343 absl_ports::StrCat("Failed to create directory: ", working_path));
344 }
345
346 int32_t max_num_buckets_required =
347 std::max(options.init_num_buckets,
348 CalculateNumBucketsRequired(options.max_num_entries,
349 options.max_load_factor_percent));
350
351 // Initialize bucket_storage
352 int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
353 int32_t max_file_size =
354 pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
355 ICING_ASSIGN_OR_RETURN(
356 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
357 FileBackedVector<Bucket>::Create(
358 filesystem, GetBucketStorageFilePath(working_path),
359 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
360 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
361
362 // Initialize entry_storage
363 pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
364 max_file_size =
365 pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
366 ICING_ASSIGN_OR_RETURN(
367 std::unique_ptr<FileBackedVector<Entry>> entry_storage,
368 FileBackedVector<Entry>::Create(
369 filesystem, GetEntryStorageFilePath(working_path),
370 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
371 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
372
373 // Initialize kv_storage
374 pre_mapping_mmap_size =
375 options.average_kv_byte_size * options.max_num_entries;
376 max_file_size =
377 pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
378 ICING_ASSIGN_OR_RETURN(
379 std::unique_ptr<FileBackedVector<char>> kv_storage,
380 FileBackedVector<char>::Create(
381 filesystem, GetKeyValueStorageFilePath(working_path),
382 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
383 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
384
385 // Initialize buckets.
386 ICING_RETURN_IF_ERROR(bucket_storage->Set(
387 /*idx=*/0, /*len=*/options.init_num_buckets, Bucket()));
388 ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
389
390 // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
391 // call GrowAndRemapIfNecessary to grow the underlying file.
392 ICING_ASSIGN_OR_RETURN(
393 MemoryMappedFile metadata_mmapped_file,
394 MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
395 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
396 /*max_file_size=*/kMetadataFileSize,
397 /*pre_mapping_file_offset=*/0,
398 /*pre_mapping_mmap_size=*/kMetadataFileSize));
399 ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
400 /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
401
402 // Create instance.
403 auto new_persistent_hash_map =
404 std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
405 filesystem, std::move(working_path), std::move(options),
406 std::move(metadata_mmapped_file), std::move(bucket_storage),
407 std::move(entry_storage), std::move(kv_storage)));
408 // Initialize info content by writing mapped memory directly.
409 Info& info_ref = new_persistent_hash_map->info();
410 info_ref.magic = Info::kMagic;
411 info_ref.value_type_size = new_persistent_hash_map->options_.value_type_size;
412 info_ref.max_load_factor_percent =
413 new_persistent_hash_map->options_.max_load_factor_percent;
414 info_ref.num_deleted_entries = 0;
415 info_ref.num_deleted_key_value_bytes = 0;
416 // Initialize new PersistentStorage. The initial checksums will be computed
417 // and set via InitializeNewStorage.
418 ICING_RETURN_IF_ERROR(new_persistent_hash_map->InitializeNewStorage());
419
420 return new_persistent_hash_map;
421 }
422
423 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeExistingFiles(const Filesystem & filesystem,std::string && working_path,Options && options)424 PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
425 std::string&& working_path,
426 Options&& options) {
427 // Initialize metadata file
428 ICING_ASSIGN_OR_RETURN(
429 MemoryMappedFile metadata_mmapped_file,
430 MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
431 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
432 /*max_file_size=*/kMetadataFileSize,
433 /*pre_mapping_file_offset=*/0,
434 /*pre_mapping_mmap_size=*/kMetadataFileSize));
435 if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
436 return absl_ports::FailedPreconditionError("Incorrect metadata file size");
437 }
438
439 int32_t max_num_buckets_required = CalculateNumBucketsRequired(
440 options.max_num_entries, options.max_load_factor_percent);
441
442 // Initialize bucket_storage
443 int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
444 int32_t max_file_size =
445 pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
446 ICING_ASSIGN_OR_RETURN(
447 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
448 FileBackedVector<Bucket>::Create(
449 filesystem, GetBucketStorageFilePath(working_path),
450 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
451 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
452
453 // Initialize entry_storage
454 pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
455 max_file_size =
456 pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
457 ICING_ASSIGN_OR_RETURN(
458 std::unique_ptr<FileBackedVector<Entry>> entry_storage,
459 FileBackedVector<Entry>::Create(
460 filesystem, GetEntryStorageFilePath(working_path),
461 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
462 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
463
464 // Initialize kv_storage
465 pre_mapping_mmap_size =
466 options.average_kv_byte_size * options.max_num_entries;
467 max_file_size =
468 pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
469 ICING_ASSIGN_OR_RETURN(
470 std::unique_ptr<FileBackedVector<char>> kv_storage,
471 FileBackedVector<char>::Create(
472 filesystem, GetKeyValueStorageFilePath(working_path),
473 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
474 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
475
476 // Create instance.
477 auto persistent_hash_map =
478 std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
479 filesystem, std::move(working_path), std::move(options),
480 std::move(metadata_mmapped_file), std::move(bucket_storage),
481 std::move(entry_storage), std::move(kv_storage)));
482 // Initialize existing PersistentStorage. Checksums will be validated.
483 ICING_RETURN_IF_ERROR(persistent_hash_map->InitializeExistingStorage());
484
485 // Validate other values of info and options.
486 // Current # of entries should not exceed options_.max_num_entries
487 // We compute max_file_size of 3 storages by options_.max_num_entries. Since
488 // we won't recycle space of deleted entries (and key-value bytes), they're
489 // still occupying space in storages. Even if # of "active" entries doesn't
490 // exceed options_.max_num_entries, the new kvp to be inserted still
491 // potentially exceeds max_file_size.
492 // Therefore, we should use entry_storage_->num_elements() instead of # of
493 // "active" entries
494 // (i.e. entry_storage_->num_elements() - info_ptr->num_deleted_entries) to
495 // check. This feature avoids storages being grown extremely large when there
496 // are many Delete() and Put() operations.
497 if (persistent_hash_map->entry_storage_->num_elements() >
498 persistent_hash_map->options_.max_num_entries) {
499 return absl_ports::FailedPreconditionError(
500 "Current # of entries exceeds max num entries");
501 }
502
503 // Magic should be the same.
504 if (persistent_hash_map->info().magic != Info::kMagic) {
505 return absl_ports::FailedPreconditionError(
506 "PersistentHashMap header magic mismatch");
507 }
508
509 // Value type size should be consistent.
510 if (persistent_hash_map->options_.value_type_size !=
511 persistent_hash_map->info().value_type_size) {
512 return absl_ports::FailedPreconditionError("Incorrect value type size");
513 }
514
515 // Allow max_load_factor_percent_ change.
516 if (persistent_hash_map->options_.max_load_factor_percent !=
517 persistent_hash_map->info().max_load_factor_percent) {
518 ICING_VLOG(2) << "Changing max_load_factor_percent from "
519 << persistent_hash_map->info().max_load_factor_percent
520 << " to "
521 << persistent_hash_map->options_.max_load_factor_percent;
522
523 persistent_hash_map->SetInfoDirty();
524 persistent_hash_map->info().max_load_factor_percent =
525 persistent_hash_map->options_.max_load_factor_percent;
526 ICING_RETURN_IF_ERROR(
527 persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false));
528
529 ICING_RETURN_IF_ERROR(persistent_hash_map->PersistToDisk());
530 }
531
532 return persistent_hash_map;
533 }
534
PersistStoragesToDisk(bool force)535 libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk(
536 bool force) {
537 if (!force && !is_storage_dirty()) {
538 return libtextclassifier3::Status::OK;
539 }
540
541 ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
542 ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
543 ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
544 is_storage_dirty_ = false;
545 return libtextclassifier3::Status::OK;
546 }
547
PersistMetadataToDisk(bool force)548 libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk(
549 bool force) {
550 // We can skip persisting metadata to disk only if both info and storage are
551 // clean.
552 if (!force && !is_info_dirty() && !is_storage_dirty()) {
553 return libtextclassifier3::Status::OK;
554 }
555
556 // Changes should have been applied to the underlying file when using
557 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
558 // extra safety step to ensure they are written out.
559 ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
560 is_info_dirty_ = false;
561 return libtextclassifier3::Status::OK;
562 }
563
ComputeInfoChecksum(bool force)564 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum(
565 bool force) {
566 if (!force && !is_info_dirty()) {
567 return Crc32(crcs().component_crcs.info_crc);
568 }
569
570 return info().ComputeChecksum();
571 }
572
ComputeStoragesChecksum(bool force)573 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeStoragesChecksum(
574 bool force) {
575 if (!force && !is_storage_dirty()) {
576 return Crc32(crcs().component_crcs.storages_crc);
577 }
578
579 // Compute crcs
580 ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
581 bucket_storage_->ComputeChecksum());
582 ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
583 entry_storage_->ComputeChecksum());
584 ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage_->ComputeChecksum());
585
586 return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
587 kv_storage_crc.Get());
588 }
589
590 libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair>
FindEntryIndexByKey(int32_t bucket_idx,std::string_view key) const591 PersistentHashMap::FindEntryIndexByKey(int32_t bucket_idx,
592 std::string_view key) const {
593 // Iterate all entries in the bucket, compare with key, and return the entry
594 // index if exists.
595 ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
596 bucket_storage_->Get(bucket_idx));
597
598 int32_t prev_entry_idx = Entry::kInvalidIndex;
599 int32_t curr_entry_idx = bucket->head_entry_index();
600 while (curr_entry_idx != Entry::kInvalidIndex) {
601 ICING_ASSIGN_OR_RETURN(const Entry* entry,
602 entry_storage_->Get(curr_entry_idx));
603 if (entry->key_value_index() == kInvalidKVIndex) {
604 ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
605 "hash map bucket. This shouldn't happen";
606 return absl_ports::InternalError("Unexpected invalid key value index");
607 }
608 ICING_ASSIGN_OR_RETURN(const char* kv_arr,
609 kv_storage_->Get(entry->key_value_index()));
610 if (key.compare(kv_arr) == 0) {
611 return EntryIndexPair(curr_entry_idx, prev_entry_idx);
612 }
613
614 prev_entry_idx = curr_entry_idx;
615 curr_entry_idx = entry->next_entry_index();
616 }
617
618 return EntryIndexPair(curr_entry_idx, prev_entry_idx);
619 }
620
CopyEntryValue(int32_t entry_idx,void * value) const621 libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
622 int32_t entry_idx, void* value) const {
623 ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
624
625 ICING_ASSIGN_OR_RETURN(const char* kv_arr,
626 kv_storage_->Get(entry->key_value_index()));
627 int32_t value_offset = strlen(kv_arr) + 1;
628 memcpy(value, kv_arr + value_offset, info().value_type_size);
629
630 return libtextclassifier3::Status::OK;
631 }
632
Insert(int32_t bucket_idx,std::string_view key,const void * value)633 libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
634 std::string_view key,
635 const void* value) {
636 SetDirty();
637
638 // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries,
639 // then return error.
640 // We compute max_file_size of 3 storages by options_.max_num_entries. Since
641 // we won't recycle space of deleted entries (and key-value bytes), they're
642 // still occupying space in storages. Even if # of "active" entries (i.e.
643 // size()) doesn't exceed options_.max_num_entries, the new kvp to be inserted
644 // still potentially exceeds max_file_size.
645 // Therefore, we should use entry_storage_->num_elements() instead of size()
646 // to check. This feature avoids storages being grown extremely large when
647 // there are many Delete() and Put() operations.
648 if (entry_storage_->num_elements() > options_.max_num_entries - 1) {
649 return absl_ports::ResourceExhaustedError("Cannot insert new entry");
650 }
651
652 ICING_ASSIGN_OR_RETURN(
653 typename FileBackedVector<Bucket>::MutableView mutable_bucket,
654 bucket_storage_->GetMutable(bucket_idx));
655
656 // Append new key value.
657 int32_t new_kv_idx = kv_storage_->num_elements();
658 int32_t kv_len = key.size() + 1 + info().value_type_size;
659 int32_t value_offset = key.size() + 1;
660 ICING_ASSIGN_OR_RETURN(
661 typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
662 kv_storage_->Allocate(kv_len));
663 mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
664 mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
665 mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
666 reinterpret_cast<const char*>(value),
667 info().value_type_size);
668
669 // Append new entry.
670 int32_t new_entry_idx = entry_storage_->num_elements();
671 ICING_RETURN_IF_ERROR(entry_storage_->Append(
672 Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
673 mutable_bucket.Get().set_head_entry_index(new_entry_idx);
674
675 return RehashIfNecessary(/*force_rehash=*/false);
676 }
677
RehashIfNecessary(bool force_rehash)678 libtextclassifier3::Status PersistentHashMap::RehashIfNecessary(
679 bool force_rehash) {
680 int32_t new_num_bucket = bucket_storage_->num_elements();
681 while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 &&
682 size() > static_cast<int64_t>(new_num_bucket) *
683 info().max_load_factor_percent / 100) {
684 new_num_bucket *= 2;
685 }
686
687 if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) {
688 return libtextclassifier3::Status::OK;
689 }
690
691 SetDirty();
692
693 // Resize and reset buckets.
694 ICING_RETURN_IF_ERROR(
695 bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex)));
696
697 // Iterate all key value pairs in kv_storage, rehash and insert.
698 Iterator iter = GetIterator();
699 int32_t entry_idx = 0;
700 while (iter.Advance()) {
701 ICING_ASSIGN_OR_RETURN(int32_t bucket_idx,
702 HashKeyToBucketIndex(iter.GetKey(), new_num_bucket));
703 ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
704 bucket_storage_->GetMutable(bucket_idx));
705
706 // Update entry and bucket.
707 ICING_RETURN_IF_ERROR(entry_storage_->Set(
708 entry_idx,
709 Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index())));
710 mutable_bucket.Get().set_head_entry_index(entry_idx);
711
712 ++entry_idx;
713 }
714
715 // Since there will be some deleted entries, after rehashing entry_storage_
716 // # of vector elements may be greater than the actual # of entries.
717 // Therefore, we have to truncate entry_storage_ to the correct size.
718 if (entry_idx < entry_storage_->num_elements()) {
719 ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx));
720 }
721
722 info().num_deleted_entries = 0;
723
724 return libtextclassifier3::Status::OK;
725 }
726
Advance()727 bool PersistentHashMap::Iterator::Advance() {
728 // Jump over the current key value pair before advancing to the next valid
729 // key value pair. In the first round (after construction), curr_key_len_
730 // is 0, so don't jump over anything.
731 if (curr_key_len_ != 0) {
732 curr_kv_idx_ += curr_key_len_ + 1 + map_->info().value_type_size;
733 curr_key_len_ = 0;
734 }
735
736 // By skipping null chars, we will be automatically handling deleted entries
737 // (which are zeroed out during deletion).
738 for (const char* curr_kv_ptr = map_->kv_storage_->array() + curr_kv_idx_;
739 curr_kv_idx_ < map_->kv_storage_->num_elements();
740 ++curr_kv_ptr, ++curr_kv_idx_) {
741 if (*curr_kv_ptr != '\0') {
742 curr_key_len_ = strlen(curr_kv_ptr);
743 return true;
744 }
745 }
746 return false;
747 }
748
749 } // namespace lib
750 } // namespace icing
751