• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/file/persistent-hash-map.h"
16 
17 #include <cstdint>
18 #include <cstring>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/str_cat.h"
28 #include "icing/file/file-backed-vector.h"
29 #include "icing/file/memory-mapped-file.h"
30 #include "icing/file/persistent-storage.h"
31 #include "icing/util/crc32.h"
32 #include "icing/util/status-macros.h"
33 
34 namespace icing {
35 namespace lib {
36 
37 namespace {
38 
39 // Helper function to check if there is no termination character '\0' in the
40 // key.
ValidateKey(std::string_view key)41 libtextclassifier3::Status ValidateKey(std::string_view key) {
42   if (key.find('\0') != std::string_view::npos) {  // NOLINT
43     return absl_ports::InvalidArgumentError(
44         "Key cannot contain termination character '\\0'");
45   }
46   return libtextclassifier3::Status::OK;
47 }
48 
49 // Helper function to convert the key to bucket index by hash.
50 //
51 // Returns:
52 //   int32_t: A valid bucket index with range [0, num_buckets - 1].
53 //   INTERNAL_ERROR if num_buckets == 0
HashKeyToBucketIndex(std::string_view key,int32_t num_buckets)54 libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
55     std::string_view key, int32_t num_buckets) {
56   if (num_buckets == 0) {
57     return absl_ports::InternalError("Should not have empty bucket");
58   }
59   return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
60 }
61 
62 // The following 4 methods are helper functions to get the correct path of
63 // metadata/bucket/entry/key-value storages, according to the given working
64 // directory path.
GetMetadataFilePath(std::string_view working_path)65 std::string GetMetadataFilePath(std::string_view working_path) {
66   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
67                             ".m");
68 }
69 
GetBucketStorageFilePath(std::string_view working_path)70 std::string GetBucketStorageFilePath(std::string_view working_path) {
71   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
72                             ".b");
73 }
74 
GetEntryStorageFilePath(std::string_view working_path)75 std::string GetEntryStorageFilePath(std::string_view working_path) {
76   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
77                             ".e");
78 }
79 
GetKeyValueStorageFilePath(std::string_view working_path)80 std::string GetKeyValueStorageFilePath(std::string_view working_path) {
81   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
82                             ".k");
83 }
84 
85 // Calculates how many buckets we need given num_entries and
86 // max_load_factor_percent. Round it up to 2's power.
87 //
88 // REQUIRES: 0 < num_entries <= Entry::kMaxNumEntries &&
89 //           max_load_factor_percent > 0
CalculateNumBucketsRequired(int32_t num_entries,int32_t max_load_factor_percent)90 int32_t CalculateNumBucketsRequired(int32_t num_entries,
91                                     int32_t max_load_factor_percent) {
92   // Calculate ceil(num_entries * 100 / max_load_factor_percent)
93   int32_t num_entries_100 = num_entries * 100;
94   int32_t num_buckets_required =
95       num_entries_100 / max_load_factor_percent +
96       (num_entries_100 % max_load_factor_percent == 0 ? 0 : 1);
97   if ((num_buckets_required & (num_buckets_required - 1)) != 0) {
98     // not 2's power
99     return 1 << (32 - __builtin_clz(num_buckets_required));
100   }
101   return num_buckets_required;
102 }
103 
104 }  // namespace
105 
IsValid() const106 bool PersistentHashMap::Options::IsValid() const {
107   if (!(value_type_size > 0 && value_type_size <= kMaxValueTypeSize &&
108         max_num_entries > 0 && max_num_entries <= Entry::kMaxNumEntries &&
109         max_load_factor_percent > 0 && average_kv_byte_size > 0 &&
110         init_num_buckets > 0 && init_num_buckets <= Bucket::kMaxNumBuckets)) {
111     return false;
112   }
113 
114   // We've ensured (static_assert) that storing kMaxNumBuckets buckets won't
115   // exceed FileBackedVector::kMaxFileSize, so only need to verify # of buckets
116   // required won't exceed kMaxNumBuckets.
117   if (CalculateNumBucketsRequired(max_num_entries, max_load_factor_percent) >
118       Bucket::kMaxNumBuckets) {
119     return false;
120   }
121 
122   // Verify # of key value pairs can fit into kv_storage.
123   if (average_kv_byte_size > kMaxKVTotalByteSize / max_num_entries) {
124     return false;
125   }
126 
127   // Verify init_num_buckets is 2's power. Requiring init_num_buckets to be 2^n
128   // guarantees that num_buckets will eventually grow to be exactly
129   // max_num_buckets since CalculateNumBucketsRequired rounds it up to 2^n.
130   if ((init_num_buckets & (init_num_buckets - 1)) != 0) {
131     return false;
132   }
133 
134   return true;
135 }
136 
137 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
Create(const Filesystem & filesystem,std::string working_path,Options options)138 PersistentHashMap::Create(const Filesystem& filesystem,
139                           std::string working_path, Options options) {
140   if (!options.IsValid()) {
141     return absl_ports::InvalidArgumentError(
142         "Invalid PersistentHashMap options");
143   }
144 
145   if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
146       !filesystem.FileExists(GetBucketStorageFilePath(working_path).c_str()) ||
147       !filesystem.FileExists(GetEntryStorageFilePath(working_path).c_str()) ||
148       !filesystem.FileExists(
149           GetKeyValueStorageFilePath(working_path).c_str())) {
150     // Discard working_path if any of them is missing, and reinitialize.
151     if (filesystem.DirectoryExists(working_path.c_str())) {
152       ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
153     }
154     return InitializeNewFiles(filesystem, std::move(working_path),
155                               std::move(options));
156   }
157   return InitializeExistingFiles(filesystem, std::move(working_path),
158                                  std::move(options));
159 }
160 
~PersistentHashMap()161 PersistentHashMap::~PersistentHashMap() {
162   if (!PersistToDisk().ok()) {
163     ICING_LOG(WARNING)
164         << "Failed to persist hash map to disk while destructing "
165         << working_path_;
166   }
167 }
168 
Put(std::string_view key,const void * value)169 libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
170                                                   const void* value) {
171   SetDirty();
172 
173   ICING_RETURN_IF_ERROR(ValidateKey(key));
174   ICING_ASSIGN_OR_RETURN(
175       int32_t bucket_idx,
176       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
177 
178   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
179                          FindEntryIndexByKey(bucket_idx, key));
180   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
181     // If not found, then insert new key value pair.
182     return Insert(bucket_idx, key, value);
183   }
184 
185   // Otherwise, overwrite the value.
186   ICING_ASSIGN_OR_RETURN(const Entry* entry,
187                          entry_storage_->Get(idx_pair.target_entry_index));
188 
189   int32_t kv_len = key.length() + 1 + info().value_type_size;
190   int32_t value_offset = key.length() + 1;
191   ICING_ASSIGN_OR_RETURN(
192       typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
193       kv_storage_->GetMutable(entry->key_value_index(), kv_len));
194   // It is the same key and value_size is fixed, so we can directly overwrite
195   // serialized value.
196   mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
197                           info().value_type_size);
198 
199   return libtextclassifier3::Status::OK;
200 }
201 
GetOrPut(std::string_view key,void * next_value)202 libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
203                                                        void* next_value) {
204   ICING_RETURN_IF_ERROR(ValidateKey(key));
205   ICING_ASSIGN_OR_RETURN(
206       int32_t bucket_idx,
207       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
208 
209   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
210                          FindEntryIndexByKey(bucket_idx, key));
211   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
212     // If not found, then insert new key value pair.
213     SetDirty();
214     return Insert(bucket_idx, key, next_value);
215   }
216 
217   // Otherwise, copy the hash map value into next_value.
218   return CopyEntryValue(idx_pair.target_entry_index, next_value);
219 }
220 
Get(std::string_view key,void * value) const221 libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
222                                                   void* value) const {
223   ICING_RETURN_IF_ERROR(ValidateKey(key));
224   ICING_ASSIGN_OR_RETURN(
225       int32_t bucket_idx,
226       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
227 
228   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
229                          FindEntryIndexByKey(bucket_idx, key));
230   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
231     return absl_ports::NotFoundError(absl_ports::StrCat(
232         "Key not found in PersistentHashMap ", working_path_));
233   }
234 
235   return CopyEntryValue(idx_pair.target_entry_index, value);
236 }
237 
Delete(std::string_view key)238 libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) {
239   SetDirty();
240 
241   ICING_RETURN_IF_ERROR(ValidateKey(key));
242   ICING_ASSIGN_OR_RETURN(
243       int32_t bucket_idx,
244       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
245 
246   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
247                          FindEntryIndexByKey(bucket_idx, key));
248   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
249     return absl_ports::NotFoundError(absl_ports::StrCat(
250         "Key not found in PersistentHashMap ", working_path_));
251   }
252 
253   ICING_ASSIGN_OR_RETURN(
254       typename FileBackedVector<Entry>::MutableView mutable_target_entry,
255       entry_storage_->GetMutable(idx_pair.target_entry_index));
256   if (idx_pair.prev_entry_index == Entry::kInvalidIndex) {
257     // If prev_entry_idx is Entry::kInvalidIndex, then target_entry must be the
258     // head element of the entry linked list, and we have to update
259     // bucket->head_entry_index_.
260     //
261     // Before: target_entry (head) -> next_entry -> ...
262     // After: next_entry (head) -> ...
263     ICING_ASSIGN_OR_RETURN(
264         typename FileBackedVector<Bucket>::MutableView mutable_bucket,
265         bucket_storage_->GetMutable(bucket_idx));
266     if (mutable_bucket.Get().head_entry_index() !=
267         idx_pair.target_entry_index) {
268       return absl_ports::InternalError(
269           "Bucket head entry index is inconsistent with the actual entry linked"
270           "list head. This shouldn't happen");
271     }
272     mutable_bucket.Get().set_head_entry_index(
273         mutable_target_entry.Get().next_entry_index());
274   } else {
275     // Otherwise, connect prev_entry and next_entry, to remove target_entry from
276     // the entry linked list.
277     //
278     // Before: ... -> prev_entry -> target_entry -> next_entry -> ...
279     // After: ... -> prev_entry -> next_entry -> ...
280     ICING_ASSIGN_OR_RETURN(
281         typename FileBackedVector<Entry>::MutableView mutable_prev_entry,
282         entry_storage_->GetMutable(idx_pair.prev_entry_index));
283     mutable_prev_entry.Get().set_next_entry_index(
284         mutable_target_entry.Get().next_entry_index());
285   }
286 
287   // Zero out the key value bytes. It is necessary for iterator to iterate
288   // through kv_storage and handle deleted keys properly.
289   int32_t kv_len = key.length() + 1 + info().value_type_size;
290   ICING_RETURN_IF_ERROR(kv_storage_->Set(
291       mutable_target_entry.Get().key_value_index(), kv_len, '\0'));
292 
293   // Invalidate target_entry
294   mutable_target_entry.Get().set_key_value_index(kInvalidKVIndex);
295   mutable_target_entry.Get().set_next_entry_index(Entry::kInvalidIndex);
296 
297   ++(info().num_deleted_entries);
298 
299   return libtextclassifier3::Status::OK;
300 }
301 
GetDiskUsage() const302 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
303   ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
304                          bucket_storage_->GetDiskUsage());
305   ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
306                          entry_storage_->GetDiskUsage());
307   ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
308                          kv_storage_->GetDiskUsage());
309 
310   int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
311                   kv_storage_disk_usage;
312   Filesystem::IncrementByOrSetInvalid(
313       filesystem_.GetDiskUsage(GetMetadataFilePath(working_path_).c_str()),
314       &total);
315 
316   if (total < 0 || total == Filesystem::kBadFileSize) {
317     return absl_ports::InternalError(
318         "Failed to get disk usage of PersistentHashMap");
319   }
320   return total;
321 }
322 
GetElementsSize() const323 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
324     const {
325   ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
326                          bucket_storage_->GetElementsFileSize());
327   ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
328                          entry_storage_->GetElementsFileSize());
329   ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
330                          kv_storage_->GetElementsFileSize());
331   return bucket_storage_elements_size + entry_storage_elements_size +
332          kv_storage_elements_size;
333 }
334 
335 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeNewFiles(const Filesystem & filesystem,std::string && working_path,Options && options)336 PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
337                                       std::string&& working_path,
338                                       Options&& options) {
339   // PersistentHashMap uses working_path as working directory path.
340   // Create working directory.
341   if (!filesystem.CreateDirectory(working_path.c_str())) {
342     return absl_ports::InternalError(
343         absl_ports::StrCat("Failed to create directory: ", working_path));
344   }
345 
346   int32_t max_num_buckets_required =
347       std::max(options.init_num_buckets,
348                CalculateNumBucketsRequired(options.max_num_entries,
349                                            options.max_load_factor_percent));
350 
351   // Initialize bucket_storage
352   int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
353   int32_t max_file_size =
354       pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
355   ICING_ASSIGN_OR_RETURN(
356       std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
357       FileBackedVector<Bucket>::Create(
358           filesystem, GetBucketStorageFilePath(working_path),
359           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
360           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
361 
362   // Initialize entry_storage
363   pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
364   max_file_size =
365       pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
366   ICING_ASSIGN_OR_RETURN(
367       std::unique_ptr<FileBackedVector<Entry>> entry_storage,
368       FileBackedVector<Entry>::Create(
369           filesystem, GetEntryStorageFilePath(working_path),
370           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
371           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
372 
373   // Initialize kv_storage
374   pre_mapping_mmap_size =
375       options.average_kv_byte_size * options.max_num_entries;
376   max_file_size =
377       pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
378   ICING_ASSIGN_OR_RETURN(
379       std::unique_ptr<FileBackedVector<char>> kv_storage,
380       FileBackedVector<char>::Create(
381           filesystem, GetKeyValueStorageFilePath(working_path),
382           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
383           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
384 
385   // Initialize buckets.
386   ICING_RETURN_IF_ERROR(bucket_storage->Set(
387       /*idx=*/0, /*len=*/options.init_num_buckets, Bucket()));
388   ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
389 
390   // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
391   // call GrowAndRemapIfNecessary to grow the underlying file.
392   ICING_ASSIGN_OR_RETURN(
393       MemoryMappedFile metadata_mmapped_file,
394       MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
395                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
396                                /*max_file_size=*/kMetadataFileSize,
397                                /*pre_mapping_file_offset=*/0,
398                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
399   ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
400       /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
401 
402   // Create instance.
403   auto new_persistent_hash_map =
404       std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
405           filesystem, std::move(working_path), std::move(options),
406           std::move(metadata_mmapped_file), std::move(bucket_storage),
407           std::move(entry_storage), std::move(kv_storage)));
408   // Initialize info content by writing mapped memory directly.
409   Info& info_ref = new_persistent_hash_map->info();
410   info_ref.magic = Info::kMagic;
411   info_ref.value_type_size = new_persistent_hash_map->options_.value_type_size;
412   info_ref.max_load_factor_percent =
413       new_persistent_hash_map->options_.max_load_factor_percent;
414   info_ref.num_deleted_entries = 0;
415   info_ref.num_deleted_key_value_bytes = 0;
416   // Initialize new PersistentStorage. The initial checksums will be computed
417   // and set via InitializeNewStorage.
418   ICING_RETURN_IF_ERROR(new_persistent_hash_map->InitializeNewStorage());
419 
420   return new_persistent_hash_map;
421 }
422 
423 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeExistingFiles(const Filesystem & filesystem,std::string && working_path,Options && options)424 PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
425                                            std::string&& working_path,
426                                            Options&& options) {
427   // Initialize metadata file
428   ICING_ASSIGN_OR_RETURN(
429       MemoryMappedFile metadata_mmapped_file,
430       MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
431                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
432                                /*max_file_size=*/kMetadataFileSize,
433                                /*pre_mapping_file_offset=*/0,
434                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
435   if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
436     return absl_ports::FailedPreconditionError("Incorrect metadata file size");
437   }
438 
439   int32_t max_num_buckets_required = CalculateNumBucketsRequired(
440       options.max_num_entries, options.max_load_factor_percent);
441 
442   // Initialize bucket_storage
443   int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
444   int32_t max_file_size =
445       pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
446   ICING_ASSIGN_OR_RETURN(
447       std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
448       FileBackedVector<Bucket>::Create(
449           filesystem, GetBucketStorageFilePath(working_path),
450           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
451           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
452 
453   // Initialize entry_storage
454   pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
455   max_file_size =
456       pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
457   ICING_ASSIGN_OR_RETURN(
458       std::unique_ptr<FileBackedVector<Entry>> entry_storage,
459       FileBackedVector<Entry>::Create(
460           filesystem, GetEntryStorageFilePath(working_path),
461           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
462           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
463 
464   // Initialize kv_storage
465   pre_mapping_mmap_size =
466       options.average_kv_byte_size * options.max_num_entries;
467   max_file_size =
468       pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
469   ICING_ASSIGN_OR_RETURN(
470       std::unique_ptr<FileBackedVector<char>> kv_storage,
471       FileBackedVector<char>::Create(
472           filesystem, GetKeyValueStorageFilePath(working_path),
473           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
474           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
475 
476   // Create instance.
477   auto persistent_hash_map =
478       std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
479           filesystem, std::move(working_path), std::move(options),
480           std::move(metadata_mmapped_file), std::move(bucket_storage),
481           std::move(entry_storage), std::move(kv_storage)));
482   // Initialize existing PersistentStorage. Checksums will be validated.
483   ICING_RETURN_IF_ERROR(persistent_hash_map->InitializeExistingStorage());
484 
485   // Validate other values of info and options.
486   // Current # of entries should not exceed options_.max_num_entries
487   // We compute max_file_size of 3 storages by options_.max_num_entries. Since
488   // we won't recycle space of deleted entries (and key-value bytes), they're
489   // still occupying space in storages. Even if # of "active" entries doesn't
490   // exceed options_.max_num_entries, the new kvp to be inserted still
491   // potentially exceeds max_file_size.
492   // Therefore, we should use entry_storage_->num_elements() instead of # of
493   // "active" entries
494   // (i.e. entry_storage_->num_elements() - info_ptr->num_deleted_entries) to
495   // check. This feature avoids storages being grown extremely large when there
496   // are many Delete() and Put() operations.
497   if (persistent_hash_map->entry_storage_->num_elements() >
498       persistent_hash_map->options_.max_num_entries) {
499     return absl_ports::FailedPreconditionError(
500         "Current # of entries exceeds max num entries");
501   }
502 
503   // Magic should be the same.
504   if (persistent_hash_map->info().magic != Info::kMagic) {
505     return absl_ports::FailedPreconditionError(
506         "PersistentHashMap header magic mismatch");
507   }
508 
509   // Value type size should be consistent.
510   if (persistent_hash_map->options_.value_type_size !=
511       persistent_hash_map->info().value_type_size) {
512     return absl_ports::FailedPreconditionError("Incorrect value type size");
513   }
514 
515   // Allow max_load_factor_percent_ change.
516   if (persistent_hash_map->options_.max_load_factor_percent !=
517       persistent_hash_map->info().max_load_factor_percent) {
518     ICING_VLOG(2) << "Changing max_load_factor_percent from "
519                   << persistent_hash_map->info().max_load_factor_percent
520                   << " to "
521                   << persistent_hash_map->options_.max_load_factor_percent;
522 
523     persistent_hash_map->SetInfoDirty();
524     persistent_hash_map->info().max_load_factor_percent =
525         persistent_hash_map->options_.max_load_factor_percent;
526     ICING_RETURN_IF_ERROR(
527         persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false));
528 
529     ICING_RETURN_IF_ERROR(persistent_hash_map->PersistToDisk());
530   }
531 
532   return persistent_hash_map;
533 }
534 
PersistStoragesToDisk(bool force)535 libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk(
536     bool force) {
537   if (!force && !is_storage_dirty()) {
538     return libtextclassifier3::Status::OK;
539   }
540 
541   ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
542   ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
543   ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
544   is_storage_dirty_ = false;
545   return libtextclassifier3::Status::OK;
546 }
547 
PersistMetadataToDisk(bool force)548 libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk(
549     bool force) {
550   // We can skip persisting metadata to disk only if both info and storage are
551   // clean.
552   if (!force && !is_info_dirty() && !is_storage_dirty()) {
553     return libtextclassifier3::Status::OK;
554   }
555 
556   // Changes should have been applied to the underlying file when using
557   // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
558   // extra safety step to ensure they are written out.
559   ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
560   is_info_dirty_ = false;
561   return libtextclassifier3::Status::OK;
562 }
563 
ComputeInfoChecksum(bool force)564 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum(
565     bool force) {
566   if (!force && !is_info_dirty()) {
567     return Crc32(crcs().component_crcs.info_crc);
568   }
569 
570   return info().ComputeChecksum();
571 }
572 
ComputeStoragesChecksum(bool force)573 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeStoragesChecksum(
574     bool force) {
575   if (!force && !is_storage_dirty()) {
576     return Crc32(crcs().component_crcs.storages_crc);
577   }
578 
579   // Compute crcs
580   ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
581                          bucket_storage_->ComputeChecksum());
582   ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
583                          entry_storage_->ComputeChecksum());
584   ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage_->ComputeChecksum());
585 
586   return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
587                kv_storage_crc.Get());
588 }
589 
590 libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair>
FindEntryIndexByKey(int32_t bucket_idx,std::string_view key) const591 PersistentHashMap::FindEntryIndexByKey(int32_t bucket_idx,
592                                        std::string_view key) const {
593   // Iterate all entries in the bucket, compare with key, and return the entry
594   // index if exists.
595   ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
596                          bucket_storage_->Get(bucket_idx));
597 
598   int32_t prev_entry_idx = Entry::kInvalidIndex;
599   int32_t curr_entry_idx = bucket->head_entry_index();
600   while (curr_entry_idx != Entry::kInvalidIndex) {
601     ICING_ASSIGN_OR_RETURN(const Entry* entry,
602                            entry_storage_->Get(curr_entry_idx));
603     if (entry->key_value_index() == kInvalidKVIndex) {
604       ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
605                           "hash map bucket. This shouldn't happen";
606       return absl_ports::InternalError("Unexpected invalid key value index");
607     }
608     ICING_ASSIGN_OR_RETURN(const char* kv_arr,
609                            kv_storage_->Get(entry->key_value_index()));
610     if (key.compare(kv_arr) == 0) {
611       return EntryIndexPair(curr_entry_idx, prev_entry_idx);
612     }
613 
614     prev_entry_idx = curr_entry_idx;
615     curr_entry_idx = entry->next_entry_index();
616   }
617 
618   return EntryIndexPair(curr_entry_idx, prev_entry_idx);
619 }
620 
CopyEntryValue(int32_t entry_idx,void * value) const621 libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
622     int32_t entry_idx, void* value) const {
623   ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
624 
625   ICING_ASSIGN_OR_RETURN(const char* kv_arr,
626                          kv_storage_->Get(entry->key_value_index()));
627   int32_t value_offset = strlen(kv_arr) + 1;
628   memcpy(value, kv_arr + value_offset, info().value_type_size);
629 
630   return libtextclassifier3::Status::OK;
631 }
632 
Insert(int32_t bucket_idx,std::string_view key,const void * value)633 libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
634                                                      std::string_view key,
635                                                      const void* value) {
636   SetDirty();
637 
638   // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries,
639   // then return error.
640   // We compute max_file_size of 3 storages by options_.max_num_entries. Since
641   // we won't recycle space of deleted entries (and key-value bytes), they're
642   // still occupying space in storages. Even if # of "active" entries (i.e.
643   // size()) doesn't exceed options_.max_num_entries, the new kvp to be inserted
644   // still potentially exceeds max_file_size.
645   // Therefore, we should use entry_storage_->num_elements() instead of size()
646   // to check. This feature avoids storages being grown extremely large when
647   // there are many Delete() and Put() operations.
648   if (entry_storage_->num_elements() > options_.max_num_entries - 1) {
649     return absl_ports::ResourceExhaustedError("Cannot insert new entry");
650   }
651 
652   ICING_ASSIGN_OR_RETURN(
653       typename FileBackedVector<Bucket>::MutableView mutable_bucket,
654       bucket_storage_->GetMutable(bucket_idx));
655 
656   // Append new key value.
657   int32_t new_kv_idx = kv_storage_->num_elements();
658   int32_t kv_len = key.size() + 1 + info().value_type_size;
659   int32_t value_offset = key.size() + 1;
660   ICING_ASSIGN_OR_RETURN(
661       typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
662       kv_storage_->Allocate(kv_len));
663   mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
664   mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
665   mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
666                               reinterpret_cast<const char*>(value),
667                               info().value_type_size);
668 
669   // Append new entry.
670   int32_t new_entry_idx = entry_storage_->num_elements();
671   ICING_RETURN_IF_ERROR(entry_storage_->Append(
672       Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
673   mutable_bucket.Get().set_head_entry_index(new_entry_idx);
674 
675   return RehashIfNecessary(/*force_rehash=*/false);
676 }
677 
RehashIfNecessary(bool force_rehash)678 libtextclassifier3::Status PersistentHashMap::RehashIfNecessary(
679     bool force_rehash) {
680   int32_t new_num_bucket = bucket_storage_->num_elements();
681   while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 &&
682          size() > static_cast<int64_t>(new_num_bucket) *
683                       info().max_load_factor_percent / 100) {
684     new_num_bucket *= 2;
685   }
686 
687   if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) {
688     return libtextclassifier3::Status::OK;
689   }
690 
691   SetDirty();
692 
693   // Resize and reset buckets.
694   ICING_RETURN_IF_ERROR(
695       bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex)));
696 
697   // Iterate all key value pairs in kv_storage, rehash and insert.
698   Iterator iter = GetIterator();
699   int32_t entry_idx = 0;
700   while (iter.Advance()) {
701     ICING_ASSIGN_OR_RETURN(int32_t bucket_idx,
702                            HashKeyToBucketIndex(iter.GetKey(), new_num_bucket));
703     ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
704                            bucket_storage_->GetMutable(bucket_idx));
705 
706     // Update entry and bucket.
707     ICING_RETURN_IF_ERROR(entry_storage_->Set(
708         entry_idx,
709         Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index())));
710     mutable_bucket.Get().set_head_entry_index(entry_idx);
711 
712     ++entry_idx;
713   }
714 
715   // Since there will be some deleted entries, after rehashing entry_storage_
716   // # of vector elements may be greater than the actual # of entries.
717   // Therefore, we have to truncate entry_storage_ to the correct size.
718   if (entry_idx < entry_storage_->num_elements()) {
719     ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx));
720   }
721 
722   info().num_deleted_entries = 0;
723 
724   return libtextclassifier3::Status::OK;
725 }
726 
Advance()727 bool PersistentHashMap::Iterator::Advance() {
728   // Jump over the current key value pair before advancing to the next valid
729   // key value pair. In the first round (after construction), curr_key_len_
730   // is 0, so don't jump over anything.
731   if (curr_key_len_ != 0) {
732     curr_kv_idx_ += curr_key_len_ + 1 + map_->info().value_type_size;
733     curr_key_len_ = 0;
734   }
735 
736   // By skipping null chars, we will be automatically handling deleted entries
737   // (which are zeroed out during deletion).
738   for (const char* curr_kv_ptr = map_->kv_storage_->array() + curr_kv_idx_;
739        curr_kv_idx_ < map_->kv_storage_->num_elements();
740        ++curr_kv_ptr, ++curr_kv_idx_) {
741     if (*curr_kv_ptr != '\0') {
742       curr_key_len_ = strlen(curr_kv_ptr);
743       return true;
744     }
745   }
746   return false;
747 }
748 
749 }  // namespace lib
750 }  // namespace icing
751