1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // A file-backed vector that can store fixed-width elements. It provides
16 // built-in support for checksums to verify data integrity and an in-memory
17 // cache for fast read/writes.
18 //
19 // If the file is corrupted/in an invalid state, all contents are lost, i.e.
20 // there is no clear recovery path other than recreating/repopulating the
21 // contents.
22 //
23 // Note on Performance:
24 // The class keeps the vector in a mmapped area. This allows users to specify
25 // which MemoryMappedFile::Strategy they wish to use with this class. The vector
26 // will implicitly grow when the user tries to access an element beyond its
27 // current size. Growing happens in 16KiB chunks, up to a maximum size of 1MiB.
28 //
29 // Note on Checksumming:
30 // Checksumming happens lazily. We do tail checksums to avoid recalculating the
31 // checksum of the entire file on each modfification. A full checksum will be
32 // computed/verified at creation time, when persisting to disk, or whenever the
33 // user manually calls ComputeChecksum(). A separate header checksum is kept for
34 // a quick integrity check.
35 //
36 //
37 // Usage:
38 // RETURN_OR_ASSIGN(auto vector, FileBackedVector<char>::Create(...));
39 //
40 // ICING_RETURN_IF_ERROR(vector->Set(0, 'a'));
41 // ICING_RETURN_IF_ERROR(vector->Set(1, 'b'));
42 // ICING_RETURN_IF_ERROR(vector->Set(2, 'c'));
43 //
44 // vector->num_elements(); // Returns 3
45 //
46 // vector->At(2); // Returns 'c'
47 //
48 // vector->TruncateTo(1);
49 // vector->num_elements(); // Returns 1
50 // vector->At(0); // Returns 'a'
51 //
52 // vector->ComputeChecksum(); // Force a checksum update and gets the checksum
53 //
54 // vector->PersistToDisk(); // Persist contents to disk.
55
56 #ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
57 #define ICING_FILE_FILE_BACKED_VECTOR_H_
58
59 #include <inttypes.h>
60 #include <stdint.h>
61 #include <sys/mman.h>
62
63 #include <cstdint>
64 #include <memory>
65 #include <string>
66 #include <utility>
67 #include <vector>
68
69 #include "icing/text_classifier/lib3/utils/base/status.h"
70 #include "icing/text_classifier/lib3/utils/base/statusor.h"
71 #include "icing/absl_ports/canonical_errors.h"
72 #include "icing/absl_ports/str_cat.h"
73 #include "icing/file/filesystem.h"
74 #include "icing/file/memory-mapped-file.h"
75 #include "icing/legacy/core/icing-string-util.h"
76 #include "icing/util/crc32.h"
77 #include "icing/util/logging.h"
78 #include "icing/util/math-util.h"
79 #include "icing/util/status-macros.h"
80
81 namespace icing {
82 namespace lib {
83
84 template <typename T>
85 class FileBackedVector {
86 public:
87 // Header stored at the beginning of the file before the rest of the vector
88 // elements. Stores metadata on the vector.
89 struct Header {
90 // Static assert constants.
91 static constexpr int32_t kHeaderSize = 24;
92 static constexpr int32_t kHeaderChecksumOffset = 16;
93
94 static constexpr int32_t kMagic = 0x8bbbe237;
95
96 // Holds the magic as quick sanity check against file corruption
97 int32_t magic;
98
99 // Byte size of each element in the vector
100 int32_t element_size;
101
102 // Number of elements currently in the vector
103 int32_t num_elements;
104
105 // Checksum of the vector elements, doesn't include the header fields.
106 //
107 // TODO(cassiewang): Add a checksum state that can track if the checksum is
108 // fresh or stale. This lets us short circuit checksum computations if we
109 // know the checksum is fresh.
110 uint32_t vector_checksum;
111
112 // Must be below all actual header content fields and above the padding
113 // field. Contains the crc checksum of the preceding fields.
114 uint32_t header_checksum;
115
116 // This field has no actual meaning here but is just used as padding for the
117 // struct so the size of the struct can be a multiple of 8. Doing this makes
118 // the address right after the header a multiple of 8 and prevents a ubsan
119 // misalign-pointer-use error (go/ubsan).
120 //
121 // NOTE: please remove this when adding new fields and re-assert that the
122 // size is multiple of 8.
123 int32_t padding_for_ptr_alignment;
124
CalculateHeaderChecksumHeader125 uint32_t CalculateHeaderChecksum() const {
126 // Sanity check that the memory layout matches the disk layout.
127 static_assert(std::is_standard_layout<FileBackedVector::Header>::value,
128 "");
129 static_assert(sizeof(FileBackedVector::Header) == kHeaderSize, "");
130 static_assert(
131 sizeof(FileBackedVector::Header) % sizeof(void*) == 0,
132 "Header has insufficient padding for void* pointer alignment");
133 static_assert(offsetof(FileBackedVector::Header, header_checksum) ==
134 kHeaderChecksumOffset,
135 "");
136
137 Crc32 crc;
138 std::string_view header_str(
139 reinterpret_cast<const char*>(this),
140 offsetof(FileBackedVector::Header, header_checksum));
141 crc.Append(header_str);
142 return crc.Get();
143 }
144 };
145
146 // Creates a new FileBackedVector to read/write content to.
147 //
148 // filesystem: Object to make system level calls
149 // file_path : Specifies the file to persist the vector to; must be a path
150 // within a directory that already exists.
151 // mmap_strategy : Strategy/optimizations to access the content in the vector,
152 // see MemoryMappedFile::Strategy for more details
153 //
154 // Return:
155 // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
156 // checksum.
157 // INTERNAL_ERROR on I/O errors.
158 // UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
159 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
160 Create(const Filesystem& filesystem, const std::string& file_path,
161 MemoryMappedFile::Strategy mmap_strategy);
162
163 // Deletes the FileBackedVector
164 //
165 // filesystem: Object to make system level calls
166 // file_path : Specifies the file the vector is persisted to.
167 static libtextclassifier3::Status Delete(const Filesystem& filesystem,
168 const std::string& file_path);
169
170 // Not copyable
171 FileBackedVector(const FileBackedVector&) = delete;
172 FileBackedVector& operator=(const FileBackedVector&) = delete;
173
174 // If the vector was created with
175 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
176 // synced by the system and the checksum will be updated.
177 ~FileBackedVector();
178
179 // Gets a copy of the element at idx.
180 //
181 // This is useful if you think the FileBackedVector may grow before you need
182 // to access this return value. When the FileBackedVector grows, the
183 // underlying mmap will be unmapped and remapped, which will invalidate any
184 // pointers to the previously mapped region. Getting a copy will avoid
185 // referencing the now-invalidated region.
186 //
187 // Returns:
188 // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
189 libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
190
191 // Gets a pointer to the element at idx.
192 //
193 // WARNING: Subsequent calls to Set may invalidate the pointer returned by
194 // Get.
195 //
196 // This is useful if you do not think the FileBackedVector will grow before
197 // you need to reference this value, and you want to avoid a copy. When the
198 // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
199 // which will invalidate this pointer to the previously mapped region.
200 //
201 // Returns:
202 // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
203 libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
204
205 // Writes the value at idx.
206 //
207 // May grow the underlying file and mmapped region as needed to fit the new
208 // value. If it does grow, then any pointers to previous values returned
209 // from Get() may be invalidated.
210 //
211 // Returns:
212 // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
213 libtextclassifier3::Status Set(int32_t idx, const T& value);
214
215 // Resizes to first len elements. The crc is cleared on truncation and will be
216 // updated on destruction, or once the client calls ComputeChecksum() or
217 // PersistToDisk().
218 //
219 // Returns:
220 // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
221 libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
222
223 // Flushes content to underlying file.
224 //
225 // Returns:
226 // OK on success
227 // INTERNAL on I/O error
228 libtextclassifier3::Status PersistToDisk();
229
230 // Calculates and returns the disk usage in bytes. Rounds up to the nearest
231 // block size.
232 //
233 // Returns:
234 // Disk usage on success
235 // INTERNAL_ERROR on IO error
236 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
237
238 // Returns the file size of the all the elements held in the vector. File size
239 // is in bytes. This excludes the size of any internal metadata of the vector,
240 // e.g. the vector's header.
241 //
242 // Returns:
243 // File size on success
244 // INTERNAL_ERROR on IO error
245 libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
246
247 // Accessors.
array()248 const T* array() const {
249 return reinterpret_cast<const T*>(mmapped_file_->region());
250 }
251
mutable_array()252 T* mutable_array() const {
253 return reinterpret_cast<T*>(mmapped_file_->mutable_region());
254 }
255
num_elements()256 int32_t num_elements() const { return header_->num_elements; }
257
258 // Updates checksum of the vector contents and returns it.
259 //
260 // Returns:
261 // INTERNAL_ERROR if the vector's internal state is inconsistent
262 libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
263
264 private:
265 // We track partial updates to the array for crc updating. This
266 // requires extra memory to keep track of original buffers but
267 // allows for much faster crc re-computation. This is the frac limit
268 // of byte len after which we will discard recorded changes and
269 // recompute the entire crc instead.
270 static constexpr int32_t kPartialCrcLimitDiv = 8; // limit is 1/8th
271
272 // Grow file by at least this many elements if array is growable.
273 static constexpr int64_t kGrowElements = 1u << 14; // 16K
274
275 // Max number of elements that can be held by the vector.
276 static constexpr int64_t kMaxNumElements = 1u << 20; // 1M
277
278 // Can only be created through the factory ::Create function
279 FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
280 std::unique_ptr<Header> header,
281 std::unique_ptr<MemoryMappedFile> mmapped_file);
282
283 // Initialize a new FileBackedVector, and create the file.
284 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
285 InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
286 ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
287
288 // Initialize a FileBackedVector from an existing file.
289 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
290 InitializeExistingFile(const Filesystem& filesystem,
291 const std::string& file_path, ScopedFd fd,
292 MemoryMappedFile::Strategy mmap_strategy);
293
294 // Grows the underlying file to hold at least num_elements
295 //
296 // Returns:
297 // OUT_OF_RANGE_ERROR if we can't grow to the specified size
298 libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
299
300 // Cached constructor params.
301 const Filesystem* const filesystem_;
302 const std::string file_path_;
303 std::unique_ptr<Header> header_;
304 std::unique_ptr<MemoryMappedFile> mmapped_file_;
305
306 // Offset before which all the elements have been included in the calculation
307 // of crc at the time it was calculated.
308 int32_t changes_end_ = 0;
309
310 // Offset of changes that have happened since the last crc update between [0,
311 // changes_end_).
312 std::vector<int32_t> changes_;
313
314 // Buffer of the original elements that have been changed since the last crc
315 // update. Will be cleared if the size grows too big.
316 std::string saved_original_buffer_;
317
318 // Keep track of all pages we touched so we can write them back to
319 // disk.
320 std::vector<bool> dirty_pages_;
321 };
322
323 template <typename T>
324 constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
325
326 template <typename T>
327 constexpr int64_t FileBackedVector<T>::kGrowElements;
328
329 template <typename T>
330 constexpr int64_t FileBackedVector<T>::kMaxNumElements;
331
332 template <typename T>
333 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem & filesystem,const std::string & file_path,MemoryMappedFile::Strategy mmap_strategy)334 FileBackedVector<T>::Create(const Filesystem& filesystem,
335 const std::string& file_path,
336 MemoryMappedFile::Strategy mmap_strategy) {
337 if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
338 // FileBackedVector's behavior of growing the file underneath the mmap is
339 // inherently broken with MAP_PRIVATE. Growing the vector requires extending
340 // the file size, then unmapping and then re-mmapping over the new, larger
341 // file. But when we unmap, we lose all the vector's contents if they
342 // weren't manually persisted. Either don't allow READ_WRITE_MANUAL_SYNC
343 // vectors from growing, or make users aware of this somehow
344 return absl_ports::UnimplementedError(
345 "FileBackedVector currently doesn't support READ_WRITE_MANUAL_SYNC "
346 "mmap strategy.");
347 }
348
349 ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
350 if (!fd.is_valid()) {
351 return absl_ports::InternalError(
352 absl_ports::StrCat("Failed to open ", file_path));
353 }
354
355 int64_t file_size = filesystem.GetFileSize(file_path.c_str());
356 if (file_size == Filesystem::kBadFileSize) {
357 return absl_ports::InternalError(
358 absl_ports::StrCat("Bad file size for file ", file_path));
359 }
360
361 const bool new_file = file_size == 0;
362 if (new_file) {
363 return InitializeNewFile(filesystem, file_path, std::move(fd),
364 mmap_strategy);
365 }
366 return InitializeExistingFile(filesystem, file_path, std::move(fd),
367 mmap_strategy);
368 }
369
370 template <typename T>
371 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem & filesystem,const std::string & file_path,ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy)372 FileBackedVector<T>::InitializeNewFile(
373 const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
374 MemoryMappedFile::Strategy mmap_strategy) {
375 // Create header.
376 auto header = std::make_unique<Header>();
377 header->magic = FileBackedVector<T>::Header::kMagic;
378 header->element_size = sizeof(T);
379 header->header_checksum = header->CalculateHeaderChecksum();
380
381 // We use Write() here, instead of writing through the mmapped region
382 // created below, so we can gracefully handle errors that occur when the
383 // disk is full. See b/77309668 for details.
384 if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
385 sizeof(Header))) {
386 return absl_ports::InternalError("Failed to write header");
387 }
388
389 // Constructor of MemoryMappedFile doesn't actually call mmap(), mmap()
390 // happens on MemoryMappedFile::Remap(). So having a potentially unflushed fd
391 // at this point shouldn't run into issues with a mmap of the same file. But
392 // we'll close the fd just in case.
393 fd.reset();
394 auto mmapped_file =
395 std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
396
397 return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
398 filesystem, file_path, std::move(header), std::move(mmapped_file)));
399 }
400
401 template <typename T>
402 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem & filesystem,const std::string & file_path,const ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy)403 FileBackedVector<T>::InitializeExistingFile(
404 const Filesystem& filesystem, const std::string& file_path,
405 const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
406 int64_t file_size = filesystem.GetFileSize(file_path.c_str());
407 if (file_size < sizeof(FileBackedVector<T>::Header)) {
408 return absl_ports::InternalError(
409 absl_ports::StrCat("File header too short for ", file_path));
410 }
411
412 auto header = std::make_unique<Header>();
413 if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
414 /*offset=*/0)) {
415 return absl_ports::InternalError(
416 absl_ports::StrCat("Failed to read header of ", file_path));
417 }
418
419 // Make sure the header is still valid before we use any of its values. This
420 // should technically be included in the header_checksum check below, but this
421 // is a quick/fast check that can save us from an extra crc computation.
422 if (header->kMagic != FileBackedVector<T>::Header::kMagic) {
423 return absl_ports::InternalError(
424 absl_ports::StrCat("Invalid header kMagic for ", file_path));
425 }
426
427 // Check header
428 if (header->header_checksum != header->CalculateHeaderChecksum()) {
429 return absl_ports::FailedPreconditionError(
430 absl_ports::StrCat("Invalid header crc for ", file_path));
431 }
432
433 if (header->element_size != sizeof(T)) {
434 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
435 "Inconsistent element size, expected %zd, actual %d", sizeof(T),
436 header->element_size));
437 }
438
439 int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header);
440 if (min_file_size > file_size) {
441 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
442 "Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
443 min_file_size, file_size));
444 }
445
446 // Mmap the content of the vector, excluding the header so its easier to
447 // access elements from the mmapped region
448 auto mmapped_file =
449 std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
450 ICING_RETURN_IF_ERROR(
451 mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
452
453 // Check vector contents
454 Crc32 vector_checksum;
455 std::string_view vector_contents(
456 reinterpret_cast<const char*>(mmapped_file->region()),
457 header->num_elements * sizeof(T));
458 vector_checksum.Append(vector_contents);
459
460 if (vector_checksum.Get() != header->vector_checksum) {
461 return absl_ports::FailedPreconditionError(
462 absl_ports::StrCat("Invalid vector contents for ", file_path));
463 }
464
465 return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
466 filesystem, file_path, std::move(header), std::move(mmapped_file)));
467 }
468
469 template <typename T>
Delete(const Filesystem & filesystem,const std::string & file_path)470 libtextclassifier3::Status FileBackedVector<T>::Delete(
471 const Filesystem& filesystem, const std::string& file_path) {
472 if (!filesystem.DeleteFile(file_path.c_str())) {
473 return absl_ports::InternalError(
474 absl_ports::StrCat("Failed to delete file: ", file_path));
475 }
476 return libtextclassifier3::Status::OK;
477 }
478
479 template <typename T>
FileBackedVector(const Filesystem & filesystem,const std::string & file_path,std::unique_ptr<Header> header,std::unique_ptr<MemoryMappedFile> mmapped_file)480 FileBackedVector<T>::FileBackedVector(
481 const Filesystem& filesystem, const std::string& file_path,
482 std::unique_ptr<Header> header,
483 std::unique_ptr<MemoryMappedFile> mmapped_file)
484 : filesystem_(&filesystem),
485 file_path_(file_path),
486 header_(std::move(header)),
487 mmapped_file_(std::move(mmapped_file)),
488 changes_end_(header_->num_elements) {}
489
490 template <typename T>
~FileBackedVector()491 FileBackedVector<T>::~FileBackedVector() {
492 if (mmapped_file_->strategy() ==
493 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
494 if (!PersistToDisk().ok()) {
495 ICING_LOG(WARNING)
496 << "Failed to persist vector to disk while destructing "
497 << file_path_;
498 }
499 }
500 }
501
502 template <typename T>
GetCopy(int32_t idx)503 libtextclassifier3::StatusOr<T> FileBackedVector<T>::GetCopy(
504 int32_t idx) const {
505 ICING_ASSIGN_OR_RETURN(const T* value, Get(idx));
506 return *value;
507 }
508
509 template <typename T>
Get(int32_t idx)510 libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
511 int32_t idx) const {
512 if (idx < 0) {
513 return absl_ports::OutOfRangeError(
514 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
515 }
516
517 if (idx >= header_->num_elements) {
518 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
519 "Index, %d, was greater than vector size, %d", idx,
520 header_->num_elements));
521 }
522
523 return &array()[idx];
524 }
525
526 template <typename T>
Set(int32_t idx,const T & value)527 libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
528 const T& value) {
529 if (idx < 0) {
530 return absl_ports::OutOfRangeError(
531 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
532 }
533
534 ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
535
536 if (idx + 1 > header_->num_elements) {
537 header_->num_elements = idx + 1;
538 }
539
540 if (mutable_array()[idx] == value) {
541 // No need to update
542 return libtextclassifier3::Status::OK;
543 }
544
545 // Cache original value to update crcs.
546 if (idx < changes_end_) {
547 // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
548 // revert to full CRC.
549 if ((saved_original_buffer_.size() + sizeof(T)) *
550 FileBackedVector<T>::kPartialCrcLimitDiv >
551 changes_end_ * sizeof(T)) {
552 ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
553 changes_.clear();
554 saved_original_buffer_.clear();
555 changes_end_ = 0;
556 header_->vector_checksum = 0;
557 } else {
558 int32_t start_byte = idx * sizeof(T);
559
560 changes_.push_back(idx);
561 saved_original_buffer_.append(
562 reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
563 sizeof(T));
564 }
565 }
566
567 mutable_array()[idx] = value;
568 return libtextclassifier3::Status::OK;
569 }
570
571 template <typename T>
GrowIfNecessary(int32_t num_elements)572 libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
573 int32_t num_elements) {
574 if (sizeof(T) == 0) {
575 // Growing is a no-op
576 return libtextclassifier3::Status::OK;
577 }
578
579 if (num_elements <= header_->num_elements) {
580 return libtextclassifier3::Status::OK;
581 }
582
583 if (num_elements > FileBackedVector<T>::kMaxNumElements) {
584 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
585 "%d exceeds maximum number of elements allowed, %lld", num_elements,
586 static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
587 }
588
589 int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
590 int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
591
592 if (least_file_size_needed <= current_file_size) {
593 // Our underlying file can hold the target num_elements cause we've grown
594 // before
595 return libtextclassifier3::Status::OK;
596 }
597
598 // Otherwise, we need to grow. Grow to kGrowElements boundary.
599 least_file_size_needed = math_util::RoundUpTo(
600 least_file_size_needed,
601 int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
602
603 // We use PWrite here rather than Grow because Grow doesn't actually allocate
604 // an underlying disk block. This can lead to problems with mmap because mmap
605 // has no effective way to signal that it was impossible to allocate the disk
606 // block and ends up crashing instead. PWrite will force the allocation of
607 // these blocks, which will ensure that any failure to grow will surface here.
608 int64_t page_size = getpagesize();
609 auto buf = std::make_unique<uint8_t[]>(page_size);
610 int64_t size_to_write = page_size - (current_file_size % page_size);
611 ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str()));
612 while (current_file_size < least_file_size_needed) {
613 if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(),
614 size_to_write)) {
615 return absl_ports::InternalError(
616 absl_ports::StrCat("Couldn't grow file ", file_path_));
617 }
618 current_file_size += size_to_write;
619 size_to_write = page_size - (current_file_size % page_size);
620 }
621
622 ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
623 sizeof(Header), least_file_size_needed - sizeof(Header)));
624
625 return libtextclassifier3::Status::OK;
626 }
627
628 template <typename T>
TruncateTo(int32_t new_num_elements)629 libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
630 int32_t new_num_elements) {
631 if (new_num_elements < 0) {
632 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
633 "Truncated length %d must be >= 0", new_num_elements));
634 }
635
636 if (new_num_elements >= header_->num_elements) {
637 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
638 "Truncated length %d must be less than the current size %d",
639 new_num_elements, header_->num_elements));
640 }
641
642 ICING_VLOG(2)
643 << "FileBackedVector truncating, need to recalculate entire checksum";
644 changes_.clear();
645 saved_original_buffer_.clear();
646 changes_end_ = 0;
647 header_->vector_checksum = 0;
648
649 header_->num_elements = new_num_elements;
650 return libtextclassifier3::Status::OK;
651 }
652
653 template <typename T>
ComputeChecksum()654 libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
655 // First apply the modified area. Keep a bitmap of already updated
656 // regions so we don't double-update.
657 std::vector<bool> updated(changes_end_);
658 uint32_t cur_offset = 0;
659 Crc32 cur_crc(header_->vector_checksum);
660 int num_partial_crcs = 0;
661 int num_truncated = 0;
662 int num_overlapped = 0;
663 int num_duplicate = 0;
664 for (size_t i = 0; i < changes_.size(); i++) {
665 const int32_t change_offset = changes_[i];
666 if (change_offset > changes_end_) {
667 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
668 "Failed to update crc, change offset %d, changes_end_ %d",
669 change_offset, changes_end_));
670 }
671
672 // Skip truncated tracked changes.
673 if (change_offset >= header_->num_elements) {
674 ++num_truncated;
675 continue;
676 }
677
678 // Turn change buffer into change^original.
679 const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
680 const char* cur_array =
681 reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
682 // Now xor in. SSE acceleration please?
683 for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
684 cur++, cur_array++) {
685 *cur ^= *cur_array;
686 }
687
688 // Skip over already updated bytes by setting update to 0.
689 bool new_update = false;
690 bool overlap = false;
691 uint32_t cur_element = change_offset;
692 for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
693 cur_element++, cur += sizeof(T)) {
694 if (updated[cur_element]) {
695 memset(cur, 0, sizeof(T));
696 overlap = true;
697 } else {
698 updated[cur_element] = true;
699 new_update = true;
700 }
701 }
702
703 // Apply update to crc.
704 if (new_update) {
705 // Explicitly create the string_view with length
706 std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
707 if (!cur_crc
708 .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
709 change_offset * sizeof(T))
710 .ok()) {
711 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
712 "Failed to update crc, change offset %d, change "
713 "length %zd changes_end_ %d",
714 change_offset, xored_str.length(), changes_end_));
715 }
716 num_partial_crcs++;
717 if (overlap) {
718 num_overlapped++;
719 }
720 } else {
721 num_duplicate++;
722 }
723 cur_offset += sizeof(T);
724 }
725
726 if (!changes_.empty()) {
727 ICING_VLOG(2) << IcingStringUtil::StringPrintf(
728 "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
729 num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
730 }
731
732 // Now update with grown area.
733 if (changes_end_ < header_->num_elements) {
734 // Explicitly create the string_view with length
735 std::string_view update_str(
736 reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
737 (header_->num_elements - changes_end_) * sizeof(T));
738 cur_crc.Append(update_str);
739 ICING_VLOG(2) << IcingStringUtil::StringPrintf(
740 "Array update tail crc offset %d -> %d", changes_end_,
741 header_->num_elements);
742 }
743
744 // Clear, now that we've applied changes.
745 changes_.clear();
746 saved_original_buffer_.clear();
747 changes_end_ = header_->num_elements;
748
749 // Commit new crc.
750 header_->vector_checksum = cur_crc.Get();
751 return cur_crc;
752 }
753
754 template <typename T>
PersistToDisk()755 libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
756 // Update and write the header
757 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
758 header_->vector_checksum = checksum.Get();
759 header_->header_checksum = header_->CalculateHeaderChecksum();
760
761 if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
762 sizeof(Header))) {
763 return absl_ports::InternalError("Failed to sync header");
764 }
765
766 MemoryMappedFile::Strategy strategy = mmapped_file_->strategy();
767
768 if (strategy == MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
769 // Changes should have been applied to the underlying file, but call msync()
770 // as an extra safety step to ensure they are written out.
771 ICING_RETURN_IF_ERROR(mmapped_file_->PersistToDisk());
772 }
773
774 return libtextclassifier3::Status::OK;
775 }
776
777 template <typename T>
GetDiskUsage()778 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage()
779 const {
780 int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
781 if (size == Filesystem::kBadFileSize) {
782 return absl_ports::InternalError(
783 "Failed to get disk usage of file-backed vector");
784 }
785 return size;
786 }
787
788 template <typename T>
GetElementsFileSize()789 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
790 const {
791 int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
792 if (total_file_size == Filesystem::kBadFileSize) {
793 return absl_ports::InternalError(
794 "Failed to get file size of elements in the file-backed vector");
795 }
796 return total_file_size - sizeof(Header);
797 }
798
799 } // namespace lib
800 } // namespace icing
801
802 #endif // ICING_FILE_FILE_BACKED_VECTOR_H_
803