1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // A file-backed vector that can store fixed-width elements. It provides
16 // built-in support for checksums to verify data integrity and an in-memory
17 // cache for fast read/writes.
18 //
19 // If the file is corrupted/in an invalid state, all contents are lost, i.e.
20 // there is no clear recovery path other than recreating/repopulating the
21 // contents.
22 //
23 // Note on Performance:
24 // The class keeps the vector in a mmapped area. This allows users to specify
25 // which MemoryMappedFile::Strategy they wish to use with this class. The vector
26 // will implicitly grow when the user tries to access an element beyond its
27 // current size. Growing happens in 16KiB chunks, up to a maximum size of 1MiB.
28 //
29 // Note on Checksumming:
30 // Checksumming happens lazily. We do tail checksums to avoid recalculating the
31 // checksum of the entire file on each modfification. A full checksum will be
32 // computed/verified at creation time, when persisting to disk, or whenever the
33 // user manually calls ComputeChecksum(). A separate header checksum is kept for
34 // a quick integrity check.
35 //
36 //
37 // Usage:
38 // RETURN_OR_ASSIGN(auto vector, FileBackedVector<char>::Create(...));
39 //
40 // ICING_RETURN_IF_ERROR(vector->Set(0, 'a'));
41 // ICING_RETURN_IF_ERROR(vector->Set(1, 'b'));
42 // ICING_RETURN_IF_ERROR(vector->Set(2, 'c'));
43 //
44 // vector->num_elements(); // Returns 3
45 //
46 // vector->At(2); // Returns 'c'
47 //
48 // vector->TruncateTo(1);
49 // vector->num_elements(); // Returns 1
50 // vector->At(0); // Returns 'a'
51 //
52 // vector->ComputeChecksum(); // Force a checksum update and gets the checksum
53 //
54 // vector->PersistToDisk(); // Persist contents to disk.
55
56 #ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
57 #define ICING_FILE_FILE_BACKED_VECTOR_H_
58
59 #include <sys/mman.h>
60 #include <unistd.h>
61
62 #include <algorithm>
63 #include <cinttypes>
64 #include <cstdint>
65 #include <cstring>
66 #include <functional>
67 #include <limits>
68 #include <memory>
69 #include <string>
70 #include <utility>
71 #include <vector>
72
73 #include "icing/text_classifier/lib3/utils/base/status.h"
74 #include "icing/text_classifier/lib3/utils/base/statusor.h"
75 #include "icing/absl_ports/canonical_errors.h"
76 #include "icing/absl_ports/str_cat.h"
77 #include "icing/file/filesystem.h"
78 #include "icing/file/memory-mapped-file.h"
79 #include "icing/legacy/core/icing-string-util.h"
80 #include "icing/portable/platform.h"
81 #include "icing/util/crc32.h"
82 #include "icing/util/logging.h"
83 #include "icing/util/math-util.h"
84 #include "icing/util/status-macros.h"
85
86 namespace icing {
87 namespace lib {
88
89 template <typename T>
90 class FileBackedVector {
91 public:
92 class MutableArrayView;
93 class MutableView;
94
95 // Header stored at the beginning of the file before the rest of the vector
96 // elements. Stores metadata on the vector.
97 struct Header {
98 // Static assert constants.
99 static constexpr int32_t kHeaderSize = 24;
100 static constexpr int32_t kHeaderChecksumOffset = 16;
101
102 static constexpr int32_t kMagic = 0x8bbbe237;
103
104 // Holds the magic as quick sanity check against file corruption
105 int32_t magic;
106
107 // Byte size of each element in the vector
108 int32_t element_size;
109
110 // Number of elements currently in the vector
111 int32_t num_elements;
112
113 // Checksum of the vector elements, doesn't include the header fields.
114 //
115 // TODO(cassiewang): Add a checksum state that can track if the checksum is
116 // fresh or stale. This lets us short circuit checksum computations if we
117 // know the checksum is fresh.
118 uint32_t vector_checksum;
119
120 // Must be below all actual header content fields and above the padding
121 // field. Contains the crc checksum of the preceding fields.
122 uint32_t header_checksum;
123
124 // This field has no actual meaning here but is just used as padding for the
125 // struct so the size of the struct can be a multiple of 8. Doing this makes
126 // the address right after the header a multiple of 8 and prevents a ubsan
127 // misalign-pointer-use error (go/ubsan).
128 //
129 // NOTE: please remove this when adding new fields and re-assert that the
130 // size is multiple of 8.
131 int32_t padding_for_ptr_alignment;
132
CalculateHeaderChecksumHeader133 uint32_t CalculateHeaderChecksum() const {
134 // Sanity check that the memory layout matches the disk layout.
135 static_assert(std::is_standard_layout<FileBackedVector::Header>::value,
136 "");
137 static_assert(sizeof(FileBackedVector::Header) == kHeaderSize, "");
138 static_assert(
139 sizeof(FileBackedVector::Header) % sizeof(void*) == 0,
140 "Header has insufficient padding for void* pointer alignment");
141 static_assert(offsetof(FileBackedVector::Header, header_checksum) ==
142 kHeaderChecksumOffset,
143 "");
144
145 return Crc32(std::string_view(
146 reinterpret_cast<const char*>(this),
147 offsetof(FileBackedVector::Header, header_checksum)))
148 .Get();
149 }
150 };
151
152 // Absolute max file size for FileBackedVector.
153 // - We memory map the whole file, so file size ~= memory size.
154 // - On 32-bit platform, the virtual memory address space is 4GB. To avoid
155 // exhausting the memory, set smaller file size limit for 32-bit platform.
156 #ifdef ICING_ARCH_BIT_64
157 static constexpr int32_t kMaxFileSize =
158 std::numeric_limits<int32_t>::max(); // 2^31-1 Bytes, ~2.1 GB
159 #else
160 static constexpr int32_t kMaxFileSize =
161 (1 << 28) + Header::kHeaderSize; // 2^28 + 12 Bytes, ~256 MiB
162 #endif
163
164 // Size of element type T. The value is same as sizeof(T), while we should
165 // avoid using sizeof(T) in our codebase to prevent unexpected unsigned
166 // integer casting.
167 static constexpr int32_t kElementTypeSize = static_cast<int32_t>(sizeof(T));
168 static_assert(sizeof(T) <= (1 << 10));
169
170 // Absolute max # of elements allowed. Since we are using int32_t to store
171 // num_elements, max value is 2^31-1. Still the actual max # of elements are
172 // determined by max_file_size, kMaxFileSize, kElementTypeSize, and
173 // Header::kHeaderSize.
174 static constexpr int32_t kMaxNumElements =
175 std::numeric_limits<int32_t>::max();
176
177 // Creates a new FileBackedVector to read/write content to.
178 //
179 // filesystem: Object to make system level calls
180 // file_path : Specifies the file to persist the vector to; must be a path
181 // within a directory that already exists.
182 // mmap_strategy : Strategy/optimizations to access the content in the vector,
183 // see MemoryMappedFile::Strategy for more details
184 // max_file_size: Maximum file size for FileBackedVector, default
185 // kMaxFileSize. Note that this value won't be written into the
186 // header, so maximum file size will always be specified in
187 // runtime and the caller should make sure the value is correct
188 // and reasonable. Also it will be cached in MemoryMappedFile
189 // member, so we can always call mmapped_file_->max_file_size()
190 // to get it.
191 // The range should be in
192 // [Header::kHeaderSize + kElementTypeSize, kMaxFileSize], and
193 // (max_file_size - Header::kHeaderSize) / kElementTypeSize is
194 // max # of elements that can be stored.
195 // pre_mapping_mmap_size: pre-mapping size of MemoryMappedFile, default 0.
196 // Pre-mapping a large memory region to the file and
197 // grow the underlying file later, so we can avoid
198 // remapping too frequently and reduce the cost of
199 // system call and memory paging after remap. The user
200 // should specify reasonable size to save remapping
201 // cost and avoid exhausting the memory at once in the
202 // beginning.
203 // Note: if the file exists and pre_mapping_mmap_size
204 // is smaller than file_size - Header::kHeaderSize,
205 // then it still pre-maps file_size -
206 // Header::kHeaderSize to make all existing elements
207 // available.
208 // TODO(b/247671531): figure out pre_mapping_mmap_size for each
209 // FileBackedVector use case.
210 //
211 // Return:
212 // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
213 // checksum.
214 // INTERNAL_ERROR on I/O errors.
215 // INVALID_ARGUMENT_ERROR if max_file_size is incorrect.
216 // UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
217 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
218 Create(const Filesystem& filesystem, const std::string& file_path,
219 MemoryMappedFile::Strategy mmap_strategy,
220 int32_t max_file_size = kMaxFileSize,
221 int32_t pre_mapping_mmap_size = 0);
222
223 // Deletes the FileBackedVector
224 //
225 // filesystem: Object to make system level calls
226 // file_path : Specifies the file the vector is persisted to.
227 static libtextclassifier3::Status Delete(const Filesystem& filesystem,
228 const std::string& file_path);
229
230 // Not copyable
231 FileBackedVector(const FileBackedVector&) = delete;
232 FileBackedVector& operator=(const FileBackedVector&) = delete;
233
234 // If the vector was created with
235 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
236 // synced by the system and the checksum will be updated.
237 ~FileBackedVector();
238
239 // Gets a copy of the element at idx.
240 //
241 // This is useful if you think the FileBackedVector may grow before you need
242 // to access this return value. When the FileBackedVector grows, the
243 // underlying mmap will be unmapped and remapped, which will invalidate any
244 // pointers to the previously mapped region. Getting a copy will avoid
245 // referencing the now-invalidated region.
246 //
247 // Returns:
248 // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
249 libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
250
251 // Gets an immutable pointer to the element at idx.
252 //
253 // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the pointer
254 // returned by Get.
255 //
256 // This is useful if you do not think the FileBackedVector will grow before
257 // you need to reference this value, and you want to avoid a copy. When the
258 // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
259 // which will invalidate this pointer to the previously mapped region.
260 //
261 // Returns:
262 // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
263 libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
264
265 // Gets a MutableView to the element at idx.
266 //
267 // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
268 // reference returned by MutableView::Get().
269 //
270 // This is useful if you do not think the FileBackedVector will grow before
271 // you need to reference this value, and you want to mutate the underlying
272 // data directly. When the FileBackedVector grows, the underlying mmap will be
273 // unmapped and remapped, which will invalidate this MutableView to the
274 // previously mapped region.
275 //
276 // Returns:
277 // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
278 libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx);
279
280 // Gets a MutableArrayView to the elements at range [idx, idx + len).
281 //
282 // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
283 // reference/pointer returned by MutableArrayView::operator[]/data().
284 //
285 // This is useful if you do not think the FileBackedVector will grow before
286 // you need to reference this value, and you want to mutate the underlying
287 // data directly. When the FileBackedVector grows, the underlying mmap will be
288 // unmapped and remapped, which will invalidate this MutableArrayView to the
289 // previously mapped region.
290 //
291 // Returns:
292 // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > num_elements()
293 libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx,
294 int32_t len);
295
296 // Writes the value at idx.
297 //
298 // May grow the underlying file and mmapped region as needed to fit the new
299 // value. If it does grow, then any pointers/references to previous values
300 // returned from Get/GetMutable/Allocate may be invalidated.
301 //
302 // Returns:
303 // OUT_OF_RANGE_ERROR if idx < 0 or idx > kMaxIndex or file cannot be grown
304 // to fit idx + 1 elements
305 libtextclassifier3::Status Set(int32_t idx, const T& value);
306
307 // Set [idx, idx + len) to a single value.
308 //
309 // May grow the underlying file and mmapped region as needed to fit the new
310 // value. If it does grow, then any pointers/references to previous values
311 // returned from Get/GetMutable/Allocate may be invalidated.
312 //
313 // Returns:
314 // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > kMaxNumElements or file
315 // cannot be grown to fit idx + len elements
316 libtextclassifier3::Status Set(int32_t idx, int32_t len, const T& value);
317
318 // Appends the value to the end of the vector.
319 //
320 // May grow the underlying file and mmapped region as needed to fit the new
321 // value. If it does grow, then any pointers/references to previous values
322 // returned from Get/GetMutable/Allocate may be invalidated.
323 //
324 // Returns:
325 // OUT_OF_RANGE_ERROR if file cannot be grown (i.e. reach
326 // mmapped_file_->max_file_size())
Append(const T & value)327 libtextclassifier3::Status Append(const T& value) {
328 return Set(header_->num_elements, value);
329 }
330
331 // Allocates spaces with given length in the end of the vector and returns a
332 // MutableArrayView to the space.
333 //
334 // May grow the underlying file and mmapped region as needed to fit the new
335 // value. If it does grow, then any pointers/references to previous values
336 // returned from Get/GetMutable/Allocate may be invalidated.
337 //
338 // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
339 // reference/pointer returned by MutableArrayView::operator[]/data().
340 //
341 // This is useful if you do not think the FileBackedVector will grow before
342 // you need to reference this value, and you want to allocate adjacent spaces
343 // for multiple elements and mutate the underlying data directly. When the
344 // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
345 // which will invalidate this MutableArrayView to the previously mapped
346 // region.
347 //
348 // Returns:
349 // OUT_OF_RANGE_ERROR if len <= 0 or file cannot be grown (i.e. reach
350 // mmapped_file_->max_file_size())
351 libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len);
352
353 // Resizes to first len elements. The crc is cleared on truncation and will be
354 // updated on destruction, or once the client calls ComputeChecksum() or
355 // PersistToDisk().
356 //
357 // Returns:
358 // OUT_OF_RANGE_ERROR if len < 0 or len >= num_elements()
359 libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
360
361 // Sorts the vector within range [begin_idx, end_idx).
362 // It handles SetDirty properly for the file-backed-vector.
363 //
364 // Returns:
365 // OUT_OF_RANGE_ERROR if (0 <= begin_idx < end_idx <= num_elements()) does
366 // not hold
367 libtextclassifier3::Status Sort(int32_t begin_idx, int32_t end_idx);
368
369 // Mark idx as changed iff idx < changes_end_, so later ComputeChecksum() can
370 // update checksum by the cached changes without going over [0, changes_end_).
371 //
372 // If the buffer size exceeds kPartialCrcLimitDiv, then clear all change
373 // buffers and set changes_end_ as 0, indicating that the checksum should be
374 // recomputed from idx 0 (starting from the beginning). Otherwise cache the
375 // change.
376 void SetDirty(int32_t idx);
377
378 // Flushes content to underlying file.
379 //
380 // Returns:
381 // OK on success
382 // INTERNAL on I/O error
383 libtextclassifier3::Status PersistToDisk();
384
385 // Calculates and returns the disk usage in bytes. Rounds up to the nearest
386 // block size.
387 //
388 // Returns:
389 // Disk usage on success
390 // INTERNAL_ERROR on IO error
391 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
392
393 // Returns the file size of the all the elements held in the vector. File size
394 // is in bytes. This excludes the size of any internal metadata of the vector,
395 // e.g. the vector's header.
396 //
397 // Returns:
398 // File size on success
399 // INTERNAL_ERROR on IO error
400 libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
401
402 // Updates checksum of the vector contents and returns it.
403 //
404 // Returns:
405 // INTERNAL_ERROR if the vector's internal state is inconsistent
406 libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
407
408 // Accessors.
array()409 const T* array() const {
410 return reinterpret_cast<const T*>(mmapped_file_->region());
411 }
412
num_elements()413 int32_t num_elements() const { return header_->num_elements; }
414
415 public:
416 class MutableArrayView {
417 public:
418 const T& operator[](int32_t idx) const { return data_[idx]; }
419 T& operator[](int32_t idx) {
420 SetDirty(idx);
421 return data_[idx];
422 }
423
data()424 const T* data() const { return data_; }
425
size()426 int32_t size() const { return len_; }
427
428 // Set the mutable array slice (starting at idx) by the given element array.
429 // It handles SetDirty properly for the file-backed-vector when modifying
430 // elements.
431 //
432 // REQUIRES: arr is valid && arr_len >= 0 && idx >= 0 && idx + arr_len <=
433 // size(), otherwise the behavior is undefined.
SetArray(int32_t idx,const T * arr,int32_t arr_len)434 void SetArray(int32_t idx, const T* arr, int32_t arr_len) {
435 for (int32_t i = 0; i < arr_len; ++i) {
436 SetDirty(idx + i);
437 data_[idx + i] = arr[i];
438 }
439 }
440
441 private:
MutableArrayView(FileBackedVector<T> * vector,T * data,int32_t len)442 MutableArrayView(FileBackedVector<T>* vector, T* data, int32_t len)
443 : vector_(vector),
444 data_(data),
445 original_idx_(data - vector->array()),
446 len_(len) {}
447
SetDirty(int32_t idx)448 void SetDirty(int32_t idx) { vector_->SetDirty(original_idx_ + idx); }
449
450 // Does not own. For SetDirty only.
451 FileBackedVector<T>* vector_;
452
453 // data_ points at vector_->mutable_array()[original_idx_]
454 T* data_;
455 int32_t original_idx_;
456 int32_t len_;
457
458 friend class FileBackedVector;
459 };
460
461 class MutableView {
462 public:
Get()463 const T& Get() const { return mutable_array_view_[0]; }
Get()464 T& Get() { return mutable_array_view_[0]; }
465
466 private:
MutableView(FileBackedVector<T> * vector,T * data)467 MutableView(FileBackedVector<T>* vector, T* data)
468 : mutable_array_view_(vector, data, 1) {}
469
470 MutableArrayView mutable_array_view_;
471
472 friend class FileBackedVector;
473 };
474
475 private:
476 // We track partial updates to the array for crc updating. This
477 // requires extra memory to keep track of original buffers but
478 // allows for much faster crc re-computation. This is the frac limit
479 // of byte len after which we will discard recorded changes and
480 // recompute the entire crc instead.
481 static constexpr int32_t kPartialCrcLimitDiv = 8; // limit is 1/8th
482
483 // Grow file by at least this many elements if array is growable.
484 static constexpr int64_t kGrowElements = 1u << 14; // 16K
485
486 // Absolute max index allowed.
487 static constexpr int32_t kMaxIndex = kMaxNumElements - 1;
488
489 // Can only be created through the factory ::Create function
490 explicit FileBackedVector(const Filesystem& filesystem,
491 const std::string& file_path,
492 std::unique_ptr<Header> header,
493 MemoryMappedFile&& mmapped_file);
494
495 // Initialize a new FileBackedVector, and create the file.
496 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
497 InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
498 ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
499 int32_t max_file_size, int32_t pre_mapping_mmap_size);
500
501 // Initialize a FileBackedVector from an existing file.
502 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
503 InitializeExistingFile(const Filesystem& filesystem,
504 const std::string& file_path, ScopedFd fd,
505 MemoryMappedFile::Strategy mmap_strategy,
506 int32_t max_file_size, int32_t pre_mapping_mmap_size);
507
508 // Grows the underlying file to hold at least num_elements
509 //
510 // Returns:
511 // OUT_OF_RANGE_ERROR if we can't grow to the specified size
512 libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
513
mutable_array()514 T* mutable_array() const {
515 return reinterpret_cast<T*>(mmapped_file_->mutable_region());
516 }
517
518 // Cached constructor params.
519 const Filesystem* const filesystem_;
520 const std::string file_path_;
521 std::unique_ptr<Header> header_;
522 std::unique_ptr<MemoryMappedFile> mmapped_file_;
523
524 // Offset before which all the elements have been included in the calculation
525 // of crc at the time it was calculated.
526 int32_t changes_end_ = 0;
527
528 // Offset of changes that have happened since the last crc update between [0,
529 // changes_end_).
530 std::vector<int32_t> changes_;
531
532 // Buffer of the original elements that have been changed since the last crc
533 // update. Will be cleared if the size grows too big.
534 std::string saved_original_buffer_;
535 };
536
537 template <typename T>
538 constexpr int32_t FileBackedVector<T>::kMaxFileSize;
539
540 template <typename T>
541 constexpr int32_t FileBackedVector<T>::kElementTypeSize;
542
543 template <typename T>
544 constexpr int32_t FileBackedVector<T>::kMaxNumElements;
545
546 template <typename T>
547 constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
548
549 template <typename T>
550 constexpr int64_t FileBackedVector<T>::kGrowElements;
551
552 template <typename T>
553 constexpr int32_t FileBackedVector<T>::kMaxIndex;
554
555 template <typename T>
556 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem & filesystem,const std::string & file_path,MemoryMappedFile::Strategy mmap_strategy,int32_t max_file_size,int32_t pre_mapping_mmap_size)557 FileBackedVector<T>::Create(const Filesystem& filesystem,
558 const std::string& file_path,
559 MemoryMappedFile::Strategy mmap_strategy,
560 int32_t max_file_size,
561 int32_t pre_mapping_mmap_size) {
562 if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
563 // FileBackedVector's behavior of growing the file underneath the mmap is
564 // inherently broken with MAP_PRIVATE. Growing the vector requires extending
565 // the file size, then unmapping and then re-mmapping over the new, larger
566 // file. But when we unmap, we lose all the vector's contents if they
567 // weren't manually persisted. Either don't allow READ_WRITE_MANUAL_SYNC
568 // vectors from growing, or make users aware of this somehow
569 return absl_ports::UnimplementedError(
570 "FileBackedVector currently doesn't support READ_WRITE_MANUAL_SYNC "
571 "mmap strategy.");
572 }
573
574 if (max_file_size < Header::kHeaderSize + kElementTypeSize ||
575 max_file_size > kMaxFileSize) {
576 // FileBackedVector should be able to store at least 1 element, so
577 // max_file_size should be at least Header::kHeaderSize + kElementTypeSize.
578 return absl_ports::InvalidArgumentError(
579 "Invalid max file size for FileBackedVector");
580 }
581
582 ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
583 if (!fd.is_valid()) {
584 return absl_ports::InternalError(
585 absl_ports::StrCat("Failed to open ", file_path));
586 }
587
588 int64_t file_size = filesystem.GetFileSize(file_path.c_str());
589 if (file_size == Filesystem::kBadFileSize) {
590 return absl_ports::InternalError(
591 absl_ports::StrCat("Bad file size for file ", file_path));
592 }
593
594 if (max_file_size < file_size) {
595 return absl_ports::InvalidArgumentError(
596 "Max file size should not be smaller than the existing file size");
597 }
598
599 const bool new_file = file_size == 0;
600 if (new_file) {
601 return InitializeNewFile(filesystem, file_path, std::move(fd),
602 mmap_strategy, max_file_size,
603 pre_mapping_mmap_size);
604 }
605 return InitializeExistingFile(filesystem, file_path, std::move(fd),
606 mmap_strategy, max_file_size,
607 pre_mapping_mmap_size);
608 }
609
610 template <typename T>
611 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem & filesystem,const std::string & file_path,ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy,int32_t max_file_size,int32_t pre_mapping_mmap_size)612 FileBackedVector<T>::InitializeNewFile(const Filesystem& filesystem,
613 const std::string& file_path,
614 ScopedFd fd,
615 MemoryMappedFile::Strategy mmap_strategy,
616 int32_t max_file_size,
617 int32_t pre_mapping_mmap_size) {
618 // Create header.
619 auto header = std::make_unique<Header>();
620 header->magic = FileBackedVector<T>::Header::kMagic;
621 header->element_size = kElementTypeSize;
622 header->header_checksum = header->CalculateHeaderChecksum();
623
624 // We use Write() here, instead of writing through the mmapped region
625 // created below, so we can gracefully handle errors that occur when the
626 // disk is full. See b/77309668 for details.
627 if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
628 Header::kHeaderSize)) {
629 return absl_ports::InternalError("Failed to write header");
630 }
631
632 // Close the fd since constructor of MemoryMappedFile calls mmap() and we need
633 // to flush fd before mmap().
634 fd.reset();
635
636 ICING_ASSIGN_OR_RETURN(
637 MemoryMappedFile mmapped_file,
638 MemoryMappedFile::Create(filesystem, file_path, mmap_strategy,
639 max_file_size,
640 /*pre_mapping_file_offset=*/Header::kHeaderSize,
641 /*pre_mapping_mmap_size=*/
642 std::min(max_file_size - Header::kHeaderSize,
643 pre_mapping_mmap_size)));
644
645 return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
646 filesystem, file_path, std::move(header), std::move(mmapped_file)));
647 }
648
649 template <typename T>
650 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem & filesystem,const std::string & file_path,const ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy,int32_t max_file_size,int32_t pre_mapping_mmap_size)651 FileBackedVector<T>::InitializeExistingFile(
652 const Filesystem& filesystem, const std::string& file_path,
653 const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
654 int32_t max_file_size, int32_t pre_mapping_mmap_size) {
655 int64_t file_size = filesystem.GetFileSize(file_path.c_str());
656 if (file_size == Filesystem::kBadFileSize) {
657 return absl_ports::InternalError(
658 absl_ports::StrCat("Bad file size for file ", file_path));
659 }
660
661 if (file_size < Header::kHeaderSize) {
662 return absl_ports::InternalError(
663 absl_ports::StrCat("File header too short for ", file_path));
664 }
665
666 auto header = std::make_unique<Header>();
667 if (!filesystem.PRead(fd.get(), header.get(), Header::kHeaderSize,
668 /*offset=*/0)) {
669 return absl_ports::InternalError(
670 absl_ports::StrCat("Failed to read header of ", file_path));
671 }
672
673 // Make sure the header is still valid before we use any of its values. This
674 // should technically be included in the header_checksum check below, but this
675 // is a quick/fast check that can save us from an extra crc computation.
676 if (header->kMagic != FileBackedVector<T>::Header::kMagic) {
677 return absl_ports::InternalError(
678 absl_ports::StrCat("Invalid header kMagic for ", file_path));
679 }
680
681 // Check header
682 if (header->header_checksum != header->CalculateHeaderChecksum()) {
683 return absl_ports::FailedPreconditionError(
684 absl_ports::StrCat("Invalid header crc for ", file_path));
685 }
686
687 if (header->element_size != kElementTypeSize) {
688 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
689 "Inconsistent element size, expected %d, actual %d", kElementTypeSize,
690 header->element_size));
691 }
692
693 int64_t min_file_size =
694 static_cast<int64_t>(header->num_elements) * kElementTypeSize +
695 Header::kHeaderSize;
696 if (min_file_size > file_size) {
697 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
698 "Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
699 min_file_size, file_size));
700 }
701
702 // Mmap the content of the vector, excluding the header so its easier to
703 // access elements from the mmapped region
704 // Although users can specify their own pre_mapping_mmap_size, we should make
705 // sure that the pre-map size is at least file_size - Header::kHeaderSize to
706 // make all existing elements available.
707 ICING_ASSIGN_OR_RETURN(
708 MemoryMappedFile mmapped_file,
709 MemoryMappedFile::Create(
710 filesystem, file_path, mmap_strategy, max_file_size,
711 /*pre_mapping_file_offset=*/Header::kHeaderSize,
712 /*pre_mapping_mmap_size=*/
713 std::max(
714 file_size - Header::kHeaderSize,
715 static_cast<int64_t>(std::min(max_file_size - Header::kHeaderSize,
716 pre_mapping_mmap_size)))));
717
718 // Check vector contents
719 Crc32 vector_checksum(
720 std::string_view(reinterpret_cast<const char*>(mmapped_file.region()),
721 header->num_elements * kElementTypeSize));
722
723 if (vector_checksum.Get() != header->vector_checksum) {
724 return absl_ports::FailedPreconditionError(
725 absl_ports::StrCat("Invalid vector contents for ", file_path));
726 }
727
728 return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
729 filesystem, file_path, std::move(header), std::move(mmapped_file)));
730 }
731
732 template <typename T>
Delete(const Filesystem & filesystem,const std::string & file_path)733 libtextclassifier3::Status FileBackedVector<T>::Delete(
734 const Filesystem& filesystem, const std::string& file_path) {
735 if (!filesystem.DeleteFile(file_path.c_str())) {
736 return absl_ports::InternalError(
737 absl_ports::StrCat("Failed to delete file: ", file_path));
738 }
739 return libtextclassifier3::Status::OK;
740 }
741
742 template <typename T>
FileBackedVector(const Filesystem & filesystem,const std::string & file_path,std::unique_ptr<Header> header,MemoryMappedFile && mmapped_file)743 FileBackedVector<T>::FileBackedVector(const Filesystem& filesystem,
744 const std::string& file_path,
745 std::unique_ptr<Header> header,
746 MemoryMappedFile&& mmapped_file)
747 : filesystem_(&filesystem),
748 file_path_(file_path),
749 header_(std::move(header)),
750 mmapped_file_(
751 std::make_unique<MemoryMappedFile>(std::move(mmapped_file))),
752 changes_end_(header_->num_elements) {}
753
754 template <typename T>
~FileBackedVector()755 FileBackedVector<T>::~FileBackedVector() {
756 if (mmapped_file_->strategy() ==
757 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
758 if (!PersistToDisk().ok()) {
759 ICING_LOG(WARNING)
760 << "Failed to persist vector to disk while destructing "
761 << file_path_;
762 }
763 }
764 }
765
766 template <typename T>
GetCopy(int32_t idx)767 libtextclassifier3::StatusOr<T> FileBackedVector<T>::GetCopy(
768 int32_t idx) const {
769 ICING_ASSIGN_OR_RETURN(const T* value, Get(idx));
770 return *value;
771 }
772
773 template <typename T>
Get(int32_t idx)774 libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
775 int32_t idx) const {
776 if (idx < 0) {
777 return absl_ports::OutOfRangeError(
778 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
779 }
780
781 if (idx >= header_->num_elements) {
782 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
783 "Index, %d, was greater than vector size, %d", idx,
784 header_->num_elements));
785 }
786
787 return &array()[idx];
788 }
789
790 template <typename T>
791 libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableView>
GetMutable(int32_t idx)792 FileBackedVector<T>::GetMutable(int32_t idx) {
793 if (idx < 0) {
794 return absl_ports::OutOfRangeError(
795 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
796 }
797
798 if (idx >= header_->num_elements) {
799 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
800 "Index, %d, was greater than vector size, %d", idx,
801 header_->num_elements));
802 }
803
804 return MutableView(this, &mutable_array()[idx]);
805 }
806
807 template <typename T>
808 libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
GetMutable(int32_t idx,int32_t len)809 FileBackedVector<T>::GetMutable(int32_t idx, int32_t len) {
810 if (idx < 0) {
811 return absl_ports::OutOfRangeError(
812 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
813 }
814
815 if (idx > header_->num_elements - len) {
816 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
817 "Index with len, %d %d, was greater than vector size, %d", idx, len,
818 header_->num_elements));
819 }
820
821 return MutableArrayView(this, &mutable_array()[idx], len);
822 }
823
824 template <typename T>
Set(int32_t idx,const T & value)825 libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
826 const T& value) {
827 return Set(idx, 1, value);
828 }
829
830 template <typename T>
Set(int32_t idx,int32_t len,const T & value)831 libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, int32_t len,
832 const T& value) {
833 if (idx < 0) {
834 return absl_ports::OutOfRangeError(
835 IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
836 }
837
838 if (len <= 0) {
839 return absl_ports::OutOfRangeError("Invalid set length");
840 }
841
842 if (idx > kMaxNumElements - len) {
843 return absl_ports::OutOfRangeError(
844 IcingStringUtil::StringPrintf("Length %d (with index %d), was too long "
845 "for max num elements allowed, %d",
846 len, idx, kMaxNumElements));
847 }
848
849 ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + len));
850
851 if (idx + len > header_->num_elements) {
852 header_->num_elements = idx + len;
853 }
854
855 for (int32_t i = 0; i < len; ++i) {
856 if (array()[idx + i] == value) {
857 // No need to update
858 continue;
859 }
860
861 SetDirty(idx + i);
862 mutable_array()[idx + i] = value;
863 }
864
865 return libtextclassifier3::Status::OK;
866 }
867
868 template <typename T>
869 libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
Allocate(int32_t len)870 FileBackedVector<T>::Allocate(int32_t len) {
871 if (len <= 0) {
872 return absl_ports::OutOfRangeError("Invalid allocate length");
873 }
874
875 if (len > kMaxNumElements - header_->num_elements) {
876 return absl_ports::OutOfRangeError(
877 IcingStringUtil::StringPrintf("Cannot allocate %d elements", len));
878 }
879
880 // Although header_->num_elements + len doesn't exceed kMaxNumElements, the
881 // actual max # of elements are determined by mmapped_file_->max_file_size(),
882 // kElementTypeSize, and kHeaderSize. Thus, it is still possible to fail to
883 // grow the file.
884 ICING_RETURN_IF_ERROR(GrowIfNecessary(header_->num_elements + len));
885
886 int32_t start_idx = header_->num_elements;
887 header_->num_elements += len;
888
889 return MutableArrayView(this, &mutable_array()[start_idx], len);
890 }
891
892 template <typename T>
GrowIfNecessary(int32_t num_elements)893 libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
894 int32_t num_elements) {
895 if (kElementTypeSize == 0) {
896 // Growing is a no-op
897 return libtextclassifier3::Status::OK;
898 }
899
900 if (num_elements <= header_->num_elements) {
901 return libtextclassifier3::Status::OK;
902 }
903
904 if (num_elements > (mmapped_file_->max_file_size() - Header::kHeaderSize) /
905 kElementTypeSize) {
906 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
907 "%d elements total size exceed maximum bytes of elements allowed, "
908 "%" PRId64 " bytes",
909 num_elements, mmapped_file_->max_file_size() - Header::kHeaderSize));
910 }
911
912 int32_t least_file_size_needed =
913 Header::kHeaderSize + num_elements * kElementTypeSize; // Won't overflow
914 if (least_file_size_needed <= mmapped_file_->available_size()) {
915 return libtextclassifier3::Status::OK;
916 }
917
918 int64_t round_up_file_size_needed = math_util::RoundUpTo(
919 int64_t{least_file_size_needed},
920 int64_t{FileBackedVector<T>::kGrowElements} * kElementTypeSize);
921
922 // Call GrowAndRemapIfNecessary. It handles file growth internally and remaps
923 // intelligently.
924 // We've ensured that least_file_size_needed (for num_elements) doesn't exceed
925 // mmapped_file_->max_file_size(), but it is still possible that
926 // round_up_file_size_needed exceeds it, so use the smaller value of them as
927 // new_mmap_size.
928 ICING_RETURN_IF_ERROR(mmapped_file_->GrowAndRemapIfNecessary(
929 /*new_file_offset=*/Header::kHeaderSize,
930 /*new_mmap_size=*/std::min(round_up_file_size_needed,
931 mmapped_file_->max_file_size()) -
932 Header::kHeaderSize));
933
934 return libtextclassifier3::Status::OK;
935 }
936
937 template <typename T>
TruncateTo(int32_t new_num_elements)938 libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
939 int32_t new_num_elements) {
940 if (new_num_elements < 0) {
941 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
942 "Truncated length %d must be >= 0", new_num_elements));
943 }
944
945 if (new_num_elements >= header_->num_elements) {
946 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
947 "Truncated length %d must be less than the current size %d",
948 new_num_elements, header_->num_elements));
949 }
950
951 ICING_VLOG(2)
952 << "FileBackedVector truncating, need to recalculate entire checksum";
953 changes_.clear();
954 saved_original_buffer_.clear();
955 changes_end_ = 0;
956 header_->vector_checksum = 0;
957
958 header_->num_elements = new_num_elements;
959 return libtextclassifier3::Status::OK;
960 }
961
962 template <typename T>
Sort(int32_t begin_idx,int32_t end_idx)963 libtextclassifier3::Status FileBackedVector<T>::Sort(int32_t begin_idx,
964 int32_t end_idx) {
965 if (begin_idx < 0 || begin_idx >= end_idx ||
966 end_idx > header_->num_elements) {
967 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
968 "Invalid sort index, %d, %d", begin_idx, end_idx));
969 }
970 for (int32_t i = begin_idx; i < end_idx; ++i) {
971 SetDirty(i);
972 }
973 std::sort(mutable_array() + begin_idx, mutable_array() + end_idx);
974 return libtextclassifier3::Status::OK;
975 }
976
977 template <typename T>
SetDirty(int32_t idx)978 void FileBackedVector<T>::SetDirty(int32_t idx) {
979 // Cache original value to update crcs.
980 if (idx >= 0 && idx < changes_end_) {
981 // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
982 // revert to full CRC.
983 if ((saved_original_buffer_.size() + kElementTypeSize) *
984 FileBackedVector<T>::kPartialCrcLimitDiv >
985 changes_end_ * kElementTypeSize) {
986 ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
987 changes_.clear();
988 saved_original_buffer_.clear();
989 changes_end_ = 0;
990 header_->vector_checksum = 0;
991 } else {
992 int32_t start_byte = idx * kElementTypeSize;
993
994 changes_.push_back(idx);
995 saved_original_buffer_.append(
996 reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
997 kElementTypeSize);
998 }
999 }
1000 }
1001
1002 template <typename T>
ComputeChecksum()1003 libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
1004 // First apply the modified area. Keep a bitmap of already updated
1005 // regions so we don't double-update.
1006 std::vector<bool> updated(changes_end_);
1007 uint32_t cur_offset = 0;
1008 Crc32 cur_crc(header_->vector_checksum);
1009 int num_partial_crcs = 0;
1010 int num_truncated = 0;
1011 int num_overlapped = 0;
1012 int num_duplicate = 0;
1013 for (const int32_t change_offset : changes_) {
1014 if (change_offset > changes_end_) {
1015 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1016 "Failed to update crc, change offset %d, changes_end_ %d",
1017 change_offset, changes_end_));
1018 }
1019
1020 // Skip truncated tracked changes.
1021 if (change_offset >= header_->num_elements) {
1022 ++num_truncated;
1023 continue;
1024 }
1025
1026 // Turn change buffer into change^original.
1027 const char* buffer_end =
1028 &saved_original_buffer_[cur_offset + kElementTypeSize];
1029 const char* cur_array = reinterpret_cast<const char*>(array()) +
1030 change_offset * kElementTypeSize;
1031 // Now xor in. SSE acceleration please?
1032 for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
1033 cur++, cur_array++) {
1034 *cur ^= *cur_array;
1035 }
1036
1037 // Skip over already updated bytes by setting update to 0.
1038 bool new_update = false;
1039 bool overlap = false;
1040 uint32_t cur_element = change_offset;
1041 for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
1042 cur_element++, cur += kElementTypeSize) {
1043 if (updated[cur_element]) {
1044 memset(cur, 0, kElementTypeSize);
1045 overlap = true;
1046 } else {
1047 updated[cur_element] = true;
1048 new_update = true;
1049 }
1050 }
1051
1052 // Apply update to crc.
1053 if (new_update) {
1054 // Explicitly create the string_view with length
1055 std::string_view xored_str(buffer_end - kElementTypeSize,
1056 kElementTypeSize);
1057 if (!cur_crc
1058 .UpdateWithXor(xored_str, changes_end_ * kElementTypeSize,
1059 change_offset * kElementTypeSize)
1060 .ok()) {
1061 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1062 "Failed to update crc, change offset %d, change "
1063 "length %zd changes_end_ %d",
1064 change_offset, xored_str.length(), changes_end_));
1065 }
1066 num_partial_crcs++;
1067 if (overlap) {
1068 num_overlapped++;
1069 }
1070 } else {
1071 num_duplicate++;
1072 }
1073 cur_offset += kElementTypeSize;
1074 }
1075
1076 if (!changes_.empty()) {
1077 ICING_VLOG(2) << IcingStringUtil::StringPrintf(
1078 "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
1079 num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
1080 }
1081
1082 // Now update with grown area.
1083 if (changes_end_ < header_->num_elements) {
1084 // Explicitly create the string_view with length
1085 std::string_view update_str(
1086 reinterpret_cast<const char*>(array()) +
1087 changes_end_ * kElementTypeSize,
1088 (header_->num_elements - changes_end_) * kElementTypeSize);
1089 cur_crc.Append(update_str);
1090 ICING_VLOG(2) << IcingStringUtil::StringPrintf(
1091 "Array update tail crc offset %d -> %d", changes_end_,
1092 header_->num_elements);
1093 }
1094
1095 // Clear, now that we've applied changes.
1096 changes_.clear();
1097 saved_original_buffer_.clear();
1098 changes_end_ = header_->num_elements;
1099
1100 // Commit new crc.
1101 header_->vector_checksum = cur_crc.Get();
1102 return cur_crc;
1103 }
1104
1105 template <typename T>
PersistToDisk()1106 libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
1107 // Update and write the header
1108 ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
1109 header_->vector_checksum = checksum.Get();
1110 header_->header_checksum = header_->CalculateHeaderChecksum();
1111
1112 if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
1113 Header::kHeaderSize)) {
1114 return absl_ports::InternalError("Failed to sync header");
1115 }
1116
1117 MemoryMappedFile::Strategy strategy = mmapped_file_->strategy();
1118
1119 if (strategy == MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
1120 // Changes should have been applied to the underlying file, but call msync()
1121 // as an extra safety step to ensure they are written out.
1122 ICING_RETURN_IF_ERROR(mmapped_file_->PersistToDisk());
1123 }
1124
1125 return libtextclassifier3::Status::OK;
1126 }
1127
1128 template <typename T>
GetDiskUsage()1129 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage()
1130 const {
1131 int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1132 if (size == Filesystem::kBadFileSize) {
1133 return absl_ports::InternalError(
1134 "Failed to get disk usage of file-backed vector");
1135 }
1136 return size;
1137 }
1138
1139 template <typename T>
GetElementsFileSize()1140 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
1141 const {
1142 int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
1143 if (total_file_size == Filesystem::kBadFileSize) {
1144 return absl_ports::InternalError(
1145 "Failed to get file size of elements in the file-backed vector");
1146 }
1147 if (total_file_size < Header::kHeaderSize) {
1148 return absl_ports::InternalError(
1149 "File size should not be smaller than header size");
1150 }
1151 return total_file_size - Header::kHeaderSize;
1152 }
1153
1154 } // namespace lib
1155 } // namespace icing
1156
1157 #endif // ICING_FILE_FILE_BACKED_VECTOR_H_
1158