• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A file-backed vector that can store fixed-width elements. It provides
16 // built-in support for checksums to verify data integrity and an in-memory
17 // cache for fast read/writes.
18 //
19 // If the file is corrupted/in an invalid state, all contents are lost, i.e.
20 // there is no clear recovery path other than recreating/repopulating the
21 // contents.
22 //
23 // Note on Performance:
24 // The class keeps the vector in a mmapped area. This allows users to specify
25 // which MemoryMappedFile::Strategy they wish to use with this class. The vector
26 // will implicitly grow when the user tries to access an element beyond its
27 // current size. Growing happens in 16KiB chunks, up to a maximum size of 1MiB.
28 //
29 // Note on Checksumming:
30 // Checksumming happens lazily. We do tail checksums to avoid recalculating the
31 // checksum of the entire file on each modfification. A full checksum will be
32 // computed/verified at creation time, when persisting to disk, or whenever the
33 // user manually calls ComputeChecksum(). A separate header checksum is kept for
34 // a quick integrity check.
35 //
36 //
37 // Usage:
38 // RETURN_OR_ASSIGN(auto vector, FileBackedVector<char>::Create(...));
39 //
40 // ICING_RETURN_IF_ERROR(vector->Set(0, 'a'));
41 // ICING_RETURN_IF_ERROR(vector->Set(1, 'b'));
42 // ICING_RETURN_IF_ERROR(vector->Set(2, 'c'));
43 //
44 // vector->num_elements();  // Returns 3
45 //
46 // vector->At(2);  // Returns 'c'
47 //
48 // vector->TruncateTo(1);
49 // vector->num_elements();  // Returns 1
50 // vector->At(0);  // Returns 'a'
51 //
52 // vector->ComputeChecksum();  // Force a checksum update and gets the checksum
53 //
54 // vector->PersistToDisk();  // Persist contents to disk.
55 
56 #ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
57 #define ICING_FILE_FILE_BACKED_VECTOR_H_
58 
59 #include <inttypes.h>
60 #include <stdint.h>
61 #include <sys/mman.h>
62 
63 #include <cstdint>
64 #include <memory>
65 #include <string>
66 #include <utility>
67 #include <vector>
68 
69 #include "icing/text_classifier/lib3/utils/base/status.h"
70 #include "icing/text_classifier/lib3/utils/base/statusor.h"
71 #include "icing/absl_ports/canonical_errors.h"
72 #include "icing/absl_ports/str_cat.h"
73 #include "icing/file/filesystem.h"
74 #include "icing/file/memory-mapped-file.h"
75 #include "icing/legacy/core/icing-string-util.h"
76 #include "icing/util/crc32.h"
77 #include "icing/util/logging.h"
78 #include "icing/util/math-util.h"
79 #include "icing/util/status-macros.h"
80 
81 namespace icing {
82 namespace lib {
83 
84 template <typename T>
85 class FileBackedVector {
86  public:
87   // Header stored at the beginning of the file before the rest of the vector
88   // elements. Stores metadata on the vector.
89   struct Header {
90     // Static assert constants.
91     static constexpr int32_t kHeaderSize = 24;
92     static constexpr int32_t kHeaderChecksumOffset = 16;
93 
94     static constexpr int32_t kMagic = 0x8bbbe237;
95 
96     // Holds the magic as quick sanity check against file corruption
97     int32_t magic;
98 
99     // Byte size of each element in the vector
100     int32_t element_size;
101 
102     // Number of elements currently in the vector
103     int32_t num_elements;
104 
105     // Checksum of the vector elements, doesn't include the header fields.
106     //
107     // TODO(cassiewang): Add a checksum state that can track if the checksum is
108     // fresh or stale. This lets us short circuit checksum computations if we
109     // know the checksum is fresh.
110     uint32_t vector_checksum;
111 
112     // Must be below all actual header content fields and above the padding
113     // field. Contains the crc checksum of the preceding fields.
114     uint32_t header_checksum;
115 
116     // This field has no actual meaning here but is just used as padding for the
117     // struct so the size of the struct can be a multiple of 8. Doing this makes
118     // the address right after the header a multiple of 8 and prevents a ubsan
119     // misalign-pointer-use error (go/ubsan).
120     //
121     // NOTE: please remove this when adding new fields and re-assert that the
122     // size is multiple of 8.
123     int32_t padding_for_ptr_alignment;
124 
CalculateHeaderChecksumHeader125     uint32_t CalculateHeaderChecksum() const {
126       // Sanity check that the memory layout matches the disk layout.
127       static_assert(std::is_standard_layout<FileBackedVector::Header>::value,
128                     "");
129       static_assert(sizeof(FileBackedVector::Header) == kHeaderSize, "");
130       static_assert(
131           sizeof(FileBackedVector::Header) % sizeof(void*) == 0,
132           "Header has insufficient padding for void* pointer alignment");
133       static_assert(offsetof(FileBackedVector::Header, header_checksum) ==
134                         kHeaderChecksumOffset,
135                     "");
136 
137       Crc32 crc;
138       std::string_view header_str(
139           reinterpret_cast<const char*>(this),
140           offsetof(FileBackedVector::Header, header_checksum));
141       crc.Append(header_str);
142       return crc.Get();
143     }
144   };
145 
146   // Creates a new FileBackedVector to read/write content to.
147   //
148   // filesystem: Object to make system level calls
149   // file_path : Specifies the file to persist the vector to; must be a path
150   //             within a directory that already exists.
151   // mmap_strategy : Strategy/optimizations to access the content in the vector,
152   //                 see MemoryMappedFile::Strategy for more details
153   //
154   // Return:
155   //   FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
156   //                             checksum.
157   //   INTERNAL_ERROR on I/O errors.
158   //   UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
159   static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
160   Create(const Filesystem& filesystem, const std::string& file_path,
161          MemoryMappedFile::Strategy mmap_strategy);
162 
163   // Deletes the FileBackedVector
164   //
165   // filesystem: Object to make system level calls
166   // file_path : Specifies the file the vector is persisted to.
167   static libtextclassifier3::Status Delete(const Filesystem& filesystem,
168                                            const std::string& file_path);
169 
170   // Not copyable
171   FileBackedVector(const FileBackedVector&) = delete;
172   FileBackedVector& operator=(const FileBackedVector&) = delete;
173 
174   // If the vector was created with
175   // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
176   // synced by the system and the checksum will be updated.
177   ~FileBackedVector();
178 
179   // Gets a copy of the element at idx.
180   //
181   // This is useful if you think the FileBackedVector may grow before you need
182   // to access this return value. When the FileBackedVector grows, the
183   // underlying mmap will be unmapped and remapped, which will invalidate any
184   // pointers to the previously mapped region. Getting a copy will avoid
185   // referencing the now-invalidated region.
186   //
187   // Returns:
188   //   OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
189   libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
190 
191   // Gets a pointer to the element at idx.
192   //
193   // WARNING: Subsequent calls to Set may invalidate the pointer returned by
194   // Get.
195   //
196   // This is useful if you do not think the FileBackedVector will grow before
197   // you need to reference this value, and you want to avoid a copy. When the
198   // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
199   // which will invalidate this pointer to the previously mapped region.
200   //
201   // Returns:
202   //   OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
203   libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
204 
205   // Writes the value at idx.
206   //
207   // May grow the underlying file and mmapped region as needed to fit the new
208   // value. If it does grow, then any pointers to previous values returned
209   // from Get() may be invalidated.
210   //
211   // Returns:
212   //   OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
213   libtextclassifier3::Status Set(int32_t idx, const T& value);
214 
215   // Resizes to first len elements. The crc is cleared on truncation and will be
216   // updated on destruction, or once the client calls ComputeChecksum() or
217   // PersistToDisk().
218   //
219   // Returns:
220   //   OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
221   libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
222 
223   // Flushes content to underlying file.
224   //
225   // Returns:
226   //   OK on success
227   //   INTERNAL on I/O error
228   libtextclassifier3::Status PersistToDisk();
229 
230   // Calculates and returns the disk usage in bytes. Rounds up to the nearest
231   // block size.
232   //
233   // Returns:
234   //   Disk usage on success
235   //   INTERNAL_ERROR on IO error
236   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
237 
238   // Returns the file size of the all the elements held in the vector. File size
239   // is in bytes. This excludes the size of any internal metadata of the vector,
240   // e.g. the vector's header.
241   //
242   // Returns:
243   //   File size on success
244   //   INTERNAL_ERROR on IO error
245   libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
246 
247   // Accessors.
array()248   const T* array() const {
249     return reinterpret_cast<const T*>(mmapped_file_->region());
250   }
251 
mutable_array()252   T* mutable_array() const {
253     return reinterpret_cast<T*>(mmapped_file_->mutable_region());
254   }
255 
num_elements()256   int32_t num_elements() const { return header_->num_elements; }
257 
258   // Updates checksum of the vector contents and returns it.
259   //
260   // Returns:
261   //   INTERNAL_ERROR if the vector's internal state is inconsistent
262   libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
263 
264  private:
265   // We track partial updates to the array for crc updating. This
266   // requires extra memory to keep track of original buffers but
267   // allows for much faster crc re-computation. This is the frac limit
268   // of byte len after which we will discard recorded changes and
269   // recompute the entire crc instead.
270   static constexpr int32_t kPartialCrcLimitDiv = 8;  // limit is 1/8th
271 
272   // Grow file by at least this many elements if array is growable.
273   static constexpr int64_t kGrowElements = 1u << 14;  // 16K
274 
275   // Max number of elements that can be held by the vector.
276   static constexpr int64_t kMaxNumElements = 1u << 20;  // 1M
277 
278   // Can only be created through the factory ::Create function
279   FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
280                    std::unique_ptr<Header> header,
281                    std::unique_ptr<MemoryMappedFile> mmapped_file);
282 
283   // Initialize a new FileBackedVector, and create the file.
284   static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
285   InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
286                     ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
287 
288   // Initialize a FileBackedVector from an existing file.
289   static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
290   InitializeExistingFile(const Filesystem& filesystem,
291                          const std::string& file_path, ScopedFd fd,
292                          MemoryMappedFile::Strategy mmap_strategy);
293 
294   // Grows the underlying file to hold at least num_elements
295   //
296   // Returns:
297   //   OUT_OF_RANGE_ERROR if we can't grow to the specified size
298   libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
299 
300   // Cached constructor params.
301   const Filesystem* const filesystem_;
302   const std::string file_path_;
303   std::unique_ptr<Header> header_;
304   std::unique_ptr<MemoryMappedFile> mmapped_file_;
305 
306   // Offset before which all the elements have been included in the calculation
307   // of crc at the time it was calculated.
308   int32_t changes_end_ = 0;
309 
310   // Offset of changes that have happened since the last crc update between [0,
311   // changes_end_).
312   std::vector<int32_t> changes_;
313 
314   // Buffer of the original elements that have been changed since the last crc
315   // update. Will be cleared if the size grows too big.
316   std::string saved_original_buffer_;
317 
318   // Keep track of all pages we touched so we can write them back to
319   // disk.
320   std::vector<bool> dirty_pages_;
321 };
322 
323 template <typename T>
324 constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
325 
326 template <typename T>
327 constexpr int64_t FileBackedVector<T>::kGrowElements;
328 
329 template <typename T>
330 constexpr int64_t FileBackedVector<T>::kMaxNumElements;
331 
332 template <typename T>
333 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem & filesystem,const std::string & file_path,MemoryMappedFile::Strategy mmap_strategy)334 FileBackedVector<T>::Create(const Filesystem& filesystem,
335                             const std::string& file_path,
336                             MemoryMappedFile::Strategy mmap_strategy) {
337   if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
338     // FileBackedVector's behavior of growing the file underneath the mmap is
339     // inherently broken with MAP_PRIVATE. Growing the vector requires extending
340     // the file size, then unmapping and then re-mmapping over the new, larger
341     // file. But when we unmap, we lose all the vector's contents if they
342     // weren't manually persisted. Either don't allow READ_WRITE_MANUAL_SYNC
343     // vectors from growing, or make users aware of this somehow
344     return absl_ports::UnimplementedError(
345         "FileBackedVector currently doesn't support READ_WRITE_MANUAL_SYNC "
346         "mmap strategy.");
347   }
348 
349   ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
350   if (!fd.is_valid()) {
351     return absl_ports::InternalError(
352         absl_ports::StrCat("Failed to open ", file_path));
353   }
354 
355   int64_t file_size = filesystem.GetFileSize(file_path.c_str());
356   if (file_size == Filesystem::kBadFileSize) {
357     return absl_ports::InternalError(
358         absl_ports::StrCat("Bad file size for file ", file_path));
359   }
360 
361   const bool new_file = file_size == 0;
362   if (new_file) {
363     return InitializeNewFile(filesystem, file_path, std::move(fd),
364                              mmap_strategy);
365   }
366   return InitializeExistingFile(filesystem, file_path, std::move(fd),
367                                 mmap_strategy);
368 }
369 
370 template <typename T>
371 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem & filesystem,const std::string & file_path,ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy)372 FileBackedVector<T>::InitializeNewFile(
373     const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
374     MemoryMappedFile::Strategy mmap_strategy) {
375   // Create header.
376   auto header = std::make_unique<Header>();
377   header->magic = FileBackedVector<T>::Header::kMagic;
378   header->element_size = sizeof(T);
379   header->header_checksum = header->CalculateHeaderChecksum();
380 
381   // We use Write() here, instead of writing through the mmapped region
382   // created below, so we can gracefully handle errors that occur when the
383   // disk is full. See b/77309668 for details.
384   if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
385                          sizeof(Header))) {
386     return absl_ports::InternalError("Failed to write header");
387   }
388 
389   // Constructor of MemoryMappedFile doesn't actually call mmap(), mmap()
390   // happens on MemoryMappedFile::Remap(). So having a potentially unflushed fd
391   // at this point shouldn't run into issues with a mmap of the same file. But
392   // we'll close the fd just in case.
393   fd.reset();
394   auto mmapped_file =
395       std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
396 
397   return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
398       filesystem, file_path, std::move(header), std::move(mmapped_file)));
399 }
400 
401 template <typename T>
402 libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem & filesystem,const std::string & file_path,const ScopedFd fd,MemoryMappedFile::Strategy mmap_strategy)403 FileBackedVector<T>::InitializeExistingFile(
404     const Filesystem& filesystem, const std::string& file_path,
405     const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
406   int64_t file_size = filesystem.GetFileSize(file_path.c_str());
407   if (file_size < sizeof(FileBackedVector<T>::Header)) {
408     return absl_ports::InternalError(
409         absl_ports::StrCat("File header too short for ", file_path));
410   }
411 
412   auto header = std::make_unique<Header>();
413   if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
414                         /*offset=*/0)) {
415     return absl_ports::InternalError(
416         absl_ports::StrCat("Failed to read header of ", file_path));
417   }
418 
419   // Make sure the header is still valid before we use any of its values. This
420   // should technically be included in the header_checksum check below, but this
421   // is a quick/fast check that can save us from an extra crc computation.
422   if (header->kMagic != FileBackedVector<T>::Header::kMagic) {
423     return absl_ports::InternalError(
424         absl_ports::StrCat("Invalid header kMagic for ", file_path));
425   }
426 
427   // Check header
428   if (header->header_checksum != header->CalculateHeaderChecksum()) {
429     return absl_ports::FailedPreconditionError(
430         absl_ports::StrCat("Invalid header crc for ", file_path));
431   }
432 
433   if (header->element_size != sizeof(T)) {
434     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
435         "Inconsistent element size, expected %zd, actual %d", sizeof(T),
436         header->element_size));
437   }
438 
439   int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header);
440   if (min_file_size > file_size) {
441     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
442         "Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
443         min_file_size, file_size));
444   }
445 
446   // Mmap the content of the vector, excluding the header so its easier to
447   // access elements from the mmapped region
448   auto mmapped_file =
449       std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
450   ICING_RETURN_IF_ERROR(
451       mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
452 
453   // Check vector contents
454   Crc32 vector_checksum;
455   std::string_view vector_contents(
456       reinterpret_cast<const char*>(mmapped_file->region()),
457       header->num_elements * sizeof(T));
458   vector_checksum.Append(vector_contents);
459 
460   if (vector_checksum.Get() != header->vector_checksum) {
461     return absl_ports::FailedPreconditionError(
462         absl_ports::StrCat("Invalid vector contents for ", file_path));
463   }
464 
465   return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
466       filesystem, file_path, std::move(header), std::move(mmapped_file)));
467 }
468 
469 template <typename T>
Delete(const Filesystem & filesystem,const std::string & file_path)470 libtextclassifier3::Status FileBackedVector<T>::Delete(
471     const Filesystem& filesystem, const std::string& file_path) {
472   if (!filesystem.DeleteFile(file_path.c_str())) {
473     return absl_ports::InternalError(
474         absl_ports::StrCat("Failed to delete file: ", file_path));
475   }
476   return libtextclassifier3::Status::OK;
477 }
478 
479 template <typename T>
FileBackedVector(const Filesystem & filesystem,const std::string & file_path,std::unique_ptr<Header> header,std::unique_ptr<MemoryMappedFile> mmapped_file)480 FileBackedVector<T>::FileBackedVector(
481     const Filesystem& filesystem, const std::string& file_path,
482     std::unique_ptr<Header> header,
483     std::unique_ptr<MemoryMappedFile> mmapped_file)
484     : filesystem_(&filesystem),
485       file_path_(file_path),
486       header_(std::move(header)),
487       mmapped_file_(std::move(mmapped_file)),
488       changes_end_(header_->num_elements) {}
489 
490 template <typename T>
~FileBackedVector()491 FileBackedVector<T>::~FileBackedVector() {
492   if (mmapped_file_->strategy() ==
493       MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
494     if (!PersistToDisk().ok()) {
495       ICING_LOG(WARNING)
496           << "Failed to persist vector to disk while destructing "
497           << file_path_;
498     }
499   }
500 }
501 
502 template <typename T>
GetCopy(int32_t idx)503 libtextclassifier3::StatusOr<T> FileBackedVector<T>::GetCopy(
504     int32_t idx) const {
505   ICING_ASSIGN_OR_RETURN(const T* value, Get(idx));
506   return *value;
507 }
508 
509 template <typename T>
Get(int32_t idx)510 libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
511     int32_t idx) const {
512   if (idx < 0) {
513     return absl_ports::OutOfRangeError(
514         IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
515   }
516 
517   if (idx >= header_->num_elements) {
518     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
519         "Index, %d, was greater than vector size, %d", idx,
520         header_->num_elements));
521   }
522 
523   return &array()[idx];
524 }
525 
526 template <typename T>
Set(int32_t idx,const T & value)527 libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
528                                                     const T& value) {
529   if (idx < 0) {
530     return absl_ports::OutOfRangeError(
531         IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
532   }
533 
534   ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
535 
536   if (idx + 1 > header_->num_elements) {
537     header_->num_elements = idx + 1;
538   }
539 
540   if (mutable_array()[idx] == value) {
541     // No need to update
542     return libtextclassifier3::Status::OK;
543   }
544 
545   // Cache original value to update crcs.
546   if (idx < changes_end_) {
547     // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
548     // revert to full CRC.
549     if ((saved_original_buffer_.size() + sizeof(T)) *
550             FileBackedVector<T>::kPartialCrcLimitDiv >
551         changes_end_ * sizeof(T)) {
552       ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
553       changes_.clear();
554       saved_original_buffer_.clear();
555       changes_end_ = 0;
556       header_->vector_checksum = 0;
557     } else {
558       int32_t start_byte = idx * sizeof(T);
559 
560       changes_.push_back(idx);
561       saved_original_buffer_.append(
562           reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
563           sizeof(T));
564     }
565   }
566 
567   mutable_array()[idx] = value;
568   return libtextclassifier3::Status::OK;
569 }
570 
571 template <typename T>
GrowIfNecessary(int32_t num_elements)572 libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
573     int32_t num_elements) {
574   if (sizeof(T) == 0) {
575     // Growing is a no-op
576     return libtextclassifier3::Status::OK;
577   }
578 
579   if (num_elements <= header_->num_elements) {
580     return libtextclassifier3::Status::OK;
581   }
582 
583   if (num_elements > FileBackedVector<T>::kMaxNumElements) {
584     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
585         "%d exceeds maximum number of elements allowed, %lld", num_elements,
586         static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
587   }
588 
589   int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
590   int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
591 
592   if (least_file_size_needed <= current_file_size) {
593     // Our underlying file can hold the target num_elements cause we've grown
594     // before
595     return libtextclassifier3::Status::OK;
596   }
597 
598   // Otherwise, we need to grow. Grow to kGrowElements boundary.
599   least_file_size_needed = math_util::RoundUpTo(
600       least_file_size_needed,
601       int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
602 
603   // We use PWrite here rather than Grow because Grow doesn't actually allocate
604   // an underlying disk block. This can lead to problems with mmap because mmap
605   // has no effective way to signal that it was impossible to allocate the disk
606   // block and ends up crashing instead. PWrite will force the allocation of
607   // these blocks, which will ensure that any failure to grow will surface here.
608   int64_t page_size = getpagesize();
609   auto buf = std::make_unique<uint8_t[]>(page_size);
610   int64_t size_to_write = page_size - (current_file_size % page_size);
611   ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str()));
612   while (current_file_size < least_file_size_needed) {
613     if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(),
614                              size_to_write)) {
615       return absl_ports::InternalError(
616           absl_ports::StrCat("Couldn't grow file ", file_path_));
617     }
618     current_file_size += size_to_write;
619     size_to_write = page_size - (current_file_size % page_size);
620   }
621 
622   ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
623       sizeof(Header), least_file_size_needed - sizeof(Header)));
624 
625   return libtextclassifier3::Status::OK;
626 }
627 
628 template <typename T>
TruncateTo(int32_t new_num_elements)629 libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
630     int32_t new_num_elements) {
631   if (new_num_elements < 0) {
632     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
633         "Truncated length %d must be >= 0", new_num_elements));
634   }
635 
636   if (new_num_elements >= header_->num_elements) {
637     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
638         "Truncated length %d must be less than the current size %d",
639         new_num_elements, header_->num_elements));
640   }
641 
642   ICING_VLOG(2)
643       << "FileBackedVector truncating, need to recalculate entire checksum";
644   changes_.clear();
645   saved_original_buffer_.clear();
646   changes_end_ = 0;
647   header_->vector_checksum = 0;
648 
649   header_->num_elements = new_num_elements;
650   return libtextclassifier3::Status::OK;
651 }
652 
653 template <typename T>
ComputeChecksum()654 libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
655   // First apply the modified area. Keep a bitmap of already updated
656   // regions so we don't double-update.
657   std::vector<bool> updated(changes_end_);
658   uint32_t cur_offset = 0;
659   Crc32 cur_crc(header_->vector_checksum);
660   int num_partial_crcs = 0;
661   int num_truncated = 0;
662   int num_overlapped = 0;
663   int num_duplicate = 0;
664   for (size_t i = 0; i < changes_.size(); i++) {
665     const int32_t change_offset = changes_[i];
666     if (change_offset > changes_end_) {
667       return absl_ports::InternalError(IcingStringUtil::StringPrintf(
668           "Failed to update crc, change offset %d, changes_end_ %d",
669           change_offset, changes_end_));
670     }
671 
672     // Skip truncated tracked changes.
673     if (change_offset >= header_->num_elements) {
674       ++num_truncated;
675       continue;
676     }
677 
678     // Turn change buffer into change^original.
679     const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
680     const char* cur_array =
681         reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
682     // Now xor in. SSE acceleration please?
683     for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
684          cur++, cur_array++) {
685       *cur ^= *cur_array;
686     }
687 
688     // Skip over already updated bytes by setting update to 0.
689     bool new_update = false;
690     bool overlap = false;
691     uint32_t cur_element = change_offset;
692     for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
693          cur_element++, cur += sizeof(T)) {
694       if (updated[cur_element]) {
695         memset(cur, 0, sizeof(T));
696         overlap = true;
697       } else {
698         updated[cur_element] = true;
699         new_update = true;
700       }
701     }
702 
703     // Apply update to crc.
704     if (new_update) {
705       // Explicitly create the string_view with length
706       std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
707       if (!cur_crc
708                .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
709                               change_offset * sizeof(T))
710                .ok()) {
711         return absl_ports::InternalError(IcingStringUtil::StringPrintf(
712             "Failed to update crc, change offset %d, change "
713             "length %zd changes_end_ %d",
714             change_offset, xored_str.length(), changes_end_));
715       }
716       num_partial_crcs++;
717       if (overlap) {
718         num_overlapped++;
719       }
720     } else {
721       num_duplicate++;
722     }
723     cur_offset += sizeof(T);
724   }
725 
726   if (!changes_.empty()) {
727     ICING_VLOG(2) << IcingStringUtil::StringPrintf(
728         "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
729         num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
730   }
731 
732   // Now update with grown area.
733   if (changes_end_ < header_->num_elements) {
734     // Explicitly create the string_view with length
735     std::string_view update_str(
736         reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
737         (header_->num_elements - changes_end_) * sizeof(T));
738     cur_crc.Append(update_str);
739     ICING_VLOG(2) << IcingStringUtil::StringPrintf(
740         "Array update tail crc offset %d -> %d", changes_end_,
741         header_->num_elements);
742   }
743 
744   // Clear, now that we've applied changes.
745   changes_.clear();
746   saved_original_buffer_.clear();
747   changes_end_ = header_->num_elements;
748 
749   // Commit new crc.
750   header_->vector_checksum = cur_crc.Get();
751   return cur_crc;
752 }
753 
754 template <typename T>
PersistToDisk()755 libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
756   // Update and write the header
757   ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
758   header_->vector_checksum = checksum.Get();
759   header_->header_checksum = header_->CalculateHeaderChecksum();
760 
761   if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
762                            sizeof(Header))) {
763     return absl_ports::InternalError("Failed to sync header");
764   }
765 
766   MemoryMappedFile::Strategy strategy = mmapped_file_->strategy();
767 
768   if (strategy == MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
769     // Changes should have been applied to the underlying file, but call msync()
770     // as an extra safety step to ensure they are written out.
771     ICING_RETURN_IF_ERROR(mmapped_file_->PersistToDisk());
772   }
773 
774   return libtextclassifier3::Status::OK;
775 }
776 
777 template <typename T>
GetDiskUsage()778 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage()
779     const {
780   int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
781   if (size == Filesystem::kBadFileSize) {
782     return absl_ports::InternalError(
783         "Failed to get disk usage of file-backed vector");
784   }
785   return size;
786 }
787 
788 template <typename T>
GetElementsFileSize()789 libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
790     const {
791   int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
792   if (total_file_size == Filesystem::kBadFileSize) {
793     return absl_ports::InternalError(
794         "Failed to get file size of elements in the file-backed vector");
795   }
796   return total_file_size - sizeof(Header);
797 }
798 
799 }  // namespace lib
800 }  // namespace icing
801 
802 #endif  // ICING_FILE_FILE_BACKED_VECTOR_H_
803