• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a PortableFileBackedProtoLog of the same
18 // file at a time; using multiple instances at the same time may lead to
19 // undefined behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
// Each proto written to the file is preceded by a small metadata word. An
// entry in the log therefore consists of
//   {
//     1 byte of kProtoMagic;
//     3 bytes of the proto size
//     n bytes of the proto itself
//   }
34 //
35 // All metadata is written in a portable format, encoded with htonl before
36 // writing to file and decoded with ntohl when reading from file.
37 //
38 // Example usage:
39 //   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
40 //       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
41 //       file_path_,
42 //                                                  options));
//   auto proto_log = std::move(create_result.proto_log);
//
//   Document document;
//   document.set_namespace("com.google.android.example");
//   document.set_uri("www.google.com");
//
//   int64_t document_offset = proto_log->WriteProto(document);
//   Document same_document = proto_log->ReadProto(document_offset);
//   proto_log->PersistToDisk();
52 
53 #ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
54 #define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
55 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
64 
65 #include "icing/text_classifier/lib3/utils/base/status.h"
66 #include "icing/text_classifier/lib3/utils/base/statusor.h"
67 #include <google/protobuf/io/gzip_stream.h>
68 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
69 #include "icing/absl_ports/canonical_errors.h"
70 #include "icing/absl_ports/str_cat.h"
71 #include "icing/file/filesystem.h"
72 #include "icing/file/memory-mapped-file.h"
73 #include "icing/legacy/core/icing-string-util.h"
74 #include "icing/portable/endian.h"
75 #include "icing/portable/platform.h"
76 #include "icing/portable/zlib.h"
77 #include "icing/util/bit-util.h"
78 #include "icing/util/crc32.h"
79 #include "icing/util/data-loss.h"
80 #include "icing/util/logging.h"
81 #include "icing/util/status-macros.h"
82 
83 namespace icing {
84 namespace lib {
85 
86 template <typename ProtoT>
87 class PortableFileBackedProtoLog {
88  public:
89   struct Options {
90     // Whether to compress each proto before writing to the proto log.
91     bool compress;
92 
93     // Byte-size limit for each proto written to the store. This does not
94     // include the bytes needed for the metadata of each proto.
95     //
96     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
97     // size in 3 bytes within the metadata.
98     //
99     // NOTE: This limit is only enforced for future writes. If the store
100     // previously had a higher limit, then reading older entries could return
101     // larger protos.
102     //
103     // NOTE: The max_proto_size is the upper limit for input protos into the
104     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
105     // to a smaller size, ProtoLog will not accept it. Protos that result in a
106     // compressed size larger than max_proto_size are also not accepted.
107     const int32_t max_proto_size;
108 
109     // Must specify values for options.
110     Options() = delete;
111     explicit Options(bool compress_in,
112                      const int32_t max_proto_size_in = kMaxProtoSize)
compressOptions113         : compress(compress_in), max_proto_size(max_proto_size_in) {}
114   };
115 
  // Number of bytes we reserve for the header at the beginning of the proto
  // log. We reserve this so the header can grow without running into the
  // contents of the proto log, triggering an unnecessary migration of the data.
119   static constexpr int kHeaderReservedBytes = 256;
120 
121   // Header stored at the beginning of the file before the rest of the log
122   // contents. Stores metadata on the log.
123   class Header {
124    public:
125     static constexpr int32_t kMagic = 0xf4c6f67a;
126 
127     static constexpr int32_t kFileFormatVersion = 0;
128 
CalculateHeaderChecksum()129     uint32_t CalculateHeaderChecksum() const {
130       Crc32 crc;
131 
132       // Get a string_view of all the fields of the Header, excluding the
133       // magic_nbytes_ and header_checksum_nbytes_
134       std::string_view header_str(
135           reinterpret_cast<const char*>(this) +
136               offsetof(Header, header_checksum_nbytes_) +
137               sizeof(header_checksum_nbytes_),
138           sizeof(Header) - sizeof(magic_nbytes_) -
139               sizeof(header_checksum_nbytes_));
140       crc.Append(header_str);
141       return crc.Get();
142     }
143 
GetMagic()144     int32_t GetMagic() const { return gntohl(magic_nbytes_); }
145 
SetMagic(int32_t magic_in)146     void SetMagic(int32_t magic_in) { magic_nbytes_ = ghtonl(magic_in); }
147 
GetFileFormatVersion()148     int32_t GetFileFormatVersion() const {
149       return gntohl(file_format_version_nbytes_);
150     }
151 
SetFileFormatVersion(int32_t file_format_version_in)152     void SetFileFormatVersion(int32_t file_format_version_in) {
153       file_format_version_nbytes_ = ghtonl(file_format_version_in);
154     }
155 
GetMaxProtoSize()156     int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes_); }
157 
SetMaxProtoSize(int32_t max_proto_size_in)158     void SetMaxProtoSize(int32_t max_proto_size_in) {
159       max_proto_size_nbytes_ = ghtonl(max_proto_size_in);
160     }
161 
GetLogChecksum()162     int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes_); }
163 
SetLogChecksum(int32_t log_checksum_in)164     void SetLogChecksum(int32_t log_checksum_in) {
165       log_checksum_nbytes_ = ghtonl(log_checksum_in);
166     }
167 
GetRewindOffset()168     int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes_); }
169 
SetRewindOffset(int64_t rewind_offset_in)170     void SetRewindOffset(int64_t rewind_offset_in) {
171       rewind_offset_nbytes_ = ghtonll(rewind_offset_in);
172     }
173 
GetHeaderChecksum()174     int32_t GetHeaderChecksum() const {
175       return gntohl(header_checksum_nbytes_);
176     }
177 
SetHeaderChecksum(int32_t header_checksum_in)178     void SetHeaderChecksum(int32_t header_checksum_in) {
179       header_checksum_nbytes_ = ghtonl(header_checksum_in);
180     }
181 
GetCompressFlag()182     bool GetCompressFlag() const { return GetFlag(kCompressBit); }
183 
SetCompressFlag(bool compress)184     void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
185 
GetDirtyFlag()186     bool GetDirtyFlag() { return GetFlag(kDirtyBit); }
187 
SetDirtyFlag(bool dirty)188     void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
189 
190    private:
191     // The least-significant bit offset at which the compress flag is stored in
192     // 'flags_nbytes_'. Represents whether the protos in the log are compressed
193     // or not.
194     static constexpr int32_t kCompressBit = 0;
195 
196     // The least-significant bit offset at which the dirty flag is stored in
197     // 'flags'. Represents whether the checksummed portion of the log has been
198     // modified after the last checksum was computed.
199     static constexpr int32_t kDirtyBit = 1;
200 
GetFlag(int offset)201     bool GetFlag(int offset) const {
202       return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
203     }
204 
SetFlag(int offset,bool value)205     void SetFlag(int offset, bool value) {
206       bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
207     }
208 
209     // Holds the magic as a quick sanity check against file corruption.
210     //
211     // Field is in network-byte order.
212     int32_t magic_nbytes_ = ghtonl(kMagic);
213 
214     // Must be at the beginning after kMagic. Contains the crc checksum of
215     // the following fields.
216     //
217     // Field is in network-byte order.
218     uint32_t header_checksum_nbytes_ = 0;
219 
220     // Last known good offset at which the log and its checksum were updated.
221     // If we crash between writing to the log and updating the checksum, we can
222     // try to rewind the log to this offset and verify the checksum is still
223     // valid instead of throwing away the entire log.
224     //
225     // Field is in network-byte order.
226     int64_t rewind_offset_nbytes_ = ghtonll(kHeaderReservedBytes);
227 
228     // Version number tracking how we serialize the file to disk. If we change
229     // how/what we write to disk, this version should be updated and this class
230     // should handle a migration.
231     //
232     // Currently at kFileFormatVersion.
233     //
234     // Field is in network-byte order.
235     int32_t file_format_version_nbytes_ = 0;
236 
237     // The maximum proto size that can be written to the log.
238     //
239     // Field is in network-byte order.
240     int32_t max_proto_size_nbytes_ = 0;
241 
242     // Checksum of the log elements, doesn't include the header fields.
243     //
244     // Field is in network-byte order.
245     uint32_t log_checksum_nbytes_ = 0;
246 
247     // Bits are used to hold various flags.
248     //   Lowest bit is whether the protos are compressed or not.
249     //
250     // Field is only 1 byte, so is byte-order agnostic.
251     uint8_t flags_ = 0;
252 
253     // NOTE: New fields should *almost always* be added to the end here. Since
254     // this class may have already been written to disk, appending fields
255     // increases the chances that changes are backwards-compatible.
256   };
257   static_assert(sizeof(Header) <= kHeaderReservedBytes,
258                 "Header has grown past our reserved bytes!");
259 
260   struct CreateResult {
261     // A successfully initialized log.
262     std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
263 
264     // The data status after initializing from a previous state. Data loss can
265     // happen if the file is corrupted or some previously added data was
266     // unpersisted. This may be used to signal that any derived data off of the
267     // proto log may need to be regenerated.
268     DataLoss data_loss = DataLoss::NONE;
269 
270     // Whether the proto log had to recalculate the checksum to check its
271     // integrity. This can be avoided if no changes were made or the log was
272     // able to update its checksum before shutting down. But it may have to
273     // recalculate if it's unclear if we crashed after updating the log, but
274     // before updating our checksum.
275     bool recalculated_checksum = false;
276 
has_data_lossCreateResult277     bool has_data_loss() {
278       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
279     }
280   };
281 
282   // Factory method to create, initialize, and return a
283   // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
284   //
285   // If on re-initialization the log detects disk corruption or some previously
286   // added data was unpersisted, the log will rewind to the last-good state. The
287   // log saves these checkpointed "good" states when PersistToDisk() is called
288   // or the log is safely destructed. If the log rewinds successfully to the
289   // last-good state, then the returned CreateResult.data_loss indicates
290   // whether it has a data loss and what kind of data loss it is (partial or
291   // complete) so that any derived data may know that it needs to be updated. If
292   // the log re-initializes successfully without any data loss,
293   // CreateResult.data_loss will be NONE.
294   //
295   // Params:
296   //   filesystem: Handles system level calls
297   //   file_path: Path of the underlying file. Directory of the file should
298   //   already exist
299   //   options: Configuration options for the proto log
300   //
301   // Returns:
302   //   PortableFileBackedProtoLog::CreateResult on success
303   //   INVALID_ARGUMENT on an invalid option
304   //   INTERNAL_ERROR on IO error
305   static libtextclassifier3::StatusOr<CreateResult> Create(
306       const Filesystem* filesystem, const std::string& file_path,
307       const Options& options);
308 
309   // Not copyable
310   PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
311   PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
312       delete;
313 
314   // This will update the checksum of the log as well.
315   ~PortableFileBackedProtoLog();
316 
317   // Writes the serialized proto to the underlying file. Writes are applied
318   // directly to the underlying file. Users do not need to sync the file after
319   // writing.
320   //
321   // Returns:
322   //   Offset of the newly appended proto in file on success
323   //   INVALID_ARGUMENT if proto is too large, as decided by
324   //     Options.max_proto_size
325   //   INTERNAL_ERROR on IO error
326   libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
327 
328   // Reads out a proto located at file_offset from the file.
329   //
330   // Returns:
331   //   A proto on success
332   //   NOT_FOUND if the proto at the given offset has been erased
333   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
334   //   INTERNAL_ERROR on IO error
335   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
336 
337   // Erases the data of a proto located at file_offset from the file.
338   //
339   // Returns:
340   //   OK on success
341   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
342   //   INTERNAL_ERROR on IO error
343   libtextclassifier3::Status EraseProto(int64_t file_offset);
344 
345   // Calculates and returns the disk usage in bytes. Rounds up to the nearest
346   // block size.
347   //
348   // Returns:
349   //   Disk usage on success
350   //   INTERNAL_ERROR on IO error
351   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
352 
353   // Returns the file size of all the elements held in the log. File size is in
354   // bytes. This excludes the size of any internal metadata of the log, e.g. the
355   // log's header.
356   //
357   // Returns:
358   //   File size on success
359   //   INTERNAL_ERROR on IO error
360   libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
361 
362   // An iterator helping to find offsets of all the protos in file.
363   // Example usage:
364   //
365   // while (iterator.Advance().ok()) {
366   //   int64_t offset = iterator.GetOffset();
367   //   // Do something
368   // }
369   class Iterator {
370    public:
371     Iterator(const Filesystem& filesystem, const std::string& file_path,
372              int64_t initial_offset);
373 
374     // Advances to the position of next proto whether it has been erased or not.
375     //
376     // Returns:
377     //   OK on success
378     //   OUT_OF_RANGE_ERROR if it reaches the end
379     //   INTERNAL_ERROR on IO error
380     libtextclassifier3::Status Advance();
381 
382     // Returns the file offset of current proto.
383     int64_t GetOffset();
384 
385    private:
386     static constexpr int64_t kInvalidOffset = -1;
387     // Used to read proto metadata
388     MemoryMappedFile mmapped_file_;
389     // Offset of first proto
390     int64_t initial_offset_;
391     int64_t current_offset_;
392     int64_t file_size_;
393   };
394 
395   // Returns an iterator of current proto log. The caller needs to keep the
396   // proto log unchanged while using the iterator, otherwise unexpected
397   // behaviors could happen.
398   Iterator GetIterator();
399 
400   // Persists all changes since initialization or the last call to
401   // PersistToDisk(). Any changes that aren't persisted may be lost if the
402   // system fails to close safely.
403   //
404   // Example use case:
405   //
406   //   Document document;
407   //   document.set_namespace("com.google.android.example");
408   //   document.set_uri("www.google.com");
409   //
410   //   {
411   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
412   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
413   //         file_path,
414   //                                                    options));
415   //     auto proto_log = std::move(create_result.proto_log);
416   //
417   //     int64_t document_offset = proto_log->WriteProto(document));
418   //
419   //     // We lose the document here since it wasn't persisted.
420   //     // *SYSTEM CRASH*
421   //   }
422   //
423   //   {
424   //     // Can still successfully create after a crash since the log can
425   //     // rewind/truncate to recover into a previously good state
426   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
427   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
428   //         file_path,
429   //                                                    options));
430   //     auto proto_log = std::move(create_result.proto_log);
431   //
432   //     // Lost the proto since we didn't PersistToDisk before the crash
  //     proto_log->ReadProto(document_offset);  // OUT_OF_RANGE error
434   //
435   //     int64_t document_offset = proto_log->WriteProto(document));
436   //
437   //     // Persisted this time, so we should be ok.
438   //     ICING_ASSERT_OK(proto_log->PersistToDisk());
439   //   }
440   //
441   //   {
442   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
443   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
444   //         file_path,
445   //                                                    options));
446   //     auto proto_log = std::move(create_result.proto_log);
447   //
448   //     // SUCCESS
449   //     Document same_document = proto_log->ReadProto(document_offset));
450   //   }
451   //
452   // NOTE: Since all protos are already written to the file directly, this
453   // just updates the checksum and rewind position. Without these updates,
454   // future initializations will truncate the file and discard unpersisted
455   // changes.
456   //
457   // Returns:
458   //   OK on success
459   //   INTERNAL_ERROR on IO error
460   libtextclassifier3::Status PersistToDisk();
461 
462   // Calculates the checksum of the log contents. Excludes the header content.
463   //
464   // Returns:
465   //   Crc of the log content
466   //   INTERNAL_ERROR on IO error
467   libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
468 
469  private:
470   // Object can only be instantiated via the ::Create factory.
471   PortableFileBackedProtoLog(const Filesystem* filesystem,
472                              const std::string& file_path,
473                              std::unique_ptr<Header> header);
474 
475   // Initializes a new proto log.
476   //
477   // Returns:
  //   CreateResult on success
479   //   INTERNAL_ERROR on IO error
480   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
481       const Filesystem* filesystem, const std::string& file_path,
482       const Options& options);
483 
484   // Verifies that the existing proto log is in a good state. If not in a good
485   // state, then the proto log may be truncated to the last good state and
486   // content will be lost.
487   //
488   // Returns:
  //   CreateResult on success
490   //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
491   //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
492   //     instances
493   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
494       const Filesystem* filesystem, const std::string& file_path,
495       const Options& options, int64_t file_size);
496 
497   // Takes an initial checksum and updates it with the content between `start`
498   // and `end` offsets in the file.
499   //
500   // Returns:
501   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
502   //   INTERNAL_ERROR on IO error
503   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
504   static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
505       const Filesystem* filesystem, const std::string& file_path,
506       Crc32 initial_crc, int64_t start, int64_t end);
507 
508   // Reads out the metadata of a proto located at file_offset from the file.
509   // Metadata will be returned in host byte order endianness.
510   //
511   // Returns:
512   //   Proto's metadata on success
513   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
514   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
515   static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
516       MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
517 
518   // Writes metadata of a proto to the fd. Takes in a host byte order endianness
519   // metadata and converts it into a portable metadata before writing.
520   //
521   // Returns:
522   //   OK on success
523   //   INTERNAL_ERROR on any IO errors
524   static libtextclassifier3::Status WriteProtoMetadata(
525       const Filesystem* filesystem, int fd, int32_t host_order_metadata);
526 
IsEmptyBuffer(const char * buffer,int size)527   static bool IsEmptyBuffer(const char* buffer, int size) {
528     return std::all_of(buffer, buffer + size,
529                        [](const char byte) { return byte == 0; });
530   }
531 
532   // Helper function to get stored proto size from the metadata.
533   // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)534   static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
535 
536   // Helper function to get stored proto magic from the metadata.
537   // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)538   static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
539 
540   // Magic number added in front of every proto. Used when reading out protos
541   // as a first check for corruption in each entry in the file. Even if there is
542   // a corruption, the best we can do is roll back to our last recovery point
543   // and throw away un-flushed data. We can discard/reuse this byte if needed so
544   // that we have 4 bytes to store the size of protos, and increase the size of
545   // protos we support.
546   static constexpr uint8_t kProtoMagic = 0x5C;
547 
548   // Our internal max for protos.
549   //
  // WARNING: Changing this to a larger number may invalidate our assumption
  // that the proto size can safely be stored in the last 3 bytes of the proto
  // header.
553   static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
554   static_assert(kMaxProtoSize <= 0x00FFFFFF,
555                 "kMaxProtoSize doesn't fit in 3 bytes");
556 
557   // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
558   static constexpr int kDeflateCompressionLevel = 3;
559 
560   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
561   // Only used on 32-bit devices
562   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
563 
564   ScopedFd fd_;
565   const Filesystem* const filesystem_;
566   const std::string file_path_;
567   std::unique_ptr<Header> header_;
568 };
569 
570 template <typename ProtoT>
571 constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic;
572 
573 template <typename ProtoT>
PortableFileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)574 PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
575     const Filesystem* filesystem, const std::string& file_path,
576     std::unique_ptr<Header> header)
577     : filesystem_(filesystem),
578       file_path_(file_path),
579       header_(std::move(header)) {
580   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
581 }
582 
583 template <typename ProtoT>
~PortableFileBackedProtoLog()584 PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
585   if (!PersistToDisk().ok()) {
586     ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
587                           "PortableFileBackedProtoLog: "
588                        << file_path_;
589   }
590 }
591 
592 template <typename ProtoT>
593 libtextclassifier3::StatusOr<
594     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)595 PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
596                                            const std::string& file_path,
597                                            const Options& options) {
598   if (options.max_proto_size <= 0) {
599     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
600         "options.max_proto_size must be greater than 0, was %d",
601         options.max_proto_size));
602   }
603 
604   // Since we store the proto_size in 3 bytes, we can only support protos of up
605   // to 16MiB.
606   if (options.max_proto_size > kMaxProtoSize) {
607     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
608         "options.max_proto_size must be under 16MiB, was %d",
609         options.max_proto_size));
610   }
611 
612   if (!filesystem->FileExists(file_path.c_str())) {
613     return InitializeNewFile(filesystem, file_path, options);
614   }
615 
616   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
617   if (file_size == Filesystem::kBadFileSize) {
618     return absl_ports::InternalError(
619         absl_ports::StrCat("Bad file size '", file_path, "'"));
620   }
621 
622   if (file_size == 0) {
623     return InitializeNewFile(filesystem, file_path, options);
624   }
625 
626   return InitializeExistingFile(filesystem, file_path, options, file_size);
627 }
628 
629 template <typename ProtoT>
630 libtextclassifier3::StatusOr<
631     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)632 PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
633     const Filesystem* filesystem, const std::string& file_path,
634     const Options& options) {
635   // Grow to the minimum reserved bytes for the header.
636   if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
637     return absl_ports::InternalError(
638         absl_ports::StrCat("Failed to initialize file size: ", file_path));
639   }
640 
641   // Create the header
642   std::unique_ptr<Header> header = std::make_unique<Header>();
643   header->SetCompressFlag(options.compress);
644   header->SetMaxProtoSize(options.max_proto_size);
645   header->SetHeaderChecksum(header->CalculateHeaderChecksum());
646 
647   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
648     return absl_ports::InternalError(
649         absl_ports::StrCat("Failed to write header for file: ", file_path));
650   }
651 
652   CreateResult create_result = {
653       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
654           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
655                                                  std::move(header))),
656       /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
657 
658   return create_result;
659 }
660 
661 template <typename ProtoT>
662 libtextclassifier3::StatusOr<
663     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)664 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
665     const Filesystem* filesystem, const std::string& file_path,
666     const Options& options, int64_t file_size) {
667   bool header_changed = false;
668   if (file_size < kHeaderReservedBytes) {
669     return absl_ports::InternalError(
670         absl_ports::StrCat("File header too short for: ", file_path));
671   }
672 
673   std::unique_ptr<Header> header = std::make_unique<Header>();
674   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
675                          /*offset=*/0)) {
676     return absl_ports::InternalError(
677         absl_ports::StrCat("Failed to read header for file: ", file_path));
678   }
679 
680   // Make sure the header is still valid before we use any of its values. This
681   // is covered by the header_checksum check below, but this is a quick check
682   // that can save us from an extra crc computation.
683   if (header->GetMagic() != Header::kMagic) {
684     return absl_ports::InternalError(
685         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
686   }
687 
688   if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
689     return absl_ports::InternalError(
690         absl_ports::StrCat("Invalid header checksum for: ", file_path));
691   }
692 
693   if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
694     // If this changes, we might need to handle a migration rather than throwing
695     // an error.
696     return absl_ports::InternalError(
697         absl_ports::StrCat("Invalid header file format version: ", file_path));
698   }
699 
700   if (header->GetCompressFlag() != options.compress) {
701     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
702         "Inconsistent compress option, expected %d, actual %d",
703         header->GetCompressFlag(), options.compress));
704   }
705 
706   int32_t existing_max_proto_size = header->GetMaxProtoSize();
707   if (existing_max_proto_size > options.max_proto_size) {
708     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
709         "Max proto size cannot be smaller than previous "
710         "instantiations, previous size %d, wanted size %d",
711         header->GetMaxProtoSize(), options.max_proto_size));
712   } else if (existing_max_proto_size < options.max_proto_size) {
713     // It's fine if our new max size is greater than our previous one. Existing
714     // data is still valid.
715     header->SetMaxProtoSize(options.max_proto_size);
716     header_changed = true;
717   }
718 
719   DataLoss data_loss = DataLoss::NONE;
720 
721   // If we have any documents in our tail, get rid of them since they're not in
722   // our checksum. Our checksum reflects content up to the rewind offset.
723   if (file_size > header->GetRewindOffset()) {
724     if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
725       return absl_ports::InternalError(IcingStringUtil::StringPrintf(
726           "Failed to truncate '%s' to size %lld", file_path.data(),
727           static_cast<long long>(header->GetRewindOffset())));
728     };
729     data_loss = DataLoss::PARTIAL;
730   }
731 
732   bool recalculated_checksum = false;
733 
734   // If our dirty flag is set, that means we might have crashed in the middle of
735   // erasing a proto. This could have happened anywhere between:
736   //   A. Set dirty flag to true and update header checksum
737   //   B. Erase the proto
738   //   C. Set dirty flag to false, update log checksum, update header checksum
739   //
740   // Scenario 1: We went down between A and B. Maybe our dirty flag is a
741   // false alarm and we can keep all our data.
742   //
743   // Scenario 2: We went down between B and C. Our data is compromised and
744   // we need to throw everything out.
745   if (header->GetDirtyFlag()) {
746     // Recompute the log's checksum to detect which scenario we're in.
747     ICING_ASSIGN_OR_RETURN(
748         Crc32 calculated_log_checksum,
749         ComputeChecksum(filesystem, file_path, Crc32(),
750                         /*start=*/kHeaderReservedBytes, /*end=*/file_size));
751 
752     if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
753       // Still doesn't match, we're in Scenario 2. Throw out all our data now
754       // and initialize as a new instance.
755       ICING_ASSIGN_OR_RETURN(CreateResult create_result,
756                              InitializeNewFile(filesystem, file_path, options));
757       create_result.data_loss = DataLoss::COMPLETE;
758       create_result.recalculated_checksum = true;
759       return create_result;
760     }
761     // Otherwise we're good, checksum matches our contents so continue
762     // initializing like normal.
763     recalculated_checksum = true;
764 
765     // Update our header.
766     header->SetDirtyFlag(false);
767     header_changed = true;
768   }
769 
770   if (header_changed) {
771     header->SetHeaderChecksum(header->CalculateHeaderChecksum());
772 
773     if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
774                             sizeof(Header))) {
775       return absl_ports::InternalError(
776           absl_ports::StrCat("Failed to update header to: ", file_path));
777     }
778   }
779 
780   CreateResult create_result = {
781       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
782           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
783                                                  std::move(header))),
784       data_loss, recalculated_checksum};
785 
786   return create_result;
787 }
788 
789 template <typename ProtoT>
790 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)791 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
792     const Filesystem* filesystem, const std::string& file_path,
793     Crc32 initial_crc, int64_t start, int64_t end) {
794   auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
795                                        MemoryMappedFile::Strategy::READ_ONLY);
796   Crc32 new_crc(initial_crc.Get());
797 
798   if (start < 0) {
799     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
800         "Starting checksum offset of file '%s' must be greater than 0, was "
801         "%lld",
802         file_path.c_str(), static_cast<long long>(start)));
803   }
804 
805   if (end < start) {
806     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
807         "Ending checksum offset of file '%s' must be greater than start "
808         "'%lld', was '%lld'",
809         file_path.c_str(), static_cast<long long>(start),
810         static_cast<long long>(end)));
811   }
812 
813   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
814   if (end > file_size) {
815     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
816         "Ending checksum offset of file '%s' must be within "
817         "file size of %lld, was %lld",
818         file_path.c_str(), static_cast<long long>(file_size),
819         static_cast<long long>(end)));
820   }
821 
822   Architecture architecture = GetArchitecture();
823   switch (architecture) {
824     case Architecture::BIT_64: {
825       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
826       // devices where mmap/munmap calls need the mmap write semaphore, which
827       // blocks mmap/munmap/mprotect and all page faults from executing while
828       // they run. On 64-bit devices, this doesn't actually load into memory, it
829       // just makes the file faultable. So the whole file should be ok.
830       // b/185822878.
831       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
832       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
833       new_crc.Append(mmap_str);
834       break;
835     }
836     case Architecture::BIT_32:
837       [[fallthrough]];
838     case Architecture::UNKNOWN: {
839       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
840       // much memory at once. If we're unknown, then also chunk it because we're
841       // not sure what the device can handle.
842       for (int i = start; i < end; i += kMmapChunkSize) {
843         // Don't read past the file size.
844         int next_chunk_size = kMmapChunkSize;
845         if ((i + kMmapChunkSize) >= end) {
846           next_chunk_size = end - i;
847         }
848 
849         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
850 
851         auto mmap_str =
852             std::string_view(mmapped_file.region(), next_chunk_size);
853         new_crc.Append(mmap_str);
854       }
855       break;
856     }
857   }
858 
859   return new_crc;
860 }
861 
862 template <typename ProtoT>
863 libtextclassifier3::StatusOr<int64_t>
WriteProto(const ProtoT & proto)864 PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
865   int64_t proto_size = proto.ByteSizeLong();
866   int32_t host_order_metadata;
867   int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
868 
869   if (proto_size > header_->GetMaxProtoSize()) {
870     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
871         "proto_size, %lld, was too large to write. Max is %d",
872         static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
873   }
874 
875   // At this point, we've guaranteed that proto_size is under kMaxProtoSize
876   // (see
877   // ::Create), so we can safely store it in an int.
878   int final_size = 0;
879 
880   std::string proto_str;
881   google::protobuf::io::StringOutputStream proto_stream(&proto_str);
882 
883   if (header_->GetCompressFlag()) {
884     google::protobuf::io::GzipOutputStream::Options options;
885     options.format = google::protobuf::io::GzipOutputStream::ZLIB;
886     options.compression_level = kDeflateCompressionLevel;
887 
888     google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
889                                                                   options);
890 
891     bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
892                    compressing_stream.Close();
893 
894     if (!success) {
895       return absl_ports::InternalError("Error compressing proto.");
896     }
897 
898     final_size = proto_str.size();
899 
900     // In case the compressed proto is larger than the original proto, we also
901     // can't write it.
902     if (final_size > header_->GetMaxProtoSize()) {
903       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
904           "Compressed proto size, %d, was greater than "
905           "max_proto_size, %d",
906           final_size, header_->GetMaxProtoSize()));
907     }
908   } else {
909     // Serialize the proto directly into the write buffer at an offset of the
910     // metadata.
911     proto.SerializeToZeroCopyStream(&proto_stream);
912     final_size = proto_str.size();
913   }
914 
915   // 1st byte for magic, next 3 bytes for proto size.
916   host_order_metadata = (kProtoMagic << 24) | final_size;
917 
918   // Actually write metadata, has to be done after we know the possibly
919   // compressed proto size
920   ICING_RETURN_IF_ERROR(
921       WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
922 
923   // Write the serialized proto
924   if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
925     return absl_ports::InternalError(
926         absl_ports::StrCat("Failed to write proto to: ", file_path_));
927   }
928 
929   return current_position;
930 }
931 
932 template <typename ProtoT>
933 libtextclassifier3::StatusOr<ProtoT>
ReadProto(int64_t file_offset)934 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
935   int64_t file_size = filesystem_->GetFileSize(fd_.get());
936   MemoryMappedFile mmapped_file(*filesystem_, file_path_,
937                                 MemoryMappedFile::Strategy::READ_ONLY);
938   if (file_offset >= file_size) {
939     // file_size points to the next byte to write at, so subtract one to get
940     // the inclusive, actual size of file.
941     return absl_ports::OutOfRangeError(
942         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
943                                       "out of range of the file size, %lld",
944                                       static_cast<long long>(file_offset),
945                                       static_cast<long long>(file_size - 1)));
946   }
947 
948   // Read out the metadata
949   ICING_ASSIGN_OR_RETURN(
950       int32_t metadata,
951       ReadProtoMetadata(&mmapped_file, file_offset, file_size));
952 
953   // Copy out however many bytes it says the proto is
954   int stored_size = GetProtoSize(metadata);
955 
956   ICING_RETURN_IF_ERROR(
957       mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
958 
959   if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
960     return absl_ports::NotFoundError("The proto data has been erased.");
961   }
962 
963   google::protobuf::io::ArrayInputStream proto_stream(
964       mmapped_file.mutable_region(), stored_size);
965 
966   // Deserialize proto
967   ProtoT proto;
968   if (header_->GetCompressFlag()) {
969     google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
970     proto.ParseFromZeroCopyStream(&decompress_stream);
971   } else {
972     proto.ParseFromZeroCopyStream(&proto_stream);
973   }
974 
975   return proto;
976 }
977 
978 template <typename ProtoT>
EraseProto(int64_t file_offset)979 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
980     int64_t file_offset) {
981   int64_t file_size = filesystem_->GetFileSize(fd_.get());
982   if (file_offset >= file_size) {
983     // file_size points to the next byte to write at, so subtract one to get
984     // the inclusive, actual size of file.
985     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
986         "Trying to erase data at a location, %lld, "
987         "out of range of the file size, %lld",
988         static_cast<long long>(file_offset),
989         static_cast<long long>(file_size - 1)));
990   }
991 
992   MemoryMappedFile mmapped_file(
993       *filesystem_, file_path_,
994       MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
995 
996   // Read out the metadata
997   ICING_ASSIGN_OR_RETURN(
998       int32_t metadata,
999       ReadProtoMetadata(&mmapped_file, file_offset, file_size));
1000 
1001   ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
1002                                            GetProtoSize(metadata)));
1003 
1004   // We need to update the crc checksum if the erased area is before the
1005   // rewind position.
1006   int32_t new_crc;
1007   int64_t erased_proto_offset = file_offset + sizeof(metadata);
1008   if (erased_proto_offset < header_->GetRewindOffset()) {
1009     // Set to "dirty" before we start writing anything.
1010     header_->SetDirtyFlag(true);
1011     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1012     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1013                              sizeof(Header))) {
1014       return absl_ports::InternalError(absl_ports::StrCat(
1015           "Failed to update dirty bit of header to: ", file_path_));
1016     }
1017 
1018     // We need to calculate [original string xor 0s].
1019     // The xored string is the same as the original string because 0 xor 0 =
1020     // 0, 1 xor 0 = 1.
1021     const std::string_view xored_str(mmapped_file.region(),
1022                                      mmapped_file.region_size());
1023 
1024     Crc32 crc(header_->GetLogChecksum());
1025     ICING_ASSIGN_OR_RETURN(
1026         new_crc, crc.UpdateWithXor(
1027                      xored_str,
1028                      /*full_data_size=*/header_->GetRewindOffset() -
1029                          kHeaderReservedBytes,
1030                      /*position=*/erased_proto_offset - kHeaderReservedBytes));
1031   }
1032 
1033   // Clear the region.
1034   memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
1035 
1036   // If we cleared something in our checksummed area, we should update our
1037   // checksum and reset our dirty bit.
1038   if (erased_proto_offset < header_->GetRewindOffset()) {
1039     header_->SetDirtyFlag(false);
1040     header_->SetLogChecksum(new_crc);
1041     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1042 
1043     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1044                              sizeof(Header))) {
1045       return absl_ports::InternalError(
1046           absl_ports::StrCat("Failed to update header to: ", file_path_));
1047     }
1048   }
1049 
1050   return libtextclassifier3::Status::OK;
1051 }
1052 
1053 template <typename ProtoT>
1054 libtextclassifier3::StatusOr<int64_t>
GetDiskUsage()1055 PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
1056   int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1057   if (size == Filesystem::kBadFileSize) {
1058     return absl_ports::InternalError("Failed to get disk usage of proto log");
1059   }
1060   return size;
1061 }
1062 
1063 template <typename ProtoT>
1064 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()1065 PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
1066   int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
1067   if (total_file_size == Filesystem::kBadFileSize) {
1068     return absl_ports::InternalError(
1069         "Failed to get file size of elments in the proto log");
1070   }
1071   return total_file_size - kHeaderReservedBytes;
1072 }
1073 
1074 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset)1075 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
1076     const Filesystem& filesystem, const std::string& file_path,
1077     int64_t initial_offset)
1078     : mmapped_file_(filesystem, file_path,
1079                     MemoryMappedFile::Strategy::READ_ONLY),
1080       initial_offset_(initial_offset),
1081       current_offset_(kInvalidOffset),
1082       file_size_(filesystem.GetFileSize(file_path.c_str())) {
1083   if (file_size_ == Filesystem::kBadFileSize) {
1084     // Fails all Advance() calls
1085     file_size_ = 0;
1086   }
1087 }
1088 
1089 template <typename ProtoT>
1090 libtextclassifier3::Status
Advance()1091 PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
1092   if (current_offset_ == kInvalidOffset) {
1093     // First Advance() call
1094     current_offset_ = initial_offset_;
1095   } else {
1096     // Jumps to the next proto position
1097     ICING_ASSIGN_OR_RETURN(
1098         int32_t metadata,
1099         ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
1100     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
1101   }
1102 
1103   if (current_offset_ < file_size_) {
1104     return libtextclassifier3::Status::OK;
1105   } else {
1106     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1107         "The next proto offset, %lld, is out of file range [0, %lld)",
1108         static_cast<long long>(current_offset_),
1109         static_cast<long long>(file_size_)));
1110   }
1111 }
1112 
1113 template <typename ProtoT>
GetOffset()1114 int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
1115   return current_offset_;
1116 }
1117 
1118 template <typename ProtoT>
1119 typename PortableFileBackedProtoLog<ProtoT>::Iterator
GetIterator()1120 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
1121   return Iterator(*filesystem_, file_path_,
1122                   /*initial_offset=*/kHeaderReservedBytes);
1123 }
1124 
1125 template <typename ProtoT>
1126 libtextclassifier3::StatusOr<int32_t>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)1127 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
1128     MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
1129   // Checks file_offset
1130   if (file_offset >= file_size) {
1131     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1132         "offset, %lld, is out of file range [0, %lld)",
1133         static_cast<long long>(file_offset),
1134         static_cast<long long>(file_size)));
1135   }
1136   int32_t portable_metadata;
1137   int metadata_size = sizeof(portable_metadata);
1138   if (file_offset + metadata_size >= file_size) {
1139     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1140         "Wrong metadata offset %lld, metadata doesn't fit in "
1141         "with file range [0, %lld)",
1142         static_cast<long long>(file_offset),
1143         static_cast<long long>(file_size)));
1144   }
1145 
1146   // Reads metadata
1147   ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
1148   memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
1149 
1150   // Need to switch it back to host order endianness after reading from disk.
1151   int32_t host_order_metadata = gntohl(portable_metadata);
1152 
1153   // Checks magic number
1154   uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
1155   if (stored_k_proto_magic != kProtoMagic) {
1156     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1157         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
1158         stored_k_proto_magic));
1159   }
1160 
1161   return host_order_metadata;
1162 }
1163 
1164 template <typename ProtoT>
1165 libtextclassifier3::Status
WriteProtoMetadata(const Filesystem * filesystem,int fd,int32_t host_order_metadata)1166 PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
1167     const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
1168   // Convert it into portable endian format before writing to disk
1169   int32_t portable_metadata = ghtonl(host_order_metadata);
1170   int portable_metadata_size = sizeof(portable_metadata);
1171 
1172   // Write metadata
1173   if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
1174     return absl_ports::InternalError(
1175         absl_ports::StrCat("Failed to write proto metadata."));
1176   }
1177 
1178   return libtextclassifier3::Status::OK;
1179 }
1180 
1181 template <typename ProtoT>
PersistToDisk()1182 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
1183   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
1184   if (file_size == header_->GetRewindOffset()) {
1185     // No new protos appended, don't need to update the checksum.
1186     return libtextclassifier3::Status::OK;
1187   }
1188 
1189   int64_t new_content_size = file_size - header_->GetRewindOffset();
1190   Crc32 crc;
1191   if (new_content_size < 0) {
1192     // File shrunk, recalculate the entire checksum.
1193     ICING_ASSIGN_OR_RETURN(
1194         crc,
1195         ComputeChecksum(filesystem_, file_path_, Crc32(),
1196                         /*start=*/kHeaderReservedBytes, /*end=*/file_size));
1197   } else {
1198     // Append new changes to the existing checksum.
1199     ICING_ASSIGN_OR_RETURN(
1200         crc, ComputeChecksum(filesystem_, file_path_,
1201                              Crc32(header_->GetLogChecksum()),
1202                              header_->GetRewindOffset(), file_size));
1203   }
1204 
1205   header_->SetLogChecksum(crc.Get());
1206   header_->SetRewindOffset(file_size);
1207   header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1208 
1209   if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1210                            sizeof(Header)) ||
1211       !filesystem_->DataSync(fd_.get())) {
1212     return absl_ports::InternalError(
1213         absl_ports::StrCat("Failed to update header to: ", file_path_));
1214   }
1215 
1216   return libtextclassifier3::Status::OK;
1217 }
1218 
1219 template <typename ProtoT>
1220 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum()1221 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
1222   return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
1223       filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes,
1224       /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
1225 }
1226 
1227 }  // namespace lib
1228 }  // namespace icing
1229 
1230 #endif  // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
1231