• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a PortableFileBackedProtoLog of the same
18 // file at a time; using multiple instances at the same time may lead to
19 // undefined behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
// Each proto written to the file is preceded by a small piece of metadata.
// Every entry in the log is therefore laid out as
//   {
//     1 byte of kProtoMagic;
//     3 bytes of the proto size;
//     n bytes of the proto itself
//   }
34 //
35 // All metadata is written in a portable format, encoded with htonl before
36 // writing to file and decoded with ntohl when reading from file.
37 //
// Example usage:
//   ICING_ASSERT_OK_AND_ASSIGN(
//       auto create_result,
//       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
//                                                         file_path_,
//                                                         options));
//   auto proto_log = std::move(create_result.proto_log);
//
//   DocumentProto document;
//   document.set_namespace("com.google.android.example");
//   document.set_uri("www.google.com");
//
//   ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                              proto_log->WriteProto(document));
//   ICING_ASSERT_OK_AND_ASSIGN(DocumentProto same_document,
//                              proto_log->ReadProto(document_offset));
//   ICING_ASSERT_OK(proto_log->PersistToDisk());
52 
53 #ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
54 #define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
55 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/constants.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/endian.h"
#include "icing/portable/gzip_stream.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
83 
84 namespace icing {
85 namespace lib {
86 
87 template <typename ProtoT>
88 class PortableFileBackedProtoLog {
89  public:
90   struct Options {
91     // Whether to compress each proto before writing to the proto log.
92     bool compress;
93 
94     // Byte-size limit for each proto written to the store. This does not
95     // include the bytes needed for the metadata of each proto.
96     //
97     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
98     // size in 3 bytes within the metadata.
99     //
100     // NOTE: This limit is only enforced for future writes. If the store
101     // previously had a higher limit, then reading older entries could return
102     // larger protos.
103     //
104     // NOTE: The max_proto_size is the upper limit for input protos into the
105     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
106     // to a smaller size, ProtoLog will not accept it. Protos that result in a
107     // compressed size larger than max_proto_size are also not accepted.
108     const int32_t max_proto_size;
109 
110     // Level of compression if enabled, NO_COMPRESSION = 0, BEST_SPEED = 1,
111     // BEST_COMPRESSION = 9
112     const int32_t compression_level;
113 
114     // Must specify values for options.
115     Options() = delete;
116     explicit Options(
117         bool compress_in,
118         const int32_t max_proto_size_in = constants::kMaxProtoSize,
119         const int32_t compression_level_in = kDefaultCompressionLevel)
compressOptions120         : compress(compress_in),
121           max_proto_size(max_proto_size_in),
122           compression_level(compression_level_in) {}
123   };
124 
125   // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
126   static constexpr int kDefaultCompressionLevel = 3;
127 
// Number of bytes we reserve for the header at the beginning of the proto
// log. We reserve this so the header can grow without running into the
// contents of the proto log, triggering an unnecessary migration of the data.
131   static constexpr int kHeaderReservedBytes = 256;
132 
133   // Header stored at the beginning of the file before the rest of the log
134   // contents. Stores metadata on the log.
135   class Header {
136    public:
137     static constexpr int32_t kMagic = 0xf4c6f67a;
138 
139     // We should go directly from 0 to 2 the next time we have to change the
140     // format.
141     static constexpr int32_t kFileFormatVersion = 0;
142 
CalculateHeaderChecksum()143     uint32_t CalculateHeaderChecksum() const {
144       Crc32 crc;
145 
146       // Get a string_view of all the fields of the Header, excluding the
147       // magic_nbytes_ and header_checksum_nbytes_
148       std::string_view header_str(
149           reinterpret_cast<const char*>(this) +
150               offsetof(Header, header_checksum_nbytes_) +
151               sizeof(header_checksum_nbytes_),
152           sizeof(Header) - sizeof(magic_nbytes_) -
153               sizeof(header_checksum_nbytes_));
154       crc.Append(header_str);
155       return crc.Get();
156     }
157 
GetMagic()158     int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
159 
SetMagic(int32_t magic_in)160     void SetMagic(int32_t magic_in) {
161       magic_nbytes_ = GHostToNetworkL(magic_in);
162     }
163 
GetFileFormatVersion()164     int32_t GetFileFormatVersion() const {
165       return GNetworkToHostL(file_format_version_nbytes_);
166     }
167 
SetFileFormatVersion(int32_t file_format_version_in)168     void SetFileFormatVersion(int32_t file_format_version_in) {
169       file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
170     }
171 
GetMaxProtoSize()172     int32_t GetMaxProtoSize() const {
173       return GNetworkToHostL(max_proto_size_nbytes_);
174     }
175 
SetMaxProtoSize(int32_t max_proto_size_in)176     void SetMaxProtoSize(int32_t max_proto_size_in) {
177       max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
178     }
179 
GetLogChecksum()180     int32_t GetLogChecksum() const {
181       return GNetworkToHostL(log_checksum_nbytes_);
182     }
183 
SetLogChecksum(int32_t log_checksum_in)184     void SetLogChecksum(int32_t log_checksum_in) {
185       log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
186     }
187 
GetRewindOffset()188     int64_t GetRewindOffset() const {
189       return GNetworkToHostLL(rewind_offset_nbytes_);
190     }
191 
SetRewindOffset(int64_t rewind_offset_in)192     void SetRewindOffset(int64_t rewind_offset_in) {
193       rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
194     }
195 
GetHeaderChecksum()196     int32_t GetHeaderChecksum() const {
197       return GNetworkToHostL(header_checksum_nbytes_);
198     }
199 
SetHeaderChecksum(int32_t header_checksum_in)200     void SetHeaderChecksum(int32_t header_checksum_in) {
201       header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
202     }
203 
GetCompressFlag()204     bool GetCompressFlag() const { return GetFlag(kCompressBit); }
205 
SetCompressFlag(bool compress)206     void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
207 
GetDirtyFlag()208     bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
209 
SetDirtyFlag(bool dirty)210     void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
211 
212    private:
213     // The least-significant bit offset at which the compress flag is stored in
214     // 'flags_nbytes_'. Represents whether the protos in the log are compressed
215     // or not.
216     static constexpr int32_t kCompressBit = 0;
217 
218     // The least-significant bit offset at which the dirty flag is stored in
219     // 'flags'. Represents whether the checksummed portion of the log has been
220     // modified after the last checksum was computed.
221     static constexpr int32_t kDirtyBit = 1;
222 
GetFlag(int offset)223     bool GetFlag(int offset) const {
224       return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
225     }
226 
SetFlag(int offset,bool value)227     void SetFlag(int offset, bool value) {
228       bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
229     }
230 
231     // Holds the magic as a quick sanity check against file corruption.
232     //
233     // Field is in network-byte order.
234     int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
235 
236     // Must be at the beginning after kMagic. Contains the crc checksum of
237     // the following fields.
238     //
239     // Field is in network-byte order.
240     uint32_t header_checksum_nbytes_ = 0;
241 
242     // Last known good offset at which the log and its checksum were updated.
243     // If we crash between writing to the log and updating the checksum, we can
244     // try to rewind the log to this offset and verify the checksum is still
245     // valid instead of throwing away the entire log.
246     //
247     // Field is in network-byte order.
248     int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
249 
250     // Version number tracking how we serialize the file to disk. If we change
251     // how/what we write to disk, this version should be updated and this class
252     // should handle a migration.
253     //
254     // Currently at kFileFormatVersion.
255     //
256     // Field is in network-byte order.
257     int32_t file_format_version_nbytes_ = 0;
258 
259     // The maximum proto size that can be written to the log.
260     //
261     // Field is in network-byte order.
262     int32_t max_proto_size_nbytes_ = 0;
263 
264     // Checksum of the log elements, doesn't include the header fields.
265     //
266     // Field is in network-byte order.
267     uint32_t log_checksum_nbytes_ = 0;
268 
269     // Bits are used to hold various flags.
270     //   Lowest bit is whether the protos are compressed or not.
271     //
272     // Field is only 1 byte, so is byte-order agnostic.
273     uint8_t flags_ = 0;
274 
275     // NOTE: New fields should *almost always* be added to the end here. Since
276     // this class may have already been written to disk, appending fields
277     // increases the chances that changes are backwards-compatible.
278   };
279   static_assert(sizeof(Header) <= kHeaderReservedBytes,
280                 "Header has grown past our reserved bytes!");
281 
282   struct CreateResult {
283     // A successfully initialized log.
284     std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
285 
286     // The data status after initializing from a previous state. Data loss can
287     // happen if the file is corrupted or some previously added data was
288     // unpersisted. This may be used to signal that any derived data off of the
289     // proto log may need to be regenerated.
290     DataLoss data_loss = DataLoss::NONE;
291 
292     // Whether the proto log had to recalculate the checksum to check its
293     // integrity. This can be avoided if no changes were made or the log was
294     // able to update its checksum before shutting down. But it may have to
295     // recalculate if it's unclear if we crashed after updating the log, but
296     // before updating our checksum.
297     bool recalculated_checksum = false;
298 
has_data_lossCreateResult299     bool has_data_loss() const {
300       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
301     }
302   };
303 
304   // Factory method to create, initialize, and return a
305   // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
306   //
307   // If on re-initialization the log detects disk corruption or some previously
308   // added data was unpersisted, the log will rewind to the last-good state. The
309   // log saves these checkpointed "good" states when PersistToDisk() is called
310   // or the log is safely destructed. If the log rewinds successfully to the
311   // last-good state, then the returned CreateResult.data_loss indicates
312   // whether it has a data loss and what kind of data loss it is (partial or
313   // complete) so that any derived data may know that it needs to be updated. If
314   // the log re-initializes successfully without any data loss,
315   // CreateResult.data_loss will be NONE.
316   //
317   // Params:
318   //   filesystem: Handles system level calls
319   //   file_path: Path of the underlying file. Directory of the file should
320   //   already exist
321   //   options: Configuration options for the proto log
322   //
323   // Returns:
324   //   PortableFileBackedProtoLog::CreateResult on success
325   //   INVALID_ARGUMENT on an invalid option
326   //   INTERNAL_ERROR on IO error
327   static libtextclassifier3::StatusOr<CreateResult> Create(
328       const Filesystem* filesystem, const std::string& file_path,
329       const Options& options);
330 
331   // Not copyable
332   PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
333   PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
334       delete;
335 
336   // This will update the checksum of the log as well.
337   ~PortableFileBackedProtoLog();
338 
339   // Writes the serialized proto to the underlying file. Writes are applied
340   // directly to the underlying file. Users do not need to sync the file after
341   // writing.
342   //
343   // Returns:
344   //   Offset of the newly appended proto in file on success
345   //   INVALID_ARGUMENT if proto is too large, as decided by
346   //     Options.max_proto_size
347   //   INTERNAL_ERROR on IO error
348   libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
349 
350   // Reads out a proto located at file_offset from the file.
351   //
352   // Returns:
353   //   A proto on success
354   //   NOT_FOUND if the proto at the given offset has been erased
355   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
356   //   INTERNAL_ERROR on IO error
357   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
358 
359   // Erases the data of a proto located at file_offset from the file.
360   //
361   // Returns:
362   //   OK on success
363   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
364   //   INTERNAL_ERROR on IO error
365   libtextclassifier3::Status EraseProto(int64_t file_offset);
366 
367   // Calculates and returns the disk usage in bytes. Rounds up to the nearest
368   // block size.
369   //
370   // Returns:
371   //   Disk usage on success
372   //   INTERNAL_ERROR on IO error
373   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
374 
375   // Returns the file size of all the elements held in the log. File size is in
376   // bytes. This excludes the size of any internal metadata of the log, e.g. the
377   // log's header.
378   //
379   // Returns:
380   //   File size on success
381   //   INTERNAL_ERROR on IO error
382   libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
383 
384   // An iterator helping to find offsets of all the protos in file.
385   // Example usage:
386   //
387   // while (iterator.Advance().ok()) {
388   //   int64_t offset = iterator.GetOffset();
389   //   // Do something
390   // }
391   class Iterator {
392    public:
393     explicit Iterator(const Filesystem& filesystem, int fd,
394                       int64_t initial_offset, int64_t file_size);
395 
396     // Advances to the position of next proto whether it has been erased or not.
397     //
398     // Returns:
399     //   OK on success
400     //   OUT_OF_RANGE_ERROR if it reaches the end
401     //   INTERNAL_ERROR on IO error
402     libtextclassifier3::Status Advance();
403 
404     // Returns the file offset of current proto.
405     int64_t GetOffset() const;
406 
407    private:
408     static constexpr int64_t kInvalidOffset = -1;
409     // Used to read proto metadata
410     // Offset of first proto
411     const Filesystem* const filesystem_;
412     int64_t initial_offset_;
413     int64_t current_offset_;
414     int64_t file_size_;
415     int fd_;
416   };
417 
418   // Returns an iterator of current proto log. The caller needs to keep the
419   // proto log unchanged while using the iterator, otherwise unexpected
420   // behaviors could happen.
421   Iterator GetIterator() const;
422 
423   // Persists all changes since initialization or the last call to
424   // PersistToDisk(). Any changes that aren't persisted may be lost if the
425   // system fails to close safely.
426   //
427   // Example use case:
428   //
429   //   Document document;
430   //   document.set_namespace("com.google.android.example");
431   //   document.set_uri("www.google.com");
432   //
433   //   {
434   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
435   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
436   //         file_path,
437   //                                                    options));
438   //     auto proto_log = std::move(create_result.proto_log);
439   //
440   //     int64_t document_offset = proto_log->WriteProto(document));
441   //
442   //     // We lose the document here since it wasn't persisted.
443   //     // *SYSTEM CRASH*
444   //   }
445   //
446   //   {
447   //     // Can still successfully create after a crash since the log can
448   //     // rewind/truncate to recover into a previously good state
449   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
450   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
451   //         file_path,
452   //                                                    options));
453   //     auto proto_log = std::move(create_result.proto_log);
454   //
455   //     // Lost the proto since we didn't PersistToDisk before the crash
456   //     proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
457   //
458   //     int64_t document_offset = proto_log->WriteProto(document));
459   //
460   //     // Persisted this time, so we should be ok.
461   //     ICING_ASSERT_OK(proto_log->PersistToDisk());
462   //   }
463   //
464   //   {
465   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
466   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
467   //         file_path,
468   //                                                    options));
469   //     auto proto_log = std::move(create_result.proto_log);
470   //
471   //     // SUCCESS
472   //     Document same_document = proto_log->ReadProto(document_offset));
473   //   }
474   //
475   // NOTE: Since all protos are already written to the file directly, this
476   // just updates the checksum and rewind position. Without these updates,
477   // future initializations will truncate the file and discard unpersisted
478   // changes.
479   //
480   // Returns:
481   //   OK on success
482   //   INTERNAL_ERROR on IO error
483   libtextclassifier3::Status PersistToDisk();
484 
485   // Calculates the checksum of the log contents (excluding the header) and
486   // updates the header.
487   //
488   // Returns:
489   //   Crc of the log content
490   //   INTERNAL_ERROR on IO error
491   libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
492 
493   // Calculates and returns the checksum of the log contents (excluding the
494   // header). Does NOT update the header.
495   //
496   // Returns:
497   //   Crc of the log content
498   //   INTERNAL_ERROR on IO error
499   libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
500 
501  private:
502   // Object can only be instantiated via the ::Create factory.
503   PortableFileBackedProtoLog(const Filesystem* filesystem,
504                              const std::string& file_path,
505                              std::unique_ptr<Header> header, int64_t file_size,
506                              int32_t compression_level);
507 
// Initializes a new proto log.
//
// Returns:
//   CreateResult on success
//   INTERNAL_ERROR on IO error
513   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
514       const Filesystem* filesystem, const std::string& file_path,
515       const Options& options);
516 
// Verifies that the existing proto log is in a good state. If not in a good
// state, then the proto log may be truncated to the last good state and
// content will be lost.
//
// Returns:
//   CreateResult on success
//   INTERNAL_ERROR on IO error or internal inconsistencies in the file
//   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
//     instances
526   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
527       const Filesystem* filesystem, const std::string& file_path,
528       const Options& options, int64_t file_size);
529 
530   // Takes an initial checksum and updates it with the content between `start`
531   // and `end` offsets in the file.
532   //
533   // Returns:
534   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
535   //   INTERNAL_ERROR on IO error
536   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
537   static libtextclassifier3::StatusOr<Crc32> GetPartialChecksum(
538       const Filesystem* filesystem, const std::string& file_path,
539       Crc32 initial_crc, int64_t start, int64_t end, int64_t file_size);
540 
541   // Reads out the metadata of a proto located at file_offset from the fd.
542   // Metadata will be returned in host byte order endianness.
543   //
544   // Returns:
545   //   Proto's metadata on success
546   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
547   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
548   static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
549       const Filesystem* const filesystem, int fd, int64_t file_offset,
550       int64_t file_size);
551 
552   // Writes metadata of a proto to the fd. Takes in a host byte order endianness
553   // metadata and converts it into a portable metadata before writing.
554   //
555   // Returns:
556   //   OK on success
557   //   INTERNAL_ERROR on any IO errors
558   static libtextclassifier3::Status WriteProtoMetadata(
559       const Filesystem* filesystem, int fd, int32_t host_order_metadata);
560 
// Returns true iff every one of the first `size` bytes of `buffer` is zero.
//
// A non-positive `size` describes no bytes at all, so it is reported as empty
// without touching `buffer`; this avoids forming an invalid range
// [buffer, buffer + size) when `size` is negative.
//
// Uses std::all_of, which is declared in <algorithm>.
static bool IsEmptyBuffer(const char* buffer, int size) {
  if (size <= 0) {
    return true;
  }
  return std::all_of(buffer, buffer + size,
                     [](const char byte) { return byte == 0; });
}
565 
// Helper function to get stored proto size from the metadata.
// Metadata format: 8 bits magic + 24 bits size
//
// Returns the low 24 bits of `metadata`; the upper 8 bits are masked off.
static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
569 
// Helper function to get stored proto magic from the metadata.
// Metadata format: 8 bits magic + 24 bits size
//
// Returns the most significant 8 bits of `metadata`. The shift is done on the
// unsigned representation so that a metadata word whose top bit is set (a
// negative int) is handled without relying on how signed right shifts behave,
// and the narrowing to uint8_t is explicit.
static uint8_t GetProtoMagic(int metadata) {
  return static_cast<uint8_t>(static_cast<uint32_t>(metadata) >> 24);
}
573 
574   // Magic number added in front of every proto. Used when reading out protos
575   // as a first check for corruption in each entry in the file. Even if there is
576   // a corruption, the best we can do is roll back to our last recovery point
577   // and throw away un-flushed data. We can discard/reuse this byte if needed so
578   // that we have 4 bytes to store the size of protos, and increase the size of
579   // protos we support.
580   static constexpr uint8_t kProtoMagic = 0x5C;
581 
582   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
583   // Only used on 32-bit devices
584   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
585 
586   ScopedFd fd_;
587   const Filesystem* const filesystem_;
588   const std::string file_path_;
589   std::unique_ptr<Header> header_;
590   int64_t file_size_;
591   const int32_t compression_level_;
592 };
593 
594 template <typename ProtoT>
PortableFileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header,int64_t file_size,int32_t compression_level)595 PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
596     const Filesystem* filesystem, const std::string& file_path,
597     std::unique_ptr<Header> header, int64_t file_size,
598     int32_t compression_level)
599     : filesystem_(filesystem),
600       file_path_(file_path),
601       header_(std::move(header)),
602       file_size_(file_size),
603       compression_level_(compression_level) {
604   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
605 }
606 
607 template <typename ProtoT>
~PortableFileBackedProtoLog()608 PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
609   if (!PersistToDisk().ok()) {
610     ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
611                           "PortableFileBackedProtoLog: "
612                        << file_path_;
613   }
614 }
615 
616 template <typename ProtoT>
617 libtextclassifier3::StatusOr<
618     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)619 PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
620                                            const std::string& file_path,
621                                            const Options& options) {
622   if (options.max_proto_size <= 0) {
623     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
624         "options.max_proto_size must be greater than 0, was %d",
625         options.max_proto_size));
626   }
627 
628   // Since we store the proto_size in 3 bytes, we can only support protos of up
629   // to 16MiB.
630   if (options.max_proto_size > constants::kMaxProtoSize) {
631     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
632         "options.max_proto_size must be under 16MiB, was %d",
633         options.max_proto_size));
634   }
635 
636   if (options.compression_level < 0 || options.compression_level > 9) {
637     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
638         "options.compression_level must be between 0 and 9 inclusive, was %d",
639         options.compression_level));
640   }
641 
642   if (!filesystem->FileExists(file_path.c_str())) {
643     return InitializeNewFile(filesystem, file_path, options);
644   }
645 
646   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
647   if (file_size == Filesystem::kBadFileSize) {
648     return absl_ports::InternalError(
649         absl_ports::StrCat("Bad file size '", file_path, "'"));
650   }
651 
652   if (file_size == 0) {
653     return InitializeNewFile(filesystem, file_path, options);
654   }
655 
656   return InitializeExistingFile(filesystem, file_path, options, file_size);
657 }
658 
659 template <typename ProtoT>
660 libtextclassifier3::StatusOr<
661     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)662 PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
663     const Filesystem* filesystem, const std::string& file_path,
664     const Options& options) {
665   // Grow to the minimum reserved bytes for the header.
666   if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
667     return absl_ports::InternalError(
668         absl_ports::StrCat("Failed to initialize file size: ", file_path));
669   }
670 
671   // Create the header
672   std::unique_ptr<Header> header = std::make_unique<Header>();
673   header->SetCompressFlag(options.compress);
674   header->SetMaxProtoSize(options.max_proto_size);
675   header->SetHeaderChecksum(header->CalculateHeaderChecksum());
676 
677   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
678     return absl_ports::InternalError(
679         absl_ports::StrCat("Failed to write header for file: ", file_path));
680   }
681 
682   CreateResult create_result = {
683       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
684           new PortableFileBackedProtoLog<ProtoT>(
685               filesystem, file_path, std::move(header),
686               /*file_size=*/kHeaderReservedBytes, options.compression_level)),
687       /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
688 
689   return create_result;
690 }
691 
692 template <typename ProtoT>
693 libtextclassifier3::StatusOr<
694     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)695 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
696     const Filesystem* filesystem, const std::string& file_path,
697     const Options& options, int64_t file_size) {
698   bool header_changed = false;
699   if (file_size < kHeaderReservedBytes) {
700     return absl_ports::InternalError(
701         absl_ports::StrCat("File header too short for: ", file_path));
702   }
703 
704   std::unique_ptr<Header> header = std::make_unique<Header>();
705   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
706                          /*offset=*/0)) {
707     return absl_ports::InternalError(
708         absl_ports::StrCat("Failed to read header for file: ", file_path));
709   }
710 
711   // Make sure the header is still valid before we use any of its values. This
712   // is covered by the header_checksum check below, but this is a quick check
713   // that can save us from an extra crc computation.
714   if (header->GetMagic() != Header::kMagic) {
715     return absl_ports::InternalError(
716         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
717   }
718 
719   if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
720     return absl_ports::InternalError(
721         absl_ports::StrCat("Invalid header checksum for: ", file_path));
722   }
723 
724   if (header->GetRewindOffset() < kHeaderReservedBytes) {
725     return absl_ports::InternalError(
726         absl_ports::StrCat("Invalid header rewind offset for: ", file_path));
727   }
728 
729   if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
730     // If this changes, we might need to handle a migration rather than throwing
731     // an error.
732     return absl_ports::InternalError(
733         absl_ports::StrCat("Invalid header file format version: ", file_path));
734   }
735 
736   if (header->GetCompressFlag() != options.compress) {
737     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
738         "Inconsistent compress option, expected %d, actual %d",
739         header->GetCompressFlag(), options.compress));
740   }
741 
742   int32_t existing_max_proto_size = header->GetMaxProtoSize();
743   if (existing_max_proto_size > options.max_proto_size) {
744     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
745         "Max proto size cannot be smaller than previous "
746         "instantiations, previous size %d, wanted size %d",
747         header->GetMaxProtoSize(), options.max_proto_size));
748   } else if (existing_max_proto_size < options.max_proto_size) {
749     // It's fine if our new max size is greater than our previous one. Existing
750     // data is still valid.
751     header->SetMaxProtoSize(options.max_proto_size);
752     header_changed = true;
753   }
754 
755   DataLoss data_loss = DataLoss::NONE;
756 
757   // If we have any documents in our tail, get rid of them since they're not in
758   // our checksum. Our checksum reflects content up to the rewind offset.
759   if (file_size > header->GetRewindOffset()) {
760     if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
761       return absl_ports::InternalError(IcingStringUtil::StringPrintf(
762           "Failed to truncate '%s' to size %lld", file_path.data(),
763           static_cast<long long>(header->GetRewindOffset())));
764     }
765     file_size = header->GetRewindOffset();
766     data_loss = DataLoss::PARTIAL;
767   }
768 
769   bool recalculated_checksum = false;
770 
771   // If our dirty flag is set, that means we might have crashed in the middle of
772   // erasing a proto. This could have happened anywhere between:
773   //   A. Set dirty flag to true and update header checksum
774   //   B. Erase the proto
775   //   C. Set dirty flag to false, update log checksum, update header checksum
776   //
777   // Scenario 1: We went down between A and B. Maybe our dirty flag is a
778   // false alarm and we can keep all our data.
779   //
780   // Scenario 2: We went down between B and C. Our data is compromised and
781   // we need to throw everything out.
782   if (header->GetDirtyFlag()) {
783     // Recompute the log's checksum to detect which scenario we're in.
784     ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
785                            GetPartialChecksum(filesystem, file_path, Crc32(),
786                                               /*start=*/kHeaderReservedBytes,
787                                               /*end=*/file_size, file_size));
788 
789     if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
790       // Still doesn't match, we're in Scenario 2. Throw out all our data now
791       // and initialize as a new instance.
792       ICING_ASSIGN_OR_RETURN(CreateResult create_result,
793                              InitializeNewFile(filesystem, file_path, options));
794       create_result.data_loss = DataLoss::COMPLETE;
795       create_result.recalculated_checksum = true;
796       return create_result;
797     }
798     // Otherwise we're good, checksum matches our contents so continue
799     // initializing like normal.
800     recalculated_checksum = true;
801 
802     // Update our header.
803     header->SetDirtyFlag(false);
804     header_changed = true;
805   }
806 
807   if (header_changed) {
808     header->SetHeaderChecksum(header->CalculateHeaderChecksum());
809 
810     if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
811                             sizeof(Header))) {
812       return absl_ports::InternalError(
813           absl_ports::StrCat("Failed to update header to: ", file_path));
814     }
815   }
816 
817   CreateResult create_result = {
818       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
819           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
820                                                  std::move(header), file_size,
821                                                  options.compression_level)),
822       data_loss, recalculated_checksum};
823 
824   return create_result;
825 }
826 
827 template <typename ProtoT>
828 libtextclassifier3::StatusOr<Crc32>
GetPartialChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end,int64_t file_size)829 PortableFileBackedProtoLog<ProtoT>::GetPartialChecksum(
830     const Filesystem* filesystem, const std::string& file_path,
831     Crc32 initial_crc, int64_t start, int64_t end, int64_t file_size) {
832   ICING_ASSIGN_OR_RETURN(
833       MemoryMappedFile mmapped_file,
834       MemoryMappedFile::Create(*filesystem, file_path,
835                                MemoryMappedFile::Strategy::READ_ONLY));
836   Crc32 new_crc(initial_crc.Get());
837 
838   if (start < 0) {
839     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
840         "Starting checksum offset of file '%s' must be greater than 0, was "
841         "%lld",
842         file_path.c_str(), static_cast<long long>(start)));
843   }
844 
845   if (end < start) {
846     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
847         "Ending checksum offset of file '%s' must be greater than start "
848         "'%lld', was '%lld'",
849         file_path.c_str(), static_cast<long long>(start),
850         static_cast<long long>(end)));
851   }
852 
853   if (end > file_size) {
854     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
855         "Ending checksum offset of file '%s' must be within "
856         "file size of %lld, was %lld",
857         file_path.c_str(), static_cast<long long>(file_size),
858         static_cast<long long>(end)));
859   }
860 
861   Architecture architecture = GetArchitecture();
862   switch (architecture) {
863     case Architecture::BIT_64: {
864       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
865       // devices where mmap/munmap calls need the mmap write semaphore, which
866       // blocks mmap/munmap/mprotect and all page faults from executing while
867       // they run. On 64-bit devices, this doesn't actually load into memory, it
868       // just makes the file faultable. So the whole file should be ok.
869       // b/185822878.
870       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
871       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
872       new_crc.Append(mmap_str);
873       break;
874     }
875     case Architecture::BIT_32:
876       [[fallthrough]];
877     case Architecture::UNKNOWN: {
878       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
879       // much memory at once. If we're unknown, then also chunk it because we're
880       // not sure what the device can handle.
881       for (int i = start; i < end; i += kMmapChunkSize) {
882         // Don't read past the file size.
883         int next_chunk_size = kMmapChunkSize;
884         if ((i + kMmapChunkSize) >= end) {
885           next_chunk_size = end - i;
886         }
887 
888         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
889 
890         auto mmap_str =
891             std::string_view(mmapped_file.region(), next_chunk_size);
892         new_crc.Append(mmap_str);
893       }
894       break;
895     }
896   }
897 
898   return new_crc;
899 }
900 
901 template <typename ProtoT>
902 libtextclassifier3::StatusOr<int64_t>
WriteProto(const ProtoT & proto)903 PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
904   int64_t proto_size = proto.ByteSizeLong();
905   int32_t host_order_metadata;
906   int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
907 
908   if (proto_size > header_->GetMaxProtoSize()) {
909     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
910         "proto_size, %lld, was too large to write. Max is %d",
911         static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
912   }
913 
914   // At this point, we've guaranteed that proto_size is under kMaxProtoSize
915   // (see
916   // ::Create), so we can safely store it in an int.
917   int final_size = 0;
918 
919   std::string proto_str;
920   google::protobuf::io::StringOutputStream proto_stream(&proto_str);
921 
922   if (header_->GetCompressFlag()) {
923     protobuf_ports::GzipOutputStream::Options options;
924     options.format = protobuf_ports::GzipOutputStream::ZLIB;
925     options.compression_level = compression_level_;
926 
927     protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
928 
929     bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
930                    compressing_stream.Close();
931 
932     if (!success) {
933       return absl_ports::InternalError("Error compressing proto.");
934     }
935 
936     final_size = proto_str.size();
937 
938     // In case the compressed proto is larger than the original proto, we also
939     // can't write it.
940     if (final_size > header_->GetMaxProtoSize()) {
941       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
942           "Compressed proto size, %d, was greater than "
943           "max_proto_size, %d",
944           final_size, header_->GetMaxProtoSize()));
945     }
946   } else {
947     // Serialize the proto directly into the write buffer at an offset of the
948     // metadata.
949     proto.SerializeToZeroCopyStream(&proto_stream);
950     final_size = proto_str.size();
951   }
952 
953   // 1st byte for magic, next 3 bytes for proto size.
954   host_order_metadata = (kProtoMagic << 24) | final_size;
955 
956   // Actually write metadata, has to be done after we know the possibly
957   // compressed proto size
958   ICING_RETURN_IF_ERROR(
959       WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
960 
961   // Write the serialized proto
962   if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
963     return absl_ports::InternalError(
964         absl_ports::StrCat("Failed to write proto to: ", file_path_));
965   }
966 
967   // Update file size. The file should have grown by sizeof(Metadata) + size of
968   // the serialized proto.
969   file_size_ += sizeof(host_order_metadata) + final_size;
970   return current_position;
971 }
972 
973 template <typename ProtoT>
974 libtextclassifier3::StatusOr<ProtoT>
ReadProto(int64_t file_offset)975 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
976   ICING_ASSIGN_OR_RETURN(
977       int32_t metadata,
978       ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size_));
979 
980   // Copy out however many bytes it says the proto is
981   int stored_size = GetProtoSize(metadata);
982   file_offset += sizeof(metadata);
983 
984   // Read the compressed proto out.
985   if (file_offset + stored_size > file_size_) {
986     return absl_ports::OutOfRangeError(
987         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
988                                       "out of range of the file size, %lld",
989                                       static_cast<long long>(file_offset),
990                                       static_cast<long long>(file_size_ - 1)));
991   }
992   auto buf = std::make_unique<char[]>(stored_size);
993   if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
994     return absl_ports::InternalError("");
995   }
996 
997   if (IsEmptyBuffer(buf.get(), stored_size)) {
998     return absl_ports::NotFoundError("The proto data has been erased.");
999   }
1000 
1001   google::protobuf::io::ArrayInputStream proto_stream(buf.get(), stored_size);
1002 
1003   // Deserialize proto
1004   ProtoT proto;
1005   if (header_->GetCompressFlag()) {
1006     protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
1007     proto.ParseFromZeroCopyStream(&decompress_stream);
1008   } else {
1009     proto.ParseFromZeroCopyStream(&proto_stream);
1010   }
1011 
1012   return proto;
1013 }
1014 
1015 template <typename ProtoT>
EraseProto(int64_t file_offset)1016 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
1017     int64_t file_offset) {
1018   ICING_ASSIGN_OR_RETURN(
1019       int32_t metadata,
1020       ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size_));
1021   // Copy out however many bytes it says the proto is
1022   int stored_size = GetProtoSize(metadata);
1023   file_offset += sizeof(metadata);
1024   if (file_offset + stored_size > file_size_) {
1025     return absl_ports::OutOfRangeError(
1026         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
1027                                       "out of range of the file size, %lld",
1028                                       static_cast<long long>(file_offset),
1029                                       static_cast<long long>(file_size_ - 1)));
1030   }
1031   auto buf = std::make_unique<char[]>(stored_size);
1032 
1033   // We need to update the crc checksum if the erased area is before the
1034   // rewind position.
1035   int32_t new_crc;
1036   if (file_offset < header_->GetRewindOffset()) {
1037     // Set to "dirty" before we start writing anything.
1038     header_->SetDirtyFlag(true);
1039     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1040     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1041                              sizeof(Header))) {
1042       return absl_ports::InternalError(absl_ports::StrCat(
1043           "Failed to update dirty bit of header to: ", file_path_));
1044     }
1045 
1046     // We need to calculate [original string xor 0s].
1047     // The xored string is the same as the original string because 0 xor 0 =
1048     // 0, 1 xor 0 = 1.
1049     // Read the compressed proto out.
1050     if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
1051       return absl_ports::InternalError("");
1052     }
1053     const std::string_view xored_str(buf.get(), stored_size);
1054 
1055     Crc32 crc(header_->GetLogChecksum());
1056     ICING_ASSIGN_OR_RETURN(
1057         new_crc,
1058         crc.UpdateWithXor(xored_str,
1059                           /*full_data_size=*/header_->GetRewindOffset() -
1060                               kHeaderReservedBytes,
1061                           /*position=*/file_offset - kHeaderReservedBytes));
1062   }
1063 
1064   // Clear the region.
1065   memset(buf.get(), '\0', stored_size);
1066   if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
1067     return absl_ports::InternalError("");
1068   }
1069 
1070   // If we cleared something in our checksummed area, we should update our
1071   // checksum and reset our dirty bit.
1072   if (file_offset < header_->GetRewindOffset()) {
1073     header_->SetDirtyFlag(false);
1074     header_->SetLogChecksum(new_crc);
1075     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1076 
1077     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1078                              sizeof(Header))) {
1079       return absl_ports::InternalError(
1080           absl_ports::StrCat("Failed to update header to: ", file_path_));
1081     }
1082   }
1083 
1084   return libtextclassifier3::Status::OK;
1085 }
1086 
1087 template <typename ProtoT>
1088 libtextclassifier3::StatusOr<int64_t>
GetDiskUsage()1089 PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
1090   int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1091   if (size == Filesystem::kBadFileSize) {
1092     return absl_ports::InternalError("Failed to get disk usage of proto log");
1093   }
1094   return size;
1095 }
1096 
1097 template <typename ProtoT>
1098 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()1099 PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
1100   return file_size_ - kHeaderReservedBytes;
1101 }
1102 
1103 template <typename ProtoT>
Iterator(const Filesystem & filesystem,int fd,int64_t initial_offset,int64_t file_size)1104 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
1105     const Filesystem& filesystem, int fd, int64_t initial_offset,
1106     int64_t file_size)
1107     : filesystem_(&filesystem),
1108       initial_offset_(initial_offset),
1109       current_offset_(kInvalidOffset),
1110       file_size_(file_size),
1111       fd_(fd) {}
1112 
1113 template <typename ProtoT>
1114 libtextclassifier3::Status
Advance()1115 PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
1116   if (current_offset_ == kInvalidOffset) {
1117     // First Advance() call
1118     current_offset_ = initial_offset_;
1119   } else {
1120     // Jumps to the next proto position
1121     ICING_ASSIGN_OR_RETURN(
1122         int32_t metadata,
1123         ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
1124     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
1125   }
1126 
1127   if (current_offset_ < file_size_) {
1128     return libtextclassifier3::Status::OK;
1129   } else {
1130     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1131         "The next proto offset, %lld, is out of file range [0, %lld)",
1132         static_cast<long long>(current_offset_),
1133         static_cast<long long>(file_size_)));
1134   }
1135 }
1136 
1137 template <typename ProtoT>
GetOffset()1138 int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() const {
1139   return current_offset_;
1140 }
1141 
1142 template <typename ProtoT>
1143 typename PortableFileBackedProtoLog<ProtoT>::Iterator
GetIterator()1144 PortableFileBackedProtoLog<ProtoT>::GetIterator() const {
1145   return Iterator(*filesystem_, fd_.get(),
1146                   /*initial_offset=*/kHeaderReservedBytes, file_size_);
1147 }
1148 
1149 template <typename ProtoT>
1150 libtextclassifier3::StatusOr<int32_t>
ReadProtoMetadata(const Filesystem * const filesystem,int fd,int64_t file_offset,int64_t file_size)1151 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
1152     const Filesystem* const filesystem, int fd, int64_t file_offset,
1153     int64_t file_size) {
1154   // Checks file_offset
1155   if (file_offset >= file_size) {
1156     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1157         "offset, %lld, is out of file range [0, %lld)",
1158         static_cast<long long>(file_offset),
1159         static_cast<long long>(file_size)));
1160   }
1161   int32_t portable_metadata;
1162   int metadata_size = sizeof(portable_metadata);
1163   if (file_offset + metadata_size >= file_size) {
1164     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1165         "Wrong metadata offset %lld, metadata doesn't fit in "
1166         "with file range [0, %lld)",
1167         static_cast<long long>(file_offset),
1168         static_cast<long long>(file_size)));
1169   }
1170 
1171   if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
1172     return absl_ports::InternalError("");
1173   }
1174 
1175   // Need to switch it back to host order endianness after reading from disk.
1176   int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
1177 
1178   // Checks magic number
1179   uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
1180   if (stored_k_proto_magic != kProtoMagic) {
1181     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1182         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
1183         stored_k_proto_magic));
1184   }
1185 
1186   return host_order_metadata;
1187 }
1188 
1189 template <typename ProtoT>
1190 libtextclassifier3::Status
WriteProtoMetadata(const Filesystem * filesystem,int fd,int32_t host_order_metadata)1191 PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
1192     const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
1193   // Convert it into portable endian format before writing to disk
1194   int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
1195   int portable_metadata_size = sizeof(portable_metadata);
1196 
1197   // Write metadata
1198   if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
1199     return absl_ports::InternalError(
1200         absl_ports::StrCat("Failed to write proto metadata."));
1201   }
1202 
1203   return libtextclassifier3::Status::OK;
1204 }
1205 
1206 template <typename ProtoT>
PersistToDisk()1207 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
1208   if (file_size_ == header_->GetRewindOffset()) {
1209     // No new protos appended, don't need to update the checksum.
1210     return libtextclassifier3::Status::OK;
1211   }
1212 
1213   ICING_RETURN_IF_ERROR(UpdateChecksum());
1214   if (!filesystem_->DataSync(fd_.get())) {
1215     return absl_ports::InternalError(
1216         absl_ports::StrCat("Failed to sync data to disk: ", file_path_));
1217   }
1218 
1219   return libtextclassifier3::Status::OK;
1220 }
1221 
1222 template <typename ProtoT>
1223 libtextclassifier3::StatusOr<Crc32>
UpdateChecksum()1224 PortableFileBackedProtoLog<ProtoT>::UpdateChecksum() {
1225   if (file_size_ == header_->GetRewindOffset()) {
1226     return Crc32(header_->GetLogChecksum());
1227   }
1228   ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1229   header_->SetLogChecksum(crc.Get());
1230   header_->SetRewindOffset(file_size_);
1231   header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1232 
1233   if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1234                            sizeof(Header))) {
1235     return absl_ports::InternalError(
1236         absl_ports::StrCat("Failed to update header to: ", file_path_));
1237   }
1238   return crc;
1239 }
1240 
1241 template <typename ProtoT>
1242 libtextclassifier3::StatusOr<Crc32>
GetChecksum()1243 PortableFileBackedProtoLog<ProtoT>::GetChecksum() const {
1244   int64_t new_content_size = file_size_ - header_->GetRewindOffset();
1245   if (new_content_size == 0) {
1246     // No new protos appended, return cached checksum
1247     return Crc32(header_->GetLogChecksum());
1248   } else if (new_content_size < 0) {
1249     // File shrunk, recalculate the entire checksum.
1250     return GetPartialChecksum(filesystem_, file_path_, Crc32(),
1251                               /*start=*/kHeaderReservedBytes,
1252                               /*end=*/file_size_, file_size_);
1253   } else {
1254     // Append new changes to the existing checksum.
1255     return GetPartialChecksum(
1256         filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
1257         /*start=*/header_->GetRewindOffset(), /*end=*/file_size_, file_size_);
1258   }
1259 }
1260 
1261 }  // namespace lib
1262 }  // namespace icing
1263 
1264 #endif  // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
1265