• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a PortableFileBackedProtoLog of the same
18 // file at a time; using multiple instances at the same time may lead to
19 // undefined behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
27 // Each proto written to the file is preceded by 4 bytes of metadata. Each
28 // entry in the log consists of
29 //   {
30 //     1 byte of kProtoMagic;
31 //     3 bytes of the proto size;
32 //     n bytes of the proto itself
33 //   }
34 //
35 // All metadata is written in a portable format, encoded with htonl before
36 // writing to file and decoded with ntohl when reading from file.
37 //
38 // Example usage:
39 //   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
40 //       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
41 //       file_path_,
42 //                                                  options));
43 //   auto proto_log = std::move(create_result.proto_log);
44 //
45 //   Document document;
46 //   document.set_namespace("com.google.android.example");
47 //   document.set_uri("www.google.com");
48 //
49 //   int64_t document_offset = proto_log->WriteProto(document));
50 //   Document same_document = proto_log->ReadProto(document_offset));
51 //   proto_log->PersistToDisk();
52 
53 #ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
54 #define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
55 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
64 
65 #include "icing/text_classifier/lib3/utils/base/status.h"
66 #include "icing/text_classifier/lib3/utils/base/statusor.h"
67 #include "icing/absl_ports/canonical_errors.h"
68 #include "icing/absl_ports/str_cat.h"
69 #include "icing/file/filesystem.h"
70 #include "icing/file/memory-mapped-file.h"
71 #include "icing/legacy/core/icing-string-util.h"
72 #include "icing/portable/endian.h"
73 #include "icing/portable/gzip_stream.h"
74 #include "icing/portable/platform.h"
75 #include "icing/portable/zlib.h"
76 #include "icing/util/bit-util.h"
77 #include "icing/util/crc32.h"
78 #include "icing/util/data-loss.h"
79 #include "icing/util/logging.h"
80 #include "icing/util/status-macros.h"
81 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
82 
83 namespace icing {
84 namespace lib {
85 
86 template <typename ProtoT>
87 class PortableFileBackedProtoLog {
88  public:
89   struct Options {
90     // Whether to compress each proto before writing to the proto log.
91     bool compress;
92 
93     // Byte-size limit for each proto written to the store. This does not
94     // include the bytes needed for the metadata of each proto.
95     //
96     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
97     // size in 3 bytes within the metadata.
98     //
99     // NOTE: This limit is only enforced for future writes. If the store
100     // previously had a higher limit, then reading older entries could return
101     // larger protos.
102     //
103     // NOTE: The max_proto_size is the upper limit for input protos into the
104     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
105     // to a smaller size, ProtoLog will not accept it. Protos that result in a
106     // compressed size larger than max_proto_size are also not accepted.
107     const int32_t max_proto_size;
108 
109     // Level of compression if enabled, NO_COMPRESSION = 0, BEST_SPEED = 1,
110     // BEST_COMPRESSION = 9
111     const int32_t compression_level;
112 
113     // Must specify values for options.
114     Options() = delete;
115     explicit Options(
116         bool compress_in, const int32_t max_proto_size_in = kMaxProtoSize,
117         const int32_t compression_level_in = kDeflateCompressionLevel)
compressOptions118         : compress(compress_in),
119           max_proto_size(max_proto_size_in),
120           compression_level(compression_level_in) {}
121   };
122 
123   // Our internal max for protos.
124   //
125   // WARNING: Changing this to a larger number may invalidate our assumption
126   // that the proto size can safely be stored in the last 3 bytes of the proto
127   // header.
128   static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
129   static_assert(kMaxProtoSize <= 0x00FFFFFF,
130                 "kMaxProtoSize doesn't fit in 3 bytes");
131 
132   // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
133   static constexpr int kDeflateCompressionLevel = 3;
134 
135   // Number of bytes we reserve for the heading at the beginning of the proto
136   // log. We reserve this so the header can grow without running into the
137   // contents of the proto log, triggering an unnecessary migration of the data.
138   static constexpr int kHeaderReservedBytes = 256;
139 
140   // Header stored at the beginning of the file before the rest of the log
141   // contents. Stores metadata on the log.
142   class Header {
143    public:
144     static constexpr int32_t kMagic = 0xf4c6f67a;
145 
146     // We should go directly from 0 to 2 the next time we have to change the
147     // format.
148     static constexpr int32_t kFileFormatVersion = 0;
149 
CalculateHeaderChecksum()150     uint32_t CalculateHeaderChecksum() const {
151       Crc32 crc;
152 
153       // Get a string_view of all the fields of the Header, excluding the
154       // magic_nbytes_ and header_checksum_nbytes_
155       std::string_view header_str(
156           reinterpret_cast<const char*>(this) +
157               offsetof(Header, header_checksum_nbytes_) +
158               sizeof(header_checksum_nbytes_),
159           sizeof(Header) - sizeof(magic_nbytes_) -
160               sizeof(header_checksum_nbytes_));
161       crc.Append(header_str);
162       return crc.Get();
163     }
164 
GetMagic()165     int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
166 
SetMagic(int32_t magic_in)167     void SetMagic(int32_t magic_in) {
168       magic_nbytes_ = GHostToNetworkL(magic_in);
169     }
170 
GetFileFormatVersion()171     int32_t GetFileFormatVersion() const {
172       return GNetworkToHostL(file_format_version_nbytes_);
173     }
174 
SetFileFormatVersion(int32_t file_format_version_in)175     void SetFileFormatVersion(int32_t file_format_version_in) {
176       file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
177     }
178 
GetMaxProtoSize()179     int32_t GetMaxProtoSize() const {
180       return GNetworkToHostL(max_proto_size_nbytes_);
181     }
182 
SetMaxProtoSize(int32_t max_proto_size_in)183     void SetMaxProtoSize(int32_t max_proto_size_in) {
184       max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
185     }
186 
GetLogChecksum()187     int32_t GetLogChecksum() const {
188       return GNetworkToHostL(log_checksum_nbytes_);
189     }
190 
SetLogChecksum(int32_t log_checksum_in)191     void SetLogChecksum(int32_t log_checksum_in) {
192       log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
193     }
194 
GetRewindOffset()195     int64_t GetRewindOffset() const {
196       return GNetworkToHostLL(rewind_offset_nbytes_);
197     }
198 
SetRewindOffset(int64_t rewind_offset_in)199     void SetRewindOffset(int64_t rewind_offset_in) {
200       rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
201     }
202 
GetHeaderChecksum()203     int32_t GetHeaderChecksum() const {
204       return GNetworkToHostL(header_checksum_nbytes_);
205     }
206 
SetHeaderChecksum(int32_t header_checksum_in)207     void SetHeaderChecksum(int32_t header_checksum_in) {
208       header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
209     }
210 
GetCompressFlag()211     bool GetCompressFlag() const { return GetFlag(kCompressBit); }
212 
SetCompressFlag(bool compress)213     void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
214 
GetDirtyFlag()215     bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
216 
SetDirtyFlag(bool dirty)217     void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
218 
219    private:
220     // The least-significant bit offset at which the compress flag is stored in
221     // 'flags_nbytes_'. Represents whether the protos in the log are compressed
222     // or not.
223     static constexpr int32_t kCompressBit = 0;
224 
225     // The least-significant bit offset at which the dirty flag is stored in
226     // 'flags'. Represents whether the checksummed portion of the log has been
227     // modified after the last checksum was computed.
228     static constexpr int32_t kDirtyBit = 1;
229 
GetFlag(int offset)230     bool GetFlag(int offset) const {
231       return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
232     }
233 
SetFlag(int offset,bool value)234     void SetFlag(int offset, bool value) {
235       bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
236     }
237 
238     // Holds the magic as a quick sanity check against file corruption.
239     //
240     // Field is in network-byte order.
241     int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
242 
243     // Must be at the beginning after kMagic. Contains the crc checksum of
244     // the following fields.
245     //
246     // Field is in network-byte order.
247     uint32_t header_checksum_nbytes_ = 0;
248 
249     // Last known good offset at which the log and its checksum were updated.
250     // If we crash between writing to the log and updating the checksum, we can
251     // try to rewind the log to this offset and verify the checksum is still
252     // valid instead of throwing away the entire log.
253     //
254     // Field is in network-byte order.
255     int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
256 
257     // Version number tracking how we serialize the file to disk. If we change
258     // how/what we write to disk, this version should be updated and this class
259     // should handle a migration.
260     //
261     // Currently at kFileFormatVersion.
262     //
263     // Field is in network-byte order.
264     int32_t file_format_version_nbytes_ = 0;
265 
266     // The maximum proto size that can be written to the log.
267     //
268     // Field is in network-byte order.
269     int32_t max_proto_size_nbytes_ = 0;
270 
271     // Checksum of the log elements, doesn't include the header fields.
272     //
273     // Field is in network-byte order.
274     uint32_t log_checksum_nbytes_ = 0;
275 
276     // Bits are used to hold various flags.
277     //   Lowest bit is whether the protos are compressed or not.
278     //
279     // Field is only 1 byte, so is byte-order agnostic.
280     uint8_t flags_ = 0;
281 
282     // NOTE: New fields should *almost always* be added to the end here. Since
283     // this class may have already been written to disk, appending fields
284     // increases the chances that changes are backwards-compatible.
285   };
286   static_assert(sizeof(Header) <= kHeaderReservedBytes,
287                 "Header has grown past our reserved bytes!");
288 
289   struct CreateResult {
290     // A successfully initialized log.
291     std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
292 
293     // The data status after initializing from a previous state. Data loss can
294     // happen if the file is corrupted or some previously added data was
295     // unpersisted. This may be used to signal that any derived data off of the
296     // proto log may need to be regenerated.
297     DataLoss data_loss = DataLoss::NONE;
298 
299     // Whether the proto log had to recalculate the checksum to check its
300     // integrity. This can be avoided if no changes were made or the log was
301     // able to update its checksum before shutting down. But it may have to
302     // recalculate if it's unclear if we crashed after updating the log, but
303     // before updating our checksum.
304     bool recalculated_checksum = false;
305 
has_data_lossCreateResult306     bool has_data_loss() const {
307       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
308     }
309   };
310 
311   // Factory method to create, initialize, and return a
312   // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
313   //
314   // If on re-initialization the log detects disk corruption or some previously
315   // added data was unpersisted, the log will rewind to the last-good state. The
316   // log saves these checkpointed "good" states when PersistToDisk() is called
317   // or the log is safely destructed. If the log rewinds successfully to the
318   // last-good state, then the returned CreateResult.data_loss indicates
319   // whether it has a data loss and what kind of data loss it is (partial or
320   // complete) so that any derived data may know that it needs to be updated. If
321   // the log re-initializes successfully without any data loss,
322   // CreateResult.data_loss will be NONE.
323   //
324   // Params:
325   //   filesystem: Handles system level calls
326   //   file_path: Path of the underlying file. Directory of the file should
327   //   already exist
328   //   options: Configuration options for the proto log
329   //
330   // Returns:
331   //   PortableFileBackedProtoLog::CreateResult on success
332   //   INVALID_ARGUMENT on an invalid option
333   //   INTERNAL_ERROR on IO error
334   static libtextclassifier3::StatusOr<CreateResult> Create(
335       const Filesystem* filesystem, const std::string& file_path,
336       const Options& options);
337 
338   // Not copyable
339   PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
340   PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
341       delete;
342 
343   // This will update the checksum of the log as well.
344   ~PortableFileBackedProtoLog();
345 
346   // Writes the serialized proto to the underlying file. Writes are applied
347   // directly to the underlying file. Users do not need to sync the file after
348   // writing.
349   //
350   // Returns:
351   //   Offset of the newly appended proto in file on success
352   //   INVALID_ARGUMENT if proto is too large, as decided by
353   //     Options.max_proto_size
354   //   INTERNAL_ERROR on IO error
355   libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
356 
357   // Reads out a proto located at file_offset from the file.
358   //
359   // Returns:
360   //   A proto on success
361   //   NOT_FOUND if the proto at the given offset has been erased
362   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
363   //   INTERNAL_ERROR on IO error
364   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
365 
366   // Erases the data of a proto located at file_offset from the file.
367   //
368   // Returns:
369   //   OK on success
370   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
371   //   INTERNAL_ERROR on IO error
372   libtextclassifier3::Status EraseProto(int64_t file_offset);
373 
374   // Calculates and returns the disk usage in bytes. Rounds up to the nearest
375   // block size.
376   //
377   // Returns:
378   //   Disk usage on success
379   //   INTERNAL_ERROR on IO error
380   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
381 
382   // Returns the file size of all the elements held in the log. File size is in
383   // bytes. This excludes the size of any internal metadata of the log, e.g. the
384   // log's header.
385   //
386   // Returns:
387   //   File size on success
388   //   INTERNAL_ERROR on IO error
389   libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
390 
391   // An iterator helping to find offsets of all the protos in file.
392   // Example usage:
393   //
394   // while (iterator.Advance().ok()) {
395   //   int64_t offset = iterator.GetOffset();
396   //   // Do something
397   // }
398   class Iterator {
399    public:
400     Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
401 
402     // Advances to the position of next proto whether it has been erased or not.
403     //
404     // Returns:
405     //   OK on success
406     //   OUT_OF_RANGE_ERROR if it reaches the end
407     //   INTERNAL_ERROR on IO error
408     libtextclassifier3::Status Advance();
409 
410     // Returns the file offset of current proto.
411     int64_t GetOffset();
412 
413    private:
414     static constexpr int64_t kInvalidOffset = -1;
415     // filesystem_ is used to read proto metadata; initial_offset_ is the
416     // offset of the first proto.
417     const Filesystem* const filesystem_;
418     int64_t initial_offset_;
419     int64_t current_offset_;
420     int64_t file_size_;
421     int fd_;
422   };
423 
424   // Returns an iterator of current proto log. The caller needs to keep the
425   // proto log unchanged while using the iterator, otherwise unexpected
426   // behaviors could happen.
427   Iterator GetIterator();
428 
429   // Persists all changes since initialization or the last call to
430   // PersistToDisk(). Any changes that aren't persisted may be lost if the
431   // system fails to close safely.
432   //
433   // Example use case:
434   //
435   //   Document document;
436   //   document.set_namespace("com.google.android.example");
437   //   document.set_uri("www.google.com");
438   //
439   //   {
440   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
441   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
442   //         file_path,
443   //                                                    options));
444   //     auto proto_log = std::move(create_result.proto_log);
445   //
446   //     int64_t document_offset = proto_log->WriteProto(document));
447   //
448   //     // We lose the document here since it wasn't persisted.
449   //     // *SYSTEM CRASH*
450   //   }
451   //
452   //   {
453   //     // Can still successfully create after a crash since the log can
454   //     // rewind/truncate to recover into a previously good state
455   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
456   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
457   //         file_path,
458   //                                                    options));
459   //     auto proto_log = std::move(create_result.proto_log);
460   //
461   //     // Lost the proto since we didn't PersistToDisk before the crash
462   //     proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
463   //
464   //     int64_t document_offset = proto_log->WriteProto(document));
465   //
466   //     // Persisted this time, so we should be ok.
467   //     ICING_ASSERT_OK(proto_log->PersistToDisk());
468   //   }
469   //
470   //   {
471   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
472   //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
473   //         file_path,
474   //                                                    options));
475   //     auto proto_log = std::move(create_result.proto_log);
476   //
477   //     // SUCCESS
478   //     Document same_document = proto_log->ReadProto(document_offset));
479   //   }
480   //
481   // NOTE: Since all protos are already written to the file directly, this
482   // just updates the checksum and rewind position. Without these updates,
483   // future initializations will truncate the file and discard unpersisted
484   // changes.
485   //
486   // Returns:
487   //   OK on success
488   //   INTERNAL_ERROR on IO error
489   libtextclassifier3::Status PersistToDisk();
490 
491   // Calculates the checksum of the log contents. Excludes the header content.
492   //
493   // Returns:
494   //   Crc of the log content
495   //   INTERNAL_ERROR on IO error
496   libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
497 
498  private:
499   // Object can only be instantiated via the ::Create factory.
500   PortableFileBackedProtoLog(const Filesystem* filesystem,
501                              const std::string& file_path,
502                              std::unique_ptr<Header> header,
503                              int32_t compression_level);
504 
505   // Initializes a new proto log.
506   //
507   // Returns:
508   //   std::unique_ptr<CreateResult> on success
509   //   INTERNAL_ERROR on IO error
510   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
511       const Filesystem* filesystem, const std::string& file_path,
512       const Options& options);
513 
514   // Verifies that the existing proto log is in a good state. If not in a good
515   // state, then the proto log may be truncated to the last good state and
516   // content will be lost.
517   //
518   // Returns:
519   //   std::unique_ptr<CreateResult> on success
520   //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
521   //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
522   //     instances
523   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
524       const Filesystem* filesystem, const std::string& file_path,
525       const Options& options, int64_t file_size);
526 
527   // Takes an initial checksum and updates it with the content between `start`
528   // and `end` offsets in the file.
529   //
530   // Returns:
531   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
532   //   INTERNAL_ERROR on IO error
533   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
534   static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
535       const Filesystem* filesystem, const std::string& file_path,
536       Crc32 initial_crc, int64_t start, int64_t end);
537 
538   // Reads out the metadata of a proto located at file_offset from the fd.
539   // Metadata will be returned in host byte order endianness.
540   //
541   // Returns:
542   //   Proto's metadata on success
543   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
544   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
545   static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
546       const Filesystem* const filesystem, int fd, int64_t file_offset,
547       int64_t file_size);
548 
549   // Writes metadata of a proto to the fd. Takes in a host byte order endianness
550   // metadata and converts it into a portable metadata before writing.
551   //
552   // Returns:
553   //   OK on success
554   //   INTERNAL_ERROR on any IO errors
555   static libtextclassifier3::Status WriteProtoMetadata(
556       const Filesystem* filesystem, int fd, int32_t host_order_metadata);
557 
IsEmptyBuffer(const char * buffer,int size)558   static bool IsEmptyBuffer(const char* buffer, int size) {
559     return std::all_of(buffer, buffer + size,
560                        [](const char byte) { return byte == 0; });
561   }
562 
563   // Helper function to get stored proto size from the metadata.
564   // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)565   static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
566 
567   // Helper function to get stored proto magic from the metadata.
568   // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)569   static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
570 
571   // Magic number added in front of every proto. Used when reading out protos
572   // as a first check for corruption in each entry in the file. Even if there is
573   // a corruption, the best we can do is roll back to our last recovery point
574   // and throw away un-flushed data. We can discard/reuse this byte if needed so
575   // that we have 4 bytes to store the size of protos, and increase the size of
576   // protos we support.
577   static constexpr uint8_t kProtoMagic = 0x5C;
578 
579   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
580   // Only used on 32-bit devices
581   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
582 
583   ScopedFd fd_;
584   const Filesystem* const filesystem_;
585   const std::string file_path_;
586   std::unique_ptr<Header> header_;
587   const int32_t compression_level_;
588 };
589 
590 template <typename ProtoT>
PortableFileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header,int32_t compression_level)591 PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
592     const Filesystem* filesystem, const std::string& file_path,
593     std::unique_ptr<Header> header, int32_t compression_level)
594     : filesystem_(filesystem),
595       file_path_(file_path),
596       header_(std::move(header)),
597       compression_level_(compression_level) {
598   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
599 }
600 
601 template <typename ProtoT>
~PortableFileBackedProtoLog()602 PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
603   if (!PersistToDisk().ok()) {
604     ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
605                           "PortableFileBackedProtoLog: "
606                        << file_path_;
607   }
608 }
609 
610 template <typename ProtoT>
611 libtextclassifier3::StatusOr<
612     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)613 PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
614                                            const std::string& file_path,
615                                            const Options& options) {
616   if (options.max_proto_size <= 0) {
617     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
618         "options.max_proto_size must be greater than 0, was %d",
619         options.max_proto_size));
620   }
621 
622   // Since we store the proto_size in 3 bytes, we can only support protos of up
623   // to 16MiB.
624   if (options.max_proto_size > kMaxProtoSize) {
625     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
626         "options.max_proto_size must be under 16MiB, was %d",
627         options.max_proto_size));
628   }
629 
630   if (options.compression_level < 0 || options.compression_level > 9) {
631     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
632         "options.compression_level must be between 0 and 9 inclusive, was %d",
633         options.compression_level));
634   }
635 
636   if (!filesystem->FileExists(file_path.c_str())) {
637     return InitializeNewFile(filesystem, file_path, options);
638   }
639 
640   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
641   if (file_size == Filesystem::kBadFileSize) {
642     return absl_ports::InternalError(
643         absl_ports::StrCat("Bad file size '", file_path, "'"));
644   }
645 
646   if (file_size == 0) {
647     return InitializeNewFile(filesystem, file_path, options);
648   }
649 
650   return InitializeExistingFile(filesystem, file_path, options, file_size);
651 }
652 
653 template <typename ProtoT>
654 libtextclassifier3::StatusOr<
655     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)656 PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
657     const Filesystem* filesystem, const std::string& file_path,
658     const Options& options) {
659   // Grow to the minimum reserved bytes for the header.
660   if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
661     return absl_ports::InternalError(
662         absl_ports::StrCat("Failed to initialize file size: ", file_path));
663   }
664 
665   // Create the header
666   std::unique_ptr<Header> header = std::make_unique<Header>();
667   header->SetCompressFlag(options.compress);
668   header->SetMaxProtoSize(options.max_proto_size);
669   header->SetHeaderChecksum(header->CalculateHeaderChecksum());
670 
671   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
672     return absl_ports::InternalError(
673         absl_ports::StrCat("Failed to write header for file: ", file_path));
674   }
675 
676   CreateResult create_result = {
677       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
678           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
679                                                  std::move(header),
680                                                  options.compression_level)),
681       /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
682 
683   return create_result;
684 }
685 
686 template <typename ProtoT>
687 libtextclassifier3::StatusOr<
688     typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)689 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
690     const Filesystem* filesystem, const std::string& file_path,
691     const Options& options, int64_t file_size) {
692   bool header_changed = false;
693   if (file_size < kHeaderReservedBytes) {
694     return absl_ports::InternalError(
695         absl_ports::StrCat("File header too short for: ", file_path));
696   }
697 
698   std::unique_ptr<Header> header = std::make_unique<Header>();
699   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
700                          /*offset=*/0)) {
701     return absl_ports::InternalError(
702         absl_ports::StrCat("Failed to read header for file: ", file_path));
703   }
704 
705   // Make sure the header is still valid before we use any of its values. This
706   // is covered by the header_checksum check below, but this is a quick check
707   // that can save us from an extra crc computation.
708   if (header->GetMagic() != Header::kMagic) {
709     return absl_ports::InternalError(
710         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
711   }
712 
713   if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
714     return absl_ports::InternalError(
715         absl_ports::StrCat("Invalid header checksum for: ", file_path));
716   }
717 
718   if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
719     // If this changes, we might need to handle a migration rather than throwing
720     // an error.
721     return absl_ports::InternalError(
722         absl_ports::StrCat("Invalid header file format version: ", file_path));
723   }
724 
725   if (header->GetCompressFlag() != options.compress) {
726     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
727         "Inconsistent compress option, expected %d, actual %d",
728         header->GetCompressFlag(), options.compress));
729   }
730 
731   int32_t existing_max_proto_size = header->GetMaxProtoSize();
732   if (existing_max_proto_size > options.max_proto_size) {
733     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
734         "Max proto size cannot be smaller than previous "
735         "instantiations, previous size %d, wanted size %d",
736         header->GetMaxProtoSize(), options.max_proto_size));
737   } else if (existing_max_proto_size < options.max_proto_size) {
738     // It's fine if our new max size is greater than our previous one. Existing
739     // data is still valid.
740     header->SetMaxProtoSize(options.max_proto_size);
741     header_changed = true;
742   }
743 
744   DataLoss data_loss = DataLoss::NONE;
745 
746   // If we have any documents in our tail, get rid of them since they're not in
747   // our checksum. Our checksum reflects content up to the rewind offset.
748   if (file_size > header->GetRewindOffset()) {
749     if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
750       return absl_ports::InternalError(IcingStringUtil::StringPrintf(
751           "Failed to truncate '%s' to size %lld", file_path.data(),
752           static_cast<long long>(header->GetRewindOffset())));
753     }
754     data_loss = DataLoss::PARTIAL;
755   }
756 
757   bool recalculated_checksum = false;
758 
759   // If our dirty flag is set, that means we might have crashed in the middle of
760   // erasing a proto. This could have happened anywhere between:
761   //   A. Set dirty flag to true and update header checksum
762   //   B. Erase the proto
763   //   C. Set dirty flag to false, update log checksum, update header checksum
764   //
765   // Scenario 1: We went down between A and B. Maybe our dirty flag is a
766   // false alarm and we can keep all our data.
767   //
768   // Scenario 2: We went down between B and C. Our data is compromised and
769   // we need to throw everything out.
770   if (header->GetDirtyFlag()) {
771     // Recompute the log's checksum to detect which scenario we're in.
772     ICING_ASSIGN_OR_RETURN(
773         Crc32 calculated_log_checksum,
774         ComputeChecksum(filesystem, file_path, Crc32(),
775                         /*start=*/kHeaderReservedBytes, /*end=*/file_size));
776 
777     if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
778       // Still doesn't match, we're in Scenario 2. Throw out all our data now
779       // and initialize as a new instance.
780       ICING_ASSIGN_OR_RETURN(CreateResult create_result,
781                              InitializeNewFile(filesystem, file_path, options));
782       create_result.data_loss = DataLoss::COMPLETE;
783       create_result.recalculated_checksum = true;
784       return create_result;
785     }
786     // Otherwise we're good, checksum matches our contents so continue
787     // initializing like normal.
788     recalculated_checksum = true;
789 
790     // Update our header.
791     header->SetDirtyFlag(false);
792     header_changed = true;
793   }
794 
795   if (header_changed) {
796     header->SetHeaderChecksum(header->CalculateHeaderChecksum());
797 
798     if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
799                             sizeof(Header))) {
800       return absl_ports::InternalError(
801           absl_ports::StrCat("Failed to update header to: ", file_path));
802     }
803   }
804 
805   CreateResult create_result = {
806       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
807           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
808                                                  std::move(header),
809                                                  options.compression_level)),
810       data_loss, recalculated_checksum};
811 
812   return create_result;
813 }
814 
815 template <typename ProtoT>
816 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)817 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
818     const Filesystem* filesystem, const std::string& file_path,
819     Crc32 initial_crc, int64_t start, int64_t end) {
820   ICING_ASSIGN_OR_RETURN(
821       MemoryMappedFile mmapped_file,
822       MemoryMappedFile::Create(*filesystem, file_path,
823                                MemoryMappedFile::Strategy::READ_ONLY));
824   Crc32 new_crc(initial_crc.Get());
825 
826   if (start < 0) {
827     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
828         "Starting checksum offset of file '%s' must be greater than 0, was "
829         "%lld",
830         file_path.c_str(), static_cast<long long>(start)));
831   }
832 
833   if (end < start) {
834     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
835         "Ending checksum offset of file '%s' must be greater than start "
836         "'%lld', was '%lld'",
837         file_path.c_str(), static_cast<long long>(start),
838         static_cast<long long>(end)));
839   }
840 
841   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
842   if (end > file_size) {
843     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
844         "Ending checksum offset of file '%s' must be within "
845         "file size of %lld, was %lld",
846         file_path.c_str(), static_cast<long long>(file_size),
847         static_cast<long long>(end)));
848   }
849 
850   Architecture architecture = GetArchitecture();
851   switch (architecture) {
852     case Architecture::BIT_64: {
853       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
854       // devices where mmap/munmap calls need the mmap write semaphore, which
855       // blocks mmap/munmap/mprotect and all page faults from executing while
856       // they run. On 64-bit devices, this doesn't actually load into memory, it
857       // just makes the file faultable. So the whole file should be ok.
858       // b/185822878.
859       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
860       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
861       new_crc.Append(mmap_str);
862       break;
863     }
864     case Architecture::BIT_32:
865       [[fallthrough]];
866     case Architecture::UNKNOWN: {
867       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
868       // much memory at once. If we're unknown, then also chunk it because we're
869       // not sure what the device can handle.
870       for (int i = start; i < end; i += kMmapChunkSize) {
871         // Don't read past the file size.
872         int next_chunk_size = kMmapChunkSize;
873         if ((i + kMmapChunkSize) >= end) {
874           next_chunk_size = end - i;
875         }
876 
877         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
878 
879         auto mmap_str =
880             std::string_view(mmapped_file.region(), next_chunk_size);
881         new_crc.Append(mmap_str);
882       }
883       break;
884     }
885   }
886 
887   return new_crc;
888 }
889 
890 template <typename ProtoT>
891 libtextclassifier3::StatusOr<int64_t>
WriteProto(const ProtoT & proto)892 PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
893   int64_t proto_size = proto.ByteSizeLong();
894   int32_t host_order_metadata;
895   int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
896 
897   if (proto_size > header_->GetMaxProtoSize()) {
898     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
899         "proto_size, %lld, was too large to write. Max is %d",
900         static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
901   }
902 
903   // At this point, we've guaranteed that proto_size is under kMaxProtoSize
904   // (see
905   // ::Create), so we can safely store it in an int.
906   int final_size = 0;
907 
908   std::string proto_str;
909   google::protobuf::io::StringOutputStream proto_stream(&proto_str);
910 
911   if (header_->GetCompressFlag()) {
912     protobuf_ports::GzipOutputStream::Options options;
913     options.format = protobuf_ports::GzipOutputStream::ZLIB;
914     options.compression_level = compression_level_;
915 
916     protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
917 
918     bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
919                    compressing_stream.Close();
920 
921     if (!success) {
922       return absl_ports::InternalError("Error compressing proto.");
923     }
924 
925     final_size = proto_str.size();
926 
927     // In case the compressed proto is larger than the original proto, we also
928     // can't write it.
929     if (final_size > header_->GetMaxProtoSize()) {
930       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
931           "Compressed proto size, %d, was greater than "
932           "max_proto_size, %d",
933           final_size, header_->GetMaxProtoSize()));
934     }
935   } else {
936     // Serialize the proto directly into the write buffer at an offset of the
937     // metadata.
938     proto.SerializeToZeroCopyStream(&proto_stream);
939     final_size = proto_str.size();
940   }
941 
942   // 1st byte for magic, next 3 bytes for proto size.
943   host_order_metadata = (kProtoMagic << 24) | final_size;
944 
945   // Actually write metadata, has to be done after we know the possibly
946   // compressed proto size
947   ICING_RETURN_IF_ERROR(
948       WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
949 
950   // Write the serialized proto
951   if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
952     return absl_ports::InternalError(
953         absl_ports::StrCat("Failed to write proto to: ", file_path_));
954   }
955 
956   return current_position;
957 }
958 
959 template <typename ProtoT>
960 libtextclassifier3::StatusOr<ProtoT>
ReadProto(int64_t file_offset)961 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
962   int64_t file_size = filesystem_->GetFileSize(fd_.get());
963   // Read out the metadata
964   if (file_size == Filesystem::kBadFileSize) {
965     return absl_ports::OutOfRangeError("Unable to correctly read size.");
966   }
967   ICING_ASSIGN_OR_RETURN(
968       int32_t metadata,
969       ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
970 
971   // Copy out however many bytes it says the proto is
972   int stored_size = GetProtoSize(metadata);
973   file_offset += sizeof(metadata);
974 
975   // Read the compressed proto out.
976   if (file_offset + stored_size > file_size) {
977     return absl_ports::OutOfRangeError(
978         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
979                                       "out of range of the file size, %lld",
980                                       static_cast<long long>(file_offset),
981                                       static_cast<long long>(file_size - 1)));
982   }
983   auto buf = std::make_unique<char[]>(stored_size);
984   if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
985     return absl_ports::InternalError("");
986   }
987 
988   if (IsEmptyBuffer(buf.get(), stored_size)) {
989     return absl_ports::NotFoundError("The proto data has been erased.");
990   }
991 
992   google::protobuf::io::ArrayInputStream proto_stream(buf.get(), stored_size);
993 
994   // Deserialize proto
995   ProtoT proto;
996   if (header_->GetCompressFlag()) {
997     protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
998     proto.ParseFromZeroCopyStream(&decompress_stream);
999   } else {
1000     proto.ParseFromZeroCopyStream(&proto_stream);
1001   }
1002 
1003   return proto;
1004 }
1005 
1006 template <typename ProtoT>
EraseProto(int64_t file_offset)1007 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
1008     int64_t file_offset) {
1009   int64_t file_size = filesystem_->GetFileSize(fd_.get());
1010   if (file_size == Filesystem::kBadFileSize) {
1011     return absl_ports::OutOfRangeError("Unable to correctly read size.");
1012   }
1013 
1014   ICING_ASSIGN_OR_RETURN(
1015       int32_t metadata,
1016       ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
1017   // Copy out however many bytes it says the proto is
1018   int stored_size = GetProtoSize(metadata);
1019   file_offset += sizeof(metadata);
1020   if (file_offset + stored_size > file_size) {
1021     return absl_ports::OutOfRangeError(
1022         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
1023                                       "out of range of the file size, %lld",
1024                                       static_cast<long long>(file_offset),
1025                                       static_cast<long long>(file_size - 1)));
1026   }
1027   auto buf = std::make_unique<char[]>(stored_size);
1028 
1029   // We need to update the crc checksum if the erased area is before the
1030   // rewind position.
1031   int32_t new_crc;
1032   if (file_offset < header_->GetRewindOffset()) {
1033     // Set to "dirty" before we start writing anything.
1034     header_->SetDirtyFlag(true);
1035     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1036     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1037                              sizeof(Header))) {
1038       return absl_ports::InternalError(absl_ports::StrCat(
1039           "Failed to update dirty bit of header to: ", file_path_));
1040     }
1041 
1042     // We need to calculate [original string xor 0s].
1043     // The xored string is the same as the original string because 0 xor 0 =
1044     // 0, 1 xor 0 = 1.
1045     // Read the compressed proto out.
1046     if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
1047       return absl_ports::InternalError("");
1048     }
1049     const std::string_view xored_str(buf.get(), stored_size);
1050 
1051     Crc32 crc(header_->GetLogChecksum());
1052     ICING_ASSIGN_OR_RETURN(
1053         new_crc,
1054         crc.UpdateWithXor(xored_str,
1055                           /*full_data_size=*/header_->GetRewindOffset() -
1056                               kHeaderReservedBytes,
1057                           /*position=*/file_offset - kHeaderReservedBytes));
1058   }
1059 
1060   // Clear the region.
1061   memset(buf.get(), '\0', stored_size);
1062   if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
1063     return absl_ports::InternalError("");
1064   }
1065 
1066   // If we cleared something in our checksummed area, we should update our
1067   // checksum and reset our dirty bit.
1068   if (file_offset < header_->GetRewindOffset()) {
1069     header_->SetDirtyFlag(false);
1070     header_->SetLogChecksum(new_crc);
1071     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1072 
1073     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1074                              sizeof(Header))) {
1075       return absl_ports::InternalError(
1076           absl_ports::StrCat("Failed to update header to: ", file_path_));
1077     }
1078   }
1079 
1080   return libtextclassifier3::Status::OK;
1081 }
1082 
1083 template <typename ProtoT>
1084 libtextclassifier3::StatusOr<int64_t>
GetDiskUsage()1085 PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
1086   int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1087   if (size == Filesystem::kBadFileSize) {
1088     return absl_ports::InternalError("Failed to get disk usage of proto log");
1089   }
1090   return size;
1091 }
1092 
1093 template <typename ProtoT>
1094 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()1095 PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
1096   int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
1097   if (total_file_size == Filesystem::kBadFileSize) {
1098     return absl_ports::InternalError(
1099         "Failed to get file size of elments in the proto log");
1100   }
1101   return total_file_size - kHeaderReservedBytes;
1102 }
1103 
1104 template <typename ProtoT>
Iterator(const Filesystem & filesystem,int fd,int64_t initial_offset)1105 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
1106     const Filesystem& filesystem, int fd, int64_t initial_offset)
1107     : filesystem_(&filesystem),
1108       initial_offset_(initial_offset),
1109       current_offset_(kInvalidOffset),
1110       fd_(fd) {
1111   file_size_ = filesystem_->GetFileSize(fd_);
1112   if (file_size_ == Filesystem::kBadFileSize) {
1113     // Fails all Advance() calls
1114     file_size_ = 0;
1115   }
1116 }
1117 
1118 template <typename ProtoT>
1119 libtextclassifier3::Status
Advance()1120 PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
1121   if (current_offset_ == kInvalidOffset) {
1122     // First Advance() call
1123     current_offset_ = initial_offset_;
1124   } else {
1125     // Jumps to the next proto position
1126     ICING_ASSIGN_OR_RETURN(
1127         int32_t metadata,
1128         ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
1129     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
1130   }
1131 
1132   if (current_offset_ < file_size_) {
1133     return libtextclassifier3::Status::OK;
1134   } else {
1135     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1136         "The next proto offset, %lld, is out of file range [0, %lld)",
1137         static_cast<long long>(current_offset_),
1138         static_cast<long long>(file_size_)));
1139   }
1140 }
1141 
1142 template <typename ProtoT>
GetOffset()1143 int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
1144   return current_offset_;
1145 }
1146 
1147 template <typename ProtoT>
1148 typename PortableFileBackedProtoLog<ProtoT>::Iterator
GetIterator()1149 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
1150   return Iterator(*filesystem_, fd_.get(),
1151                   /*initial_offset=*/kHeaderReservedBytes);
1152 }
1153 
1154 template <typename ProtoT>
1155 libtextclassifier3::StatusOr<int32_t>
ReadProtoMetadata(const Filesystem * const filesystem,int fd,int64_t file_offset,int64_t file_size)1156 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
1157     const Filesystem* const filesystem, int fd, int64_t file_offset,
1158     int64_t file_size) {
1159   // Checks file_offset
1160   if (file_offset >= file_size) {
1161     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1162         "offset, %lld, is out of file range [0, %lld)",
1163         static_cast<long long>(file_offset),
1164         static_cast<long long>(file_size)));
1165   }
1166   int32_t portable_metadata;
1167   int metadata_size = sizeof(portable_metadata);
1168   if (file_offset + metadata_size >= file_size) {
1169     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1170         "Wrong metadata offset %lld, metadata doesn't fit in "
1171         "with file range [0, %lld)",
1172         static_cast<long long>(file_offset),
1173         static_cast<long long>(file_size)));
1174   }
1175 
1176   if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
1177     return absl_ports::InternalError("");
1178   }
1179 
1180   // Need to switch it back to host order endianness after reading from disk.
1181   int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
1182 
1183   // Checks magic number
1184   uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
1185   if (stored_k_proto_magic != kProtoMagic) {
1186     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1187         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
1188         stored_k_proto_magic));
1189   }
1190 
1191   return host_order_metadata;
1192 }
1193 
1194 template <typename ProtoT>
1195 libtextclassifier3::Status
WriteProtoMetadata(const Filesystem * filesystem,int fd,int32_t host_order_metadata)1196 PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
1197     const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
1198   // Convert it into portable endian format before writing to disk
1199   int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
1200   int portable_metadata_size = sizeof(portable_metadata);
1201 
1202   // Write metadata
1203   if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
1204     return absl_ports::InternalError(
1205         absl_ports::StrCat("Failed to write proto metadata."));
1206   }
1207 
1208   return libtextclassifier3::Status::OK;
1209 }
1210 
1211 template <typename ProtoT>
PersistToDisk()1212 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
1213   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
1214   if (file_size == header_->GetRewindOffset()) {
1215     // No new protos appended, don't need to update the checksum.
1216     return libtextclassifier3::Status::OK;
1217   }
1218 
1219   ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
1220 
1221   header_->SetLogChecksum(crc.Get());
1222   header_->SetRewindOffset(file_size);
1223   header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1224 
1225   if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1226                            sizeof(Header)) ||
1227       !filesystem_->DataSync(fd_.get())) {
1228     return absl_ports::InternalError(
1229         absl_ports::StrCat("Failed to update header to: ", file_path_));
1230   }
1231 
1232   return libtextclassifier3::Status::OK;
1233 }
1234 
1235 template <typename ProtoT>
1236 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum()1237 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
1238   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
1239   int64_t new_content_size = file_size - header_->GetRewindOffset();
1240   Crc32 crc;
1241   if (new_content_size == 0) {
1242     // No new protos appended, return cached checksum
1243     return Crc32(header_->GetLogChecksum());
1244   } else if (new_content_size < 0) {
1245     // File shrunk, recalculate the entire checksum.
1246     ICING_ASSIGN_OR_RETURN(
1247         crc,
1248         ComputeChecksum(filesystem_, file_path_, Crc32(),
1249                         /*start=*/kHeaderReservedBytes, /*end=*/file_size));
1250   } else {
1251     // Append new changes to the existing checksum.
1252     ICING_ASSIGN_OR_RETURN(
1253         crc, ComputeChecksum(
1254                  filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
1255                  /*start=*/header_->GetRewindOffset(), /*end=*/file_size));
1256   }
1257   return crc;
1258 }
1259 
1260 }  // namespace lib
1261 }  // namespace icing
1262 
1263 #endif  // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
1264