• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // The implementation in this file is deprecated and replaced by
18 // portable-file-backed-proto-log.h.
19 //
20 // This deprecated implementation has been made read-only for the purposes of
21 // migration; writing and erasing this format of log is no longer supported and
22 // the methods to accomplish this have been removed.
23 //
24 // The details of this format follow below:
25 // Each proto written to the file will have a metadata written just before it.
26 // The metadata consists of
27 //   {
28 //     1 byte of kProtoMagic;
29 //     3 bytes of the proto size
30 //     n bytes of the proto itself
31 //   }
32 // TODO(b/136514769): Add versioning to the header and an UpgradeToVersion
33 // migration method.
34 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
35 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
36 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
41 
42 #include "icing/text_classifier/lib3/utils/base/statusor.h"
43 #include "icing/absl_ports/canonical_errors.h"
44 #include "icing/absl_ports/str_cat.h"
45 #include "icing/file/filesystem.h"
46 #include "icing/file/memory-mapped-file.h"
47 #include "icing/legacy/core/icing-string-util.h"
48 #include "icing/portable/gzip_stream.h"
49 #include "icing/portable/platform.h"
50 #include "icing/portable/zlib.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/data-loss.h"
53 #include "icing/util/logging.h"
54 #include "icing/util/status-macros.h"
55 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
56 
57 namespace icing {
58 namespace lib {
59 
60 template <typename ProtoT>
61 class FileBackedProtoLog {
62  public:
63   struct Options {
64     // Whether to compress each proto before writing to the proto log.
65     bool compress;
66 
67     // Byte-size limit for each proto written to the store. This does not
68     // include the bytes needed for the metadata of each proto.
69     //
70     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
71     // size in 3 bytes within the metadata.
72     //
73     // NOTE: This limit is only enforced for future writes. If the store
74     // previously had a higher limit, then reading older entries could return
75     // larger protos.
76     //
77     // NOTE: The max_proto_size is the upper limit for input protos into the
78     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
79     // to a smaller size, ProtoLog will not accept it. Protos that result in a
80     // compressed size larger than max_proto_size are also not accepted.
81     const int32_t max_proto_size;
82 
83     // Must specify values for options.
84     Options() = delete;
85     explicit Options(bool compress_in,
86                      const int32_t max_proto_size_in = kMaxProtoSize)
compressOptions87         : compress(compress_in), max_proto_size(max_proto_size_in) {}
88   };
89 
90   // Header stored at the beginning of the file before the rest of the log
91   // contents. Stores metadata on the log.
92   struct Header {
93     static constexpr int32_t kMagic = 0xf4c6f67a;
94 
95     // Holds the magic as a quick sanity check against file corruption.
96     int32_t magic = kMagic;
97 
98     // Whether to compress the protos before writing to the log.
99     bool compress = true;
100 
101     // The maximum proto size that can be written to the log.
102     int32_t max_proto_size = 0;
103 
104     // Checksum of the log elements, doesn't include the header fields.
105     uint32_t log_checksum = 0;
106 
107     // Last known good offset at which the log and its checksum were updated.
108     // If we crash between writing to the log and updating the checksum, we can
109     // try to rewind the log to this offset and verify the checksum is still
110     // valid instead of throwing away the entire log.
111     int64_t rewind_offset = sizeof(Header);
112 
113     // Must be at the end. Contains the crc checksum of the preceding fields.
114     uint32_t header_checksum = 0;
115 
CalculateHeaderChecksumHeader116     uint32_t CalculateHeaderChecksum() const {
117       Crc32 crc;
118       std::string_view header_str(reinterpret_cast<const char*>(this),
119                                   offsetof(Header, header_checksum));
120       crc.Append(header_str);
121       return crc.Get();
122     }
123   };
124 
125   struct CreateResult {
126     // A successfully initialized log.
127     std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
128 
129     // The data status after initializing from a previous state. Data loss can
130     // happen if the file is corrupted or some previously added data was
131     // unpersisted. This may be used to signal that any derived data off of the
132     // proto log may need to be regenerated.
133     DataLoss data_loss;
134 
has_data_lossCreateResult135     bool has_data_loss() {
136       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
137     }
138   };
139 
140   // Factory method to create, initialize, and return a FileBackedProtoLog. Will
141   // create the file if it doesn't exist.
142   //
143   // If on re-initialization the log detects disk corruption or some previously
144   // added data was unpersisted, the log will rewind to the last-good state. The
145   // log saves these checkpointed "good" states when PersistToDisk() is called
146   // or the log is safely destructed. If the log rewinds successfully to the
147   // last-good state, then the returned CreateResult.data_loss indicates
148   // whether it has a data loss and what kind of data loss it is (partial or
149   // complete) so that any derived data may know that it needs to be updated. If
150   // the log re-initializes successfully without any data loss,
151   // CreateResult.data_loss will be NONE.
152   //
153   // Params:
154   //   filesystem: Handles system level calls
155   //   file_path: Path of the underlying file. Directory of the file should
156   //   already exist
157   //   options: Configuration options for the proto log
158   //
159   // Returns:
160   //   FileBackedProtoLog::CreateResult on success
161   //   INVALID_ARGUMENT on an invalid option
162   //   INTERNAL_ERROR on IO error
163   static libtextclassifier3::StatusOr<CreateResult> Create(
164       const Filesystem* filesystem, const std::string& file_path,
165       const Options& options);
166 
167   // Not copyable
168   FileBackedProtoLog(const FileBackedProtoLog&) = delete;
169   FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
170 
171   // Reads out a proto located at file_offset from the file.
172   //
173   // Returns:
174   //   A proto on success
175   //   NOT_FOUND if the proto at the given offset has been erased
176   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
177   //   INTERNAL_ERROR on IO error
178   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
179 
180   // An iterator helping to find offsets of all the protos in file.
181   // Example usage:
182   //
183   // while (iterator.Advance().ok()) {
184   //   int64_t offset = iterator.GetOffset();
185   //   // Do something
186   // }
187   class Iterator {
188    public:
189     explicit Iterator(const Filesystem& filesystem,
190                       const std::string& file_path, int64_t initial_offset,
191                       MemoryMappedFile&& mmapped_file);
192 
193     // Advances to the position of next proto whether it has been erased or not.
194     //
195     // Returns:
196     //   OK on success
197     //   OUT_OF_RANGE_ERROR if it reaches the end
198     //   INTERNAL_ERROR on IO error
199     libtextclassifier3::Status Advance();
200 
201     // Returns the file offset of current proto.
202     int64_t GetOffset();
203 
204    private:
205     static constexpr int64_t kInvalidOffset = -1;
206     // Used to read proto metadata
207     MemoryMappedFile mmapped_file_;
208     // Offset of first proto
209     int64_t initial_offset_;
210     int64_t current_offset_;
211     int64_t file_size_;
212   };
213 
214   // Returns an iterator of current proto log. The caller needs to keep the
215   // proto log unchanged while using the iterator, otherwise unexpected
216   // behaviors could happen.
217   libtextclassifier3::StatusOr<Iterator> GetIterator();
218 
219  private:
220   // Object can only be instantiated via the ::Create factory.
221   FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
222                      std::unique_ptr<Header> header);
223 
224   // Initializes a new proto log.
225   //
226   // Returns:
227   //   std::unique_ptr<CreateResult> on success
228   //   INTERNAL_ERROR on IO error
229   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
230       const Filesystem* filesystem, const std::string& file_path,
231       const Options& options);
232 
233   // Verifies that the existing proto log is in a good state. If not in a good
234   // state, then the proto log may be truncated to the last good state and
235   // content will be lost.
236   //
237   // Returns:
238   //   std::unique_ptr<CreateResult> on success
239   //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
240   //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
241   //     instances
242   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
243       const Filesystem* filesystem, const std::string& file_path,
244       const Options& options, int64_t file_size);
245 
246   // Takes an initial checksum and updates it with the content between `start`
247   // and `end` offsets in the file.
248   //
249   // Returns:
250   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
251   //   INTERNAL_ERROR on IO error
252   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
253   static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
254       const Filesystem* filesystem, const std::string& file_path,
255       Crc32 initial_crc, int64_t start, int64_t end);
256 
IsEmptyBuffer(const char * buffer,int size)257   static bool IsEmptyBuffer(const char* buffer, int size) {
258     return std::all_of(buffer, buffer + size,
259                        [](const char byte) { return byte == 0; });
260   }
261 
262   // Helper function to get stored proto size from the metadata.
263   // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)264   static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
265 
266   // Helper function to get stored proto magic from the metadata.
267   // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)268   static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
269 
270   // Reads out the metadata of a proto located at file_offset from the file.
271   //
272   // Returns:
273   //   Proto's metadata on success
274   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
275   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
276   static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
277       MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
278 
279   // Magic number added in front of every proto. Used when reading out protos
280   // as a first check for corruption in each entry in the file. Even if there is
281   // a corruption, the best we can do is roll back to our last recovery point
282   // and throw away un-flushed data. We can discard/reuse this byte if needed so
283   // that we have 4 bytes to store the size of protos, and increase the size of
284   // protos we support.
285   static constexpr uint8_t kProtoMagic = 0x5C;
286 
287   // Our internal max for protos.
288   //
289   // WARNING: Changing this to a larger number may invalidate our assumption
290   // that that proto size can safely be stored in the last 3 bytes of the proto
291   // header.
292   static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
293   static_assert(kMaxProtoSize <= 0x00FFFFFF,
294                 "kMaxProtoSize doesn't fit in 3 bytes");
295 
296   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
297   // Only used on 32-bit devices
298   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
299 
300   ScopedFd fd_;
301   const Filesystem* const filesystem_;
302   const std::string file_path_;
303   std::unique_ptr<Header> header_;
304 };
305 
306 template <typename ProtoT>
FileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)307 FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
308                                                const std::string& file_path,
309                                                std::unique_ptr<Header> header)
310     : filesystem_(filesystem),
311       file_path_(file_path),
312       header_(std::move(header)) {
313   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
314 }
315 
316 template <typename ProtoT>
317 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)318 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
319                                    const std::string& file_path,
320                                    const Options& options) {
321   if (options.max_proto_size <= 0) {
322     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
323         "options.max_proto_size must be greater than 0, was %d",
324         options.max_proto_size));
325   }
326 
327   // Since we store the proto_size in 3 bytes, we can only support protos of up
328   // to 16MiB.
329   if (options.max_proto_size > kMaxProtoSize) {
330     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
331         "options.max_proto_size must be under 16MiB, was %d",
332         options.max_proto_size));
333   }
334 
335   if (!filesystem->FileExists(file_path.c_str())) {
336     return InitializeNewFile(filesystem, file_path, options);
337   }
338 
339   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
340   if (file_size == Filesystem::kBadFileSize) {
341     return absl_ports::InternalError(
342         absl_ports::StrCat("Bad file size '", file_path, "'"));
343   }
344 
345   if (file_size == 0) {
346     return InitializeNewFile(filesystem, file_path, options);
347   }
348 
349   return InitializeExistingFile(filesystem, file_path, options, file_size);
350 }
351 
352 template <typename ProtoT>
353 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)354 FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
355                                               const std::string& file_path,
356                                               const Options& options) {
357   // Create the header
358   std::unique_ptr<Header> header = std::make_unique<Header>();
359   header->compress = options.compress;
360   header->max_proto_size = options.max_proto_size;
361   header->header_checksum = header->CalculateHeaderChecksum();
362 
363   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
364     return absl_ports::InternalError(
365         absl_ports::StrCat("Failed to write header for file: ", file_path));
366   }
367 
368   CreateResult create_result = {
369       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
370           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
371                                          std::move(header))),
372       /*data_loss=*/DataLoss::NONE};
373 
374   return create_result;
375 }
376 
377 template <typename ProtoT>
378 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)379 FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
380                                                    const std::string& file_path,
381                                                    const Options& options,
382                                                    int64_t file_size) {
383   if (file_size < sizeof(Header)) {
384     return absl_ports::InternalError(
385         absl_ports::StrCat("File header too short for: ", file_path));
386   }
387 
388   std::unique_ptr<Header> header = std::make_unique<Header>();
389   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
390                          /*offset=*/0)) {
391     return absl_ports::InternalError(
392         absl_ports::StrCat("Failed to read header for file: ", file_path));
393   }
394 
395   // Make sure the header is still valid before we use any of its values. This
396   // is covered by the header_checksum check below, but this is a quick check
397   // that can save us from an extra crc computation.
398   if (header->magic != Header::kMagic) {
399     return absl_ports::InternalError(
400         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
401   }
402 
403   if (header->header_checksum != header->CalculateHeaderChecksum()) {
404     return absl_ports::InternalError(
405         absl_ports::StrCat("Invalid header checksum for: ", file_path));
406   }
407 
408   if (header->compress != options.compress) {
409     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
410         "Inconsistent compress option, expected %d, actual %d",
411         header->compress, options.compress));
412   }
413 
414   if (header->max_proto_size > options.max_proto_size) {
415     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
416         "Max proto size cannot be smaller than previous "
417         "instantiations, previous size %d, wanted size %d",
418         header->max_proto_size, options.max_proto_size));
419   }
420   header->max_proto_size = options.max_proto_size;
421 
422   DataLoss data_loss = DataLoss::NONE;
423   ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
424                          ComputeChecksum(filesystem, file_path, Crc32(),
425                                          sizeof(Header), file_size));
426 
427   // Double check that the log checksum is the same as the one that was
428   // persisted last time. If not, we start recovery logic.
429   if (header->log_checksum != calculated_log_checksum.Get()) {
430     // Need to rewind the proto log since the checksums don't match.
431     // Worst case, we have to rewind the entire log back to just the header
432     int64_t last_known_good = sizeof(Header);
433 
434     // Calculate the checksum of the log contents just up to the last rewind
435     // offset point. This will be valid if we just appended contents to the log
436     // without updating the checksum, and we can rewind back to this point
437     // safely.
438     ICING_ASSIGN_OR_RETURN(
439         calculated_log_checksum,
440         ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
441                         header->rewind_offset));
442     if (header->log_checksum == calculated_log_checksum.Get()) {
443       // Check if it matches our last rewind state. If so, this becomes our last
444       // good state and we can safely truncate and recover from here.
445       last_known_good = header->rewind_offset;
446       data_loss = DataLoss::PARTIAL;
447     } else {
448       // Otherwise, we're going to truncate the entire log and this resets the
449       // checksum to an empty log state.
450       header->log_checksum = 0;
451       data_loss = DataLoss::COMPLETE;
452     }
453 
454     if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
455       return absl_ports::InternalError(
456           absl_ports::StrCat("Error truncating file: ", file_path));
457     }
458 
459     ICING_LOG(WARNING) << "Truncated '" << file_path << "' to size "
460                        << last_known_good;
461   }
462 
463   CreateResult create_result = {
464       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
465           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
466                                          std::move(header))),
467       data_loss};
468 
469   return create_result;
470 }
471 
472 template <typename ProtoT>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)473 libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
474     const Filesystem* filesystem, const std::string& file_path,
475     Crc32 initial_crc, int64_t start, int64_t end) {
476   ICING_ASSIGN_OR_RETURN(
477       MemoryMappedFile mmapped_file,
478       MemoryMappedFile::Create(*filesystem, file_path,
479                                MemoryMappedFile::Strategy::READ_ONLY));
480   Crc32 new_crc(initial_crc.Get());
481 
482   if (start < 0) {
483     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
484         "Starting checksum offset of file '%s' must be greater than 0, was "
485         "%lld",
486         file_path.c_str(), static_cast<long long>(start)));
487   }
488 
489   if (end < start) {
490     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
491         "Ending checksum offset of file '%s' must be greater than start "
492         "'%lld', was '%lld'",
493         file_path.c_str(), static_cast<long long>(start),
494         static_cast<long long>(end)));
495   }
496 
497   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
498   if (end > file_size) {
499     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
500         "Ending checksum offset of file '%s' must be within "
501         "file size of %lld, was %lld",
502         file_path.c_str(), static_cast<long long>(file_size),
503         static_cast<long long>(end)));
504   }
505 
506   Architecture architecture = GetArchitecture();
507   switch (architecture) {
508     case Architecture::BIT_64: {
509       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
510       // devices where mmap/munmap calls need the mmap write semaphore, which
511       // blocks mmap/munmap/mprotect and all page faults from executing while
512       // they run. On 64-bit devices, this doesn't actually load into memory, it
513       // just makes the file faultable. So the whole file should be ok.
514       // b/185822878.
515       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
516       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
517       new_crc.Append(mmap_str);
518       break;
519     }
520     case Architecture::BIT_32:
521       [[fallthrough]];
522     case Architecture::UNKNOWN: {
523       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
524       // much memory at once. If we're unknown, then also chunk it because we're
525       // not sure what the device can handle.
526       for (int i = start; i < end; i += kMmapChunkSize) {
527         // Don't read past the file size.
528         int next_chunk_size = kMmapChunkSize;
529         if ((i + kMmapChunkSize) >= end) {
530           next_chunk_size = end - i;
531         }
532 
533         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
534 
535         auto mmap_str =
536             std::string_view(mmapped_file.region(), next_chunk_size);
537         new_crc.Append(mmap_str);
538       }
539       break;
540     }
541   }
542 
543   return new_crc;
544 }
545 
546 template <typename ProtoT>
ReadProto(int64_t file_offset)547 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
548     int64_t file_offset) const {
549   int64_t file_size = filesystem_->GetFileSize(fd_.get());
550   ICING_ASSIGN_OR_RETURN(
551       MemoryMappedFile mmapped_file,
552       MemoryMappedFile::Create(*filesystem_, file_path_,
553                                MemoryMappedFile::Strategy::READ_ONLY));
554   if (file_offset >= file_size) {
555     // file_size points to the next byte to write at, so subtract one to get
556     // the inclusive, actual size of file.
557     return absl_ports::OutOfRangeError(
558         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
559                                       "out of range of the file size, %lld",
560                                       static_cast<long long>(file_offset),
561                                       static_cast<long long>(file_size - 1)));
562   }
563 
564   // Read out the metadata
565   ICING_ASSIGN_OR_RETURN(
566       int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
567 
568   // Copy out however many bytes it says the proto is
569   int stored_size = GetProtoSize(metadata);
570 
571   ICING_RETURN_IF_ERROR(
572       mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
573 
574   if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
575     return absl_ports::NotFoundError("The proto data has been erased.");
576   }
577 
578   google::protobuf::io::ArrayInputStream proto_stream(mmapped_file.mutable_region(),
579                                             stored_size);
580 
581   // Deserialize proto
582   ProtoT proto;
583   if (header_->compress) {
584     protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
585     proto.ParseFromZeroCopyStream(&decompress_stream);
586   } else {
587     proto.ParseFromZeroCopyStream(&proto_stream);
588   }
589 
590   return proto;
591 }
592 
593 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset,MemoryMappedFile && mmapped_file)594 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
595                                                const std::string& file_path,
596                                                int64_t initial_offset,
597                                                MemoryMappedFile&& mmapped_file)
598     : mmapped_file_(std::move(mmapped_file)),
599       initial_offset_(initial_offset),
600       current_offset_(kInvalidOffset),
601       file_size_(filesystem.GetFileSize(file_path.c_str())) {
602   if (file_size_ == Filesystem::kBadFileSize) {
603     // Fails all Advance() calls
604     file_size_ = 0;
605   }
606 }
607 
608 template <typename ProtoT>
Advance()609 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
610   if (current_offset_ == kInvalidOffset) {
611     // First Advance() call
612     current_offset_ = initial_offset_;
613   } else {
614     // Jumps to the next proto position
615     ICING_ASSIGN_OR_RETURN(
616         int metadata,
617         ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
618     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
619   }
620 
621   if (current_offset_ < file_size_) {
622     return libtextclassifier3::Status::OK;
623   } else {
624     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
625         "The next proto offset, %lld, is out of file range [0, %lld)",
626         static_cast<long long>(current_offset_),
627         static_cast<long long>(file_size_)));
628   }
629 }
630 
631 template <typename ProtoT>
GetOffset()632 int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
633   return current_offset_;
634 }
635 
636 template <typename ProtoT>
637 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::Iterator>
GetIterator()638 FileBackedProtoLog<ProtoT>::GetIterator() {
639   ICING_ASSIGN_OR_RETURN(
640       MemoryMappedFile mmapped_file,
641       MemoryMappedFile::Create(*filesystem_, file_path_,
642                                MemoryMappedFile::Strategy::READ_ONLY));
643   return Iterator(*filesystem_, file_path_,
644                   /*initial_offset=*/sizeof(Header), std::move(mmapped_file));
645 }
646 
647 template <typename ProtoT>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)648 libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
649     MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
650   // Checks file_offset
651   if (file_offset >= file_size) {
652     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
653         "offset, %lld, is out of file range [0, %lld)",
654         static_cast<long long>(file_offset),
655         static_cast<long long>(file_size)));
656   }
657   int metadata;
658   int metadata_size = sizeof(metadata);
659   if (file_offset + metadata_size >= file_size) {
660     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
661         "Wrong metadata offset %lld, metadata doesn't fit in "
662         "with file range [0, %lld)",
663         static_cast<long long>(file_offset),
664         static_cast<long long>(file_size)));
665   }
666   // Reads metadata
667   ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
668   memcpy(&metadata, mmapped_file->region(), metadata_size);
669   // Checks magic number
670   uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
671   if (stored_k_proto_magic != kProtoMagic) {
672     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
673         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
674         stored_k_proto_magic));
675   }
676   return metadata;
677 }
678 
679 }  // namespace lib
680 }  // namespace icing
681 
682 #endif  // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
683