• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a FileBackedProtoLog of the same file at
18 // a time; using multiple instances at the same time may lead to undefined
19 // behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
27 // Each proto written to the file will have a metadata written just before it.
28 // The metadata consists of
29 //   {
//     1 byte of kProtoMagic;
31 //     3 bytes of the proto size
32 //     n bytes of the proto itself
33 //   }
34 //
// Example usage:
//   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
//       FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
//                                                  options));
//   auto proto_log = std::move(create_result.proto_log);
//
//   Document document;
//   document.set_namespace("com.google.android.example");
//   document.set_uri("www.google.com");
//
//   ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                              proto_log->WriteProto(document));
//   ICING_ASSERT_OK_AND_ASSIGN(Document same_document,
//                              proto_log->ReadProto(document_offset));
//   ICING_ASSERT_OK(proto_log->PersistToDisk());
48 //
49 // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
50 // migration method.
51 
52 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
53 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
54 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
79 
80 namespace icing {
81 namespace lib {
82 
83 template <typename ProtoT>
84 class FileBackedProtoLog {
85  public:
86   struct Options {
87     // Whether to compress each proto before writing to the proto log.
88     bool compress;
89 
90     // Byte-size limit for each proto written to the store. This does not
91     // include the bytes needed for the metadata of each proto.
92     //
93     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
94     // size in 3 bytes within the metadata.
95     //
96     // NOTE: This limit is only enforced for future writes. If the store
97     // previously had a higher limit, then reading older entries could return
98     // larger protos.
99     //
100     // NOTE: The max_proto_size is the upper limit for input protos into the
101     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
102     // to a smaller size, ProtoLog will not accept it. Protos that result in a
103     // compressed size larger than max_proto_size are also not accepted.
104     const int32_t max_proto_size;
105 
106     // Must specify values for options.
107     Options() = delete;
108     explicit Options(bool compress_in,
109                      const int32_t max_proto_size_in = kMaxProtoSize)
compressOptions110         : compress(compress_in), max_proto_size(max_proto_size_in) {}
111   };
112 
113   // Header stored at the beginning of the file before the rest of the log
114   // contents. Stores metadata on the log.
115   //
116   // TODO(b/139375388): Migrate the Header struct to a proto. This makes
117   // migrations easier since we don't need to worry about different size padding
118   // (which would affect the checksum) and different endians.
119   struct Header {
120     static constexpr int32_t kMagic = 0xf4c6f67a;
121 
122     // Holds the magic as a quick sanity check against file corruption.
123     int32_t magic = kMagic;
124 
125     // Whether to compress the protos before writing to the log.
126     bool compress = true;
127 
128     // The maximum proto size that can be written to the log.
129     int32_t max_proto_size = 0;
130 
131     // Checksum of the log elements, doesn't include the header fields.
132     uint32_t log_checksum = 0;
133 
134     // Last known good offset at which the log and its checksum were updated.
135     // If we crash between writing to the log and updating the checksum, we can
136     // try to rewind the log to this offset and verify the checksum is still
137     // valid instead of throwing away the entire log.
138     int64_t rewind_offset = sizeof(Header);
139 
140     // Must be at the end. Contains the crc checksum of the preceding fields.
141     uint32_t header_checksum = 0;
142 
CalculateHeaderChecksumHeader143     uint32_t CalculateHeaderChecksum() const {
144       Crc32 crc;
145       std::string_view header_str(reinterpret_cast<const char*>(this),
146                                   offsetof(Header, header_checksum));
147       crc.Append(header_str);
148       return crc.Get();
149     }
150   };
151 
152   struct CreateResult {
153     // A successfully initialized log.
154     std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
155 
156     // The data status after initializing from a previous state. Data loss can
157     // happen if the file is corrupted or some previously added data was
158     // unpersisted. This may be used to signal that any derived data off of the
159     // proto log may need to be regenerated.
160     DataLoss data_loss;
161 
has_data_lossCreateResult162     bool has_data_loss() {
163       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
164     }
165   };
166 
167   // Factory method to create, initialize, and return a FileBackedProtoLog. Will
168   // create the file if it doesn't exist.
169   //
170   // If on re-initialization the log detects disk corruption or some previously
171   // added data was unpersisted, the log will rewind to the last-good state. The
172   // log saves these checkpointed "good" states when PersistToDisk() is called
173   // or the log is safely destructed. If the log rewinds successfully to the
174   // last-good state, then the returned CreateResult.data_loss indicates
175   // whether it has a data loss and what kind of data loss it is (partial or
176   // complete) so that any derived data may know that it needs to be updated. If
177   // the log re-initializes successfully without any data loss,
178   // CreateResult.data_loss will be NONE.
179   //
180   // Params:
181   //   filesystem: Handles system level calls
182   //   file_path: Path of the underlying file. Directory of the file should
183   //   already exist
184   //   options: Configuration options for the proto log
185   //
186   // Returns:
187   //   FileBackedProtoLog::CreateResult on success
188   //   INVALID_ARGUMENT on an invalid option
189   //   INTERNAL_ERROR on IO error
190   static libtextclassifier3::StatusOr<CreateResult> Create(
191       const Filesystem* filesystem, const std::string& file_path,
192       const Options& options);
193 
194   // Not copyable
195   FileBackedProtoLog(const FileBackedProtoLog&) = delete;
196   FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
197 
198   // This will update the checksum of the log as well.
199   ~FileBackedProtoLog();
200 
201   // Writes the serialized proto to the underlying file. Writes are applied
202   // directly to the underlying file. Users do not need to sync the file after
203   // writing.
204   //
205   // Returns:
206   //   Offset of the newly appended proto in file on success
207   //   INVALID_ARGUMENT if proto is too large, as decided by
208   //     Options.max_proto_size
209   //   INTERNAL_ERROR on IO error
210   libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
211 
212   // Reads out a proto located at file_offset from the file.
213   //
214   // Returns:
215   //   A proto on success
216   //   NOT_FOUND if the proto at the given offset has been erased
217   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
218   //   INTERNAL_ERROR on IO error
219   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
220 
221   // Erases the data of a proto located at file_offset from the file.
222   //
223   // Returns:
224   //   OK on success
225   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
226   //   INTERNAL_ERROR on IO error
227   libtextclassifier3::Status EraseProto(int64_t file_offset);
228 
229   // Calculates and returns the disk usage in bytes. Rounds up to the nearest
230   // block size.
231   //
232   // Returns:
233   //   Disk usage on success
234   //   INTERNAL_ERROR on IO error
235   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
236 
237   // Returns the file size of all the elements held in the log. File size is in
238   // bytes. This excludes the size of any internal metadata of the log, e.g. the
239   // log's header.
240   //
241   // Returns:
242   //   File size on success
243   //   INTERNAL_ERROR on IO error
244   libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
245 
246   // An iterator helping to find offsets of all the protos in file.
247   // Example usage:
248   //
249   // while (iterator.Advance().ok()) {
250   //   int64_t offset = iterator.GetOffset();
251   //   // Do something
252   // }
253   class Iterator {
254    public:
255     Iterator(const Filesystem& filesystem, const std::string& file_path,
256              int64_t initial_offset);
257 
258     // Advances to the position of next proto whether it has been erased or not.
259     //
260     // Returns:
261     //   OK on success
262     //   OUT_OF_RANGE_ERROR if it reaches the end
263     //   INTERNAL_ERROR on IO error
264     libtextclassifier3::Status Advance();
265 
266     // Returns the file offset of current proto.
267     int64_t GetOffset();
268 
269    private:
270     static constexpr int64_t kInvalidOffset = -1;
271     // Used to read proto metadata
272     MemoryMappedFile mmapped_file_;
273     // Offset of first proto
274     int64_t initial_offset_;
275     int64_t current_offset_;
276     int64_t file_size_;
277   };
278 
279   // Returns an iterator of current proto log. The caller needs to keep the
280   // proto log unchanged while using the iterator, otherwise unexpected
281   // behaviors could happen.
282   Iterator GetIterator();
283 
284   // Persists all changes since initialization or the last call to
285   // PersistToDisk(). Any changes that aren't persisted may be lost if the
286   // system fails to close safely.
287   //
288   // Example use case:
289   //
290   //   Document document;
291   //   document.set_namespace("com.google.android.example");
292   //   document.set_uri("www.google.com");
293   //
294   //   {
295   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
296   //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
297   //                                                    options));
298   //     auto proto_log = std::move(create_result.proto_log);
299   //
300   //     int64_t document_offset = proto_log->WriteProto(document));
301   //
302   //     // We lose the document here since it wasn't persisted.
303   //     // *SYSTEM CRASH*
304   //   }
305   //
306   //   {
307   //     // Can still successfully create after a crash since the log can
308   //     // rewind/truncate to recover into a previously good state
309   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
310   //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
311   //                                                    options));
312   //     auto proto_log = std::move(create_result.proto_log);
313   //
314   //     // Lost the proto since we didn't PersistToDisk before the crash
315   //     proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
316   //
317   //     int64_t document_offset = proto_log->WriteProto(document));
318   //
319   //     // Persisted this time, so we should be ok.
320   //     ICING_ASSERT_OK(proto_log->PersistToDisk());
321   //   }
322   //
323   //   {
324   //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
325   //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
326   //                                                    options));
327   //     auto proto_log = std::move(create_result.proto_log);
328   //
329   //     // SUCCESS
330   //     Document same_document = proto_log->ReadProto(document_offset));
331   //   }
332   //
333   // NOTE: Since all protos are already written to the file directly, this
334   // just updates the checksum and rewind position. Without these updates,
335   // future initializations will truncate the file and discard unpersisted
336   // changes.
337   //
338   // Returns:
339   //   OK on success
340   //   INTERNAL_ERROR on IO error
341   libtextclassifier3::Status PersistToDisk();
342 
343   // Calculates the checksum of the log contents. Excludes the header content.
344   //
345   // Returns:
346   //   Crc of the log content
347   //   INTERNAL_ERROR on IO error
348   libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
349 
350  private:
351   // Object can only be instantiated via the ::Create factory.
352   FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
353                      std::unique_ptr<Header> header);
354 
355   // Initializes a new proto log.
356   //
357   // Returns:
358   //   std::unique_ptr<CreateResult> on success
359   //   INTERNAL_ERROR on IO error
360   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
361       const Filesystem* filesystem, const std::string& file_path,
362       const Options& options);
363 
364   // Verifies that the existing proto log is in a good state. If not in a good
365   // state, then the proto log may be truncated to the last good state and
366   // content will be lost.
367   //
368   // Returns:
369   //   std::unique_ptr<CreateResult> on success
370   //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
371   //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
372   //     instances
373   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
374       const Filesystem* filesystem, const std::string& file_path,
375       const Options& options, int64_t file_size);
376 
377   // Takes an initial checksum and updates it with the content between `start`
378   // and `end` offsets in the file.
379   //
380   // Returns:
381   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
382   //   INTERNAL_ERROR on IO error
383   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
384   static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
385       const Filesystem* filesystem, const std::string& file_path,
386       Crc32 initial_crc, int64_t start, int64_t end);
387 
IsEmptyBuffer(const char * buffer,int size)388   static bool IsEmptyBuffer(const char* buffer, int size) {
389     return std::all_of(buffer, buffer + size,
390                        [](const char byte) { return byte == 0; });
391   }
392 
393   // Helper function to get stored proto size from the metadata.
394   // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)395   static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
396 
397   // Helper function to get stored proto magic from the metadata.
398   // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)399   static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
400 
401   // Reads out the metadata of a proto located at file_offset from the file.
402   //
403   // Returns:
404   //   Proto's metadata on success
405   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
406   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
407   static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
408       MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
409 
410   // Magic number added in front of every proto. Used when reading out protos
411   // as a first check for corruption in each entry in the file. Even if there is
412   // a corruption, the best we can do is roll back to our last recovery point
413   // and throw away un-flushed data. We can discard/reuse this byte if needed so
414   // that we have 4 bytes to store the size of protos, and increase the size of
415   // protos we support.
416   static constexpr uint8_t kProtoMagic = 0x5C;
417 
418   // Our internal max for protos.
419   //
420   // WARNING: Changing this to a larger number may invalidate our assumption
421   // that that proto size can safely be stored in the last 3 bytes of the proto
422   // header.
423   static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
424   static_assert(kMaxProtoSize <= 0x00FFFFFF,
425                 "kMaxProtoSize doesn't fit in 3 bytes");
426 
427   // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
428   static constexpr int kDeflateCompressionLevel = 3;
429 
430   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
431   // Only used on 32-bit devices
432   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
433 
434   ScopedFd fd_;
435   const Filesystem* const filesystem_;
436   const std::string file_path_;
437   std::unique_ptr<Header> header_;
438 };
439 
// Out-of-class definition of the static constexpr member, giving it storage
// if it is ever odr-used. Required before C++17, where constexpr static data
// members are not implicitly inline; redundant (but harmless) from C++17 on.
template <typename ProtoT>
constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic;
442 
443 template <typename ProtoT>
FileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)444 FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
445                                                const std::string& file_path,
446                                                std::unique_ptr<Header> header)
447     : filesystem_(filesystem),
448       file_path_(file_path),
449       header_(std::move(header)) {
450   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
451 }
452 
453 template <typename ProtoT>
~FileBackedProtoLog()454 FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
455   if (!PersistToDisk().ok()) {
456     ICING_LOG(WARNING)
457         << "Error persisting to disk during destruction of FileBackedProtoLog: "
458         << file_path_;
459   }
460 }
461 
462 template <typename ProtoT>
463 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)464 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
465                                    const std::string& file_path,
466                                    const Options& options) {
467   if (options.max_proto_size <= 0) {
468     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
469         "options.max_proto_size must be greater than 0, was %d",
470         options.max_proto_size));
471   }
472 
473   // Since we store the proto_size in 3 bytes, we can only support protos of up
474   // to 16MiB.
475   if (options.max_proto_size > kMaxProtoSize) {
476     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
477         "options.max_proto_size must be under 16MiB, was %d",
478         options.max_proto_size));
479   }
480 
481   if (!filesystem->FileExists(file_path.c_str())) {
482     return InitializeNewFile(filesystem, file_path, options);
483   }
484 
485   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
486   if (file_size == Filesystem::kBadFileSize) {
487     return absl_ports::InternalError(
488         absl_ports::StrCat("Bad file size '", file_path, "'"));
489   }
490 
491   if (file_size == 0) {
492     return InitializeNewFile(filesystem, file_path, options);
493   }
494 
495   return InitializeExistingFile(filesystem, file_path, options, file_size);
496 }
497 
498 template <typename ProtoT>
499 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)500 FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
501                                               const std::string& file_path,
502                                               const Options& options) {
503   // Create the header
504   std::unique_ptr<Header> header = std::make_unique<Header>();
505   header->compress = options.compress;
506   header->max_proto_size = options.max_proto_size;
507   header->header_checksum = header->CalculateHeaderChecksum();
508 
509   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
510     return absl_ports::InternalError(
511         absl_ports::StrCat("Failed to write header for file: ", file_path));
512   }
513 
514   CreateResult create_result = {
515       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
516           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
517                                          std::move(header))),
518       /*data_loss=*/DataLoss::NONE};
519 
520   return create_result;
521 }
522 
523 template <typename ProtoT>
524 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)525 FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
526                                                    const std::string& file_path,
527                                                    const Options& options,
528                                                    int64_t file_size) {
529   if (file_size < sizeof(Header)) {
530     return absl_ports::InternalError(
531         absl_ports::StrCat("File header too short for: ", file_path));
532   }
533 
534   std::unique_ptr<Header> header = std::make_unique<Header>();
535   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
536                          /*offset=*/0)) {
537     return absl_ports::InternalError(
538         absl_ports::StrCat("Failed to read header for file: ", file_path));
539   }
540 
541   // Make sure the header is still valid before we use any of its values. This
542   // is covered by the header_checksum check below, but this is a quick check
543   // that can save us from an extra crc computation.
544   if (header->magic != Header::kMagic) {
545     return absl_ports::InternalError(
546         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
547   }
548 
549   if (header->header_checksum != header->CalculateHeaderChecksum()) {
550     return absl_ports::InternalError(
551         absl_ports::StrCat("Invalid header checksum for: ", file_path));
552   }
553 
554   if (header->compress != options.compress) {
555     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
556         "Inconsistent compress option, expected %d, actual %d",
557         header->compress, options.compress));
558   }
559 
560   if (header->max_proto_size > options.max_proto_size) {
561     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
562         "Max proto size cannot be smaller than previous "
563         "instantiations, previous size %d, wanted size %d",
564         header->max_proto_size, options.max_proto_size));
565   }
566   header->max_proto_size = options.max_proto_size;
567 
568   DataLoss data_loss = DataLoss::NONE;
569   ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
570                          ComputeChecksum(filesystem, file_path, Crc32(),
571                                          sizeof(Header), file_size));
572 
573   // Double check that the log checksum is the same as the one that was
574   // persisted last time. If not, we start recovery logic.
575   if (header->log_checksum != calculated_log_checksum.Get()) {
576     // Need to rewind the proto log since the checksums don't match.
577     // Worst case, we have to rewind the entire log back to just the header
578     int64_t last_known_good = sizeof(Header);
579 
580     // Calculate the checksum of the log contents just up to the last rewind
581     // offset point. This will be valid if we just appended contents to the log
582     // without updating the checksum, and we can rewind back to this point
583     // safely.
584     ICING_ASSIGN_OR_RETURN(
585         calculated_log_checksum,
586         ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
587                         header->rewind_offset));
588     if (header->log_checksum == calculated_log_checksum.Get()) {
589       // Check if it matches our last rewind state. If so, this becomes our last
590       // good state and we can safely truncate and recover from here.
591       last_known_good = header->rewind_offset;
592       data_loss = DataLoss::PARTIAL;
593     } else {
594       // Otherwise, we're going to truncate the entire log and this resets the
595       // checksum to an empty log state.
596       header->log_checksum = 0;
597       data_loss = DataLoss::COMPLETE;
598     }
599 
600     if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
601       return absl_ports::InternalError(
602           absl_ports::StrCat("Error truncating file: ", file_path));
603     }
604 
605     ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
606                     << last_known_good;
607   }
608 
609   CreateResult create_result = {
610       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
611           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
612                                          std::move(header))),
613       data_loss};
614 
615   return create_result;
616 }
617 
618 template <typename ProtoT>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)619 libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
620     const Filesystem* filesystem, const std::string& file_path,
621     Crc32 initial_crc, int64_t start, int64_t end) {
622   auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
623                                        MemoryMappedFile::Strategy::READ_ONLY);
624   Crc32 new_crc(initial_crc.Get());
625 
626   if (start < 0) {
627     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
628         "Starting checksum offset of file '%s' must be greater than 0, was "
629         "%lld",
630         file_path.c_str(), static_cast<long long>(start)));
631   }
632 
633   if (end < start) {
634     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
635         "Ending checksum offset of file '%s' must be greater than start "
636         "'%lld', was '%lld'",
637         file_path.c_str(), static_cast<long long>(start),
638         static_cast<long long>(end)));
639   }
640 
641   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
642   if (end > file_size) {
643     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
644         "Ending checksum offset of file '%s' must be within "
645         "file size of %lld, was %lld",
646         file_path.c_str(), static_cast<long long>(file_size),
647         static_cast<long long>(end)));
648   }
649 
650   Architecture architecture = GetArchitecture();
651   switch (architecture) {
652     case Architecture::BIT_64: {
653       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
654       // devices where mmap/munmap calls need the mmap write semaphore, which
655       // blocks mmap/munmap/mprotect and all page faults from executing while
656       // they run. On 64-bit devices, this doesn't actually load into memory, it
657       // just makes the file faultable. So the whole file should be ok.
658       // b/185822878.
659       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
660       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
661       new_crc.Append(mmap_str);
662       break;
663     }
664     case Architecture::BIT_32:
665       [[fallthrough]];
666     case Architecture::UNKNOWN: {
667       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
668       // much memory at once. If we're unknown, then also chunk it because we're
669       // not sure what the device can handle.
670       for (int i = start; i < end; i += kMmapChunkSize) {
671         // Don't read past the file size.
672         int next_chunk_size = kMmapChunkSize;
673         if ((i + kMmapChunkSize) >= end) {
674           next_chunk_size = end - i;
675         }
676 
677         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
678 
679         auto mmap_str =
680             std::string_view(mmapped_file.region(), next_chunk_size);
681         new_crc.Append(mmap_str);
682       }
683       break;
684     }
685   }
686 
687   return new_crc;
688 }
689 
690 template <typename ProtoT>
WriteProto(const ProtoT & proto)691 libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
692     const ProtoT& proto) {
693   int64_t proto_size = proto.ByteSizeLong();
694   int32_t metadata;
695   int metadata_size = sizeof(metadata);
696   int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
697 
698   if (proto_size > header_->max_proto_size) {
699     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
700         "proto_size, %lld, was too large to write. Max is %d",
701         static_cast<long long>(proto_size), header_->max_proto_size));
702   }
703 
704   // At this point, we've guaranteed that proto_size is under kMaxProtoSize
705   // (see
706   // ::Create), so we can safely store it in an int.
707   int final_size = 0;
708 
709   std::string proto_str;
710   google::protobuf::io::StringOutputStream proto_stream(&proto_str);
711 
712   if (header_->compress) {
713     google::protobuf::io::GzipOutputStream::Options options;
714     options.format = google::protobuf::io::GzipOutputStream::ZLIB;
715     options.compression_level = kDeflateCompressionLevel;
716 
717     google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
718                                                                   options);
719 
720     bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
721                    compressing_stream.Close();
722 
723     if (!success) {
724       return absl_ports::InternalError("Error compressing proto.");
725     }
726 
727     final_size = proto_str.size();
728 
729     // In case the compressed proto is larger than the original proto, we also
730     // can't write it.
731     if (final_size > header_->max_proto_size) {
732       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
733           "Compressed proto size, %d, was greater than "
734           "max_proto_size, %d",
735           final_size, header_->max_proto_size));
736     }
737   } else {
738     // Serialize the proto directly into the write buffer at an offset of the
739     // metadata.
740     proto.SerializeToZeroCopyStream(&proto_stream);
741     final_size = proto_str.size();
742   }
743 
744   // 1st byte for magic, next 3 bytes for proto size.
745   metadata = (kProtoMagic << 24) | final_size;
746 
747   // Actually write metadata, has to be done after we know the possibly
748   // compressed proto size
749   if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
750     return absl_ports::InternalError(
751         absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
752   }
753 
754   // Write the serialized proto
755   if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
756     return absl_ports::InternalError(
757         absl_ports::StrCat("Failed to write proto to: ", file_path_));
758   }
759 
760   return current_position;
761 }
762 
763 template <typename ProtoT>
ReadProto(int64_t file_offset)764 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
765     int64_t file_offset) const {
766   int64_t file_size = filesystem_->GetFileSize(fd_.get());
767   MemoryMappedFile mmapped_file(*filesystem_, file_path_,
768                                 MemoryMappedFile::Strategy::READ_ONLY);
769   if (file_offset >= file_size) {
770     // file_size points to the next byte to write at, so subtract one to get
771     // the inclusive, actual size of file.
772     return absl_ports::OutOfRangeError(
773         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
774                                       "out of range of the file size, %lld",
775                                       static_cast<long long>(file_offset),
776                                       static_cast<long long>(file_size - 1)));
777   }
778 
779   // Read out the metadata
780   ICING_ASSIGN_OR_RETURN(
781       int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
782 
783   // Copy out however many bytes it says the proto is
784   int stored_size = GetProtoSize(metadata);
785 
786   ICING_RETURN_IF_ERROR(
787       mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
788 
789   if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
790     return absl_ports::NotFoundError("The proto data has been erased.");
791   }
792 
793   google::protobuf::io::ArrayInputStream proto_stream(
794       mmapped_file.mutable_region(), stored_size);
795 
796   // Deserialize proto
797   ProtoT proto;
798   if (header_->compress) {
799     google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
800     proto.ParseFromZeroCopyStream(&decompress_stream);
801   } else {
802     proto.ParseFromZeroCopyStream(&proto_stream);
803   }
804 
805   return proto;
806 }
807 
808 template <typename ProtoT>
EraseProto(int64_t file_offset)809 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
810     int64_t file_offset) {
811   int64_t file_size = filesystem_->GetFileSize(fd_.get());
812   if (file_offset >= file_size) {
813     // file_size points to the next byte to write at, so subtract one to get
814     // the inclusive, actual size of file.
815     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
816         "Trying to erase data at a location, %lld, "
817         "out of range of the file size, %lld",
818         static_cast<long long>(file_offset),
819         static_cast<long long>(file_size - 1)));
820   }
821 
822   MemoryMappedFile mmapped_file(
823       *filesystem_, file_path_,
824       MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
825 
826   // Read out the metadata
827   ICING_ASSIGN_OR_RETURN(
828       int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
829 
830   ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
831                                            GetProtoSize(metadata)));
832 
833   // We need to update the crc checksum if the erased area is before the
834   // rewind position.
835   if (file_offset + sizeof(metadata) < header_->rewind_offset) {
836     // We need to calculate [original string xor 0s].
837     // The xored string is the same as the original string because 0 xor 0 =
838     // 0, 1 xor 0 = 1.
839     const std::string_view xored_str(mmapped_file.region(),
840                                      mmapped_file.region_size());
841 
842     Crc32 crc(header_->log_checksum);
843     ICING_ASSIGN_OR_RETURN(
844         uint32_t new_crc,
845         crc.UpdateWithXor(
846             xored_str,
847             /*full_data_size=*/header_->rewind_offset - sizeof(Header),
848             /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
849 
850     header_->log_checksum = new_crc;
851     header_->header_checksum = header_->CalculateHeaderChecksum();
852 
853     if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
854                              sizeof(Header))) {
855       return absl_ports::InternalError(
856           absl_ports::StrCat("Failed to update header to: ", file_path_));
857     }
858   }
859 
860   memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
861   return libtextclassifier3::Status::OK;
862 }
863 
864 template <typename ProtoT>
GetDiskUsage()865 libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
866     const {
867   int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
868   if (size == Filesystem::kBadFileSize) {
869     return absl_ports::InternalError("Failed to get disk usage of proto log");
870   }
871   return size;
872 }
873 
874 template <typename ProtoT>
875 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()876 FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
877   int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
878   if (total_file_size == Filesystem::kBadFileSize) {
879     return absl_ports::InternalError(
880         "Failed to get file size of elments in the proto log");
881   }
882   return total_file_size - sizeof(Header);
883 }
884 
885 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset)886 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
887                                                const std::string& file_path,
888                                                int64_t initial_offset)
889     : mmapped_file_(filesystem, file_path,
890                     MemoryMappedFile::Strategy::READ_ONLY),
891       initial_offset_(initial_offset),
892       current_offset_(kInvalidOffset),
893       file_size_(filesystem.GetFileSize(file_path.c_str())) {
894   if (file_size_ == Filesystem::kBadFileSize) {
895     // Fails all Advance() calls
896     file_size_ = 0;
897   }
898 }
899 
900 template <typename ProtoT>
Advance()901 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
902   if (current_offset_ == kInvalidOffset) {
903     // First Advance() call
904     current_offset_ = initial_offset_;
905   } else {
906     // Jumps to the next proto position
907     ICING_ASSIGN_OR_RETURN(
908         int metadata,
909         ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
910     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
911   }
912 
913   if (current_offset_ < file_size_) {
914     return libtextclassifier3::Status::OK;
915   } else {
916     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
917         "The next proto offset, %lld, is out of file range [0, %lld)",
918         static_cast<long long>(current_offset_),
919         static_cast<long long>(file_size_)));
920   }
921 }
922 
923 template <typename ProtoT>
GetOffset()924 int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
925   return current_offset_;
926 }
927 
928 template <typename ProtoT>
929 typename FileBackedProtoLog<ProtoT>::Iterator
GetIterator()930 FileBackedProtoLog<ProtoT>::GetIterator() {
931   return Iterator(*filesystem_, file_path_,
932                   /*initial_offset=*/sizeof(Header));
933 }
934 
935 template <typename ProtoT>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)936 libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
937     MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
938   // Checks file_offset
939   if (file_offset >= file_size) {
940     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
941         "offset, %lld, is out of file range [0, %lld)",
942         static_cast<long long>(file_offset),
943         static_cast<long long>(file_size)));
944   }
945   int metadata;
946   int metadata_size = sizeof(metadata);
947   if (file_offset + metadata_size >= file_size) {
948     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
949         "Wrong metadata offset %lld, metadata doesn't fit in "
950         "with file range [0, %lld)",
951         static_cast<long long>(file_offset),
952         static_cast<long long>(file_size)));
953   }
954   // Reads metadata
955   ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
956   memcpy(&metadata, mmapped_file->region(), metadata_size);
957   // Checks magic number
958   uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
959   if (stored_k_proto_magic != kProtoMagic) {
960     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
961         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
962         stored_k_proto_magic));
963   }
964   return metadata;
965 }
966 
967 template <typename ProtoT>
PersistToDisk()968 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
969   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
970   if (file_size == header_->rewind_offset) {
971     // No new protos appended, don't need to update the checksum.
972     return libtextclassifier3::Status::OK;
973   }
974 
975   int64_t new_content_size = file_size - header_->rewind_offset;
976   Crc32 crc;
977   if (new_content_size < 0) {
978     // File shrunk, recalculate the entire checksum.
979     ICING_ASSIGN_OR_RETURN(
980         crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
981                              file_size));
982   } else {
983     // Append new changes to the existing checksum.
984     ICING_ASSIGN_OR_RETURN(
985         crc,
986         ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
987                         header_->rewind_offset, file_size));
988   }
989 
990   header_->log_checksum = crc.Get();
991   header_->rewind_offset = file_size;
992   header_->header_checksum = header_->CalculateHeaderChecksum();
993 
994   if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
995                            sizeof(Header)) ||
996       !filesystem_->DataSync(fd_.get())) {
997     return absl_ports::InternalError(
998         absl_ports::StrCat("Failed to update header to: ", file_path_));
999   }
1000 
1001   return libtextclassifier3::Status::OK;
1002 }
1003 
1004 template <typename ProtoT>
1005 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum()1006 FileBackedProtoLog<ProtoT>::ComputeChecksum() {
1007   return FileBackedProtoLog<ProtoT>::ComputeChecksum(
1008       filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
1009       /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
1010 }
1011 
1012 }  // namespace lib
1013 }  // namespace icing
1014 
1015 #endif  // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
1016