1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a PortableFileBackedProtoLog of the same
18 // file at a time; using multiple instances at the same time may lead to
19 // undefined behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
27 // Each proto written to the file will have a metadata written just before it.
28 // The metadata consists of
29 // {
//   1 byte of kProtoMagic;
31 // 3 bytes of the proto size
32 // n bytes of the proto itself
33 // }
34 //
35 // All metadata is written in a portable format, encoded with htonl before
36 // writing to file and decoded with ntohl when reading from file.
37 //
38 // Example usage:
39 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
40 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
41 // file_path_,
42 // options));
//   auto proto_log = std::move(create_result.proto_log);
44 //
45 // Document document;
46 // document.set_namespace("com.google.android.example");
47 // document.set_uri("www.google.com");
48 //
//   ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                              proto_log->WriteProto(document));
//   ICING_ASSERT_OK_AND_ASSIGN(Document same_document,
//                              proto_log->ReadProto(document_offset));
51 // proto_log->PersistToDisk();
52
53 #ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
54 #define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
55
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
64
65 #include "icing/text_classifier/lib3/utils/base/status.h"
66 #include "icing/text_classifier/lib3/utils/base/statusor.h"
67 #include "icing/absl_ports/canonical_errors.h"
68 #include "icing/absl_ports/str_cat.h"
69 #include "icing/file/filesystem.h"
70 #include "icing/file/memory-mapped-file.h"
71 #include "icing/legacy/core/icing-string-util.h"
72 #include "icing/portable/endian.h"
73 #include "icing/portable/gzip_stream.h"
74 #include "icing/portable/platform.h"
75 #include "icing/portable/zlib.h"
76 #include "icing/util/bit-util.h"
77 #include "icing/util/crc32.h"
78 #include "icing/util/data-loss.h"
79 #include "icing/util/logging.h"
80 #include "icing/util/status-macros.h"
81 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
82
83 namespace icing {
84 namespace lib {
85
86 template <typename ProtoT>
87 class PortableFileBackedProtoLog {
88 public:
89 struct Options {
90 // Whether to compress each proto before writing to the proto log.
91 bool compress;
92
93 // Byte-size limit for each proto written to the store. This does not
94 // include the bytes needed for the metadata of each proto.
95 //
96 // NOTE: Currently, we only support protos up to 16MiB. We store the proto
97 // size in 3 bytes within the metadata.
98 //
99 // NOTE: This limit is only enforced for future writes. If the store
100 // previously had a higher limit, then reading older entries could return
101 // larger protos.
102 //
103 // NOTE: The max_proto_size is the upper limit for input protos into the
104 // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
105 // to a smaller size, ProtoLog will not accept it. Protos that result in a
106 // compressed size larger than max_proto_size are also not accepted.
107 const int32_t max_proto_size;
108
109 // Level of compression if enabled, NO_COMPRESSION = 0, BEST_SPEED = 1,
110 // BEST_COMPRESSION = 9
111 const int32_t compression_level;
112
113 // Must specify values for options.
114 Options() = delete;
115 explicit Options(
116 bool compress_in, const int32_t max_proto_size_in = kMaxProtoSize,
117 const int32_t compression_level_in = kDeflateCompressionLevel)
compressOptions118 : compress(compress_in),
119 max_proto_size(max_proto_size_in),
120 compression_level(compression_level_in) {}
121 };
122
123 // Our internal max for protos.
124 //
// WARNING: Changing this to a larger number may invalidate our assumption
// that the proto size can safely be stored in the last 3 bytes of the proto
// header.
128 static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB
129 static_assert(kMaxProtoSize <= 0x00FFFFFF,
130 "kMaxProtoSize doesn't fit in 3 bytes");
131
132 // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
133 static constexpr int kDeflateCompressionLevel = 3;
134
135 // Number of bytes we reserve for the heading at the beginning of the proto
136 // log. We reserve this so the header can grow without running into the
137 // contents of the proto log, triggering an unnecessary migration of the data.
138 static constexpr int kHeaderReservedBytes = 256;
139
140 // Header stored at the beginning of the file before the rest of the log
141 // contents. Stores metadata on the log.
142 class Header {
143 public:
144 static constexpr int32_t kMagic = 0xf4c6f67a;
145
146 // We should go directly from 0 to 2 the next time we have to change the
147 // format.
148 static constexpr int32_t kFileFormatVersion = 0;
149
CalculateHeaderChecksum()150 uint32_t CalculateHeaderChecksum() const {
151 Crc32 crc;
152
153 // Get a string_view of all the fields of the Header, excluding the
154 // magic_nbytes_ and header_checksum_nbytes_
155 std::string_view header_str(
156 reinterpret_cast<const char*>(this) +
157 offsetof(Header, header_checksum_nbytes_) +
158 sizeof(header_checksum_nbytes_),
159 sizeof(Header) - sizeof(magic_nbytes_) -
160 sizeof(header_checksum_nbytes_));
161 crc.Append(header_str);
162 return crc.Get();
163 }
164
GetMagic()165 int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
166
SetMagic(int32_t magic_in)167 void SetMagic(int32_t magic_in) {
168 magic_nbytes_ = GHostToNetworkL(magic_in);
169 }
170
GetFileFormatVersion()171 int32_t GetFileFormatVersion() const {
172 return GNetworkToHostL(file_format_version_nbytes_);
173 }
174
SetFileFormatVersion(int32_t file_format_version_in)175 void SetFileFormatVersion(int32_t file_format_version_in) {
176 file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
177 }
178
GetMaxProtoSize()179 int32_t GetMaxProtoSize() const {
180 return GNetworkToHostL(max_proto_size_nbytes_);
181 }
182
SetMaxProtoSize(int32_t max_proto_size_in)183 void SetMaxProtoSize(int32_t max_proto_size_in) {
184 max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
185 }
186
GetLogChecksum()187 int32_t GetLogChecksum() const {
188 return GNetworkToHostL(log_checksum_nbytes_);
189 }
190
SetLogChecksum(int32_t log_checksum_in)191 void SetLogChecksum(int32_t log_checksum_in) {
192 log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
193 }
194
GetRewindOffset()195 int64_t GetRewindOffset() const {
196 return GNetworkToHostLL(rewind_offset_nbytes_);
197 }
198
SetRewindOffset(int64_t rewind_offset_in)199 void SetRewindOffset(int64_t rewind_offset_in) {
200 rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
201 }
202
GetHeaderChecksum()203 int32_t GetHeaderChecksum() const {
204 return GNetworkToHostL(header_checksum_nbytes_);
205 }
206
SetHeaderChecksum(int32_t header_checksum_in)207 void SetHeaderChecksum(int32_t header_checksum_in) {
208 header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
209 }
210
GetCompressFlag()211 bool GetCompressFlag() const { return GetFlag(kCompressBit); }
212
SetCompressFlag(bool compress)213 void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
214
GetDirtyFlag()215 bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
216
SetDirtyFlag(bool dirty)217 void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
218
219 private:
220 // The least-significant bit offset at which the compress flag is stored in
221 // 'flags_nbytes_'. Represents whether the protos in the log are compressed
222 // or not.
223 static constexpr int32_t kCompressBit = 0;
224
225 // The least-significant bit offset at which the dirty flag is stored in
226 // 'flags'. Represents whether the checksummed portion of the log has been
227 // modified after the last checksum was computed.
228 static constexpr int32_t kDirtyBit = 1;
229
GetFlag(int offset)230 bool GetFlag(int offset) const {
231 return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
232 }
233
SetFlag(int offset,bool value)234 void SetFlag(int offset, bool value) {
235 bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
236 }
237
238 // Holds the magic as a quick sanity check against file corruption.
239 //
240 // Field is in network-byte order.
241 int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
242
243 // Must be at the beginning after kMagic. Contains the crc checksum of
244 // the following fields.
245 //
246 // Field is in network-byte order.
247 uint32_t header_checksum_nbytes_ = 0;
248
249 // Last known good offset at which the log and its checksum were updated.
250 // If we crash between writing to the log and updating the checksum, we can
251 // try to rewind the log to this offset and verify the checksum is still
252 // valid instead of throwing away the entire log.
253 //
254 // Field is in network-byte order.
255 int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
256
257 // Version number tracking how we serialize the file to disk. If we change
258 // how/what we write to disk, this version should be updated and this class
259 // should handle a migration.
260 //
261 // Currently at kFileFormatVersion.
262 //
263 // Field is in network-byte order.
264 int32_t file_format_version_nbytes_ = 0;
265
266 // The maximum proto size that can be written to the log.
267 //
268 // Field is in network-byte order.
269 int32_t max_proto_size_nbytes_ = 0;
270
271 // Checksum of the log elements, doesn't include the header fields.
272 //
273 // Field is in network-byte order.
274 uint32_t log_checksum_nbytes_ = 0;
275
276 // Bits are used to hold various flags.
277 // Lowest bit is whether the protos are compressed or not.
278 //
279 // Field is only 1 byte, so is byte-order agnostic.
280 uint8_t flags_ = 0;
281
282 // NOTE: New fields should *almost always* be added to the end here. Since
283 // this class may have already been written to disk, appending fields
284 // increases the chances that changes are backwards-compatible.
285 };
286 static_assert(sizeof(Header) <= kHeaderReservedBytes,
287 "Header has grown past our reserved bytes!");
288
289 struct CreateResult {
290 // A successfully initialized log.
291 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
292
293 // The data status after initializing from a previous state. Data loss can
294 // happen if the file is corrupted or some previously added data was
295 // unpersisted. This may be used to signal that any derived data off of the
296 // proto log may need to be regenerated.
297 DataLoss data_loss = DataLoss::NONE;
298
299 // Whether the proto log had to recalculate the checksum to check its
300 // integrity. This can be avoided if no changes were made or the log was
301 // able to update its checksum before shutting down. But it may have to
302 // recalculate if it's unclear if we crashed after updating the log, but
303 // before updating our checksum.
304 bool recalculated_checksum = false;
305
has_data_lossCreateResult306 bool has_data_loss() const {
307 return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
308 }
309 };
310
311 // Factory method to create, initialize, and return a
312 // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
313 //
314 // If on re-initialization the log detects disk corruption or some previously
315 // added data was unpersisted, the log will rewind to the last-good state. The
316 // log saves these checkpointed "good" states when PersistToDisk() is called
317 // or the log is safely destructed. If the log rewinds successfully to the
318 // last-good state, then the returned CreateResult.data_loss indicates
319 // whether it has a data loss and what kind of data loss it is (partial or
320 // complete) so that any derived data may know that it needs to be updated. If
321 // the log re-initializes successfully without any data loss,
322 // CreateResult.data_loss will be NONE.
323 //
324 // Params:
325 // filesystem: Handles system level calls
326 // file_path: Path of the underlying file. Directory of the file should
327 // already exist
328 // options: Configuration options for the proto log
329 //
330 // Returns:
331 // PortableFileBackedProtoLog::CreateResult on success
332 // INVALID_ARGUMENT on an invalid option
333 // INTERNAL_ERROR on IO error
334 static libtextclassifier3::StatusOr<CreateResult> Create(
335 const Filesystem* filesystem, const std::string& file_path,
336 const Options& options);
337
338 // Not copyable
339 PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
340 PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
341 delete;
342
343 // This will update the checksum of the log as well.
344 ~PortableFileBackedProtoLog();
345
346 // Writes the serialized proto to the underlying file. Writes are applied
347 // directly to the underlying file. Users do not need to sync the file after
348 // writing.
349 //
350 // Returns:
351 // Offset of the newly appended proto in file on success
352 // INVALID_ARGUMENT if proto is too large, as decided by
353 // Options.max_proto_size
354 // INTERNAL_ERROR on IO error
355 libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
356
357 // Reads out a proto located at file_offset from the file.
358 //
359 // Returns:
360 // A proto on success
361 // NOT_FOUND if the proto at the given offset has been erased
362 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
363 // INTERNAL_ERROR on IO error
364 libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
365
366 // Erases the data of a proto located at file_offset from the file.
367 //
368 // Returns:
369 // OK on success
370 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
371 // INTERNAL_ERROR on IO error
372 libtextclassifier3::Status EraseProto(int64_t file_offset);
373
374 // Calculates and returns the disk usage in bytes. Rounds up to the nearest
375 // block size.
376 //
377 // Returns:
378 // Disk usage on success
379 // INTERNAL_ERROR on IO error
380 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
381
382 // Returns the file size of all the elements held in the log. File size is in
383 // bytes. This excludes the size of any internal metadata of the log, e.g. the
384 // log's header.
385 //
386 // Returns:
387 // File size on success
388 // INTERNAL_ERROR on IO error
389 libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
390
391 // An iterator helping to find offsets of all the protos in file.
392 // Example usage:
393 //
394 // while (iterator.Advance().ok()) {
395 // int64_t offset = iterator.GetOffset();
396 // // Do something
397 // }
398 class Iterator {
399 public:
400 Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
401
402 // Advances to the position of next proto whether it has been erased or not.
403 //
404 // Returns:
405 // OK on success
406 // OUT_OF_RANGE_ERROR if it reaches the end
407 // INTERNAL_ERROR on IO error
408 libtextclassifier3::Status Advance();
409
410 // Returns the file offset of current proto.
411 int64_t GetOffset();
412
413 private:
414 static constexpr int64_t kInvalidOffset = -1;
415 // Used to read proto metadata
416 // Offset of first proto
417 const Filesystem* const filesystem_;
418 int64_t initial_offset_;
419 int64_t current_offset_;
420 int64_t file_size_;
421 int fd_;
422 };
423
424 // Returns an iterator of current proto log. The caller needs to keep the
425 // proto log unchanged while using the iterator, otherwise unexpected
426 // behaviors could happen.
427 Iterator GetIterator();
428
429 // Persists all changes since initialization or the last call to
430 // PersistToDisk(). Any changes that aren't persisted may be lost if the
431 // system fails to close safely.
432 //
433 // Example use case:
434 //
435 // Document document;
436 // document.set_namespace("com.google.android.example");
437 // document.set_uri("www.google.com");
438 //
439 // {
440 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
441 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
442 // file_path,
443 // options));
444 // auto proto_log = std::move(create_result.proto_log);
445 //
//     ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                                proto_log->WriteProto(document));
447 //
448 // // We lose the document here since it wasn't persisted.
449 // // *SYSTEM CRASH*
450 // }
451 //
452 // {
453 // // Can still successfully create after a crash since the log can
454 // // rewind/truncate to recover into a previously good state
455 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
456 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
457 // file_path,
458 // options));
459 // auto proto_log = std::move(create_result.proto_log);
460 //
461 // // Lost the proto since we didn't PersistToDisk before the crash
//     proto_log->ReadProto(document_offset); // INVALID_ARGUMENT error
//
//     ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                                proto_log->WriteProto(document));
465 //
466 // // Persisted this time, so we should be ok.
467 // ICING_ASSERT_OK(proto_log->PersistToDisk());
468 // }
469 //
470 // {
471 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
472 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
473 // file_path,
474 // options));
475 // auto proto_log = std::move(create_result.proto_log);
476 //
477 // // SUCCESS
//     ICING_ASSERT_OK_AND_ASSIGN(Document same_document,
//                                proto_log->ReadProto(document_offset));
479 // }
480 //
481 // NOTE: Since all protos are already written to the file directly, this
482 // just updates the checksum and rewind position. Without these updates,
483 // future initializations will truncate the file and discard unpersisted
484 // changes.
485 //
486 // Returns:
487 // OK on success
488 // INTERNAL_ERROR on IO error
489 libtextclassifier3::Status PersistToDisk();
490
491 // Calculates the checksum of the log contents. Excludes the header content.
492 //
493 // Returns:
494 // Crc of the log content
495 // INTERNAL_ERROR on IO error
496 libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
497
498 private:
499 // Object can only be instantiated via the ::Create factory.
500 PortableFileBackedProtoLog(const Filesystem* filesystem,
501 const std::string& file_path,
502 std::unique_ptr<Header> header,
503 int32_t compression_level);
504
505 // Initializes a new proto log.
506 //
507 // Returns:
//   CreateResult on success
509 // INTERNAL_ERROR on IO error
510 static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
511 const Filesystem* filesystem, const std::string& file_path,
512 const Options& options);
513
514 // Verifies that the existing proto log is in a good state. If not in a good
515 // state, then the proto log may be truncated to the last good state and
516 // content will be lost.
517 //
518 // Returns:
//   CreateResult on success
520 // INTERNAL_ERROR on IO error or internal inconsistencies in the file
521 // INVALID_ARGUMENT_ERROR if options aren't consistent with previous
522 // instances
523 static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
524 const Filesystem* filesystem, const std::string& file_path,
525 const Options& options, int64_t file_size);
526
527 // Takes an initial checksum and updates it with the content between `start`
528 // and `end` offsets in the file.
529 //
530 // Returns:
531 // Crc of the content between `start`, inclusive, and `end`, exclusive.
532 // INTERNAL_ERROR on IO error
533 // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
534 static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
535 const Filesystem* filesystem, const std::string& file_path,
536 Crc32 initial_crc, int64_t start, int64_t end);
537
538 // Reads out the metadata of a proto located at file_offset from the fd.
539 // Metadata will be returned in host byte order endianness.
540 //
541 // Returns:
542 // Proto's metadata on success
543 // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
544 // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
545 static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
546 const Filesystem* const filesystem, int fd, int64_t file_offset,
547 int64_t file_size);
548
549 // Writes metadata of a proto to the fd. Takes in a host byte order endianness
550 // metadata and converts it into a portable metadata before writing.
551 //
552 // Returns:
553 // OK on success
554 // INTERNAL_ERROR on any IO errors
555 static libtextclassifier3::Status WriteProtoMetadata(
556 const Filesystem* filesystem, int fd, int32_t host_order_metadata);
557
// Reports whether the first `size` bytes of `buffer` are all zero.
//
// Params:
//   buffer: start of the bytes to inspect; not read when size <= 0
//   size: number of bytes to inspect
//
// Returns:
//   true if every inspected byte is 0, or if there is nothing to inspect
//   (size <= 0); false as soon as a non-zero byte is found
static bool IsEmptyBuffer(const char* buffer, int size) {
  // A negative size would hand std::all_of a reversed range, which is
  // undefined behavior. Treat it the same as an empty range.
  if (size <= 0) {
    return true;
  }
  return std::all_of(buffer, buffer + size,
                     [](const char byte) { return byte == 0; });
}
562
// Extracts the stored proto size from a host-order metadata word.
// Metadata format: 8 bits magic + 24 bits size
static int GetProtoSize(int metadata) {
  // The size occupies the low 24 bits; mask off the magic byte above it.
  constexpr int kSizeMask = (1 << 24) - 1;
  return metadata & kSizeMask;
}
566
// Extracts the stored proto magic from a host-order metadata word.
// Metadata format: 8 bits magic + 24 bits size
//
// Returns:
//   The most significant byte of `metadata`
static uint8_t GetProtoMagic(int metadata) {
  // Shift as unsigned: right-shifting a negative signed value is
  // implementation-defined before C++20, and corrupted metadata may well have
  // its top bit set.
  return static_cast<uint8_t>(static_cast<uint32_t>(metadata) >> 24);
}
570
571 // Magic number added in front of every proto. Used when reading out protos
572 // as a first check for corruption in each entry in the file. Even if there is
573 // a corruption, the best we can do is roll back to our last recovery point
574 // and throw away un-flushed data. We can discard/reuse this byte if needed so
575 // that we have 4 bytes to store the size of protos, and increase the size of
576 // protos we support.
577 static constexpr uint8_t kProtoMagic = 0x5C;
578
579 // Chunks of the file to mmap at a time, so we don't mmap the entire file.
580 // Only used on 32-bit devices
581 static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
582
583 ScopedFd fd_;
584 const Filesystem* const filesystem_;
585 const std::string file_path_;
586 std::unique_ptr<Header> header_;
587 const int32_t compression_level_;
588 };
589
590 template <typename ProtoT>
PortableFileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header,int32_t compression_level)591 PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
592 const Filesystem* filesystem, const std::string& file_path,
593 std::unique_ptr<Header> header, int32_t compression_level)
594 : filesystem_(filesystem),
595 file_path_(file_path),
596 header_(std::move(header)),
597 compression_level_(compression_level) {
598 fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
599 }
600
601 template <typename ProtoT>
~PortableFileBackedProtoLog()602 PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
603 if (!PersistToDisk().ok()) {
604 ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
605 "PortableFileBackedProtoLog: "
606 << file_path_;
607 }
608 }
609
610 template <typename ProtoT>
611 libtextclassifier3::StatusOr<
612 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)613 PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
614 const std::string& file_path,
615 const Options& options) {
616 if (options.max_proto_size <= 0) {
617 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
618 "options.max_proto_size must be greater than 0, was %d",
619 options.max_proto_size));
620 }
621
622 // Since we store the proto_size in 3 bytes, we can only support protos of up
623 // to 16MiB.
624 if (options.max_proto_size > kMaxProtoSize) {
625 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
626 "options.max_proto_size must be under 16MiB, was %d",
627 options.max_proto_size));
628 }
629
630 if (options.compression_level < 0 || options.compression_level > 9) {
631 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
632 "options.compression_level must be between 0 and 9 inclusive, was %d",
633 options.compression_level));
634 }
635
636 if (!filesystem->FileExists(file_path.c_str())) {
637 return InitializeNewFile(filesystem, file_path, options);
638 }
639
640 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
641 if (file_size == Filesystem::kBadFileSize) {
642 return absl_ports::InternalError(
643 absl_ports::StrCat("Bad file size '", file_path, "'"));
644 }
645
646 if (file_size == 0) {
647 return InitializeNewFile(filesystem, file_path, options);
648 }
649
650 return InitializeExistingFile(filesystem, file_path, options, file_size);
651 }
652
653 template <typename ProtoT>
654 libtextclassifier3::StatusOr<
655 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)656 PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
657 const Filesystem* filesystem, const std::string& file_path,
658 const Options& options) {
659 // Grow to the minimum reserved bytes for the header.
660 if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
661 return absl_ports::InternalError(
662 absl_ports::StrCat("Failed to initialize file size: ", file_path));
663 }
664
665 // Create the header
666 std::unique_ptr<Header> header = std::make_unique<Header>();
667 header->SetCompressFlag(options.compress);
668 header->SetMaxProtoSize(options.max_proto_size);
669 header->SetHeaderChecksum(header->CalculateHeaderChecksum());
670
671 if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
672 return absl_ports::InternalError(
673 absl_ports::StrCat("Failed to write header for file: ", file_path));
674 }
675
676 CreateResult create_result = {
677 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
678 new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
679 std::move(header),
680 options.compression_level)),
681 /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
682
683 return create_result;
684 }
685
686 template <typename ProtoT>
687 libtextclassifier3::StatusOr<
688 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)689 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
690 const Filesystem* filesystem, const std::string& file_path,
691 const Options& options, int64_t file_size) {
692 bool header_changed = false;
693 if (file_size < kHeaderReservedBytes) {
694 return absl_ports::InternalError(
695 absl_ports::StrCat("File header too short for: ", file_path));
696 }
697
698 std::unique_ptr<Header> header = std::make_unique<Header>();
699 if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
700 /*offset=*/0)) {
701 return absl_ports::InternalError(
702 absl_ports::StrCat("Failed to read header for file: ", file_path));
703 }
704
705 // Make sure the header is still valid before we use any of its values. This
706 // is covered by the header_checksum check below, but this is a quick check
707 // that can save us from an extra crc computation.
708 if (header->GetMagic() != Header::kMagic) {
709 return absl_ports::InternalError(
710 absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
711 }
712
713 if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
714 return absl_ports::InternalError(
715 absl_ports::StrCat("Invalid header checksum for: ", file_path));
716 }
717
718 if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
719 // If this changes, we might need to handle a migration rather than throwing
720 // an error.
721 return absl_ports::InternalError(
722 absl_ports::StrCat("Invalid header file format version: ", file_path));
723 }
724
725 if (header->GetCompressFlag() != options.compress) {
726 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
727 "Inconsistent compress option, expected %d, actual %d",
728 header->GetCompressFlag(), options.compress));
729 }
730
731 int32_t existing_max_proto_size = header->GetMaxProtoSize();
732 if (existing_max_proto_size > options.max_proto_size) {
733 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
734 "Max proto size cannot be smaller than previous "
735 "instantiations, previous size %d, wanted size %d",
736 header->GetMaxProtoSize(), options.max_proto_size));
737 } else if (existing_max_proto_size < options.max_proto_size) {
738 // It's fine if our new max size is greater than our previous one. Existing
739 // data is still valid.
740 header->SetMaxProtoSize(options.max_proto_size);
741 header_changed = true;
742 }
743
744 DataLoss data_loss = DataLoss::NONE;
745
746 // If we have any documents in our tail, get rid of them since they're not in
747 // our checksum. Our checksum reflects content up to the rewind offset.
748 if (file_size > header->GetRewindOffset()) {
749 if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
750 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
751 "Failed to truncate '%s' to size %lld", file_path.data(),
752 static_cast<long long>(header->GetRewindOffset())));
753 }
754 data_loss = DataLoss::PARTIAL;
755 }
756
757 bool recalculated_checksum = false;
758
759 // If our dirty flag is set, that means we might have crashed in the middle of
760 // erasing a proto. This could have happened anywhere between:
761 // A. Set dirty flag to true and update header checksum
762 // B. Erase the proto
763 // C. Set dirty flag to false, update log checksum, update header checksum
764 //
765 // Scenario 1: We went down between A and B. Maybe our dirty flag is a
766 // false alarm and we can keep all our data.
767 //
768 // Scenario 2: We went down between B and C. Our data is compromised and
769 // we need to throw everything out.
770 if (header->GetDirtyFlag()) {
771 // Recompute the log's checksum to detect which scenario we're in.
772 ICING_ASSIGN_OR_RETURN(
773 Crc32 calculated_log_checksum,
774 ComputeChecksum(filesystem, file_path, Crc32(),
775 /*start=*/kHeaderReservedBytes, /*end=*/file_size));
776
777 if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
778 // Still doesn't match, we're in Scenario 2. Throw out all our data now
779 // and initialize as a new instance.
780 ICING_ASSIGN_OR_RETURN(CreateResult create_result,
781 InitializeNewFile(filesystem, file_path, options));
782 create_result.data_loss = DataLoss::COMPLETE;
783 create_result.recalculated_checksum = true;
784 return create_result;
785 }
786 // Otherwise we're good, checksum matches our contents so continue
787 // initializing like normal.
788 recalculated_checksum = true;
789
790 // Update our header.
791 header->SetDirtyFlag(false);
792 header_changed = true;
793 }
794
795 if (header_changed) {
796 header->SetHeaderChecksum(header->CalculateHeaderChecksum());
797
798 if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
799 sizeof(Header))) {
800 return absl_ports::InternalError(
801 absl_ports::StrCat("Failed to update header to: ", file_path));
802 }
803 }
804
805 CreateResult create_result = {
806 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
807 new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
808 std::move(header),
809 options.compression_level)),
810 data_loss, recalculated_checksum};
811
812 return create_result;
813 }
814
815 template <typename ProtoT>
816 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)817 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
818 const Filesystem* filesystem, const std::string& file_path,
819 Crc32 initial_crc, int64_t start, int64_t end) {
820 ICING_ASSIGN_OR_RETURN(
821 MemoryMappedFile mmapped_file,
822 MemoryMappedFile::Create(*filesystem, file_path,
823 MemoryMappedFile::Strategy::READ_ONLY));
824 Crc32 new_crc(initial_crc.Get());
825
826 if (start < 0) {
827 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
828 "Starting checksum offset of file '%s' must be greater than 0, was "
829 "%lld",
830 file_path.c_str(), static_cast<long long>(start)));
831 }
832
833 if (end < start) {
834 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
835 "Ending checksum offset of file '%s' must be greater than start "
836 "'%lld', was '%lld'",
837 file_path.c_str(), static_cast<long long>(start),
838 static_cast<long long>(end)));
839 }
840
841 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
842 if (end > file_size) {
843 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
844 "Ending checksum offset of file '%s' must be within "
845 "file size of %lld, was %lld",
846 file_path.c_str(), static_cast<long long>(file_size),
847 static_cast<long long>(end)));
848 }
849
850 Architecture architecture = GetArchitecture();
851 switch (architecture) {
852 case Architecture::BIT_64: {
853 // Don't mmap in chunks here since mmapping can be harmful on 64-bit
854 // devices where mmap/munmap calls need the mmap write semaphore, which
855 // blocks mmap/munmap/mprotect and all page faults from executing while
856 // they run. On 64-bit devices, this doesn't actually load into memory, it
857 // just makes the file faultable. So the whole file should be ok.
858 // b/185822878.
859 ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
860 auto mmap_str = std::string_view(mmapped_file.region(), end - start);
861 new_crc.Append(mmap_str);
862 break;
863 }
864 case Architecture::BIT_32:
865 [[fallthrough]];
866 case Architecture::UNKNOWN: {
867 // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
868 // much memory at once. If we're unknown, then also chunk it because we're
869 // not sure what the device can handle.
870 for (int i = start; i < end; i += kMmapChunkSize) {
871 // Don't read past the file size.
872 int next_chunk_size = kMmapChunkSize;
873 if ((i + kMmapChunkSize) >= end) {
874 next_chunk_size = end - i;
875 }
876
877 ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
878
879 auto mmap_str =
880 std::string_view(mmapped_file.region(), next_chunk_size);
881 new_crc.Append(mmap_str);
882 }
883 break;
884 }
885 }
886
887 return new_crc;
888 }
889
890 template <typename ProtoT>
891 libtextclassifier3::StatusOr<int64_t>
WriteProto(const ProtoT & proto)892 PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
893 int64_t proto_size = proto.ByteSizeLong();
894 int32_t host_order_metadata;
895 int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
896
897 if (proto_size > header_->GetMaxProtoSize()) {
898 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
899 "proto_size, %lld, was too large to write. Max is %d",
900 static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
901 }
902
903 // At this point, we've guaranteed that proto_size is under kMaxProtoSize
904 // (see
905 // ::Create), so we can safely store it in an int.
906 int final_size = 0;
907
908 std::string proto_str;
909 google::protobuf::io::StringOutputStream proto_stream(&proto_str);
910
911 if (header_->GetCompressFlag()) {
912 protobuf_ports::GzipOutputStream::Options options;
913 options.format = protobuf_ports::GzipOutputStream::ZLIB;
914 options.compression_level = compression_level_;
915
916 protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
917
918 bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
919 compressing_stream.Close();
920
921 if (!success) {
922 return absl_ports::InternalError("Error compressing proto.");
923 }
924
925 final_size = proto_str.size();
926
927 // In case the compressed proto is larger than the original proto, we also
928 // can't write it.
929 if (final_size > header_->GetMaxProtoSize()) {
930 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
931 "Compressed proto size, %d, was greater than "
932 "max_proto_size, %d",
933 final_size, header_->GetMaxProtoSize()));
934 }
935 } else {
936 // Serialize the proto directly into the write buffer at an offset of the
937 // metadata.
938 proto.SerializeToZeroCopyStream(&proto_stream);
939 final_size = proto_str.size();
940 }
941
942 // 1st byte for magic, next 3 bytes for proto size.
943 host_order_metadata = (kProtoMagic << 24) | final_size;
944
945 // Actually write metadata, has to be done after we know the possibly
946 // compressed proto size
947 ICING_RETURN_IF_ERROR(
948 WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
949
950 // Write the serialized proto
951 if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
952 return absl_ports::InternalError(
953 absl_ports::StrCat("Failed to write proto to: ", file_path_));
954 }
955
956 return current_position;
957 }
958
959 template <typename ProtoT>
960 libtextclassifier3::StatusOr<ProtoT>
ReadProto(int64_t file_offset)961 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
962 int64_t file_size = filesystem_->GetFileSize(fd_.get());
963 // Read out the metadata
964 if (file_size == Filesystem::kBadFileSize) {
965 return absl_ports::OutOfRangeError("Unable to correctly read size.");
966 }
967 ICING_ASSIGN_OR_RETURN(
968 int32_t metadata,
969 ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
970
971 // Copy out however many bytes it says the proto is
972 int stored_size = GetProtoSize(metadata);
973 file_offset += sizeof(metadata);
974
975 // Read the compressed proto out.
976 if (file_offset + stored_size > file_size) {
977 return absl_ports::OutOfRangeError(
978 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
979 "out of range of the file size, %lld",
980 static_cast<long long>(file_offset),
981 static_cast<long long>(file_size - 1)));
982 }
983 auto buf = std::make_unique<char[]>(stored_size);
984 if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
985 return absl_ports::InternalError("");
986 }
987
988 if (IsEmptyBuffer(buf.get(), stored_size)) {
989 return absl_ports::NotFoundError("The proto data has been erased.");
990 }
991
992 google::protobuf::io::ArrayInputStream proto_stream(buf.get(), stored_size);
993
994 // Deserialize proto
995 ProtoT proto;
996 if (header_->GetCompressFlag()) {
997 protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
998 proto.ParseFromZeroCopyStream(&decompress_stream);
999 } else {
1000 proto.ParseFromZeroCopyStream(&proto_stream);
1001 }
1002
1003 return proto;
1004 }
1005
1006 template <typename ProtoT>
EraseProto(int64_t file_offset)1007 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
1008 int64_t file_offset) {
1009 int64_t file_size = filesystem_->GetFileSize(fd_.get());
1010 if (file_size == Filesystem::kBadFileSize) {
1011 return absl_ports::OutOfRangeError("Unable to correctly read size.");
1012 }
1013
1014 ICING_ASSIGN_OR_RETURN(
1015 int32_t metadata,
1016 ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
1017 // Copy out however many bytes it says the proto is
1018 int stored_size = GetProtoSize(metadata);
1019 file_offset += sizeof(metadata);
1020 if (file_offset + stored_size > file_size) {
1021 return absl_ports::OutOfRangeError(
1022 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
1023 "out of range of the file size, %lld",
1024 static_cast<long long>(file_offset),
1025 static_cast<long long>(file_size - 1)));
1026 }
1027 auto buf = std::make_unique<char[]>(stored_size);
1028
1029 // We need to update the crc checksum if the erased area is before the
1030 // rewind position.
1031 int32_t new_crc;
1032 if (file_offset < header_->GetRewindOffset()) {
1033 // Set to "dirty" before we start writing anything.
1034 header_->SetDirtyFlag(true);
1035 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1036 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1037 sizeof(Header))) {
1038 return absl_ports::InternalError(absl_ports::StrCat(
1039 "Failed to update dirty bit of header to: ", file_path_));
1040 }
1041
1042 // We need to calculate [original string xor 0s].
1043 // The xored string is the same as the original string because 0 xor 0 =
1044 // 0, 1 xor 0 = 1.
1045 // Read the compressed proto out.
1046 if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
1047 return absl_ports::InternalError("");
1048 }
1049 const std::string_view xored_str(buf.get(), stored_size);
1050
1051 Crc32 crc(header_->GetLogChecksum());
1052 ICING_ASSIGN_OR_RETURN(
1053 new_crc,
1054 crc.UpdateWithXor(xored_str,
1055 /*full_data_size=*/header_->GetRewindOffset() -
1056 kHeaderReservedBytes,
1057 /*position=*/file_offset - kHeaderReservedBytes));
1058 }
1059
1060 // Clear the region.
1061 memset(buf.get(), '\0', stored_size);
1062 if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
1063 return absl_ports::InternalError("");
1064 }
1065
1066 // If we cleared something in our checksummed area, we should update our
1067 // checksum and reset our dirty bit.
1068 if (file_offset < header_->GetRewindOffset()) {
1069 header_->SetDirtyFlag(false);
1070 header_->SetLogChecksum(new_crc);
1071 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1072
1073 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1074 sizeof(Header))) {
1075 return absl_ports::InternalError(
1076 absl_ports::StrCat("Failed to update header to: ", file_path_));
1077 }
1078 }
1079
1080 return libtextclassifier3::Status::OK;
1081 }
1082
1083 template <typename ProtoT>
1084 libtextclassifier3::StatusOr<int64_t>
GetDiskUsage()1085 PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
1086 int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1087 if (size == Filesystem::kBadFileSize) {
1088 return absl_ports::InternalError("Failed to get disk usage of proto log");
1089 }
1090 return size;
1091 }
1092
1093 template <typename ProtoT>
1094 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()1095 PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
1096 int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
1097 if (total_file_size == Filesystem::kBadFileSize) {
1098 return absl_ports::InternalError(
1099 "Failed to get file size of elments in the proto log");
1100 }
1101 return total_file_size - kHeaderReservedBytes;
1102 }
1103
1104 template <typename ProtoT>
Iterator(const Filesystem & filesystem,int fd,int64_t initial_offset)1105 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
1106 const Filesystem& filesystem, int fd, int64_t initial_offset)
1107 : filesystem_(&filesystem),
1108 initial_offset_(initial_offset),
1109 current_offset_(kInvalidOffset),
1110 fd_(fd) {
1111 file_size_ = filesystem_->GetFileSize(fd_);
1112 if (file_size_ == Filesystem::kBadFileSize) {
1113 // Fails all Advance() calls
1114 file_size_ = 0;
1115 }
1116 }
1117
1118 template <typename ProtoT>
1119 libtextclassifier3::Status
Advance()1120 PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
1121 if (current_offset_ == kInvalidOffset) {
1122 // First Advance() call
1123 current_offset_ = initial_offset_;
1124 } else {
1125 // Jumps to the next proto position
1126 ICING_ASSIGN_OR_RETURN(
1127 int32_t metadata,
1128 ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
1129 current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
1130 }
1131
1132 if (current_offset_ < file_size_) {
1133 return libtextclassifier3::Status::OK;
1134 } else {
1135 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1136 "The next proto offset, %lld, is out of file range [0, %lld)",
1137 static_cast<long long>(current_offset_),
1138 static_cast<long long>(file_size_)));
1139 }
1140 }
1141
1142 template <typename ProtoT>
GetOffset()1143 int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
1144 return current_offset_;
1145 }
1146
1147 template <typename ProtoT>
1148 typename PortableFileBackedProtoLog<ProtoT>::Iterator
GetIterator()1149 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
1150 return Iterator(*filesystem_, fd_.get(),
1151 /*initial_offset=*/kHeaderReservedBytes);
1152 }
1153
1154 template <typename ProtoT>
1155 libtextclassifier3::StatusOr<int32_t>
ReadProtoMetadata(const Filesystem * const filesystem,int fd,int64_t file_offset,int64_t file_size)1156 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
1157 const Filesystem* const filesystem, int fd, int64_t file_offset,
1158 int64_t file_size) {
1159 // Checks file_offset
1160 if (file_offset >= file_size) {
1161 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1162 "offset, %lld, is out of file range [0, %lld)",
1163 static_cast<long long>(file_offset),
1164 static_cast<long long>(file_size)));
1165 }
1166 int32_t portable_metadata;
1167 int metadata_size = sizeof(portable_metadata);
1168 if (file_offset + metadata_size >= file_size) {
1169 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1170 "Wrong metadata offset %lld, metadata doesn't fit in "
1171 "with file range [0, %lld)",
1172 static_cast<long long>(file_offset),
1173 static_cast<long long>(file_size)));
1174 }
1175
1176 if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
1177 return absl_ports::InternalError("");
1178 }
1179
1180 // Need to switch it back to host order endianness after reading from disk.
1181 int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
1182
1183 // Checks magic number
1184 uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
1185 if (stored_k_proto_magic != kProtoMagic) {
1186 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1187 "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
1188 stored_k_proto_magic));
1189 }
1190
1191 return host_order_metadata;
1192 }
1193
1194 template <typename ProtoT>
1195 libtextclassifier3::Status
WriteProtoMetadata(const Filesystem * filesystem,int fd,int32_t host_order_metadata)1196 PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
1197 const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
1198 // Convert it into portable endian format before writing to disk
1199 int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
1200 int portable_metadata_size = sizeof(portable_metadata);
1201
1202 // Write metadata
1203 if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
1204 return absl_ports::InternalError(
1205 absl_ports::StrCat("Failed to write proto metadata."));
1206 }
1207
1208 return libtextclassifier3::Status::OK;
1209 }
1210
1211 template <typename ProtoT>
PersistToDisk()1212 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
1213 int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
1214 if (file_size == header_->GetRewindOffset()) {
1215 // No new protos appended, don't need to update the checksum.
1216 return libtextclassifier3::Status::OK;
1217 }
1218
1219 ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
1220
1221 header_->SetLogChecksum(crc.Get());
1222 header_->SetRewindOffset(file_size);
1223 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1224
1225 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1226 sizeof(Header)) ||
1227 !filesystem_->DataSync(fd_.get())) {
1228 return absl_ports::InternalError(
1229 absl_ports::StrCat("Failed to update header to: ", file_path_));
1230 }
1231
1232 return libtextclassifier3::Status::OK;
1233 }
1234
1235 template <typename ProtoT>
1236 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum()1237 PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
1238 int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
1239 int64_t new_content_size = file_size - header_->GetRewindOffset();
1240 Crc32 crc;
1241 if (new_content_size == 0) {
1242 // No new protos appended, return cached checksum
1243 return Crc32(header_->GetLogChecksum());
1244 } else if (new_content_size < 0) {
1245 // File shrunk, recalculate the entire checksum.
1246 ICING_ASSIGN_OR_RETURN(
1247 crc,
1248 ComputeChecksum(filesystem_, file_path_, Crc32(),
1249 /*start=*/kHeaderReservedBytes, /*end=*/file_size));
1250 } else {
1251 // Append new changes to the existing checksum.
1252 ICING_ASSIGN_OR_RETURN(
1253 crc, ComputeChecksum(
1254 filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
1255 /*start=*/header_->GetRewindOffset(), /*end=*/file_size));
1256 }
1257 return crc;
1258 }
1259
1260 } // namespace lib
1261 } // namespace icing
1262
1263 #endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
1264