1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a FileBackedProtoLog of the same file at
18 // a time; using multiple instances at the same time may lead to undefined
19 // behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
27 // Each proto written to the file will have a metadata written just before it.
28 // The metadata consists of
29 // {
// 1 byte of kProtoMagic;
31 // 3 bytes of the proto size
32 // n bytes of the proto itself
33 // }
34 //
35 // Example usage:
36 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
37 // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
38 // options));
// auto proto_log = std::move(create_result.proto_log);
40 //
41 // Document document;
42 // document.set_namespace("com.google.android.example");
43 // document.set_uri("www.google.com");
44 //
// int64_t document_offset = proto_log->WriteProto(document);
// Document same_document = proto_log->ReadProto(document_offset);
47 // proto_log->PersistToDisk();
48 //
49 // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
50 // migration method.
51
52 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
53 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
54
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
79
80 namespace icing {
81 namespace lib {
82
83 template <typename ProtoT>
84 class FileBackedProtoLog {
85 public:
86 struct Options {
87 // Whether to compress each proto before writing to the proto log.
88 bool compress;
89
90 // Byte-size limit for each proto written to the store. This does not
91 // include the bytes needed for the metadata of each proto.
92 //
93 // NOTE: Currently, we only support protos up to 16MiB. We store the proto
94 // size in 3 bytes within the metadata.
95 //
96 // NOTE: This limit is only enforced for future writes. If the store
97 // previously had a higher limit, then reading older entries could return
98 // larger protos.
99 //
100 // NOTE: The max_proto_size is the upper limit for input protos into the
101 // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
102 // to a smaller size, ProtoLog will not accept it. Protos that result in a
103 // compressed size larger than max_proto_size are also not accepted.
104 const int32_t max_proto_size;
105
106 // Must specify values for options.
107 Options() = delete;
108 explicit Options(bool compress_in,
109 const int32_t max_proto_size_in = kMaxProtoSize)
compressOptions110 : compress(compress_in), max_proto_size(max_proto_size_in) {}
111 };
112
113 // Header stored at the beginning of the file before the rest of the log
114 // contents. Stores metadata on the log.
115 //
116 // TODO(b/139375388): Migrate the Header struct to a proto. This makes
117 // migrations easier since we don't need to worry about different size padding
118 // (which would affect the checksum) and different endians.
119 struct Header {
120 static constexpr int32_t kMagic = 0xf4c6f67a;
121
122 // Holds the magic as a quick sanity check against file corruption.
123 int32_t magic = kMagic;
124
125 // Whether to compress the protos before writing to the log.
126 bool compress = true;
127
128 // The maximum proto size that can be written to the log.
129 int32_t max_proto_size = 0;
130
131 // Checksum of the log elements, doesn't include the header fields.
132 uint32_t log_checksum = 0;
133
134 // Last known good offset at which the log and its checksum were updated.
135 // If we crash between writing to the log and updating the checksum, we can
136 // try to rewind the log to this offset and verify the checksum is still
137 // valid instead of throwing away the entire log.
138 int64_t rewind_offset = sizeof(Header);
139
140 // Must be at the end. Contains the crc checksum of the preceding fields.
141 uint32_t header_checksum = 0;
142
CalculateHeaderChecksumHeader143 uint32_t CalculateHeaderChecksum() const {
144 Crc32 crc;
145 std::string_view header_str(reinterpret_cast<const char*>(this),
146 offsetof(Header, header_checksum));
147 crc.Append(header_str);
148 return crc.Get();
149 }
150 };
151
152 struct CreateResult {
153 // A successfully initialized log.
154 std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
155
156 // The data status after initializing from a previous state. Data loss can
157 // happen if the file is corrupted or some previously added data was
158 // unpersisted. This may be used to signal that any derived data off of the
159 // proto log may need to be regenerated.
160 DataLoss data_loss;
161
has_data_lossCreateResult162 bool has_data_loss() {
163 return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
164 }
165 };
166
167 // Factory method to create, initialize, and return a FileBackedProtoLog. Will
168 // create the file if it doesn't exist.
169 //
170 // If on re-initialization the log detects disk corruption or some previously
171 // added data was unpersisted, the log will rewind to the last-good state. The
172 // log saves these checkpointed "good" states when PersistToDisk() is called
173 // or the log is safely destructed. If the log rewinds successfully to the
174 // last-good state, then the returned CreateResult.data_loss indicates
175 // whether it has a data loss and what kind of data loss it is (partial or
176 // complete) so that any derived data may know that it needs to be updated. If
177 // the log re-initializes successfully without any data loss,
178 // CreateResult.data_loss will be NONE.
179 //
180 // Params:
181 // filesystem: Handles system level calls
182 // file_path: Path of the underlying file. Directory of the file should
183 // already exist
184 // options: Configuration options for the proto log
185 //
186 // Returns:
187 // FileBackedProtoLog::CreateResult on success
188 // INVALID_ARGUMENT on an invalid option
189 // INTERNAL_ERROR on IO error
190 static libtextclassifier3::StatusOr<CreateResult> Create(
191 const Filesystem* filesystem, const std::string& file_path,
192 const Options& options);
193
194 // Not copyable
195 FileBackedProtoLog(const FileBackedProtoLog&) = delete;
196 FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
197
198 // This will update the checksum of the log as well.
199 ~FileBackedProtoLog();
200
201 // Writes the serialized proto to the underlying file. Writes are applied
202 // directly to the underlying file. Users do not need to sync the file after
203 // writing.
204 //
205 // Returns:
206 // Offset of the newly appended proto in file on success
207 // INVALID_ARGUMENT if proto is too large, as decided by
208 // Options.max_proto_size
209 // INTERNAL_ERROR on IO error
210 libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
211
212 // Reads out a proto located at file_offset from the file.
213 //
214 // Returns:
215 // A proto on success
216 // NOT_FOUND if the proto at the given offset has been erased
217 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
218 // INTERNAL_ERROR on IO error
219 libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
220
221 // Erases the data of a proto located at file_offset from the file.
222 //
223 // Returns:
224 // OK on success
225 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
226 // INTERNAL_ERROR on IO error
227 libtextclassifier3::Status EraseProto(int64_t file_offset);
228
229 // Calculates and returns the disk usage in bytes. Rounds up to the nearest
230 // block size.
231 //
232 // Returns:
233 // Disk usage on success
234 // INTERNAL_ERROR on IO error
235 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
236
237 // Returns the file size of all the elements held in the log. File size is in
238 // bytes. This excludes the size of any internal metadata of the log, e.g. the
239 // log's header.
240 //
241 // Returns:
242 // File size on success
243 // INTERNAL_ERROR on IO error
244 libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
245
// An iterator helping to find offsets of all the protos in file. It walks the
// file through its own memory mapping of `file_path`, starting from
// `initial_offset`.
//
// Example usage:
//
// while (iterator.Advance().ok()) {
//   int64_t offset = iterator.GetOffset();
//   // Do something
// }
class Iterator {
 public:
  // Params:
  //   filesystem: Handles system level calls
  //   file_path: Path of the proto log file to iterate over
  //   initial_offset: File offset of the first proto in the log
  Iterator(const Filesystem& filesystem, const std::string& file_path,
           int64_t initial_offset);

  // Advances to the position of next proto whether it has been erased or not.
  //
  // Returns:
  //   OK on success
  //   OUT_OF_RANGE_ERROR if it reaches the end
  //   INTERNAL_ERROR on IO error
  libtextclassifier3::Status Advance();

  // Returns the file offset of current proto.
  // NOTE(review): the value returned before the first successful Advance() is
  // not visible from this declaration (presumably kInvalidOffset) — confirm
  // against the definition.
  int64_t GetOffset();

 private:
  // Sentinel for "no valid offset".
  static constexpr int64_t kInvalidOffset = -1;
  // Used to read proto metadata
  MemoryMappedFile mmapped_file_;
  // Offset of first proto
  int64_t initial_offset_;
  // Offset of the proto the iterator currently points at.
  int64_t current_offset_;
  // Size of the underlying file in bytes.
  // NOTE(review): presumably captured at construction time — confirm.
  int64_t file_size_;
};
278
279 // Returns an iterator of current proto log. The caller needs to keep the
280 // proto log unchanged while using the iterator, otherwise unexpected
281 // behaviors could happen.
282 Iterator GetIterator();
283
284 // Persists all changes since initialization or the last call to
285 // PersistToDisk(). Any changes that aren't persisted may be lost if the
286 // system fails to close safely.
287 //
288 // Example use case:
289 //
290 // Document document;
291 // document.set_namespace("com.google.android.example");
292 // document.set_uri("www.google.com");
293 //
294 // {
295 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
296 // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
297 // options));
298 // auto proto_log = std::move(create_result.proto_log);
299 //
// int64_t document_offset = proto_log->WriteProto(document);
301 //
302 // // We lose the document here since it wasn't persisted.
303 // // *SYSTEM CRASH*
304 // }
305 //
306 // {
307 // // Can still successfully create after a crash since the log can
308 // // rewind/truncate to recover into a previously good state
309 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
310 // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
311 // options));
312 // auto proto_log = std::move(create_result.proto_log);
313 //
314 // // Lost the proto since we didn't PersistToDisk before the crash
// proto_log->ReadProto(document_offset); // OUT_OF_RANGE error
//
// int64_t document_offset = proto_log->WriteProto(document);
318 //
319 // // Persisted this time, so we should be ok.
320 // ICING_ASSERT_OK(proto_log->PersistToDisk());
321 // }
322 //
323 // {
324 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
325 // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
326 // options));
327 // auto proto_log = std::move(create_result.proto_log);
328 //
329 // // SUCCESS
// Document same_document = proto_log->ReadProto(document_offset);
331 // }
332 //
333 // NOTE: Since all protos are already written to the file directly, this
334 // just updates the checksum and rewind position. Without these updates,
335 // future initializations will truncate the file and discard unpersisted
336 // changes.
337 //
338 // Returns:
339 // OK on success
340 // INTERNAL_ERROR on IO error
341 libtextclassifier3::Status PersistToDisk();
342
343 // Calculates the checksum of the log contents. Excludes the header content.
344 //
345 // Returns:
346 // Crc of the log content
347 // INTERNAL_ERROR on IO error
348 libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
349
350 private:
351 // Object can only be instantiated via the ::Create factory.
352 FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
353 std::unique_ptr<Header> header);
354
355 // Initializes a new proto log.
356 //
357 // Returns:
// CreateResult on success
359 // INTERNAL_ERROR on IO error
360 static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
361 const Filesystem* filesystem, const std::string& file_path,
362 const Options& options);
363
364 // Verifies that the existing proto log is in a good state. If not in a good
365 // state, then the proto log may be truncated to the last good state and
366 // content will be lost.
367 //
368 // Returns:
// CreateResult on success
370 // INTERNAL_ERROR on IO error or internal inconsistencies in the file
371 // INVALID_ARGUMENT_ERROR if options aren't consistent with previous
372 // instances
373 static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
374 const Filesystem* filesystem, const std::string& file_path,
375 const Options& options, int64_t file_size);
376
377 // Takes an initial checksum and updates it with the content between `start`
378 // and `end` offsets in the file.
379 //
380 // Returns:
381 // Crc of the content between `start`, inclusive, and `end`, exclusive.
382 // INTERNAL_ERROR on IO error
383 // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
384 static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
385 const Filesystem* filesystem, const std::string& file_path,
386 Crc32 initial_crc, int64_t start, int64_t end);
387
// Returns true if the first `size` bytes starting at `buffer` are all zero.
//
// An empty range is reported as empty (true): this covers `size == 0` as well
// as a null `buffer` or a negative `size`, which would otherwise form an
// invalid iterator range.
static bool IsEmptyBuffer(const char* buffer, int size) {
  if (buffer == nullptr || size <= 0) {
    return true;
  }
  return std::all_of(buffer, buffer + size,
                     [](const char byte) { return byte == 0; });
}
392
// Extracts the stored proto size from a proto's metadata word.
// Metadata layout: the top 8 bits hold the magic, the low 24 bits hold the
// size.
static int GetProtoSize(int metadata) {
  constexpr int kSizeMask = 0x00FFFFFF;
  return metadata & kSizeMask;
}
396
// Extracts the stored proto magic from a proto's metadata word.
// Metadata layout: the top 8 bits hold the magic, the low 24 bits hold the
// size.
//
// The shift is done on the unsigned representation so that metadata words
// with the top bit set (e.g. read from a corrupted file) are handled without
// relying on how a negative signed value is right-shifted.
static uint8_t GetProtoMagic(int metadata) {
  return static_cast<uint8_t>(static_cast<uint32_t>(metadata) >> 24);
}
400
401 // Reads out the metadata of a proto located at file_offset from the file.
402 //
403 // Returns:
404 // Proto's metadata on success
405 // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
406 // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
407 static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
408 MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
409
410 // Magic number added in front of every proto. Used when reading out protos
411 // as a first check for corruption in each entry in the file. Even if there is
412 // a corruption, the best we can do is roll back to our last recovery point
413 // and throw away un-flushed data. We can discard/reuse this byte if needed so
414 // that we have 4 bytes to store the size of protos, and increase the size of
415 // protos we support.
416 static constexpr uint8_t kProtoMagic = 0x5C;
417
418 // Our internal max for protos.
419 //
// WARNING: Changing this to a larger number may invalidate our assumption
// that the proto size can safely be stored in the last 3 bytes of the proto
// header.
423 static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB
424 static_assert(kMaxProtoSize <= 0x00FFFFFF,
425 "kMaxProtoSize doesn't fit in 3 bytes");
426
427 // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
428 static constexpr int kDeflateCompressionLevel = 3;
429
430 // Chunks of the file to mmap at a time, so we don't mmap the entire file.
431 // Only used on 32-bit devices
432 static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
433
434 ScopedFd fd_;
435 const Filesystem* const filesystem_;
436 const std::string file_path_;
437 std::unique_ptr<Header> header_;
438 };
439
// Namespace-scope definition of the static constexpr member kProtoMagic that
// is declared (with its initializer) inside the class.
// NOTE(review): presumably kept so the constant can be ODR-used on toolchains
// that predate C++17 inline variables — confirm before removing.
template <typename ProtoT>
constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic;
442
443 template <typename ProtoT>
FileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)444 FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
445 const std::string& file_path,
446 std::unique_ptr<Header> header)
447 : filesystem_(filesystem),
448 file_path_(file_path),
449 header_(std::move(header)) {
450 fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
451 }
452
453 template <typename ProtoT>
~FileBackedProtoLog()454 FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
455 if (!PersistToDisk().ok()) {
456 ICING_LOG(WARNING)
457 << "Error persisting to disk during destruction of FileBackedProtoLog: "
458 << file_path_;
459 }
460 }
461
462 template <typename ProtoT>
463 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)464 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
465 const std::string& file_path,
466 const Options& options) {
467 if (options.max_proto_size <= 0) {
468 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
469 "options.max_proto_size must be greater than 0, was %d",
470 options.max_proto_size));
471 }
472
473 // Since we store the proto_size in 3 bytes, we can only support protos of up
474 // to 16MiB.
475 if (options.max_proto_size > kMaxProtoSize) {
476 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
477 "options.max_proto_size must be under 16MiB, was %d",
478 options.max_proto_size));
479 }
480
481 if (!filesystem->FileExists(file_path.c_str())) {
482 return InitializeNewFile(filesystem, file_path, options);
483 }
484
485 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
486 if (file_size == Filesystem::kBadFileSize) {
487 return absl_ports::InternalError(
488 absl_ports::StrCat("Bad file size '", file_path, "'"));
489 }
490
491 if (file_size == 0) {
492 return InitializeNewFile(filesystem, file_path, options);
493 }
494
495 return InitializeExistingFile(filesystem, file_path, options, file_size);
496 }
497
498 template <typename ProtoT>
499 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)500 FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
501 const std::string& file_path,
502 const Options& options) {
503 // Create the header
504 std::unique_ptr<Header> header = std::make_unique<Header>();
505 header->compress = options.compress;
506 header->max_proto_size = options.max_proto_size;
507 header->header_checksum = header->CalculateHeaderChecksum();
508
509 if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
510 return absl_ports::InternalError(
511 absl_ports::StrCat("Failed to write header for file: ", file_path));
512 }
513
514 CreateResult create_result = {
515 std::unique_ptr<FileBackedProtoLog<ProtoT>>(
516 new FileBackedProtoLog<ProtoT>(filesystem, file_path,
517 std::move(header))),
518 /*data_loss=*/DataLoss::NONE};
519
520 return create_result;
521 }
522
523 template <typename ProtoT>
524 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)525 FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
526 const std::string& file_path,
527 const Options& options,
528 int64_t file_size) {
529 if (file_size < sizeof(Header)) {
530 return absl_ports::InternalError(
531 absl_ports::StrCat("File header too short for: ", file_path));
532 }
533
534 std::unique_ptr<Header> header = std::make_unique<Header>();
535 if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
536 /*offset=*/0)) {
537 return absl_ports::InternalError(
538 absl_ports::StrCat("Failed to read header for file: ", file_path));
539 }
540
541 // Make sure the header is still valid before we use any of its values. This
542 // is covered by the header_checksum check below, but this is a quick check
543 // that can save us from an extra crc computation.
544 if (header->magic != Header::kMagic) {
545 return absl_ports::InternalError(
546 absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
547 }
548
549 if (header->header_checksum != header->CalculateHeaderChecksum()) {
550 return absl_ports::InternalError(
551 absl_ports::StrCat("Invalid header checksum for: ", file_path));
552 }
553
554 if (header->compress != options.compress) {
555 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
556 "Inconsistent compress option, expected %d, actual %d",
557 header->compress, options.compress));
558 }
559
560 if (header->max_proto_size > options.max_proto_size) {
561 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
562 "Max proto size cannot be smaller than previous "
563 "instantiations, previous size %d, wanted size %d",
564 header->max_proto_size, options.max_proto_size));
565 }
566 header->max_proto_size = options.max_proto_size;
567
568 DataLoss data_loss = DataLoss::NONE;
569 ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
570 ComputeChecksum(filesystem, file_path, Crc32(),
571 sizeof(Header), file_size));
572
573 // Double check that the log checksum is the same as the one that was
574 // persisted last time. If not, we start recovery logic.
575 if (header->log_checksum != calculated_log_checksum.Get()) {
576 // Need to rewind the proto log since the checksums don't match.
577 // Worst case, we have to rewind the entire log back to just the header
578 int64_t last_known_good = sizeof(Header);
579
580 // Calculate the checksum of the log contents just up to the last rewind
581 // offset point. This will be valid if we just appended contents to the log
582 // without updating the checksum, and we can rewind back to this point
583 // safely.
584 ICING_ASSIGN_OR_RETURN(
585 calculated_log_checksum,
586 ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
587 header->rewind_offset));
588 if (header->log_checksum == calculated_log_checksum.Get()) {
589 // Check if it matches our last rewind state. If so, this becomes our last
590 // good state and we can safely truncate and recover from here.
591 last_known_good = header->rewind_offset;
592 data_loss = DataLoss::PARTIAL;
593 } else {
594 // Otherwise, we're going to truncate the entire log and this resets the
595 // checksum to an empty log state.
596 header->log_checksum = 0;
597 data_loss = DataLoss::COMPLETE;
598 }
599
600 if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
601 return absl_ports::InternalError(
602 absl_ports::StrCat("Error truncating file: ", file_path));
603 }
604
605 ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
606 << last_known_good;
607 }
608
609 CreateResult create_result = {
610 std::unique_ptr<FileBackedProtoLog<ProtoT>>(
611 new FileBackedProtoLog<ProtoT>(filesystem, file_path,
612 std::move(header))),
613 data_loss};
614
615 return create_result;
616 }
617
618 template <typename ProtoT>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)619 libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
620 const Filesystem* filesystem, const std::string& file_path,
621 Crc32 initial_crc, int64_t start, int64_t end) {
622 auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
623 MemoryMappedFile::Strategy::READ_ONLY);
624 Crc32 new_crc(initial_crc.Get());
625
626 if (start < 0) {
627 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
628 "Starting checksum offset of file '%s' must be greater than 0, was "
629 "%lld",
630 file_path.c_str(), static_cast<long long>(start)));
631 }
632
633 if (end < start) {
634 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
635 "Ending checksum offset of file '%s' must be greater than start "
636 "'%lld', was '%lld'",
637 file_path.c_str(), static_cast<long long>(start),
638 static_cast<long long>(end)));
639 }
640
641 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
642 if (end > file_size) {
643 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
644 "Ending checksum offset of file '%s' must be within "
645 "file size of %lld, was %lld",
646 file_path.c_str(), static_cast<long long>(file_size),
647 static_cast<long long>(end)));
648 }
649
650 Architecture architecture = GetArchitecture();
651 switch (architecture) {
652 case Architecture::BIT_64: {
653 // Don't mmap in chunks here since mmapping can be harmful on 64-bit
654 // devices where mmap/munmap calls need the mmap write semaphore, which
655 // blocks mmap/munmap/mprotect and all page faults from executing while
656 // they run. On 64-bit devices, this doesn't actually load into memory, it
657 // just makes the file faultable. So the whole file should be ok.
658 // b/185822878.
659 ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
660 auto mmap_str = std::string_view(mmapped_file.region(), end - start);
661 new_crc.Append(mmap_str);
662 break;
663 }
664 case Architecture::BIT_32:
665 [[fallthrough]];
666 case Architecture::UNKNOWN: {
667 // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
668 // much memory at once. If we're unknown, then also chunk it because we're
669 // not sure what the device can handle.
670 for (int i = start; i < end; i += kMmapChunkSize) {
671 // Don't read past the file size.
672 int next_chunk_size = kMmapChunkSize;
673 if ((i + kMmapChunkSize) >= end) {
674 next_chunk_size = end - i;
675 }
676
677 ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
678
679 auto mmap_str =
680 std::string_view(mmapped_file.region(), next_chunk_size);
681 new_crc.Append(mmap_str);
682 }
683 break;
684 }
685 }
686
687 return new_crc;
688 }
689
690 template <typename ProtoT>
WriteProto(const ProtoT & proto)691 libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
692 const ProtoT& proto) {
693 int64_t proto_size = proto.ByteSizeLong();
694 int32_t metadata;
695 int metadata_size = sizeof(metadata);
696 int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
697
698 if (proto_size > header_->max_proto_size) {
699 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
700 "proto_size, %lld, was too large to write. Max is %d",
701 static_cast<long long>(proto_size), header_->max_proto_size));
702 }
703
704 // At this point, we've guaranteed that proto_size is under kMaxProtoSize
705 // (see
706 // ::Create), so we can safely store it in an int.
707 int final_size = 0;
708
709 std::string proto_str;
710 google::protobuf::io::StringOutputStream proto_stream(&proto_str);
711
712 if (header_->compress) {
713 google::protobuf::io::GzipOutputStream::Options options;
714 options.format = google::protobuf::io::GzipOutputStream::ZLIB;
715 options.compression_level = kDeflateCompressionLevel;
716
717 google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
718 options);
719
720 bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
721 compressing_stream.Close();
722
723 if (!success) {
724 return absl_ports::InternalError("Error compressing proto.");
725 }
726
727 final_size = proto_str.size();
728
729 // In case the compressed proto is larger than the original proto, we also
730 // can't write it.
731 if (final_size > header_->max_proto_size) {
732 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
733 "Compressed proto size, %d, was greater than "
734 "max_proto_size, %d",
735 final_size, header_->max_proto_size));
736 }
737 } else {
738 // Serialize the proto directly into the write buffer at an offset of the
739 // metadata.
740 proto.SerializeToZeroCopyStream(&proto_stream);
741 final_size = proto_str.size();
742 }
743
744 // 1st byte for magic, next 3 bytes for proto size.
745 metadata = (kProtoMagic << 24) | final_size;
746
747 // Actually write metadata, has to be done after we know the possibly
748 // compressed proto size
749 if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
750 return absl_ports::InternalError(
751 absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
752 }
753
754 // Write the serialized proto
755 if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
756 return absl_ports::InternalError(
757 absl_ports::StrCat("Failed to write proto to: ", file_path_));
758 }
759
760 return current_position;
761 }
762
763 template <typename ProtoT>
ReadProto(int64_t file_offset)764 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
765 int64_t file_offset) const {
766 int64_t file_size = filesystem_->GetFileSize(fd_.get());
767 MemoryMappedFile mmapped_file(*filesystem_, file_path_,
768 MemoryMappedFile::Strategy::READ_ONLY);
769 if (file_offset >= file_size) {
770 // file_size points to the next byte to write at, so subtract one to get
771 // the inclusive, actual size of file.
772 return absl_ports::OutOfRangeError(
773 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
774 "out of range of the file size, %lld",
775 static_cast<long long>(file_offset),
776 static_cast<long long>(file_size - 1)));
777 }
778
779 // Read out the metadata
780 ICING_ASSIGN_OR_RETURN(
781 int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
782
783 // Copy out however many bytes it says the proto is
784 int stored_size = GetProtoSize(metadata);
785
786 ICING_RETURN_IF_ERROR(
787 mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
788
789 if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
790 return absl_ports::NotFoundError("The proto data has been erased.");
791 }
792
793 google::protobuf::io::ArrayInputStream proto_stream(
794 mmapped_file.mutable_region(), stored_size);
795
796 // Deserialize proto
797 ProtoT proto;
798 if (header_->compress) {
799 google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
800 proto.ParseFromZeroCopyStream(&decompress_stream);
801 } else {
802 proto.ParseFromZeroCopyStream(&proto_stream);
803 }
804
805 return proto;
806 }
807
808 template <typename ProtoT>
EraseProto(int64_t file_offset)809 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
810 int64_t file_offset) {
811 int64_t file_size = filesystem_->GetFileSize(fd_.get());
812 if (file_offset >= file_size) {
813 // file_size points to the next byte to write at, so subtract one to get
814 // the inclusive, actual size of file.
815 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
816 "Trying to erase data at a location, %lld, "
817 "out of range of the file size, %lld",
818 static_cast<long long>(file_offset),
819 static_cast<long long>(file_size - 1)));
820 }
821
822 MemoryMappedFile mmapped_file(
823 *filesystem_, file_path_,
824 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
825
826 // Read out the metadata
827 ICING_ASSIGN_OR_RETURN(
828 int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
829
830 ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
831 GetProtoSize(metadata)));
832
833 // We need to update the crc checksum if the erased area is before the
834 // rewind position.
835 if (file_offset + sizeof(metadata) < header_->rewind_offset) {
836 // We need to calculate [original string xor 0s].
837 // The xored string is the same as the original string because 0 xor 0 =
838 // 0, 1 xor 0 = 1.
839 const std::string_view xored_str(mmapped_file.region(),
840 mmapped_file.region_size());
841
842 Crc32 crc(header_->log_checksum);
843 ICING_ASSIGN_OR_RETURN(
844 uint32_t new_crc,
845 crc.UpdateWithXor(
846 xored_str,
847 /*full_data_size=*/header_->rewind_offset - sizeof(Header),
848 /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
849
850 header_->log_checksum = new_crc;
851 header_->header_checksum = header_->CalculateHeaderChecksum();
852
853 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
854 sizeof(Header))) {
855 return absl_ports::InternalError(
856 absl_ports::StrCat("Failed to update header to: ", file_path_));
857 }
858 }
859
860 memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
861 return libtextclassifier3::Status::OK;
862 }
863
864 template <typename ProtoT>
GetDiskUsage()865 libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
866 const {
867 int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
868 if (size == Filesystem::kBadFileSize) {
869 return absl_ports::InternalError("Failed to get disk usage of proto log");
870 }
871 return size;
872 }
873
874 template <typename ProtoT>
875 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()876 FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
877 int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
878 if (total_file_size == Filesystem::kBadFileSize) {
879 return absl_ports::InternalError(
880 "Failed to get file size of elments in the proto log");
881 }
882 return total_file_size - sizeof(Header);
883 }
884
885 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset)886 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
887 const std::string& file_path,
888 int64_t initial_offset)
889 : mmapped_file_(filesystem, file_path,
890 MemoryMappedFile::Strategy::READ_ONLY),
891 initial_offset_(initial_offset),
892 current_offset_(kInvalidOffset),
893 file_size_(filesystem.GetFileSize(file_path.c_str())) {
894 if (file_size_ == Filesystem::kBadFileSize) {
895 // Fails all Advance() calls
896 file_size_ = 0;
897 }
898 }
899
900 template <typename ProtoT>
Advance()901 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
902 if (current_offset_ == kInvalidOffset) {
903 // First Advance() call
904 current_offset_ = initial_offset_;
905 } else {
906 // Jumps to the next proto position
907 ICING_ASSIGN_OR_RETURN(
908 int metadata,
909 ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
910 current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
911 }
912
913 if (current_offset_ < file_size_) {
914 return libtextclassifier3::Status::OK;
915 } else {
916 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
917 "The next proto offset, %lld, is out of file range [0, %lld)",
918 static_cast<long long>(current_offset_),
919 static_cast<long long>(file_size_)));
920 }
921 }
922
923 template <typename ProtoT>
GetOffset()924 int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
925 return current_offset_;
926 }
927
928 template <typename ProtoT>
929 typename FileBackedProtoLog<ProtoT>::Iterator
GetIterator()930 FileBackedProtoLog<ProtoT>::GetIterator() {
931 return Iterator(*filesystem_, file_path_,
932 /*initial_offset=*/sizeof(Header));
933 }
934
935 template <typename ProtoT>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)936 libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
937 MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
938 // Checks file_offset
939 if (file_offset >= file_size) {
940 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
941 "offset, %lld, is out of file range [0, %lld)",
942 static_cast<long long>(file_offset),
943 static_cast<long long>(file_size)));
944 }
945 int metadata;
946 int metadata_size = sizeof(metadata);
947 if (file_offset + metadata_size >= file_size) {
948 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
949 "Wrong metadata offset %lld, metadata doesn't fit in "
950 "with file range [0, %lld)",
951 static_cast<long long>(file_offset),
952 static_cast<long long>(file_size)));
953 }
954 // Reads metadata
955 ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
956 memcpy(&metadata, mmapped_file->region(), metadata_size);
957 // Checks magic number
958 uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
959 if (stored_k_proto_magic != kProtoMagic) {
960 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
961 "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
962 stored_k_proto_magic));
963 }
964 return metadata;
965 }
966
967 template <typename ProtoT>
PersistToDisk()968 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
969 int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
970 if (file_size == header_->rewind_offset) {
971 // No new protos appended, don't need to update the checksum.
972 return libtextclassifier3::Status::OK;
973 }
974
975 int64_t new_content_size = file_size - header_->rewind_offset;
976 Crc32 crc;
977 if (new_content_size < 0) {
978 // File shrunk, recalculate the entire checksum.
979 ICING_ASSIGN_OR_RETURN(
980 crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
981 file_size));
982 } else {
983 // Append new changes to the existing checksum.
984 ICING_ASSIGN_OR_RETURN(
985 crc,
986 ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
987 header_->rewind_offset, file_size));
988 }
989
990 header_->log_checksum = crc.Get();
991 header_->rewind_offset = file_size;
992 header_->header_checksum = header_->CalculateHeaderChecksum();
993
994 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
995 sizeof(Header)) ||
996 !filesystem_->DataSync(fd_.get())) {
997 return absl_ports::InternalError(
998 absl_ports::StrCat("Failed to update header to: ", file_path_));
999 }
1000
1001 return libtextclassifier3::Status::OK;
1002 }
1003
1004 template <typename ProtoT>
1005 libtextclassifier3::StatusOr<Crc32>
ComputeChecksum()1006 FileBackedProtoLog<ProtoT>::ComputeChecksum() {
1007 return FileBackedProtoLog<ProtoT>::ComputeChecksum(
1008 filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
1009 /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
1010 }
1011
1012 } // namespace lib
1013 } // namespace icing
1014
1015 #endif // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
1016