1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // The implementation in this file is deprecated and replaced by
18 // portable-file-backed-proto-log.h.
19 //
20 // This deprecated implementation has been made read-only for the purposes of
21 // migration; writing and erasing this format of log is no longer supported and
22 // the methods to accomplish this have been removed.
23 //
24 // The details of this format follow below:
25 // Each proto written to the file will have a metadata written just before it.
26 // The metadata consists of
27 // {
// 1 byte of kProtoMagic;
29 // 3 bytes of the proto size
30 // n bytes of the proto itself
31 // }
32 // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
33 // migration method.
34 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
35 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
36
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/gzip_stream.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
56
57 namespace icing {
58 namespace lib {
59
60 template <typename ProtoT>
61 class FileBackedProtoLog {
62 public:
63 struct Options {
64 // Whether to compress each proto before writing to the proto log.
65 bool compress;
66
67 // Byte-size limit for each proto written to the store. This does not
68 // include the bytes needed for the metadata of each proto.
69 //
70 // NOTE: Currently, we only support protos up to 16MiB. We store the proto
71 // size in 3 bytes within the metadata.
72 //
73 // NOTE: This limit is only enforced for future writes. If the store
74 // previously had a higher limit, then reading older entries could return
75 // larger protos.
76 //
77 // NOTE: The max_proto_size is the upper limit for input protos into the
78 // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
79 // to a smaller size, ProtoLog will not accept it. Protos that result in a
80 // compressed size larger than max_proto_size are also not accepted.
81 const int32_t max_proto_size;
82
83 // Must specify values for options.
84 Options() = delete;
85 explicit Options(bool compress_in,
86 const int32_t max_proto_size_in = kMaxProtoSize)
compressOptions87 : compress(compress_in), max_proto_size(max_proto_size_in) {}
88 };
89
90 // Header stored at the beginning of the file before the rest of the log
91 // contents. Stores metadata on the log.
92 struct Header {
93 static constexpr int32_t kMagic = 0xf4c6f67a;
94
95 // Holds the magic as a quick sanity check against file corruption.
96 int32_t magic = kMagic;
97
98 // Whether to compress the protos before writing to the log.
99 bool compress = true;
100
101 // The maximum proto size that can be written to the log.
102 int32_t max_proto_size = 0;
103
104 // Checksum of the log elements, doesn't include the header fields.
105 uint32_t log_checksum = 0;
106
107 // Last known good offset at which the log and its checksum were updated.
108 // If we crash between writing to the log and updating the checksum, we can
109 // try to rewind the log to this offset and verify the checksum is still
110 // valid instead of throwing away the entire log.
111 int64_t rewind_offset = sizeof(Header);
112
113 // Must be at the end. Contains the crc checksum of the preceding fields.
114 uint32_t header_checksum = 0;
115
CalculateHeaderChecksumHeader116 uint32_t CalculateHeaderChecksum() const {
117 Crc32 crc;
118 std::string_view header_str(reinterpret_cast<const char*>(this),
119 offsetof(Header, header_checksum));
120 crc.Append(header_str);
121 return crc.Get();
122 }
123 };
124
125 struct CreateResult {
126 // A successfully initialized log.
127 std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
128
129 // The data status after initializing from a previous state. Data loss can
130 // happen if the file is corrupted or some previously added data was
131 // unpersisted. This may be used to signal that any derived data off of the
132 // proto log may need to be regenerated.
133 DataLoss data_loss;
134
has_data_lossCreateResult135 bool has_data_loss() {
136 return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
137 }
138 };
139
140 // Factory method to create, initialize, and return a FileBackedProtoLog. Will
141 // create the file if it doesn't exist.
142 //
143 // If on re-initialization the log detects disk corruption or some previously
144 // added data was unpersisted, the log will rewind to the last-good state. The
145 // log saves these checkpointed "good" states when PersistToDisk() is called
146 // or the log is safely destructed. If the log rewinds successfully to the
147 // last-good state, then the returned CreateResult.data_loss indicates
148 // whether it has a data loss and what kind of data loss it is (partial or
149 // complete) so that any derived data may know that it needs to be updated. If
150 // the log re-initializes successfully without any data loss,
151 // CreateResult.data_loss will be NONE.
152 //
153 // Params:
154 // filesystem: Handles system level calls
155 // file_path: Path of the underlying file. Directory of the file should
156 // already exist
157 // options: Configuration options for the proto log
158 //
159 // Returns:
160 // FileBackedProtoLog::CreateResult on success
161 // INVALID_ARGUMENT on an invalid option
162 // INTERNAL_ERROR on IO error
163 static libtextclassifier3::StatusOr<CreateResult> Create(
164 const Filesystem* filesystem, const std::string& file_path,
165 const Options& options);
166
167 // Not copyable
168 FileBackedProtoLog(const FileBackedProtoLog&) = delete;
169 FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
170
171 // Reads out a proto located at file_offset from the file.
172 //
173 // Returns:
174 // A proto on success
175 // NOT_FOUND if the proto at the given offset has been erased
176 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
177 // INTERNAL_ERROR on IO error
178 libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
179
180 // An iterator helping to find offsets of all the protos in file.
181 // Example usage:
182 //
183 // while (iterator.Advance().ok()) {
184 // int64_t offset = iterator.GetOffset();
185 // // Do something
186 // }
187 class Iterator {
188 public:
189 explicit Iterator(const Filesystem& filesystem,
190 const std::string& file_path, int64_t initial_offset,
191 MemoryMappedFile&& mmapped_file);
192
193 // Advances to the position of next proto whether it has been erased or not.
194 //
195 // Returns:
196 // OK on success
197 // OUT_OF_RANGE_ERROR if it reaches the end
198 // INTERNAL_ERROR on IO error
199 libtextclassifier3::Status Advance();
200
201 // Returns the file offset of current proto.
202 int64_t GetOffset();
203
204 private:
205 static constexpr int64_t kInvalidOffset = -1;
206 // Used to read proto metadata
207 MemoryMappedFile mmapped_file_;
208 // Offset of first proto
209 int64_t initial_offset_;
210 int64_t current_offset_;
211 int64_t file_size_;
212 };
213
214 // Returns an iterator of current proto log. The caller needs to keep the
215 // proto log unchanged while using the iterator, otherwise unexpected
216 // behaviors could happen.
217 libtextclassifier3::StatusOr<Iterator> GetIterator();
218
219 private:
220 // Object can only be instantiated via the ::Create factory.
221 FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
222 std::unique_ptr<Header> header);
223
224 // Initializes a new proto log.
225 //
226 // Returns:
  //   CreateResult on success
228 // INTERNAL_ERROR on IO error
229 static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
230 const Filesystem* filesystem, const std::string& file_path,
231 const Options& options);
232
233 // Verifies that the existing proto log is in a good state. If not in a good
234 // state, then the proto log may be truncated to the last good state and
235 // content will be lost.
236 //
237 // Returns:
  //   CreateResult on success
239 // INTERNAL_ERROR on IO error or internal inconsistencies in the file
240 // INVALID_ARGUMENT_ERROR if options aren't consistent with previous
241 // instances
242 static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
243 const Filesystem* filesystem, const std::string& file_path,
244 const Options& options, int64_t file_size);
245
246 // Takes an initial checksum and updates it with the content between `start`
247 // and `end` offsets in the file.
248 //
249 // Returns:
250 // Crc of the content between `start`, inclusive, and `end`, exclusive.
251 // INTERNAL_ERROR on IO error
252 // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
253 static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
254 const Filesystem* filesystem, const std::string& file_path,
255 Crc32 initial_crc, int64_t start, int64_t end);
256
IsEmptyBuffer(const char * buffer,int size)257 static bool IsEmptyBuffer(const char* buffer, int size) {
258 return std::all_of(buffer, buffer + size,
259 [](const char byte) { return byte == 0; });
260 }
261
262 // Helper function to get stored proto size from the metadata.
263 // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)264 static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
265
266 // Helper function to get stored proto magic from the metadata.
267 // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)268 static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
269
270 // Reads out the metadata of a proto located at file_offset from the file.
271 //
272 // Returns:
273 // Proto's metadata on success
274 // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
275 // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
276 static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
277 MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
278
279 // Magic number added in front of every proto. Used when reading out protos
280 // as a first check for corruption in each entry in the file. Even if there is
281 // a corruption, the best we can do is roll back to our last recovery point
282 // and throw away un-flushed data. We can discard/reuse this byte if needed so
283 // that we have 4 bytes to store the size of protos, and increase the size of
284 // protos we support.
285 static constexpr uint8_t kProtoMagic = 0x5C;
286
287 // Our internal max for protos.
288 //
  // WARNING: Changing this to a larger number may invalidate our assumption
  // that the proto size can safely be stored in the last 3 bytes of the proto
  // header.
292 static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB
293 static_assert(kMaxProtoSize <= 0x00FFFFFF,
294 "kMaxProtoSize doesn't fit in 3 bytes");
295
296 // Chunks of the file to mmap at a time, so we don't mmap the entire file.
297 // Only used on 32-bit devices
298 static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
299
300 ScopedFd fd_;
301 const Filesystem* const filesystem_;
302 const std::string file_path_;
303 std::unique_ptr<Header> header_;
304 };
305
306 template <typename ProtoT>
FileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)307 FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
308 const std::string& file_path,
309 std::unique_ptr<Header> header)
310 : filesystem_(filesystem),
311 file_path_(file_path),
312 header_(std::move(header)) {
313 fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
314 }
315
316 template <typename ProtoT>
317 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)318 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
319 const std::string& file_path,
320 const Options& options) {
321 if (options.max_proto_size <= 0) {
322 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
323 "options.max_proto_size must be greater than 0, was %d",
324 options.max_proto_size));
325 }
326
327 // Since we store the proto_size in 3 bytes, we can only support protos of up
328 // to 16MiB.
329 if (options.max_proto_size > kMaxProtoSize) {
330 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
331 "options.max_proto_size must be under 16MiB, was %d",
332 options.max_proto_size));
333 }
334
335 if (!filesystem->FileExists(file_path.c_str())) {
336 return InitializeNewFile(filesystem, file_path, options);
337 }
338
339 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
340 if (file_size == Filesystem::kBadFileSize) {
341 return absl_ports::InternalError(
342 absl_ports::StrCat("Bad file size '", file_path, "'"));
343 }
344
345 if (file_size == 0) {
346 return InitializeNewFile(filesystem, file_path, options);
347 }
348
349 return InitializeExistingFile(filesystem, file_path, options, file_size);
350 }
351
352 template <typename ProtoT>
353 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)354 FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
355 const std::string& file_path,
356 const Options& options) {
357 // Create the header
358 std::unique_ptr<Header> header = std::make_unique<Header>();
359 header->compress = options.compress;
360 header->max_proto_size = options.max_proto_size;
361 header->header_checksum = header->CalculateHeaderChecksum();
362
363 if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
364 return absl_ports::InternalError(
365 absl_ports::StrCat("Failed to write header for file: ", file_path));
366 }
367
368 CreateResult create_result = {
369 std::unique_ptr<FileBackedProtoLog<ProtoT>>(
370 new FileBackedProtoLog<ProtoT>(filesystem, file_path,
371 std::move(header))),
372 /*data_loss=*/DataLoss::NONE};
373
374 return create_result;
375 }
376
377 template <typename ProtoT>
378 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)379 FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
380 const std::string& file_path,
381 const Options& options,
382 int64_t file_size) {
383 if (file_size < sizeof(Header)) {
384 return absl_ports::InternalError(
385 absl_ports::StrCat("File header too short for: ", file_path));
386 }
387
388 std::unique_ptr<Header> header = std::make_unique<Header>();
389 if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
390 /*offset=*/0)) {
391 return absl_ports::InternalError(
392 absl_ports::StrCat("Failed to read header for file: ", file_path));
393 }
394
395 // Make sure the header is still valid before we use any of its values. This
396 // is covered by the header_checksum check below, but this is a quick check
397 // that can save us from an extra crc computation.
398 if (header->magic != Header::kMagic) {
399 return absl_ports::InternalError(
400 absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
401 }
402
403 if (header->header_checksum != header->CalculateHeaderChecksum()) {
404 return absl_ports::InternalError(
405 absl_ports::StrCat("Invalid header checksum for: ", file_path));
406 }
407
408 if (header->compress != options.compress) {
409 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
410 "Inconsistent compress option, expected %d, actual %d",
411 header->compress, options.compress));
412 }
413
414 if (header->max_proto_size > options.max_proto_size) {
415 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
416 "Max proto size cannot be smaller than previous "
417 "instantiations, previous size %d, wanted size %d",
418 header->max_proto_size, options.max_proto_size));
419 }
420 header->max_proto_size = options.max_proto_size;
421
422 DataLoss data_loss = DataLoss::NONE;
423 ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
424 ComputeChecksum(filesystem, file_path, Crc32(),
425 sizeof(Header), file_size));
426
427 // Double check that the log checksum is the same as the one that was
428 // persisted last time. If not, we start recovery logic.
429 if (header->log_checksum != calculated_log_checksum.Get()) {
430 // Need to rewind the proto log since the checksums don't match.
431 // Worst case, we have to rewind the entire log back to just the header
432 int64_t last_known_good = sizeof(Header);
433
434 // Calculate the checksum of the log contents just up to the last rewind
435 // offset point. This will be valid if we just appended contents to the log
436 // without updating the checksum, and we can rewind back to this point
437 // safely.
438 ICING_ASSIGN_OR_RETURN(
439 calculated_log_checksum,
440 ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
441 header->rewind_offset));
442 if (header->log_checksum == calculated_log_checksum.Get()) {
443 // Check if it matches our last rewind state. If so, this becomes our last
444 // good state and we can safely truncate and recover from here.
445 last_known_good = header->rewind_offset;
446 data_loss = DataLoss::PARTIAL;
447 } else {
448 // Otherwise, we're going to truncate the entire log and this resets the
449 // checksum to an empty log state.
450 header->log_checksum = 0;
451 data_loss = DataLoss::COMPLETE;
452 }
453
454 if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
455 return absl_ports::InternalError(
456 absl_ports::StrCat("Error truncating file: ", file_path));
457 }
458
459 ICING_LOG(WARNING) << "Truncated '" << file_path << "' to size "
460 << last_known_good;
461 }
462
463 CreateResult create_result = {
464 std::unique_ptr<FileBackedProtoLog<ProtoT>>(
465 new FileBackedProtoLog<ProtoT>(filesystem, file_path,
466 std::move(header))),
467 data_loss};
468
469 return create_result;
470 }
471
472 template <typename ProtoT>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)473 libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
474 const Filesystem* filesystem, const std::string& file_path,
475 Crc32 initial_crc, int64_t start, int64_t end) {
476 ICING_ASSIGN_OR_RETURN(
477 MemoryMappedFile mmapped_file,
478 MemoryMappedFile::Create(*filesystem, file_path,
479 MemoryMappedFile::Strategy::READ_ONLY));
480 Crc32 new_crc(initial_crc.Get());
481
482 if (start < 0) {
483 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
484 "Starting checksum offset of file '%s' must be greater than 0, was "
485 "%lld",
486 file_path.c_str(), static_cast<long long>(start)));
487 }
488
489 if (end < start) {
490 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
491 "Ending checksum offset of file '%s' must be greater than start "
492 "'%lld', was '%lld'",
493 file_path.c_str(), static_cast<long long>(start),
494 static_cast<long long>(end)));
495 }
496
497 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
498 if (end > file_size) {
499 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
500 "Ending checksum offset of file '%s' must be within "
501 "file size of %lld, was %lld",
502 file_path.c_str(), static_cast<long long>(file_size),
503 static_cast<long long>(end)));
504 }
505
506 Architecture architecture = GetArchitecture();
507 switch (architecture) {
508 case Architecture::BIT_64: {
509 // Don't mmap in chunks here since mmapping can be harmful on 64-bit
510 // devices where mmap/munmap calls need the mmap write semaphore, which
511 // blocks mmap/munmap/mprotect and all page faults from executing while
512 // they run. On 64-bit devices, this doesn't actually load into memory, it
513 // just makes the file faultable. So the whole file should be ok.
514 // b/185822878.
515 ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
516 auto mmap_str = std::string_view(mmapped_file.region(), end - start);
517 new_crc.Append(mmap_str);
518 break;
519 }
520 case Architecture::BIT_32:
521 [[fallthrough]];
522 case Architecture::UNKNOWN: {
523 // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
524 // much memory at once. If we're unknown, then also chunk it because we're
525 // not sure what the device can handle.
526 for (int i = start; i < end; i += kMmapChunkSize) {
527 // Don't read past the file size.
528 int next_chunk_size = kMmapChunkSize;
529 if ((i + kMmapChunkSize) >= end) {
530 next_chunk_size = end - i;
531 }
532
533 ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
534
535 auto mmap_str =
536 std::string_view(mmapped_file.region(), next_chunk_size);
537 new_crc.Append(mmap_str);
538 }
539 break;
540 }
541 }
542
543 return new_crc;
544 }
545
546 template <typename ProtoT>
ReadProto(int64_t file_offset)547 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
548 int64_t file_offset) const {
549 int64_t file_size = filesystem_->GetFileSize(fd_.get());
550 ICING_ASSIGN_OR_RETURN(
551 MemoryMappedFile mmapped_file,
552 MemoryMappedFile::Create(*filesystem_, file_path_,
553 MemoryMappedFile::Strategy::READ_ONLY));
554 if (file_offset >= file_size) {
555 // file_size points to the next byte to write at, so subtract one to get
556 // the inclusive, actual size of file.
557 return absl_ports::OutOfRangeError(
558 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
559 "out of range of the file size, %lld",
560 static_cast<long long>(file_offset),
561 static_cast<long long>(file_size - 1)));
562 }
563
564 // Read out the metadata
565 ICING_ASSIGN_OR_RETURN(
566 int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
567
568 // Copy out however many bytes it says the proto is
569 int stored_size = GetProtoSize(metadata);
570
571 ICING_RETURN_IF_ERROR(
572 mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
573
574 if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
575 return absl_ports::NotFoundError("The proto data has been erased.");
576 }
577
578 google::protobuf::io::ArrayInputStream proto_stream(mmapped_file.mutable_region(),
579 stored_size);
580
581 // Deserialize proto
582 ProtoT proto;
583 if (header_->compress) {
584 protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
585 proto.ParseFromZeroCopyStream(&decompress_stream);
586 } else {
587 proto.ParseFromZeroCopyStream(&proto_stream);
588 }
589
590 return proto;
591 }
592
593 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset,MemoryMappedFile && mmapped_file)594 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
595 const std::string& file_path,
596 int64_t initial_offset,
597 MemoryMappedFile&& mmapped_file)
598 : mmapped_file_(std::move(mmapped_file)),
599 initial_offset_(initial_offset),
600 current_offset_(kInvalidOffset),
601 file_size_(filesystem.GetFileSize(file_path.c_str())) {
602 if (file_size_ == Filesystem::kBadFileSize) {
603 // Fails all Advance() calls
604 file_size_ = 0;
605 }
606 }
607
608 template <typename ProtoT>
Advance()609 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
610 if (current_offset_ == kInvalidOffset) {
611 // First Advance() call
612 current_offset_ = initial_offset_;
613 } else {
614 // Jumps to the next proto position
615 ICING_ASSIGN_OR_RETURN(
616 int metadata,
617 ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
618 current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
619 }
620
621 if (current_offset_ < file_size_) {
622 return libtextclassifier3::Status::OK;
623 } else {
624 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
625 "The next proto offset, %lld, is out of file range [0, %lld)",
626 static_cast<long long>(current_offset_),
627 static_cast<long long>(file_size_)));
628 }
629 }
630
631 template <typename ProtoT>
GetOffset()632 int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
633 return current_offset_;
634 }
635
636 template <typename ProtoT>
637 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::Iterator>
GetIterator()638 FileBackedProtoLog<ProtoT>::GetIterator() {
639 ICING_ASSIGN_OR_RETURN(
640 MemoryMappedFile mmapped_file,
641 MemoryMappedFile::Create(*filesystem_, file_path_,
642 MemoryMappedFile::Strategy::READ_ONLY));
643 return Iterator(*filesystem_, file_path_,
644 /*initial_offset=*/sizeof(Header), std::move(mmapped_file));
645 }
646
647 template <typename ProtoT>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)648 libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
649 MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
650 // Checks file_offset
651 if (file_offset >= file_size) {
652 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
653 "offset, %lld, is out of file range [0, %lld)",
654 static_cast<long long>(file_offset),
655 static_cast<long long>(file_size)));
656 }
657 int metadata;
658 int metadata_size = sizeof(metadata);
659 if (file_offset + metadata_size >= file_size) {
660 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
661 "Wrong metadata offset %lld, metadata doesn't fit in "
662 "with file range [0, %lld)",
663 static_cast<long long>(file_offset),
664 static_cast<long long>(file_size)));
665 }
666 // Reads metadata
667 ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
668 memcpy(&metadata, mmapped_file->region(), metadata_size);
669 // Checks magic number
670 uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
671 if (stored_k_proto_magic != kProtoMagic) {
672 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
673 "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
674 stored_k_proto_magic));
675 }
676 return metadata;
677 }
678
679 } // namespace lib
680 } // namespace icing
681
682 #endif // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
683