• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A simple file-backed proto with an in-memory cache.
16 // WARNING: Only use this for small protos. Files storing larger protos can
17 // benefit from more sophisticated strategies like chunked reads/writes,
18 // using mmap and ideally, not even using protos.
19 //
20 // TODO(b/133793579) Consider exposing a checksum mismatch to callers.
21 
22 #ifndef ICING_FILE_FILE_BACKED_PROTO_H_
23 #define ICING_FILE_FILE_BACKED_PROTO_H_
24 
25 #include <cstdint>
26 #include <memory>
27 #include <string>
28 #include <string_view>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/mutex.h"
34 #include "icing/absl_ports/str_cat.h"
35 #include "icing/absl_ports/thread_annotations.h"
36 #include "icing/file/filesystem.h"
37 #include "icing/legacy/core/icing-string-util.h"
38 #include "icing/util/crc32.h"
39 #include "icing/util/logging.h"
40 
41 namespace icing {
42 namespace lib {
43 
44 // This class is go/thread-compatible
45 template <typename ProtoT>
46 class FileBackedProto {
47  public:
48   // Header stored at the beginning of the file before the proto.
49   struct Header {
50     static constexpr int32_t kMagic = 0x726f746f;
51 
52     // Holds the magic as a quick sanity check against file corruption.
53     int32_t magic;
54 
55     // Checksum of the serialized proto, for a more thorough check against file
56     // corruption.
57     uint32_t proto_checksum;
58   };
59 
60   // Used the specified file to read older version of the proto and store
61   // newer versions of the proto.
62   //
63   // file_path : Must be a path within in a directory that already exists.
64   FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
65 
66   // Returns a reference to the proto read from the file. It
67   // internally caches the read proto so that future calls are fast.
68   //
69   // NOTE: The caller does NOT get ownership of the object returned and
70   // the returned object is only valid till a new version of the proto is
71   // written to the file.
72   //
73   // Returns NOT_FOUND if the file was empty or never written to.
74   // Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
75   libtextclassifier3::StatusOr<const ProtoT*> Read() const
76       ICING_LOCKS_EXCLUDED(mutex_);
77 
78   // Writes the new version of the proto provided through to disk.
79   // Successful Write() invalidates any previously read version of the proto.
80   //
81   // Returns INTERNAL_ERROR if any IO error is encountered and will NOT
82   // invalidate any previously read versions of the proto.
83   //
84   // TODO(cassiewang) The implementation today loses old data if Write() fails.
85   // We should write to a tmp file first and rename the file to fix this.
86   // TODO(cassiewang) Change to Write(ProtoT&& proto)
87   libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
88       ICING_LOCKS_EXCLUDED(mutex_);
89 
90   // Disallow copy and assign.
91   FileBackedProto(const FileBackedProto&) = delete;
92   FileBackedProto& operator=(const FileBackedProto&) = delete;
93 
94  private:
95   // Upper bound of file-size that is supported.
96   static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024;  // 1 MiB.
97 
98   // Used to provide reader and writer locks
99   mutable absl_ports::shared_mutex mutex_;
100 
101   const Filesystem* const filesystem_;
102   const std::string file_path_;
103 
104   mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
105 };
106 
107 template <typename ProtoT>
108 constexpr int32_t FileBackedProto<ProtoT>::kMaxFileSize;
109 
110 template <typename ProtoT>
FileBackedProto(const Filesystem & filesystem,const std::string_view file_path)111 FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem,
112                                          const std::string_view file_path)
113     : filesystem_(&filesystem), file_path_(file_path) {}
114 
115 template <typename ProtoT>
Read()116 libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
117     const {
118   ICING_VLOG(1) << "Reading proto from file: " << file_path_;
119 
120   absl_ports::unique_lock l(&mutex_);
121 
122   // Return cached proto if we've already read from disk.
123   if (cached_proto_ != nullptr) {
124     ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_;
125     return cached_proto_.get();
126   }
127 
128   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
129   if (file_size == Filesystem::kBadFileSize || file_size == 0) {
130     return absl_ports::NotFoundError(
131         absl_ports::StrCat("Missing file: ", file_path_));
132   }
133 
134   if (file_size > kMaxFileSize) {
135     return absl_ports::InternalError(absl_ports::StrCat(
136         "File larger than expected, couldn't read: ", file_path_));
137   }
138 
139   ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str()));
140   if (!fd.is_valid()) {
141     return absl_ports::InternalError(
142         absl_ports::StrCat("Unable to open file for read: ", file_path_));
143   }
144 
145   ICING_VLOG(1) << "Loading proto from  file: " << file_path_
146                 << " of size: " << file_size;
147 
148   Header header;
149   if (!filesystem_->PRead(fd.get(), &header, sizeof(Header),
150                           /*offset=*/0)) {
151     return absl_ports::InternalError(
152         absl_ports::StrCat("Unable to read header of: ", file_path_));
153   }
154 
155   if (header.magic != Header::kMagic) {
156     return absl_ports::InternalError(
157         absl_ports::StrCat("Invalid header kMagic for: ", file_path_));
158   }
159 
160   int proto_size = file_size - sizeof(Header);
161   auto buffer = std::make_unique<uint8_t[]>(proto_size);
162   if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size,
163                           /*offset=*/sizeof(Header))) {
164     return absl_ports::InternalError(
165         absl_ports::StrCat("File read failed: ", file_path_));
166   }
167 
168   std::string_view buffer_str(reinterpret_cast<const char*>(buffer.get()),
169                               proto_size);
170   Crc32 crc;
171   crc.Append(buffer_str);
172   if (header.proto_checksum != crc.Get()) {
173     return absl_ports::InternalError(
174         absl_ports::StrCat("Checksum of file does not match: ", file_path_));
175   }
176 
177   auto proto = std::make_unique<ProtoT>();
178   if (!proto->ParseFromArray(buffer.get(), proto_size)) {
179     return absl_ports::InternalError(
180         absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_));
181   }
182 
183   ICING_VLOG(1) << "Successfully read proto from file: " << file_path_;
184   cached_proto_ = std::move(proto);
185   return cached_proto_.get();
186 }
187 
188 template <typename ProtoT>
Write(std::unique_ptr<ProtoT> new_proto)189 libtextclassifier3::Status FileBackedProto<ProtoT>::Write(
190     std::unique_ptr<ProtoT> new_proto) {
191   ICING_VLOG(1) << "Writing proto to file: " << file_path_;
192 
193   absl_ports::unique_lock l(&mutex_);
194 
195   const std::string new_proto_str = new_proto->SerializeAsString();
196   if (new_proto_str.size() >= kMaxFileSize) {
197     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
198         "New proto too large. size: %d; limit: %d.",
199         static_cast<int>(new_proto_str.size()), kMaxFileSize));
200   }
201 
202   if (cached_proto_ != nullptr &&
203       cached_proto_->SerializeAsString() == new_proto_str) {
204     ICING_VLOG(1) << "Skip writing proto to file as contents are identical: "
205                   << file_path_;
206     return libtextclassifier3::Status::OK;
207   }
208 
209   ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str()));
210   if (!fd.is_valid()) {
211     return absl_ports::InternalError(
212         absl_ports::StrCat("Unable to open file for write: ", file_path_));
213   }
214 
215   if (!filesystem_->Truncate(fd.get(), 0)) {
216     return absl_ports::InternalError(
217         absl_ports::StrCat("Failed to truncate file: ", file_path_));
218   }
219 
220   Header header;
221   header.magic = Header::kMagic;
222 
223   Crc32 crc;
224   crc.Append(new_proto_str);
225   header.proto_checksum = crc.Get();
226   if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) {
227     return absl_ports::InternalError(
228         absl_ports::StrCat("Failed to write header to file: ", file_path_));
229   }
230 
231   if (!filesystem_->Write(fd.get(), new_proto_str.data(),
232                           new_proto_str.size())) {
233     return absl_ports::InternalError(
234         absl_ports::StrCat("Failed to write proto to file: ", file_path_));
235   }
236 
237   if (!filesystem_->DataSync(fd.get())) {
238     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
239         "Failed to sync file; filename: %s; content_size: %d ",
240         file_path_.c_str(), static_cast<int>(new_proto_str.size())));
241   }
242 
243   ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_;
244   cached_proto_ = std::move(new_proto);
245   return libtextclassifier3::Status::OK;
246 }
247 
248 }  // namespace lib
249 }  // namespace icing
250 
251 #endif  // ICING_FILE_FILE_BACKED_PROTO_H_
252