/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "snapuserd.h"

#include <csignal>
#include <optional>
#include <set>

#include <snapuserd/snapuserd_client.h>

namespace android {
namespace snapshot {

using namespace android;
using namespace android::dm;
using android::base::unique_fd;

#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "

/*
 * Merging a copy operation involves the following flow:
 *
 * 1: dm-snapshot layer requests merge for a 4k block. dm-user sends the
 *    request to the daemon
 * 2: daemon reads the source block
 * 3: daemon copies the source data
 * 4: IO completion sent back to dm-user (a switch from user space to kernel)
 * 5: dm-snapshot merges the data to the base device
 * 6: dm-snapshot sends the merge-completion IO to dm-user
 * 7: dm-user re-directs the merge completion IO to the daemon (one more switch)
 * 8: daemon updates the COW file about the completed merge request (a write
 *    syscall) followed by a fsync
 * 9: daemon sends the IO completion back to dm-user
 *
 * The above sequence is a significant overhead, especially when merging one 4k
 * block at a time.
 *
 * The read-ahead layer optimizes the above path by reading the data from the
 * base device in the background so that the merging thread can retrieve the
 * data from the read-ahead cache. Additionally, syncing of merged data is
 * deferred to the read-ahead thread, thereby keeping the IO path free of this
 * bottleneck.
 *
 * We create a scratch space of 2MB to store the read-ahead data in the COW
 * device.
 *
 *      +-----------------------+
 *      |     Header (fixed)    |
 *      +-----------------------+
 *      |     Scratch space     |  <-- 2MB
 *      +-----------------------+
 *
 * Scratch space is as follows:
 *
 *      +-----------------------+
 *      |        Metadata       | <- 4k page
 *      +-----------------------+
 *      |        Metadata       | <- 4k page
 *      +-----------------------+
 *      |                       |
 *      |    Read-ahead data    |
 *      |                       |
 *      +-----------------------+
 *
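 * Each metadata slot is a fixed-size record that pairs a destination block
 * with the location of its cached data. A rough sketch of the layout (the
 * authoritative definition lives in snapuserd.h):
 *
 *      struct ScratchMetadata {
 *          uint64_t new_block;   // destination block of the COW op
 *          loff_t file_offset;   // offset of the cached data within the COW
 *      };
 *
 * A zero-filled entry (new_block == 0 && file_offset == 0) terminates the
 * metadata list; the code below relies on this when re-constructing state
 * after a crash.
 *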
 * State transitions and communication between read-ahead thread and worker
 * thread during merge:
 * =====================================================================
 *
 *   Worker Threads                                 Read-Ahead thread
 *   ------------------------------------------------------------------
 *
 *      |
 *      |
 *  --> -----------------READ_AHEAD_BEGIN------------->|
 * |   |                                               | READ_AHEAD_IN_PROGRESS
 * |  WAIT                                             |
 * |   |                                               |
 * |   |<-----------------IO_IN_PROGRESS---------------
 * |   |                                               |
 * |   |            IO_IN_PROGRESS                    WAIT
 * |   |                                               |
 * |<--|                                               |
 * |                                                   |
 *  ------------------IO_TERMINATED--------------->|
 *         END
 *
 *
 * ===================================================================
 *
 * Example:
 *
 * We have 6 copy operations to be executed in the OTA, and there is an
 * overlap. Update-engine will write to the COW file as follows:
 *
 * Op-1: 20 -> 23
 * Op-2: 19 -> 22
 * Op-3: 18 -> 21
 * Op-4: 17 -> 20
 * Op-5: 16 -> 19
 * Op-6: 15 -> 18
 *
 * The read-ahead thread will read all 6 source blocks and store the data in
 * the scratch space. Metadata will contain the destination block numbers.
 * Thus, the scratch space will look something like this:
 *
 *       +--------------+
 *       |  Block   23  |
 *       |  offset - 1  |
 *       +--------------+
 *       |  Block   22  |
 *       |  offset - 2  |
 *       +--------------+
 *       |  Block   21  |
 *       |  offset - 3  |
 *       +--------------+
 *           ...
 *           ...
 *       +--------------+
 *       | Data-Block 20| <-- offset - 1
 *       +--------------+
 *       | Data-Block 19| <-- offset - 2
 *       +--------------+
 *       | Data-Block 18| <-- offset - 3
 *       +--------------+
 *           ...
 *           ...
 *
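 * After the scratch space is populated, ReadAheadIOStart() below builds an
 * in-memory map from each destination block to the address of its cached
 * source data; conceptually (illustrative only):
 *
 *      read_ahead_buffer_map[23] -> cached data of source block 20
 *      read_ahead_buffer_map[22] -> cached data of source block 19
 *      read_ahead_buffer_map[21] -> cached data of source block 18
 *      ...
 *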
 * ====================================================================
 * IO Path:
 *
 * Read-ahead will serve the data to worker threads during merge only
 * after metadata and data are persisted to the scratch space. Worker
 * threads will always retrieve the data from the cache during merge; if
 * the cache is not populated, they will wait for the read-ahead thread to
 * finish. Furthermore, the number of operations merged will be synced to
 * the header only when all the blocks in the read-ahead cache are merged.
 * In the above case, when all 6 operations are merged, the COW header is
 * updated with num_merge_ops = 6.
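 *
 * (In code, this corresponds to RunThread() below: after ReadAheadIOStart()
 * completes, the thread blocks in WaitForMergeToComplete() and then calls
 * CommitMerge() with GetTotalRaBlocksMerged() to persist the count.)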
 *
 * Merge resume after crash:
 *
 * Let's say we have a crash after 5 operations are merged, i.e. after
 * Op-5: 16->19 is completed but before Op-6 is merged. Thus, the COW header
 * num_merge_ops will still be 0, as not all of the ops were merged yet.
 * During the next reboot, the read-ahead thread will re-construct the data
 * in-memory from the scratch space; when the merge resumes, Op-1 will be
 * re-executed. However, data will be served safely from the read-ahead cache
 * even though block 20 was over-written by Op-4.
 *
 */

ReadAheadThread::ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
                                 const std::string& misc_name,
                                 std::shared_ptr<Snapuserd> snapuserd) {
    cow_device_ = cow_device;
    backing_store_device_ = backing_device;
    misc_name_ = misc_name;
    snapuserd_ = snapuserd;
}

void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
    uint64_t source_block = cow_op->source;
    uint64_t source_offset = 0;
    if (cow_op->type == kCowXorOp) {
        source_block /= BLOCK_SZ;
        source_offset = cow_op->source % BLOCK_SZ;
    }
    if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(source_block) ||
        (source_offset > 0 && source_blocks_.count(source_block + 1))) {
        overlap_ = true;
    }

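    // Note: an op's *source* blocks are recorded in dest_blocks_ and its
    // *destination* (new_block) in source_blocks_. The check above thus flags
    // an overlap whenever a block appears both as the source of one op and
    // the destination of another within this read-ahead window.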
    dest_blocks_.insert(source_block);
    if (source_offset > 0) {
        dest_blocks_.insert(source_block + 1);
    }
    source_blocks_.insert(cow_op->new_block);
}

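// Example: for the six ops in the header comment (visited Op-1 .. Op-6), a
// single call collects one run of six blocks: *source_offset is set to the
// offset of source block 20 (the caller later rewinds it to block 15) and
// blocks = {23, 22, 21, 20, 19, 18}.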
void ReadAheadThread::PrepareReadAhead(uint64_t* source_offset, int* pending_ops,
                                       std::vector<uint64_t>& blocks) {
    int num_ops = *pending_ops;
    int nr_consecutive = 0;
    CHECK_NE(source_offset, nullptr);

    if (!RAIterDone() && num_ops) {
        // Get the first block with offset
        const CowOperation* cow_op = GetRAOpIter();
        CHECK_NE(cow_op, nullptr);
        *source_offset = cow_op->source;
        if (cow_op->type == kCowCopyOp) {
            *source_offset *= BLOCK_SZ;
        }
        RAIterNext();
        num_ops -= 1;
        nr_consecutive = 1;
        blocks.push_back(cow_op->new_block);

        if (!overlap_) {
            CheckOverlap(cow_op);
        }

        /*
         * Find the number of consecutive blocks, working backwards: ops are
         * visited in reverse order, so each subsequent op's source must sit
         * exactly one block below the previous one. *source_offset keeps the
         * offset of the first (highest) block in the run.
         */
        while (!RAIterDone() && num_ops) {
            const CowOperation* op = GetRAOpIter();
            CHECK_NE(op, nullptr);
            uint64_t next_offset = op->source;
            if (op->type == kCowCopyOp) {
                next_offset *= BLOCK_SZ;
            }
            if (next_offset + nr_consecutive * BLOCK_SZ != *source_offset) {
                break;
            }
            nr_consecutive += 1;
            num_ops -= 1;
            blocks.push_back(op->new_block);
            RAIterNext();

            if (!overlap_) {
                CheckOverlap(op);
            }
        }
    }
}

bool ReadAheadThread::ReconstructDataFromCow() {
    std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
    read_ahead_buffer_map.clear();
    loff_t metadata_offset = 0;
    loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
    int num_ops = 0;
    int total_blocks_merged = 0;

    // This memcpy is important as metadata_buffer_ will be an unaligned address and will fault
    // on 32-bit systems
    std::unique_ptr<uint8_t[]> metadata_buffer =
            std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
    memcpy(metadata_buffer.get(), metadata_buffer_, snapuserd_->GetBufferMetadataSize());

    while (true) {
        struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
                (char*)metadata_buffer.get() + metadata_offset);

        // Done reading metadata
        if (bm->new_block == 0 && bm->file_offset == 0) {
            break;
        }

        loff_t buffer_offset = bm->file_offset - start_data_offset;
        void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
        read_ahead_buffer_map[bm->new_block] = bufptr;
        num_ops += 1;
        total_blocks_merged += 1;

        metadata_offset += sizeof(struct ScratchMetadata);
    }

    // We are done re-constructing the mapping; however, we need to make sure
    // all the COW operations to-be-merged are present in the re-constructed
    // mapping.
    while (!RAIterDone()) {
        const CowOperation* op = GetRAOpIter();
        if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
            num_ops -= 1;
            snapuserd_->SetFinalBlockMerged(op->new_block);
            RAIterNext();
        } else {
            // Verify that we have covered all the ops which were re-constructed
            // from the COW device - these are the ops which are being
            // re-constructed after crash.
            if (num_ops != 0) {
                SNAP_LOG(ERROR) << "ReconstructDataFromCow failed. Not all ops recovered."
                                << " Pending ops: " << num_ops;
                snapuserd_->ReadAheadIOFailed();
                return false;
            }
            break;
        }
    }

    snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);

    snapuserd_->ReconstructDataFromCowFinish();

    if (!snapuserd_->ReadAheadIOCompleted(true)) {
        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
        snapuserd_->ReadAheadIOFailed();
        return false;
    }

    SNAP_LOG(INFO) << "ReconstructDataFromCow success";
    return true;
}

bool ReadAheadThread::ReadAheadIOStart() {
    // Check if the data has to be constructed from the COW file.
    // This will be true only once during boot up after a crash
    // during merge.
    if (snapuserd_->ReconstructDataFromCow()) {
        return ReconstructDataFromCow();
    }

    std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
    read_ahead_buffer_map.clear();

    int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
    loff_t metadata_offset = 0;

    struct ScratchMetadata* bm =
            reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);

    bm->new_block = 0;
    bm->file_offset = 0;

    std::vector<uint64_t> blocks;

    loff_t buffer_offset = 0;
    loff_t offset = 0;
    loff_t file_offset = snapuserd_->GetBufferDataOffset();
    int total_blocks_merged = 0;
    overlap_ = false;
    dest_blocks_.clear();
    source_blocks_.clear();

    while (true) {
        uint64_t source_offset;
        int linear_blocks;

        PrepareReadAhead(&source_offset, &num_ops, blocks);
        linear_blocks = blocks.size();
        if (linear_blocks == 0) {
            // No more blocks to read
            SNAP_LOG(DEBUG) << " Read-ahead completed....";
            break;
        }

        // PrepareReadAhead() walked the ops backwards, so source_offset
        // points at the highest block of the run; rewind it to the first
        // (lowest) block so the entire run can be read in one shot.
        source_offset = source_offset - (linear_blocks - 1) * BLOCK_SZ;
        size_t io_size = (linear_blocks * BLOCK_SZ);
        num_ops -= linear_blocks;
        total_blocks_merged += linear_blocks;

        // Mark this as the final block to be merged in this entire region;
        // the read-ahead thread is notified when this block is merged so
        // that it can make forward progress.
        snapuserd_->SetFinalBlockMerged(blocks.back());

        while (linear_blocks) {
            uint64_t new_block = blocks.back();
            blocks.pop_back();
            // Assign the mapping
            void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
            read_ahead_buffer_map[new_block] = bufptr;
            offset += BLOCK_SZ;

            bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ +
                                                           metadata_offset);
            bm->new_block = new_block;
            bm->file_offset = file_offset;

            metadata_offset += sizeof(struct ScratchMetadata);
            file_offset += BLOCK_SZ;

            linear_blocks -= 1;
        }

        // Read the consecutive set of blocks from the base device in one shot
        if (!android::base::ReadFullyAtOffset(backing_store_fd_,
                                              (char*)read_ahead_buffer_ + buffer_offset, io_size,
                                              source_offset)) {
            SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
                             << backing_store_device_ << " at block: " << source_offset / BLOCK_SZ
                             << " offset: " << source_offset % BLOCK_SZ
                             << " buffer_offset: " << buffer_offset << " io_size: " << io_size
                             << " buf-addr: " << read_ahead_buffer_;

            snapuserd_->ReadAheadIOFailed();
            return false;
        }

        // This is important - explicitly zero out the next metadata slot.
        // A zeroed entry marks the end of the metadata when re-constructing
        // the data after a crash.
        bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
        bm->new_block = 0;
        bm->file_offset = 0;

        buffer_offset += io_size;
    }

    snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);

    // Flush the data only if there are overlapping blocks in the region
    if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
        snapuserd_->ReadAheadIOFailed();
        return false;
    }

    return true;
}

bool ReadAheadThread::RunThread() {
    if (!InitializeFds()) {
        return false;
    }

    InitializeRAIter();
    InitializeBuffer();

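    // Each iteration reads ahead one window of ordered ops, waits for the
    // worker threads to merge every block in that window, and then persists
    // the merged-op count to the COW header.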
    while (!RAIterDone()) {
        if (!ReadAheadIOStart()) {
            return false;
        }

        bool status = snapuserd_->WaitForMergeToComplete();

        if (status && !snapuserd_->CommitMerge(snapuserd_->GetTotalRaBlocksMerged())) {
            return false;
        }

        if (!status) break;
    }

    CloseFds();
    SNAP_LOG(INFO) << " ReadAhead thread terminating....";
    return true;
}

// Initialization
bool ReadAheadThread::InitializeFds() {
    backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
    if (backing_store_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
        return false;
    }

    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
    if (cow_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
        return false;
    }

    return true;
}

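// The read-ahead ops vector is traversed in reverse (rbegin/rend);
// PrepareReadAhead() depends on this ordering to detect runs of consecutive
// source blocks.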
void ReadAheadThread::InitializeRAIter() {
    std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
    read_ahead_iter_ = read_ahead_ops.rbegin();
}

bool ReadAheadThread::RAIterDone() {
    std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
    return read_ahead_iter_ == read_ahead_ops.rend();
}

void ReadAheadThread::RAIterNext() {
    read_ahead_iter_++;
}

const CowOperation* ReadAheadThread::GetRAOpIter() {
    return *read_ahead_iter_;
}

void ReadAheadThread::InitializeBuffer() {
    void* mapped_addr = snapuserd_->GetMappedAddr();
    // Map the scratch space region into memory
    metadata_buffer_ =
            static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
    read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
}

}  // namespace snapshot
}  // namespace android