/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "snapuserd.h"

#include <csignal>
#include <optional>
#include <set>

#include <snapuserd/snapuserd_client.h>

namespace android {
namespace snapshot {

using namespace android;
using namespace android::dm;
using android::base::unique_fd;

#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "
/*
 * Merging a copy operation involves the following flow:
 *
 * 1: dm-snapshot layer requests merge for a 4k block. dm-user sends the request
 *    to the daemon
 * 2: daemon reads the source block
 * 3: daemon copies the source data
 * 4: IO completion sent back to dm-user (a switch from user space to kernel)
 * 5: dm-snapshot merges the data to base device
 * 6: dm-snapshot sends the merge-completion IO to dm-user
 * 7: dm-user re-directs the merge completion IO to daemon (one more switch)
 * 8: daemon updates the COW file about the completed merge request (a write
 *    syscall) followed by an fsync
 * 9: Send the IO completion back to dm-user
 *
 * The above sequence is a significant overhead, especially when merging one 4k
 * block at a time.
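 *
 * For instance, merging 1 GiB worth of copy operations one 4k block at a time
 * repeats this sequence ~262,144 times, with each round trip paying at least
 * two user/kernel switches plus a write and an fsync of the COW file.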
 *
 * The read-ahead layer optimizes the above path by reading the data from the
 * base device in the background so that the merging thread can retrieve the
 * data from the read-ahead cache. Additionally, syncing of merged data is
 * deferred to the read-ahead thread, thereby keeping the IO path from being
 * bottlenecked.
 *
 * We create a scratch space of 2MB to store the read-ahead data in the COW
 * device.
 *
 *      +-----------------------+
 *      |     Header (fixed)    |
 *      +-----------------------+
 *      |    Scratch space      |  <-- 2MB
 *      +-----------------------+
 *
 *      Scratch space is as follows:
 *
 *      +-----------------------+
 *      |       Metadata        | <- 4k page
 *      +-----------------------+
 *      |       Metadata        | <- 4k page
 *      +-----------------------+
 *      |                       |
 *      |    Read-ahead data    |
 *      |                       |
 *      +-----------------------+
 *
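 *      Each metadata entry pairs a destination block number with the location
 *      of its cached data. As a sketch, the layout corresponds to the
 *      ScratchMetadata type used throughout this file (defined in snapuserd.h;
 *      the field types shown here are an assumption based on how the fields
 *      are used below):
 *
 *      struct ScratchMetadata {
 *          // Destination block for the cached data
 *          uint64_t new_block;
 *          // Offset in the COW file where the cached data resides
 *          uint64_t file_offset;
 *      };
 *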
 * State transitions and communication between read-ahead thread and worker
 * thread during merge:
 * =====================================================================
 *
 *   Worker Threads                                 Read-Ahead thread
 *   ------------------------------------------------------------------
 *
 *      |
 *      |
 *  --> -----------------READ_AHEAD_BEGIN------------->|
 *  |   |                                              | READ_AHEAD_IN_PROGRESS
 *  |  WAIT                                            |
 *  |   |                                              |
 *  |   |<-----------------IO_IN_PROGRESS---------------
 *  |   |                                              |
 *  |   | IO_IN_PROGRESS                              WAIT
 *  |   |                                              |
 *  |<--|                                              |
 *      |                                              |
 *      ------------------IO_TERMINATED--------------->|
 *                                                     END
 *
 *
 * ===================================================================
 *
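 * A minimal sketch of this handshake with a mutex and condition variable
 * (hypothetical names; the real synchronization lives behind Snapuserd
 * methods such as WaitForMergeToComplete and ReadAheadIOCompleted):
 *
 *      enum class RAState { READ_AHEAD_BEGIN, READ_AHEAD_IN_PROGRESS,
 *                           IO_IN_PROGRESS, IO_TERMINATED };
 *
 *      std::mutex m;
 *      std::condition_variable cv;
 *      RAState state = RAState::READ_AHEAD_BEGIN;
 *
 *      // Worker thread: block until the read-ahead cache is populated.
 *      void WaitForReadAhead() {
 *          std::unique_lock<std::mutex> lock(m);
 *          cv.wait(lock, [] { return state == RAState::IO_IN_PROGRESS; });
 *      }
 *
 *      // Read-ahead thread: publish the cache and wake all workers.
 *      void PublishReadAhead() {
 *          {
 *              std::lock_guard<std::mutex> lock(m);
 *              state = RAState::IO_IN_PROGRESS;
 *          }
 *          cv.notify_all();
 *      }
 *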
 * Example:
 *
 * We have 6 copy operations to be executed in the OTA and there is an
 * overlap. Update-engine will write to the COW file as follows:
 *
 * Op-1: 20 -> 23
 * Op-2: 19 -> 22
 * Op-3: 18 -> 21
 * Op-4: 17 -> 20
 * Op-5: 16 -> 19
 * Op-6: 15 -> 18
 *
 * Read-ahead thread will read all the 6 source blocks and store the data in the
 * scratch space. Metadata will contain the destination block numbers. Thus,
 * scratch space will look something like this:
 *
 * +--------------+
 * | Block   23   |
 * | offset - 1   |
 * +--------------+
 * | Block   22   |
 * | offset - 2   |
 * +--------------+
 * | Block   21   |
 * | offset - 3   |
 * +--------------+
 *    ...
 *    ...
 * +--------------+
 * | Data-Block 20| <-- offset - 1
 * +--------------+
 * | Data-Block 19| <-- offset - 2
 * +--------------+
 * | Data-Block 18| <-- offset - 3
 * +--------------+
 *     ...
 *     ...
 *
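 * Note the overlap in this example: Op-4's destination (block 20) is also
 * Op-1's source. CheckOverlap() below flags this, and the scratch space is
 * flushed to disk before the merge proceeds, so block 20's pre-merge contents
 * survive even after Op-4 overwrites it on the base device.
 *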
 * ====================================================================
 * IO Path:
 *
 * Read-ahead will serve the data to worker threads during merge only
 * after metadata and data are persisted to the scratch space. Worker
 * threads during merge will always retrieve the data from cache; if the
 * cache is not populated, they will wait for the read-ahead thread to finish.
 * Furthermore, the number of operations merged will be synced to the header
 * only when all the blocks in the read-ahead cache are merged. In the above
 * case, when all 6 operations are merged, the COW Header is updated with
 * num_merge_ops = 6.
 *
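 * This is visible in RunThread() below: CommitMerge() is called with the
 * batch's total only after WaitForMergeToComplete() returns, never per block,
 * so num_merge_ops advances only in units of whole read-ahead batches.
 *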
 * Merge resume after crash:
 *
 * Let's say we have a crash after 5 operations are merged, i.e. after
 * Op-5: 16->19 is completed but before Op-6 is merged. Thus, the COW Header
 * num_merge_ops will be 0 as not all the ops were merged yet. During the next
 * reboot, the read-ahead thread will re-construct the data in-memory from the
 * scratch space; when merge resumes, Op-1 will be re-executed. However,
 * data will be served from the read-ahead cache safely even though block 20
 * was over-written by Op-4.
 *
 */

ReadAheadThread::ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
                                 const std::string& misc_name,
                                 std::shared_ptr<Snapuserd> snapuserd) {
    cow_device_ = cow_device;
    backing_store_device_ = backing_device;
    misc_name_ = misc_name;
    snapuserd_ = snapuserd;
}

void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
    uint64_t source_block = cow_op->source;
    uint64_t source_offset = 0;
    if (cow_op->type == kCowXorOp) {
        // For XOR ops, source is a byte offset; split it into block + offset.
        source_block /= BLOCK_SZ;
        source_offset = cow_op->source % BLOCK_SZ;
    }
    // Cross-check against previously visited ops: overlap if this op writes a
    // block that an earlier op reads, or reads a block that an earlier op
    // writes. (dest_blocks_ accumulates source blocks; source_blocks_
    // accumulates destination blocks.)
    if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(source_block) ||
        (source_offset > 0 && source_blocks_.count(source_block + 1))) {
        overlap_ = true;
    }

    dest_blocks_.insert(source_block);
    if (source_offset > 0) {
        dest_blocks_.insert(source_block + 1);
    }
    source_blocks_.insert(cow_op->new_block);
}
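
// Worked trace of CheckOverlap(), assuming the header's Op-1..Op-6 example is
// visited in that order (all copy ops, so source is already a block number):
//
//   Op-1: 20 -> 23   no hit; dest_blocks_ = {20},       source_blocks_ = {23}
//   Op-2: 19 -> 22   no hit; dest_blocks_ = {20,19},    source_blocks_ = {23,22}
//   Op-3: 18 -> 21   no hit; dest_blocks_ = {20,19,18}, source_blocks_ = {23,22,21}
//   Op-4: 17 -> 20   new_block 20 is in dest_blocks_ -> overlap_ = true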

void ReadAheadThread::PrepareReadAhead(uint64_t* source_offset, int* pending_ops,
                                       std::vector<uint64_t>& blocks) {
    int num_ops = *pending_ops;
    int nr_consecutive = 0;
    CHECK_NE(source_offset, nullptr);

    if (!RAIterDone() && num_ops) {
        // Get the first block with offset
        const CowOperation* cow_op = GetRAOpIter();
        CHECK_NE(cow_op, nullptr);
        *source_offset = cow_op->source;
        if (cow_op->type == kCowCopyOp) {
            // For copy ops, source is a block number; convert to a byte offset.
            *source_offset *= BLOCK_SZ;
        }
        RAIterNext();
        num_ops -= 1;
        nr_consecutive = 1;
        blocks.push_back(cow_op->new_block);

        if (!overlap_) {
            CheckOverlap(cow_op);
        }

        /*
         * Find the number of consecutive blocks working backwards.
         */
        while (!RAIterDone() && num_ops) {
            const CowOperation* op = GetRAOpIter();
            CHECK_NE(op, nullptr);
            uint64_t next_offset = op->source;
            if (op->type == kCowCopyOp) {
                next_offset *= BLOCK_SZ;
            }
            if (next_offset + nr_consecutive * BLOCK_SZ != *source_offset) {
                break;
            }
            nr_consecutive += 1;
            num_ops -= 1;
            blocks.push_back(op->new_block);
            RAIterNext();

            if (!overlap_) {
                CheckOverlap(op);
            }
        }
    }
}
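
// Worked example, using the header's Op-1..Op-6 (copy ops with sources
// 20,19,18,17,16,15): the first iteration sets *source_offset = 20 * BLOCK_SZ;
// each following op satisfies next_offset + nr_consecutive * BLOCK_SZ ==
// *source_offset, so all six ops collapse into a single batch with
// nr_consecutive = 6 and blocks = {23,22,21,20,19,18}. ReadAheadIOStart()
// then rewinds *source_offset by (6 - 1) blocks to block 15 and issues one
// 24k read covering the whole run.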

bool ReadAheadThread::ReconstructDataFromCow() {
    std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
    read_ahead_buffer_map.clear();
    loff_t metadata_offset = 0;
    loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
    int num_ops = 0;
    int total_blocks_merged = 0;

    // This memcpy is important as metadata_buffer_ may be an unaligned address and will fault
    // on 32-bit systems
    std::unique_ptr<uint8_t[]> metadata_buffer =
            std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
    memcpy(metadata_buffer.get(), metadata_buffer_, snapuserd_->GetBufferMetadataSize());

    while (true) {
        struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
                (char*)metadata_buffer.get() + metadata_offset);

        // A zeroed entry terminates the metadata region (see ReadAheadIOStart)
        if (bm->new_block == 0 && bm->file_offset == 0) {
            break;
        }

        loff_t buffer_offset = bm->file_offset - start_data_offset;
        void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
        read_ahead_buffer_map[bm->new_block] = bufptr;
        num_ops += 1;
        total_blocks_merged += 1;

        metadata_offset += sizeof(struct ScratchMetadata);
    }

    // We are done re-constructing the mapping; however, we need to make sure
    // all the COW operations to-be-merged are present in the re-constructed
    // mapping.
    while (!RAIterDone()) {
        const CowOperation* op = GetRAOpIter();
        if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
            num_ops -= 1;
            snapuserd_->SetFinalBlockMerged(op->new_block);
            RAIterNext();
        } else {
            // Verify that we have covered all the ops which were re-constructed
            // from the COW device - these are the ops which are being
            // re-constructed after crash.
            if (num_ops != 0) {
                SNAP_LOG(ERROR) << "ReconstructDataFromCow failed. Not all ops recovered."
                                << " Pending ops: " << num_ops;
                snapuserd_->ReadAheadIOFailed();
                return false;
            }
            break;
        }
    }

    snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);

    snapuserd_->ReconstructDataFromCowFinish();

    if (!snapuserd_->ReadAheadIOCompleted(true)) {
        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed";
        snapuserd_->ReadAheadIOFailed();
        return false;
    }

    SNAP_LOG(INFO) << "ReconstructDataFromCow success";
    return true;
}
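
// In the header's crash example, the scratch metadata still holds all six
// entries, so the rebuilt map is {23,22,21,20,19,18} -> cached data. The
// merge then replays from Op-1 and reads block 20's pre-merge contents from
// this cache, even though the copy of block 20 on the base device was
// already overwritten by Op-4.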

bool ReadAheadThread::ReadAheadIOStart() {
    // Check if the data has to be re-constructed from the COW file.
    // This will be true only once during boot up after a crash
    // during merge.
    if (snapuserd_->ReconstructDataFromCow()) {
        return ReconstructDataFromCow();
    }

    std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
    read_ahead_buffer_map.clear();

    int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
    loff_t metadata_offset = 0;

    struct ScratchMetadata* bm =
            reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);

    bm->new_block = 0;
    bm->file_offset = 0;

    std::vector<uint64_t> blocks;

    loff_t buffer_offset = 0;
    loff_t offset = 0;
    loff_t file_offset = snapuserd_->GetBufferDataOffset();
    int total_blocks_merged = 0;
    overlap_ = false;
    dest_blocks_.clear();
    source_blocks_.clear();

    while (true) {
        uint64_t source_offset;
        int linear_blocks;

        PrepareReadAhead(&source_offset, &num_ops, blocks);
        linear_blocks = blocks.size();
        if (linear_blocks == 0) {
            // No more blocks to read
            SNAP_LOG(DEBUG) << "Read-ahead completed";
            break;
        }

        // Get the first block in the consecutive set of blocks
        source_offset = source_offset - (linear_blocks - 1) * BLOCK_SZ;
        size_t io_size = (linear_blocks * BLOCK_SZ);
        num_ops -= linear_blocks;
        total_blocks_merged += linear_blocks;

        // Mark the block number which will be the final block to be merged in
        // this entire region. The read-ahead thread will get notified when
        // this block is merged so that it can make forward progress.
        snapuserd_->SetFinalBlockMerged(blocks.back());

        while (linear_blocks) {
            uint64_t new_block = blocks.back();
            blocks.pop_back();
            // Assign the mapping
            void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
            read_ahead_buffer_map[new_block] = bufptr;
            offset += BLOCK_SZ;

            bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ +
                                                           metadata_offset);
            bm->new_block = new_block;
            bm->file_offset = file_offset;

            metadata_offset += sizeof(struct ScratchMetadata);
            file_offset += BLOCK_SZ;

            linear_blocks -= 1;
        }

        // Read the consecutive set of blocks from the base device in one shot
        if (!android::base::ReadFullyAtOffset(backing_store_fd_,
                                              (char*)read_ahead_buffer_ + buffer_offset, io_size,
                                              source_offset)) {
            SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
                             << backing_store_device_ << " at block: " << source_offset / BLOCK_SZ
                             << " offset: " << source_offset % BLOCK_SZ
                             << " buffer_offset: " << buffer_offset << " io_size: " << io_size
                             << " buf-addr: " << read_ahead_buffer_;

            snapuserd_->ReadAheadIOFailed();
            return false;
        }

        // This is important - explicitly set the contents to zero. This is
        // used when re-constructing the data after a crash; it marks the end
        // of the metadata contents during re-construction.
        bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
        bm->new_block = 0;
        bm->file_offset = 0;

        buffer_offset += io_size;
    }

    snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);

    // Flush the data only if we have overlapping blocks in the region
    if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed";
        snapuserd_->ReadAheadIOFailed();
        return false;
    }

    return true;
}

bool ReadAheadThread::RunThread() {
    if (!InitializeFds()) {
        return false;
    }

    InitializeRAIter();
    InitializeBuffer();

    while (!RAIterDone()) {
        if (!ReadAheadIOStart()) {
            return false;
        }

        bool status = snapuserd_->WaitForMergeToComplete();

        if (status && !snapuserd_->CommitMerge(snapuserd_->GetTotalRaBlocksMerged())) {
            return false;
        }

        if (!status) break;
    }

    CloseFds();
    SNAP_LOG(INFO) << "ReadAhead thread terminating";
    return true;
}
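
// A minimal launch sketch (hypothetical; the real call site lives elsewhere
// in snapuserd and may differ):
//
//     auto ra_thread = std::make_unique<ReadAheadThread>(cow_device, backing_device,
//                                                        misc_name, snapuserd);
//     std::future<bool> ra_future =
//             std::async(std::launch::async, &ReadAheadThread::RunThread, ra_thread.get());
//     ...
//     if (!ra_future.get()) {
//         LOG(ERROR) << "Read-ahead thread failed";
//     }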

// Initialization
bool ReadAheadThread::InitializeFds() {
    backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
    if (backing_store_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
        return false;
    }

    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
    if (cow_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
        return false;
    }

    return true;
}

void ReadAheadThread::InitializeRAIter() {
    std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
    read_ahead_iter_ = read_ahead_ops.rbegin();
}

bool ReadAheadThread::RAIterDone() {
    std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
    return read_ahead_iter_ == read_ahead_ops.rend();
}

void ReadAheadThread::RAIterNext() {
    read_ahead_iter_++;
}

const CowOperation* ReadAheadThread::GetRAOpIter() {
    return *read_ahead_iter_;
}

void ReadAheadThread::InitializeBuffer() {
    void* mapped_addr = snapuserd_->GetMappedAddr();
    // Map the scratch space region into memory
    metadata_buffer_ =
            static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
    read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
}

}  // namespace snapshot
}  // namespace android