/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "snapuserd.h"

#include <dirent.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <unistd.h>
#include <algorithm>

#include <csignal>
#include <optional>
#include <set>

#include <android-base/file.h>
#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/properties.h>
#include <android-base/strings.h>
#include <android-base/unique_fd.h>
#include <snapuserd/snapuserd_client.h>

namespace android {
namespace snapshot {

using namespace android;
using namespace android::dm;
using android::base::unique_fd;

#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "

Snapuserd::Snapuserd(const std::string& misc_name, const std::string& cow_device,
                     const std::string& backing_device) {
    misc_name_ = misc_name;
    cow_device_ = cow_device;
    backing_store_device_ = backing_device;
    control_device_ = "/dev/dm-user/" + misc_name;
}

bool Snapuserd::InitializeWorkers() {
    for (int i = 0; i < NUM_THREADS_PER_PARTITION; i++) {
        std::unique_ptr<WorkerThread> wt = std::make_unique<WorkerThread>(
                cow_device_, backing_store_device_, control_device_, misc_name_, GetSharedPtr());

        worker_threads_.push_back(std::move(wt));
    }

    read_ahead_thread_ = std::make_unique<ReadAheadThread>(cow_device_, backing_store_device_,
                                                           misc_name_, GetSharedPtr());
    return true;
}

std::unique_ptr<CowReader> Snapuserd::CloneReaderForWorker() {
    return reader_->CloneCowReader();
}

bool Snapuserd::CommitMerge(int num_merge_ops) {
    struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
    ch->num_merge_ops += num_merge_ops;

    if (read_ahead_feature_ && read_ahead_ops_.size() > 0) {
        struct BufferState* ra_state = GetBufferState();
        ra_state->read_ahead_state = kCowReadAheadInProgress;
    }

    int ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
    if (ret < 0) {
        SNAP_PLOG(ERROR) << "msync header failed: " << ret;
        return false;
    }

    merge_initiated_ = true;

    return true;
}

void Snapuserd::PrepareReadAhead() {
    if (!read_ahead_feature_) {
        return;
    }

    struct BufferState* ra_state = GetBufferState();
    // Check if the data has to be re-constructed from COW device
    if (ra_state->read_ahead_state == kCowReadAheadDone) {
        populate_data_from_cow_ = true;
    } else {
        populate_data_from_cow_ = false;
    }

    StartReadAhead();
}

bool Snapuserd::GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer) {
    if (!lock->owns_lock()) {
        SNAP_LOG(ERROR) << "GetRABuffer - Lock not held";
        return false;
    }
    std::unordered_map<uint64_t, void*>::iterator it = read_ahead_buffer_map_.find(block);

    // This will be true only for IO's generated as part of reading a root
    // filesystem. IO's related to merge should always be in read-ahead cache.
    if (it == read_ahead_buffer_map_.end()) {
        return false;
    }

    // Theoretically, we can send the data back from the read-ahead buffer
    // all the way to the kernel without memcpy. However, if the IO is
    // un-aligned, the wrapper function will need to touch the read-ahead
    // buffers and the transitions will be a bit more complicated.
    memcpy(buffer, it->second, BLOCK_SZ);
    return true;
}

// ========== State transition functions for read-ahead operations ===========
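//
// Summary of the READ_AHEAD_IO_TRANSITION states as driven by the functions
// in this section (a rough sketch derived from the code below, not an
// authoritative spec):
//
//   READ_AHEAD_BEGIN        - set by StartReadAhead(): worker threads finished
//                             a sequence of merges; read-ahead may proceed.
//   READ_AHEAD_IN_PROGRESS  - set by WaitForMergeToComplete(): read-ahead
//                             thread is caching the next set of source blocks.
//   IO_IN_PROGRESS          - set by ReadAheadIOCompleted(): cached data is
//                             ready; worker threads can serve and merge IO.
//   READ_AHEAD_FAILURE      - set by ReadAheadIOFailed(): read-ahead IO error.
//   IO_TERMINATED           - set by MergeCompleted(): all worker threads are
//                             done; the read-ahead thread should exit.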

bool Snapuserd::GetReadAheadPopulatedBuffer(uint64_t block, void* buffer) {
    if (!read_ahead_feature_) {
        return false;
    }

    {
        std::unique_lock<std::mutex> lock(lock_);
        if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
            return false;
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS) {
            return GetRABuffer(&lock, block, buffer);
        }
    }

    {
        // Read-ahead thread IO is in-progress. Wait for it to complete
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS)) {
            cv.wait(lock);
        }

        return GetRABuffer(&lock, block, buffer);
    }
}

// This is invoked by read-ahead thread waiting for merge IO's
// to complete
bool Snapuserd::WaitForMergeToComplete() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED) {
            return false;
        }

        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_IN_PROGRESS;
        return true;
    }
}

// This is invoked during the launch of worker threads. We wait
// for the read-ahead thread to be fully up before worker threads
// are launched; else we will have a race between worker threads
// and the read-ahead thread, specifically during re-construction.
bool Snapuserd::WaitForReadAheadToStart() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
            return false;
        }

        return true;
    }
}

// Invoked by worker threads when a sequence of merge operations
// is complete, notifying the read-ahead thread to make forward
// progress.
void Snapuserd::StartReadAhead() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN;
    }

    cv.notify_one();
}

void Snapuserd::MergeCompleted() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_TERMINATED;
    }

    cv.notify_one();
}

bool Snapuserd::ReadAheadIOCompleted(bool sync) {
    if (sync) {
        // Flush the entire buffer region
        int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
        if (ret < 0) {
            PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
            return false;
        }

        // Metadata and data are synced. Now, update the state.
        // We need to update the state after flushing data; if there is a crash
        // when read-ahead IO is in progress, the state of data in the COW file
        // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
        // in the scratch space is good and during next reboot, read-ahead thread
        // can safely re-construct the data.
        struct BufferState* ra_state = GetBufferState();
        ra_state->read_ahead_state = kCowReadAheadDone;

        ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
        if (ret < 0) {
            PLOG(ERROR) << "msync failed to flush Readahead completion state...";
            return false;
        }
    }

    // Notify the worker threads
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS;
    }

    cv.notify_all();
    return true;
}

void Snapuserd::ReadAheadIOFailed() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE;
    }

    cv.notify_all();
}

//========== End of state transition functions ====================

bool Snapuserd::IsChunkIdMetadata(chunk_t chunk) {
    uint32_t stride = exceptions_per_area_ + 1;
    lldiv_t divresult = lldiv(chunk, stride);

    return (divresult.rem == NUM_SNAPSHOT_HDR_CHUNKS);
}

// Find the next free chunk-id to be assigned. Check if the next free
// chunk-id represents a metadata page. If so, skip it.
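//
// Worked example (assuming exceptions_per_area_ == 256, i.e. stride == 257,
// and NUM_SNAPSHOT_HDR_CHUNKS == 1): chunk-ids 1, 258, 515, ... satisfy
// (chunk % 257 == 1) and therefore map to metadata pages, so the allocator
// skips over them, e.g. 257 -> 259 and 514 -> 516.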
chunk_t Snapuserd::GetNextAllocatableChunkId(chunk_t chunk) {
    chunk_t next_chunk = chunk + 1;

    if (IsChunkIdMetadata(next_chunk)) {
        next_chunk += 1;
    }
    return next_chunk;
}

void Snapuserd::CheckMergeCompletionStatus() {
    if (!merge_initiated_) {
        SNAP_LOG(INFO) << "Merge was not initiated. Total-data-ops: "
                       << reader_->get_num_total_data_ops();
        return;
    }

    struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);

    SNAP_LOG(INFO) << "Merge-status: Total-Merged-ops: " << ch->num_merge_ops
                   << " Total-data-ops: " << reader_->get_num_total_data_ops();
}

/*
 * Read the metadata from the COW device and
 * construct the metadata as required by the kernel.
 *
 * Please see the design of the kernel COW format.
 *
 * 1: Read the metadata from the internal COW device
 * 2: There are 3 COW operations:
 *     a: Replace op
 *     b: Copy op
 *     c: Zero op
 * 3: For each of the 3 operations, op->new_block
 *    represents the block number in the base device
 *    for which one of the 3 operations has to be applied.
 *    This represents the old_chunk in the kernel COW format
 * 4: We need to assign a new_chunk for a corresponding old_chunk
 * 5: The algorithm is similar to how the kernel assigns chunk numbers
 *    while creating exceptions. However, there are a few cases
 *    which need to be addressed here:
 *      a: During the merge process, the kernel scans the metadata pages
 *      backwards when merge is initiated. Since we need
 *      to make sure that the merge ordering follows our COW format,
 *      we read the COW operations backwards and populate the
 *      metadata so that when the kernel starts merging from the back,
 *      those ops correspond to the beginning of our COW format.
 *      b: Kernel can merge successive operations if the two chunk IDs
 *      are contiguous. This can be problematic when there is a crash
 *      during merge; specifically when the merge operations have dependencies.
 *      These dependencies can only happen during copy operations.
 *
 *      To avoid this problem, we make sure overlapping copy operations
 *      are not batch merged.
 * 6: Use a monotonically increasing chunk number to assign the
 *    new_chunk
 * 7: Each chunk-id represents either
 *        a: Metadata page or
 *        b: Data page
 * 8: Chunk-id representing a data page is stored in a map.
 * 9: Chunk-id representing a metadata page is converted into a vector
 *    index. We store this in a vector as the kernel requests metadata during
 *    two stages:
 *       a: When the initial dm-snapshot device is created, the kernel requests
 *          all the metadata and stores it in its internal data-structures.
 *       b: During merge, the kernel requests the same metadata once again.
 *    In both these cases, a quick lookup based on chunk-id is done.
 * 10: When the chunk number is incremented, we need to check whether the
 *    chunk represents a metadata page and, if so, skip it.
 * 11: Each 4k page will contain 256 disk exceptions. We call this
 *    exceptions_per_area_
 * 12: Kernel will stop issuing metadata IO requests when the new-chunk ID is 0.
 */
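/*
 * Illustrative chunk-id layout implied by the steps above (with 256
 * exceptions per 4k metadata area and chunks 0/1 reserved for the snapshot
 * header and the first metadata page; in practice additional chunk-ids may
 * be skipped to separate read-ahead batches):
 *
 *   chunk 0          : snapshot header
 *   chunk 1          : metadata area 0 (vec_[0], 256 disk_exceptions)
 *   chunks 2..257    : data chunks described by metadata area 0
 *   chunk 258        : metadata area 1 (vec_[1])
 *   chunks 259..514  : data chunks described by metadata area 1
 *   ...
 */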
bool Snapuserd::ReadMetadata() {
    reader_ = std::make_unique<CowReader>();
    CowHeader header;
    CowOptions options;
    bool metadata_found = false;
    int replace_ops = 0, zero_ops = 0, copy_ops = 0, xor_ops = 0;

    SNAP_LOG(DEBUG) << "ReadMetadata: Parsing cow file";

    if (!reader_->Parse(cow_fd_)) {
        SNAP_LOG(ERROR) << "Failed to parse";
        return false;
    }

    if (!reader_->GetHeader(&header)) {
        SNAP_LOG(ERROR) << "Failed to get header";
        return false;
    }

    if (!(header.block_size == BLOCK_SZ)) {
        SNAP_LOG(ERROR) << "Invalid header block size found: " << header.block_size;
        return false;
    }

    SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;

    if (!MmapMetadata()) {
        SNAP_LOG(ERROR) << "mmap failed";
        return false;
    }

    // Initialize the iterator for reading metadata
    std::unique_ptr<ICowOpIter> cowop_rm_iter = reader_->GetRevMergeOpIter();

    exceptions_per_area_ = (CHUNK_SIZE << SECTOR_SHIFT) / sizeof(struct disk_exception);

    // Start from chunk number 2. Chunk 0 represents header and chunk 1
    // represents first metadata page.
    chunk_t data_chunk_id = NUM_SNAPSHOT_HDR_CHUNKS + 1;
    size_t num_ops = 0;

    loff_t offset = 0;
    std::unique_ptr<uint8_t[]> de_ptr =
            std::make_unique<uint8_t[]>(exceptions_per_area_ * sizeof(struct disk_exception));

    // This memset is important. Kernel will stop issuing IO when new-chunk ID
    // is 0. When Area is not filled completely with all 256 exceptions,
    // this memset will ensure that metadata read is completed.
    memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

    while (!cowop_rm_iter->Done()) {
        const CowOperation* cow_op = &cowop_rm_iter->Get();
        struct disk_exception* de =
                reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);

        metadata_found = true;
        // This loop will handle all the replace and zero ops.
        // We will handle the copy ops later as they require special
        // handling of assigning chunk-ids. Furthermore, we make
        // sure that replace/zero and copy ops are not batch merged; hence,
        // the bump in the chunk_id before breaking out of this loop
        if (IsOrderedOp(*cow_op)) {
            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            break;
        }

        if (cow_op->type == kCowReplaceOp) {
            replace_ops++;
        } else if (cow_op->type == kCowZeroOp) {
            zero_ops++;
        }

        // Construct the disk-exception
        de->old_chunk = cow_op->new_block;
        de->new_chunk = data_chunk_id;

        // Store operation pointer.
        chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), cow_op));
        num_ops += 1;
        offset += sizeof(struct disk_exception);
        cowop_rm_iter->Next();

        SNAP_LOG(DEBUG) << num_ops << ":"
                        << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;

        if (num_ops == exceptions_per_area_) {
            // Store it in vector at the right index. This maps the chunk-id to
            // vector index.
            vec_.push_back(std::move(de_ptr));
            offset = 0;
            num_ops = 0;

            // Create buffer for next area
            de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
                                                 sizeof(struct disk_exception));
            memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

            if (cowop_rm_iter->Done()) {
                vec_.push_back(std::move(de_ptr));
            }
        }

        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
    }

    int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
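    // With the default 2 MB scratch buffer this works out to
    // 2088960 / 4096 = 510 ordered ops per read-ahead round (figures assume
    // BLOCK_SZ == 4096; see GetBufferDataSize() and the comments around it).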
    std::optional<chunk_t> prev_id = {};
    std::vector<const CowOperation*> vec;
    std::set<uint64_t> dest_blocks;
    std::set<uint64_t> source_blocks;
    size_t pending_ordered_ops = exceptions_per_area_ - num_ops;
    uint64_t total_ordered_ops = reader_->get_num_ordered_ops_to_merge();

    SNAP_LOG(DEBUG) << " Processing copy-ops at Area: " << vec_.size()
                    << " Number of replace/zero ops completed in this area: " << num_ops
                    << " Pending copy ops for this area: " << pending_ordered_ops;

    while (!cowop_rm_iter->Done()) {
        do {
            const CowOperation* cow_op = &cowop_rm_iter->Get();

            // We have two specific cases:
            //
            // =====================================================
            // Case 1: Overlapping copy regions
            //
            // Ex:
            //
            // Source -> Destination
            //
            // 1: 15 -> 18
            // 2: 16 -> 19
            // 3: 17 -> 20
            // 4: 18 -> 21
            // 5: 19 -> 22
            // 6: 20 -> 23
            //
            // We have 6 copy operations to be executed in OTA and there is an overlap. Update-engine
            // will write to COW file as follows:
            //
            // Op-1: 20 -> 23
            // Op-2: 19 -> 22
            // Op-3: 18 -> 21
            // Op-4: 17 -> 20
            // Op-5: 16 -> 19
            // Op-6: 15 -> 18
            //
            // Note that the block numbers are contiguous. Hence, all 6 copy
            // operations can be batch merged. However, that will be
            // problematic if we have a crash as block 20, 19, 18 would have
            // been overwritten and hence subsequent recovery may end up with
            // a silent data corruption when op-1, op-2 and op-3 are
            // re-executed.
            //
            // To address the above problem, read-ahead thread will
            // read all the 6 source blocks, cache them in the scratch
            // space of the COW file. During merge, read-ahead
            // thread will serve the blocks from the read-ahead cache.
            // If there is a crash during merge, on subsequent reboot,
            // read-ahead thread will recover the data from the
            // scratch space and re-construct it so that there
            // is no loss of data.
            //
            // Note that we will follow the same order of COW operations
            // as present in the COW file. This will make sure that
            // the merge of operations is done based on the ops present
            // in the file.
            //===========================================================
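            //
            // Illustration of the dependency check below (a sketch; the exact
            // batches depend on the traversal order and pending_ordered_ops):
            // the six overlapping copy ops above end up split into two batches
            // of three, because blocks 18, 19 and 20 appear both as sources
            // and as destinations. As soon as an op's destination matches a
            // source already accepted into the current batch (or its source
            // matches an accepted destination), we break out and flush the
            // batch, so dependent ops are never batch-merged together.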
            uint64_t block_source = cow_op->source;
            uint64_t block_offset = 0;
            if (cow_op->type == kCowXorOp) {
                block_source /= BLOCK_SZ;
                block_offset = cow_op->source % BLOCK_SZ;
            }
            if (prev_id.has_value()) {
                if (dest_blocks.count(cow_op->new_block) || source_blocks.count(block_source) ||
                    (block_offset > 0 && source_blocks.count(block_source + 1))) {
                    break;
                }
            }
            metadata_found = true;
            pending_ordered_ops -= 1;
            vec.push_back(cow_op);
            dest_blocks.insert(block_source);
            if (block_offset > 0) {
                dest_blocks.insert(block_source + 1);
            }
            source_blocks.insert(cow_op->new_block);
            prev_id = cow_op->new_block;
            cowop_rm_iter->Next();
        } while (!cowop_rm_iter->Done() && pending_ordered_ops);

        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
        SNAP_LOG(DEBUG) << "Batch Merge copy-ops/xor-ops of size: " << vec.size()
                        << " Area: " << vec_.size() << " Area offset: " << offset
                        << " Pending-ordered-ops in this area: " << pending_ordered_ops;

        for (size_t i = 0; i < vec.size(); i++) {
            struct disk_exception* de =
                    reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);
            const CowOperation* cow_op = vec[i];

            de->old_chunk = cow_op->new_block;
            de->new_chunk = data_chunk_id;

            // Store operation pointer.
            chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), cow_op));
            offset += sizeof(struct disk_exception);
            num_ops += 1;
            if (cow_op->type == kCowCopyOp) {
                copy_ops++;
            } else {  // cow_op->type == kCowXorOp
                xor_ops++;
            }

            if (read_ahead_feature_) {
                read_ahead_ops_.push_back(cow_op);
            }

            SNAP_LOG(DEBUG) << num_ops << ":"
                            << " Ordered-op: "
                            << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;

            if (num_ops == exceptions_per_area_) {
                // Store it in vector at the right index. This maps the chunk-id to
                // vector index.
                vec_.push_back(std::move(de_ptr));
                num_ops = 0;
                offset = 0;

                // Create buffer for next area
                de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
                                                     sizeof(struct disk_exception));
                memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

                if (cowop_rm_iter->Done()) {
                    vec_.push_back(std::move(de_ptr));
                    SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
                }

                if (!(pending_ordered_ops == 0)) {
                    SNAP_LOG(ERROR) << "Invalid pending_ordered_ops: expected: 0 found: "
                                    << pending_ordered_ops;
                    return false;
                }
                pending_ordered_ops = exceptions_per_area_;
            }

            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            total_ordered_ops -= 1;
            /*
             * Split the number of ops based on the size of read-ahead buffer
             * region. We need to ensure that kernel doesn't issue IO on blocks
             * which are not read by the read-ahead thread.
             */
            if (read_ahead_feature_ && (total_ordered_ops % num_ra_ops_per_iter == 0)) {
                data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            }
        }
        vec.clear();
        dest_blocks.clear();
        source_blocks.clear();
        prev_id.reset();
    }

    // Partially filled area or there is no metadata
    // If there is no metadata, fill with zero so that kernel
    // is aware that merge is completed.
    if (num_ops || !metadata_found) {
        vec_.push_back(std::move(de_ptr));
        SNAP_LOG(DEBUG) << "ReadMetadata() completed. Partially filled area num_ops: " << num_ops
                        << " Areas: " << vec_.size();
    }

    chunk_vec_.shrink_to_fit();
    vec_.shrink_to_fit();
    read_ahead_ops_.shrink_to_fit();

    // Sort the vector based on sectors as we need this during un-aligned access
    std::sort(chunk_vec_.begin(), chunk_vec_.end(), compare);

    SNAP_LOG(INFO) << "ReadMetadata completed. Final-chunk-id: " << data_chunk_id
                   << " Num Sector: " << ChunkToSector(data_chunk_id)
                   << " Replace-ops: " << replace_ops << " Zero-ops: " << zero_ops
                   << " Copy-ops: " << copy_ops << " Xor-ops: " << xor_ops
                   << " Areas: " << vec_.size() << " Num-ops-merged: " << header.num_merge_ops
                   << " Total-data-ops: " << reader_->get_num_total_data_ops();

    // Total number of sectors required for creating dm-user device
    num_sectors_ = ChunkToSector(data_chunk_id);
    merge_initiated_ = false;
    PrepareReadAhead();

    return true;
}

bool Snapuserd::MmapMetadata() {
    CowHeader header;
    reader_->GetHeader(&header);

    if (header.major_version >= 2 && header.buffer_size > 0) {
        total_mapped_addr_length_ = header.header_size + BUFFER_REGION_DEFAULT_SIZE;
        read_ahead_feature_ = true;
    } else {
        // mmap the first 4k page - older COW format
        total_mapped_addr_length_ = BLOCK_SZ;
        read_ahead_feature_ = false;
    }

    mapped_addr_ = mmap(NULL, total_mapped_addr_length_, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cow_fd_.get(), 0);
    if (mapped_addr_ == MAP_FAILED) {
        SNAP_LOG(ERROR) << "mmap metadata failed";
        return false;
    }

    return true;
}

void Snapuserd::UnmapBufferRegion() {
    int ret = munmap(mapped_addr_, total_mapped_addr_length_);
    if (ret < 0) {
        SNAP_PLOG(ERROR) << "munmap failed";
    }
}

void MyLogger(android::base::LogId, android::base::LogSeverity severity, const char*, const char*,
              unsigned int, const char* message) {
    if (severity == android::base::ERROR) {
        fprintf(stderr, "%s\n", message);
    } else {
        fprintf(stdout, "%s\n", message);
    }
}

bool Snapuserd::InitCowDevice() {
    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
    if (cow_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
        return false;
    }

    return ReadMetadata();
}

void Snapuserd::ReadBlocksToCache(const std::string& dm_block_device,
                                  const std::string& partition_name, off_t offset, size_t size) {
    android::base::unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY)));
    if (fd.get() == -1) {
        SNAP_PLOG(ERROR) << "Error reading " << dm_block_device
                         << " partition-name: " << partition_name;
        return;
    }

    size_t remain = size;
    off_t file_offset = offset;
    // We pick 4M I/O size based on the fact that the current
    // update_verifier has a similar I/O size.
    size_t read_sz = 1024 * BLOCK_SZ;
    std::vector<uint8_t> buf(read_sz);

    while (remain > 0) {
        size_t to_read = std::min(remain, read_sz);

        if (!android::base::ReadFullyAtOffset(fd.get(), buf.data(), to_read, file_offset)) {
            SNAP_PLOG(ERROR) << "Failed to read block from block device: " << dm_block_device
                             << " at offset: " << file_offset
                             << " partition-name: " << partition_name << " total-size: " << size
                             << " remain_size: " << remain;
            return;
        }

        file_offset += to_read;
        remain -= to_read;
    }

    SNAP_LOG(INFO) << "Finished reading block-device: " << dm_block_device
                   << " partition: " << partition_name << " size: " << size
                   << " offset: " << offset;
}

void Snapuserd::ReadBlocks(const std::string& partition_name, const std::string& dm_block_device) {
    SNAP_LOG(DEBUG) << "Reading partition: " << partition_name
                    << " Block-Device: " << dm_block_device;

    uint64_t dev_sz = 0;

    unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY | O_CLOEXEC)));
    if (fd < 0) {
        SNAP_LOG(ERROR) << "Cannot open block device";
        return;
    }

    dev_sz = get_block_device_size(fd.get());
    if (!dev_sz) {
        SNAP_PLOG(ERROR) << "Could not determine block device size: " << dm_block_device;
        return;
    }

    int num_threads = 2;
    size_t num_blocks = dev_sz >> BLOCK_SHIFT;
    size_t num_blocks_per_thread = num_blocks / num_threads;
    size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
    off_t offset = 0;

    for (int i = 0; i < num_threads; i++) {
        std::async(std::launch::async, &Snapuserd::ReadBlocksToCache, this, dm_block_device,
                   partition_name, offset, read_sz_per_thread);

        offset += read_sz_per_thread;
    }
}

/*
 * Entry point to launch threads
 */
bool Snapuserd::Start() {
    std::vector<std::future<bool>> threads;
    std::future<bool> ra_thread;
    bool rathread = (read_ahead_feature_ && (read_ahead_ops_.size() > 0));

    // Start the read-ahead thread and wait
    // for it as the data has to be re-constructed
    // from COW device.
    if (rathread) {
        ra_thread = std::async(std::launch::async, &ReadAheadThread::RunThread,
                               read_ahead_thread_.get());
        if (!WaitForReadAheadToStart()) {
            SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
            return false;
        }

        SNAP_LOG(INFO) << "Read-ahead thread started...";
    }

    // Launch worker threads
    for (int i = 0; i < worker_threads_.size(); i++) {
        threads.emplace_back(
                std::async(std::launch::async, &WorkerThread::RunThread, worker_threads_[i].get()));
    }

    bool second_stage_init = true;

    // We don't want to read the blocks during first stage init.
    if (android::base::EndsWith(misc_name_, "-init") || is_socket_present_) {
        second_stage_init = false;
    }

    if (second_stage_init) {
        SNAP_LOG(INFO) << "Reading blocks to cache....";
        auto& dm = DeviceMapper::Instance();
        auto dm_block_devices = dm.FindDmPartitions();
        if (dm_block_devices.empty()) {
            SNAP_LOG(ERROR) << "No dm-enabled block device is found.";
        } else {
            auto parts = android::base::Split(misc_name_, "-");
            std::string partition_name = parts[0];

            const char* suffix_b = "_b";
            const char* suffix_a = "_a";

            partition_name.erase(partition_name.find_last_not_of(suffix_b) + 1);
            partition_name.erase(partition_name.find_last_not_of(suffix_a) + 1);
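            // Note: find_last_not_of() treats its argument as a set of
            // characters; the two erase() calls above strip a trailing
            // "_a"/"_b" slot suffix, e.g. "system_b" -> "system", so the
            // name can be looked up in dm_block_devices below.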

            if (dm_block_devices.find(partition_name) == dm_block_devices.end()) {
                SNAP_LOG(ERROR) << "Failed to find dm block device for " << partition_name;
            } else {
                ReadBlocks(partition_name, dm_block_devices.at(partition_name));
            }
        }
    } else {
        SNAP_LOG(INFO) << "Not reading block device into cache";
    }

    bool ret = true;
    for (auto& t : threads) {
        ret = t.get() && ret;
    }

    if (rathread) {
        // Notify the read-ahead thread that all worker threads
        // are done. We need this explicit notification when
        // there is an IO failure or there was a switch
        // of dm-user table; thus, forcing the read-ahead
        // thread to wake up.
        MergeCompleted();
        ret = ret && ra_thread.get();
    }

    return ret;
}

uint64_t Snapuserd::GetBufferMetadataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t size = header.header_size + sizeof(BufferState);
    return size;
}
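
/*
 * Rough layout of the mmap()ed region, as implied by the offsets used by the
 * accessors in this section (a sketch, not a format spec):
 *
 *   [0, header_size)                          : COW header
 *   [header_size, header_size + buffer_size)  : scratch space, laid out as
 *       BufferState | ScratchMetadata entries | read-ahead data
 */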

/*
 * Metadata for read-ahead is 16 bytes. For a 2 MB region, we will
 * end up with 8k (2 PAGE) worth of metadata. Thus, a 2MB buffer
 * region is split into:
 *
 * 1: 8k metadata
 * 2: Remaining (2MB - 8k) space for the read-ahead data
 *    (see GetBufferDataSize() below).
 */
size_t Snapuserd::GetBufferMetadataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t metadata_bytes = (header.buffer_size * sizeof(struct ScratchMetadata)) / BLOCK_SZ;
    return metadata_bytes;
}
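
// Sanity check of the 8k figure from the comment above (assuming
// BLOCK_SZ == 4096, sizeof(struct ScratchMetadata) == 16 and a 2 MB buffer
// region): (2097152 * 16) / 4096 = 8192 bytes, i.e. two 4k pages of metadata.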

size_t Snapuserd::GetBufferDataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    return (header.header_size + GetBufferMetadataSize());
}

/*
 * (2MB - 8K = 2088960 bytes) will be the buffer region to hold the data.
 */
size_t Snapuserd::GetBufferDataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t size = header.buffer_size - GetBufferMetadataSize();
    return size;
}

struct BufferState* Snapuserd::GetBufferState() {
    CowHeader header;
    reader_->GetHeader(&header);

    struct BufferState* ra_state =
            reinterpret_cast<struct BufferState*>((char*)mapped_addr_ + header.header_size);
    return ra_state;
}

}  // namespace snapshot
}  // namespace android