• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define ATRACE_TAG ATRACE_TAG_PACKAGE_MANAGER
18 
19 #include "apexd_loop.h"
20 
21 #include <ApexProperties.sysprop.h>
22 #include <android-base/file.h>
23 #include <android-base/logging.h>
24 #include <android-base/parseint.h>
25 #include <android-base/properties.h>
26 #include <android-base/stringprintf.h>
27 #include <android-base/strings.h>
28 #include <dirent.h>
29 #include <fcntl.h>
30 #include <libdm/dm.h>
31 #include <linux/fs.h>
32 #include <linux/loop.h>
33 #include <sys/ioctl.h>
34 #include <sys/stat.h>
35 #include <sys/statfs.h>
36 #include <sys/sysmacros.h>
37 #include <sys/types.h>
38 #include <unistd.h>
39 #include <utils/Trace.h>
40 
41 #include <array>
42 #include <filesystem>
43 #include <mutex>
44 #include <string_view>
45 
46 #include "apexd_utils.h"
47 
48 using android::base::Basename;
49 using android::base::ErrnoError;
50 using android::base::Error;
51 using android::base::GetBoolProperty;
52 using android::base::ParseUint;
53 using android::base::ReadFileToString;
54 using android::base::Result;
55 using android::base::StartsWith;
56 using android::base::StringPrintf;
57 using android::base::unique_fd;
58 using android::dm::DeviceMapper;
59 
60 namespace android {
61 namespace apex {
62 namespace loop {
63 
// Prefix of the identifier this file copies into a loop device's
// lo_crypt_name field; DestroyLoopDevice() only clears devices whose
// lo_crypt_name starts with it.
64 static constexpr const char* kApexLoopIdPrefix = "apex:";
65 
66 // 128 kB read-ahead, which we currently use for /system as well
// Written as a string to /sys/block/<device>/queue/read_ahead_kb.
67 static constexpr const char* kReadAheadKb = "128";
68 
MaybeCloseBad()69 void LoopbackDeviceUniqueFd::MaybeCloseBad() {
70   if (device_fd.get() != -1) {
71     // Disassociate any files.
72     if (ioctl(device_fd.get(), LOOP_CLR_FD) == -1) {
73       PLOG(ERROR) << "Unable to clear fd for loopback device";
74     }
75   }
76 }
77 
ConfigureScheduler(const std::string & device_path)78 Result<void> ConfigureScheduler(const std::string& device_path) {
79   ATRACE_NAME("ConfigureScheduler");
80   if (!StartsWith(device_path, "/dev/")) {
81     return Error() << "Invalid argument " << device_path;
82   }
83 
84   const std::string device_name = Basename(device_path);
85 
86   const std::string sysfs_path =
87       StringPrintf("/sys/block/%s/queue/scheduler", device_name.c_str());
88   unique_fd sysfs_fd(open(sysfs_path.c_str(), O_RDWR | O_CLOEXEC));
89   if (sysfs_fd.get() == -1) {
90     return ErrnoError() << "Failed to open " << sysfs_path;
91   }
92 
93   // Kernels before v4.1 only support 'noop'. Kernels [v4.1, v5.0) support
94   // 'noop' and 'none'. Kernels v5.0 and later only support 'none'.
95   static constexpr const std::array<std::string_view, 2> kNoScheduler = {
96       "none", "noop"};
97 
98   int ret = 0;
99   std::string cur_sched_str;
100   if (!ReadFileToString(sysfs_path, &cur_sched_str)) {
101     return ErrnoError() << "Failed to read " << sysfs_path;
102   }
103   cur_sched_str = android::base::Trim(cur_sched_str);
104   if (std::count(kNoScheduler.begin(), kNoScheduler.end(), cur_sched_str)) {
105     return {};
106   }
107 
108   for (const std::string_view& scheduler : kNoScheduler) {
109     ret = write(sysfs_fd.get(), scheduler.data(), scheduler.size());
110     if (ret > 0) {
111       break;
112     }
113   }
114 
115   if (ret <= 0) {
116     return ErrnoError() << "Failed to write to " << sysfs_path;
117   }
118 
119   return {};
120 }
121 
122 // Return the parent device of a partition. Converts e.g. "sda26" into "sda".
PartitionParent(const std::string & blockdev)123 static Result<std::string> PartitionParent(const std::string& blockdev) {
124   if (blockdev.find('/') != std::string::npos) {
125     return Error() << "Invalid argument " << blockdev;
126   }
127   std::error_code ec;
128   for (const auto& entry :
129        std::filesystem::directory_iterator("/sys/class/block", ec)) {
130     const std::string path = entry.path().string();
131     if (std::filesystem::exists(
132             StringPrintf("%s/%s", path.c_str(), blockdev.c_str()))) {
133       return Basename(path);
134     }
135   }
136   return blockdev;
137 }
138 
139 // Convert a major:minor pair into a block device name.
BlockdevName(dev_t dev)140 static std::string BlockdevName(dev_t dev) {
141   std::error_code ec;
142   for (const auto& entry :
143        std::filesystem::directory_iterator("/dev/block", ec)) {
144     struct stat statbuf;
145     if (stat(entry.path().string().c_str(), &statbuf) < 0) {
146       continue;
147     }
148     if (dev == statbuf.st_rdev) {
149       return Basename(entry.path().string());
150     }
151   }
152   return {};
153 }
154 
155 // For file `file_path`, retrieve the block device backing the filesystem on
156 // which the file exists and return the queue depth of the block device. The
157 // loop in this function may e.g. traverse the following hierarchy:
158 // /dev/block/dm-9 (system-verity; dm-verity)
159 // -> /dev/block/dm-1 (system_b; dm-linear)
160 // -> /dev/sda26
BlockDeviceQueueDepth(const std::string & file_path)161 static Result<uint32_t> BlockDeviceQueueDepth(const std::string& file_path) {
162   struct stat statbuf;
163   int res = stat(file_path.c_str(), &statbuf);
164   if (res < 0) {
165     return ErrnoErrorf("stat({})", file_path.c_str());
166   }
167   std::string blockdev = "/dev/block/" + BlockdevName(statbuf.st_dev);
168   LOG(VERBOSE) << file_path << " -> " << blockdev;
169   if (blockdev.empty()) {
170     return Errorf("Failed to convert {}:{} (path {})", major(statbuf.st_dev),
171                   minor(statbuf.st_dev), file_path.c_str());
172   }
173   auto& dm = DeviceMapper::Instance();
174   for (;;) {
175     std::optional<std::string> child = dm.GetParentBlockDeviceByPath(blockdev);
176     if (!child) {
177       break;
178     }
179     LOG(VERBOSE) << blockdev << " -> " << *child;
180     blockdev = *child;
181   }
182   std::optional<std::string> maybe_blockdev =
183       android::dm::ExtractBlockDeviceName(blockdev);
184   if (!maybe_blockdev) {
185     return Error() << "Failed to remove /dev/block/ prefix from " << blockdev;
186   }
187   Result<std::string> maybe_parent = PartitionParent(*maybe_blockdev);
188   if (!maybe_parent.ok()) {
189     return Error() << "Failed to determine parent of " << *maybe_blockdev;
190   }
191   blockdev = *maybe_parent;
192   LOG(VERBOSE) << "Partition parent: " << blockdev;
193   const std::string nr_tags_path =
194       StringPrintf("/sys/class/block/%s/mq/0/nr_tags", blockdev.c_str());
195   std::string nr_tags;
196   if (!ReadFileToString(nr_tags_path, &nr_tags)) {
197     return ErrnoError() << "Failed to read " << nr_tags_path;
198   }
199   nr_tags = android::base::Trim(nr_tags);
200   LOG(VERBOSE) << file_path << " is backed by /dev/" << blockdev
201                << " and that block device supports queue depth " << nr_tags;
202   return strtol(nr_tags.c_str(), NULL, 0);
203 }
204 
205 // Set 'nr_requests' of `loop_device_path` equal to the queue depth of
206 // the block device backing `file_path`.
ConfigureQueueDepth(const std::string & loop_device_path,const std::string & file_path)207 Result<void> ConfigureQueueDepth(const std::string& loop_device_path,
208                                  const std::string& file_path) {
209   ATRACE_NAME("ConfigureQueueDepth");
210   if (!StartsWith(loop_device_path, "/dev/")) {
211     return Error() << "Invalid argument " << loop_device_path;
212   }
213 
214   const std::string loop_device_name = Basename(loop_device_path);
215 
216   const std::string sysfs_path =
217       StringPrintf("/sys/block/%s/queue/nr_requests", loop_device_name.c_str());
218   std::string cur_nr_requests_str;
219   if (!ReadFileToString(sysfs_path, &cur_nr_requests_str)) {
220     return ErrnoError() << "Failed to read " << sysfs_path;
221   }
222   cur_nr_requests_str = android::base::Trim(cur_nr_requests_str);
223   uint32_t cur_nr_requests = 0;
224   if (!ParseUint(cur_nr_requests_str.c_str(), &cur_nr_requests)) {
225     return Error() << "Failed to parse " << cur_nr_requests_str;
226   }
227 
228   unique_fd sysfs_fd(open(sysfs_path.c_str(), O_RDWR | O_CLOEXEC));
229   if (sysfs_fd.get() == -1) {
230     return ErrnoErrorf("Failed to open {}", sysfs_path);
231   }
232 
233   const auto qd = BlockDeviceQueueDepth(file_path);
234   if (!qd.ok()) {
235     return qd.error();
236   }
237   if (*qd == cur_nr_requests) {
238     return {};
239   }
240   // Only report write failures if reducing the queue depth. Attempts to
241   // increase the queue depth are rejected by the kernel if no I/O scheduler
242   // is associated with the request queue.
243   if (!WriteStringToFd(StringPrintf("%u", *qd), sysfs_fd) &&
244       *qd < cur_nr_requests) {
245     return ErrnoErrorf("Failed to write {} to {}", *qd, sysfs_path);
246   }
247   return {};
248 }
249 
ConfigureReadAhead(const std::string & device_path)250 Result<void> ConfigureReadAhead(const std::string& device_path) {
251   ATRACE_NAME("ConfigureReadAhead");
252   CHECK(StartsWith(device_path, "/dev/"));
253   std::string device_name = Basename(device_path);
254 
255   std::string sysfs_device =
256       StringPrintf("/sys/block/%s/queue/read_ahead_kb", device_name.c_str());
257   unique_fd sysfs_fd(open(sysfs_device.c_str(), O_RDWR | O_CLOEXEC));
258   if (sysfs_fd.get() == -1) {
259     return ErrnoError() << "Failed to open " << sysfs_device;
260   }
261 
262   int ret = TEMP_FAILURE_RETRY(
263       write(sysfs_fd.get(), kReadAheadKb, strlen(kReadAheadKb) + 1));
264   if (ret < 0) {
265     return ErrnoError() << "Failed to write to " << sysfs_device;
266   }
267 
268   return {};
269 }
270 
PreAllocateLoopDevices(size_t num)271 Result<void> PreAllocateLoopDevices(size_t num) {
272   Result<void> loop_ready = WaitForFile("/dev/loop-control", 20s);
273   if (!loop_ready.ok()) {
274     return loop_ready;
275   }
276   unique_fd ctl_fd(
277       TEMP_FAILURE_RETRY(open("/dev/loop-control", O_RDWR | O_CLOEXEC)));
278   if (ctl_fd.get() == -1) {
279     return ErrnoError() << "Failed to open loop-control";
280   }
281 
282   int new_allocations = 0;  // for logging purpose
283 
284   // Assumption: loop device ID [0..num) is valid.
285   // This is because pre-allocation happens during bootstrap.
286   // Anyway Kernel pre-allocated loop devices
287   // as many as CONFIG_BLK_DEV_LOOP_MIN_COUNT,
288   // Within the amount of kernel-pre-allocation,
289   // LOOP_CTL_ADD will fail with EEXIST
290   for (size_t id = 0ul, cnt = 0; cnt < num; ++id) {
291     int ret = ioctl(ctl_fd.get(), LOOP_CTL_ADD, id);
292     if (ret > 0) {
293       new_allocations++;
294       cnt++;
295     } else if (errno == EEXIST) {
296       // When LOOP_CTL_ADD failed with EEXIST, it can check
297       // whether it is already in use.
298       // Otherwise, the loop devices pre-allocated by the kernel can be used.
299       std::string loop_device = StringPrintf("/sys/block/loop%zu/loop", id);
300       if (access(loop_device.c_str(), F_OK) == 0) {
301         LOG(WARNING) << "Loop device " << id << " already in use";
302       } else {
303         cnt++;
304       }
305     } else {
306       return ErrnoError() << "Failed LOOP_CTL_ADD id = " << id;
307     }
308   }
309 
310   // Don't wait until the dev nodes are actually created, which
311   // will delay the boot. By simply returing here, the creation of the dev
312   // nodes will be done in parallel with other boot processes, and we
313   // just optimistally hope that they are all created when we actually
314   // access them for activating APEXes. If the dev nodes are not ready
315   // even then, we wait 50ms and warning message will be printed (see below
316   // CreateLoopDevice()).
317   LOG(INFO) << "Found " << (num - new_allocations)
318             << " idle loopback devices that were "
319             << "pre-allocated by kernel. Allocated " << new_allocations
320             << " more.";
321   return {};
322 }
323 
ConfigureLoopDevice(const int device_fd,const std::string & target,const uint32_t image_offset,const size_t image_size)324 Result<void> ConfigureLoopDevice(const int device_fd, const std::string& target,
325                                  const uint32_t image_offset,
326                                  const size_t image_size) {
327   static bool use_loop_configure;
328   static std::once_flag once_flag;
329   std::call_once(once_flag, [&]() {
330     // LOOP_CONFIGURE is a new ioctl in Linux 5.8 (and backported in Android
331     // common) that allows atomically configuring a loop device. It is a lot
332     // faster than the traditional LOOP_SET_FD/LOOP_SET_STATUS64 combo, but
333     // it may not be available on updating devices, so try once before
334     // deciding.
335     struct loop_config config;
336     memset(&config, 0, sizeof(config));
337     config.fd = -1;
338     if (ioctl(device_fd, LOOP_CONFIGURE, &config) == -1 && errno == EBADF) {
339       // If the IOCTL exists, it will fail with EBADF for the -1 fd
340       use_loop_configure = true;
341     }
342   });
343 
344   /*
345    * Using O_DIRECT will tell the kernel that we want to use Direct I/O
346    * on the underlying file, which we want to do to avoid double caching.
347    * Note that Direct I/O won't be enabled immediately, because the block
348    * size of the underlying block device may not match the default loop
349    * device block size (512); when we call LOOP_SET_BLOCK_SIZE below, the
350    * kernel driver will automatically enable Direct I/O when it sees that
351    * condition is now met.
352    */
353   bool use_buffered_io = false;
354   unique_fd target_fd(open(target.c_str(), O_RDONLY | O_CLOEXEC | O_DIRECT));
355   if (target_fd.get() == -1) {
356     struct statfs stbuf;
357     int saved_errno = errno;
358     // let's give another try with buffered I/O for EROFS and squashfs
359     if (statfs(target.c_str(), &stbuf) != 0 ||
360         (stbuf.f_type != EROFS_SUPER_MAGIC_V1 &&
361          stbuf.f_type != SQUASHFS_MAGIC &&
362          stbuf.f_type != OVERLAYFS_SUPER_MAGIC)) {
363       return Error(saved_errno) << "Failed to open " << target;
364     }
365     LOG(WARNING) << "Fallback to buffered I/O for " << target;
366     use_buffered_io = true;
367     target_fd.reset(open(target.c_str(), O_RDONLY | O_CLOEXEC));
368     if (target_fd.get() == -1) {
369       return ErrnoError() << "Failed to open " << target;
370     }
371   }
372 
373   struct loop_info64 li;
374   memset(&li, 0, sizeof(li));
375   strlcpy((char*)li.lo_crypt_name, kApexLoopIdPrefix, LO_NAME_SIZE);
376   li.lo_offset = image_offset;
377   li.lo_sizelimit = image_size;
378   // Automatically free loop device on last close.
379   li.lo_flags |= LO_FLAGS_AUTOCLEAR;
380 
381   if (use_loop_configure) {
382     struct loop_config config;
383     memset(&config, 0, sizeof(config));
384     config.fd = target_fd.get();
385     config.info = li;
386     config.block_size = 4096;
387     if (!use_buffered_io) {
388         li.lo_flags |= LO_FLAGS_DIRECT_IO;
389     }
390 
391     if (ioctl(device_fd, LOOP_CONFIGURE, &config) == -1) {
392       return ErrnoError() << "Failed to LOOP_CONFIGURE";
393     }
394 
395     return {};
396   } else {
397     if (ioctl(device_fd, LOOP_SET_FD, target_fd.get()) == -1) {
398       return ErrnoError() << "Failed to LOOP_SET_FD";
399     }
400 
401     if (ioctl(device_fd, LOOP_SET_STATUS64, &li) == -1) {
402       return ErrnoError() << "Failed to LOOP_SET_STATUS64";
403     }
404 
405     if (ioctl(device_fd, BLKFLSBUF, 0) == -1) {
406       // This works around a kernel bug where the following happens.
407       // 1) The device runs with a value of loop.max_part > 0
408       // 2) As part of LOOP_SET_FD above, we do a partition scan, which loads
409       //    the first 2 pages of the underlying file into the buffer cache
410       // 3) When we then change the offset with LOOP_SET_STATUS64, those pages
411       //    are not invalidated from the cache.
412       // 4) When we try to mount an ext4 filesystem on the loop device, the ext4
413       //    code will try to find a superblock by reading 4k at offset 0; but,
414       //    because we still have the old pages at offset 0 lying in the cache,
415       //    those pages will be returned directly. However, those pages contain
416       //    the data at offset 0 in the underlying file, not at the offset that
417       //    we configured
418       // 5) the ext4 driver fails to find a superblock in the (wrong) data, and
419       //    fails to mount the filesystem.
420       //
421       // To work around this, explicitly flush the block device, which will
422       // flush the buffer cache and make sure we actually read the data at the
423       // correct offset.
424       return ErrnoError() << "Failed to flush buffers on the loop device";
425     }
426 
427     // Direct-IO requires the loop device to have the same block size as the
428     // underlying filesystem.
429     if (ioctl(device_fd, LOOP_SET_BLOCK_SIZE, 4096) == -1) {
430       PLOG(WARNING) << "Failed to LOOP_SET_BLOCK_SIZE";
431     }
432   }
433   return {};
434 }
435 
WaitForDevice(int num)436 Result<LoopbackDeviceUniqueFd> WaitForDevice(int num) {
437   std::string opened_device;
438   const std::vector<std::string> candidate_devices = {
439       StringPrintf("/dev/block/loop%d", num),
440       StringPrintf("/dev/loop%d", num),
441   };
442 
443   // apexd-bootstrap runs in parallel with ueventd to optimize boot time. In
444   // rare cases apexd would try attempt to mount an apex before ueventd created
445   // a loop device for it. To work around this we keep polling for loop device
446   // to be created until ueventd's cold boot sequence is done.
447   bool cold_boot_done = GetBoolProperty("ro.cold_boot_done", false);
448 
449   // Even though the kernel has created the loop device, we still depend on
450   // ueventd to run to actually create the device node in userspace. To solve
451   // this properly we should listen on the netlink socket for uevents, or use
452   // inotify. For now, this will have to do.
453   size_t attempts =
454       android::sysprop::ApexProperties::loop_wait_attempts().value_or(3u);
455   for (size_t i = 0; i != attempts; ++i) {
456     if (!cold_boot_done) {
457       cold_boot_done = GetBoolProperty("ro.cold_boot_done", false);
458     }
459     for (const auto& device : candidate_devices) {
460       unique_fd sysfs_fd(open(device.c_str(), O_RDWR | O_CLOEXEC));
461       if (sysfs_fd.get() != -1) {
462         return LoopbackDeviceUniqueFd(std::move(sysfs_fd), device);
463       }
464     }
465     PLOG(WARNING) << "Loopback device " << num << " not ready. Waiting 50ms...";
466     usleep(50000);
467     if (!cold_boot_done) {
468       // ueventd hasn't finished cold boot yet, keep trying.
469       i = 0;
470     }
471   }
472 
473   return Error() << "Failed to open loopback device " << num;
474 }
475 
CreateLoopDevice(const std::string & target,uint32_t image_offset,size_t image_size)476 Result<LoopbackDeviceUniqueFd> CreateLoopDevice(const std::string& target,
477                                                 uint32_t image_offset,
478                                                 size_t image_size) {
479   ATRACE_NAME("CreateLoopDevice");
480 
481   unique_fd ctl_fd(open("/dev/loop-control", O_RDWR | O_CLOEXEC));
482   if (ctl_fd.get() == -1) {
483     return ErrnoError() << "Failed to open loop-control";
484   }
485 
486   static std::mutex mtx;
487   std::lock_guard lock(mtx);
488   int num = ioctl(ctl_fd.get(), LOOP_CTL_GET_FREE);
489   if (num == -1) {
490     return ErrnoError() << "Failed LOOP_CTL_GET_FREE";
491   }
492 
493   Result<LoopbackDeviceUniqueFd> loop_device = WaitForDevice(num);
494   if (!loop_device.ok()) {
495     return loop_device.error();
496   }
497   CHECK_NE(loop_device->device_fd.get(), -1);
498 
499   Result<void> configure_status = ConfigureLoopDevice(
500       loop_device->device_fd.get(), target, image_offset, image_size);
501   if (!configure_status.ok()) {
502     return configure_status.error();
503   }
504 
505   return loop_device;
506 }
507 
CreateAndConfigureLoopDevice(const std::string & target,uint32_t image_offset,size_t image_size)508 Result<LoopbackDeviceUniqueFd> CreateAndConfigureLoopDevice(
509     const std::string& target, uint32_t image_offset, size_t image_size) {
510   ATRACE_NAME("CreateAndConfigureLoopDevice");
511   // Do minimal amount of work while holding a mutex. We need it because
512   // acquiring + configuring a loop device is not atomic. Ideally we should
513   // pre-acquire all the loop devices in advance, so that when we run APEX
514   // activation in-parallel, we can do it without holding any lock.
515   // Unfortunately, this will require some refactoring of how we manage loop
516   // devices, and probably some new loop-control ioctls, so for the time being
517   // we just limit the scope that requires locking.
518   auto loop_device = CreateLoopDevice(target, image_offset, image_size);
519   if (!loop_device.ok()) {
520     return loop_device.error();
521   }
522 
523   Result<void> sched_status = ConfigureScheduler(loop_device->name);
524   if (!sched_status.ok()) {
525     LOG(WARNING) << "Configuring I/O scheduler failed: "
526                  << sched_status.error();
527   }
528 
529   Result<void> qd_status = ConfigureQueueDepth(loop_device->name, target);
530   if (!qd_status.ok()) {
531     LOG(WARNING) << qd_status.error();
532   }
533 
534   Result<void> read_ahead_status = ConfigureReadAhead(loop_device->name);
535   if (!read_ahead_status.ok()) {
536     return read_ahead_status.error();
537   }
538 
539   return loop_device;
540 }
541 
DestroyLoopDevice(const std::string & path,const DestroyLoopFn & extra)542 void DestroyLoopDevice(const std::string& path, const DestroyLoopFn& extra) {
543   unique_fd fd(open(path.c_str(), O_RDWR | O_CLOEXEC));
544   if (fd.get() == -1) {
545     if (errno != ENOENT) {
546       PLOG(WARNING) << "Failed to open " << path;
547     }
548     return;
549   }
550 
551   struct loop_info64 li;
552   if (ioctl(fd.get(), LOOP_GET_STATUS64, &li) < 0) {
553     if (errno != ENXIO) {
554       PLOG(WARNING) << "Failed to LOOP_GET_STATUS64 " << path;
555     }
556     return;
557   }
558 
559   auto id = std::string((char*)li.lo_crypt_name);
560   if (StartsWith(id, kApexLoopIdPrefix)) {
561     extra(path, id);
562 
563     if (ioctl(fd.get(), LOOP_CLR_FD, 0) < 0) {
564       PLOG(WARNING) << "Failed to LOOP_CLR_FD " << path;
565     }
566   }
567 }
568 
569 }  // namespace loop
570 }  // namespace apex
571 }  // namespace android
572