1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/sched.h>
7 #include <linux/sched/mm.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/ratelimit.h>
12 #include <linux/kthread.h>
13 #include <linux/raid/pq.h>
14 #include <linux/semaphore.h>
15 #include <linux/uuid.h>
16 #include <linux/list_sort.h>
17 #include <linux/namei.h>
18 #include "misc.h"
19 #include "ctree.h"
20 #include "extent_map.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "async-thread.h"
27 #include "check-integrity.h"
28 #include "rcu-string.h"
29 #include "dev-replace.h"
30 #include "sysfs.h"
31 #include "tree-checker.h"
32 #include "space-info.h"
33 #include "block-group.h"
34 #include "discard.h"
35
36 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
37 [BTRFS_RAID_RAID10] = {
38 .sub_stripes = 2,
39 .dev_stripes = 1,
40 .devs_max = 0, /* 0 == as many as possible */
41 .devs_min = 4,
42 .tolerated_failures = 1,
43 .devs_increment = 2,
44 .ncopies = 2,
45 .nparity = 0,
46 .raid_name = "raid10",
47 .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
48 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
49 },
50 [BTRFS_RAID_RAID1] = {
51 .sub_stripes = 1,
52 .dev_stripes = 1,
53 .devs_max = 2,
54 .devs_min = 2,
55 .tolerated_failures = 1,
56 .devs_increment = 2,
57 .ncopies = 2,
58 .nparity = 0,
59 .raid_name = "raid1",
60 .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
61 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
62 },
63 [BTRFS_RAID_RAID1C3] = {
64 .sub_stripes = 1,
65 .dev_stripes = 1,
66 .devs_max = 3,
67 .devs_min = 3,
68 .tolerated_failures = 2,
69 .devs_increment = 3,
70 .ncopies = 3,
71 .nparity = 0,
72 .raid_name = "raid1c3",
73 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
74 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
75 },
76 [BTRFS_RAID_RAID1C4] = {
77 .sub_stripes = 1,
78 .dev_stripes = 1,
79 .devs_max = 4,
80 .devs_min = 4,
81 .tolerated_failures = 3,
82 .devs_increment = 4,
83 .ncopies = 4,
84 .nparity = 0,
85 .raid_name = "raid1c4",
86 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
87 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
88 },
89 [BTRFS_RAID_DUP] = {
90 .sub_stripes = 1,
91 .dev_stripes = 2,
92 .devs_max = 1,
93 .devs_min = 1,
94 .tolerated_failures = 0,
95 .devs_increment = 1,
96 .ncopies = 2,
97 .nparity = 0,
98 .raid_name = "dup",
99 .bg_flag = BTRFS_BLOCK_GROUP_DUP,
100 .mindev_error = 0,
101 },
102 [BTRFS_RAID_RAID0] = {
103 .sub_stripes = 1,
104 .dev_stripes = 1,
105 .devs_max = 0,
106 .devs_min = 2,
107 .tolerated_failures = 0,
108 .devs_increment = 1,
109 .ncopies = 1,
110 .nparity = 0,
111 .raid_name = "raid0",
112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
113 .mindev_error = 0,
114 },
115 [BTRFS_RAID_SINGLE] = {
116 .sub_stripes = 1,
117 .dev_stripes = 1,
118 .devs_max = 1,
119 .devs_min = 1,
120 .tolerated_failures = 0,
121 .devs_increment = 1,
122 .ncopies = 1,
123 .nparity = 0,
124 .raid_name = "single",
125 .bg_flag = 0,
126 .mindev_error = 0,
127 },
128 [BTRFS_RAID_RAID5] = {
129 .sub_stripes = 1,
130 .dev_stripes = 1,
131 .devs_max = 0,
132 .devs_min = 2,
133 .tolerated_failures = 1,
134 .devs_increment = 1,
135 .ncopies = 1,
136 .nparity = 1,
137 .raid_name = "raid5",
138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
140 },
141 [BTRFS_RAID_RAID6] = {
142 .sub_stripes = 1,
143 .dev_stripes = 1,
144 .devs_max = 0,
145 .devs_min = 3,
146 .tolerated_failures = 2,
147 .devs_increment = 1,
148 .ncopies = 1,
149 .nparity = 2,
150 .raid_name = "raid6",
151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
153 },
154 };
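/*
 * Illustrative sketch, not part of the upstream file: the table above is the
 * single source of truth for profile attributes, and helpers typically index
 * it via btrfs_bg_flags_to_raid_index().  The helper name below
 * (raid_attr_tolerated_failures) is hypothetical.
 */
static inline int __maybe_unused raid_attr_tolerated_failures(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return 0;

	/* e.g. 1 for raid10/raid1/raid5, 2 for raid6/raid1c3, 0 for single */
	return btrfs_raid_array[index].tolerated_failures;
}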
155
156 const char *btrfs_bg_type_to_raid_name(u64 flags)
157 {
158 const int index = btrfs_bg_flags_to_raid_index(flags);
159
160 if (index >= BTRFS_NR_RAID_TYPES)
161 return NULL;
162
163 return btrfs_raid_array[index].raid_name;
164 }
165
166 /*
167 * Fill @buf with textual description of @bg_flags, no more than @size_buf
168 * bytes including terminating null byte.
169 */
170 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
171 {
172 int i;
173 int ret;
174 char *bp = buf;
175 u64 flags = bg_flags;
176 u32 size_bp = size_buf;
177
178 if (!flags) {
179 strcpy(bp, "NONE");
180 return;
181 }
182
183 #define DESCRIBE_FLAG(flag, desc) \
184 do { \
185 if (flags & (flag)) { \
186 ret = snprintf(bp, size_bp, "%s|", (desc)); \
187 if (ret < 0 || ret >= size_bp) \
188 goto out_overflow; \
189 size_bp -= ret; \
190 bp += ret; \
191 flags &= ~(flag); \
192 } \
193 } while (0)
194
195 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
196 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
197 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
198
199 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
200 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
201 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
202 btrfs_raid_array[i].raid_name);
203 #undef DESCRIBE_FLAG
204
205 if (flags) {
206 ret = snprintf(bp, size_bp, "0x%llx|", flags);
207 size_bp -= ret;
208 }
209
210 if (size_bp < size_buf)
211 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
212
213 /*
214 * The text is trimmed; it's up to the caller to provide a sufficiently
215 * large buffer.
216 */
217 out_overflow:;
218 }
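/*
 * Illustrative usage sketch (hypothetical helper, not upstream code): callers
 * pass a stack buffer and get back a '|'-separated description, e.g. flags of
 * BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 yield "data|raid1".
 */
static void __maybe_unused describe_block_groups_example(void)
{
	char buf[64];

	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
	pr_debug("BTRFS: block group profile: %s\n", buf);
}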
219
220 static int init_first_rw_device(struct btrfs_trans_handle *trans);
221 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
222 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
223 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
224 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
225 enum btrfs_map_op op,
226 u64 logical, u64 *length,
227 struct btrfs_bio **bbio_ret,
228 int mirror_num, int need_raid_map);
229
230 /*
231 * Device locking
232 * ==============
233 *
234 * There are several mutexes that protect manipulation of devices and low-level
235 * structures like chunks but not block groups, extents or files
236 *
237 * uuid_mutex (global lock)
238 * ------------------------
239 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
240 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
241 * device) or requested by the device= mount option
242 *
243 * the mutex can be very coarse and can cover long-running operations
244 *
245 * protects: updates to fs_devices counters like missing devices, rw devices,
246 * seeding, structure cloning, opening/closing devices at mount/umount time
247 *
248 * global::fs_devs - add, remove, updates to the global list
249 *
250 * does not protect: manipulation of the fs_devices::devices list in general
251 * but in mount context it could be used to exclude list modifications by eg.
252 * scan ioctl
253 *
254 * btrfs_device::name - renames (write side), read is RCU
255 *
256 * fs_devices::device_list_mutex (per-fs, with RCU)
257 * ------------------------------------------------
258 * protects updates to fs_devices::devices, ie. adding and deleting
259 *
260 * simple list traversal with read-only actions can be done with RCU protection
261 *
262 * may be used to exclude some operations from running concurrently without any
263 * modifications to the list (see write_all_supers)
264 *
265 * Is not required at mount and close times, because our device list is
266 * protected by the uuid_mutex at that point.
267 *
268 * balance_mutex
269 * -------------
270 * protects balance structures (status, state) and context accessed from
271 * several places (internally, ioctl)
272 *
273 * chunk_mutex
274 * -----------
275 * protects chunks, adding or removing during allocation, trim or when a new
276 * device is added/removed. Additionally it also protects post_commit_list of
277 * individual devices, since they can be added to the transaction's
278 * post_commit_list only with chunk_mutex held.
279 *
280 * cleaner_mutex
281 * -------------
282 * a big lock that is held by the cleaner thread and prevents running subvolume
283 * cleaning together with relocation or delayed iputs
284 *
285 *
286 * Lock nesting
287 * ============
288 *
289 * uuid_mutex
290 * device_list_mutex
291 * chunk_mutex
292 * balance_mutex
293 *
294 *
295 * Exclusive operations
296 * ====================
297 *
298 * Maintains the exclusivity of the following operations that apply to the
299 * whole filesystem and cannot run in parallel.
300 *
301 * - Balance (*)
302 * - Device add
303 * - Device remove
304 * - Device replace (*)
305 * - Resize
306 *
307 * The device operations (as above) can be in one of the following states:
308 *
309 * - Running state
310 * - Paused state
311 * - Completed state
312 *
313 * Only device operations marked with (*) can go into the Paused state for the
314 * following reasons:
315 *
316 * - ioctl (only Balance can be Paused through ioctl)
317 * - filesystem remounted as read-only
318 * - filesystem unmounted and mounted as read-only
319 * - system power-cycle and filesystem mounted as read-only
320 * - filesystem or device errors leading to forced read-only
321 *
322 * The status of exclusive operation is set and cleared atomically.
323 * During the course of Paused state, fs_info::exclusive_operation remains set.
324 * A device operation in Paused or Running state can be canceled or resumed
325 * either by ioctl (Balance only) or when remounted as read-write.
326 * The exclusive status is cleared when the device operation is canceled or
327 * completed.
328 */
329
330 DEFINE_MUTEX(uuid_mutex);
331 static LIST_HEAD(fs_uuids);
332 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
333 {
334 return &fs_uuids;
335 }
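/*
 * Illustrative sketch of the lock nesting documented above (hypothetical
 * helper, not upstream code): when several of these locks are needed they are
 * taken in uuid_mutex -> device_list_mutex -> chunk_mutex -> balance_mutex
 * order and dropped in reverse.
 */
static void __maybe_unused lock_nesting_example(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	mutex_lock(&fs_info->balance_mutex);

	/* ... device/chunk/balance state may be manipulated here ... */

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);
}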
336
337 /*
338 * alloc_fs_devices - allocate struct btrfs_fs_devices
339 * @fsid: if not NULL, copy the UUID to fs_devices::fsid
340 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
341 *
342 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
343 * The returned struct is not linked onto any lists and can be destroyed with
344 * kfree() right away.
345 */
346 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
347 const u8 *metadata_fsid)
348 {
349 struct btrfs_fs_devices *fs_devs;
350
351 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
352 if (!fs_devs)
353 return ERR_PTR(-ENOMEM);
354
355 mutex_init(&fs_devs->device_list_mutex);
356
357 INIT_LIST_HEAD(&fs_devs->devices);
358 INIT_LIST_HEAD(&fs_devs->alloc_list);
359 INIT_LIST_HEAD(&fs_devs->fs_list);
360 INIT_LIST_HEAD(&fs_devs->seed_list);
361 if (fsid)
362 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
363
364 if (metadata_fsid)
365 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
366 else if (fsid)
367 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
368
369 return fs_devs;
370 }
371
372 void btrfs_free_device(struct btrfs_device *device)
373 {
374 WARN_ON(!list_empty(&device->post_commit_list));
375 rcu_string_free(device->name);
376 extent_io_tree_release(&device->alloc_state);
377 bio_put(device->flush_bio);
378 kfree(device);
379 }
380
381 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
382 {
383 struct btrfs_device *device;
384 WARN_ON(fs_devices->opened);
385 while (!list_empty(&fs_devices->devices)) {
386 device = list_entry(fs_devices->devices.next,
387 struct btrfs_device, dev_list);
388 list_del(&device->dev_list);
389 btrfs_free_device(device);
390 }
391 kfree(fs_devices);
392 }
393
394 void __exit btrfs_cleanup_fs_uuids(void)
395 {
396 struct btrfs_fs_devices *fs_devices;
397
398 while (!list_empty(&fs_uuids)) {
399 fs_devices = list_entry(fs_uuids.next,
400 struct btrfs_fs_devices, fs_list);
401 list_del(&fs_devices->fs_list);
402 free_fs_devices(fs_devices);
403 }
404 }
405
406 /*
407 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
408 * Returned struct is not linked onto any lists and must be destroyed using
409 * btrfs_free_device.
410 */
411 static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
412 {
413 struct btrfs_device *dev;
414
415 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
416 if (!dev)
417 return ERR_PTR(-ENOMEM);
418
419 /*
420 * Preallocate a bio that's always going to be used for flushing device
421 * barriers and matches the device lifespan
422 */
423 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
424 if (!dev->flush_bio) {
425 kfree(dev);
426 return ERR_PTR(-ENOMEM);
427 }
428
429 INIT_LIST_HEAD(&dev->dev_list);
430 INIT_LIST_HEAD(&dev->dev_alloc_list);
431 INIT_LIST_HEAD(&dev->post_commit_list);
432
433 atomic_set(&dev->reada_in_flight, 0);
434 atomic_set(&dev->dev_stats_ccnt, 0);
435 btrfs_device_data_ordered_init(dev);
436 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
437 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
438 extent_io_tree_init(fs_info, &dev->alloc_state,
439 IO_TREE_DEVICE_ALLOC_STATE, NULL);
440
441 return dev;
442 }
443
444 static noinline struct btrfs_fs_devices *find_fsid(
445 const u8 *fsid, const u8 *metadata_fsid)
446 {
447 struct btrfs_fs_devices *fs_devices;
448
449 ASSERT(fsid);
450
451 /* Handle non-split brain cases */
452 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
453 if (metadata_fsid) {
454 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
455 && memcmp(metadata_fsid, fs_devices->metadata_uuid,
456 BTRFS_FSID_SIZE) == 0)
457 return fs_devices;
458 } else {
459 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
460 return fs_devices;
461 }
462 }
463 return NULL;
464 }
465
466 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
467 struct btrfs_super_block *disk_super)
468 {
469
470 struct btrfs_fs_devices *fs_devices;
471
472 /*
473 * Handle scanned device having completed its fsid change but
474 * belonging to a fs_devices that was created by first scanning
475 * a device which didn't have its fsid/metadata_uuid changed
476 * at all and the CHANGING_FSID_V2 flag set.
477 */
478 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
479 if (fs_devices->fsid_change &&
480 memcmp(disk_super->metadata_uuid, fs_devices->fsid,
481 BTRFS_FSID_SIZE) == 0 &&
482 memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
483 BTRFS_FSID_SIZE) == 0) {
484 return fs_devices;
485 }
486 }
487 /*
488 * Handle scanned device having completed its fsid change but
489 * belonging to a fs_devices that was created by a device that
490 * has an outdated pair of fsid/metadata_uuid and
491 * CHANGING_FSID_V2 flag set.
492 */
493 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
494 if (fs_devices->fsid_change &&
495 memcmp(fs_devices->metadata_uuid,
496 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
497 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
498 BTRFS_FSID_SIZE) == 0) {
499 return fs_devices;
500 }
501 }
502
503 return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
504 }
505
506
507 static int
508 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
509 int flush, struct block_device **bdev,
510 struct btrfs_super_block **disk_super)
511 {
512 int ret;
513
514 *bdev = blkdev_get_by_path(device_path, flags, holder);
515
516 if (IS_ERR(*bdev)) {
517 ret = PTR_ERR(*bdev);
518 goto error;
519 }
520
521 if (flush)
522 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
523 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
524 if (ret) {
525 blkdev_put(*bdev, flags);
526 goto error;
527 }
528 invalidate_bdev(*bdev);
529 *disk_super = btrfs_read_dev_super(*bdev);
530 if (IS_ERR(*disk_super)) {
531 ret = PTR_ERR(*disk_super);
532 blkdev_put(*bdev, flags);
533 goto error;
534 }
535
536 return 0;
537
538 error:
539 *bdev = NULL;
540 return ret;
541 }
542
543 /*
544 * Check if the device in the path matches the device in the given btrfs_device struct.
545 *
546 * Returns:
547 * true If it is the same device.
548 * false If it is not the same device or on error.
549 */
550 static bool device_matched(const struct btrfs_device *device, const char *path)
551 {
552 char *device_name;
553 struct block_device *bdev_old;
554 struct block_device *bdev_new;
555
556 /*
557 * If the device is missing (it has no name), it cannot match the given
558 * path, so skip it.
559 */
560 if (!device->name)
561 return false;
562
563 device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
564 if (!device_name)
565 return false;
566
567 rcu_read_lock();
568 scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
569 rcu_read_unlock();
570
571 bdev_old = lookup_bdev(device_name);
572 kfree(device_name);
573 if (IS_ERR(bdev_old))
574 return false;
575
576 bdev_new = lookup_bdev(path);
577 if (IS_ERR(bdev_new))
578 return false;
579
580 if (bdev_old == bdev_new)
581 return true;
582
583 return false;
584 }
585
586 /*
587 * Search and remove all stale (devices which are not mounted) devices.
588 * When both inputs are NULL, it will search and release all stale devices.
589 * path: Optional. When provided, it will release all unmounted devices
590 * matching this path only.
591 * skip_dev: Optional. Will skip this device when searching for the stale
592 * devices.
593 * Return: 0 for success or if @path is NULL.
594 * -EBUSY if @path is a mounted device.
595 * -ENOENT if @path does not match any device in the list.
596 */
597 static int btrfs_free_stale_devices(const char *path,
598 struct btrfs_device *skip_device)
599 {
600 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
601 struct btrfs_device *device, *tmp_device;
602 int ret = 0;
603
604 lockdep_assert_held(&uuid_mutex);
605
606 if (path)
607 ret = -ENOENT;
608
609 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
610
611 mutex_lock(&fs_devices->device_list_mutex);
612 list_for_each_entry_safe(device, tmp_device,
613 &fs_devices->devices, dev_list) {
614 if (skip_device && skip_device == device)
615 continue;
616 if (path && !device_matched(device, path))
617 continue;
618 if (fs_devices->opened) {
619 /* for an already deleted device return 0 */
620 if (path && ret != 0)
621 ret = -EBUSY;
622 break;
623 }
624
625 /* delete the stale device */
626 fs_devices->num_devices--;
627 list_del(&device->dev_list);
628 btrfs_free_device(device);
629
630 ret = 0;
631 }
632 mutex_unlock(&fs_devices->device_list_mutex);
633
634 if (fs_devices->num_devices == 0) {
635 btrfs_sysfs_remove_fsid(fs_devices);
636 list_del(&fs_devices->fs_list);
637 free_fs_devices(fs_devices);
638 }
639 }
640
641 return ret;
642 }
643
644 /*
645 * This is only used on mount, and we are protected from competing things
646 * messing with our fs_devices by the uuid_mutex, thus we do not need the
647 * fs_devices->device_list_mutex here.
648 */
649 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
650 struct btrfs_device *device, fmode_t flags,
651 void *holder)
652 {
653 struct request_queue *q;
654 struct block_device *bdev;
655 struct btrfs_super_block *disk_super;
656 u64 devid;
657 int ret;
658
659 if (device->bdev)
660 return -EINVAL;
661 if (!device->name)
662 return -EINVAL;
663
664 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
665 &bdev, &disk_super);
666 if (ret)
667 return ret;
668
669 devid = btrfs_stack_device_id(&disk_super->dev_item);
670 if (devid != device->devid)
671 goto error_free_page;
672
673 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
674 goto error_free_page;
675
676 device->generation = btrfs_super_generation(disk_super);
677
678 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
679 if (btrfs_super_incompat_flags(disk_super) &
680 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
681 pr_err(
682 "BTRFS: Invalid seeding and uuid-changed device detected\n");
683 goto error_free_page;
684 }
685
686 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
687 fs_devices->seeding = true;
688 } else {
689 if (bdev_read_only(bdev))
690 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
691 else
692 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
693 }
694
695 q = bdev_get_queue(bdev);
696 if (!blk_queue_nonrot(q))
697 fs_devices->rotating = true;
698
699 device->bdev = bdev;
700 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
701 device->mode = flags;
702
703 fs_devices->open_devices++;
704 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
705 device->devid != BTRFS_DEV_REPLACE_DEVID) {
706 fs_devices->rw_devices++;
707 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
708 }
709 btrfs_release_disk_super(disk_super);
710
711 return 0;
712
713 error_free_page:
714 btrfs_release_disk_super(disk_super);
715 blkdev_put(bdev, flags);
716
717 return -EINVAL;
718 }
719
720 /*
721 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
722 * being created with a disk that has already completed its fsid change. Such
723 * disk can belong to an fs which has its FSID changed or to one which doesn't.
724 * Handle both cases here.
725 */
726 static struct btrfs_fs_devices *find_fsid_inprogress(
727 struct btrfs_super_block *disk_super)
728 {
729 struct btrfs_fs_devices *fs_devices;
730
731 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
732 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
733 BTRFS_FSID_SIZE) != 0 &&
734 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
735 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
736 return fs_devices;
737 }
738 }
739
740 return find_fsid(disk_super->fsid, NULL);
741 }
742
743
744 static struct btrfs_fs_devices *find_fsid_changed(
745 struct btrfs_super_block *disk_super)
746 {
747 struct btrfs_fs_devices *fs_devices;
748
749 /*
750 * Handles the case where scanned device is part of an fs that had
751 * multiple successful changes of FSID but currently the device didn't
752 * observe it. Meaning our fsid will be different than theirs. We need
753 * to handle two subcases :
754 * 1 - The fs still continues to have different METADATA/FSID uuids.
755 * 2 - The fs is switched back to its original FSID (METADATA/FSID
756 * are equal).
757 */
758 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
759 /* Changed UUIDs */
760 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
761 BTRFS_FSID_SIZE) != 0 &&
762 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
763 BTRFS_FSID_SIZE) == 0 &&
764 memcmp(fs_devices->fsid, disk_super->fsid,
765 BTRFS_FSID_SIZE) != 0)
766 return fs_devices;
767
768 /* Unchanged UUIDs */
769 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
770 BTRFS_FSID_SIZE) == 0 &&
771 memcmp(fs_devices->fsid, disk_super->metadata_uuid,
772 BTRFS_FSID_SIZE) == 0)
773 return fs_devices;
774 }
775
776 return NULL;
777 }
778
779 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
780 struct btrfs_super_block *disk_super)
781 {
782 struct btrfs_fs_devices *fs_devices;
783
784 /*
785 * Handle the case where the scanned device is part of an fs whose last
786 * metadata UUID change reverted it to the original FSID. At the same
787 * time fs_devices was first created by another constituent device
788 * which didn't fully observe the operation. This results in a
789 * btrfs_fs_devices created with metadata/fsid different AND
790 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
791 * fs_devices equal to the FSID of the disk.
792 */
793 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
794 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
795 BTRFS_FSID_SIZE) != 0 &&
796 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
797 BTRFS_FSID_SIZE) == 0 &&
798 fs_devices->fsid_change)
799 return fs_devices;
800 }
801
802 return NULL;
803 }
804 /*
805 * Add new device to list of registered devices
806 *
807 * Returns:
808 * device pointer which was just added or updated when successful
809 * error pointer when failed
810 */
811 static noinline struct btrfs_device *device_list_add(const char *path,
812 struct btrfs_super_block *disk_super,
813 bool *new_device_added)
814 {
815 struct btrfs_device *device;
816 struct btrfs_fs_devices *fs_devices = NULL;
817 struct rcu_string *name;
818 u64 found_transid = btrfs_super_generation(disk_super);
819 u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
820 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
821 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
822 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
823 BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
824
825 if (fsid_change_in_progress) {
826 if (!has_metadata_uuid)
827 fs_devices = find_fsid_inprogress(disk_super);
828 else
829 fs_devices = find_fsid_changed(disk_super);
830 } else if (has_metadata_uuid) {
831 fs_devices = find_fsid_with_metadata_uuid(disk_super);
832 } else {
833 fs_devices = find_fsid_reverted_metadata(disk_super);
834 if (!fs_devices)
835 fs_devices = find_fsid(disk_super->fsid, NULL);
836 }
837
838
839 if (!fs_devices) {
840 if (has_metadata_uuid)
841 fs_devices = alloc_fs_devices(disk_super->fsid,
842 disk_super->metadata_uuid);
843 else
844 fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
845
846 if (IS_ERR(fs_devices))
847 return ERR_CAST(fs_devices);
848
849 fs_devices->fsid_change = fsid_change_in_progress;
850
851 mutex_lock(&fs_devices->device_list_mutex);
852 list_add(&fs_devices->fs_list, &fs_uuids);
853
854 device = NULL;
855 } else {
856 mutex_lock(&fs_devices->device_list_mutex);
857 device = btrfs_find_device(fs_devices, devid,
858 disk_super->dev_item.uuid, NULL, false);
859
860 /*
861 * If this disk has been pulled into an fs devices created by
862 * a device which had the CHANGING_FSID_V2 flag then replace the
863 * metadata_uuid/fsid values of the fs_devices.
864 */
865 if (fs_devices->fsid_change &&
866 found_transid > fs_devices->latest_generation) {
867 memcpy(fs_devices->fsid, disk_super->fsid,
868 BTRFS_FSID_SIZE);
869
870 if (has_metadata_uuid)
871 memcpy(fs_devices->metadata_uuid,
872 disk_super->metadata_uuid,
873 BTRFS_FSID_SIZE);
874 else
875 memcpy(fs_devices->metadata_uuid,
876 disk_super->fsid, BTRFS_FSID_SIZE);
877
878 fs_devices->fsid_change = false;
879 }
880 }
881
882 if (!device) {
883 if (fs_devices->opened) {
884 mutex_unlock(&fs_devices->device_list_mutex);
885 return ERR_PTR(-EBUSY);
886 }
887
888 device = btrfs_alloc_device(NULL, &devid,
889 disk_super->dev_item.uuid);
890 if (IS_ERR(device)) {
891 mutex_unlock(&fs_devices->device_list_mutex);
892 /* we can safely leave the fs_devices entry around */
893 return device;
894 }
895
896 name = rcu_string_strdup(path, GFP_NOFS);
897 if (!name) {
898 btrfs_free_device(device);
899 mutex_unlock(&fs_devices->device_list_mutex);
900 return ERR_PTR(-ENOMEM);
901 }
902 rcu_assign_pointer(device->name, name);
903
904 list_add_rcu(&device->dev_list, &fs_devices->devices);
905 fs_devices->num_devices++;
906
907 device->fs_devices = fs_devices;
908 *new_device_added = true;
909
910 if (disk_super->label[0])
911 pr_info(
912 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
913 disk_super->label, devid, found_transid, path,
914 current->comm, task_pid_nr(current));
915 else
916 pr_info(
917 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
918 disk_super->fsid, devid, found_transid, path,
919 current->comm, task_pid_nr(current));
920
921 } else if (!device->name || strcmp(device->name->str, path)) {
922 /*
923 * When FS is already mounted.
924 * 1. If you are here and if the device->name is NULL that
925 * means this device was missing at time of FS mount.
926 * 2. If you are here and if the device->name is different
927 * from 'path' that means either
928 * a. The same device disappeared and reappeared with
929 * different name. or
930 * b. The missing-disk-which-was-replaced, has
931 * reappeared now.
932 *
933 * We must allow 1 and 2a above. But 2b would be spurious
934 * and unintentional.
935 *
936 * Further in case of 1 and 2a above, the disk at 'path'
937 * would have missed some transaction when it was away and
938 * in case of 2a the stale bdev has to be updated as well.
939 * 2b must not be allowed at any time.
940 */
941
942 /*
943 * For now, we do allow update to btrfs_fs_device through the
944 * btrfs dev scan cli after FS has been mounted. We're still
945 * tracking a problem where systems fail mount by subvolume id
946 * when we reject replacement on a mounted FS.
947 */
948 if (!fs_devices->opened && found_transid < device->generation) {
949 /*
950 * That is if the FS is _not_ mounted and if you
951 * are here, that means there is more than one
952 * disk with the same uuid and devid. We keep the one
953 * with the larger generation number or the last-in if
954 * generations are equal.
955 */
956 mutex_unlock(&fs_devices->device_list_mutex);
957 return ERR_PTR(-EEXIST);
958 }
959
960 /*
961 * We are going to replace the device path for a given devid,
962 * make sure it's the same device if the device is mounted
963 */
964 if (device->bdev) {
965 struct block_device *path_bdev;
966
967 path_bdev = lookup_bdev(path);
968 if (IS_ERR(path_bdev)) {
969 mutex_unlock(&fs_devices->device_list_mutex);
970 return ERR_CAST(path_bdev);
971 }
972
973 if (device->bdev != path_bdev) {
974 bdput(path_bdev);
975 mutex_unlock(&fs_devices->device_list_mutex);
976 /*
977 * device->fs_info may not be reliable here, so
978 * pass in a NULL instead. This avoids a
979 * possible use-after-free when the fs_info and
980 * fs_info->sb are already torn down.
981 */
982 btrfs_warn_in_rcu(NULL,
983 "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
984 path, devid, found_transid,
985 current->comm,
986 task_pid_nr(current));
987 return ERR_PTR(-EEXIST);
988 }
989 bdput(path_bdev);
990 btrfs_info_in_rcu(device->fs_info,
991 "devid %llu device path %s changed to %s scanned by %s (%d)",
992 devid, rcu_str_deref(device->name),
993 path, current->comm,
994 task_pid_nr(current));
995 }
996
997 name = rcu_string_strdup(path, GFP_NOFS);
998 if (!name) {
999 mutex_unlock(&fs_devices->device_list_mutex);
1000 return ERR_PTR(-ENOMEM);
1001 }
1002 rcu_string_free(device->name);
1003 rcu_assign_pointer(device->name, name);
1004 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1005 fs_devices->missing_devices--;
1006 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1007 }
1008 }
1009
1010 /*
1011 * Unmount does not free the btrfs_device struct but would zero
1012 * generation along with most of the other members. So just update
1013 * it back. We need it to pick the disk with largest generation
1014 * (as above).
1015 */
1016 if (!fs_devices->opened) {
1017 device->generation = found_transid;
1018 fs_devices->latest_generation = max_t(u64, found_transid,
1019 fs_devices->latest_generation);
1020 }
1021
1022 fs_devices->total_devices = btrfs_super_num_devices(disk_super);
1023
1024 mutex_unlock(&fs_devices->device_list_mutex);
1025 return device;
1026 }
1027
1028 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
1029 {
1030 struct btrfs_fs_devices *fs_devices;
1031 struct btrfs_device *device;
1032 struct btrfs_device *orig_dev;
1033 int ret = 0;
1034
1035 lockdep_assert_held(&uuid_mutex);
1036
1037 fs_devices = alloc_fs_devices(orig->fsid, NULL);
1038 if (IS_ERR(fs_devices))
1039 return fs_devices;
1040
1041 fs_devices->total_devices = orig->total_devices;
1042
1043 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1044 struct rcu_string *name;
1045
1046 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1047 orig_dev->uuid);
1048 if (IS_ERR(device)) {
1049 ret = PTR_ERR(device);
1050 goto error;
1051 }
1052
1053 /*
1054 * This is ok to do without rcu read locked because we hold the
1055 * uuid mutex so nothing we touch in here is going to disappear.
1056 */
1057 if (orig_dev->name) {
1058 name = rcu_string_strdup(orig_dev->name->str,
1059 GFP_KERNEL);
1060 if (!name) {
1061 btrfs_free_device(device);
1062 ret = -ENOMEM;
1063 goto error;
1064 }
1065 rcu_assign_pointer(device->name, name);
1066 }
1067
1068 list_add(&device->dev_list, &fs_devices->devices);
1069 device->fs_devices = fs_devices;
1070 fs_devices->num_devices++;
1071 }
1072 return fs_devices;
1073 error:
1074 free_fs_devices(fs_devices);
1075 return ERR_PTR(ret);
1076 }
1077
1078 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1079 int step, struct btrfs_device **latest_dev)
1080 {
1081 struct btrfs_device *device, *next;
1082
1083 /* This is the initialized path, it is safe to release the devices. */
1084 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1085 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1086 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1087 &device->dev_state) &&
1088 !test_bit(BTRFS_DEV_STATE_MISSING,
1089 &device->dev_state) &&
1090 (!*latest_dev ||
1091 device->generation > (*latest_dev)->generation)) {
1092 *latest_dev = device;
1093 }
1094 continue;
1095 }
1096
1097 /*
1098 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1099 * in btrfs_init_dev_replace() so just continue.
1100 */
1101 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1102 continue;
1103
1104 if (device->bdev) {
1105 blkdev_put(device->bdev, device->mode);
1106 device->bdev = NULL;
1107 fs_devices->open_devices--;
1108 }
1109 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1110 list_del_init(&device->dev_alloc_list);
1111 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1112 fs_devices->rw_devices--;
1113 }
1114 list_del_init(&device->dev_list);
1115 fs_devices->num_devices--;
1116 btrfs_free_device(device);
1117 }
1118
1119 }
1120
1121 /*
1122 * After we have read the system tree and know devids belonging to this
1123 * filesystem, remove the devices which do not belong there.
1124 */
1125 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1126 {
1127 struct btrfs_device *latest_dev = NULL;
1128 struct btrfs_fs_devices *seed_dev;
1129
1130 mutex_lock(&uuid_mutex);
1131 __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1132
1133 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1134 __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
1135
1136 fs_devices->latest_bdev = latest_dev->bdev;
1137
1138 mutex_unlock(&uuid_mutex);
1139 }
1140
1141 static void btrfs_close_bdev(struct btrfs_device *device)
1142 {
1143 if (!device->bdev)
1144 return;
1145
1146 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1147 sync_blockdev(device->bdev);
1148 invalidate_bdev(device->bdev);
1149 }
1150
1151 blkdev_put(device->bdev, device->mode);
1152 }
1153
1154 static void btrfs_close_one_device(struct btrfs_device *device)
1155 {
1156 struct btrfs_fs_devices *fs_devices = device->fs_devices;
1157
1158 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1159 device->devid != BTRFS_DEV_REPLACE_DEVID) {
1160 list_del_init(&device->dev_alloc_list);
1161 fs_devices->rw_devices--;
1162 }
1163
1164 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1165 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1166
1167 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1168 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1169 fs_devices->missing_devices--;
1170 }
1171
1172 btrfs_close_bdev(device);
1173 if (device->bdev) {
1174 fs_devices->open_devices--;
1175 device->bdev = NULL;
1176 }
1177 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1178
1179 device->fs_info = NULL;
1180 atomic_set(&device->dev_stats_ccnt, 0);
1181 extent_io_tree_release(&device->alloc_state);
1182
1183 /*
1184 * Reset the flush error record. We might have a transient flush error
1185 * in this mount, and if so we aborted the current transaction and set
1186 * the fs to an error state, guaranteeing no super blocks can be further
1187 * committed. However that error might be transient and if we unmount the
1188 * filesystem and mount it again, we should allow the mount to succeed
1189 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1190 * filesystem again we still get flush errors, then we will again abort
1191 * any transaction and set the error state, guaranteeing no commits of
1192 * unsafe super blocks.
1193 */
1194 device->last_flush_error = 0;
1195
1196 /* Verify the device is back in a pristine state */
1197 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1198 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1199 ASSERT(list_empty(&device->dev_alloc_list));
1200 ASSERT(list_empty(&device->post_commit_list));
1201 ASSERT(atomic_read(&device->reada_in_flight) == 0);
1202 }
1203
1204 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1205 {
1206 struct btrfs_device *device, *tmp;
1207
1208 lockdep_assert_held(&uuid_mutex);
1209
1210 if (--fs_devices->opened > 0)
1211 return;
1212
1213 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1214 btrfs_close_one_device(device);
1215
1216 WARN_ON(fs_devices->open_devices);
1217 WARN_ON(fs_devices->rw_devices);
1218 fs_devices->opened = 0;
1219 fs_devices->seeding = false;
1220 fs_devices->fs_info = NULL;
1221 }
1222
1223 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1224 {
1225 LIST_HEAD(list);
1226 struct btrfs_fs_devices *tmp;
1227
1228 mutex_lock(&uuid_mutex);
1229 close_fs_devices(fs_devices);
1230 if (!fs_devices->opened)
1231 list_splice_init(&fs_devices->seed_list, &list);
1232
1233 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1234 close_fs_devices(fs_devices);
1235 list_del(&fs_devices->seed_list);
1236 free_fs_devices(fs_devices);
1237 }
1238 mutex_unlock(&uuid_mutex);
1239 }
1240
1241 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1242 fmode_t flags, void *holder)
1243 {
1244 struct btrfs_device *device;
1245 struct btrfs_device *latest_dev = NULL;
1246 struct btrfs_device *tmp_device;
1247
1248 flags |= FMODE_EXCL;
1249
1250 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1251 dev_list) {
1252 int ret;
1253
1254 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1255 if (ret == 0 &&
1256 (!latest_dev || device->generation > latest_dev->generation)) {
1257 latest_dev = device;
1258 } else if (ret == -ENODATA) {
1259 fs_devices->num_devices--;
1260 list_del(&device->dev_list);
1261 btrfs_free_device(device);
1262 }
1263 }
1264 if (fs_devices->open_devices == 0)
1265 return -EINVAL;
1266
1267 fs_devices->opened = 1;
1268 fs_devices->latest_bdev = latest_dev->bdev;
1269 fs_devices->total_rw_bytes = 0;
1270 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1271
1272 return 0;
1273 }
1274
1275 static int devid_cmp(void *priv, const struct list_head *a,
1276 const struct list_head *b)
1277 {
1278 struct btrfs_device *dev1, *dev2;
1279
1280 dev1 = list_entry(a, struct btrfs_device, dev_list);
1281 dev2 = list_entry(b, struct btrfs_device, dev_list);
1282
1283 if (dev1->devid < dev2->devid)
1284 return -1;
1285 else if (dev1->devid > dev2->devid)
1286 return 1;
1287 return 0;
1288 }
1289
1290 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1291 fmode_t flags, void *holder)
1292 {
1293 int ret;
1294
1295 lockdep_assert_held(&uuid_mutex);
1296 /*
1297 * The device_list_mutex cannot be taken here in case opening the
1298 * underlying device takes further locks like bd_mutex.
1299 *
1300 * We also don't need the lock here as this is called during mount and
1301 * exclusion is provided by uuid_mutex
1302 */
1303
1304 if (fs_devices->opened) {
1305 fs_devices->opened++;
1306 ret = 0;
1307 } else {
1308 list_sort(NULL, &fs_devices->devices, devid_cmp);
1309 ret = open_fs_devices(fs_devices, flags, holder);
1310 }
1311
1312 return ret;
1313 }
1314
1315 void btrfs_release_disk_super(struct btrfs_super_block *super)
1316 {
1317 struct page *page = virt_to_page(super);
1318
1319 put_page(page);
1320 }
1321
1322 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1323 u64 bytenr)
1324 {
1325 struct btrfs_super_block *disk_super;
1326 struct page *page;
1327 void *p;
1328 pgoff_t index;
1329
1330 /* make sure our super fits in the device */
1331 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1332 return ERR_PTR(-EINVAL);
1333
1334 /* make sure our super fits in the page */
1335 if (sizeof(*disk_super) > PAGE_SIZE)
1336 return ERR_PTR(-EINVAL);
1337
1338 /* make sure our super doesn't straddle pages on disk */
1339 index = bytenr >> PAGE_SHIFT;
1340 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1341 return ERR_PTR(-EINVAL);
1342
1343 /* pull in the page with our super */
1344 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1345
1346 if (IS_ERR(page))
1347 return ERR_CAST(page);
1348
1349 p = page_address(page);
1350
1351 /* align our pointer to the offset of the super block */
1352 disk_super = p + offset_in_page(bytenr);
1353
1354 if (btrfs_super_bytenr(disk_super) != bytenr ||
1355 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1356 btrfs_release_disk_super(p);
1357 return ERR_PTR(-EINVAL);
1358 }
1359
1360 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1361 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1362
1363 return disk_super;
1364 }
1365
1366 int btrfs_forget_devices(const char *path)
1367 {
1368 int ret;
1369
1370 mutex_lock(&uuid_mutex);
1371 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1372 mutex_unlock(&uuid_mutex);
1373
1374 return ret;
1375 }
1376
1377 /*
1378 * Look for a btrfs signature on a device. This may be called out of the mount path
1379 * and we are not allowed to call set_blocksize during the scan. The superblock
1380 * is read via pagecache
1381 */
1382 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1383 void *holder)
1384 {
1385 struct btrfs_super_block *disk_super;
1386 bool new_device_added = false;
1387 struct btrfs_device *device = NULL;
1388 struct block_device *bdev;
1389 u64 bytenr;
1390
1391 lockdep_assert_held(&uuid_mutex);
1392
1393 /*
1394 * we would like to check all the supers, but that would make
1395 * a btrfs mount succeed after a mkfs from a different FS.
1396 * So, we need to add a special mount option to scan for
1397 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1398 */
1399 bytenr = btrfs_sb_offset(0);
1400 flags |= FMODE_EXCL;
1401
1402 bdev = blkdev_get_by_path(path, flags, holder);
1403 if (IS_ERR(bdev))
1404 return ERR_CAST(bdev);
1405
1406 disk_super = btrfs_read_disk_super(bdev, bytenr);
1407 if (IS_ERR(disk_super)) {
1408 device = ERR_CAST(disk_super);
1409 goto error_bdev_put;
1410 }
1411
1412 device = device_list_add(path, disk_super, &new_device_added);
1413 if (!IS_ERR(device)) {
1414 if (new_device_added)
1415 btrfs_free_stale_devices(path, device);
1416 }
1417
1418 btrfs_release_disk_super(disk_super);
1419
1420 error_bdev_put:
1421 blkdev_put(bdev, flags);
1422
1423 return device;
1424 }
1425
1426 /*
1427 * Try to find a chunk that intersects [start, start + len] range and when one
1428 * such is found, record the end of it in *start
1429 */
1430 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1431 u64 len)
1432 {
1433 u64 physical_start, physical_end;
1434
1435 lockdep_assert_held(&device->fs_info->chunk_mutex);
1436
1437 if (!find_first_extent_bit(&device->alloc_state, *start,
1438 &physical_start, &physical_end,
1439 CHUNK_ALLOCATED, NULL)) {
1440
1441 if (in_range(physical_start, *start, len) ||
1442 in_range(*start, physical_start,
1443 physical_end - physical_start)) {
1444 *start = physical_end + 1;
1445 return true;
1446 }
1447 }
1448 return false;
1449 }
1450
1451 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1452 {
1453 switch (device->fs_devices->chunk_alloc_policy) {
1454 case BTRFS_CHUNK_ALLOC_REGULAR:
1455 /*
1456 * We don't want to overwrite the superblock on the drive nor
1457 * any area used by the boot loader (grub for example), so we
1458 * make sure to start at an offset of at least 1MB.
1459 */
1460 return max_t(u64, start, SZ_1M);
1461 default:
1462 BUG();
1463 }
1464 }
1465
1466 /**
1467 * dev_extent_hole_check - check if specified hole is suitable for allocation
1468 * @device: the device which has the hole
1469 * @hole_start: starting position of the hole
1470 * @hole_size: the size of the hole
1471 * @num_bytes: the size of the free space that we need
1472 *
1473 * This function may modify @hole_start and @hole_size to reflect the suitable
1474 * position for allocation. Returns true if the hole position is updated, false otherwise.
1475 */
1476 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1477 u64 *hole_size, u64 num_bytes)
1478 {
1479 bool changed = false;
1480 u64 hole_end = *hole_start + *hole_size;
1481
1482 /*
1483 * Check before we set max_hole_start, otherwise we could end up
1484 * sending back this offset anyway.
1485 */
1486 if (contains_pending_extent(device, hole_start, *hole_size)) {
1487 if (hole_end >= *hole_start)
1488 *hole_size = hole_end - *hole_start;
1489 else
1490 *hole_size = 0;
1491 changed = true;
1492 }
1493
1494 switch (device->fs_devices->chunk_alloc_policy) {
1495 case BTRFS_CHUNK_ALLOC_REGULAR:
1496 /* No extra check */
1497 break;
1498 default:
1499 BUG();
1500 }
1501
1502 return changed;
1503 }
1504
1505 /*
1506 * find_free_dev_extent_start - find free space in the specified device
1507 * @device: the device which we search the free space in
1508 * @num_bytes: the size of the free space that we need
1509 * @search_start: the position from which to begin the search
1510 * @start: store the start of the free space.
1511 * @len: the size of the free space that we find, or the size
1512 * of the max free space if we don't find suitable free space
1513 *
1514 * this uses a pretty simple search, the expectation is that it is
1515 * called very infrequently and that a given device has a small number
1516 * of extents
1517 *
1518 * @start is used to store the start of the free space if we find one. But if we
1519 * don't find suitable free space, it will be used to store the start position
1520 * of the max free space.
1521 *
1522 * @len is used to store the size of the free space that we find.
1523 * But if we don't find suitable free space, it is used to store the size of
1524 * the max free space.
1525 *
1526 * NOTE: This function will search *commit* root of device tree, and does extra
1527 * check to ensure dev extents are not double allocated.
1528 * This makes the function safe to allocate dev extents but may not report
1529 * correct usable device space, as device extent freed in current transaction
1530 * is not reported as available.
1531 */
1532 static int find_free_dev_extent_start(struct btrfs_device *device,
1533 u64 num_bytes, u64 search_start, u64 *start,
1534 u64 *len)
1535 {
1536 struct btrfs_fs_info *fs_info = device->fs_info;
1537 struct btrfs_root *root = fs_info->dev_root;
1538 struct btrfs_key key;
1539 struct btrfs_dev_extent *dev_extent;
1540 struct btrfs_path *path;
1541 u64 hole_size;
1542 u64 max_hole_start;
1543 u64 max_hole_size;
1544 u64 extent_end;
1545 u64 search_end = device->total_bytes;
1546 int ret;
1547 int slot;
1548 struct extent_buffer *l;
1549
1550 search_start = dev_extent_search_start(device, search_start);
1551
1552 path = btrfs_alloc_path();
1553 if (!path)
1554 return -ENOMEM;
1555
1556 max_hole_start = search_start;
1557 max_hole_size = 0;
1558
1559 again:
1560 if (search_start >= search_end ||
1561 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1562 ret = -ENOSPC;
1563 goto out;
1564 }
1565
1566 path->reada = READA_FORWARD;
1567 path->search_commit_root = 1;
1568 path->skip_locking = 1;
1569
1570 key.objectid = device->devid;
1571 key.offset = search_start;
1572 key.type = BTRFS_DEV_EXTENT_KEY;
1573
1574 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1575 if (ret < 0)
1576 goto out;
1577 if (ret > 0) {
1578 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1579 if (ret < 0)
1580 goto out;
1581 }
1582
1583 while (1) {
1584 l = path->nodes[0];
1585 slot = path->slots[0];
1586 if (slot >= btrfs_header_nritems(l)) {
1587 ret = btrfs_next_leaf(root, path);
1588 if (ret == 0)
1589 continue;
1590 if (ret < 0)
1591 goto out;
1592
1593 break;
1594 }
1595 btrfs_item_key_to_cpu(l, &key, slot);
1596
1597 if (key.objectid < device->devid)
1598 goto next;
1599
1600 if (key.objectid > device->devid)
1601 break;
1602
1603 if (key.type != BTRFS_DEV_EXTENT_KEY)
1604 goto next;
1605
1606 if (key.offset > search_start) {
1607 hole_size = key.offset - search_start;
1608 dev_extent_hole_check(device, &search_start, &hole_size,
1609 num_bytes);
1610
1611 if (hole_size > max_hole_size) {
1612 max_hole_start = search_start;
1613 max_hole_size = hole_size;
1614 }
1615
1616 /*
1617 * If this free space is greater than which we need,
1618 * it must be the max free space that we have found
1619 * until now, so max_hole_start must point to the start
1620 * of this free space and the length of this free space
1621 * is stored in max_hole_size. Thus, we return
1622 * max_hole_start and max_hole_size and go back to the
1623 * caller.
1624 */
1625 if (hole_size >= num_bytes) {
1626 ret = 0;
1627 goto out;
1628 }
1629 }
1630
1631 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1632 extent_end = key.offset + btrfs_dev_extent_length(l,
1633 dev_extent);
1634 if (extent_end > search_start)
1635 search_start = extent_end;
1636 next:
1637 path->slots[0]++;
1638 cond_resched();
1639 }
1640
1641 /*
1642 * At this point, search_start should be the end of
1643 * allocated dev extents, and when shrinking the device,
1644 * search_end may be smaller than search_start.
1645 */
1646 if (search_end > search_start) {
1647 hole_size = search_end - search_start;
1648 if (dev_extent_hole_check(device, &search_start, &hole_size,
1649 num_bytes)) {
1650 btrfs_release_path(path);
1651 goto again;
1652 }
1653
1654 if (hole_size > max_hole_size) {
1655 max_hole_start = search_start;
1656 max_hole_size = hole_size;
1657 }
1658 }
1659
1660 /* See above. */
1661 if (max_hole_size < num_bytes)
1662 ret = -ENOSPC;
1663 else
1664 ret = 0;
1665
1666 out:
1667 btrfs_free_path(path);
1668 *start = max_hole_start;
1669 if (len)
1670 *len = max_hole_size;
1671 return ret;
1672 }
1673
1674 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1675 u64 *start, u64 *len)
1676 {
1677 /* FIXME use last free of some kind */
1678 return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1679 }
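/*
 * Illustrative caller sketch (hypothetical helper, not upstream code): look
 * for a hole large enough for a new dev extent; on -ENOSPC, @len still holds
 * the largest hole found, which callers can use for diagnostics.
 */
static int __maybe_unused find_free_dev_extent_example(struct btrfs_device *device,
							u64 num_bytes)
{
	u64 start;
	u64 len;
	int ret;

	ret = find_free_dev_extent(device, num_bytes, &start, &len);
	if (ret == -ENOSPC)
		pr_debug("BTRFS: devid %llu largest hole is %llu bytes\n",
			 device->devid, len);
	return ret;
}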
1680
1681 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1682 struct btrfs_device *device,
1683 u64 start, u64 *dev_extent_len)
1684 {
1685 struct btrfs_fs_info *fs_info = device->fs_info;
1686 struct btrfs_root *root = fs_info->dev_root;
1687 int ret;
1688 struct btrfs_path *path;
1689 struct btrfs_key key;
1690 struct btrfs_key found_key;
1691 struct extent_buffer *leaf = NULL;
1692 struct btrfs_dev_extent *extent = NULL;
1693
1694 path = btrfs_alloc_path();
1695 if (!path)
1696 return -ENOMEM;
1697
1698 key.objectid = device->devid;
1699 key.offset = start;
1700 key.type = BTRFS_DEV_EXTENT_KEY;
1701 again:
1702 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1703 if (ret > 0) {
1704 ret = btrfs_previous_item(root, path, key.objectid,
1705 BTRFS_DEV_EXTENT_KEY);
1706 if (ret)
1707 goto out;
1708 leaf = path->nodes[0];
1709 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1710 extent = btrfs_item_ptr(leaf, path->slots[0],
1711 struct btrfs_dev_extent);
1712 BUG_ON(found_key.offset > start || found_key.offset +
1713 btrfs_dev_extent_length(leaf, extent) < start);
1714 key = found_key;
1715 btrfs_release_path(path);
1716 goto again;
1717 } else if (ret == 0) {
1718 leaf = path->nodes[0];
1719 extent = btrfs_item_ptr(leaf, path->slots[0],
1720 struct btrfs_dev_extent);
1721 } else {
1722 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1723 goto out;
1724 }
1725
1726 *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1727
1728 ret = btrfs_del_item(trans, root, path);
1729 if (ret) {
1730 btrfs_handle_fs_error(fs_info, ret,
1731 "Failed to remove dev extent item");
1732 } else {
1733 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1734 }
1735 out:
1736 btrfs_free_path(path);
1737 return ret;
1738 }
1739
1740 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1741 struct btrfs_device *device,
1742 u64 chunk_offset, u64 start, u64 num_bytes)
1743 {
1744 int ret;
1745 struct btrfs_path *path;
1746 struct btrfs_fs_info *fs_info = device->fs_info;
1747 struct btrfs_root *root = fs_info->dev_root;
1748 struct btrfs_dev_extent *extent;
1749 struct extent_buffer *leaf;
1750 struct btrfs_key key;
1751
1752 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1753 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1754 path = btrfs_alloc_path();
1755 if (!path)
1756 return -ENOMEM;
1757
1758 key.objectid = device->devid;
1759 key.offset = start;
1760 key.type = BTRFS_DEV_EXTENT_KEY;
1761 ret = btrfs_insert_empty_item(trans, root, path, &key,
1762 sizeof(*extent));
1763 if (ret)
1764 goto out;
1765
1766 leaf = path->nodes[0];
1767 extent = btrfs_item_ptr(leaf, path->slots[0],
1768 struct btrfs_dev_extent);
1769 btrfs_set_dev_extent_chunk_tree(leaf, extent,
1770 BTRFS_CHUNK_TREE_OBJECTID);
1771 btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1772 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1773 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1774
1775 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1776 btrfs_mark_buffer_dirty(leaf);
1777 out:
1778 btrfs_free_path(path);
1779 return ret;
1780 }
1781
1782 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1783 {
1784 struct extent_map_tree *em_tree;
1785 struct extent_map *em;
1786 struct rb_node *n;
1787 u64 ret = 0;
1788
1789 em_tree = &fs_info->mapping_tree;
1790 read_lock(&em_tree->lock);
1791 n = rb_last(&em_tree->map.rb_root);
1792 if (n) {
1793 em = rb_entry(n, struct extent_map, rb_node);
1794 ret = em->start + em->len;
1795 }
1796 read_unlock(&em_tree->lock);
1797
1798 return ret;
1799 }
1800
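/*
 * Find the next free devid by looking up the last DEV_ITEM in the chunk tree
 * and returning its devid + 1 (or 1 if there are no device items yet).
 */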
1801 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1802 u64 *devid_ret)
1803 {
1804 int ret;
1805 struct btrfs_key key;
1806 struct btrfs_key found_key;
1807 struct btrfs_path *path;
1808
1809 path = btrfs_alloc_path();
1810 if (!path)
1811 return -ENOMEM;
1812
1813 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1814 key.type = BTRFS_DEV_ITEM_KEY;
1815 key.offset = (u64)-1;
1816
1817 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1818 if (ret < 0)
1819 goto error;
1820
1821 if (ret == 0) {
1822 /* Corruption */
1823 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1824 ret = -EUCLEAN;
1825 goto error;
1826 }
1827
1828 ret = btrfs_previous_item(fs_info->chunk_root, path,
1829 BTRFS_DEV_ITEMS_OBJECTID,
1830 BTRFS_DEV_ITEM_KEY);
1831 if (ret) {
1832 *devid_ret = 1;
1833 } else {
1834 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1835 path->slots[0]);
1836 *devid_ret = found_key.offset + 1;
1837 }
1838 ret = 0;
1839 error:
1840 btrfs_free_path(path);
1841 return ret;
1842 }
1843
1844 /*
1845 * The device information is stored in the chunk root.
1846 * The btrfs_device struct should be fully filled in.
1847 */
1848 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1849 struct btrfs_device *device)
1850 {
1851 int ret;
1852 struct btrfs_path *path;
1853 struct btrfs_dev_item *dev_item;
1854 struct extent_buffer *leaf;
1855 struct btrfs_key key;
1856 unsigned long ptr;
1857
1858 path = btrfs_alloc_path();
1859 if (!path)
1860 return -ENOMEM;
1861
1862 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1863 key.type = BTRFS_DEV_ITEM_KEY;
1864 key.offset = device->devid;
1865
1866 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1867 &key, sizeof(*dev_item));
1868 if (ret)
1869 goto out;
1870
1871 leaf = path->nodes[0];
1872 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1873
1874 btrfs_set_device_id(leaf, dev_item, device->devid);
1875 btrfs_set_device_generation(leaf, dev_item, 0);
1876 btrfs_set_device_type(leaf, dev_item, device->type);
1877 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1878 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1879 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1880 btrfs_set_device_total_bytes(leaf, dev_item,
1881 btrfs_device_get_disk_total_bytes(device));
1882 btrfs_set_device_bytes_used(leaf, dev_item,
1883 btrfs_device_get_bytes_used(device));
1884 btrfs_set_device_group(leaf, dev_item, 0);
1885 btrfs_set_device_seek_speed(leaf, dev_item, 0);
1886 btrfs_set_device_bandwidth(leaf, dev_item, 0);
1887 btrfs_set_device_start_offset(leaf, dev_item, 0);
1888
1889 ptr = btrfs_device_uuid(dev_item);
1890 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1891 ptr = btrfs_device_fsid(dev_item);
1892 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1893 ptr, BTRFS_FSID_SIZE);
1894 btrfs_mark_buffer_dirty(leaf);
1895
1896 ret = 0;
1897 out:
1898 btrfs_free_path(path);
1899 return ret;
1900 }
1901
1902 /*
1903 * Function to update ctime/mtime for a given device path.
1904 * Mainly used for ctime/mtime based probe like libblkid.
1905 *
1906 * We don't care about errors here, this is just to be kind to userspace.
1907 */
1908 static void update_dev_time(const char *device_path)
1909 {
1910 struct path path;
1911 struct timespec64 now;
1912 int ret;
1913
1914 ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1915 if (ret)
1916 return;
1917
1918 now = current_time(d_inode(path.dentry));
1919 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1920 path_put(&path);
1921 }
1922
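/*
 * Delete the DEV_ITEM for @device from the chunk tree. Runs in its own
 * transaction, which is committed on success.
 */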
1923 static int btrfs_rm_dev_item(struct btrfs_device *device)
1924 {
1925 struct btrfs_root *root = device->fs_info->chunk_root;
1926 int ret;
1927 struct btrfs_path *path;
1928 struct btrfs_key key;
1929 struct btrfs_trans_handle *trans;
1930
1931 path = btrfs_alloc_path();
1932 if (!path)
1933 return -ENOMEM;
1934
1935 trans = btrfs_start_transaction(root, 0);
1936 if (IS_ERR(trans)) {
1937 btrfs_free_path(path);
1938 return PTR_ERR(trans);
1939 }
1940 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1941 key.type = BTRFS_DEV_ITEM_KEY;
1942 key.offset = device->devid;
1943
1944 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1945 if (ret) {
1946 if (ret > 0)
1947 ret = -ENOENT;
1948 btrfs_abort_transaction(trans, ret);
1949 btrfs_end_transaction(trans);
1950 goto out;
1951 }
1952
1953 ret = btrfs_del_item(trans, root, path);
1954 if (ret) {
1955 btrfs_abort_transaction(trans, ret);
1956 btrfs_end_transaction(trans);
1957 }
1958
1959 out:
1960 btrfs_free_path(path);
1961 if (!ret)
1962 ret = btrfs_commit_transaction(trans);
1963 return ret;
1964 }
1965
1966 /*
1967 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1968 * filesystem. It's up to the caller to adjust that number, e.g. for device
1969 * replace.
1970 */
1971 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1972 u64 num_devices)
1973 {
1974 u64 all_avail;
1975 unsigned seq;
1976 int i;
1977
1978 do {
1979 seq = read_seqbegin(&fs_info->profiles_lock);
1980
1981 all_avail = fs_info->avail_data_alloc_bits |
1982 fs_info->avail_system_alloc_bits |
1983 fs_info->avail_metadata_alloc_bits;
1984 } while (read_seqretry(&fs_info->profiles_lock, seq));
1985
1986 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1987 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1988 continue;
1989
1990 if (num_devices < btrfs_raid_array[i].devs_min) {
1991 int ret = btrfs_raid_array[i].mindev_error;
1992
1993 if (ret)
1994 return ret;
1995 }
1996 }
1997
1998 return 0;
1999 }
2000
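/*
 * Return any other device from the list that is present (not missing and has
 * an open bdev), or NULL if there is none.
 */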
2001 static struct btrfs_device * btrfs_find_next_active_device(
2002 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2003 {
2004 struct btrfs_device *next_device;
2005
2006 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2007 if (next_device != device &&
2008 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2009 && next_device->bdev)
2010 return next_device;
2011 }
2012
2013 return NULL;
2014 }
2015
2016 /*
2017 * Helper function to check if the given device is part of s_bdev / latest_bdev
2018 * and replace it with the provided or the next active device. In the context
2019 * where this function is called, there should always be another device (or
2020 * this_dev) which is active.
2021 */
2022 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2023 struct btrfs_device *next_device)
2024 {
2025 struct btrfs_fs_info *fs_info = device->fs_info;
2026
2027 if (!next_device)
2028 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2029 device);
2030 ASSERT(next_device);
2031
2032 if (fs_info->sb->s_bdev &&
2033 (fs_info->sb->s_bdev == device->bdev))
2034 fs_info->sb->s_bdev = next_device->bdev;
2035
2036 if (fs_info->fs_devices->latest_bdev == device->bdev)
2037 fs_info->fs_devices->latest_bdev = next_device->bdev;
2038 }
2039
2040 /*
2041 * Return btrfs_fs_devices::num_devices excluding the device that's being
2042 * currently replaced.
2043 */
2044 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2045 {
2046 u64 num_devices = fs_info->fs_devices->num_devices;
2047
2048 down_read(&fs_info->dev_replace.rwsem);
2049 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2050 ASSERT(num_devices > 1);
2051 num_devices--;
2052 }
2053 up_read(&fs_info->dev_replace.rwsem);
2054
2055 return num_devices;
2056 }
2057
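/*
 * Zero the magic of every superblock copy on @bdev so the device is no longer
 * recognized as a btrfs member, then notify udev and blkid about the change.
 */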
2058 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2059 struct block_device *bdev,
2060 const char *device_path)
2061 {
2062 struct btrfs_super_block *disk_super;
2063 int copy_num;
2064
2065 if (!bdev)
2066 return;
2067
2068 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2069 struct page *page;
2070 int ret;
2071
2072 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2073 if (IS_ERR(disk_super))
2074 continue;
2075
2076 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2077
2078 page = virt_to_page(disk_super);
2079 set_page_dirty(page);
2080 lock_page(page);
2081 /* write_one_page() unlocks the page */
2082 ret = write_one_page(page);
2083 if (ret)
2084 btrfs_warn(fs_info,
2085 "error clearing superblock number %d (%d)",
2086 copy_num, ret);
2087 btrfs_release_disk_super(disk_super);
2088
2089 }
2090
2091 /* Notify udev that device has changed */
2092 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2093
2094 /* Update ctime/mtime for device path for libblkid */
2095 update_dev_time(device_path);
2096 }
2097
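/*
 * Remove a device from a mounted filesystem: shrink it to zero so its chunks
 * are relocated, delete its dev item, drop it from the device lists and wipe
 * its superblocks. @device_path may be "missing" to remove the first missing
 * device, or @devid may be used instead of a path.
 */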
2098 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2099 u64 devid)
2100 {
2101 struct btrfs_device *device;
2102 struct btrfs_fs_devices *cur_devices;
2103 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2104 u64 num_devices;
2105 int ret = 0;
2106
2107 /*
2108 * The device list in fs_devices is accessed without locks (neither
2109 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2110 * filesystem and another device rm cannot run.
2111 */
2112 num_devices = btrfs_num_devices(fs_info);
2113
2114 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2115 if (ret)
2116 goto out;
2117
2118 device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2119
2120 if (IS_ERR(device)) {
2121 if (PTR_ERR(device) == -ENOENT &&
2122 device_path && strcmp(device_path, "missing") == 0)
2123 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2124 else
2125 ret = PTR_ERR(device);
2126 goto out;
2127 }
2128
2129 if (btrfs_pinned_by_swapfile(fs_info, device)) {
2130 btrfs_warn_in_rcu(fs_info,
2131 "cannot remove device %s (devid %llu) due to active swapfile",
2132 rcu_str_deref(device->name), device->devid);
2133 ret = -ETXTBSY;
2134 goto out;
2135 }
2136
2137 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2138 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2139 goto out;
2140 }
2141
2142 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2143 fs_info->fs_devices->rw_devices == 1) {
2144 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2145 goto out;
2146 }
2147
2148 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2149 mutex_lock(&fs_info->chunk_mutex);
2150 list_del_init(&device->dev_alloc_list);
2151 device->fs_devices->rw_devices--;
2152 mutex_unlock(&fs_info->chunk_mutex);
2153 }
2154
2155 ret = btrfs_shrink_device(device, 0);
2156 if (!ret)
2157 btrfs_reada_remove_dev(device);
2158 if (ret)
2159 goto error_undo;
2160
2161 /*
2162 * TODO: the superblock still includes this device in its num_devices
2163 * counter although write_all_supers() is not locked out. This
2164 * could give a filesystem state which requires a degraded mount.
2165 */
2166 ret = btrfs_rm_dev_item(device);
2167 if (ret)
2168 goto error_undo;
2169
2170 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2171 btrfs_scrub_cancel_dev(device);
2172
2173 /*
2174 * the device list mutex makes sure that we don't change
2175 * the device list while someone else is writing out all
2176 * the device supers. Whoever is writing all supers should
2177 * lock the device list mutex before getting the number of
2178 * devices in the super block (super_copy). Conversely,
2179 * whoever updates the number of devices in the super block
2180 * (super_copy) should hold the device list mutex.
2181 */
2182
2183 /*
2184 * In normal cases cur_devices == fs_devices. But when deleting
2185 * a seed device, cur_devices points to the seed's own fs_devices,
2186 * listed under fs_devices->seed_list.
2187 */
2188 cur_devices = device->fs_devices;
2189 mutex_lock(&fs_devices->device_list_mutex);
2190 list_del_rcu(&device->dev_list);
2191
2192 cur_devices->num_devices--;
2193 cur_devices->total_devices--;
2194 /* Update total_devices of the parent fs_devices if it's seed */
2195 if (cur_devices != fs_devices)
2196 fs_devices->total_devices--;
2197
2198 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2199 cur_devices->missing_devices--;
2200
2201 btrfs_assign_next_active_device(device, NULL);
2202
2203 if (device->bdev) {
2204 cur_devices->open_devices--;
2205 /* remove sysfs entry */
2206 btrfs_sysfs_remove_device(device);
2207 }
2208
2209 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2210 btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2211 mutex_unlock(&fs_devices->device_list_mutex);
2212
2213 /*
2214 * at this point, the device is zero sized and detached from
2215 * the devices list. All that's left is to zero out the old
2216 * supers and free the device.
2217 */
2218 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2219 btrfs_scratch_superblocks(fs_info, device->bdev,
2220 device->name->str);
2221
2222 btrfs_close_bdev(device);
2223 synchronize_rcu();
2224 btrfs_free_device(device);
2225
2226 if (cur_devices->open_devices == 0) {
2227 list_del_init(&cur_devices->seed_list);
2228 close_fs_devices(cur_devices);
2229 free_fs_devices(cur_devices);
2230 }
2231
2232 out:
2233 return ret;
2234
2235 error_undo:
2236 btrfs_reada_undo_remove_dev(device);
2237 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2238 mutex_lock(&fs_info->chunk_mutex);
2239 list_add(&device->dev_alloc_list,
2240 &fs_devices->alloc_list);
2241 device->fs_devices->rw_devices++;
2242 mutex_unlock(&fs_info->chunk_mutex);
2243 }
2244 goto out;
2245 }
2246
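/*
 * Unlist the source device once a replace has finished. The caller must hold
 * fs_devices->device_list_mutex (see the lockdep assertion below).
 */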
2247 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2248 {
2249 struct btrfs_fs_devices *fs_devices;
2250
2251 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2252
2253 /*
2254 * In case of a fs with no seed, srcdev->fs_devices will point
2255 * to the fs_devices of fs_info. However, when the dev being replaced is
2256 * a seed dev it will point to the seed's local fs_devices. In short,
2257 * srcdev will have its correct fs_devices in both cases.
2258 */
2259 fs_devices = srcdev->fs_devices;
2260
2261 list_del_rcu(&srcdev->dev_list);
2262 list_del(&srcdev->dev_alloc_list);
2263 fs_devices->num_devices--;
2264 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2265 fs_devices->missing_devices--;
2266
2267 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2268 fs_devices->rw_devices--;
2269
2270 if (srcdev->bdev)
2271 fs_devices->open_devices--;
2272 }
2273
2274 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2275 {
2276 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2277
2278 mutex_lock(&uuid_mutex);
2279
2280 btrfs_close_bdev(srcdev);
2281 synchronize_rcu();
2282 btrfs_free_device(srcdev);
2283
2284 /* if there are no devs left we'd rather delete the fs_devices */
2285 if (!fs_devices->num_devices) {
2286 /*
2287 * On a mounted FS, num_devices can't be zero unless it's a
2288 * seed. In case of a seed device being replaced, the replace
2289 * target is added to the sprout FS, so there will be no more
2290 * devices left under the seed FS.
2291 */
2292 ASSERT(fs_devices->seeding);
2293
2294 list_del_init(&fs_devices->seed_list);
2295 close_fs_devices(fs_devices);
2296 free_fs_devices(fs_devices);
2297 }
2298 mutex_unlock(&uuid_mutex);
2299 }
2300
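/*
 * Tear down the replace target device: drop it from sysfs and the device
 * lists, wipe its superblocks and free it.
 */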
2301 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2302 {
2303 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2304
2305 mutex_lock(&fs_devices->device_list_mutex);
2306
2307 btrfs_sysfs_remove_device(tgtdev);
2308
2309 if (tgtdev->bdev)
2310 fs_devices->open_devices--;
2311
2312 fs_devices->num_devices--;
2313
2314 btrfs_assign_next_active_device(tgtdev, NULL);
2315
2316 list_del_rcu(&tgtdev->dev_list);
2317
2318 mutex_unlock(&fs_devices->device_list_mutex);
2319
2320 /*
2321 * The update_dev_time() within btrfs_scratch_superblocks()
2322 * may lead to a call to btrfs_show_devname() which will try
2323 * to hold device_list_mutex. Here this device
2324 * is already out of the device list, so we don't have to hold
2325 * the device_list_mutex lock.
2326 */
2327 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2328 tgtdev->name->str);
2329
2330 btrfs_close_bdev(tgtdev);
2331 synchronize_rcu();
2332 btrfs_free_device(tgtdev);
2333 }
2334
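/*
 * Open the block device at @device_path read-only, read its superblock and
 * look up the matching btrfs_device by devid and uuid.
 */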
2335 static struct btrfs_device *btrfs_find_device_by_path(
2336 struct btrfs_fs_info *fs_info, const char *device_path)
2337 {
2338 int ret = 0;
2339 struct btrfs_super_block *disk_super;
2340 u64 devid;
2341 u8 *dev_uuid;
2342 struct block_device *bdev;
2343 struct btrfs_device *device;
2344
2345 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2346 fs_info->bdev_holder, 0, &bdev, &disk_super);
2347 if (ret)
2348 return ERR_PTR(ret);
2349
2350 devid = btrfs_stack_device_id(&disk_super->dev_item);
2351 dev_uuid = disk_super->dev_item.uuid;
2352 if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2353 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2354 disk_super->metadata_uuid, true);
2355 else
2356 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2357 disk_super->fsid, true);
2358
2359 btrfs_release_disk_super(disk_super);
2360 if (!device)
2361 device = ERR_PTR(-ENOENT);
2362 blkdev_put(bdev, FMODE_READ);
2363 return device;
2364 }
2365
2366 /*
2367 * Lookup a device given by device id, or the path if the id is 0.
2368 */
2369 struct btrfs_device *btrfs_find_device_by_devspec(
2370 struct btrfs_fs_info *fs_info, u64 devid,
2371 const char *device_path)
2372 {
2373 struct btrfs_device *device;
2374
2375 if (devid) {
2376 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2377 NULL, true);
2378 if (!device)
2379 return ERR_PTR(-ENOENT);
2380 return device;
2381 }
2382
2383 if (!device_path || !device_path[0])
2384 return ERR_PTR(-EINVAL);
2385
2386 if (strcmp(device_path, "missing") == 0) {
2387 /* Find first missing device */
2388 list_for_each_entry(device, &fs_info->fs_devices->devices,
2389 dev_list) {
2390 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2391 &device->dev_state) && !device->bdev)
2392 return device;
2393 }
2394 return ERR_PTR(-ENOENT);
2395 }
2396
2397 return btrfs_find_device_by_path(fs_info, device_path);
2398 }
2399
2400 /*
2401 * does all the dirty work required for changing file system's UUID.
2402 */
2403 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2404 {
2405 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2406 struct btrfs_fs_devices *old_devices;
2407 struct btrfs_fs_devices *seed_devices;
2408 struct btrfs_super_block *disk_super = fs_info->super_copy;
2409 struct btrfs_device *device;
2410 u64 super_flags;
2411
2412 lockdep_assert_held(&uuid_mutex);
2413 if (!fs_devices->seeding)
2414 return -EINVAL;
2415
2416 /*
2417 * Private copy of the seed devices, anchored at
2418 * fs_info->fs_devices->seed_list
2419 */
2420 seed_devices = alloc_fs_devices(NULL, NULL);
2421 if (IS_ERR(seed_devices))
2422 return PTR_ERR(seed_devices);
2423
2424 /*
2425 * It's necessary to retain a copy of the original seed fs_devices in
2426 * fs_uuids so that filesystems which have been seeded can successfully
2427 * reference the seed device from open_seed_devices. This also supports
2428 * multiple seed filesystems.
2429 */
2430 old_devices = clone_fs_devices(fs_devices);
2431 if (IS_ERR(old_devices)) {
2432 kfree(seed_devices);
2433 return PTR_ERR(old_devices);
2434 }
2435
2436 list_add(&old_devices->fs_list, &fs_uuids);
2437
2438 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2439 seed_devices->opened = 1;
2440 INIT_LIST_HEAD(&seed_devices->devices);
2441 INIT_LIST_HEAD(&seed_devices->alloc_list);
2442 mutex_init(&seed_devices->device_list_mutex);
2443
2444 mutex_lock(&fs_devices->device_list_mutex);
2445 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2446 synchronize_rcu);
2447 list_for_each_entry(device, &seed_devices->devices, dev_list)
2448 device->fs_devices = seed_devices;
2449
2450 fs_devices->seeding = false;
2451 fs_devices->num_devices = 0;
2452 fs_devices->open_devices = 0;
2453 fs_devices->missing_devices = 0;
2454 fs_devices->rotating = false;
2455 list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2456
2457 generate_random_uuid(fs_devices->fsid);
2458 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2459 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2460 mutex_unlock(&fs_devices->device_list_mutex);
2461
2462 super_flags = btrfs_super_flags(disk_super) &
2463 ~BTRFS_SUPER_FLAG_SEEDING;
2464 btrfs_set_super_flags(disk_super, super_flags);
2465
2466 return 0;
2467 }
2468
2469 /*
2470 * Store the expected generation for seed devices in device items.
2471 */
2472 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2473 {
2474 struct btrfs_fs_info *fs_info = trans->fs_info;
2475 struct btrfs_root *root = fs_info->chunk_root;
2476 struct btrfs_path *path;
2477 struct extent_buffer *leaf;
2478 struct btrfs_dev_item *dev_item;
2479 struct btrfs_device *device;
2480 struct btrfs_key key;
2481 u8 fs_uuid[BTRFS_FSID_SIZE];
2482 u8 dev_uuid[BTRFS_UUID_SIZE];
2483 u64 devid;
2484 int ret;
2485
2486 path = btrfs_alloc_path();
2487 if (!path)
2488 return -ENOMEM;
2489
2490 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2491 key.offset = 0;
2492 key.type = BTRFS_DEV_ITEM_KEY;
2493
2494 while (1) {
2495 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2496 if (ret < 0)
2497 goto error;
2498
2499 leaf = path->nodes[0];
2500 next_slot:
2501 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2502 ret = btrfs_next_leaf(root, path);
2503 if (ret > 0)
2504 break;
2505 if (ret < 0)
2506 goto error;
2507 leaf = path->nodes[0];
2508 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2509 btrfs_release_path(path);
2510 continue;
2511 }
2512
2513 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2514 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2515 key.type != BTRFS_DEV_ITEM_KEY)
2516 break;
2517
2518 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2519 struct btrfs_dev_item);
2520 devid = btrfs_device_id(leaf, dev_item);
2521 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2522 BTRFS_UUID_SIZE);
2523 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2524 BTRFS_FSID_SIZE);
2525 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2526 fs_uuid, true);
2527 BUG_ON(!device); /* Logic error */
2528
2529 if (device->fs_devices->seeding) {
2530 btrfs_set_device_generation(leaf, dev_item,
2531 device->generation);
2532 btrfs_mark_buffer_dirty(leaf);
2533 }
2534
2535 path->slots[0]++;
2536 goto next_slot;
2537 }
2538 ret = 0;
2539 error:
2540 btrfs_free_path(path);
2541 return ret;
2542 }
2543
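/*
 * Add a new device to a mounted filesystem (the 'btrfs device add' path).
 * If the filesystem is a seed, this also sprouts a new writable filesystem
 * on top of it.
 */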
2544 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2545 {
2546 struct btrfs_root *root = fs_info->dev_root;
2547 struct request_queue *q;
2548 struct btrfs_trans_handle *trans;
2549 struct btrfs_device *device;
2550 struct block_device *bdev;
2551 struct super_block *sb = fs_info->sb;
2552 struct rcu_string *name;
2553 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2554 u64 orig_super_total_bytes;
2555 u64 orig_super_num_devices;
2556 int seeding_dev = 0;
2557 int ret = 0;
2558 bool locked = false;
2559
2560 if (sb_rdonly(sb) && !fs_devices->seeding)
2561 return -EROFS;
2562
2563 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2564 fs_info->bdev_holder);
2565 if (IS_ERR(bdev))
2566 return PTR_ERR(bdev);
2567
2568 if (fs_devices->seeding) {
2569 seeding_dev = 1;
2570 down_write(&sb->s_umount);
2571 mutex_lock(&uuid_mutex);
2572 locked = true;
2573 }
2574
2575 sync_blockdev(bdev);
2576
2577 rcu_read_lock();
2578 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2579 if (device->bdev == bdev) {
2580 ret = -EEXIST;
2581 rcu_read_unlock();
2582 goto error;
2583 }
2584 }
2585 rcu_read_unlock();
2586
2587 device = btrfs_alloc_device(fs_info, NULL, NULL);
2588 if (IS_ERR(device)) {
2589 /* we can safely leave the fs_devices entry around */
2590 ret = PTR_ERR(device);
2591 goto error;
2592 }
2593
2594 name = rcu_string_strdup(device_path, GFP_KERNEL);
2595 if (!name) {
2596 ret = -ENOMEM;
2597 goto error_free_device;
2598 }
2599 rcu_assign_pointer(device->name, name);
2600
2601 trans = btrfs_start_transaction(root, 0);
2602 if (IS_ERR(trans)) {
2603 ret = PTR_ERR(trans);
2604 goto error_free_device;
2605 }
2606
2607 q = bdev_get_queue(bdev);
2608 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2609 device->generation = trans->transid;
2610 device->io_width = fs_info->sectorsize;
2611 device->io_align = fs_info->sectorsize;
2612 device->sector_size = fs_info->sectorsize;
2613 device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2614 fs_info->sectorsize);
2615 device->disk_total_bytes = device->total_bytes;
2616 device->commit_total_bytes = device->total_bytes;
2617 device->fs_info = fs_info;
2618 device->bdev = bdev;
2619 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2620 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2621 device->mode = FMODE_EXCL;
2622 device->dev_stats_valid = 1;
2623 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2624
2625 if (seeding_dev) {
2626 sb->s_flags &= ~SB_RDONLY;
2627 ret = btrfs_prepare_sprout(fs_info);
2628 if (ret) {
2629 btrfs_abort_transaction(trans, ret);
2630 goto error_trans;
2631 }
2632 }
2633
2634 device->fs_devices = fs_devices;
2635
2636 mutex_lock(&fs_devices->device_list_mutex);
2637 mutex_lock(&fs_info->chunk_mutex);
2638 list_add_rcu(&device->dev_list, &fs_devices->devices);
2639 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2640 fs_devices->num_devices++;
2641 fs_devices->open_devices++;
2642 fs_devices->rw_devices++;
2643 fs_devices->total_devices++;
2644 fs_devices->total_rw_bytes += device->total_bytes;
2645
2646 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2647
2648 if (!blk_queue_nonrot(q))
2649 fs_devices->rotating = true;
2650
2651 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2652 btrfs_set_super_total_bytes(fs_info->super_copy,
2653 round_down(orig_super_total_bytes + device->total_bytes,
2654 fs_info->sectorsize));
2655
2656 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2657 btrfs_set_super_num_devices(fs_info->super_copy,
2658 orig_super_num_devices + 1);
2659
2660 /*
2661 * we've got more storage, clear any full flags on the space
2662 * infos
2663 */
2664 btrfs_clear_space_info_full(fs_info);
2665
2666 mutex_unlock(&fs_info->chunk_mutex);
2667
2668 /* Add sysfs device entry */
2669 btrfs_sysfs_add_device(device);
2670
2671 mutex_unlock(&fs_devices->device_list_mutex);
2672
2673 if (seeding_dev) {
2674 mutex_lock(&fs_info->chunk_mutex);
2675 ret = init_first_rw_device(trans);
2676 mutex_unlock(&fs_info->chunk_mutex);
2677 if (ret) {
2678 btrfs_abort_transaction(trans, ret);
2679 goto error_sysfs;
2680 }
2681 }
2682
2683 ret = btrfs_add_dev_item(trans, device);
2684 if (ret) {
2685 btrfs_abort_transaction(trans, ret);
2686 goto error_sysfs;
2687 }
2688
2689 if (seeding_dev) {
2690 ret = btrfs_finish_sprout(trans);
2691 if (ret) {
2692 btrfs_abort_transaction(trans, ret);
2693 goto error_sysfs;
2694 }
2695
2696 /*
2697 * fs_devices now represents the newly sprouted filesystem and
2698 * its fsid has been changed by btrfs_prepare_sprout
2699 */
2700 btrfs_sysfs_update_sprout_fsid(fs_devices);
2701 }
2702
2703 ret = btrfs_commit_transaction(trans);
2704
2705 if (seeding_dev) {
2706 mutex_unlock(&uuid_mutex);
2707 up_write(&sb->s_umount);
2708 locked = false;
2709
2710 if (ret) /* transaction commit */
2711 return ret;
2712
2713 ret = btrfs_relocate_sys_chunks(fs_info);
2714 if (ret < 0)
2715 btrfs_handle_fs_error(fs_info, ret,
2716 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2717 trans = btrfs_attach_transaction(root);
2718 if (IS_ERR(trans)) {
2719 if (PTR_ERR(trans) == -ENOENT)
2720 return 0;
2721 ret = PTR_ERR(trans);
2722 trans = NULL;
2723 goto error_sysfs;
2724 }
2725 ret = btrfs_commit_transaction(trans);
2726 }
2727
2728 /*
2729 * Now that we have written a new super block to this device, check all
2730 * other fs_devices lists to see whether device_path alienates any other
2731 * scanned device.
2732 * We can ignore the return value as it typically returns -EINVAL and
2733 * only succeeds if the device was an alien.
2734 */
2735 btrfs_forget_devices(device_path);
2736
2737 /* Update ctime/mtime for blkid or udev */
2738 update_dev_time(device_path);
2739
2740 return ret;
2741
2742 error_sysfs:
2743 btrfs_sysfs_remove_device(device);
2744 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2745 mutex_lock(&fs_info->chunk_mutex);
2746 list_del_rcu(&device->dev_list);
2747 list_del(&device->dev_alloc_list);
2748 fs_info->fs_devices->num_devices--;
2749 fs_info->fs_devices->open_devices--;
2750 fs_info->fs_devices->rw_devices--;
2751 fs_info->fs_devices->total_devices--;
2752 fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2753 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2754 btrfs_set_super_total_bytes(fs_info->super_copy,
2755 orig_super_total_bytes);
2756 btrfs_set_super_num_devices(fs_info->super_copy,
2757 orig_super_num_devices);
2758 mutex_unlock(&fs_info->chunk_mutex);
2759 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2760 error_trans:
2761 if (seeding_dev)
2762 sb->s_flags |= SB_RDONLY;
2763 if (trans)
2764 btrfs_end_transaction(trans);
2765 error_free_device:
2766 btrfs_free_device(device);
2767 error:
2768 blkdev_put(bdev, FMODE_EXCL);
2769 if (locked) {
2770 mutex_unlock(&uuid_mutex);
2771 up_write(&sb->s_umount);
2772 }
2773 return ret;
2774 }
2775
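/*
 * Write the current in-memory state of @device back into its DEV_ITEM in the
 * chunk tree.
 */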
2776 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2777 struct btrfs_device *device)
2778 {
2779 int ret;
2780 struct btrfs_path *path;
2781 struct btrfs_root *root = device->fs_info->chunk_root;
2782 struct btrfs_dev_item *dev_item;
2783 struct extent_buffer *leaf;
2784 struct btrfs_key key;
2785
2786 path = btrfs_alloc_path();
2787 if (!path)
2788 return -ENOMEM;
2789
2790 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2791 key.type = BTRFS_DEV_ITEM_KEY;
2792 key.offset = device->devid;
2793
2794 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2795 if (ret < 0)
2796 goto out;
2797
2798 if (ret > 0) {
2799 ret = -ENOENT;
2800 goto out;
2801 }
2802
2803 leaf = path->nodes[0];
2804 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2805
2806 btrfs_set_device_id(leaf, dev_item, device->devid);
2807 btrfs_set_device_type(leaf, dev_item, device->type);
2808 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2809 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2810 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2811 btrfs_set_device_total_bytes(leaf, dev_item,
2812 btrfs_device_get_disk_total_bytes(device));
2813 btrfs_set_device_bytes_used(leaf, dev_item,
2814 btrfs_device_get_bytes_used(device));
2815 btrfs_mark_buffer_dirty(leaf);
2816
2817 out:
2818 btrfs_free_path(path);
2819 return ret;
2820 }
2821
2822 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2823 struct btrfs_device *device, u64 new_size)
2824 {
2825 struct btrfs_fs_info *fs_info = device->fs_info;
2826 struct btrfs_super_block *super_copy = fs_info->super_copy;
2827 u64 old_total;
2828 u64 diff;
2829
2830 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2831 return -EACCES;
2832
2833 new_size = round_down(new_size, fs_info->sectorsize);
2834
2835 mutex_lock(&fs_info->chunk_mutex);
2836 old_total = btrfs_super_total_bytes(super_copy);
2837 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2838
2839 if (new_size <= device->total_bytes ||
2840 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2841 mutex_unlock(&fs_info->chunk_mutex);
2842 return -EINVAL;
2843 }
2844
2845 btrfs_set_super_total_bytes(super_copy,
2846 round_down(old_total + diff, fs_info->sectorsize));
2847 device->fs_devices->total_rw_bytes += diff;
2848
2849 btrfs_device_set_total_bytes(device, new_size);
2850 btrfs_device_set_disk_total_bytes(device, new_size);
2851 btrfs_clear_space_info_full(device->fs_info);
2852 if (list_empty(&device->post_commit_list))
2853 list_add_tail(&device->post_commit_list,
2854 &trans->transaction->dev_update_list);
2855 mutex_unlock(&fs_info->chunk_mutex);
2856
2857 return btrfs_update_device(trans, device);
2858 }
2859
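/*
 * Delete the CHUNK_ITEM for @chunk_offset from the chunk tree.
 */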
2860 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2861 {
2862 struct btrfs_fs_info *fs_info = trans->fs_info;
2863 struct btrfs_root *root = fs_info->chunk_root;
2864 int ret;
2865 struct btrfs_path *path;
2866 struct btrfs_key key;
2867
2868 path = btrfs_alloc_path();
2869 if (!path)
2870 return -ENOMEM;
2871
2872 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2873 key.offset = chunk_offset;
2874 key.type = BTRFS_CHUNK_ITEM_KEY;
2875
2876 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2877 if (ret < 0)
2878 goto out;
2879 else if (ret > 0) { /* Logic error or corruption */
2880 btrfs_handle_fs_error(fs_info, -ENOENT,
2881 "Failed lookup while freeing chunk.");
2882 ret = -ENOENT;
2883 goto out;
2884 }
2885
2886 ret = btrfs_del_item(trans, root, path);
2887 if (ret < 0)
2888 btrfs_handle_fs_error(fs_info, ret,
2889 "Failed to delete chunk item.");
2890 out:
2891 btrfs_free_path(path);
2892 return ret;
2893 }
2894
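/*
 * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array,
 * compacting the array in place.
 */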
2895 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2896 {
2897 struct btrfs_super_block *super_copy = fs_info->super_copy;
2898 struct btrfs_disk_key *disk_key;
2899 struct btrfs_chunk *chunk;
2900 u8 *ptr;
2901 int ret = 0;
2902 u32 num_stripes;
2903 u32 array_size;
2904 u32 len = 0;
2905 u32 cur;
2906 struct btrfs_key key;
2907
2908 mutex_lock(&fs_info->chunk_mutex);
2909 array_size = btrfs_super_sys_array_size(super_copy);
2910
2911 ptr = super_copy->sys_chunk_array;
2912 cur = 0;
2913
2914 while (cur < array_size) {
2915 disk_key = (struct btrfs_disk_key *)ptr;
2916 btrfs_disk_key_to_cpu(&key, disk_key);
2917
2918 len = sizeof(*disk_key);
2919
2920 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2921 chunk = (struct btrfs_chunk *)(ptr + len);
2922 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2923 len += btrfs_chunk_item_size(num_stripes);
2924 } else {
2925 ret = -EIO;
2926 break;
2927 }
2928 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2929 key.offset == chunk_offset) {
2930 memmove(ptr, ptr + len, array_size - (cur + len));
2931 array_size -= len;
2932 btrfs_set_super_sys_array_size(super_copy, array_size);
2933 } else {
2934 ptr += len;
2935 cur += len;
2936 }
2937 }
2938 mutex_unlock(&fs_info->chunk_mutex);
2939 return ret;
2940 }
2941
2942 /*
2943 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2944 * @logical: Logical block offset in bytes.
2945 * @length: Length of extent in bytes.
2946 *
2947 * Return: Chunk mapping or ERR_PTR.
2948 */
2949 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2950 u64 logical, u64 length)
2951 {
2952 struct extent_map_tree *em_tree;
2953 struct extent_map *em;
2954
2955 em_tree = &fs_info->mapping_tree;
2956 read_lock(&em_tree->lock);
2957 em = lookup_extent_mapping(em_tree, logical, length);
2958 read_unlock(&em_tree->lock);
2959
2960 if (!em) {
2961 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2962 logical, length);
2963 return ERR_PTR(-EINVAL);
2964 }
2965
2966 if (em->start > logical || em->start + em->len < logical) {
2967 btrfs_crit(fs_info,
2968 "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2969 logical, length, em->start, em->start + em->len);
2970 free_extent_map(em);
2971 return ERR_PTR(-EINVAL);
2972 }
2973
2974 /* callers are responsible for dropping em's ref. */
2975 return em;
2976 }
2977
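/*
 * Remove a chunk completely: free the dev extents of every stripe, update the
 * device items, delete the chunk item (and the sys_chunk_array entry for
 * SYSTEM chunks) and finally remove the block group.
 */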
2978 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2979 {
2980 struct btrfs_fs_info *fs_info = trans->fs_info;
2981 struct extent_map *em;
2982 struct map_lookup *map;
2983 u64 dev_extent_len = 0;
2984 int i, ret = 0;
2985 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2986
2987 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2988 if (IS_ERR(em)) {
2989 /*
2990 * This is a logic error, but we don't want to just rely on the
2991 * user having built with ASSERT enabled, so if ASSERT doesn't
2992 * do anything we still error out.
2993 */
2994 ASSERT(0);
2995 return PTR_ERR(em);
2996 }
2997 map = em->map_lookup;
2998 mutex_lock(&fs_info->chunk_mutex);
2999 check_system_chunk(trans, map->type);
3000 mutex_unlock(&fs_info->chunk_mutex);
3001
3002 /*
3003 * Take the device list mutex to prevent races with the final phase of
3004 * a device replace operation that replaces the device object associated
3005 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
3006 */
3007 mutex_lock(&fs_devices->device_list_mutex);
3008 for (i = 0; i < map->num_stripes; i++) {
3009 struct btrfs_device *device = map->stripes[i].dev;
3010 ret = btrfs_free_dev_extent(trans, device,
3011 map->stripes[i].physical,
3012 &dev_extent_len);
3013 if (ret) {
3014 mutex_unlock(&fs_devices->device_list_mutex);
3015 btrfs_abort_transaction(trans, ret);
3016 goto out;
3017 }
3018
3019 if (device->bytes_used > 0) {
3020 mutex_lock(&fs_info->chunk_mutex);
3021 btrfs_device_set_bytes_used(device,
3022 device->bytes_used - dev_extent_len);
3023 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3024 btrfs_clear_space_info_full(fs_info);
3025 mutex_unlock(&fs_info->chunk_mutex);
3026 }
3027
3028 ret = btrfs_update_device(trans, device);
3029 if (ret) {
3030 mutex_unlock(&fs_devices->device_list_mutex);
3031 btrfs_abort_transaction(trans, ret);
3032 goto out;
3033 }
3034 }
3035 mutex_unlock(&fs_devices->device_list_mutex);
3036
3037 ret = btrfs_free_chunk(trans, chunk_offset);
3038 if (ret) {
3039 btrfs_abort_transaction(trans, ret);
3040 goto out;
3041 }
3042
3043 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3044
3045 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3046 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3047 if (ret) {
3048 btrfs_abort_transaction(trans, ret);
3049 goto out;
3050 }
3051 }
3052
3053 ret = btrfs_remove_block_group(trans, chunk_offset, em);
3054 if (ret) {
3055 btrfs_abort_transaction(trans, ret);
3056 goto out;
3057 }
3058
3059 out:
3060 /* once for us */
3061 free_extent_map(em);
3062 return ret;
3063 }
3064
3065 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3066 {
3067 struct btrfs_root *root = fs_info->chunk_root;
3068 struct btrfs_trans_handle *trans;
3069 struct btrfs_block_group *block_group;
3070 int ret;
3071
3072 /*
3073 * Prevent races with automatic removal of unused block groups.
3074 * After we relocate and before we remove the chunk with offset
3075 * chunk_offset, automatic removal of the block group can kick in,
3076 * resulting in a failure when calling btrfs_remove_chunk() below.
3077 *
3078 * Make sure to acquire this mutex before doing a tree search (dev
3079 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3080 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3081 * we release the path used to search the chunk/dev tree and before
3082 * the current task acquires this mutex and calls us.
3083 */
3084 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3085
3086 /* step one, relocate all the extents inside this chunk */
3087 btrfs_scrub_pause(fs_info);
3088 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3089 btrfs_scrub_continue(fs_info);
3090 if (ret)
3091 return ret;
3092
3093 block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3094 if (!block_group)
3095 return -ENOENT;
3096 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3097 btrfs_put_block_group(block_group);
3098
3099 trans = btrfs_start_trans_remove_block_group(root->fs_info,
3100 chunk_offset);
3101 if (IS_ERR(trans)) {
3102 ret = PTR_ERR(trans);
3103 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3104 return ret;
3105 }
3106
3107 /*
3108 * step two, delete the device extents and the
3109 * chunk tree entries
3110 */
3111 ret = btrfs_remove_chunk(trans, chunk_offset);
3112 btrfs_end_transaction(trans);
3113 return ret;
3114 }
3115
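/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk, retrying
 * once if some relocations failed with ENOSPC.
 */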
3116 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3117 {
3118 struct btrfs_root *chunk_root = fs_info->chunk_root;
3119 struct btrfs_path *path;
3120 struct extent_buffer *leaf;
3121 struct btrfs_chunk *chunk;
3122 struct btrfs_key key;
3123 struct btrfs_key found_key;
3124 u64 chunk_type;
3125 bool retried = false;
3126 int failed = 0;
3127 int ret;
3128
3129 path = btrfs_alloc_path();
3130 if (!path)
3131 return -ENOMEM;
3132
3133 again:
3134 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3135 key.offset = (u64)-1;
3136 key.type = BTRFS_CHUNK_ITEM_KEY;
3137
3138 while (1) {
3139 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3140 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3141 if (ret < 0) {
3142 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3143 goto error;
3144 }
3145 BUG_ON(ret == 0); /* Corruption */
3146
3147 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3148 key.type);
3149 if (ret)
3150 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3151 if (ret < 0)
3152 goto error;
3153 if (ret > 0)
3154 break;
3155
3156 leaf = path->nodes[0];
3157 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3158
3159 chunk = btrfs_item_ptr(leaf, path->slots[0],
3160 struct btrfs_chunk);
3161 chunk_type = btrfs_chunk_type(leaf, chunk);
3162 btrfs_release_path(path);
3163
3164 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3165 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3166 if (ret == -ENOSPC)
3167 failed++;
3168 else
3169 BUG_ON(ret);
3170 }
3171 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3172
3173 if (found_key.offset == 0)
3174 break;
3175 key.offset = found_key.offset - 1;
3176 }
3177 ret = 0;
3178 if (failed && !retried) {
3179 failed = 0;
3180 retried = true;
3181 goto again;
3182 } else if (WARN_ON(failed && retried)) {
3183 ret = -ENOSPC;
3184 }
3185 error:
3186 btrfs_free_path(path);
3187 return ret;
3188 }
3189
3190 /*
3191 * return 1 : allocated a new data chunk successfully,
3192 * return <0: error while allocating a data chunk,
3193 * return 0 : no need to allocate a data chunk.
3194 */
3195 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3196 u64 chunk_offset)
3197 {
3198 struct btrfs_block_group *cache;
3199 u64 bytes_used;
3200 u64 chunk_type;
3201
3202 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3203 ASSERT(cache);
3204 chunk_type = cache->flags;
3205 btrfs_put_block_group(cache);
3206
3207 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3208 return 0;
3209
3210 spin_lock(&fs_info->data_sinfo->lock);
3211 bytes_used = fs_info->data_sinfo->bytes_used;
3212 spin_unlock(&fs_info->data_sinfo->lock);
3213
3214 if (!bytes_used) {
3215 struct btrfs_trans_handle *trans;
3216 int ret;
3217
3218 trans = btrfs_join_transaction(fs_info->tree_root);
3219 if (IS_ERR(trans))
3220 return PTR_ERR(trans);
3221
3222 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3223 btrfs_end_transaction(trans);
3224 if (ret < 0)
3225 return ret;
3226 return 1;
3227 }
3228
3229 return 0;
3230 }
3231
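/*
 * Persist the balance arguments as a BTRFS_TEMPORARY_ITEM_KEY item in the
 * tree root so an interrupted balance can be resumed later.
 */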
3232 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3233 struct btrfs_balance_control *bctl)
3234 {
3235 struct btrfs_root *root = fs_info->tree_root;
3236 struct btrfs_trans_handle *trans;
3237 struct btrfs_balance_item *item;
3238 struct btrfs_disk_balance_args disk_bargs;
3239 struct btrfs_path *path;
3240 struct extent_buffer *leaf;
3241 struct btrfs_key key;
3242 int ret, err;
3243
3244 path = btrfs_alloc_path();
3245 if (!path)
3246 return -ENOMEM;
3247
3248 trans = btrfs_start_transaction(root, 0);
3249 if (IS_ERR(trans)) {
3250 btrfs_free_path(path);
3251 return PTR_ERR(trans);
3252 }
3253
3254 key.objectid = BTRFS_BALANCE_OBJECTID;
3255 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3256 key.offset = 0;
3257
3258 ret = btrfs_insert_empty_item(trans, root, path, &key,
3259 sizeof(*item));
3260 if (ret)
3261 goto out;
3262
3263 leaf = path->nodes[0];
3264 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3265
3266 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3267
3268 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3269 btrfs_set_balance_data(leaf, item, &disk_bargs);
3270 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3271 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3272 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3273 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3274
3275 btrfs_set_balance_flags(leaf, item, bctl->flags);
3276
3277 btrfs_mark_buffer_dirty(leaf);
3278 out:
3279 btrfs_free_path(path);
3280 err = btrfs_commit_transaction(trans);
3281 if (err && !ret)
3282 ret = err;
3283 return ret;
3284 }
3285
3286 static int del_balance_item(struct btrfs_fs_info *fs_info)
3287 {
3288 struct btrfs_root *root = fs_info->tree_root;
3289 struct btrfs_trans_handle *trans;
3290 struct btrfs_path *path;
3291 struct btrfs_key key;
3292 int ret, err;
3293
3294 path = btrfs_alloc_path();
3295 if (!path)
3296 return -ENOMEM;
3297
3298 trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3299 if (IS_ERR(trans)) {
3300 btrfs_free_path(path);
3301 return PTR_ERR(trans);
3302 }
3303
3304 key.objectid = BTRFS_BALANCE_OBJECTID;
3305 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3306 key.offset = 0;
3307
3308 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3309 if (ret < 0)
3310 goto out;
3311 if (ret > 0) {
3312 ret = -ENOENT;
3313 goto out;
3314 }
3315
3316 ret = btrfs_del_item(trans, root, path);
3317 out:
3318 btrfs_free_path(path);
3319 err = btrfs_commit_transaction(trans);
3320 if (err && !ret)
3321 ret = err;
3322 return ret;
3323 }
3324
3325 /*
3326 * This is a heuristic used to reduce the number of chunks balanced on
3327 * resume after balance was interrupted.
3328 */
3329 static void update_balance_args(struct btrfs_balance_control *bctl)
3330 {
3331 /*
3332 * Turn on soft mode for chunk types that were being converted.
3333 */
3334 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3335 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3336 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3337 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3338 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3339 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3340
3341 /*
3342 * Turn on usage filter if it is not already used. The idea is
3343 * that chunks that we have already balanced should be
3344 * reasonably full. Don't do it for chunks that are being
3345 * converted - that will keep us from relocating unconverted
3346 * (albeit full) chunks.
3347 */
3348 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3349 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3350 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3351 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3352 bctl->data.usage = 90;
3353 }
3354 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3355 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3356 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3357 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3358 bctl->sys.usage = 90;
3359 }
3360 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3361 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3362 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3363 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3364 bctl->meta.usage = 90;
3365 }
3366 }
3367
3368 /*
3369 * Clear the balance status in fs_info and delete the balance item from disk.
3370 */
3371 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3372 {
3373 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3374 int ret;
3375
3376 BUG_ON(!fs_info->balance_ctl);
3377
3378 spin_lock(&fs_info->balance_lock);
3379 fs_info->balance_ctl = NULL;
3380 spin_unlock(&fs_info->balance_lock);
3381
3382 kfree(bctl);
3383 ret = del_balance_item(fs_info);
3384 if (ret)
3385 btrfs_handle_fs_error(fs_info, ret, NULL);
3386 }
3387
3388 /*
3389 * Balance filters. Return 1 if chunk should be filtered out
3390 * (should not be balanced).
3391 */
3392 static int chunk_profiles_filter(u64 chunk_type,
3393 struct btrfs_balance_args *bargs)
3394 {
3395 chunk_type = chunk_to_extended(chunk_type) &
3396 BTRFS_EXTENDED_PROFILE_MASK;
3397
3398 if (bargs->profiles & chunk_type)
3399 return 0;
3400
3401 return 1;
3402 }
3403
3404 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3405 struct btrfs_balance_args *bargs)
3406 {
3407 struct btrfs_block_group *cache;
3408 u64 chunk_used;
3409 u64 user_thresh_min;
3410 u64 user_thresh_max;
3411 int ret = 1;
3412
3413 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3414 chunk_used = cache->used;
3415
3416 if (bargs->usage_min == 0)
3417 user_thresh_min = 0;
3418 else
3419 user_thresh_min = div_factor_fine(cache->length,
3420 bargs->usage_min);
3421
3422 if (bargs->usage_max == 0)
3423 user_thresh_max = 1;
3424 else if (bargs->usage_max > 100)
3425 user_thresh_max = cache->length;
3426 else
3427 user_thresh_max = div_factor_fine(cache->length,
3428 bargs->usage_max);
3429
3430 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3431 ret = 0;
3432
3433 btrfs_put_block_group(cache);
3434 return ret;
3435 }
3436
3437 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3438 u64 chunk_offset, struct btrfs_balance_args *bargs)
3439 {
3440 struct btrfs_block_group *cache;
3441 u64 chunk_used, user_thresh;
3442 int ret = 1;
3443
3444 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3445 chunk_used = cache->used;
3446
3447 if (bargs->usage_min == 0)
3448 user_thresh = 1;
3449 else if (bargs->usage > 100)
3450 user_thresh = cache->length;
3451 else
3452 user_thresh = div_factor_fine(cache->length, bargs->usage);
3453
3454 if (chunk_used < user_thresh)
3455 ret = 0;
3456
3457 btrfs_put_block_group(cache);
3458 return ret;
3459 }
3460
3461 static int chunk_devid_filter(struct extent_buffer *leaf,
3462 struct btrfs_chunk *chunk,
3463 struct btrfs_balance_args *bargs)
3464 {
3465 struct btrfs_stripe *stripe;
3466 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3467 int i;
3468
3469 for (i = 0; i < num_stripes; i++) {
3470 stripe = btrfs_stripe_nr(chunk, i);
3471 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3472 return 0;
3473 }
3474
3475 return 1;
3476 }
3477
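/*
 * Number of stripes in a chunk that carry data: parity stripes are
 * subtracted for RAID5/6, otherwise the stripes are divided among the copies.
 */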
3478 static u64 calc_data_stripes(u64 type, int num_stripes)
3479 {
3480 const int index = btrfs_bg_flags_to_raid_index(type);
3481 const int ncopies = btrfs_raid_array[index].ncopies;
3482 const int nparity = btrfs_raid_array[index].nparity;
3483
3484 if (nparity)
3485 return num_stripes - nparity;
3486 else
3487 return num_stripes / ncopies;
3488 }
3489
3490 /* [pstart, pend) */
3491 static int chunk_drange_filter(struct extent_buffer *leaf,
3492 struct btrfs_chunk *chunk,
3493 struct btrfs_balance_args *bargs)
3494 {
3495 struct btrfs_stripe *stripe;
3496 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3497 u64 stripe_offset;
3498 u64 stripe_length;
3499 u64 type;
3500 int factor;
3501 int i;
3502
3503 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3504 return 0;
3505
3506 type = btrfs_chunk_type(leaf, chunk);
3507 factor = calc_data_stripes(type, num_stripes);
3508
3509 for (i = 0; i < num_stripes; i++) {
3510 stripe = btrfs_stripe_nr(chunk, i);
3511 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3512 continue;
3513
3514 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3515 stripe_length = btrfs_chunk_length(leaf, chunk);
3516 stripe_length = div_u64(stripe_length, factor);
3517
3518 if (stripe_offset < bargs->pend &&
3519 stripe_offset + stripe_length > bargs->pstart)
3520 return 0;
3521 }
3522
3523 return 1;
3524 }
3525
3526 /* [vstart, vend) */
3527 static int chunk_vrange_filter(struct extent_buffer *leaf,
3528 struct btrfs_chunk *chunk,
3529 u64 chunk_offset,
3530 struct btrfs_balance_args *bargs)
3531 {
3532 if (chunk_offset < bargs->vend &&
3533 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3534 /* at least part of the chunk is inside this vrange */
3535 return 0;
3536
3537 return 1;
3538 }
3539
3540 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3541 struct btrfs_chunk *chunk,
3542 struct btrfs_balance_args *bargs)
3543 {
3544 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3545
3546 if (bargs->stripes_min <= num_stripes
3547 && num_stripes <= bargs->stripes_max)
3548 return 0;
3549
3550 return 1;
3551 }
3552
3553 static int chunk_soft_convert_filter(u64 chunk_type,
3554 struct btrfs_balance_args *bargs)
3555 {
3556 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3557 return 0;
3558
3559 chunk_type = chunk_to_extended(chunk_type) &
3560 BTRFS_EXTENDED_PROFILE_MASK;
3561
3562 if (bargs->target == chunk_type)
3563 return 1;
3564
3565 return 0;
3566 }
3567
3568 static int should_balance_chunk(struct extent_buffer *leaf,
3569 struct btrfs_chunk *chunk, u64 chunk_offset)
3570 {
3571 struct btrfs_fs_info *fs_info = leaf->fs_info;
3572 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3573 struct btrfs_balance_args *bargs = NULL;
3574 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3575
3576 /* type filter */
3577 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3578 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3579 return 0;
3580 }
3581
3582 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3583 bargs = &bctl->data;
3584 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3585 bargs = &bctl->sys;
3586 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3587 bargs = &bctl->meta;
3588
3589 /* profiles filter */
3590 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3591 chunk_profiles_filter(chunk_type, bargs)) {
3592 return 0;
3593 }
3594
3595 /* usage filter */
3596 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3597 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3598 return 0;
3599 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3600 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3601 return 0;
3602 }
3603
3604 /* devid filter */
3605 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3606 chunk_devid_filter(leaf, chunk, bargs)) {
3607 return 0;
3608 }
3609
3610 /* drange filter, makes sense only with devid filter */
3611 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3612 chunk_drange_filter(leaf, chunk, bargs)) {
3613 return 0;
3614 }
3615
3616 /* vrange filter */
3617 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3618 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3619 return 0;
3620 }
3621
3622 /* stripes filter */
3623 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3624 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3625 return 0;
3626 }
3627
3628 /* soft profile changing mode */
3629 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3630 chunk_soft_convert_filter(chunk_type, bargs)) {
3631 return 0;
3632 }
3633
3634 /*
3635 * limited by count, must be the last filter
3636 */
3637 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3638 if (bargs->limit == 0)
3639 return 0;
3640 else
3641 bargs->limit--;
3642 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3643 /*
3644 * Same logic as the 'limit' filter; the minimum cannot be
3645 * determined here because we do not have the global information
3646 * about the count of all chunks that satisfy the filters.
3647 */
3648 if (bargs->limit_max == 0)
3649 return 0;
3650 else
3651 bargs->limit_max--;
3652 }
3653
3654 return 1;
3655 }
3656
3657 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3658 {
3659 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3660 struct btrfs_root *chunk_root = fs_info->chunk_root;
3661 u64 chunk_type;
3662 struct btrfs_chunk *chunk;
3663 struct btrfs_path *path = NULL;
3664 struct btrfs_key key;
3665 struct btrfs_key found_key;
3666 struct extent_buffer *leaf;
3667 int slot;
3668 int ret;
3669 int enospc_errors = 0;
3670 bool counting = true;
3671 /* The single value limit and min/max limits use the same bytes in the balance args (a union), so save them here and restore them before the real pass */
3672 u64 limit_data = bctl->data.limit;
3673 u64 limit_meta = bctl->meta.limit;
3674 u64 limit_sys = bctl->sys.limit;
3675 u32 count_data = 0;
3676 u32 count_meta = 0;
3677 u32 count_sys = 0;
3678 int chunk_reserved = 0;
3679
3680 path = btrfs_alloc_path();
3681 if (!path) {
3682 ret = -ENOMEM;
3683 goto error;
3684 }
3685
3686 /* zero out stat counters */
3687 spin_lock(&fs_info->balance_lock);
3688 memset(&bctl->stat, 0, sizeof(bctl->stat));
3689 spin_unlock(&fs_info->balance_lock);
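/*
 * Two passes over the chunk tree: the first pass (counting == true) only
 * counts the chunks that pass the filters to fill in the "expected" stat,
 * the second pass actually relocates them.
 */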
3690 again:
3691 if (!counting) {
3692 /*
3693 * The single value limit and min/max limits use the same bytes in
3694 * the balance args (a union); restore the saved limits here.
3695 */
3696 bctl->data.limit = limit_data;
3697 bctl->meta.limit = limit_meta;
3698 bctl->sys.limit = limit_sys;
3699 }
3700 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3701 key.offset = (u64)-1;
3702 key.type = BTRFS_CHUNK_ITEM_KEY;
3703
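/*
 * Walk the chunk tree backwards from the highest chunk offset: search for
 * the (u64)-1 key and step to the previous chunk item on each iteration.
 */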
3704 while (1) {
3705 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3706 atomic_read(&fs_info->balance_cancel_req)) {
3707 ret = -ECANCELED;
3708 goto error;
3709 }
3710
3711 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3712 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3713 if (ret < 0) {
3714 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3715 goto error;
3716 }
3717
3718 /*
3719 * this shouldn't happen, it means the last relocate
3720 * failed
3721 */
3722 if (ret == 0)
3723 BUG(); /* FIXME break ? */
3724
3725 ret = btrfs_previous_item(chunk_root, path, 0,
3726 BTRFS_CHUNK_ITEM_KEY);
3727 if (ret) {
3728 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3729 ret = 0;
3730 break;
3731 }
3732
3733 leaf = path->nodes[0];
3734 slot = path->slots[0];
3735 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3736
3737 if (found_key.objectid != key.objectid) {
3738 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3739 break;
3740 }
3741
3742 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3743 chunk_type = btrfs_chunk_type(leaf, chunk);
3744
3745 if (!counting) {
3746 spin_lock(&fs_info->balance_lock);
3747 bctl->stat.considered++;
3748 spin_unlock(&fs_info->balance_lock);
3749 }
3750
3751 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3752
3753 btrfs_release_path(path);
3754 if (!ret) {
3755 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3756 goto loop;
3757 }
3758
3759 if (counting) {
3760 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3761 spin_lock(&fs_info->balance_lock);
3762 bctl->stat.expected++;
3763 spin_unlock(&fs_info->balance_lock);
3764
3765 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3766 count_data++;
3767 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3768 count_sys++;
3769 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3770 count_meta++;
3771
3772 goto loop;
3773 }
3774
3775 /*
3776 * Apply the limit_min filter; there is no need to check whether the
3777 * LIMITS filter is used, since limit_min is 0 by default.
3778 */
3779 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3780 count_data < bctl->data.limit_min)
3781 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3782 count_meta < bctl->meta.limit_min)
3783 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3784 count_sys < bctl->sys.limit_min)) {
3785 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3786 goto loop;
3787 }
3788
3789 if (!chunk_reserved) {
3790 /*
3791 * We may be relocating the only data chunk we have,
3792 * which could potentially end up losing the data's
3793 * raid profile, so let's allocate an empty one in
3794 * advance.
3795 */
3796 ret = btrfs_may_alloc_data_chunk(fs_info,
3797 found_key.offset);
3798 if (ret < 0) {
3799 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3800 goto error;
3801 } else if (ret == 1) {
3802 chunk_reserved = 1;
3803 }
3804 }
3805
3806 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3807 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3808 if (ret == -ENOSPC) {
3809 enospc_errors++;
3810 } else if (ret == -ETXTBSY) {
3811 btrfs_info(fs_info,
3812 "skipping relocation of block group %llu due to active swapfile",
3813 found_key.offset);
3814 ret = 0;
3815 } else if (ret) {
3816 goto error;
3817 } else {
3818 spin_lock(&fs_info->balance_lock);
3819 bctl->stat.completed++;
3820 spin_unlock(&fs_info->balance_lock);
3821 }
3822 loop:
3823 if (found_key.offset == 0)
3824 break;
3825 key.offset = found_key.offset - 1;
3826 }
3827
3828 if (counting) {
3829 btrfs_release_path(path);
3830 counting = false;
3831 goto again;
3832 }
3833 error:
3834 btrfs_free_path(path);
3835 if (enospc_errors) {
3836 btrfs_info(fs_info, "%d enospc errors during balance",
3837 enospc_errors);
3838 if (!ret)
3839 ret = -ENOSPC;
3840 }
3841
3842 return ret;
3843 }
3844
3845 /**
3846 * alloc_profile_is_valid - see if a given profile is valid and reduced
3847 * @flags: profile to validate
3848 * @extended: if true @flags is treated as an extended profile
3849 */
3850 static int alloc_profile_is_valid(u64 flags, int extended)
3851 {
3852 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3853 BTRFS_BLOCK_GROUP_PROFILE_MASK);
3854
3855 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3856
3857 /* 1) check that all other bits are zeroed */
3858 if (flags & ~mask)
3859 return 0;
3860
3861 /* 2) see if profile is reduced */
3862 if (flags == 0)
3863 return !extended; /* "0" is valid for usual profiles */
3864
3865 return has_single_bit_set(flags);
3866 }
3867
3868 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3869 {
3870 /* cancel requested || normal exit path */
3871 return atomic_read(&fs_info->balance_cancel_req) ||
3872 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3873 atomic_read(&fs_info->balance_cancel_req) == 0);
3874 }
3875
3876 /*
3877 * Validate target profile against allowed profiles and return true if it's OK.
3878 * Otherwise print the error message and return false.
3879 */
3880 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3881 const struct btrfs_balance_args *bargs,
3882 u64 allowed, const char *type)
3883 {
3884 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3885 return true;
3886
3887 /* Profile is valid and does not have bits outside of the allowed set */
3888 if (alloc_profile_is_valid(bargs->target, 1) &&
3889 (bargs->target & ~allowed) == 0)
3890 return true;
3891
3892 btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3893 type, btrfs_bg_type_to_raid_name(bargs->target));
3894 return false;
3895 }
3896
3897 /*
3898 * Fill @buf with textual description of balance filter flags @bargs, up to
3899 * @size_buf including the terminating null. The output may be trimmed if it
3900 * does not fit into the provided buffer.
3901 */
3902 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3903 u32 size_buf)
3904 {
3905 int ret;
3906 u32 size_bp = size_buf;
3907 char *bp = buf;
3908 u64 flags = bargs->flags;
3909 char tmp_buf[128] = {'\0'};
3910
3911 if (!flags)
3912 return;
3913
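/*
 * Helper macros: append one formatted token to the output buffer and bail
 * out to out_overflow when the remaining space is exhausted.
 */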
3914 #define CHECK_APPEND_NOARG(a) \
3915 do { \
3916 ret = snprintf(bp, size_bp, (a)); \
3917 if (ret < 0 || ret >= size_bp) \
3918 goto out_overflow; \
3919 size_bp -= ret; \
3920 bp += ret; \
3921 } while (0)
3922
3923 #define CHECK_APPEND_1ARG(a, v1) \
3924 do { \
3925 ret = snprintf(bp, size_bp, (a), (v1)); \
3926 if (ret < 0 || ret >= size_bp) \
3927 goto out_overflow; \
3928 size_bp -= ret; \
3929 bp += ret; \
3930 } while (0)
3931
3932 #define CHECK_APPEND_2ARG(a, v1, v2) \
3933 do { \
3934 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
3935 if (ret < 0 || ret >= size_bp) \
3936 goto out_overflow; \
3937 size_bp -= ret; \
3938 bp += ret; \
3939 } while (0)
3940
3941 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3942 CHECK_APPEND_1ARG("convert=%s,",
3943 btrfs_bg_type_to_raid_name(bargs->target));
3944
3945 if (flags & BTRFS_BALANCE_ARGS_SOFT)
3946 CHECK_APPEND_NOARG("soft,");
3947
3948 if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3949 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3950 sizeof(tmp_buf));
3951 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3952 }
3953
3954 if (flags & BTRFS_BALANCE_ARGS_USAGE)
3955 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3956
3957 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3958 CHECK_APPEND_2ARG("usage=%u..%u,",
3959 bargs->usage_min, bargs->usage_max);
3960
3961 if (flags & BTRFS_BALANCE_ARGS_DEVID)
3962 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3963
3964 if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3965 CHECK_APPEND_2ARG("drange=%llu..%llu,",
3966 bargs->pstart, bargs->pend);
3967
3968 if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3969 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3970 bargs->vstart, bargs->vend);
3971
3972 if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3973 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3974
3975 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3976 CHECK_APPEND_2ARG("limit=%u..%u,",
3977 bargs->limit_min, bargs->limit_max);
3978
3979 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3980 CHECK_APPEND_2ARG("stripes=%u..%u,",
3981 bargs->stripes_min, bargs->stripes_max);
3982
3983 #undef CHECK_APPEND_2ARG
3984 #undef CHECK_APPEND_1ARG
3985 #undef CHECK_APPEND_NOARG
3986
3987 out_overflow:
3988
3989 if (size_bp < size_buf)
3990 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3991 else
3992 buf[0] = '\0';
3993 }
3994
3995 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3996 {
3997 u32 size_buf = 1024;
3998 char tmp_buf[192] = {'\0'};
3999 char *buf;
4000 char *bp;
4001 u32 size_bp = size_buf;
4002 int ret;
4003 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4004
4005 buf = kzalloc(size_buf, GFP_KERNEL);
4006 if (!buf)
4007 return;
4008
4009 bp = buf;
4010
4011 #define CHECK_APPEND_1ARG(a, v1) \
4012 do { \
4013 ret = snprintf(bp, size_bp, (a), (v1)); \
4014 if (ret < 0 || ret >= size_bp) \
4015 goto out_overflow; \
4016 size_bp -= ret; \
4017 bp += ret; \
4018 } while (0)
4019
4020 if (bctl->flags & BTRFS_BALANCE_FORCE)
4021 CHECK_APPEND_1ARG("%s", "-f ");
4022
4023 if (bctl->flags & BTRFS_BALANCE_DATA) {
4024 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4025 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4026 }
4027
4028 if (bctl->flags & BTRFS_BALANCE_METADATA) {
4029 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4030 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4031 }
4032
4033 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4034 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4035 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4036 }
4037
4038 #undef CHECK_APPEND_1ARG
4039
4040 out_overflow:
4041
4042 if (size_bp < size_buf)
4043 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4044 btrfs_info(fs_info, "balance: %s %s",
4045 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4046 "resume" : "start", buf);
4047
4048 kfree(buf);
4049 }
4050
4051 /*
4052 * Should be called with the balance mutex held
4053 */
4054 int btrfs_balance(struct btrfs_fs_info *fs_info,
4055 struct btrfs_balance_control *bctl,
4056 struct btrfs_ioctl_balance_args *bargs)
4057 {
4058 u64 meta_target, data_target;
4059 u64 allowed;
4060 int mixed = 0;
4061 int ret;
4062 u64 num_devices;
4063 unsigned seq;
4064 bool reducing_redundancy;
4065 int i;
4066
4067 if (btrfs_fs_closing(fs_info) ||
4068 atomic_read(&fs_info->balance_pause_req) ||
4069 btrfs_should_cancel_balance(fs_info)) {
4070 ret = -EINVAL;
4071 goto out;
4072 }
4073
4074 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4075 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4076 mixed = 1;
4077
4078 /*
4079 * In case of mixed groups both data and meta should be picked,
4080 * and identical options should be given for both of them.
4081 */
4082 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4083 if (mixed && (bctl->flags & allowed)) {
4084 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4085 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4086 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4087 btrfs_err(fs_info,
4088 "balance: mixed groups data and metadata options must be the same");
4089 ret = -EINVAL;
4090 goto out;
4091 }
4092 }
4093
4094 /*
4095 * rw_devices will not change at the moment, device add/delete/replace
4096 * are exclusive
4097 */
4098 num_devices = fs_info->fs_devices->rw_devices;
4099
4100 /*
4101 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4102 * special bit for it, to make it easier to distinguish. Thus we need
4103 * to set it manually, or balance would refuse the profile.
4104 */
4105 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4106 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4107 if (num_devices >= btrfs_raid_array[i].devs_min)
4108 allowed |= btrfs_raid_array[i].bg_flag;
4109
4110 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4111 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4112 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
4113 ret = -EINVAL;
4114 goto out;
4115 }
4116
4117 /*
4118 * Allow reducing metadata or system integrity only if force is set for
4119 * profiles with redundancy (copies, parity).
4120 */
4121 allowed = 0;
4122 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4123 if (btrfs_raid_array[i].ncopies >= 2 ||
4124 btrfs_raid_array[i].tolerated_failures >= 1)
4125 allowed |= btrfs_raid_array[i].bg_flag;
4126 }
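/*
 * Sample the currently used allocation profiles under the profiles
 * seqlock so the redundancy check and the target profiles are read
 * consistently.
 */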
4127 do {
4128 seq = read_seqbegin(&fs_info->profiles_lock);
4129
4130 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4131 (fs_info->avail_system_alloc_bits & allowed) &&
4132 !(bctl->sys.target & allowed)) ||
4133 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4134 (fs_info->avail_metadata_alloc_bits & allowed) &&
4135 !(bctl->meta.target & allowed)))
4136 reducing_redundancy = true;
4137 else
4138 reducing_redundancy = false;
4139
4140 /* if we're not converting, the target field is uninitialized */
4141 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4142 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4143 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4144 bctl->data.target : fs_info->avail_data_alloc_bits;
4145 } while (read_seqretry(&fs_info->profiles_lock, seq));
4146
4147 if (reducing_redundancy) {
4148 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4149 btrfs_info(fs_info,
4150 "balance: force reducing metadata redundancy");
4151 } else {
4152 btrfs_err(fs_info,
4153 "balance: reduces metadata redundancy, use --force if you want this");
4154 ret = -EINVAL;
4155 goto out;
4156 }
4157 }
4158
4159 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4160 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4161 btrfs_warn(fs_info,
4162 "balance: metadata profile %s has lower redundancy than data profile %s",
4163 btrfs_bg_type_to_raid_name(meta_target),
4164 btrfs_bg_type_to_raid_name(data_target));
4165 }
4166
4167 if (fs_info->send_in_progress) {
4168 btrfs_warn_rl(fs_info,
4169 "cannot run balance while send operations are in progress (%d in progress)",
4170 fs_info->send_in_progress);
4171 ret = -EAGAIN;
4172 goto out;
4173 }
4174
4175 ret = insert_balance_item(fs_info, bctl);
4176 if (ret && ret != -EEXIST)
4177 goto out;
4178
4179 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4180 BUG_ON(ret == -EEXIST);
4181 BUG_ON(fs_info->balance_ctl);
4182 spin_lock(&fs_info->balance_lock);
4183 fs_info->balance_ctl = bctl;
4184 spin_unlock(&fs_info->balance_lock);
4185 } else {
4186 BUG_ON(ret != -EEXIST);
4187 spin_lock(&fs_info->balance_lock);
4188 update_balance_args(bctl);
4189 spin_unlock(&fs_info->balance_lock);
4190 }
4191
4192 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4193 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4194 describe_balance_start_or_resume(fs_info);
4195 mutex_unlock(&fs_info->balance_mutex);
4196
4197 ret = __btrfs_balance(fs_info);
4198
4199 mutex_lock(&fs_info->balance_mutex);
4200 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4201 btrfs_info(fs_info, "balance: paused");
4202 /*
4203 * Balance can be canceled by:
4204 *
4205 * - Regular cancel request
4206 * Then ret == -ECANCELED and balance_cancel_req > 0
4207 *
4208 * - Fatal signal to "btrfs" process
4209 * Either the signal is caught by wait_reserve_ticket() and the
4210 * callers get -EINTR, or it is caught by btrfs_should_cancel_balance()
4211 * and we get -ECANCELED.
4212 * Either way, balance_cancel_req == 0 in this case, and
4213 * ret == -EINTR or ret == -ECANCELED.
4214 *
4215 * So here we only check the return value to catch canceled balance.
4216 */
4217 else if (ret == -ECANCELED || ret == -EINTR)
4218 btrfs_info(fs_info, "balance: canceled");
4219 else
4220 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4221
4222 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4223
4224 if (bargs) {
4225 memset(bargs, 0, sizeof(*bargs));
4226 btrfs_update_ioctl_balance_args(fs_info, bargs);
4227 }
4228
4229 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4230 balance_need_close(fs_info)) {
4231 reset_balance_state(fs_info);
4232 btrfs_exclop_finish(fs_info);
4233 }
4234
4235 wake_up(&fs_info->balance_wait_q);
4236
4237 return ret;
4238 out:
4239 if (bctl->flags & BTRFS_BALANCE_RESUME)
4240 reset_balance_state(fs_info);
4241 else
4242 kfree(bctl);
4243 btrfs_exclop_finish(fs_info);
4244
4245 return ret;
4246 }
4247
4248 static int balance_kthread(void *data)
4249 {
4250 struct btrfs_fs_info *fs_info = data;
4251 int ret = 0;
4252
4253 sb_start_write(fs_info->sb);
4254 mutex_lock(&fs_info->balance_mutex);
4255 if (fs_info->balance_ctl)
4256 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4257 mutex_unlock(&fs_info->balance_mutex);
4258 sb_end_write(fs_info->sb);
4259
4260 return ret;
4261 }
4262
4263 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4264 {
4265 struct task_struct *tsk;
4266
4267 mutex_lock(&fs_info->balance_mutex);
4268 if (!fs_info->balance_ctl) {
4269 mutex_unlock(&fs_info->balance_mutex);
4270 return 0;
4271 }
4272 mutex_unlock(&fs_info->balance_mutex);
4273
4274 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4275 btrfs_info(fs_info, "balance: resume skipped");
4276 return 0;
4277 }
4278
4279 /*
4280 * A ro->rw remount sequence should continue with the paused balance
4281 * regardless of who paused it (the system or the user, as of now), so
4282 * set the resume flag.
4283 */
4284 spin_lock(&fs_info->balance_lock);
4285 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4286 spin_unlock(&fs_info->balance_lock);
4287
4288 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4289 return PTR_ERR_OR_ZERO(tsk);
4290 }
4291
4292 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4293 {
4294 struct btrfs_balance_control *bctl;
4295 struct btrfs_balance_item *item;
4296 struct btrfs_disk_balance_args disk_bargs;
4297 struct btrfs_path *path;
4298 struct extent_buffer *leaf;
4299 struct btrfs_key key;
4300 int ret;
4301
4302 path = btrfs_alloc_path();
4303 if (!path)
4304 return -ENOMEM;
4305
4306 key.objectid = BTRFS_BALANCE_OBJECTID;
4307 key.type = BTRFS_TEMPORARY_ITEM_KEY;
4308 key.offset = 0;
4309
4310 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4311 if (ret < 0)
4312 goto out;
4313 if (ret > 0) { /* ret = -ENOENT; */
4314 ret = 0;
4315 goto out;
4316 }
4317
4318 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4319 if (!bctl) {
4320 ret = -ENOMEM;
4321 goto out;
4322 }
4323
4324 leaf = path->nodes[0];
4325 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4326
4327 bctl->flags = btrfs_balance_flags(leaf, item);
4328 bctl->flags |= BTRFS_BALANCE_RESUME;
4329
4330 btrfs_balance_data(leaf, item, &disk_bargs);
4331 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4332 btrfs_balance_meta(leaf, item, &disk_bargs);
4333 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4334 btrfs_balance_sys(leaf, item, &disk_bargs);
4335 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4336
4337 /*
4338 * This should never happen, as the paused balance state is recovered
4339 * during mount without any chance of other exclusive ops colliding.
4340 *
4341 * This gives the exclusive op status to balance and keeps in paused
4342 * state until user intervention (cancel or umount). If the ownership
4343 * cannot be assigned, show a message but do not fail. The balance
4344 * is in a paused state and must have fs_info::balance_ctl properly
4345 * set up.
4346 */
4347 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4348 btrfs_warn(fs_info,
4349 "balance: cannot set exclusive op status, resume manually");
4350
4351 btrfs_release_path(path);
4352
4353 mutex_lock(&fs_info->balance_mutex);
4354 BUG_ON(fs_info->balance_ctl);
4355 spin_lock(&fs_info->balance_lock);
4356 fs_info->balance_ctl = bctl;
4357 spin_unlock(&fs_info->balance_lock);
4358 mutex_unlock(&fs_info->balance_mutex);
4359 out:
4360 btrfs_free_path(path);
4361 return ret;
4362 }
4363
4364 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4365 {
4366 int ret = 0;
4367
4368 mutex_lock(&fs_info->balance_mutex);
4369 if (!fs_info->balance_ctl) {
4370 mutex_unlock(&fs_info->balance_mutex);
4371 return -ENOTCONN;
4372 }
4373
4374 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4375 atomic_inc(&fs_info->balance_pause_req);
4376 mutex_unlock(&fs_info->balance_mutex);
4377
4378 wait_event(fs_info->balance_wait_q,
4379 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4380
4381 mutex_lock(&fs_info->balance_mutex);
4382 /* we are good with balance_ctl ripped off from under us */
4383 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4384 atomic_dec(&fs_info->balance_pause_req);
4385 } else {
4386 ret = -ENOTCONN;
4387 }
4388
4389 mutex_unlock(&fs_info->balance_mutex);
4390 return ret;
4391 }
4392
4393 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4394 {
4395 mutex_lock(&fs_info->balance_mutex);
4396 if (!fs_info->balance_ctl) {
4397 mutex_unlock(&fs_info->balance_mutex);
4398 return -ENOTCONN;
4399 }
4400
4401 /*
4402 * A paused balance with the item stored on disk can be resumed at
4403 * mount time if the mount is read-write. Otherwise it's still paused
4404 * and we must not allow cancelling as it deletes the item.
4405 */
4406 if (sb_rdonly(fs_info->sb)) {
4407 mutex_unlock(&fs_info->balance_mutex);
4408 return -EROFS;
4409 }
4410
4411 atomic_inc(&fs_info->balance_cancel_req);
4412 /*
4413 * If balance is currently running, just wait and return; the balance
4414 * item is deleted in btrfs_balance() in that case.
4415 */
4416 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4417 mutex_unlock(&fs_info->balance_mutex);
4418 wait_event(fs_info->balance_wait_q,
4419 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4420 mutex_lock(&fs_info->balance_mutex);
4421 } else {
4422 mutex_unlock(&fs_info->balance_mutex);
4423 /*
4424 * Lock released to allow other waiters to continue, we'll
4425 * reexamine the status again.
4426 */
4427 mutex_lock(&fs_info->balance_mutex);
4428
4429 if (fs_info->balance_ctl) {
4430 reset_balance_state(fs_info);
4431 btrfs_exclop_finish(fs_info);
4432 btrfs_info(fs_info, "balance: canceled");
4433 }
4434 }
4435
4436 BUG_ON(fs_info->balance_ctl ||
4437 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4438 atomic_dec(&fs_info->balance_cancel_req);
4439 mutex_unlock(&fs_info->balance_mutex);
4440 return 0;
4441 }
4442
4443 int btrfs_uuid_scan_kthread(void *data)
4444 {
4445 struct btrfs_fs_info *fs_info = data;
4446 struct btrfs_root *root = fs_info->tree_root;
4447 struct btrfs_key key;
4448 struct btrfs_path *path = NULL;
4449 int ret = 0;
4450 struct extent_buffer *eb;
4451 int slot;
4452 struct btrfs_root_item root_item;
4453 u32 item_size;
4454 struct btrfs_trans_handle *trans = NULL;
4455 bool closing = false;
4456
4457 path = btrfs_alloc_path();
4458 if (!path) {
4459 ret = -ENOMEM;
4460 goto out;
4461 }
4462
4463 key.objectid = 0;
4464 key.type = BTRFS_ROOT_ITEM_KEY;
4465 key.offset = 0;
4466
4467 while (1) {
4468 if (btrfs_fs_closing(fs_info)) {
4469 closing = true;
4470 break;
4471 }
4472 ret = btrfs_search_forward(root, &key, path,
4473 BTRFS_OLDEST_GENERATION);
4474 if (ret) {
4475 if (ret > 0)
4476 ret = 0;
4477 break;
4478 }
4479
4480 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4481 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4482 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4483 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4484 goto skip;
4485
4486 eb = path->nodes[0];
4487 slot = path->slots[0];
4488 item_size = btrfs_item_size_nr(eb, slot);
4489 if (item_size < sizeof(root_item))
4490 goto skip;
4491
4492 read_extent_buffer(eb, &root_item,
4493 btrfs_item_ptr_offset(eb, slot),
4494 (int)sizeof(root_item));
4495 if (btrfs_root_refs(&root_item) == 0)
4496 goto skip;
4497
4498 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4499 !btrfs_is_empty_uuid(root_item.received_uuid)) {
4500 if (trans)
4501 goto update_tree;
4502
4503 btrfs_release_path(path);
4504 /*
4505 * 1 - subvol uuid item
4506 * 1 - received_subvol uuid item
4507 */
4508 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4509 if (IS_ERR(trans)) {
4510 ret = PTR_ERR(trans);
4511 break;
4512 }
4513 continue;
4514 } else {
4515 goto skip;
4516 }
4517 update_tree:
4518 btrfs_release_path(path);
4519 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4520 ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4521 BTRFS_UUID_KEY_SUBVOL,
4522 key.objectid);
4523 if (ret < 0) {
4524 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4525 ret);
4526 break;
4527 }
4528 }
4529
4530 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4531 ret = btrfs_uuid_tree_add(trans,
4532 root_item.received_uuid,
4533 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4534 key.objectid);
4535 if (ret < 0) {
4536 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4537 ret);
4538 break;
4539 }
4540 }
4541
4542 skip:
4543 btrfs_release_path(path);
4544 if (trans) {
4545 ret = btrfs_end_transaction(trans);
4546 trans = NULL;
4547 if (ret)
4548 break;
4549 }
4550
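/*
 * Advance the search key: bump the offset first, then the key type,
 * then the objectid, so the whole root tree is eventually covered.
 */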
4551 if (key.offset < (u64)-1) {
4552 key.offset++;
4553 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4554 key.offset = 0;
4555 key.type = BTRFS_ROOT_ITEM_KEY;
4556 } else if (key.objectid < (u64)-1) {
4557 key.offset = 0;
4558 key.type = BTRFS_ROOT_ITEM_KEY;
4559 key.objectid++;
4560 } else {
4561 break;
4562 }
4563 cond_resched();
4564 }
4565
4566 out:
4567 btrfs_free_path(path);
4568 if (trans && !IS_ERR(trans))
4569 btrfs_end_transaction(trans);
4570 if (ret)
4571 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4572 else if (!closing)
4573 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4574 up(&fs_info->uuid_tree_rescan_sem);
4575 return 0;
4576 }
4577
4578 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4579 {
4580 struct btrfs_trans_handle *trans;
4581 struct btrfs_root *tree_root = fs_info->tree_root;
4582 struct btrfs_root *uuid_root;
4583 struct task_struct *task;
4584 int ret;
4585
4586 /*
4587 * 1 - root node
4588 * 1 - root item
4589 */
4590 trans = btrfs_start_transaction(tree_root, 2);
4591 if (IS_ERR(trans))
4592 return PTR_ERR(trans);
4593
4594 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4595 if (IS_ERR(uuid_root)) {
4596 ret = PTR_ERR(uuid_root);
4597 btrfs_abort_transaction(trans, ret);
4598 btrfs_end_transaction(trans);
4599 return ret;
4600 }
4601
4602 fs_info->uuid_root = uuid_root;
4603
4604 ret = btrfs_commit_transaction(trans);
4605 if (ret)
4606 return ret;
4607
4608 down(&fs_info->uuid_tree_rescan_sem);
4609 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4610 if (IS_ERR(task)) {
4611 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4612 btrfs_warn(fs_info, "failed to start uuid_scan task");
4613 up(&fs_info->uuid_tree_rescan_sem);
4614 return PTR_ERR(task);
4615 }
4616
4617 return 0;
4618 }
4619
4620 /*
4621 * shrinking a device means finding all of the device extents past
4622 * the new size, and then following the back refs to the chunks.
4623 * The chunk relocation code actually frees the device extent.
4624 */
4625 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4626 {
4627 struct btrfs_fs_info *fs_info = device->fs_info;
4628 struct btrfs_root *root = fs_info->dev_root;
4629 struct btrfs_trans_handle *trans;
4630 struct btrfs_dev_extent *dev_extent = NULL;
4631 struct btrfs_path *path;
4632 u64 length;
4633 u64 chunk_offset;
4634 int ret;
4635 int slot;
4636 int failed = 0;
4637 bool retried = false;
4638 struct extent_buffer *l;
4639 struct btrfs_key key;
4640 struct btrfs_super_block *super_copy = fs_info->super_copy;
4641 u64 old_total = btrfs_super_total_bytes(super_copy);
4642 u64 old_size = btrfs_device_get_total_bytes(device);
4643 u64 diff;
4644 u64 start;
4645
4646 new_size = round_down(new_size, fs_info->sectorsize);
4647 start = new_size;
4648 diff = round_down(old_size - new_size, fs_info->sectorsize);
4649
4650 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4651 return -EINVAL;
4652
4653 path = btrfs_alloc_path();
4654 if (!path)
4655 return -ENOMEM;
4656
4657 path->reada = READA_BACK;
4658
4659 trans = btrfs_start_transaction(root, 0);
4660 if (IS_ERR(trans)) {
4661 btrfs_free_path(path);
4662 return PTR_ERR(trans);
4663 }
4664
4665 mutex_lock(&fs_info->chunk_mutex);
4666
4667 btrfs_device_set_total_bytes(device, new_size);
4668 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4669 device->fs_devices->total_rw_bytes -= diff;
4670 atomic64_sub(diff, &fs_info->free_chunk_space);
4671 }
4672
4673 /*
4674 * Once the device's size has been set to the new size, ensure all
4675 * in-memory chunks are synced to disk so that the loop below sees them
4676 * and relocates them accordingly.
4677 */
4678 if (contains_pending_extent(device, &start, diff)) {
4679 mutex_unlock(&fs_info->chunk_mutex);
4680 ret = btrfs_commit_transaction(trans);
4681 if (ret)
4682 goto done;
4683 } else {
4684 mutex_unlock(&fs_info->chunk_mutex);
4685 btrfs_end_transaction(trans);
4686 }
4687
4688 again:
4689 key.objectid = device->devid;
4690 key.offset = (u64)-1;
4691 key.type = BTRFS_DEV_EXTENT_KEY;
4692
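/*
 * Walk this device's dev extents from the end of the device backwards
 * and relocate every chunk that still has an extent beyond new_size.
 */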
4693 do {
4694 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4695 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4696 if (ret < 0) {
4697 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4698 goto done;
4699 }
4700
4701 ret = btrfs_previous_item(root, path, 0, key.type);
4702 if (ret)
4703 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4704 if (ret < 0)
4705 goto done;
4706 if (ret) {
4707 ret = 0;
4708 btrfs_release_path(path);
4709 break;
4710 }
4711
4712 l = path->nodes[0];
4713 slot = path->slots[0];
4714 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4715
4716 if (key.objectid != device->devid) {
4717 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4718 btrfs_release_path(path);
4719 break;
4720 }
4721
4722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4723 length = btrfs_dev_extent_length(l, dev_extent);
4724
4725 if (key.offset + length <= new_size) {
4726 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4727 btrfs_release_path(path);
4728 break;
4729 }
4730
4731 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4732 btrfs_release_path(path);
4733
4734 /*
4735 * We may be relocating the only data chunk we have,
4736 * which could potentially end up losing the data's
4737 * raid profile, so let's allocate an empty one in
4738 * advance.
4739 */
4740 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4741 if (ret < 0) {
4742 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4743 goto done;
4744 }
4745
4746 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4747 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4748 if (ret == -ENOSPC) {
4749 failed++;
4750 } else if (ret) {
4751 if (ret == -ETXTBSY) {
4752 btrfs_warn(fs_info,
4753 "could not shrink block group %llu due to active swapfile",
4754 chunk_offset);
4755 }
4756 goto done;
4757 }
4758 } while (key.offset-- > 0);
4759
4760 if (failed && !retried) {
4761 failed = 0;
4762 retried = true;
4763 goto again;
4764 } else if (failed && retried) {
4765 ret = -ENOSPC;
4766 goto done;
4767 }
4768
4769 /* Shrinking succeeded, else we would be at "done". */
4770 trans = btrfs_start_transaction(root, 0);
4771 if (IS_ERR(trans)) {
4772 ret = PTR_ERR(trans);
4773 goto done;
4774 }
4775
4776 mutex_lock(&fs_info->chunk_mutex);
4777 /* Clear all state bits beyond the shrunk device size */
4778 clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4779 CHUNK_STATE_MASK);
4780
4781 btrfs_device_set_disk_total_bytes(device, new_size);
4782 if (list_empty(&device->post_commit_list))
4783 list_add_tail(&device->post_commit_list,
4784 &trans->transaction->dev_update_list);
4785
4786 WARN_ON(diff > old_total);
4787 btrfs_set_super_total_bytes(super_copy,
4788 round_down(old_total - diff, fs_info->sectorsize));
4789 mutex_unlock(&fs_info->chunk_mutex);
4790
4791 /* Now btrfs_update_device() will change the on-disk size. */
4792 ret = btrfs_update_device(trans, device);
4793 if (ret < 0) {
4794 btrfs_abort_transaction(trans, ret);
4795 btrfs_end_transaction(trans);
4796 } else {
4797 ret = btrfs_commit_transaction(trans);
4798 }
4799 done:
4800 btrfs_free_path(path);
4801 if (ret) {
4802 mutex_lock(&fs_info->chunk_mutex);
4803 btrfs_device_set_total_bytes(device, old_size);
4804 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4805 device->fs_devices->total_rw_bytes += diff;
4806 atomic64_add(diff, &fs_info->free_chunk_space);
4807 mutex_unlock(&fs_info->chunk_mutex);
4808 }
4809 return ret;
4810 }
4811
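/*
 * Append a (disk key, chunk item) pair to the superblock's sys_chunk_array.
 * The array is a packed sequence of such pairs and is limited to
 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE bytes, hence the -EFBIG check below.
 */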
4812 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4813 struct btrfs_key *key,
4814 struct btrfs_chunk *chunk, int item_size)
4815 {
4816 struct btrfs_super_block *super_copy = fs_info->super_copy;
4817 struct btrfs_disk_key disk_key;
4818 u32 array_size;
4819 u8 *ptr;
4820
4821 mutex_lock(&fs_info->chunk_mutex);
4822 array_size = btrfs_super_sys_array_size(super_copy);
4823 if (array_size + item_size + sizeof(disk_key)
4824 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4825 mutex_unlock(&fs_info->chunk_mutex);
4826 return -EFBIG;
4827 }
4828
4829 ptr = super_copy->sys_chunk_array + array_size;
4830 btrfs_cpu_key_to_disk(&disk_key, key);
4831 memcpy(ptr, &disk_key, sizeof(disk_key));
4832 ptr += sizeof(disk_key);
4833 memcpy(ptr, chunk, item_size);
4834 item_size += sizeof(disk_key);
4835 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4836 mutex_unlock(&fs_info->chunk_mutex);
4837
4838 return 0;
4839 }
4840
4841 /*
4842 * sort the devices in descending order by max_avail, total_avail
4843 */
4844 static int btrfs_cmp_device_info(const void *a, const void *b)
4845 {
4846 const struct btrfs_device_info *di_a = a;
4847 const struct btrfs_device_info *di_b = b;
4848
4849 if (di_a->max_avail > di_b->max_avail)
4850 return -1;
4851 if (di_a->max_avail < di_b->max_avail)
4852 return 1;
4853 if (di_a->total_avail > di_b->total_avail)
4854 return -1;
4855 if (di_a->total_avail < di_b->total_avail)
4856 return 1;
4857 return 0;
4858 }
4859
4860 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4861 {
4862 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4863 return;
4864
4865 btrfs_set_fs_incompat(info, RAID56);
4866 }
4867
4868 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4869 {
4870 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4871 return;
4872
4873 btrfs_set_fs_incompat(info, RAID1C34);
4874 }
4875
4876 /*
4877 * Structure used internally by the chunk allocation code
4878 * (btrfs_alloc_chunk()). Wraps the needed parameters.
4879 */
4880 struct alloc_chunk_ctl {
4881 u64 start;
4882 u64 type;
4883 /* Total number of stripes to allocate */
4884 int num_stripes;
4885 /* sub_stripes info for map */
4886 int sub_stripes;
4887 /* Stripes per device */
4888 int dev_stripes;
4889 /* Maximum number of devices to use */
4890 int devs_max;
4891 /* Minimum number of devices to use */
4892 int devs_min;
4893 /* ndevs has to be a multiple of this */
4894 int devs_increment;
4895 /* Number of copies */
4896 int ncopies;
4897 /* Number of stripes worth of bytes to store parity information */
4898 int nparity;
4899 u64 max_stripe_size;
4900 u64 max_chunk_size;
4901 u64 dev_extent_min;
4902 u64 stripe_size;
4903 u64 chunk_size;
4904 int ndevs;
4905 };
4906
4907 static void init_alloc_chunk_ctl_policy_regular(
4908 struct btrfs_fs_devices *fs_devices,
4909 struct alloc_chunk_ctl *ctl)
4910 {
4911 u64 type = ctl->type;
4912
4913 if (type & BTRFS_BLOCK_GROUP_DATA) {
4914 ctl->max_stripe_size = SZ_1G;
4915 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4916 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4917 /* For larger filesystems, use larger metadata chunks */
4918 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4919 ctl->max_stripe_size = SZ_1G;
4920 else
4921 ctl->max_stripe_size = SZ_256M;
4922 ctl->max_chunk_size = ctl->max_stripe_size;
4923 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4924 ctl->max_stripe_size = SZ_32M;
4925 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4926 ctl->devs_max = min_t(int, ctl->devs_max,
4927 BTRFS_MAX_DEVS_SYS_CHUNK);
4928 } else {
4929 BUG();
4930 }
4931
4932 /* We don't want a chunk larger than 10% of writable space */
4933 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4934 ctl->max_chunk_size);
4935 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4936 }
4937
4938 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4939 struct alloc_chunk_ctl *ctl)
4940 {
4941 int index = btrfs_bg_flags_to_raid_index(ctl->type);
4942
4943 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4944 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4945 ctl->devs_max = btrfs_raid_array[index].devs_max;
4946 if (!ctl->devs_max)
4947 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4948 ctl->devs_min = btrfs_raid_array[index].devs_min;
4949 ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4950 ctl->ncopies = btrfs_raid_array[index].ncopies;
4951 ctl->nparity = btrfs_raid_array[index].nparity;
4952 ctl->ndevs = 0;
4953
4954 switch (fs_devices->chunk_alloc_policy) {
4955 case BTRFS_CHUNK_ALLOC_REGULAR:
4956 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4957 break;
4958 default:
4959 BUG();
4960 }
4961 }
4962
4963 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4964 struct alloc_chunk_ctl *ctl,
4965 struct btrfs_device_info *devices_info)
4966 {
4967 struct btrfs_fs_info *info = fs_devices->fs_info;
4968 struct btrfs_device *device;
4969 u64 total_avail;
4970 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4971 int ret;
4972 int ndevs = 0;
4973 u64 max_avail;
4974 u64 dev_offset;
4975
4976 /*
4977 * in the first pass through the devices list, we gather information
4978 * about the available holes on each device.
4979 */
4980 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4981 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4982 WARN(1, KERN_ERR
4983 "BTRFS: read-only device in alloc_list\n");
4984 continue;
4985 }
4986
4987 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4988 &device->dev_state) ||
4989 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4990 continue;
4991
4992 if (device->total_bytes > device->bytes_used)
4993 total_avail = device->total_bytes - device->bytes_used;
4994 else
4995 total_avail = 0;
4996
4997 /* If there is no space on this device, skip it. */
4998 if (total_avail < ctl->dev_extent_min)
4999 continue;
5000
5001 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5002 &max_avail);
5003 if (ret && ret != -ENOSPC)
5004 return ret;
5005
5006 if (ret == 0)
5007 max_avail = dev_extent_want;
5008
5009 if (max_avail < ctl->dev_extent_min) {
5010 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5011 btrfs_debug(info,
5012 "%s: devid %llu has no free space, have=%llu want=%llu",
5013 __func__, device->devid, max_avail,
5014 ctl->dev_extent_min);
5015 continue;
5016 }
5017
5018 if (ndevs == fs_devices->rw_devices) {
5019 WARN(1, "%s: found more than %llu devices\n",
5020 __func__, fs_devices->rw_devices);
5021 break;
5022 }
5023 devices_info[ndevs].dev_offset = dev_offset;
5024 devices_info[ndevs].max_avail = max_avail;
5025 devices_info[ndevs].total_avail = total_avail;
5026 devices_info[ndevs].dev = device;
5027 ++ndevs;
5028 }
5029 ctl->ndevs = ndevs;
5030
5031 /*
5032 * now sort the devices by hole size / available space
5033 */
5034 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5035 btrfs_cmp_device_info, NULL);
5036
5037 return 0;
5038 }
5039
5040 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5041 struct btrfs_device_info *devices_info)
5042 {
5043 /* Number of stripes that count for block group size */
5044 int data_stripes;
5045
5046 /*
5047 * The primary goal is to maximize the number of stripes, so use as
5048 * many devices as possible, even if the stripes are not maximum sized.
5049 *
5050 * The DUP profile stores more than one stripe per device, the
5051 * max_avail is the total size so we have to adjust.
5052 */
5053 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5054 ctl->dev_stripes);
5055 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5056
5057 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5058 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5059
5060 /*
5061 * Use the number of data stripes to figure out how big this chunk is
5062 * really going to be in terms of logical address space, and compare
5063 * that answer with the max chunk size. If it's higher, we try to
5064 * reduce stripe_size.
5065 */
5066 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5067 /*
5068 * Reduce stripe_size, round it up to a 16MB boundary again and
5069 * then use it, unless it ends up being even bigger than the
5070 * previous value we had already.
5071 */
5072 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5073 data_stripes), SZ_16M),
5074 ctl->stripe_size);
5075 }
5076
5077 /* Align to BTRFS_STRIPE_LEN */
5078 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5079 ctl->chunk_size = ctl->stripe_size * data_stripes;
5080
5081 return 0;
5082 }
5083
5084 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5085 struct alloc_chunk_ctl *ctl,
5086 struct btrfs_device_info *devices_info)
5087 {
5088 struct btrfs_fs_info *info = fs_devices->fs_info;
5089
5090 /*
5091 * Round down to the number of usable stripes. devs_increment can be any
5092 * number, so we can't use round_down(), which requires a power of 2;
5093 * rounddown() is safe here.
5094 */
5095 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5096
5097 if (ctl->ndevs < ctl->devs_min) {
5098 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5099 btrfs_debug(info,
5100 "%s: not enough devices with free space: have=%d minimum required=%d",
5101 __func__, ctl->ndevs, ctl->devs_min);
5102 }
5103 return -ENOSPC;
5104 }
5105
5106 ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5107
5108 switch (fs_devices->chunk_alloc_policy) {
5109 case BTRFS_CHUNK_ALLOC_REGULAR:
5110 return decide_stripe_size_regular(ctl, devices_info);
5111 default:
5112 BUG();
5113 }
5114 }
5115
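/*
 * Build the in-memory map_lookup describing the physical placement of all
 * stripes, insert it as an extent_map covering the chunk's logical range,
 * create the block group and account the allocated bytes per device.
 */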
5116 static int create_chunk(struct btrfs_trans_handle *trans,
5117 struct alloc_chunk_ctl *ctl,
5118 struct btrfs_device_info *devices_info)
5119 {
5120 struct btrfs_fs_info *info = trans->fs_info;
5121 struct map_lookup *map = NULL;
5122 struct extent_map_tree *em_tree;
5123 struct extent_map *em;
5124 u64 start = ctl->start;
5125 u64 type = ctl->type;
5126 int ret;
5127 int i;
5128 int j;
5129
5130 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5131 if (!map)
5132 return -ENOMEM;
5133 map->num_stripes = ctl->num_stripes;
5134
5135 for (i = 0; i < ctl->ndevs; ++i) {
5136 for (j = 0; j < ctl->dev_stripes; ++j) {
5137 int s = i * ctl->dev_stripes + j;
5138 map->stripes[s].dev = devices_info[i].dev;
5139 map->stripes[s].physical = devices_info[i].dev_offset +
5140 j * ctl->stripe_size;
5141 }
5142 }
5143 map->stripe_len = BTRFS_STRIPE_LEN;
5144 map->io_align = BTRFS_STRIPE_LEN;
5145 map->io_width = BTRFS_STRIPE_LEN;
5146 map->type = type;
5147 map->sub_stripes = ctl->sub_stripes;
5148
5149 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5150
5151 em = alloc_extent_map();
5152 if (!em) {
5153 kfree(map);
5154 return -ENOMEM;
5155 }
5156 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5157 em->map_lookup = map;
5158 em->start = start;
5159 em->len = ctl->chunk_size;
5160 em->block_start = 0;
5161 em->block_len = em->len;
5162 em->orig_block_len = ctl->stripe_size;
5163
5164 em_tree = &info->mapping_tree;
5165 write_lock(&em_tree->lock);
5166 ret = add_extent_mapping(em_tree, em, 0);
5167 if (ret) {
5168 write_unlock(&em_tree->lock);
5169 free_extent_map(em);
5170 return ret;
5171 }
5172 write_unlock(&em_tree->lock);
5173
5174 ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5175 if (ret)
5176 goto error_del_extent;
5177
5178 for (i = 0; i < map->num_stripes; i++) {
5179 struct btrfs_device *dev = map->stripes[i].dev;
5180
5181 btrfs_device_set_bytes_used(dev,
5182 dev->bytes_used + ctl->stripe_size);
5183 if (list_empty(&dev->post_commit_list))
5184 list_add_tail(&dev->post_commit_list,
5185 &trans->transaction->dev_update_list);
5186 }
5187
5188 atomic64_sub(ctl->stripe_size * map->num_stripes,
5189 &info->free_chunk_space);
5190
5191 free_extent_map(em);
5192 check_raid56_incompat_flag(info, type);
5193 check_raid1c34_incompat_flag(info, type);
5194
5195 return 0;
5196
5197 error_del_extent:
5198 write_lock(&em_tree->lock);
5199 remove_extent_mapping(em_tree, em);
5200 write_unlock(&em_tree->lock);
5201
5202 /* One for our allocation */
5203 free_extent_map(em);
5204 /* One for the tree reference */
5205 free_extent_map(em);
5206
5207 return ret;
5208 }
5209
5210 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5211 {
5212 struct btrfs_fs_info *info = trans->fs_info;
5213 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5214 struct btrfs_device_info *devices_info = NULL;
5215 struct alloc_chunk_ctl ctl;
5216 int ret;
5217
5218 lockdep_assert_held(&info->chunk_mutex);
5219
5220 if (!alloc_profile_is_valid(type, 0)) {
5221 ASSERT(0);
5222 return -EINVAL;
5223 }
5224
5225 if (list_empty(&fs_devices->alloc_list)) {
5226 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5227 btrfs_debug(info, "%s: no writable device", __func__);
5228 return -ENOSPC;
5229 }
5230
5231 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5232 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5233 ASSERT(0);
5234 return -EINVAL;
5235 }
5236
5237 ctl.start = find_next_chunk(info);
5238 ctl.type = type;
5239 init_alloc_chunk_ctl(fs_devices, &ctl);
5240
5241 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5242 GFP_NOFS);
5243 if (!devices_info)
5244 return -ENOMEM;
5245
5246 ret = gather_device_info(fs_devices, &ctl, devices_info);
5247 if (ret < 0)
5248 goto out;
5249
5250 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5251 if (ret < 0)
5252 goto out;
5253
5254 ret = create_chunk(trans, &ctl, devices_info);
5255
5256 out:
5257 kfree(devices_info);
5258 return ret;
5259 }
5260
5261 /*
5262 * Chunk allocation falls into two parts. The first part does work
5263 * that makes the newly allocated chunk usable, but does not do any operation
5264 * that modifies the chunk tree. The second part does the work that
5265 * requires modifying the chunk tree. This division is important for the
5266 * bootstrap process of adding storage to a seed btrfs.
5267 */
5268 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5269 u64 chunk_offset, u64 chunk_size)
5270 {
5271 struct btrfs_fs_info *fs_info = trans->fs_info;
5272 struct btrfs_root *extent_root = fs_info->extent_root;
5273 struct btrfs_root *chunk_root = fs_info->chunk_root;
5274 struct btrfs_key key;
5275 struct btrfs_device *device;
5276 struct btrfs_chunk *chunk;
5277 struct btrfs_stripe *stripe;
5278 struct extent_map *em;
5279 struct map_lookup *map;
5280 size_t item_size;
5281 u64 dev_offset;
5282 u64 stripe_size;
5283 int i = 0;
5284 int ret = 0;
5285
5286 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5287 if (IS_ERR(em))
5288 return PTR_ERR(em);
5289
5290 map = em->map_lookup;
5291 item_size = btrfs_chunk_item_size(map->num_stripes);
5292 stripe_size = em->orig_block_len;
5293
5294 chunk = kzalloc(item_size, GFP_NOFS);
5295 if (!chunk) {
5296 ret = -ENOMEM;
5297 goto out;
5298 }
5299
5300 /*
5301 * Take the device list mutex to prevent races with the final phase of
5302 * a device replace operation that replaces the device object associated
5303 * with the map's stripes, because the device object's id can change
5304 * at any time during that final phase of the device replace operation
5305 * (dev-replace.c:btrfs_dev_replace_finishing()).
5306 */
5307 mutex_lock(&fs_info->fs_devices->device_list_mutex);
5308 for (i = 0; i < map->num_stripes; i++) {
5309 device = map->stripes[i].dev;
5310 dev_offset = map->stripes[i].physical;
5311
5312 ret = btrfs_update_device(trans, device);
5313 if (ret)
5314 break;
5315 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5316 dev_offset, stripe_size);
5317 if (ret)
5318 break;
5319 }
5320 if (ret) {
5321 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5322 goto out;
5323 }
5324
5325 stripe = &chunk->stripe;
5326 for (i = 0; i < map->num_stripes; i++) {
5327 device = map->stripes[i].dev;
5328 dev_offset = map->stripes[i].physical;
5329
5330 btrfs_set_stack_stripe_devid(stripe, device->devid);
5331 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5332 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5333 stripe++;
5334 }
5335 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5336
5337 btrfs_set_stack_chunk_length(chunk, chunk_size);
5338 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5339 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5340 btrfs_set_stack_chunk_type(chunk, map->type);
5341 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5342 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5343 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5344 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5345 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5346
5347 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5348 key.type = BTRFS_CHUNK_ITEM_KEY;
5349 key.offset = chunk_offset;
5350
5351 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5352 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5353 /*
5354 * TODO: Cleanup of inserted chunk root in case of
5355 * failure.
5356 */
5357 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5358 }
5359
5360 out:
5361 kfree(chunk);
5362 free_extent_map(em);
5363 return ret;
5364 }
5365
5366 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5367 {
5368 struct btrfs_fs_info *fs_info = trans->fs_info;
5369 u64 alloc_profile;
5370 int ret;
5371
5372 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5373 ret = btrfs_alloc_chunk(trans, alloc_profile);
5374 if (ret)
5375 return ret;
5376
5377 alloc_profile = btrfs_system_alloc_profile(fs_info);
5378 ret = btrfs_alloc_chunk(trans, alloc_profile);
5379 return ret;
5380 }
5381
5382 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5383 {
5384 const int index = btrfs_bg_flags_to_raid_index(map->type);
5385
5386 return btrfs_raid_array[index].tolerated_failures;
5387 }
5388
5389 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5390 {
5391 struct extent_map *em;
5392 struct map_lookup *map;
5393 int readonly = 0;
5394 int miss_ndevs = 0;
5395 int i;
5396
5397 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5398 if (IS_ERR(em))
5399 return 1;
5400
5401 map = em->map_lookup;
5402 for (i = 0; i < map->num_stripes; i++) {
5403 if (test_bit(BTRFS_DEV_STATE_MISSING,
5404 &map->stripes[i].dev->dev_state)) {
5405 miss_ndevs++;
5406 continue;
5407 }
5408 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5409 &map->stripes[i].dev->dev_state)) {
5410 readonly = 1;
5411 goto end;
5412 }
5413 }
5414
5415 /*
5416 * If the number of missing devices is larger than max errors,
5417 * we cannot write the data into that chunk successfully, so
5418 * set it read-only.
5419 */
5420 if (miss_ndevs > btrfs_chunk_max_errors(map))
5421 readonly = 1;
5422 end:
5423 free_extent_map(em);
5424 return readonly;
5425 }
5426
5427 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5428 {
5429 struct extent_map *em;
5430
5431 while (1) {
5432 write_lock(&tree->lock);
5433 em = lookup_extent_mapping(tree, 0, (u64)-1);
5434 if (em)
5435 remove_extent_mapping(tree, em);
5436 write_unlock(&tree->lock);
5437 if (!em)
5438 break;
5439 /* once for us */
5440 free_extent_map(em);
5441 /* once for the tree */
5442 free_extent_map(em);
5443 }
5444 }
5445
5446 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5447 {
5448 struct extent_map *em;
5449 struct map_lookup *map;
5450 int ret;
5451
5452 em = btrfs_get_chunk_map(fs_info, logical, len);
5453 if (IS_ERR(em))
5454 /*
5455 * We could return errors for these cases, but that could get
5456 * ugly and we'd probably end up doing the same thing anyway:
5457 * nothing else, just exit. So return 1 so the callers don't
5458 * try to use other copies.
5459 */
5460 return 1;
5461
5462 map = em->map_lookup;
5463 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5464 ret = map->num_stripes;
5465 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5466 ret = map->sub_stripes;
5467 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5468 ret = 2;
5469 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5470 /*
5471 * There could be two corrupted data stripes, we need
5472 * to loop retry in order to rebuild the correct data.
5473 *
5474 * Fail a stripe at a time on every retry except the
5475 * stripe under reconstruction.
5476 */
5477 ret = map->num_stripes;
5478 else
5479 ret = 1;
5480 free_extent_map(em);
5481
5482 down_read(&fs_info->dev_replace.rwsem);
5483 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5484 fs_info->dev_replace.tgtdev)
5485 ret++;
5486 up_read(&fs_info->dev_replace.rwsem);
5487
5488 return ret;
5489 }
5490
5491 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5492 u64 logical)
5493 {
5494 struct extent_map *em;
5495 struct map_lookup *map;
5496 unsigned long len = fs_info->sectorsize;
5497
5498 em = btrfs_get_chunk_map(fs_info, logical, len);
5499
5500 if (!WARN_ON(IS_ERR(em))) {
5501 map = em->map_lookup;
5502 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5503 len = map->stripe_len * nr_data_stripes(map);
5504 free_extent_map(em);
5505 }
5506 return len;
5507 }
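/*
 * Worked example (a sketch, assuming the usual 64KiB stripe_len): a RAID5
 * chunk on 3 devices has nr_data_stripes() == 2, so the full stripe length
 * returned above is 2 * 64KiB = 128KiB. For non-RAID56 profiles the
 * function simply returns fs_info->sectorsize.
 */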
5508
5509 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5510 {
5511 struct extent_map *em;
5512 struct map_lookup *map;
5513 int ret = 0;
5514
5515 em = btrfs_get_chunk_map(fs_info, logical, len);
5516
5517 if (!WARN_ON(IS_ERR(em))) {
5518 map = em->map_lookup;
5519 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5520 ret = 1;
5521 free_extent_map(em);
5522 }
5523 return ret;
5524 }
5525
5526 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5527 struct map_lookup *map, int first,
5528 int dev_replace_is_ongoing)
5529 {
5530 int i;
5531 int num_stripes;
5532 int preferred_mirror;
5533 int tolerance;
5534 struct btrfs_device *srcdev;
5535
5536 ASSERT((map->type &
5537 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5538
5539 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5540 num_stripes = map->sub_stripes;
5541 else
5542 num_stripes = map->num_stripes;
5543
5544 preferred_mirror = first + current->pid % num_stripes;
5545
5546 if (dev_replace_is_ongoing &&
5547 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5548 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5549 srcdev = fs_info->dev_replace.srcdev;
5550 else
5551 srcdev = NULL;
5552
5553 /*
5554 * try to avoid the drive that is the source drive for a
5555 * dev-replace procedure, only choose it if no other non-missing
5556 * mirror is available
5557 */
5558 for (tolerance = 0; tolerance < 2; tolerance++) {
5559 if (map->stripes[preferred_mirror].dev->bdev &&
5560 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5561 return preferred_mirror;
5562 for (i = first; i < first + num_stripes; i++) {
5563 if (map->stripes[i].dev->bdev &&
5564 (tolerance || map->stripes[i].dev != srcdev))
5565 return i;
5566 }
5567 }
5568
5569 /* We couldn't find one that doesn't fail. Just return something
5570 * and the IO error handling code will clean up eventually.
5571 */
5572 return preferred_mirror;
5573 }
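/*
 * Example of the mirror selection above (a sketch): for a two-way RAID1
 * chunk, first == 0 and num_stripes == 2, so preferred_mirror is
 * current->pid % 2; readers with even PIDs prefer stripe 0 and odd PIDs
 * stripe 1, spreading read load across the mirrors. The dev-replace source
 * device is only chosen on the second pass (tolerance == 1) when no other
 * present mirror is available.
 */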
5574
5575 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5576 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5577 {
5578 int i;
5579 int again = 1;
5580
5581 while (again) {
5582 again = 0;
5583 for (i = 0; i < num_stripes - 1; i++) {
5584 /* Swap if parity is on a smaller index */
5585 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5586 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5587 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5588 again = 1;
5589 }
5590 }
5591 }
5592 }
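/*
 * Before/after sketch: the parity/syndrome slots in raid_map carry sentinel
 * values far above any logical address, so the bubble sort above pushes
 * them to the end while the data stripes stay ordered by logical address,
 * with the matching entries in bbio->stripes swapped in lockstep, e.g.
 *
 *   raid_map: [ P,  0,  64K ]  ->  [ 0,  64K, P  ]
 *   stripes:  [ s0, s1, s2  ]  ->  [ s1, s2,  s0 ]
 */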
5593
5594 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5595 {
5596 struct btrfs_bio *bbio = kzalloc(
5597 /* the size of the btrfs_bio */
5598 sizeof(struct btrfs_bio) +
5599 /* plus the variable array for the stripes */
5600 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5601 /* plus the variable array for the tgt dev */
5602 sizeof(int) * (real_stripes) +
5603 /*
5604 * plus the raid_map, which includes both the tgt dev
5605 * and the stripes
5606 */
5607 sizeof(u64) * (total_stripes),
5608 GFP_NOFS|__GFP_NOFAIL);
5609
5610 atomic_set(&bbio->error, 0);
5611 refcount_set(&bbio->refs, 1);
5612
5613 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5614 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5615
5616 return bbio;
5617 }
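/*
 * Layout sketch of the single allocation above: the stripe array, the
 * tgtdev_map and the raid_map are all carved out of one buffer, which is
 * why they are located with pointer arithmetic instead of separate
 * allocations:
 *
 *   [ struct btrfs_bio ][ stripes: total_stripes ][ tgtdev_map: real_stripes ints ][ raid_map: total_stripes u64s ]
 *                        ^ bbio->stripes           ^ bbio->tgtdev_map               ^ bbio->raid_map
 */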
5618
5619 void btrfs_get_bbio(struct btrfs_bio *bbio)
5620 {
5621 WARN_ON(!refcount_read(&bbio->refs));
5622 refcount_inc(&bbio->refs);
5623 }
5624
5625 void btrfs_put_bbio(struct btrfs_bio *bbio)
5626 {
5627 if (!bbio)
5628 return;
5629 if (refcount_dec_and_test(&bbio->refs))
5630 kfree(bbio);
5631 }
5632
5633 /* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5634 /*
5635 * Note that a discard won't be sent to the target device of a
5636 * device replace.
5637 */
5638 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5639 u64 logical, u64 *length_ret,
5640 struct btrfs_bio **bbio_ret)
5641 {
5642 struct extent_map *em;
5643 struct map_lookup *map;
5644 struct btrfs_bio *bbio;
5645 u64 length = *length_ret;
5646 u64 offset;
5647 u64 stripe_nr;
5648 u64 stripe_nr_end;
5649 u64 stripe_end_offset;
5650 u64 stripe_cnt;
5651 u64 stripe_len;
5652 u64 stripe_offset;
5653 u64 num_stripes;
5654 u32 stripe_index;
5655 u32 factor = 0;
5656 u32 sub_stripes = 0;
5657 u64 stripes_per_dev = 0;
5658 u32 remaining_stripes = 0;
5659 u32 last_stripe = 0;
5660 int ret = 0;
5661 int i;
5662
5663 /* Discard always returns a bbio */
5664 ASSERT(bbio_ret);
5665
5666 em = btrfs_get_chunk_map(fs_info, logical, length);
5667 if (IS_ERR(em))
5668 return PTR_ERR(em);
5669
5670 map = em->map_lookup;
5671 /* we don't discard raid56 yet */
5672 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5673 ret = -EOPNOTSUPP;
5674 goto out;
5675 }
5676
5677 offset = logical - em->start;
5678 length = min_t(u64, em->start + em->len - logical, length);
5679 *length_ret = length;
5680
5681 stripe_len = map->stripe_len;
5682 /*
5683 * stripe_nr counts the total number of stripes we have to stride
5684 * to get to this block
5685 */
5686 stripe_nr = div64_u64(offset, stripe_len);
5687
5688 /* stripe_offset is the offset of this block in its stripe */
5689 stripe_offset = offset - stripe_nr * stripe_len;
5690
5691 stripe_nr_end = round_up(offset + length, map->stripe_len);
5692 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5693 stripe_cnt = stripe_nr_end - stripe_nr;
5694 stripe_end_offset = stripe_nr_end * map->stripe_len -
5695 (offset + length);
5696 /*
5697 * after this, stripe_nr is the number of stripes on this
5698 * device we have to walk to find the data, and stripe_index is
5699 * the number of our device in the stripe array
5700 */
5701 num_stripes = 1;
5702 stripe_index = 0;
5703 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5704 BTRFS_BLOCK_GROUP_RAID10)) {
5705 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5706 sub_stripes = 1;
5707 else
5708 sub_stripes = map->sub_stripes;
5709
5710 factor = map->num_stripes / sub_stripes;
5711 num_stripes = min_t(u64, map->num_stripes,
5712 sub_stripes * stripe_cnt);
5713 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5714 stripe_index *= sub_stripes;
5715 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5716 &remaining_stripes);
5717 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5718 last_stripe *= sub_stripes;
5719 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5720 BTRFS_BLOCK_GROUP_DUP)) {
5721 num_stripes = map->num_stripes;
5722 } else {
5723 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5724 &stripe_index);
5725 }
5726
5727 bbio = alloc_btrfs_bio(num_stripes, 0);
5728 if (!bbio) {
5729 ret = -ENOMEM;
5730 goto out;
5731 }
5732
5733 for (i = 0; i < num_stripes; i++) {
5734 bbio->stripes[i].physical =
5735 map->stripes[stripe_index].physical +
5736 stripe_offset + stripe_nr * map->stripe_len;
5737 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5738
5739 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5740 BTRFS_BLOCK_GROUP_RAID10)) {
5741 bbio->stripes[i].length = stripes_per_dev *
5742 map->stripe_len;
5743
5744 if (i / sub_stripes < remaining_stripes)
5745 bbio->stripes[i].length +=
5746 map->stripe_len;
5747
5748 /*
5749 * Special for the first stripe and
5750 * the last stripe:
5751 *
5752 * |-------|...|-------|
5753 * |----------|
5754 * off end_off
5755 */
5756 if (i < sub_stripes)
5757 bbio->stripes[i].length -=
5758 stripe_offset;
5759
5760 if (stripe_index >= last_stripe &&
5761 stripe_index <= (last_stripe +
5762 sub_stripes - 1))
5763 bbio->stripes[i].length -=
5764 stripe_end_offset;
5765
5766 if (i == sub_stripes - 1)
5767 stripe_offset = 0;
5768 } else {
5769 bbio->stripes[i].length = length;
5770 }
5771
5772 stripe_index++;
5773 if (stripe_index == map->num_stripes) {
5774 stripe_index = 0;
5775 stripe_nr++;
5776 }
5777 }
5778
5779 *bbio_ret = bbio;
5780 bbio->map_type = map->type;
5781 bbio->num_stripes = num_stripes;
5782 out:
5783 free_extent_map(em);
5784 return ret;
5785 }
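/*
 * Worked example of the discard stripe accounting above (a sketch, assuming
 * a SINGLE profile chunk with the usual 64KiB stripe_len): discarding
 * 160KiB starting 96KiB into the chunk gives
 *
 *   stripe_nr         = 96K / 64K               = 1
 *   stripe_offset     = 96K - 1 * 64K           = 32K
 *   stripe_nr_end     = round_up(256K, 64K)/64K = 4
 *   stripe_cnt        = 4 - 1                   = 3
 *   stripe_end_offset = 4 * 64K - 256K          = 0
 *
 * and a single bbio stripe of length 160KiB at physical + 64K + 32K on the
 * one backing device.
 */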
5786
5787 /*
5788 * In dev-replace case, for repair case (that's the only case where the mirror
5789 * is selected explicitly when calling btrfs_map_block), blocks left of the
5790 * left cursor can also be read from the target drive.
5791 *
5792 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5793 * array of stripes.
5794 * For READ, it also needs to be supported using the same mirror number.
5795 *
5796 * If the requested block is not left of the left cursor, EIO is returned. This
5797 * can happen because btrfs_num_copies() returns one more in the dev-replace
5798 * case.
5799 */
5800 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5801 u64 logical, u64 length,
5802 u64 srcdev_devid, int *mirror_num,
5803 u64 *physical)
5804 {
5805 struct btrfs_bio *bbio = NULL;
5806 int num_stripes;
5807 int index_srcdev = 0;
5808 int found = 0;
5809 u64 physical_of_found = 0;
5810 int i;
5811 int ret = 0;
5812
5813 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5814 logical, &length, &bbio, 0, 0);
5815 if (ret) {
5816 ASSERT(bbio == NULL);
5817 return ret;
5818 }
5819
5820 num_stripes = bbio->num_stripes;
5821 if (*mirror_num > num_stripes) {
5822 /*
5823 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5824 * that means that the requested area is not left of the left
5825 * cursor
5826 */
5827 btrfs_put_bbio(bbio);
5828 return -EIO;
5829 }
5830
5831 /*
5832 * Process the rest of the function using the mirror_num of the source
5833 * drive. Therefore look it up first. At the end, patch the device
5834 * pointer to point to the target drive.
5835 */
5836 for (i = 0; i < num_stripes; i++) {
5837 if (bbio->stripes[i].dev->devid != srcdev_devid)
5838 continue;
5839
5840 /*
5841 * In case of DUP, in order to keep it simple, only add the
5842 * mirror with the lowest physical address
5843 */
5844 if (found &&
5845 physical_of_found <= bbio->stripes[i].physical)
5846 continue;
5847
5848 index_srcdev = i;
5849 found = 1;
5850 physical_of_found = bbio->stripes[i].physical;
5851 }
5852
5853 btrfs_put_bbio(bbio);
5854
5855 ASSERT(found);
5856 if (!found)
5857 return -EIO;
5858
5859 *mirror_num = index_srcdev + 1;
5860 *physical = physical_of_found;
5861 return ret;
5862 }
5863
5864 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5865 struct btrfs_bio **bbio_ret,
5866 struct btrfs_dev_replace *dev_replace,
5867 int *num_stripes_ret, int *max_errors_ret)
5868 {
5869 struct btrfs_bio *bbio = *bbio_ret;
5870 u64 srcdev_devid = dev_replace->srcdev->devid;
5871 int tgtdev_indexes = 0;
5872 int num_stripes = *num_stripes_ret;
5873 int max_errors = *max_errors_ret;
5874 int i;
5875
5876 if (op == BTRFS_MAP_WRITE) {
5877 int index_where_to_add;
5878
5879 /*
5880 * duplicate the write operations while the dev replace
5881 * procedure is running. Since the copying of the old disk to
5882 * the new disk takes place at run time while the filesystem is
5883 * mounted writable, the regular write operations to the old
5884 * disk have to be duplicated to go to the new disk as well.
5885 *
5886 * Note that device->missing is handled by the caller, and that
5887 * the write to the old disk is already set up in the stripes
5888 * array.
5889 */
5890 index_where_to_add = num_stripes;
5891 for (i = 0; i < num_stripes; i++) {
5892 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5893 /* write to new disk, too */
5894 struct btrfs_bio_stripe *new =
5895 bbio->stripes + index_where_to_add;
5896 struct btrfs_bio_stripe *old =
5897 bbio->stripes + i;
5898
5899 new->physical = old->physical;
5900 new->length = old->length;
5901 new->dev = dev_replace->tgtdev;
5902 bbio->tgtdev_map[i] = index_where_to_add;
5903 index_where_to_add++;
5904 max_errors++;
5905 tgtdev_indexes++;
5906 }
5907 }
5908 num_stripes = index_where_to_add;
5909 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5910 int index_srcdev = 0;
5911 int found = 0;
5912 u64 physical_of_found = 0;
5913
5914 /*
5915 * During the dev-replace procedure, the target drive can also
5916 * be used to read data in case it is needed to repair a corrupt
5917 * block elsewhere. This is possible if the requested area is
5918 * left of the left cursor. In this area, the target drive is a
5919 * full copy of the source drive.
5920 */
5921 for (i = 0; i < num_stripes; i++) {
5922 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5923 /*
5924 * In case of DUP, in order to keep it simple,
5925 * only add the mirror with the lowest physical
5926 * address
5927 */
5928 if (found &&
5929 physical_of_found <=
5930 bbio->stripes[i].physical)
5931 continue;
5932 index_srcdev = i;
5933 found = 1;
5934 physical_of_found = bbio->stripes[i].physical;
5935 }
5936 }
5937 if (found) {
5938 struct btrfs_bio_stripe *tgtdev_stripe =
5939 bbio->stripes + num_stripes;
5940
5941 tgtdev_stripe->physical = physical_of_found;
5942 tgtdev_stripe->length =
5943 bbio->stripes[index_srcdev].length;
5944 tgtdev_stripe->dev = dev_replace->tgtdev;
5945 bbio->tgtdev_map[index_srcdev] = num_stripes;
5946
5947 tgtdev_indexes++;
5948 num_stripes++;
5949 }
5950 }
5951
5952 *num_stripes_ret = num_stripes;
5953 *max_errors_ret = max_errors;
5954 bbio->num_tgtdevs = tgtdev_indexes;
5955 *bbio_ret = bbio;
5956 }
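/*
 * Example of the WRITE case above (a sketch): for a two-way RAID1 bbio
 * where stripe 0 sits on the device being replaced, a third stripe is
 * appended with the same physical offset but dev = dev_replace->tgtdev,
 * tgtdev_map[0] records its index (2), and num_stripes and max_errors each
 * grow by one so the extra write is tolerated like any other mirror.
 */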
5957
5958 static bool need_full_stripe(enum btrfs_map_op op)
5959 {
5960 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5961 }
5962
5963 /*
5964 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5965 * tuple. This information is used to calculate how big a
5966 * particular bio can get before it straddles a stripe.
5967 *
5968 * @fs_info - the filesystem
5969 * @logical - address that we want to figure out the geometry of
5970 * @len - the length of IO we are going to perform, starting at @logical
5971 * @op - type of operation - write or read
5972 * @io_geom - pointer used to return values
5973 *
5974 * Returns < 0 in case a chunk for the given logical address cannot be found,
5975 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5976 */
5977 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5978 u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5979 {
5980 struct extent_map *em;
5981 struct map_lookup *map;
5982 u64 offset;
5983 u64 stripe_offset;
5984 u64 stripe_nr;
5985 u64 stripe_len;
5986 u64 raid56_full_stripe_start = (u64)-1;
5987 int data_stripes;
5988 int ret = 0;
5989
5990 ASSERT(op != BTRFS_MAP_DISCARD);
5991
5992 em = btrfs_get_chunk_map(fs_info, logical, len);
5993 if (IS_ERR(em))
5994 return PTR_ERR(em);
5995
5996 map = em->map_lookup;
5997 /* Offset of this logical address in the chunk */
5998 offset = logical - em->start;
5999 /* Len of a stripe in a chunk */
6000 stripe_len = map->stripe_len;
6001 /* Stripe where this block falls in */
6002 stripe_nr = div64_u64(offset, stripe_len);
6003 /* Offset of stripe in the chunk */
6004 stripe_offset = stripe_nr * stripe_len;
6005 if (offset < stripe_offset) {
6006 btrfs_crit(fs_info,
6007 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6008 stripe_offset, offset, em->start, logical, stripe_len);
6009 ret = -EINVAL;
6010 goto out;
6011 }
6012
6013 /* stripe_offset is the offset of this block in its stripe */
6014 stripe_offset = offset - stripe_offset;
6015 data_stripes = nr_data_stripes(map);
6016
6017 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6018 u64 max_len = stripe_len - stripe_offset;
6019
6020 /*
6021 * In case of raid56, we need to know the stripe aligned start
6022 */
6023 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6024 unsigned long full_stripe_len = stripe_len * data_stripes;
6025 raid56_full_stripe_start = offset;
6026
6027 /*
6028 * Allow a write of a full stripe, but make sure we
6029 * don't allow straddling of stripes
6030 */
6031 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6032 full_stripe_len);
6033 raid56_full_stripe_start *= full_stripe_len;
6034
6035 /*
6036 * For writes to RAID[56], allow a full stripeset across
6037 * all disks. For other RAID types and for RAID[56]
6038 * reads, just allow a single stripe (on a single disk).
6039 */
6040 if (op == BTRFS_MAP_WRITE) {
6041 max_len = stripe_len * data_stripes -
6042 (offset - raid56_full_stripe_start);
6043 }
6044 }
6045 len = min_t(u64, em->len - offset, max_len);
6046 } else {
6047 len = em->len - offset;
6048 }
6049
6050 io_geom->len = len;
6051 io_geom->offset = offset;
6052 io_geom->stripe_len = stripe_len;
6053 io_geom->stripe_nr = stripe_nr;
6054 io_geom->stripe_offset = stripe_offset;
6055 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6056
6057 out:
6058 /* once for us */
6059 free_extent_map(em);
6060 return ret;
6061 }
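/*
 * Worked example (a sketch, assuming a striped non-RAID56 profile with the
 * usual 64KiB stripe_len): for an IO starting 150KiB into the chunk,
 *
 *   stripe_nr     = 150K / 64K     = 2
 *   stripe_offset = 150K - 2 * 64K = 22K
 *   max_len       = 64K - 22K      = 42K
 *
 * so io_geom->len is capped at 42KiB and the caller can size the bio so it
 * does not straddle the next stripe boundary.
 */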
6062
6063 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6064 enum btrfs_map_op op,
6065 u64 logical, u64 *length,
6066 struct btrfs_bio **bbio_ret,
6067 int mirror_num, int need_raid_map)
6068 {
6069 struct extent_map *em;
6070 struct map_lookup *map;
6071 u64 stripe_offset;
6072 u64 stripe_nr;
6073 u64 stripe_len;
6074 u32 stripe_index;
6075 int data_stripes;
6076 int i;
6077 int ret = 0;
6078 int num_stripes;
6079 int max_errors = 0;
6080 int tgtdev_indexes = 0;
6081 struct btrfs_bio *bbio = NULL;
6082 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6083 int dev_replace_is_ongoing = 0;
6084 int num_alloc_stripes;
6085 int patch_the_first_stripe_for_dev_replace = 0;
6086 u64 physical_to_patch_in_first_stripe = 0;
6087 u64 raid56_full_stripe_start = (u64)-1;
6088 struct btrfs_io_geometry geom;
6089
6090 ASSERT(bbio_ret);
6091 ASSERT(op != BTRFS_MAP_DISCARD);
6092
6093 ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6094 if (ret < 0)
6095 return ret;
6096
6097 em = btrfs_get_chunk_map(fs_info, logical, *length);
6098 ASSERT(!IS_ERR(em));
6099 map = em->map_lookup;
6100
6101 *length = geom.len;
6102 stripe_len = geom.stripe_len;
6103 stripe_nr = geom.stripe_nr;
6104 stripe_offset = geom.stripe_offset;
6105 raid56_full_stripe_start = geom.raid56_stripe_offset;
6106 data_stripes = nr_data_stripes(map);
6107
6108 down_read(&dev_replace->rwsem);
6109 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6110 /*
6111 * Hold the semaphore for read during the whole operation, write is
6112 * requested at commit time but must wait.
6113 */
6114 if (!dev_replace_is_ongoing)
6115 up_read(&dev_replace->rwsem);
6116
6117 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6118 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6119 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6120 dev_replace->srcdev->devid,
6121 &mirror_num,
6122 &physical_to_patch_in_first_stripe);
6123 if (ret)
6124 goto out;
6125 else
6126 patch_the_first_stripe_for_dev_replace = 1;
6127 } else if (mirror_num > map->num_stripes) {
6128 mirror_num = 0;
6129 }
6130
6131 num_stripes = 1;
6132 stripe_index = 0;
6133 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6134 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6135 &stripe_index);
6136 if (!need_full_stripe(op))
6137 mirror_num = 1;
6138 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6139 if (need_full_stripe(op))
6140 num_stripes = map->num_stripes;
6141 else if (mirror_num)
6142 stripe_index = mirror_num - 1;
6143 else {
6144 stripe_index = find_live_mirror(fs_info, map, 0,
6145 dev_replace_is_ongoing);
6146 mirror_num = stripe_index + 1;
6147 }
6148
6149 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6150 if (need_full_stripe(op)) {
6151 num_stripes = map->num_stripes;
6152 } else if (mirror_num) {
6153 stripe_index = mirror_num - 1;
6154 } else {
6155 mirror_num = 1;
6156 }
6157
6158 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6159 u32 factor = map->num_stripes / map->sub_stripes;
6160
6161 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6162 stripe_index *= map->sub_stripes;
6163
6164 if (need_full_stripe(op))
6165 num_stripes = map->sub_stripes;
6166 else if (mirror_num)
6167 stripe_index += mirror_num - 1;
6168 else {
6169 int old_stripe_index = stripe_index;
6170 stripe_index = find_live_mirror(fs_info, map,
6171 stripe_index,
6172 dev_replace_is_ongoing);
6173 mirror_num = stripe_index - old_stripe_index + 1;
6174 }
6175
6176 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6177 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6178 /* push stripe_nr back to the start of the full stripe */
6179 stripe_nr = div64_u64(raid56_full_stripe_start,
6180 stripe_len * data_stripes);
6181
6182 /* RAID[56] write or recovery. Return all stripes */
6183 num_stripes = map->num_stripes;
6184 max_errors = nr_parity_stripes(map);
6185
6186 *length = map->stripe_len;
6187 stripe_index = 0;
6188 stripe_offset = 0;
6189 } else {
6190 /*
6191 * Mirror #0 or #1 means the original data block.
6192 * Mirror #2 is RAID5 parity block.
6193 * Mirror #3 is RAID6 Q block.
6194 */
6195 stripe_nr = div_u64_rem(stripe_nr,
6196 data_stripes, &stripe_index);
6197 if (mirror_num > 1)
6198 stripe_index = data_stripes + mirror_num - 2;
6199
6200 /* We distribute the parity blocks across stripes */
6201 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6202 &stripe_index);
6203 if (!need_full_stripe(op) && mirror_num <= 1)
6204 mirror_num = 1;
6205 }
6206 } else {
6207 /*
6208 * after this, stripe_nr is the number of stripes on this
6209 * device we have to walk to find the data, and stripe_index is
6210 * the number of our device in the stripe array
6211 */
6212 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6213 &stripe_index);
6214 mirror_num = stripe_index + 1;
6215 }
6216 if (stripe_index >= map->num_stripes) {
6217 btrfs_crit(fs_info,
6218 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6219 stripe_index, map->num_stripes);
6220 ret = -EINVAL;
6221 goto out;
6222 }
6223
6224 num_alloc_stripes = num_stripes;
6225 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6226 if (op == BTRFS_MAP_WRITE)
6227 num_alloc_stripes <<= 1;
6228 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6229 num_alloc_stripes++;
6230 tgtdev_indexes = num_stripes;
6231 }
6232
6233 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6234 if (!bbio) {
6235 ret = -ENOMEM;
6236 goto out;
6237 }
6238
6239 for (i = 0; i < num_stripes; i++) {
6240 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6241 stripe_offset + stripe_nr * map->stripe_len;
6242 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6243 stripe_index++;
6244 }
6245
6246 /* build raid_map */
6247 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6248 (need_full_stripe(op) || mirror_num > 1)) {
6249 u64 tmp;
6250 unsigned rot;
6251
6252 /* Work out the disk rotation on this stripe-set */
6253 div_u64_rem(stripe_nr, num_stripes, &rot);
6254
6255 /* Fill in the logical address of each stripe */
6256 tmp = stripe_nr * data_stripes;
6257 for (i = 0; i < data_stripes; i++)
6258 bbio->raid_map[(i+rot) % num_stripes] =
6259 em->start + (tmp + i) * map->stripe_len;
6260
6261 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6262 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6263 bbio->raid_map[(i+rot+1) % num_stripes] =
6264 RAID6_Q_STRIPE;
6265
6266 sort_parity_stripes(bbio, num_stripes);
6267 }
6268
6269 if (need_full_stripe(op))
6270 max_errors = btrfs_chunk_max_errors(map);
6271
6272 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6273 need_full_stripe(op)) {
6274 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6275 &max_errors);
6276 }
6277
6278 *bbio_ret = bbio;
6279 bbio->map_type = map->type;
6280 bbio->num_stripes = num_stripes;
6281 bbio->max_errors = max_errors;
6282 bbio->mirror_num = mirror_num;
6283
6284 /*
6285 * this is the case that REQ_READ && dev_replace_is_ongoing &&
6286 * mirror_num == num_stripes + 1 && dev_replace target drive is
6287 * available as a mirror
6288 */
6289 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6290 WARN_ON(num_stripes > 1);
6291 bbio->stripes[0].dev = dev_replace->tgtdev;
6292 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6293 bbio->mirror_num = map->num_stripes + 1;
6294 }
6295 out:
6296 if (dev_replace_is_ongoing) {
6297 lockdep_assert_held(&dev_replace->rwsem);
6298 /* Unlock and let waiting writers proceed */
6299 up_read(&dev_replace->rwsem);
6300 }
6301 free_extent_map(em);
6302 return ret;
6303 }
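/*
 * Worked example of the RAID10 index math above (a sketch): a 4-device
 * RAID10 chunk has sub_stripes == 2, so factor == 2. For logical stripe
 * number 5, div_u64_rem() leaves stripe_nr == 2 and stripe_index == 1,
 * which is scaled to 2, i.e. the second mirror pair (devices 2 and 3) at
 * per-device stripe 2. A plain read then picks one live member of that
 * pair via find_live_mirror(); a write gets both, as num_stripes ==
 * sub_stripes.
 */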
6304
6305 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6306 u64 logical, u64 *length,
6307 struct btrfs_bio **bbio_ret, int mirror_num)
6308 {
6309 if (op == BTRFS_MAP_DISCARD)
6310 return __btrfs_map_block_for_discard(fs_info, logical,
6311 length, bbio_ret);
6312
6313 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6314 mirror_num, 0);
6315 }
6316
6317 /* For Scrub/replace */
6318 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6319 u64 logical, u64 *length,
6320 struct btrfs_bio **bbio_ret)
6321 {
6322 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6323 }
6324
6325 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6326 {
6327 bio->bi_private = bbio->private;
6328 bio->bi_end_io = bbio->end_io;
6329 bio_endio(bio);
6330
6331 btrfs_put_bbio(bbio);
6332 }
6333
6334 static void btrfs_end_bio(struct bio *bio)
6335 {
6336 struct btrfs_bio *bbio = bio->bi_private;
6337 int is_orig_bio = 0;
6338
6339 if (bio->bi_status) {
6340 atomic_inc(&bbio->error);
6341 if (bio->bi_status == BLK_STS_IOERR ||
6342 bio->bi_status == BLK_STS_TARGET) {
6343 struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6344
6345 ASSERT(dev->bdev);
6346 if (bio_op(bio) == REQ_OP_WRITE)
6347 btrfs_dev_stat_inc_and_print(dev,
6348 BTRFS_DEV_STAT_WRITE_ERRS);
6349 else if (!(bio->bi_opf & REQ_RAHEAD))
6350 btrfs_dev_stat_inc_and_print(dev,
6351 BTRFS_DEV_STAT_READ_ERRS);
6352 if (bio->bi_opf & REQ_PREFLUSH)
6353 btrfs_dev_stat_inc_and_print(dev,
6354 BTRFS_DEV_STAT_FLUSH_ERRS);
6355 }
6356 }
6357
6358 if (bio == bbio->orig_bio)
6359 is_orig_bio = 1;
6360
6361 btrfs_bio_counter_dec(bbio->fs_info);
6362
6363 if (atomic_dec_and_test(&bbio->stripes_pending)) {
6364 if (!is_orig_bio) {
6365 bio_put(bio);
6366 bio = bbio->orig_bio;
6367 }
6368
6369 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6370 /* only send an error to the higher layers if it is
6371 * beyond the tolerance of the btrfs bio
6372 */
6373 if (atomic_read(&bbio->error) > bbio->max_errors) {
6374 bio->bi_status = BLK_STS_IOERR;
6375 } else {
6376 /*
6377 * this bio is actually up to date, we didn't
6378 * go over the max number of errors
6379 */
6380 bio->bi_status = BLK_STS_OK;
6381 }
6382
6383 btrfs_end_bbio(bbio, bio);
6384 } else if (!is_orig_bio) {
6385 bio_put(bio);
6386 }
6387 }
6388
6389 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6390 u64 physical, struct btrfs_device *dev)
6391 {
6392 struct btrfs_fs_info *fs_info = bbio->fs_info;
6393
6394 bio->bi_private = bbio;
6395 btrfs_io_bio(bio)->device = dev;
6396 bio->bi_end_io = btrfs_end_bio;
6397 bio->bi_iter.bi_sector = physical >> 9;
6398 btrfs_debug_in_rcu(fs_info,
6399 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6400 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6401 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6402 dev->devid, bio->bi_iter.bi_size);
6403 bio_set_dev(bio, dev->bdev);
6404
6405 btrfs_bio_counter_inc_noblocked(fs_info);
6406
6407 btrfsic_submit_bio(bio);
6408 }
6409
6410 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6411 {
6412 atomic_inc(&bbio->error);
6413 if (atomic_dec_and_test(&bbio->stripes_pending)) {
6414 /* Should be the original bio. */
6415 WARN_ON(bio != bbio->orig_bio);
6416
6417 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6418 bio->bi_iter.bi_sector = logical >> 9;
6419 if (atomic_read(&bbio->error) > bbio->max_errors)
6420 bio->bi_status = BLK_STS_IOERR;
6421 else
6422 bio->bi_status = BLK_STS_OK;
6423 btrfs_end_bbio(bbio, bio);
6424 }
6425 }
6426
6427 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6428 int mirror_num)
6429 {
6430 struct btrfs_device *dev;
6431 struct bio *first_bio = bio;
6432 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6433 u64 length = 0;
6434 u64 map_length;
6435 int ret;
6436 int dev_nr;
6437 int total_devs;
6438 struct btrfs_bio *bbio = NULL;
6439
6440 length = bio->bi_iter.bi_size;
6441 map_length = length;
6442
6443 btrfs_bio_counter_inc_blocked(fs_info);
6444 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6445 &map_length, &bbio, mirror_num, 1);
6446 if (ret) {
6447 btrfs_bio_counter_dec(fs_info);
6448 return errno_to_blk_status(ret);
6449 }
6450
6451 total_devs = bbio->num_stripes;
6452 bbio->orig_bio = first_bio;
6453 bbio->private = first_bio->bi_private;
6454 bbio->end_io = first_bio->bi_end_io;
6455 bbio->fs_info = fs_info;
6456 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6457
6458 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6459 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6460 /* In this case, map_length has been set to the length of
6461 * a single stripe; not the whole write */
6462 if (bio_op(bio) == REQ_OP_WRITE) {
6463 ret = raid56_parity_write(fs_info, bio, bbio,
6464 map_length);
6465 } else {
6466 ret = raid56_parity_recover(fs_info, bio, bbio,
6467 map_length, mirror_num, 1);
6468 }
6469
6470 btrfs_bio_counter_dec(fs_info);
6471 return errno_to_blk_status(ret);
6472 }
6473
6474 if (map_length < length) {
6475 btrfs_crit(fs_info,
6476 "mapping failed logical %llu bio len %llu len %llu",
6477 logical, length, map_length);
6478 BUG();
6479 }
6480
6481 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6482 dev = bbio->stripes[dev_nr].dev;
6483 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6484 &dev->dev_state) ||
6485 (bio_op(first_bio) == REQ_OP_WRITE &&
6486 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6487 bbio_error(bbio, first_bio, logical);
6488 continue;
6489 }
6490
6491 if (dev_nr < total_devs - 1)
6492 bio = btrfs_bio_clone(first_bio);
6493 else
6494 bio = first_bio;
6495
6496 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6497 }
6498 btrfs_bio_counter_dec(fs_info);
6499 return BLK_STS_OK;
6500 }
6501
6502 /*
6503 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6504 * return NULL.
6505 *
6506 * If devid and uuid are both specified, the match must be exact, otherwise
6507 * only devid is used.
6508 *
6509 * If @seed is true, traverse through the seed devices.
6510 */
6511 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6512 u64 devid, u8 *uuid, u8 *fsid,
6513 bool seed)
6514 {
6515 struct btrfs_device *device;
6516 struct btrfs_fs_devices *seed_devs;
6517
6518 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6519 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6520 if (device->devid == devid &&
6521 (!uuid || memcmp(device->uuid, uuid,
6522 BTRFS_UUID_SIZE) == 0))
6523 return device;
6524 }
6525 }
6526
6527 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6528 if (!fsid ||
6529 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6530 list_for_each_entry(device, &seed_devs->devices,
6531 dev_list) {
6532 if (device->devid == devid &&
6533 (!uuid || memcmp(device->uuid, uuid,
6534 BTRFS_UUID_SIZE) == 0))
6535 return device;
6536 }
6537 }
6538 }
6539
6540 return NULL;
6541 }
6542
6543 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6544 u64 devid, u8 *dev_uuid)
6545 {
6546 struct btrfs_device *device;
6547 unsigned int nofs_flag;
6548
6549 /*
6550 * We call this under the chunk_mutex, so we want to use NOFS for this
6551 * allocation, however we don't want to change btrfs_alloc_device() to
6552 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6553 * places.
6554 */
6555 nofs_flag = memalloc_nofs_save();
6556 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6557 memalloc_nofs_restore(nofs_flag);
6558 if (IS_ERR(device))
6559 return device;
6560
6561 list_add(&device->dev_list, &fs_devices->devices);
6562 device->fs_devices = fs_devices;
6563 fs_devices->num_devices++;
6564
6565 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6566 fs_devices->missing_devices++;
6567
6568 return device;
6569 }
6570
6571 /**
6572 * btrfs_alloc_device - allocate struct btrfs_device
6573 * @fs_info: used only for generating a new devid, can be NULL if
6574 * devid is provided (i.e. @devid != NULL).
6575 * @devid: a pointer to devid for this device. If NULL a new devid
6576 * is generated.
6577 * @uuid: a pointer to UUID for this device. If NULL a new UUID
6578 * is generated.
6579 *
6580 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6581 * on error. Returned struct is not linked onto any lists and must be
6582 * destroyed with btrfs_free_device.
6583 */
6584 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6585 const u64 *devid,
6586 const u8 *uuid)
6587 {
6588 struct btrfs_device *dev;
6589 u64 tmp;
6590
6591 if (WARN_ON(!devid && !fs_info))
6592 return ERR_PTR(-EINVAL);
6593
6594 dev = __alloc_device(fs_info);
6595 if (IS_ERR(dev))
6596 return dev;
6597
6598 if (devid)
6599 tmp = *devid;
6600 else {
6601 int ret;
6602
6603 ret = find_next_devid(fs_info, &tmp);
6604 if (ret) {
6605 btrfs_free_device(dev);
6606 return ERR_PTR(ret);
6607 }
6608 }
6609 dev->devid = tmp;
6610
6611 if (uuid)
6612 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6613 else
6614 generate_random_uuid(dev->uuid);
6615
6616 return dev;
6617 }
6618
6619 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6620 u64 devid, u8 *uuid, bool error)
6621 {
6622 if (error)
6623 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6624 devid, uuid);
6625 else
6626 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6627 devid, uuid);
6628 }
6629
6630 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6631 {
6632 int index = btrfs_bg_flags_to_raid_index(type);
6633 int ncopies = btrfs_raid_array[index].ncopies;
6634 const int nparity = btrfs_raid_array[index].nparity;
6635 int data_stripes;
6636
6637 if (nparity)
6638 data_stripes = num_stripes - nparity;
6639 else
6640 data_stripes = num_stripes / ncopies;
6641
6642 return div_u64(chunk_len, data_stripes);
6643 }
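/*
 * Worked examples (a sketch) for the per-device stripe length computed
 * above: a 4GiB RAID0 chunk over 4 stripes has 4 data stripes, so each
 * device extent is 1GiB; a 4GiB RAID1 chunk (2 stripes, 2 copies) has a
 * single data stripe, so each device extent is the full 4GiB; a 3GiB RAID5
 * chunk over 4 stripes has 4 - 1 = 3 data stripes, so each device extent
 * is 1GiB.
 */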
6644
6645 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6646 struct btrfs_chunk *chunk)
6647 {
6648 struct btrfs_fs_info *fs_info = leaf->fs_info;
6649 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6650 struct map_lookup *map;
6651 struct extent_map *em;
6652 u64 logical;
6653 u64 length;
6654 u64 devid;
6655 u8 uuid[BTRFS_UUID_SIZE];
6656 int num_stripes;
6657 int ret;
6658 int i;
6659
6660 logical = key->offset;
6661 length = btrfs_chunk_length(leaf, chunk);
6662 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6663
6664 /*
6665 * We only need to verify the chunk item if we're reading from the sys chunk
6666 * array; a chunk item in a tree block is already verified by the tree-checker.
6667 */
6668 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6669 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6670 if (ret)
6671 return ret;
6672 }
6673
6674 read_lock(&map_tree->lock);
6675 em = lookup_extent_mapping(map_tree, logical, 1);
6676 read_unlock(&map_tree->lock);
6677
6678 /* already mapped? */
6679 if (em && em->start <= logical && em->start + em->len > logical) {
6680 free_extent_map(em);
6681 return 0;
6682 } else if (em) {
6683 free_extent_map(em);
6684 }
6685
6686 em = alloc_extent_map();
6687 if (!em)
6688 return -ENOMEM;
6689 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6690 if (!map) {
6691 free_extent_map(em);
6692 return -ENOMEM;
6693 }
6694
6695 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6696 em->map_lookup = map;
6697 em->start = logical;
6698 em->len = length;
6699 em->orig_start = 0;
6700 em->block_start = 0;
6701 em->block_len = em->len;
6702
6703 map->num_stripes = num_stripes;
6704 map->io_width = btrfs_chunk_io_width(leaf, chunk);
6705 map->io_align = btrfs_chunk_io_align(leaf, chunk);
6706 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6707 map->type = btrfs_chunk_type(leaf, chunk);
6708 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6709 map->verified_stripes = 0;
6710 em->orig_block_len = calc_stripe_length(map->type, em->len,
6711 map->num_stripes);
6712 for (i = 0; i < num_stripes; i++) {
6713 map->stripes[i].physical =
6714 btrfs_stripe_offset_nr(leaf, chunk, i);
6715 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6716 read_extent_buffer(leaf, uuid, (unsigned long)
6717 btrfs_stripe_dev_uuid_nr(chunk, i),
6718 BTRFS_UUID_SIZE);
6719 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6720 devid, uuid, NULL, true);
6721 if (!map->stripes[i].dev &&
6722 !btrfs_test_opt(fs_info, DEGRADED)) {
6723 free_extent_map(em);
6724 btrfs_report_missing_device(fs_info, devid, uuid, true);
6725 return -ENOENT;
6726 }
6727 if (!map->stripes[i].dev) {
6728 map->stripes[i].dev =
6729 add_missing_dev(fs_info->fs_devices, devid,
6730 uuid);
6731 if (IS_ERR(map->stripes[i].dev)) {
6732 free_extent_map(em);
6733 btrfs_err(fs_info,
6734 "failed to init missing dev %llu: %ld",
6735 devid, PTR_ERR(map->stripes[i].dev));
6736 return PTR_ERR(map->stripes[i].dev);
6737 }
6738 btrfs_report_missing_device(fs_info, devid, uuid, false);
6739 }
6740 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6741 &(map->stripes[i].dev->dev_state));
6742
6743 }
6744
6745 write_lock(&map_tree->lock);
6746 ret = add_extent_mapping(map_tree, em, 0);
6747 write_unlock(&map_tree->lock);
6748 if (ret < 0) {
6749 btrfs_err(fs_info,
6750 "failed to add chunk map, start=%llu len=%llu: %d",
6751 em->start, em->len, ret);
6752 }
6753 free_extent_map(em);
6754
6755 return ret;
6756 }
6757
6758 static void fill_device_from_item(struct extent_buffer *leaf,
6759 struct btrfs_dev_item *dev_item,
6760 struct btrfs_device *device)
6761 {
6762 unsigned long ptr;
6763
6764 device->devid = btrfs_device_id(leaf, dev_item);
6765 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6766 device->total_bytes = device->disk_total_bytes;
6767 device->commit_total_bytes = device->disk_total_bytes;
6768 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6769 device->commit_bytes_used = device->bytes_used;
6770 device->type = btrfs_device_type(leaf, dev_item);
6771 device->io_align = btrfs_device_io_align(leaf, dev_item);
6772 device->io_width = btrfs_device_io_width(leaf, dev_item);
6773 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6774 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6775 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6776
6777 ptr = btrfs_device_uuid(dev_item);
6778 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6779 }
6780
6781 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6782 u8 *fsid)
6783 {
6784 struct btrfs_fs_devices *fs_devices;
6785 int ret;
6786
6787 lockdep_assert_held(&uuid_mutex);
6788 ASSERT(fsid);
6789
6790 /* This will match only for multi-device seed fs */
6791 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6792 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6793 return fs_devices;
6794
6795
6796 fs_devices = find_fsid(fsid, NULL);
6797 if (!fs_devices) {
6798 if (!btrfs_test_opt(fs_info, DEGRADED))
6799 return ERR_PTR(-ENOENT);
6800
6801 fs_devices = alloc_fs_devices(fsid, NULL);
6802 if (IS_ERR(fs_devices))
6803 return fs_devices;
6804
6805 fs_devices->seeding = true;
6806 fs_devices->opened = 1;
6807 return fs_devices;
6808 }
6809
6810 /*
6811 * Upon first call for a seed fs fsid, just create a private copy of the
6812 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6813 */
6814 fs_devices = clone_fs_devices(fs_devices);
6815 if (IS_ERR(fs_devices))
6816 return fs_devices;
6817
6818 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6819 if (ret) {
6820 free_fs_devices(fs_devices);
6821 return ERR_PTR(ret);
6822 }
6823
6824 if (!fs_devices->seeding) {
6825 close_fs_devices(fs_devices);
6826 free_fs_devices(fs_devices);
6827 return ERR_PTR(-EINVAL);
6828 }
6829
6830 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6831
6832 return fs_devices;
6833 }
6834
6835 static int read_one_dev(struct extent_buffer *leaf,
6836 struct btrfs_dev_item *dev_item)
6837 {
6838 struct btrfs_fs_info *fs_info = leaf->fs_info;
6839 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6840 struct btrfs_device *device;
6841 u64 devid;
6842 int ret;
6843 u8 fs_uuid[BTRFS_FSID_SIZE];
6844 u8 dev_uuid[BTRFS_UUID_SIZE];
6845
6846 devid = btrfs_device_id(leaf, dev_item);
6847 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6848 BTRFS_UUID_SIZE);
6849 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6850 BTRFS_FSID_SIZE);
6851
6852 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6853 fs_devices = open_seed_devices(fs_info, fs_uuid);
6854 if (IS_ERR(fs_devices))
6855 return PTR_ERR(fs_devices);
6856 }
6857
6858 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6859 fs_uuid, true);
6860 if (!device) {
6861 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6862 btrfs_report_missing_device(fs_info, devid,
6863 dev_uuid, true);
6864 return -ENOENT;
6865 }
6866
6867 device = add_missing_dev(fs_devices, devid, dev_uuid);
6868 if (IS_ERR(device)) {
6869 btrfs_err(fs_info,
6870 "failed to add missing dev %llu: %ld",
6871 devid, PTR_ERR(device));
6872 return PTR_ERR(device);
6873 }
6874 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6875 } else {
6876 if (!device->bdev) {
6877 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6878 btrfs_report_missing_device(fs_info,
6879 devid, dev_uuid, true);
6880 return -ENOENT;
6881 }
6882 btrfs_report_missing_device(fs_info, devid,
6883 dev_uuid, false);
6884 }
6885
6886 if (!device->bdev &&
6887 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6888 /*
6889 * This happens when a device that was properly set up
6890 * in the device info lists suddenly goes bad.
6891 * device->bdev is NULL, and so we have to set the
6892 * BTRFS_DEV_STATE_MISSING bit here.
6893 */
6894 device->fs_devices->missing_devices++;
6895 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6896 }
6897
6898 /* Move the device to its own fs_devices */
6899 if (device->fs_devices != fs_devices) {
6900 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6901 &device->dev_state));
6902
6903 list_move(&device->dev_list, &fs_devices->devices);
6904 device->fs_devices->num_devices--;
6905 fs_devices->num_devices++;
6906
6907 device->fs_devices->missing_devices--;
6908 fs_devices->missing_devices++;
6909
6910 device->fs_devices = fs_devices;
6911 }
6912 }
6913
6914 if (device->fs_devices != fs_info->fs_devices) {
6915 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6916 if (device->generation !=
6917 btrfs_device_generation(leaf, dev_item))
6918 return -EINVAL;
6919 }
6920
6921 fill_device_from_item(leaf, dev_item, device);
6922 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6923 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6924 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6925 device->fs_devices->total_rw_bytes += device->total_bytes;
6926 atomic64_add(device->total_bytes - device->bytes_used,
6927 &fs_info->free_chunk_space);
6928 }
6929 ret = 0;
6930 return ret;
6931 }
6932
6933 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6934 {
6935 struct btrfs_root *root = fs_info->tree_root;
6936 struct btrfs_super_block *super_copy = fs_info->super_copy;
6937 struct extent_buffer *sb;
6938 struct btrfs_disk_key *disk_key;
6939 struct btrfs_chunk *chunk;
6940 u8 *array_ptr;
6941 unsigned long sb_array_offset;
6942 int ret = 0;
6943 u32 num_stripes;
6944 u32 array_size;
6945 u32 len = 0;
6946 u32 cur_offset;
6947 u64 type;
6948 struct btrfs_key key;
6949
6950 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6951 /*
6952 * This will create an extent buffer of nodesize; the superblock size is
6953 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6954 * overallocate, but we can keep it as-is since only the first page is used.
6955 */
6956 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6957 if (IS_ERR(sb))
6958 return PTR_ERR(sb);
6959 set_extent_buffer_uptodate(sb);
6960 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6961 /*
6962 * The sb extent buffer is artificial and just used to read the system array.
6963 * The set_extent_buffer_uptodate() call does not properly mark all its
6964 * pages up-to-date when the page is larger: extent does not cover the
6965 * whole page and consequently check_page_uptodate does not find all
6966 * the page's extents up-to-date (the hole beyond sb),
6967 * write_extent_buffer then triggers a WARN_ON.
6968 *
6969 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6970 * but sb spans only this function. Add an explicit SetPageUptodate call
6971 * to silence the warning eg. on PowerPC 64.
6972 */
6973 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6974 SetPageUptodate(sb->pages[0]);
6975
6976 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6977 array_size = btrfs_super_sys_array_size(super_copy);
6978
6979 array_ptr = super_copy->sys_chunk_array;
6980 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6981 cur_offset = 0;
6982
6983 while (cur_offset < array_size) {
6984 disk_key = (struct btrfs_disk_key *)array_ptr;
6985 len = sizeof(*disk_key);
6986 if (cur_offset + len > array_size)
6987 goto out_short_read;
6988
6989 btrfs_disk_key_to_cpu(&key, disk_key);
6990
6991 array_ptr += len;
6992 sb_array_offset += len;
6993 cur_offset += len;
6994
6995 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
6996 btrfs_err(fs_info,
6997 "unexpected item type %u in sys_array at offset %u",
6998 (u32)key.type, cur_offset);
6999 ret = -EIO;
7000 break;
7001 }
7002
7003 chunk = (struct btrfs_chunk *)sb_array_offset;
7004 /*
7005 * At least one btrfs_chunk with one stripe must be present,
7006 * exact stripe count check comes afterwards
7007 */
7008 len = btrfs_chunk_item_size(1);
7009 if (cur_offset + len > array_size)
7010 goto out_short_read;
7011
7012 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7013 if (!num_stripes) {
7014 btrfs_err(fs_info,
7015 "invalid number of stripes %u in sys_array at offset %u",
7016 num_stripes, cur_offset);
7017 ret = -EIO;
7018 break;
7019 }
7020
7021 type = btrfs_chunk_type(sb, chunk);
7022 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7023 btrfs_err(fs_info,
7024 "invalid chunk type %llu in sys_array at offset %u",
7025 type, cur_offset);
7026 ret = -EIO;
7027 break;
7028 }
7029
7030 len = btrfs_chunk_item_size(num_stripes);
7031 if (cur_offset + len > array_size)
7032 goto out_short_read;
7033
7034 ret = read_one_chunk(&key, sb, chunk);
7035 if (ret)
7036 break;
7037
7038 array_ptr += len;
7039 sb_array_offset += len;
7040 cur_offset += len;
7041 }
7042 clear_extent_buffer_uptodate(sb);
7043 free_extent_buffer_stale(sb);
7044 return ret;
7045
7046 out_short_read:
7047 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7048 len, cur_offset);
7049 clear_extent_buffer_uptodate(sb);
7050 free_extent_buffer_stale(sb);
7051 return -EIO;
7052 }
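/*
 * Layout sketch of the sys_chunk_array parsed above: a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk including its inline stripe
 * array) pairs, walked by cur_offset from 0 up to
 * btrfs_super_sys_array_size(super_copy):
 *
 *   | disk_key | chunk + stripes | disk_key | chunk + stripes | ...
 */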
7053
7054 /*
7055 * Check if all chunks in the fs are OK for read-write degraded mount
7056 *
7057 * If the @failing_dev is specified, it's accounted as missing.
7058 *
7059 * Return true if all chunks meet the minimal RW mount requirements.
7060 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7061 */
7062 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7063 struct btrfs_device *failing_dev)
7064 {
7065 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7066 struct extent_map *em;
7067 u64 next_start = 0;
7068 bool ret = true;
7069
7070 read_lock(&map_tree->lock);
7071 em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7072 read_unlock(&map_tree->lock);
7073 /* No chunk at all? Return false anyway */
7074 if (!em) {
7075 ret = false;
7076 goto out;
7077 }
7078 while (em) {
7079 struct map_lookup *map;
7080 int missing = 0;
7081 int max_tolerated;
7082 int i;
7083
7084 map = em->map_lookup;
7085 max_tolerated =
7086 btrfs_get_num_tolerated_disk_barrier_failures(
7087 map->type);
7088 for (i = 0; i < map->num_stripes; i++) {
7089 struct btrfs_device *dev = map->stripes[i].dev;
7090
7091 if (!dev || !dev->bdev ||
7092 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7093 dev->last_flush_error)
7094 missing++;
7095 else if (failing_dev && failing_dev == dev)
7096 missing++;
7097 }
7098 if (missing > max_tolerated) {
7099 if (!failing_dev)
7100 btrfs_warn(fs_info,
7101 "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7102 em->start, missing, max_tolerated);
7103 free_extent_map(em);
7104 ret = false;
7105 goto out;
7106 }
7107 next_start = extent_map_end(em);
7108 free_extent_map(em);
7109
7110 read_lock(&map_tree->lock);
7111 em = lookup_extent_mapping(map_tree, next_start,
7112 (u64)(-1) - next_start);
7113 read_unlock(&map_tree->lock);
7114 }
7115 out:
7116 return ret;
7117 }
7118
7119 static void readahead_tree_node_children(struct extent_buffer *node)
7120 {
7121 int i;
7122 const int nr_items = btrfs_header_nritems(node);
7123
7124 for (i = 0; i < nr_items; i++) {
7125 u64 start;
7126
7127 start = btrfs_node_blockptr(node, i);
7128 readahead_tree_block(node->fs_info, start);
7129 }
7130 }
7131
7132 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7133 {
7134 struct btrfs_root *root = fs_info->chunk_root;
7135 struct btrfs_path *path;
7136 struct extent_buffer *leaf;
7137 struct btrfs_key key;
7138 struct btrfs_key found_key;
7139 int ret;
7140 int slot;
7141 u64 total_dev = 0;
7142 u64 last_ra_node = 0;
7143
7144 path = btrfs_alloc_path();
7145 if (!path)
7146 return -ENOMEM;
7147
7148 /*
7149 * uuid_mutex is needed only if we are mounting a sprout FS,
7150 * otherwise we don't need it.
7151 */
7152 mutex_lock(&uuid_mutex);
7153
7154 /*
7155 * It is possible for mount and umount to race in such a way that
7156 * we execute this code path, but open_fs_devices failed to clear
7157 * total_rw_bytes. We certainly want it cleared before reading the
7158 * device items, so clear it here.
7159 */
7160 fs_info->fs_devices->total_rw_bytes = 0;
7161
7162 /*
7163 * Read all device items, and then all the chunk items. All
7164 * device items are found before any chunk item (their object id
7165 * is smaller than the lowest possible object id for a chunk
7166 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7167 */
7168 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7169 key.offset = 0;
7170 key.type = 0;
7171 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7172 if (ret < 0)
7173 goto error;
7174 while (1) {
7175 struct extent_buffer *node;
7176
7177 leaf = path->nodes[0];
7178 slot = path->slots[0];
7179 if (slot >= btrfs_header_nritems(leaf)) {
7180 ret = btrfs_next_leaf(root, path);
7181 if (ret == 0)
7182 continue;
7183 if (ret < 0)
7184 goto error;
7185 break;
7186 }
7187 /*
7188 * The nodes on level 1 are not locked, but we don't need locking
7189 * during mount time as nothing else can access the tree.
7190 */
7191 node = path->nodes[1];
7192 if (node) {
7193 if (last_ra_node != node->start) {
7194 readahead_tree_node_children(node);
7195 last_ra_node = node->start;
7196 }
7197 }
7198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7199 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7200 struct btrfs_dev_item *dev_item;
7201 dev_item = btrfs_item_ptr(leaf, slot,
7202 struct btrfs_dev_item);
7203 ret = read_one_dev(leaf, dev_item);
7204 if (ret)
7205 goto error;
7206 total_dev++;
7207 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7208 struct btrfs_chunk *chunk;
7209 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7210 mutex_lock(&fs_info->chunk_mutex);
7211 ret = read_one_chunk(&found_key, leaf, chunk);
7212 mutex_unlock(&fs_info->chunk_mutex);
7213 if (ret)
7214 goto error;
7215 }
7216 path->slots[0]++;
7217 }
7218
7219 	/*
7220 	 * After loading the chunk tree we have all device information,
7221 	 * so do another round of validation checks.
7222 	 */
7223 if (total_dev != fs_info->fs_devices->total_devices) {
7224 btrfs_warn(fs_info,
7225 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7226 btrfs_super_num_devices(fs_info->super_copy),
7227 total_dev);
7228 fs_info->fs_devices->total_devices = total_dev;
7229 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7230 }
7231 if (btrfs_super_total_bytes(fs_info->super_copy) <
7232 fs_info->fs_devices->total_rw_bytes) {
7233 btrfs_err(fs_info,
7234 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7235 btrfs_super_total_bytes(fs_info->super_copy),
7236 fs_info->fs_devices->total_rw_bytes);
7237 ret = -EINVAL;
7238 goto error;
7239 }
7240 ret = 0;
7241 error:
7242 mutex_unlock(&uuid_mutex);
7243
7244 btrfs_free_path(path);
7245 return ret;
7246 }
7247
7248 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7249 {
7250 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7251 struct btrfs_device *device;
7252
7253 fs_devices->fs_info = fs_info;
7254
7255 mutex_lock(&fs_devices->device_list_mutex);
7256 list_for_each_entry(device, &fs_devices->devices, dev_list)
7257 device->fs_info = fs_info;
7258
7259 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7260 list_for_each_entry(device, &seed_devs->devices, dev_list)
7261 device->fs_info = fs_info;
7262
7263 seed_devs->fs_info = fs_info;
7264 }
7265 mutex_unlock(&fs_devices->device_list_mutex);
7266 }
7267
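/*
 * On disk a btrfs_dev_stats_item is essentially an array of __le64 counters.
 * The two accessors below compute the byte offset of counter @index relative
 * to the item start, i.e. roughly:
 *
 *	offsetof(struct btrfs_dev_stats_item, values) + index * sizeof(u64)
 *
 * and read or write that single value through the extent buffer helpers.
 */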
7268 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7269 const struct btrfs_dev_stats_item *ptr,
7270 int index)
7271 {
7272 u64 val;
7273
7274 read_extent_buffer(eb, &val,
7275 offsetof(struct btrfs_dev_stats_item, values) +
7276 ((unsigned long)ptr) + (index * sizeof(u64)),
7277 sizeof(val));
7278 return val;
7279 }
7280
7281 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7282 struct btrfs_dev_stats_item *ptr,
7283 int index, u64 val)
7284 {
7285 write_extent_buffer(eb, &val,
7286 offsetof(struct btrfs_dev_stats_item, values) +
7287 ((unsigned long)ptr) + (index * sizeof(u64)),
7288 sizeof(val));
7289 }
7290
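/*
 * Load the persistent statistics of @device from the device tree.  If no
 * dev_stats item exists yet (e.g. for a freshly added device) the in-memory
 * counters are zeroed but still marked valid, so later increments get
 * persisted.  An item shorter than the current set of counters (e.g. written
 * by an older kernel) is accepted and the missing trailing counters default
 * to zero.
 */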
7291 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7292 struct btrfs_path *path)
7293 {
7294 struct btrfs_dev_stats_item *ptr;
7295 struct extent_buffer *eb;
7296 struct btrfs_key key;
7297 int item_size;
7298 int i, ret, slot;
7299
7300 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7301 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7302 key.offset = device->devid;
7303 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7304 if (ret) {
7305 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7306 btrfs_dev_stat_set(device, i, 0);
7307 device->dev_stats_valid = 1;
7308 btrfs_release_path(path);
7309 return ret < 0 ? ret : 0;
7310 }
7311 slot = path->slots[0];
7312 eb = path->nodes[0];
7313 item_size = btrfs_item_size_nr(eb, slot);
7314
7315 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7316
7317 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7318 if (item_size >= (1 + i) * sizeof(__le64))
7319 btrfs_dev_stat_set(device, i,
7320 btrfs_dev_stats_value(eb, ptr, i));
7321 else
7322 btrfs_dev_stat_set(device, i, 0);
7323 }
7324
7325 device->dev_stats_valid = 1;
7326 btrfs_dev_stat_print_on_load(device);
7327 btrfs_release_path(path);
7328
7329 return 0;
7330 }
7331
7332 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7333 {
7334 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7335 struct btrfs_device *device;
7336 struct btrfs_path *path = NULL;
7337 int ret = 0;
7338
7339 path = btrfs_alloc_path();
7340 if (!path)
7341 return -ENOMEM;
7342
7343 mutex_lock(&fs_devices->device_list_mutex);
7344 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7345 ret = btrfs_device_init_dev_stats(device, path);
7346 if (ret)
7347 goto out;
7348 }
7349 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7350 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7351 ret = btrfs_device_init_dev_stats(device, path);
7352 if (ret)
7353 goto out;
7354 }
7355 }
7356 out:
7357 mutex_unlock(&fs_devices->device_list_mutex);
7358
7359 btrfs_free_path(path);
7360 return ret;
7361 }
7362
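/*
 * Write the in-memory counters of @device into its dev_stats item.  Three
 * cases are handled: the item exists at the expected size and is updated in
 * place; the item exists but is too small (e.g. written by an older kernel)
 * and gets deleted and re-inserted at full size; or no item exists yet and a
 * new one is inserted.
 */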
7363 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7364 struct btrfs_device *device)
7365 {
7366 struct btrfs_fs_info *fs_info = trans->fs_info;
7367 struct btrfs_root *dev_root = fs_info->dev_root;
7368 struct btrfs_path *path;
7369 struct btrfs_key key;
7370 struct extent_buffer *eb;
7371 struct btrfs_dev_stats_item *ptr;
7372 int ret;
7373 int i;
7374
7375 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7376 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7377 key.offset = device->devid;
7378
7379 path = btrfs_alloc_path();
7380 if (!path)
7381 return -ENOMEM;
7382 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7383 if (ret < 0) {
7384 btrfs_warn_in_rcu(fs_info,
7385 "error %d while searching for dev_stats item for device %s",
7386 ret, rcu_str_deref(device->name));
7387 goto out;
7388 }
7389
7390 if (ret == 0 &&
7391 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7392 /* need to delete old one and insert a new one */
7393 ret = btrfs_del_item(trans, dev_root, path);
7394 if (ret != 0) {
7395 btrfs_warn_in_rcu(fs_info,
7396 "delete too small dev_stats item for device %s failed %d",
7397 rcu_str_deref(device->name), ret);
7398 goto out;
7399 }
7400 ret = 1;
7401 }
7402
7403 if (ret == 1) {
7404 /* need to insert a new item */
7405 btrfs_release_path(path);
7406 ret = btrfs_insert_empty_item(trans, dev_root, path,
7407 &key, sizeof(*ptr));
7408 if (ret < 0) {
7409 btrfs_warn_in_rcu(fs_info,
7410 "insert dev_stats item for device %s failed %d",
7411 rcu_str_deref(device->name), ret);
7412 goto out;
7413 }
7414 }
7415
7416 eb = path->nodes[0];
7417 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7418 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7419 btrfs_set_dev_stats_value(eb, ptr, i,
7420 btrfs_dev_stat_read(device, i));
7421 btrfs_mark_buffer_dirty(eb);
7422
7423 out:
7424 btrfs_free_path(path);
7425 return ret;
7426 }
7427
7428 /*
7429  * Called from commit_transaction(). Writes all changed device stats to disk.
7430 */
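/*
 * dev_stats_ccnt counts in-memory stat updates since the last successful
 * write-out.  Only the count sampled before updating the item is subtracted
 * afterwards, so increments that race in while the item is being written are
 * kept for the next transaction commit instead of being lost.
 */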
7431 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7432 {
7433 struct btrfs_fs_info *fs_info = trans->fs_info;
7434 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7435 struct btrfs_device *device;
7436 int stats_cnt;
7437 int ret = 0;
7438
7439 mutex_lock(&fs_devices->device_list_mutex);
7440 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7441 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7442 if (!device->dev_stats_valid || stats_cnt == 0)
7443 continue;
7444
7445
7446 /*
7447 * There is a LOAD-LOAD control dependency between the value of
7448 * dev_stats_ccnt and updating the on-disk values which requires
7449 * reading the in-memory counters. Such control dependencies
7450 * require explicit read memory barriers.
7451 *
7452 		 * This memory barrier pairs with smp_mb__before_atomic in
7453 		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7454 		 * barrier implied by atomic_xchg in
7455 		 * btrfs_dev_stats_read_and_reset.
7456 */
7457 smp_rmb();
7458
7459 ret = update_dev_stat_item(trans, device);
7460 if (!ret)
7461 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7462 }
7463 mutex_unlock(&fs_devices->device_list_mutex);
7464
7465 return ret;
7466 }
7467
7468 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7469 {
7470 btrfs_dev_stat_inc(dev, index);
7471 btrfs_dev_stat_print_on_error(dev);
7472 }
7473
7474 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7475 {
7476 if (!dev->dev_stats_valid)
7477 return;
7478 btrfs_err_rl_in_rcu(dev->fs_info,
7479 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7480 rcu_str_deref(dev->name),
7481 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7482 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7483 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7484 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7485 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7486 }
7487
7488 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7489 {
7490 int i;
7491
7492 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7493 if (btrfs_dev_stat_read(dev, i) != 0)
7494 break;
7495 if (i == BTRFS_DEV_STAT_VALUES_MAX)
7496 return; /* all values == 0, suppress message */
7497
7498 btrfs_info_in_rcu(dev->fs_info,
7499 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7500 rcu_str_deref(dev->name),
7501 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7502 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7503 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7504 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7505 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7506 }
7507
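/*
 * Fill @stats on behalf of the device stats ioctl.  A minimal sketch of a
 * kernel-side caller (hypothetical, for illustration only):
 *
 *	struct btrfs_ioctl_get_dev_stats s = {
 *		.devid = devid,
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
 *	};
 *	ret = btrfs_get_dev_stats(fs_info, &s);
 *	// on success s.values[BTRFS_DEV_STAT_WRITE_ERRS] etc. hold the counters
 *
 * With BTRFS_DEV_STATS_RESET set in @stats->flags the counters are read and
 * reset in one pass; nr_items is clamped to BTRFS_DEV_STAT_VALUES_MAX.
 */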
7508 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7509 struct btrfs_ioctl_get_dev_stats *stats)
7510 {
7511 struct btrfs_device *dev;
7512 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7513 int i;
7514
7515 mutex_lock(&fs_devices->device_list_mutex);
7516 dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7517 true);
7518 mutex_unlock(&fs_devices->device_list_mutex);
7519
7520 if (!dev) {
7521 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7522 return -ENODEV;
7523 } else if (!dev->dev_stats_valid) {
7524 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7525 return -ENODEV;
7526 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7527 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7528 if (stats->nr_items > i)
7529 stats->values[i] =
7530 btrfs_dev_stat_read_and_reset(dev, i);
7531 else
7532 btrfs_dev_stat_set(dev, i, 0);
7533 }
7534 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7535 current->comm, task_pid_nr(current));
7536 } else {
7537 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7538 if (stats->nr_items > i)
7539 stats->values[i] = btrfs_dev_stat_read(dev, i);
7540 }
7541 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7542 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7543 return 0;
7544 }
7545
7546 /*
7547 * Update the size and bytes used for each device where it changed. This is
7548 * delayed since we would otherwise get errors while writing out the
7549 * superblocks.
7550 *
7551 * Must be invoked during transaction commit.
7552 */
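/*
 * Concretely, every device on the transaction's dev_update_list has its
 * commit_total_bytes/commit_bytes_used snapshot refreshed from the current
 * disk_total_bytes/bytes_used under chunk_mutex, so the superblock write-out
 * that follows sees one consistent set of device sizes.
 */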
7553 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7554 {
7555 struct btrfs_device *curr, *next;
7556
7557 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7558
7559 if (list_empty(&trans->dev_update_list))
7560 return;
7561
7562 /*
7563 * We don't need the device_list_mutex here. This list is owned by the
7564 * transaction and the transaction must complete before the device is
7565 * released.
7566 */
7567 mutex_lock(&trans->fs_info->chunk_mutex);
7568 list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7569 post_commit_list) {
7570 list_del_init(&curr->post_commit_list);
7571 curr->commit_total_bytes = curr->disk_total_bytes;
7572 curr->commit_bytes_used = curr->bytes_used;
7573 }
7574 mutex_unlock(&trans->fs_info->chunk_mutex);
7575 }
7576
7577 /*
7578 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7579 */
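/*
 * As an illustration (usual btrfs semantics, not tied to a particular
 * filesystem): a profile keeping two full copies of the data returns 2, so a
 * block group's raw footprint is roughly twice its logical size, while
 * single and striping-only profiles return 1.
 */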
7580 int btrfs_bg_type_to_factor(u64 flags)
7581 {
7582 const int index = btrfs_bg_flags_to_raid_index(flags);
7583
7584 return btrfs_raid_array[index].ncopies;
7585 }
7586
7587
7588
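/*
 * Cross-check a single dev extent against the chunk mapping: the referenced
 * chunk must exist, the extent length must equal the chunk's per-device
 * stripe length, the (devid, physical offset) pair must match exactly one
 * stripe of that chunk, and the extent must lie entirely within the device.
 */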
7589 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7590 u64 chunk_offset, u64 devid,
7591 u64 physical_offset, u64 physical_len)
7592 {
7593 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7594 struct extent_map *em;
7595 struct map_lookup *map;
7596 struct btrfs_device *dev;
7597 u64 stripe_len;
7598 bool found = false;
7599 int ret = 0;
7600 int i;
7601
7602 read_lock(&em_tree->lock);
7603 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7604 read_unlock(&em_tree->lock);
7605
7606 if (!em) {
7607 btrfs_err(fs_info,
7608 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7609 physical_offset, devid);
7610 ret = -EUCLEAN;
7611 goto out;
7612 }
7613
7614 map = em->map_lookup;
7615 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7616 if (physical_len != stripe_len) {
7617 btrfs_err(fs_info,
7618 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7619 physical_offset, devid, em->start, physical_len,
7620 stripe_len);
7621 ret = -EUCLEAN;
7622 goto out;
7623 }
7624
7625 for (i = 0; i < map->num_stripes; i++) {
7626 if (map->stripes[i].dev->devid == devid &&
7627 map->stripes[i].physical == physical_offset) {
7628 found = true;
7629 if (map->verified_stripes >= map->num_stripes) {
7630 btrfs_err(fs_info,
7631 "too many dev extents for chunk %llu found",
7632 em->start);
7633 ret = -EUCLEAN;
7634 goto out;
7635 }
7636 map->verified_stripes++;
7637 break;
7638 }
7639 }
7640 if (!found) {
7641 btrfs_err(fs_info,
7642 "dev extent physical offset %llu devid %llu has no corresponding chunk",
7643 physical_offset, devid);
7644 ret = -EUCLEAN;
7645 }
7646
7647 	/* Make sure no dev extent is beyond device boundary */
7648 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7649 if (!dev) {
7650 btrfs_err(fs_info, "failed to find devid %llu", devid);
7651 ret = -EUCLEAN;
7652 goto out;
7653 }
7654
7655 	/* It's possible this device is a dummy for a seed device */
7656 if (dev->disk_total_bytes == 0) {
7657 struct btrfs_fs_devices *devs;
7658
7659 devs = list_first_entry(&fs_info->fs_devices->seed_list,
7660 struct btrfs_fs_devices, seed_list);
7661 dev = btrfs_find_device(devs, devid, NULL, NULL, false);
7662 if (!dev) {
7663 btrfs_err(fs_info, "failed to find seed devid %llu",
7664 devid);
7665 ret = -EUCLEAN;
7666 goto out;
7667 }
7668 }
7669
7670 if (physical_offset + physical_len > dev->disk_total_bytes) {
7671 btrfs_err(fs_info,
7672 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7673 devid, physical_offset, physical_len,
7674 dev->disk_total_bytes);
7675 ret = -EUCLEAN;
7676 goto out;
7677 }
7678 out:
7679 free_extent_map(em);
7680 return ret;
7681 }
7682
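/*
 * Walk every chunk mapping and require that the number of dev extents found
 * for it (verified_stripes, counted in verify_one_dev_extent()) equals
 * num_stripes, which catches chunks that lost one or more dev extents.
 */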
7683 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7684 {
7685 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7686 struct extent_map *em;
7687 struct rb_node *node;
7688 int ret = 0;
7689
7690 read_lock(&em_tree->lock);
7691 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7692 em = rb_entry(node, struct extent_map, rb_node);
7693 if (em->map_lookup->num_stripes !=
7694 em->map_lookup->verified_stripes) {
7695 btrfs_err(fs_info,
7696 "chunk %llu has missing dev extent, have %d expect %d",
7697 em->start, em->map_lookup->verified_stripes,
7698 em->map_lookup->num_stripes);
7699 ret = -EUCLEAN;
7700 goto out;
7701 }
7702 }
7703 out:
7704 read_unlock(&em_tree->lock);
7705 return ret;
7706 }
7707
7708 /*
7709  * Ensure that all dev extents are mapped to the correct chunk, otherwise
7710 * later chunk allocation/free would cause unexpected behavior.
7711 *
7712  * NOTE: This will iterate through the whole device tree, which should be
7713  * roughly the same size as the chunk tree. This slightly increases mount time.
7714 */
7715 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7716 {
7717 struct btrfs_path *path;
7718 struct btrfs_root *root = fs_info->dev_root;
7719 struct btrfs_key key;
7720 u64 prev_devid = 0;
7721 u64 prev_dev_ext_end = 0;
7722 int ret = 0;
7723
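	/*
	 * Dev extent keys are (devid, BTRFS_DEV_EXTENT_KEY, physical offset)
	 * and device ids start at 1, so the key below positions the search at
	 * or before the first possible dev extent in the device tree.
	 */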
7724 key.objectid = 1;
7725 key.type = BTRFS_DEV_EXTENT_KEY;
7726 key.offset = 0;
7727
7728 path = btrfs_alloc_path();
7729 if (!path)
7730 return -ENOMEM;
7731
7732 path->reada = READA_FORWARD;
7733 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7734 if (ret < 0)
7735 goto out;
7736
7737 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7738 ret = btrfs_next_item(root, path);
7739 if (ret < 0)
7740 goto out;
7741 /* No dev extents at all? Not good */
7742 if (ret > 0) {
7743 ret = -EUCLEAN;
7744 goto out;
7745 }
7746 }
7747 while (1) {
7748 struct extent_buffer *leaf = path->nodes[0];
7749 struct btrfs_dev_extent *dext;
7750 int slot = path->slots[0];
7751 u64 chunk_offset;
7752 u64 physical_offset;
7753 u64 physical_len;
7754 u64 devid;
7755
7756 btrfs_item_key_to_cpu(leaf, &key, slot);
7757 if (key.type != BTRFS_DEV_EXTENT_KEY)
7758 break;
7759 devid = key.objectid;
7760 physical_offset = key.offset;
7761
7762 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7763 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7764 physical_len = btrfs_dev_extent_length(leaf, dext);
7765
7766 /* Check if this dev extent overlaps with the previous one */
7767 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7768 btrfs_err(fs_info,
7769 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7770 devid, physical_offset, prev_dev_ext_end);
7771 ret = -EUCLEAN;
7772 goto out;
7773 }
7774
7775 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7776 physical_offset, physical_len);
7777 if (ret < 0)
7778 goto out;
7779 prev_devid = devid;
7780 prev_dev_ext_end = physical_offset + physical_len;
7781
7782 ret = btrfs_next_item(root, path);
7783 if (ret < 0)
7784 goto out;
7785 if (ret > 0) {
7786 ret = 0;
7787 break;
7788 }
7789 }
7790
7791 /* Ensure all chunks have corresponding dev extents */
7792 ret = verify_chunk_dev_extent_mapping(fs_info);
7793 out:
7794 btrfs_free_path(path);
7795 return ret;
7796 }
7797
7798 /*
7799 * Check whether the given block group or device is pinned by any inode being
7800 * used as a swapfile.
7801 */
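/*
 * The swapfile_pins tree is keyed by the raw pointer value of the pinned
 * object, so this is a plain pointer lookup under swapfile_pins_lock.
 * Callers, for example the relocation path, use a true result to refuse
 * operations that would move or drop extents backing an active swapfile.
 */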
7802 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7803 {
7804 struct btrfs_swapfile_pin *sp;
7805 struct rb_node *node;
7806
7807 spin_lock(&fs_info->swapfile_pins_lock);
7808 node = fs_info->swapfile_pins.rb_node;
7809 while (node) {
7810 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7811 if (ptr < sp->ptr)
7812 node = node->rb_left;
7813 else if (ptr > sp->ptr)
7814 node = node->rb_right;
7815 else
7816 break;
7817 }
7818 spin_unlock(&fs_info->swapfile_pins_lock);
7819 return node != NULL;
7820 }
7821