1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/sched.h>
7 #include <linux/sched/mm.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/ratelimit.h>
12 #include <linux/kthread.h>
13 #include <linux/raid/pq.h>
14 #include <linux/semaphore.h>
15 #include <linux/uuid.h>
16 #include <linux/list_sort.h>
17 #include <linux/namei.h>
18 #include "misc.h"
19 #include "ctree.h"
20 #include "extent_map.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "async-thread.h"
27 #include "check-integrity.h"
28 #include "rcu-string.h"
29 #include "dev-replace.h"
30 #include "sysfs.h"
31 #include "tree-checker.h"
32 #include "space-info.h"
33 #include "block-group.h"
34 #include "discard.h"
35
36 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
37 [BTRFS_RAID_RAID10] = {
38 .sub_stripes = 2,
39 .dev_stripes = 1,
40 .devs_max = 0, /* 0 == as many as possible */
41 .devs_min = 4,
42 .tolerated_failures = 1,
43 .devs_increment = 2,
44 .ncopies = 2,
45 .nparity = 0,
46 .raid_name = "raid10",
47 .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
48 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
49 },
50 [BTRFS_RAID_RAID1] = {
51 .sub_stripes = 1,
52 .dev_stripes = 1,
53 .devs_max = 2,
54 .devs_min = 2,
55 .tolerated_failures = 1,
56 .devs_increment = 2,
57 .ncopies = 2,
58 .nparity = 0,
59 .raid_name = "raid1",
60 .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
61 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
62 },
63 [BTRFS_RAID_RAID1C3] = {
64 .sub_stripes = 1,
65 .dev_stripes = 1,
66 .devs_max = 3,
67 .devs_min = 3,
68 .tolerated_failures = 2,
69 .devs_increment = 3,
70 .ncopies = 3,
71 .nparity = 0,
72 .raid_name = "raid1c3",
73 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
74 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
75 },
76 [BTRFS_RAID_RAID1C4] = {
77 .sub_stripes = 1,
78 .dev_stripes = 1,
79 .devs_max = 4,
80 .devs_min = 4,
81 .tolerated_failures = 3,
82 .devs_increment = 4,
83 .ncopies = 4,
84 .nparity = 0,
85 .raid_name = "raid1c4",
86 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
87 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
88 },
89 [BTRFS_RAID_DUP] = {
90 .sub_stripes = 1,
91 .dev_stripes = 2,
92 .devs_max = 1,
93 .devs_min = 1,
94 .tolerated_failures = 0,
95 .devs_increment = 1,
96 .ncopies = 2,
97 .nparity = 0,
98 .raid_name = "dup",
99 .bg_flag = BTRFS_BLOCK_GROUP_DUP,
100 .mindev_error = 0,
101 },
102 [BTRFS_RAID_RAID0] = {
103 .sub_stripes = 1,
104 .dev_stripes = 1,
105 .devs_max = 0,
106 .devs_min = 2,
107 .tolerated_failures = 0,
108 .devs_increment = 1,
109 .ncopies = 1,
110 .nparity = 0,
111 .raid_name = "raid0",
112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
113 .mindev_error = 0,
114 },
115 [BTRFS_RAID_SINGLE] = {
116 .sub_stripes = 1,
117 .dev_stripes = 1,
118 .devs_max = 1,
119 .devs_min = 1,
120 .tolerated_failures = 0,
121 .devs_increment = 1,
122 .ncopies = 1,
123 .nparity = 0,
124 .raid_name = "single",
125 .bg_flag = 0,
126 .mindev_error = 0,
127 },
128 [BTRFS_RAID_RAID5] = {
129 .sub_stripes = 1,
130 .dev_stripes = 1,
131 .devs_max = 0,
132 .devs_min = 2,
133 .tolerated_failures = 1,
134 .devs_increment = 1,
135 .ncopies = 1,
136 .nparity = 1,
137 .raid_name = "raid5",
138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
140 },
141 [BTRFS_RAID_RAID6] = {
142 .sub_stripes = 1,
143 .dev_stripes = 1,
144 .devs_max = 0,
145 .devs_min = 3,
146 .tolerated_failures = 2,
147 .devs_increment = 1,
148 .ncopies = 1,
149 .nparity = 2,
150 .raid_name = "raid6",
151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
153 },
154 };
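/*
 * Illustrative reading of the table above (not how the code consumes it):
 * raid6 carries nparity = 2 parity stripes, needs at least devs_min = 3
 * devices and survives tolerated_failures = 2 lost devices, while dup keeps
 * both of its ncopies = 2 copies on a single device (dev_stripes = 2,
 * devs_max = 1) and therefore tolerates no device failure at all.
 */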
155
156 const char *btrfs_bg_type_to_raid_name(u64 flags)
157 {
158 const int index = btrfs_bg_flags_to_raid_index(flags);
159
160 if (index >= BTRFS_NR_RAID_TYPES)
161 return NULL;
162
163 return btrfs_raid_array[index].raid_name;
164 }
165
166 /*
167 * Fill @buf with textual description of @bg_flags, no more than @size_buf
168 * bytes including terminating null byte.
169 */
170 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
171 {
172 int i;
173 int ret;
174 char *bp = buf;
175 u64 flags = bg_flags;
176 u32 size_bp = size_buf;
177
178 if (!flags) {
179 strcpy(bp, "NONE");
180 return;
181 }
182
183 #define DESCRIBE_FLAG(flag, desc) \
184 do { \
185 if (flags & (flag)) { \
186 ret = snprintf(bp, size_bp, "%s|", (desc)); \
187 if (ret < 0 || ret >= size_bp) \
188 goto out_overflow; \
189 size_bp -= ret; \
190 bp += ret; \
191 flags &= ~(flag); \
192 } \
193 } while (0)
194
195 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
196 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
197 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
198
199 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
200 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
201 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
202 btrfs_raid_array[i].raid_name);
203 #undef DESCRIBE_FLAG
204
205 if (flags) {
206 ret = snprintf(bp, size_bp, "0x%llx|", flags);
207 size_bp -= ret;
208 }
209
210 if (size_bp < size_buf)
211 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
212
213 /*
214 * The text is trimmed, it's up to the caller to provide a sufficiently
215 * large buffer
216 */
217 out_overflow:;
218 }
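/*
 * Example (illustrative only): for bg_flags of BTRFS_BLOCK_GROUP_METADATA |
 * BTRFS_BLOCK_GROUP_RAID1 and a large enough buffer this produces
 * "metadata|raid1"; any remaining unknown bits are appended in hex, and an
 * undersized buffer simply gets a truncated description.
 */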
219
220 static int init_first_rw_device(struct btrfs_trans_handle *trans);
221 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
222 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
223 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
224 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
225 enum btrfs_map_op op,
226 u64 logical, u64 *length,
227 struct btrfs_bio **bbio_ret,
228 int mirror_num, int need_raid_map);
229
230 /*
231 * Device locking
232 * ==============
233 *
234 * There are several mutexes that protect manipulation of devices and low-level
235 * structures like chunks but not block groups, extents or files
236 *
237 * uuid_mutex (global lock)
238 * ------------------------
239 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
240 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
241 * device) or requested by the device= mount option
242 *
243 * the mutex can be very coarse and can cover long-running operations
244 *
245 * protects: updates to fs_devices counters like missing devices, rw devices,
246 * seeding, structure cloning, opening/closing devices at mount/umount time
247 *
248 * global::fs_devs - add, remove, updates to the global list
249 *
250 * does not protect: manipulation of the fs_devices::devices list in general
251 * but in mount context it could be used to exclude list modifications by eg.
252 * scan ioctl
253 *
254 * btrfs_device::name - renames (write side), read is RCU
255 *
256 * fs_devices::device_list_mutex (per-fs, with RCU)
257 * ------------------------------------------------
258 * protects updates to fs_devices::devices, ie. adding and deleting
259 *
260 * simple list traversal with read-only actions can be done with RCU protection
261 *
262 * may be used to exclude some operations from running concurrently without any
263 * modifications to the list (see write_all_supers)
264 *
265 * Is not required at mount and close times, because our device list is
266 * protected by the uuid_mutex at that point.
267 *
268 * balance_mutex
269 * -------------
270 * protects balance structures (status, state) and context accessed from
271 * several places (internally, ioctl)
272 *
273 * chunk_mutex
274 * -----------
275 * protects chunks, adding or removing during allocation, trim or when a new
276 * device is added/removed. Additionally it also protects post_commit_list of
277 * individual devices, since they can be added to the transaction's
278 * post_commit_list only with chunk_mutex held.
279 *
280 * cleaner_mutex
281 * -------------
282 * a big lock that is held by the cleaner thread and prevents running subvolume
283 * cleaning together with relocation or delayed iputs
284 *
285 *
286 * Lock nesting
287 * ============
288 *
289 * uuid_mutex
290 * device_list_mutex
291 * chunk_mutex
292 * balance_mutex
293 *
294 *
295 * Exclusive operations
296 * ====================
297 *
298 * Maintains the exclusivity of the following operations that apply to the
299 * whole filesystem and cannot run in parallel.
300 *
301 * - Balance (*)
302 * - Device add
303 * - Device remove
304 * - Device replace (*)
305 * - Resize
306 *
307 * The device operations (as above) can be in one of the following states:
308 *
309 * - Running state
310 * - Paused state
311 * - Completed state
312 *
313 * Only device operations marked with (*) can go into the Paused state for the
314 * following reasons:
315 *
316 * - ioctl (only Balance can be Paused through ioctl)
317 * - filesystem remounted as read-only
318 * - filesystem unmounted and mounted as read-only
319 * - system power-cycle and filesystem mounted as read-only
320 * - filesystem or device errors leading to forced read-only
321 *
322 * The status of exclusive operation is set and cleared atomically.
323 * During the course of Paused state, fs_info::exclusive_operation remains set.
324 * A device operation in Paused or Running state can be canceled or resumed
325 * either by ioctl (Balance only) or when remounted as read-write.
326 * The exclusive status is cleared when the device operation is canceled or
327 * completed.
328 */
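/*
 * A minimal nesting sketch (illustrative, not a verbatim code path in this
 * file): code that needs both the per-fs device list and chunk protection
 * must follow the order documented above, e.g.
 *
 *   mutex_lock(&fs_devices->device_list_mutex);
 *   mutex_lock(&fs_info->chunk_mutex);
 *   ... modify devices and chunks ...
 *   mutex_unlock(&fs_info->chunk_mutex);
 *   mutex_unlock(&fs_devices->device_list_mutex);
 *
 * Acquiring them in the opposite order would invert the hierarchy and risk
 * deadlock.
 */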
329
330 DEFINE_MUTEX(uuid_mutex);
331 static LIST_HEAD(fs_uuids);
332 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
333 {
334 return &fs_uuids;
335 }
336
337 /*
338 * alloc_fs_devices - allocate struct btrfs_fs_devices
339 * @fsid: if not NULL, copy the UUID to fs_devices::fsid
340 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
341 *
342 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
343 * The returned struct is not linked onto any lists and can be destroyed with
344 * kfree() right away.
345 */
346 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
347 const u8 *metadata_fsid)
348 {
349 struct btrfs_fs_devices *fs_devs;
350
351 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
352 if (!fs_devs)
353 return ERR_PTR(-ENOMEM);
354
355 mutex_init(&fs_devs->device_list_mutex);
356
357 INIT_LIST_HEAD(&fs_devs->devices);
358 INIT_LIST_HEAD(&fs_devs->alloc_list);
359 INIT_LIST_HEAD(&fs_devs->fs_list);
360 INIT_LIST_HEAD(&fs_devs->seed_list);
361 if (fsid)
362 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
363
364 if (metadata_fsid)
365 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
366 else if (fsid)
367 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
368
369 return fs_devs;
370 }
371
372 void btrfs_free_device(struct btrfs_device *device)
373 {
374 WARN_ON(!list_empty(&device->post_commit_list));
375 rcu_string_free(device->name);
376 extent_io_tree_release(&device->alloc_state);
377 bio_put(device->flush_bio);
378 kfree(device);
379 }
380
381 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
382 {
383 struct btrfs_device *device;
384 WARN_ON(fs_devices->opened);
385 while (!list_empty(&fs_devices->devices)) {
386 device = list_entry(fs_devices->devices.next,
387 struct btrfs_device, dev_list);
388 list_del(&device->dev_list);
389 btrfs_free_device(device);
390 }
391 kfree(fs_devices);
392 }
393
394 void __exit btrfs_cleanup_fs_uuids(void)
395 {
396 struct btrfs_fs_devices *fs_devices;
397
398 while (!list_empty(&fs_uuids)) {
399 fs_devices = list_entry(fs_uuids.next,
400 struct btrfs_fs_devices, fs_list);
401 list_del(&fs_devices->fs_list);
402 free_fs_devices(fs_devices);
403 }
404 }
405
406 /*
407 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
408 * Returned struct is not linked onto any lists and must be destroyed using
409 * btrfs_free_device.
410 */
411 static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
412 {
413 struct btrfs_device *dev;
414
415 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
416 if (!dev)
417 return ERR_PTR(-ENOMEM);
418
419 /*
420 * Preallocate a bio that's always going to be used for flushing device
421 * barriers and matches the device lifespan
422 */
423 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
424 if (!dev->flush_bio) {
425 kfree(dev);
426 return ERR_PTR(-ENOMEM);
427 }
428
429 INIT_LIST_HEAD(&dev->dev_list);
430 INIT_LIST_HEAD(&dev->dev_alloc_list);
431 INIT_LIST_HEAD(&dev->post_commit_list);
432
433 atomic_set(&dev->reada_in_flight, 0);
434 atomic_set(&dev->dev_stats_ccnt, 0);
435 btrfs_device_data_ordered_init(dev);
436 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
437 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
438 extent_io_tree_init(fs_info, &dev->alloc_state,
439 IO_TREE_DEVICE_ALLOC_STATE, NULL);
440
441 return dev;
442 }
443
444 static noinline struct btrfs_fs_devices *find_fsid(
445 const u8 *fsid, const u8 *metadata_fsid)
446 {
447 struct btrfs_fs_devices *fs_devices;
448
449 ASSERT(fsid);
450
451 /* Handle non-split brain cases */
452 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
453 if (metadata_fsid) {
454 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
455 && memcmp(metadata_fsid, fs_devices->metadata_uuid,
456 BTRFS_FSID_SIZE) == 0)
457 return fs_devices;
458 } else {
459 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
460 return fs_devices;
461 }
462 }
463 return NULL;
464 }
465
466 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
467 struct btrfs_super_block *disk_super)
468 {
469
470 struct btrfs_fs_devices *fs_devices;
471
472 /*
473 * Handle scanned device having completed its fsid change but
474 * belonging to a fs_devices that was created by first scanning
475 * a device which didn't have its fsid/metadata_uuid changed
476 * at all and the CHANGING_FSID_V2 flag set.
477 */
478 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
479 if (fs_devices->fsid_change &&
480 memcmp(disk_super->metadata_uuid, fs_devices->fsid,
481 BTRFS_FSID_SIZE) == 0 &&
482 memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
483 BTRFS_FSID_SIZE) == 0) {
484 return fs_devices;
485 }
486 }
487 /*
488 * Handle scanned device having completed its fsid change but
489 * belonging to a fs_devices that was created by a device that
490 * has an outdated pair of fsid/metadata_uuid and
491 * CHANGING_FSID_V2 flag set.
492 */
493 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
494 if (fs_devices->fsid_change &&
495 memcmp(fs_devices->metadata_uuid,
496 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
497 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
498 BTRFS_FSID_SIZE) == 0) {
499 return fs_devices;
500 }
501 }
502
503 return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
504 }
505
506
507 static int
508 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
509 int flush, struct block_device **bdev,
510 struct btrfs_super_block **disk_super)
511 {
512 int ret;
513
514 *bdev = blkdev_get_by_path(device_path, flags, holder);
515
516 if (IS_ERR(*bdev)) {
517 ret = PTR_ERR(*bdev);
518 goto error;
519 }
520
521 if (flush)
522 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
523 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
524 if (ret) {
525 blkdev_put(*bdev, flags);
526 goto error;
527 }
528 invalidate_bdev(*bdev);
529 *disk_super = btrfs_read_dev_super(*bdev);
530 if (IS_ERR(*disk_super)) {
531 ret = PTR_ERR(*disk_super);
532 blkdev_put(*bdev, flags);
533 goto error;
534 }
535
536 return 0;
537
538 error:
539 *bdev = NULL;
540 return ret;
541 }
542
543 static bool device_path_matched(const char *path, struct btrfs_device *device)
544 {
545 int found;
546
547 rcu_read_lock();
548 found = strcmp(rcu_str_deref(device->name), path);
549 rcu_read_unlock();
550
551 return found == 0;
552 }
553
554 /*
555 * Search and remove all stale devices (devices which are not mounted).
556 * When both inputs are NULL, it will search and release all stale devices.
557 * path: Optional. When provided, it will release all unmounted devices
558 * matching this path only.
559 * skip_dev: Optional. Will skip this device when searching for the stale
560 * devices.
561 * Return: 0 for success or if @path is NULL.
562 * -EBUSY if @path is a mounted device.
563 * -ENOENT if @path does not match any device in the list.
564 */
565 static int btrfs_free_stale_devices(const char *path,
566 struct btrfs_device *skip_device)
567 {
568 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
569 struct btrfs_device *device, *tmp_device;
570 int ret = 0;
571
572 lockdep_assert_held(&uuid_mutex);
573
574 if (path)
575 ret = -ENOENT;
576
577 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
578
579 mutex_lock(&fs_devices->device_list_mutex);
580 list_for_each_entry_safe(device, tmp_device,
581 &fs_devices->devices, dev_list) {
582 if (skip_device && skip_device == device)
583 continue;
584 if (path && !device->name)
585 continue;
586 if (path && !device_path_matched(path, device))
587 continue;
588 if (fs_devices->opened) {
589 /* for an already deleted device return 0 */
590 if (path && ret != 0)
591 ret = -EBUSY;
592 break;
593 }
594
595 /* delete the stale device */
596 fs_devices->num_devices--;
597 list_del(&device->dev_list);
598 btrfs_free_device(device);
599
600 ret = 0;
601 }
602 mutex_unlock(&fs_devices->device_list_mutex);
603
604 if (fs_devices->num_devices == 0) {
605 btrfs_sysfs_remove_fsid(fs_devices);
606 list_del(&fs_devices->fs_list);
607 free_fs_devices(fs_devices);
608 }
609 }
610
611 return ret;
612 }
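/*
 * The callers in this section are btrfs_forget_devices(), which passes a
 * user supplied path (or NULL to drop every unmounted device), and
 * btrfs_scan_one_device(), which passes the freshly registered device as
 * @skip_device so that only older records for the same path get released.
 */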
613
614 /*
615 * This is only used on mount, and we are protected from competing things
616 * messing with our fs_devices by the uuid_mutex, thus we do not need the
617 * fs_devices->device_list_mutex here.
618 */
619 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
620 struct btrfs_device *device, fmode_t flags,
621 void *holder)
622 {
623 struct request_queue *q;
624 struct block_device *bdev;
625 struct btrfs_super_block *disk_super;
626 u64 devid;
627 int ret;
628
629 if (device->bdev)
630 return -EINVAL;
631 if (!device->name)
632 return -EINVAL;
633
634 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
635 &bdev, &disk_super);
636 if (ret)
637 return ret;
638
639 devid = btrfs_stack_device_id(&disk_super->dev_item);
640 if (devid != device->devid)
641 goto error_free_page;
642
643 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
644 goto error_free_page;
645
646 device->generation = btrfs_super_generation(disk_super);
647
648 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
649 if (btrfs_super_incompat_flags(disk_super) &
650 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
651 pr_err(
652 "BTRFS: Invalid seeding and uuid-changed device detected\n");
653 goto error_free_page;
654 }
655
656 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
657 fs_devices->seeding = true;
658 } else {
659 if (bdev_read_only(bdev))
660 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
661 else
662 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
663 }
664
665 q = bdev_get_queue(bdev);
666 if (!blk_queue_nonrot(q))
667 fs_devices->rotating = true;
668
669 device->bdev = bdev;
670 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
671 device->mode = flags;
672
673 fs_devices->open_devices++;
674 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
675 device->devid != BTRFS_DEV_REPLACE_DEVID) {
676 fs_devices->rw_devices++;
677 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
678 }
679 btrfs_release_disk_super(disk_super);
680
681 return 0;
682
683 error_free_page:
684 btrfs_release_disk_super(disk_super);
685 blkdev_put(bdev, flags);
686
687 return -EINVAL;
688 }
689
690 /*
691 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
692 * being created with a disk that has already completed its fsid change. Such
693 * disk can belong to an fs which has its FSID changed or to one which doesn't.
694 * Handle both cases here.
695 */
696 static struct btrfs_fs_devices *find_fsid_inprogress(
697 struct btrfs_super_block *disk_super)
698 {
699 struct btrfs_fs_devices *fs_devices;
700
701 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
702 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
703 BTRFS_FSID_SIZE) != 0 &&
704 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
705 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
706 return fs_devices;
707 }
708 }
709
710 return find_fsid(disk_super->fsid, NULL);
711 }
712
713
714 static struct btrfs_fs_devices *find_fsid_changed(
715 struct btrfs_super_block *disk_super)
716 {
717 struct btrfs_fs_devices *fs_devices;
718
719 /*
720 * Handles the case where the scanned device is part of an fs that had
721 * multiple successful changes of FSID but the currently scanned device did
722 * not observe them. Meaning our fsid will be different from theirs. We need
723 * to handle two subcases:
724 * 1 - The fs still continues to have different METADATA/FSID uuids.
725 * 2 - The fs is switched back to its original FSID (METADATA/FSID
726 * are equal).
727 */
728 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
729 /* Changed UUIDs */
730 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
731 BTRFS_FSID_SIZE) != 0 &&
732 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
733 BTRFS_FSID_SIZE) == 0 &&
734 memcmp(fs_devices->fsid, disk_super->fsid,
735 BTRFS_FSID_SIZE) != 0)
736 return fs_devices;
737
738 /* Unchanged UUIDs */
739 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
740 BTRFS_FSID_SIZE) == 0 &&
741 memcmp(fs_devices->fsid, disk_super->metadata_uuid,
742 BTRFS_FSID_SIZE) == 0)
743 return fs_devices;
744 }
745
746 return NULL;
747 }
748
749 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
750 struct btrfs_super_block *disk_super)
751 {
752 struct btrfs_fs_devices *fs_devices;
753
754 /*
755 * Handle the case where the scanned device is part of an fs whose last
756 * metadata UUID change reverted it to the original FSID. At the same
757 * time fs_devices was first created by another constituent device
758 * which didn't fully observe the operation. This results in a
759 * btrfs_fs_devices created with metadata/fsid different AND
760 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
761 * fs_devices equal to the FSID of the disk.
762 */
763 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
764 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
765 BTRFS_FSID_SIZE) != 0 &&
766 memcmp(fs_devices->metadata_uuid, disk_super->fsid,
767 BTRFS_FSID_SIZE) == 0 &&
768 fs_devices->fsid_change)
769 return fs_devices;
770 }
771
772 return NULL;
773 }
774 /*
775 * Add new device to list of registered devices
776 *
777 * Returns:
778 * device pointer which was just added or updated when successful
779 * error pointer when failed
780 */
781 static noinline struct btrfs_device *device_list_add(const char *path,
782 struct btrfs_super_block *disk_super,
783 bool *new_device_added)
784 {
785 struct btrfs_device *device;
786 struct btrfs_fs_devices *fs_devices = NULL;
787 struct rcu_string *name;
788 u64 found_transid = btrfs_super_generation(disk_super);
789 u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
790 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
791 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
792 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
793 BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
794
795 if (fsid_change_in_progress) {
796 if (!has_metadata_uuid)
797 fs_devices = find_fsid_inprogress(disk_super);
798 else
799 fs_devices = find_fsid_changed(disk_super);
800 } else if (has_metadata_uuid) {
801 fs_devices = find_fsid_with_metadata_uuid(disk_super);
802 } else {
803 fs_devices = find_fsid_reverted_metadata(disk_super);
804 if (!fs_devices)
805 fs_devices = find_fsid(disk_super->fsid, NULL);
806 }
807
808
809 if (!fs_devices) {
810 if (has_metadata_uuid)
811 fs_devices = alloc_fs_devices(disk_super->fsid,
812 disk_super->metadata_uuid);
813 else
814 fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
815
816 if (IS_ERR(fs_devices))
817 return ERR_CAST(fs_devices);
818
819 fs_devices->fsid_change = fsid_change_in_progress;
820
821 mutex_lock(&fs_devices->device_list_mutex);
822 list_add(&fs_devices->fs_list, &fs_uuids);
823
824 device = NULL;
825 } else {
826 mutex_lock(&fs_devices->device_list_mutex);
827 device = btrfs_find_device(fs_devices, devid,
828 disk_super->dev_item.uuid, NULL, false);
829
830 /*
831 * If this disk has been pulled into an fs_devices created by
832 * a device which had the CHANGING_FSID_V2 flag then replace the
833 * metadata_uuid/fsid values of the fs_devices.
834 */
835 if (fs_devices->fsid_change &&
836 found_transid > fs_devices->latest_generation) {
837 memcpy(fs_devices->fsid, disk_super->fsid,
838 BTRFS_FSID_SIZE);
839
840 if (has_metadata_uuid)
841 memcpy(fs_devices->metadata_uuid,
842 disk_super->metadata_uuid,
843 BTRFS_FSID_SIZE);
844 else
845 memcpy(fs_devices->metadata_uuid,
846 disk_super->fsid, BTRFS_FSID_SIZE);
847
848 fs_devices->fsid_change = false;
849 }
850 }
851
852 if (!device) {
853 if (fs_devices->opened) {
854 mutex_unlock(&fs_devices->device_list_mutex);
855 return ERR_PTR(-EBUSY);
856 }
857
858 device = btrfs_alloc_device(NULL, &devid,
859 disk_super->dev_item.uuid);
860 if (IS_ERR(device)) {
861 mutex_unlock(&fs_devices->device_list_mutex);
862 /* we can safely leave the fs_devices entry around */
863 return device;
864 }
865
866 name = rcu_string_strdup(path, GFP_NOFS);
867 if (!name) {
868 btrfs_free_device(device);
869 mutex_unlock(&fs_devices->device_list_mutex);
870 return ERR_PTR(-ENOMEM);
871 }
872 rcu_assign_pointer(device->name, name);
873
874 list_add_rcu(&device->dev_list, &fs_devices->devices);
875 fs_devices->num_devices++;
876
877 device->fs_devices = fs_devices;
878 *new_device_added = true;
879
880 if (disk_super->label[0])
881 pr_info(
882 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
883 disk_super->label, devid, found_transid, path,
884 current->comm, task_pid_nr(current));
885 else
886 pr_info(
887 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
888 disk_super->fsid, devid, found_transid, path,
889 current->comm, task_pid_nr(current));
890
891 } else if (!device->name || strcmp(device->name->str, path)) {
892 /*
893 * When the FS is already mounted.
894 * 1. If you are here and if the device->name is NULL that
895 * means this device was missing at time of FS mount.
896 * 2. If you are here and if the device->name is different
897 * from 'path' that means either
898 * a. The same device disappeared and reappeared with
899 * different name. or
900 * b. The missing-disk-which-was-replaced, has
901 * reappeared now.
902 *
903 * We must allow 1 and 2a above. But 2b would be spurious
904 * and unintentional.
905 *
906 * Further in case of 1 and 2a above, the disk at 'path'
907 * would have missed some transaction when it was away and
908 * in case of 2a the stale bdev has to be updated as well.
909 * 2b must not be allowed at any time.
910 */
911
912 /*
913 * For now, we do allow update to btrfs_fs_device through the
914 * btrfs dev scan cli after FS has been mounted. We're still
915 * tracking a problem where systems fail mount by subvolume id
916 * when we reject replacement on a mounted FS.
917 */
918 if (!fs_devices->opened && found_transid < device->generation) {
919 /*
920 * That is if the FS is _not_ mounted and if you
921 * are here, that means there is more than one
922 * disk with the same uuid and devid. We keep the one
923 * with the larger generation number or the last-in if
924 * generations are equal.
925 */
926 mutex_unlock(&fs_devices->device_list_mutex);
927 return ERR_PTR(-EEXIST);
928 }
929
930 /*
931 * We are going to replace the device path for a given devid,
932 * make sure it's the same device if the device is mounted
933 */
934 if (device->bdev) {
935 struct block_device *path_bdev;
936
937 path_bdev = lookup_bdev(path);
938 if (IS_ERR(path_bdev)) {
939 mutex_unlock(&fs_devices->device_list_mutex);
940 return ERR_CAST(path_bdev);
941 }
942
943 if (device->bdev != path_bdev) {
944 bdput(path_bdev);
945 mutex_unlock(&fs_devices->device_list_mutex);
946 /*
947 * device->fs_info may not be reliable here, so
948 * pass in a NULL instead. This avoids a
949 * possible use-after-free when the fs_info and
950 * fs_info->sb are already torn down.
951 */
952 btrfs_warn_in_rcu(NULL,
953 "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
954 path, devid, found_transid,
955 current->comm,
956 task_pid_nr(current));
957 return ERR_PTR(-EEXIST);
958 }
959 bdput(path_bdev);
960 btrfs_info_in_rcu(device->fs_info,
961 "devid %llu device path %s changed to %s scanned by %s (%d)",
962 devid, rcu_str_deref(device->name),
963 path, current->comm,
964 task_pid_nr(current));
965 }
966
967 name = rcu_string_strdup(path, GFP_NOFS);
968 if (!name) {
969 mutex_unlock(&fs_devices->device_list_mutex);
970 return ERR_PTR(-ENOMEM);
971 }
972 rcu_string_free(device->name);
973 rcu_assign_pointer(device->name, name);
974 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
975 fs_devices->missing_devices--;
976 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
977 }
978 }
979
980 /*
981 * Unmount does not free the btrfs_device struct but would zero
982 * generation along with most of the other members. So just update
983 * it back. We need it to pick the disk with largest generation
984 * (as above).
985 */
986 if (!fs_devices->opened) {
987 device->generation = found_transid;
988 fs_devices->latest_generation = max_t(u64, found_transid,
989 fs_devices->latest_generation);
990 }
991
992 fs_devices->total_devices = btrfs_super_num_devices(disk_super);
993
994 mutex_unlock(&fs_devices->device_list_mutex);
995 return device;
996 }
997
998 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
999 {
1000 struct btrfs_fs_devices *fs_devices;
1001 struct btrfs_device *device;
1002 struct btrfs_device *orig_dev;
1003 int ret = 0;
1004
1005 lockdep_assert_held(&uuid_mutex);
1006
1007 fs_devices = alloc_fs_devices(orig->fsid, NULL);
1008 if (IS_ERR(fs_devices))
1009 return fs_devices;
1010
1011 fs_devices->total_devices = orig->total_devices;
1012
1013 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1014 struct rcu_string *name;
1015
1016 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1017 orig_dev->uuid);
1018 if (IS_ERR(device)) {
1019 ret = PTR_ERR(device);
1020 goto error;
1021 }
1022
1023 /*
1024 * This is ok to do without the RCU read lock because we hold the
1025 * uuid_mutex so nothing we touch in here is going to disappear.
1026 */
1027 if (orig_dev->name) {
1028 name = rcu_string_strdup(orig_dev->name->str,
1029 GFP_KERNEL);
1030 if (!name) {
1031 btrfs_free_device(device);
1032 ret = -ENOMEM;
1033 goto error;
1034 }
1035 rcu_assign_pointer(device->name, name);
1036 }
1037
1038 list_add(&device->dev_list, &fs_devices->devices);
1039 device->fs_devices = fs_devices;
1040 fs_devices->num_devices++;
1041 }
1042 return fs_devices;
1043 error:
1044 free_fs_devices(fs_devices);
1045 return ERR_PTR(ret);
1046 }
1047
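/*
 * Helper for btrfs_free_extra_devids(): walk @fs_devices and release every
 * device that was not found in the filesystem metadata (the
 * BTRFS_DEV_STATE_IN_FS_METADATA bit is unset), closing its bdev if it is
 * open; among the devices that stay, remember the one with the highest
 * generation in @latest_dev (skipping the replace target and missing
 * devices).
 */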
1048 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1049 int step, struct btrfs_device **latest_dev)
1050 {
1051 struct btrfs_device *device, *next;
1052
1053 /* This is the initialized path, it is safe to release the devices. */
1054 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1055 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1056 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1057 &device->dev_state) &&
1058 !test_bit(BTRFS_DEV_STATE_MISSING,
1059 &device->dev_state) &&
1060 (!*latest_dev ||
1061 device->generation > (*latest_dev)->generation)) {
1062 *latest_dev = device;
1063 }
1064 continue;
1065 }
1066
1067 /*
1068 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
1069 * in btrfs_init_dev_replace(), so just continue.
1070 */
1071 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1072 continue;
1073
1074 if (device->bdev) {
1075 blkdev_put(device->bdev, device->mode);
1076 device->bdev = NULL;
1077 fs_devices->open_devices--;
1078 }
1079 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1080 list_del_init(&device->dev_alloc_list);
1081 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1082 fs_devices->rw_devices--;
1083 }
1084 list_del_init(&device->dev_list);
1085 fs_devices->num_devices--;
1086 btrfs_free_device(device);
1087 }
1088
1089 }
1090
1091 /*
1092 * After we have read the system tree and know devids belonging to this
1093 * filesystem, remove any device which does not belong there.
1094 */
1095 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1096 {
1097 struct btrfs_device *latest_dev = NULL;
1098 struct btrfs_fs_devices *seed_dev;
1099
1100 mutex_lock(&uuid_mutex);
1101 __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1102
1103 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1104 __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
1105
1106 fs_devices->latest_bdev = latest_dev->bdev;
1107
1108 mutex_unlock(&uuid_mutex);
1109 }
1110
1111 static void btrfs_close_bdev(struct btrfs_device *device)
1112 {
1113 if (!device->bdev)
1114 return;
1115
1116 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1117 sync_blockdev(device->bdev);
1118 invalidate_bdev(device->bdev);
1119 }
1120
1121 blkdev_put(device->bdev, device->mode);
1122 }
1123
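/*
 * Close a single device at unmount time: drop it from the allocation list,
 * release the backing block device and clear its state so the struct is left
 * pristine (see the ASSERTs at the end) and can be reused by a later mount.
 */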
1124 static void btrfs_close_one_device(struct btrfs_device *device)
1125 {
1126 struct btrfs_fs_devices *fs_devices = device->fs_devices;
1127
1128 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1129 device->devid != BTRFS_DEV_REPLACE_DEVID) {
1130 list_del_init(&device->dev_alloc_list);
1131 fs_devices->rw_devices--;
1132 }
1133
1134 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1135 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1136
1137 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1138 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1139 fs_devices->missing_devices--;
1140 }
1141
1142 btrfs_close_bdev(device);
1143 if (device->bdev) {
1144 fs_devices->open_devices--;
1145 device->bdev = NULL;
1146 }
1147 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1148
1149 device->fs_info = NULL;
1150 atomic_set(&device->dev_stats_ccnt, 0);
1151 extent_io_tree_release(&device->alloc_state);
1152
1153 /*
1154 * Reset the flush error record. We might have a transient flush error
1155 * in this mount, and if so we aborted the current transaction and set
1156 * the fs to an error state, guaranteeing no super blocks can be further
1157 * committed. However that error might be transient and if we unmount the
1158 * filesystem and mount it again, we should allow the mount to succeed
1159 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1160 * filesystem again we still get flush errors, then we will again abort
1161 * any transaction and set the error state, guaranteeing no commits of
1162 * unsafe super blocks.
1163 */
1164 device->last_flush_error = 0;
1165
1166 /* Verify the device is back in a pristine state */
1167 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1168 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1169 ASSERT(list_empty(&device->dev_alloc_list));
1170 ASSERT(list_empty(&device->post_commit_list));
1171 ASSERT(atomic_read(&device->reada_in_flight) == 0);
1172 }
1173
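/*
 * Drop one open reference on @fs_devices; only when the last opener goes
 * away are the individual devices actually closed and the counters reset.
 */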
1174 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1175 {
1176 struct btrfs_device *device, *tmp;
1177
1178 lockdep_assert_held(&uuid_mutex);
1179
1180 if (--fs_devices->opened > 0)
1181 return;
1182
1183 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1184 btrfs_close_one_device(device);
1185
1186 WARN_ON(fs_devices->open_devices);
1187 WARN_ON(fs_devices->rw_devices);
1188 fs_devices->opened = 0;
1189 fs_devices->seeding = false;
1190 fs_devices->fs_info = NULL;
1191 }
1192
1193 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1194 {
1195 LIST_HEAD(list);
1196 struct btrfs_fs_devices *tmp;
1197
1198 mutex_lock(&uuid_mutex);
1199 close_fs_devices(fs_devices);
1200 if (!fs_devices->opened)
1201 list_splice_init(&fs_devices->seed_list, &list);
1202
1203 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1204 close_fs_devices(fs_devices);
1205 list_del(&fs_devices->seed_list);
1206 free_fs_devices(fs_devices);
1207 }
1208 mutex_unlock(&uuid_mutex);
1209 }
1210
1211 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1212 fmode_t flags, void *holder)
1213 {
1214 struct btrfs_device *device;
1215 struct btrfs_device *latest_dev = NULL;
1216 struct btrfs_device *tmp_device;
1217
1218 flags |= FMODE_EXCL;
1219
1220 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1221 dev_list) {
1222 int ret;
1223
1224 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1225 if (ret == 0 &&
1226 (!latest_dev || device->generation > latest_dev->generation)) {
1227 latest_dev = device;
1228 } else if (ret == -ENODATA) {
1229 fs_devices->num_devices--;
1230 list_del(&device->dev_list);
1231 btrfs_free_device(device);
1232 }
1233 }
1234 if (fs_devices->open_devices == 0)
1235 return -EINVAL;
1236
1237 fs_devices->opened = 1;
1238 fs_devices->latest_bdev = latest_dev->bdev;
1239 fs_devices->total_rw_bytes = 0;
1240 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1241
1242 return 0;
1243 }
1244
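/* list_sort() comparator used in btrfs_open_devices(): ascending devid order. */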
1245 static int devid_cmp(void *priv, const struct list_head *a,
1246 const struct list_head *b)
1247 {
1248 struct btrfs_device *dev1, *dev2;
1249
1250 dev1 = list_entry(a, struct btrfs_device, dev_list);
1251 dev2 = list_entry(b, struct btrfs_device, dev_list);
1252
1253 if (dev1->devid < dev2->devid)
1254 return -1;
1255 else if (dev1->devid > dev2->devid)
1256 return 1;
1257 return 0;
1258 }
1259
1260 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1261 fmode_t flags, void *holder)
1262 {
1263 int ret;
1264
1265 lockdep_assert_held(&uuid_mutex);
1266 /*
1267 * The device_list_mutex cannot be taken here because opening the
1268 * underlying device may take further locks like bd_mutex.
1269 *
1270 * We also don't need the lock here as this is called during mount and
1271 * exclusion is provided by uuid_mutex
1272 */
1273
1274 if (fs_devices->opened) {
1275 fs_devices->opened++;
1276 ret = 0;
1277 } else {
1278 list_sort(NULL, &fs_devices->devices, devid_cmp);
1279 ret = open_fs_devices(fs_devices, flags, holder);
1280 }
1281
1282 return ret;
1283 }
1284
1285 void btrfs_release_disk_super(struct btrfs_super_block *super)
1286 {
1287 struct page *page = virt_to_page(super);
1288
1289 put_page(page);
1290 }
1291
1292 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1293 u64 bytenr)
1294 {
1295 struct btrfs_super_block *disk_super;
1296 struct page *page;
1297 void *p;
1298 pgoff_t index;
1299
1300 /* make sure our super fits in the device */
1301 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1302 return ERR_PTR(-EINVAL);
1303
1304 /* make sure our super fits in the page */
1305 if (sizeof(*disk_super) > PAGE_SIZE)
1306 return ERR_PTR(-EINVAL);
1307
1308 /* make sure our super doesn't straddle pages on disk */
1309 index = bytenr >> PAGE_SHIFT;
1310 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1311 return ERR_PTR(-EINVAL);
1312
1313 /* pull in the page with our super */
1314 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1315
1316 if (IS_ERR(page))
1317 return ERR_CAST(page);
1318
1319 p = page_address(page);
1320
1321 /* align our pointer to the offset of the super block */
1322 disk_super = p + offset_in_page(bytenr);
1323
1324 if (btrfs_super_bytenr(disk_super) != bytenr ||
1325 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1326 btrfs_release_disk_super(p);
1327 return ERR_PTR(-EINVAL);
1328 }
1329
1330 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1331 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1332
1333 return disk_super;
1334 }
1335
1336 int btrfs_forget_devices(const char *path)
1337 {
1338 int ret;
1339
1340 mutex_lock(&uuid_mutex);
1341 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1342 mutex_unlock(&uuid_mutex);
1343
1344 return ret;
1345 }
1346
1347 /*
1348 * Look for a btrfs signature on a device. This may be called out of the mount path
1349 * and we are not allowed to call set_blocksize during the scan. The superblock
1350 * is read via pagecache
1351 */
1352 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1353 void *holder)
1354 {
1355 struct btrfs_super_block *disk_super;
1356 bool new_device_added = false;
1357 struct btrfs_device *device = NULL;
1358 struct block_device *bdev;
1359 u64 bytenr;
1360
1361 lockdep_assert_held(&uuid_mutex);
1362
1363 /*
1364 * we would like to check all the supers, but that would make
1365 * a btrfs mount succeed after a mkfs from a different FS.
1366 * So, we need to add a special mount option to scan for
1367 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1368 */
1369 bytenr = btrfs_sb_offset(0);
1370 flags |= FMODE_EXCL;
1371
1372 bdev = blkdev_get_by_path(path, flags, holder);
1373 if (IS_ERR(bdev))
1374 return ERR_CAST(bdev);
1375
1376 disk_super = btrfs_read_disk_super(bdev, bytenr);
1377 if (IS_ERR(disk_super)) {
1378 device = ERR_CAST(disk_super);
1379 goto error_bdev_put;
1380 }
1381
1382 device = device_list_add(path, disk_super, &new_device_added);
1383 if (!IS_ERR(device)) {
1384 if (new_device_added)
1385 btrfs_free_stale_devices(path, device);
1386 }
1387
1388 btrfs_release_disk_super(disk_super);
1389
1390 error_bdev_put:
1391 blkdev_put(bdev, flags);
1392
1393 return device;
1394 }
1395
1396 /*
1397 * Try to find a chunk that intersects [start, start + len] range and when one
1398 * such is found, record the end of it in *start
1399 */
1400 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1401 u64 len)
1402 {
1403 u64 physical_start, physical_end;
1404
1405 lockdep_assert_held(&device->fs_info->chunk_mutex);
1406
1407 if (!find_first_extent_bit(&device->alloc_state, *start,
1408 &physical_start, &physical_end,
1409 CHUNK_ALLOCATED, NULL)) {
1410
1411 if (in_range(physical_start, *start, len) ||
1412 in_range(*start, physical_start,
1413 physical_end - physical_start)) {
1414 *start = physical_end + 1;
1415 return true;
1416 }
1417 }
1418 return false;
1419 }
1420
1421 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1422 {
1423 switch (device->fs_devices->chunk_alloc_policy) {
1424 case BTRFS_CHUNK_ALLOC_REGULAR:
1425 /*
1426 * We don't want to overwrite the superblock on the drive nor
1427 * any area used by the boot loader (grub for example), so we
1428 * make sure to start at an offset of at least 1MB.
1429 */
1430 return max_t(u64, start, SZ_1M);
1431 default:
1432 BUG();
1433 }
1434 }
1435
1436 /**
1437 * dev_extent_hole_check - check if specified hole is suitable for allocation
1438 * @device: the device which we have the hole
1439 * @hole_start: starting position of the hole
1440 * @hole_size: the size of the hole
1441 * @num_bytes: the size of the free space that we need
1442 *
1443 * This function may modify @hole_start and @hole_size to reflect the suitable
1444 * position for allocation. Returns true if the hole position is updated, false otherwise.
1445 */
1446 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1447 u64 *hole_size, u64 num_bytes)
1448 {
1449 bool changed = false;
1450 u64 hole_end = *hole_start + *hole_size;
1451
1452 /*
1453 * Check before we set max_hole_start, otherwise we could end up
1454 * sending back this offset anyway.
1455 */
1456 if (contains_pending_extent(device, hole_start, *hole_size)) {
1457 if (hole_end >= *hole_start)
1458 *hole_size = hole_end - *hole_start;
1459 else
1460 *hole_size = 0;
1461 changed = true;
1462 }
1463
1464 switch (device->fs_devices->chunk_alloc_policy) {
1465 case BTRFS_CHUNK_ALLOC_REGULAR:
1466 /* No extra check */
1467 break;
1468 default:
1469 BUG();
1470 }
1471
1472 return changed;
1473 }
1474
1475 /*
1476 * find_free_dev_extent_start - find free space in the specified device
1477 * @device: the device which we search the free space in
1478 * @num_bytes: the size of the free space that we need
1479 * @search_start: the position from which to begin the search
1480 * @start: store the start of the free space.
1481 * @len: the size of the free space that we find, or the size
1482 * of the max free space if we don't find suitable free space
1483 *
1484 * this uses a pretty simple search, the expectation is that it is
1485 * called very infrequently and that a given device has a small number
1486 * of extents
1487 *
1488 * @start is used to store the start of the free space if we find it. But if we
1489 * don't find suitable free space, it will be used to store the start position
1490 * of the max free space.
1491 *
1492 * @len is used to store the size of the free space that we find.
1493 * But if we don't find suitable free space, it is used to store the size of
1494 * the max free space.
1495 *
1496 * NOTE: This function will search *commit* root of device tree, and does extra
1497 * check to ensure dev extents are not double allocated.
1498 * This makes the function safe to allocate dev extents but may not report
1499 * correct usable device space, as device extent freed in current transaction
1500 * is not reported as available.
1501 */
1502 static int find_free_dev_extent_start(struct btrfs_device *device,
1503 u64 num_bytes, u64 search_start, u64 *start,
1504 u64 *len)
1505 {
1506 struct btrfs_fs_info *fs_info = device->fs_info;
1507 struct btrfs_root *root = fs_info->dev_root;
1508 struct btrfs_key key;
1509 struct btrfs_dev_extent *dev_extent;
1510 struct btrfs_path *path;
1511 u64 hole_size;
1512 u64 max_hole_start;
1513 u64 max_hole_size;
1514 u64 extent_end;
1515 u64 search_end = device->total_bytes;
1516 int ret;
1517 int slot;
1518 struct extent_buffer *l;
1519
1520 search_start = dev_extent_search_start(device, search_start);
1521
1522 path = btrfs_alloc_path();
1523 if (!path)
1524 return -ENOMEM;
1525
1526 max_hole_start = search_start;
1527 max_hole_size = 0;
1528
1529 again:
1530 if (search_start >= search_end ||
1531 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1532 ret = -ENOSPC;
1533 goto out;
1534 }
1535
1536 path->reada = READA_FORWARD;
1537 path->search_commit_root = 1;
1538 path->skip_locking = 1;
1539
1540 key.objectid = device->devid;
1541 key.offset = search_start;
1542 key.type = BTRFS_DEV_EXTENT_KEY;
1543
1544 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1545 if (ret < 0)
1546 goto out;
1547 if (ret > 0) {
1548 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1549 if (ret < 0)
1550 goto out;
1551 }
1552
1553 while (1) {
1554 l = path->nodes[0];
1555 slot = path->slots[0];
1556 if (slot >= btrfs_header_nritems(l)) {
1557 ret = btrfs_next_leaf(root, path);
1558 if (ret == 0)
1559 continue;
1560 if (ret < 0)
1561 goto out;
1562
1563 break;
1564 }
1565 btrfs_item_key_to_cpu(l, &key, slot);
1566
1567 if (key.objectid < device->devid)
1568 goto next;
1569
1570 if (key.objectid > device->devid)
1571 break;
1572
1573 if (key.type != BTRFS_DEV_EXTENT_KEY)
1574 goto next;
1575
1576 if (key.offset > search_start) {
1577 hole_size = key.offset - search_start;
1578 dev_extent_hole_check(device, &search_start, &hole_size,
1579 num_bytes);
1580
1581 if (hole_size > max_hole_size) {
1582 max_hole_start = search_start;
1583 max_hole_size = hole_size;
1584 }
1585
1586 /*
1587 * If this free space is greater than what we need,
1588 * it must be the max free space that we have found
1589 * until now, so max_hole_start must point to the start
1590 * of this free space and the length of this free space
1591 * is stored in max_hole_size. Thus, we return
1592 * max_hole_start and max_hole_size and go back to the
1593 * caller.
1594 */
1595 if (hole_size >= num_bytes) {
1596 ret = 0;
1597 goto out;
1598 }
1599 }
1600
1601 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1602 extent_end = key.offset + btrfs_dev_extent_length(l,
1603 dev_extent);
1604 if (extent_end > search_start)
1605 search_start = extent_end;
1606 next:
1607 path->slots[0]++;
1608 cond_resched();
1609 }
1610
1611 /*
1612 * At this point, search_start should be the end of
1613 * allocated dev extents, and when shrinking the device,
1614 * search_end may be smaller than search_start.
1615 */
1616 if (search_end > search_start) {
1617 hole_size = search_end - search_start;
1618 if (dev_extent_hole_check(device, &search_start, &hole_size,
1619 num_bytes)) {
1620 btrfs_release_path(path);
1621 goto again;
1622 }
1623
1624 if (hole_size > max_hole_size) {
1625 max_hole_start = search_start;
1626 max_hole_size = hole_size;
1627 }
1628 }
1629
1630 /* See above. */
1631 if (max_hole_size < num_bytes)
1632 ret = -ENOSPC;
1633 else
1634 ret = 0;
1635
1636 out:
1637 btrfs_free_path(path);
1638 *start = max_hole_start;
1639 if (len)
1640 *len = max_hole_size;
1641 return ret;
1642 }
1643
1644 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1645 u64 *start, u64 *len)
1646 {
1647 /* FIXME use last free of some kind */
1648 return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1649 }
1650
1651 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1652 struct btrfs_device *device,
1653 u64 start, u64 *dev_extent_len)
1654 {
1655 struct btrfs_fs_info *fs_info = device->fs_info;
1656 struct btrfs_root *root = fs_info->dev_root;
1657 int ret;
1658 struct btrfs_path *path;
1659 struct btrfs_key key;
1660 struct btrfs_key found_key;
1661 struct extent_buffer *leaf = NULL;
1662 struct btrfs_dev_extent *extent = NULL;
1663
1664 path = btrfs_alloc_path();
1665 if (!path)
1666 return -ENOMEM;
1667
1668 key.objectid = device->devid;
1669 key.offset = start;
1670 key.type = BTRFS_DEV_EXTENT_KEY;
1671 again:
1672 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1673 if (ret > 0) {
1674 ret = btrfs_previous_item(root, path, key.objectid,
1675 BTRFS_DEV_EXTENT_KEY);
1676 if (ret)
1677 goto out;
1678 leaf = path->nodes[0];
1679 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1680 extent = btrfs_item_ptr(leaf, path->slots[0],
1681 struct btrfs_dev_extent);
1682 BUG_ON(found_key.offset > start || found_key.offset +
1683 btrfs_dev_extent_length(leaf, extent) < start);
1684 key = found_key;
1685 btrfs_release_path(path);
1686 goto again;
1687 } else if (ret == 0) {
1688 leaf = path->nodes[0];
1689 extent = btrfs_item_ptr(leaf, path->slots[0],
1690 struct btrfs_dev_extent);
1691 } else {
1692 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1693 goto out;
1694 }
1695
1696 *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1697
1698 ret = btrfs_del_item(trans, root, path);
1699 if (ret) {
1700 btrfs_handle_fs_error(fs_info, ret,
1701 "Failed to remove dev extent item");
1702 } else {
1703 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1704 }
1705 out:
1706 btrfs_free_path(path);
1707 return ret;
1708 }
1709
1710 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1711 struct btrfs_device *device,
1712 u64 chunk_offset, u64 start, u64 num_bytes)
1713 {
1714 int ret;
1715 struct btrfs_path *path;
1716 struct btrfs_fs_info *fs_info = device->fs_info;
1717 struct btrfs_root *root = fs_info->dev_root;
1718 struct btrfs_dev_extent *extent;
1719 struct extent_buffer *leaf;
1720 struct btrfs_key key;
1721
1722 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1723 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1724 path = btrfs_alloc_path();
1725 if (!path)
1726 return -ENOMEM;
1727
1728 key.objectid = device->devid;
1729 key.offset = start;
1730 key.type = BTRFS_DEV_EXTENT_KEY;
1731 ret = btrfs_insert_empty_item(trans, root, path, &key,
1732 sizeof(*extent));
1733 if (ret)
1734 goto out;
1735
1736 leaf = path->nodes[0];
1737 extent = btrfs_item_ptr(leaf, path->slots[0],
1738 struct btrfs_dev_extent);
1739 btrfs_set_dev_extent_chunk_tree(leaf, extent,
1740 BTRFS_CHUNK_TREE_OBJECTID);
1741 btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1742 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1743 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1744
1745 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1746 btrfs_mark_buffer_dirty(leaf);
1747 out:
1748 btrfs_free_path(path);
1749 return ret;
1750 }
1751
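/*
 * Return the logical address right after the last chunk in the in-memory
 * mapping tree, i.e. the first offset where a new chunk may be placed, or 0
 * if no chunks are mapped yet.
 */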
1752 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1753 {
1754 struct extent_map_tree *em_tree;
1755 struct extent_map *em;
1756 struct rb_node *n;
1757 u64 ret = 0;
1758
1759 em_tree = &fs_info->mapping_tree;
1760 read_lock(&em_tree->lock);
1761 n = rb_last(&em_tree->map.rb_root);
1762 if (n) {
1763 em = rb_entry(n, struct extent_map, rb_node);
1764 ret = em->start + em->len;
1765 }
1766 read_unlock(&em_tree->lock);
1767
1768 return ret;
1769 }
1770
1771 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1772 u64 *devid_ret)
1773 {
1774 int ret;
1775 struct btrfs_key key;
1776 struct btrfs_key found_key;
1777 struct btrfs_path *path;
1778
1779 path = btrfs_alloc_path();
1780 if (!path)
1781 return -ENOMEM;
1782
1783 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1784 key.type = BTRFS_DEV_ITEM_KEY;
1785 key.offset = (u64)-1;
1786
1787 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1788 if (ret < 0)
1789 goto error;
1790
1791 if (ret == 0) {
1792 /* Corruption */
1793 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1794 ret = -EUCLEAN;
1795 goto error;
1796 }
1797
1798 ret = btrfs_previous_item(fs_info->chunk_root, path,
1799 BTRFS_DEV_ITEMS_OBJECTID,
1800 BTRFS_DEV_ITEM_KEY);
1801 if (ret) {
1802 *devid_ret = 1;
1803 } else {
1804 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1805 path->slots[0]);
1806 *devid_ret = found_key.offset + 1;
1807 }
1808 ret = 0;
1809 error:
1810 btrfs_free_path(path);
1811 return ret;
1812 }
1813
1814 /*
1815 * The device information is stored in the chunk root.
1816 * The btrfs_device struct should be fully filled in.
1817 */
1818 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1819 struct btrfs_device *device)
1820 {
1821 int ret;
1822 struct btrfs_path *path;
1823 struct btrfs_dev_item *dev_item;
1824 struct extent_buffer *leaf;
1825 struct btrfs_key key;
1826 unsigned long ptr;
1827
1828 path = btrfs_alloc_path();
1829 if (!path)
1830 return -ENOMEM;
1831
1832 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1833 key.type = BTRFS_DEV_ITEM_KEY;
1834 key.offset = device->devid;
1835
1836 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1837 &key, sizeof(*dev_item));
1838 if (ret)
1839 goto out;
1840
1841 leaf = path->nodes[0];
1842 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1843
1844 btrfs_set_device_id(leaf, dev_item, device->devid);
1845 btrfs_set_device_generation(leaf, dev_item, 0);
1846 btrfs_set_device_type(leaf, dev_item, device->type);
1847 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1848 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1849 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1850 btrfs_set_device_total_bytes(leaf, dev_item,
1851 btrfs_device_get_disk_total_bytes(device));
1852 btrfs_set_device_bytes_used(leaf, dev_item,
1853 btrfs_device_get_bytes_used(device));
1854 btrfs_set_device_group(leaf, dev_item, 0);
1855 btrfs_set_device_seek_speed(leaf, dev_item, 0);
1856 btrfs_set_device_bandwidth(leaf, dev_item, 0);
1857 btrfs_set_device_start_offset(leaf, dev_item, 0);
1858
1859 ptr = btrfs_device_uuid(dev_item);
1860 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1861 ptr = btrfs_device_fsid(dev_item);
1862 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1863 ptr, BTRFS_FSID_SIZE);
1864 btrfs_mark_buffer_dirty(leaf);
1865
1866 ret = 0;
1867 out:
1868 btrfs_free_path(path);
1869 return ret;
1870 }
1871
1872 /*
1873 * Function to update ctime/mtime for a given device path.
1874 * Mainly used for ctime/mtime based probe like libblkid.
1875 *
1876 * We don't care about errors here, this is just to be kind to userspace.
1877 */
1878 static void update_dev_time(const char *device_path)
1879 {
1880 struct path path;
1881 struct timespec64 now;
1882 int ret;
1883
1884 ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1885 if (ret)
1886 return;
1887
1888 now = current_time(d_inode(path.dentry));
1889 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1890 path_put(&path);
1891 }
1892
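/*
 * Delete the DEV_ITEM of @device from the chunk tree.  This runs in its
 * own transaction, which is committed on success.
 */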
1893 static int btrfs_rm_dev_item(struct btrfs_device *device)
1894 {
1895 struct btrfs_root *root = device->fs_info->chunk_root;
1896 int ret;
1897 struct btrfs_path *path;
1898 struct btrfs_key key;
1899 struct btrfs_trans_handle *trans;
1900
1901 path = btrfs_alloc_path();
1902 if (!path)
1903 return -ENOMEM;
1904
1905 trans = btrfs_start_transaction(root, 0);
1906 if (IS_ERR(trans)) {
1907 btrfs_free_path(path);
1908 return PTR_ERR(trans);
1909 }
1910 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1911 key.type = BTRFS_DEV_ITEM_KEY;
1912 key.offset = device->devid;
1913
1914 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1915 if (ret) {
1916 if (ret > 0)
1917 ret = -ENOENT;
1918 btrfs_abort_transaction(trans, ret);
1919 btrfs_end_transaction(trans);
1920 goto out;
1921 }
1922
1923 ret = btrfs_del_item(trans, root, path);
1924 if (ret) {
1925 btrfs_abort_transaction(trans, ret);
1926 btrfs_end_transaction(trans);
1927 }
1928
1929 out:
1930 btrfs_free_path(path);
1931 if (!ret)
1932 ret = btrfs_commit_transaction(trans);
1933 return ret;
1934 }
1935
1936 /*
1937 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1938 * filesystem. It's up to the caller to adjust that number regarding eg. device
1939 * replace.
1940 */
1941 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1942 u64 num_devices)
1943 {
1944 u64 all_avail;
1945 unsigned seq;
1946 int i;
1947
1948 do {
1949 seq = read_seqbegin(&fs_info->profiles_lock);
1950
1951 all_avail = fs_info->avail_data_alloc_bits |
1952 fs_info->avail_system_alloc_bits |
1953 fs_info->avail_metadata_alloc_bits;
1954 } while (read_seqretry(&fs_info->profiles_lock, seq));
1955
1956 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1957 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1958 continue;
1959
1960 if (num_devices < btrfs_raid_array[i].devs_min) {
1961 int ret = btrfs_raid_array[i].mindev_error;
1962
1963 if (ret)
1964 return ret;
1965 }
1966 }
1967
1968 return 0;
1969 }
1970
1971 static struct btrfs_device * btrfs_find_next_active_device(
1972 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1973 {
1974 struct btrfs_device *next_device;
1975
1976 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1977 if (next_device != device &&
1978 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1979 && next_device->bdev)
1980 return next_device;
1981 }
1982
1983 return NULL;
1984 }
1985
1986 /*
1987 * Helper function to check if the given device is part of s_bdev / latest_bdev
1988 * and replace it with the provided or the next active device. In the context
1989 * where this function is called, there should always be another device (or
1990 * this_dev) which is active.
1991 */
1992 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1993 struct btrfs_device *next_device)
1994 {
1995 struct btrfs_fs_info *fs_info = device->fs_info;
1996
1997 if (!next_device)
1998 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1999 device);
2000 ASSERT(next_device);
2001
2002 if (fs_info->sb->s_bdev &&
2003 (fs_info->sb->s_bdev == device->bdev))
2004 fs_info->sb->s_bdev = next_device->bdev;
2005
2006 if (fs_info->fs_devices->latest_bdev == device->bdev)
2007 fs_info->fs_devices->latest_bdev = next_device->bdev;
2008 }
2009
2010 /*
2011 * Return btrfs_fs_devices::num_devices excluding the device that's being
2012 * currently replaced.
2013 */
2014 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2015 {
2016 u64 num_devices = fs_info->fs_devices->num_devices;
2017
2018 down_read(&fs_info->dev_replace.rwsem);
2019 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2020 ASSERT(num_devices > 1);
2021 num_devices--;
2022 }
2023 up_read(&fs_info->dev_replace.rwsem);
2024
2025 return num_devices;
2026 }
2027
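/*
 * Wipe the btrfs magic from every superblock copy on @bdev so the device
 * is no longer detected as a btrfs member, then notify udev and bump the
 * path's timestamps so userspace probes (e.g. libblkid) notice the change.
 */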
2028 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2029 struct block_device *bdev,
2030 const char *device_path)
2031 {
2032 struct btrfs_super_block *disk_super;
2033 int copy_num;
2034
2035 if (!bdev)
2036 return;
2037
2038 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2039 struct page *page;
2040 int ret;
2041
2042 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2043 if (IS_ERR(disk_super))
2044 continue;
2045
2046 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2047
2048 page = virt_to_page(disk_super);
2049 set_page_dirty(page);
2050 lock_page(page);
2051 /* write_one_page() unlocks the page */
2052 ret = write_one_page(page);
2053 if (ret)
2054 btrfs_warn(fs_info,
2055 "error clearing superblock number %d (%d)",
2056 copy_num, ret);
2057 btrfs_release_disk_super(disk_super);
2058
2059 }
2060
2061 /* Notify udev that device has changed */
2062 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2063
2064 /* Update ctime/mtime for device path for libblkid */
2065 update_dev_time(device_path);
2066 }
2067
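/*
 * Remove a device from a mounted filesystem.  The device is looked up by
 * @devid, or by @device_path when @devid is 0 ("missing" selects the first
 * missing device).  The flow below shrinks the device to zero, deletes its
 * DEV_ITEM, drops it from the in-memory device lists, updates num_devices
 * in the superblock copy and finally scratches the on-disk superblocks.
 */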
2068 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2069 u64 devid)
2070 {
2071 struct btrfs_device *device;
2072 struct btrfs_fs_devices *cur_devices;
2073 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2074 u64 num_devices;
2075 int ret = 0;
2076
2077 /*
2078 * The device list in fs_devices is accessed without locks (neither
2079 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2080 * filesystem and another device rm cannot run.
2081 */
2082 num_devices = btrfs_num_devices(fs_info);
2083
2084 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2085 if (ret)
2086 goto out;
2087
2088 device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2089
2090 if (IS_ERR(device)) {
2091 if (PTR_ERR(device) == -ENOENT &&
2092 device_path && strcmp(device_path, "missing") == 0)
2093 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2094 else
2095 ret = PTR_ERR(device);
2096 goto out;
2097 }
2098
2099 if (btrfs_pinned_by_swapfile(fs_info, device)) {
2100 btrfs_warn_in_rcu(fs_info,
2101 "cannot remove device %s (devid %llu) due to active swapfile",
2102 rcu_str_deref(device->name), device->devid);
2103 ret = -ETXTBSY;
2104 goto out;
2105 }
2106
2107 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2108 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2109 goto out;
2110 }
2111
2112 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2113 fs_info->fs_devices->rw_devices == 1) {
2114 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2115 goto out;
2116 }
2117
2118 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2119 mutex_lock(&fs_info->chunk_mutex);
2120 list_del_init(&device->dev_alloc_list);
2121 device->fs_devices->rw_devices--;
2122 mutex_unlock(&fs_info->chunk_mutex);
2123 }
2124
2125 ret = btrfs_shrink_device(device, 0);
2126 if (!ret)
2127 btrfs_reada_remove_dev(device);
2128 if (ret)
2129 goto error_undo;
2130
2131 /*
2132 * TODO: the superblock still includes this device in its num_devices
2133 * counter although write_all_supers() is not locked out. This
2134 * could give a filesystem state which requires a degraded mount.
2135 */
2136 ret = btrfs_rm_dev_item(device);
2137 if (ret)
2138 goto error_undo;
2139
2140 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2141 btrfs_scrub_cancel_dev(device);
2142
2143 /*
2144 * The device list mutex makes sure that we don't change
2145 * the device list while someone else is writing out all
2146 * the device supers. Whoever is writing all supers should
2147 * lock the device list mutex before getting the number of
2148 * devices in the super block (super_copy). Conversely,
2149 * whoever updates the number of devices in the super block
2150 * (super_copy) should hold the device list mutex.
2151 */
2152
2153 /*
2154 * In normal cases the cur_devices == fs_devices. But in case
2155 * of deleting a seed device, the cur_devices should point to
2156 * its own fs_devices listed under the fs_devices->seed.
2157 */
2158 cur_devices = device->fs_devices;
2159 mutex_lock(&fs_devices->device_list_mutex);
2160 list_del_rcu(&device->dev_list);
2161
2162 cur_devices->num_devices--;
2163 cur_devices->total_devices--;
2164 /* Update total_devices of the parent fs_devices if it's seed */
2165 if (cur_devices != fs_devices)
2166 fs_devices->total_devices--;
2167
2168 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2169 cur_devices->missing_devices--;
2170
2171 btrfs_assign_next_active_device(device, NULL);
2172
2173 if (device->bdev) {
2174 cur_devices->open_devices--;
2175 /* remove sysfs entry */
2176 btrfs_sysfs_remove_device(device);
2177 }
2178
2179 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2180 btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2181 mutex_unlock(&fs_devices->device_list_mutex);
2182
2183 /*
2184 * at this point, the device is zero sized and detached from
2185 * the devices list. All that's left is to zero out the old
2186 * supers and free the device.
2187 */
2188 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2189 btrfs_scratch_superblocks(fs_info, device->bdev,
2190 device->name->str);
2191
2192 btrfs_close_bdev(device);
2193 synchronize_rcu();
2194 btrfs_free_device(device);
2195
2196 if (cur_devices->open_devices == 0) {
2197 list_del_init(&cur_devices->seed_list);
2198 close_fs_devices(cur_devices);
2199 free_fs_devices(cur_devices);
2200 }
2201
2202 out:
2203 return ret;
2204
2205 error_undo:
2206 btrfs_reada_undo_remove_dev(device);
2207 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2208 mutex_lock(&fs_info->chunk_mutex);
2209 list_add(&device->dev_alloc_list,
2210 &fs_devices->alloc_list);
2211 device->fs_devices->rw_devices++;
2212 mutex_unlock(&fs_info->chunk_mutex);
2213 }
2214 goto out;
2215 }
2216
2217 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2218 {
2219 struct btrfs_fs_devices *fs_devices;
2220
2221 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2222
2223 /*
2224 * In case of a filesystem with no seed, srcdev->fs_devices will point
2225 * to the fs_devices of fs_info. However, when the device being replaced
2226 * is a seed device it will point to the seed's local fs_devices. In
2227 * short, srcdev will have the correct fs_devices in both cases.
2228 */
2229 fs_devices = srcdev->fs_devices;
2230
2231 list_del_rcu(&srcdev->dev_list);
2232 list_del(&srcdev->dev_alloc_list);
2233 fs_devices->num_devices--;
2234 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2235 fs_devices->missing_devices--;
2236
2237 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2238 fs_devices->rw_devices--;
2239
2240 if (srcdev->bdev)
2241 fs_devices->open_devices--;
2242 }
2243
2244 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2245 {
2246 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2247
2248 mutex_lock(&uuid_mutex);
2249
2250 btrfs_close_bdev(srcdev);
2251 synchronize_rcu();
2252 btrfs_free_device(srcdev);
2253
2254 /* if there are no devices left we'd rather delete the fs_devices */
2255 if (!fs_devices->num_devices) {
2256 /*
2257 * On a mounted FS, num_devices can't be zero unless it's a
2258 * seed. In case of a seed device being replaced, the replace
2259 * target is added to the sprout FS, so there will be no more
2260 * devices left under the seed FS.
2261 */
2262 ASSERT(fs_devices->seeding);
2263
2264 list_del_init(&fs_devices->seed_list);
2265 close_fs_devices(fs_devices);
2266 free_fs_devices(fs_devices);
2267 }
2268 mutex_unlock(&uuid_mutex);
2269 }
2270
2271 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2272 {
2273 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2274
2275 mutex_lock(&fs_devices->device_list_mutex);
2276
2277 btrfs_sysfs_remove_device(tgtdev);
2278
2279 if (tgtdev->bdev)
2280 fs_devices->open_devices--;
2281
2282 fs_devices->num_devices--;
2283
2284 btrfs_assign_next_active_device(tgtdev, NULL);
2285
2286 list_del_rcu(&tgtdev->dev_list);
2287
2288 mutex_unlock(&fs_devices->device_list_mutex);
2289
2290 /*
2291 * The update_dev_time() within btrfs_scratch_superblocks()
2292 * may lead to a call to btrfs_show_devname() which will try
2293 * to hold device_list_mutex. Here this device
2294 * is already out of the device list, so we don't have to hold
2295 * the device_list_mutex lock.
2296 */
2297 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2298 tgtdev->name->str);
2299
2300 btrfs_close_bdev(tgtdev);
2301 synchronize_rcu();
2302 btrfs_free_device(tgtdev);
2303 }
2304
2305 static struct btrfs_device *btrfs_find_device_by_path(
2306 struct btrfs_fs_info *fs_info, const char *device_path)
2307 {
2308 int ret = 0;
2309 struct btrfs_super_block *disk_super;
2310 u64 devid;
2311 u8 *dev_uuid;
2312 struct block_device *bdev;
2313 struct btrfs_device *device;
2314
2315 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2316 fs_info->bdev_holder, 0, &bdev, &disk_super);
2317 if (ret)
2318 return ERR_PTR(ret);
2319
2320 devid = btrfs_stack_device_id(&disk_super->dev_item);
2321 dev_uuid = disk_super->dev_item.uuid;
2322 if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2323 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2324 disk_super->metadata_uuid, true);
2325 else
2326 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2327 disk_super->fsid, true);
2328
2329 btrfs_release_disk_super(disk_super);
2330 if (!device)
2331 device = ERR_PTR(-ENOENT);
2332 blkdev_put(bdev, FMODE_READ);
2333 return device;
2334 }
2335
2336 /*
2337 * Lookup a device given by device id, or the path if the id is 0.
2338 */
2339 struct btrfs_device *btrfs_find_device_by_devspec(
2340 struct btrfs_fs_info *fs_info, u64 devid,
2341 const char *device_path)
2342 {
2343 struct btrfs_device *device;
2344
2345 if (devid) {
2346 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2347 NULL, true);
2348 if (!device)
2349 return ERR_PTR(-ENOENT);
2350 return device;
2351 }
2352
2353 if (!device_path || !device_path[0])
2354 return ERR_PTR(-EINVAL);
2355
2356 if (strcmp(device_path, "missing") == 0) {
2357 /* Find first missing device */
2358 list_for_each_entry(device, &fs_info->fs_devices->devices,
2359 dev_list) {
2360 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2361 &device->dev_state) && !device->bdev)
2362 return device;
2363 }
2364 return ERR_PTR(-ENOENT);
2365 }
2366
2367 return btrfs_find_device_by_path(fs_info, device_path);
2368 }
2369
2370 /*
2371 * Does all the dirty work required for changing the filesystem's UUID.
2372 */
2373 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2374 {
2375 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2376 struct btrfs_fs_devices *old_devices;
2377 struct btrfs_fs_devices *seed_devices;
2378 struct btrfs_super_block *disk_super = fs_info->super_copy;
2379 struct btrfs_device *device;
2380 u64 super_flags;
2381
2382 lockdep_assert_held(&uuid_mutex);
2383 if (!fs_devices->seeding)
2384 return -EINVAL;
2385
2386 /*
2387 * Private copy of the seed devices, anchored at
2388 * fs_info->fs_devices->seed_list
2389 */
2390 seed_devices = alloc_fs_devices(NULL, NULL);
2391 if (IS_ERR(seed_devices))
2392 return PTR_ERR(seed_devices);
2393
2394 /*
2395 * It's necessary to retain a copy of the original seed fs_devices in
2396 * fs_uuids so that filesystems which have been seeded can successfully
2397 * reference the seed device from open_seed_devices. This also supports
2398 * multiple seed filesystems.
2399 */
2400 old_devices = clone_fs_devices(fs_devices);
2401 if (IS_ERR(old_devices)) {
2402 kfree(seed_devices);
2403 return PTR_ERR(old_devices);
2404 }
2405
2406 list_add(&old_devices->fs_list, &fs_uuids);
2407
2408 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2409 seed_devices->opened = 1;
2410 INIT_LIST_HEAD(&seed_devices->devices);
2411 INIT_LIST_HEAD(&seed_devices->alloc_list);
2412 mutex_init(&seed_devices->device_list_mutex);
2413
2414 mutex_lock(&fs_devices->device_list_mutex);
2415 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2416 synchronize_rcu);
2417 list_for_each_entry(device, &seed_devices->devices, dev_list)
2418 device->fs_devices = seed_devices;
2419
2420 fs_devices->seeding = false;
2421 fs_devices->num_devices = 0;
2422 fs_devices->open_devices = 0;
2423 fs_devices->missing_devices = 0;
2424 fs_devices->rotating = false;
2425 list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2426
2427 generate_random_uuid(fs_devices->fsid);
2428 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2429 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2430 mutex_unlock(&fs_devices->device_list_mutex);
2431
2432 super_flags = btrfs_super_flags(disk_super) &
2433 ~BTRFS_SUPER_FLAG_SEEDING;
2434 btrfs_set_super_flags(disk_super, super_flags);
2435
2436 return 0;
2437 }
2438
2439 /*
2440 * Store the expected generation for seed devices in device items.
2441 */
2442 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2443 {
2444 struct btrfs_fs_info *fs_info = trans->fs_info;
2445 struct btrfs_root *root = fs_info->chunk_root;
2446 struct btrfs_path *path;
2447 struct extent_buffer *leaf;
2448 struct btrfs_dev_item *dev_item;
2449 struct btrfs_device *device;
2450 struct btrfs_key key;
2451 u8 fs_uuid[BTRFS_FSID_SIZE];
2452 u8 dev_uuid[BTRFS_UUID_SIZE];
2453 u64 devid;
2454 int ret;
2455
2456 path = btrfs_alloc_path();
2457 if (!path)
2458 return -ENOMEM;
2459
2460 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2461 key.offset = 0;
2462 key.type = BTRFS_DEV_ITEM_KEY;
2463
2464 while (1) {
2465 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2466 if (ret < 0)
2467 goto error;
2468
2469 leaf = path->nodes[0];
2470 next_slot:
2471 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2472 ret = btrfs_next_leaf(root, path);
2473 if (ret > 0)
2474 break;
2475 if (ret < 0)
2476 goto error;
2477 leaf = path->nodes[0];
2478 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2479 btrfs_release_path(path);
2480 continue;
2481 }
2482
2483 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2484 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2485 key.type != BTRFS_DEV_ITEM_KEY)
2486 break;
2487
2488 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2489 struct btrfs_dev_item);
2490 devid = btrfs_device_id(leaf, dev_item);
2491 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2492 BTRFS_UUID_SIZE);
2493 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2494 BTRFS_FSID_SIZE);
2495 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2496 fs_uuid, true);
2497 BUG_ON(!device); /* Logic error */
2498
2499 if (device->fs_devices->seeding) {
2500 btrfs_set_device_generation(leaf, dev_item,
2501 device->generation);
2502 btrfs_mark_buffer_dirty(leaf);
2503 }
2504
2505 path->slots[0]++;
2506 goto next_slot;
2507 }
2508 ret = 0;
2509 error:
2510 btrfs_free_path(path);
2511 return ret;
2512 }
2513
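/*
 * Add the block device at @device_path to a mounted filesystem.  The device
 * is opened exclusively, a new btrfs_device is allocated and linked into
 * fs_devices, the superblock totals are updated and a DEV_ITEM is inserted.
 * If the filesystem being added to is a seed filesystem, it is first
 * sprouted into a new writable filesystem (see btrfs_prepare_sprout() and
 * btrfs_finish_sprout() above).
 */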
2514 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2515 {
2516 struct btrfs_root *root = fs_info->dev_root;
2517 struct request_queue *q;
2518 struct btrfs_trans_handle *trans;
2519 struct btrfs_device *device;
2520 struct block_device *bdev;
2521 struct super_block *sb = fs_info->sb;
2522 struct rcu_string *name;
2523 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2524 u64 orig_super_total_bytes;
2525 u64 orig_super_num_devices;
2526 int seeding_dev = 0;
2527 int ret = 0;
2528 bool locked = false;
2529
2530 if (sb_rdonly(sb) && !fs_devices->seeding)
2531 return -EROFS;
2532
2533 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2534 fs_info->bdev_holder);
2535 if (IS_ERR(bdev))
2536 return PTR_ERR(bdev);
2537
2538 if (fs_devices->seeding) {
2539 seeding_dev = 1;
2540 down_write(&sb->s_umount);
2541 mutex_lock(&uuid_mutex);
2542 locked = true;
2543 }
2544
2545 sync_blockdev(bdev);
2546
2547 rcu_read_lock();
2548 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2549 if (device->bdev == bdev) {
2550 ret = -EEXIST;
2551 rcu_read_unlock();
2552 goto error;
2553 }
2554 }
2555 rcu_read_unlock();
2556
2557 device = btrfs_alloc_device(fs_info, NULL, NULL);
2558 if (IS_ERR(device)) {
2559 /* we can safely leave the fs_devices entry around */
2560 ret = PTR_ERR(device);
2561 goto error;
2562 }
2563
2564 name = rcu_string_strdup(device_path, GFP_KERNEL);
2565 if (!name) {
2566 ret = -ENOMEM;
2567 goto error_free_device;
2568 }
2569 rcu_assign_pointer(device->name, name);
2570
2571 trans = btrfs_start_transaction(root, 0);
2572 if (IS_ERR(trans)) {
2573 ret = PTR_ERR(trans);
2574 goto error_free_device;
2575 }
2576
2577 q = bdev_get_queue(bdev);
2578 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2579 device->generation = trans->transid;
2580 device->io_width = fs_info->sectorsize;
2581 device->io_align = fs_info->sectorsize;
2582 device->sector_size = fs_info->sectorsize;
2583 device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2584 fs_info->sectorsize);
2585 device->disk_total_bytes = device->total_bytes;
2586 device->commit_total_bytes = device->total_bytes;
2587 device->fs_info = fs_info;
2588 device->bdev = bdev;
2589 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2590 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2591 device->mode = FMODE_EXCL;
2592 device->dev_stats_valid = 1;
2593 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2594
2595 if (seeding_dev) {
2596 sb->s_flags &= ~SB_RDONLY;
2597 ret = btrfs_prepare_sprout(fs_info);
2598 if (ret) {
2599 btrfs_abort_transaction(trans, ret);
2600 goto error_trans;
2601 }
2602 }
2603
2604 device->fs_devices = fs_devices;
2605
2606 mutex_lock(&fs_devices->device_list_mutex);
2607 mutex_lock(&fs_info->chunk_mutex);
2608 list_add_rcu(&device->dev_list, &fs_devices->devices);
2609 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2610 fs_devices->num_devices++;
2611 fs_devices->open_devices++;
2612 fs_devices->rw_devices++;
2613 fs_devices->total_devices++;
2614 fs_devices->total_rw_bytes += device->total_bytes;
2615
2616 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2617
2618 if (!blk_queue_nonrot(q))
2619 fs_devices->rotating = true;
2620
2621 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2622 btrfs_set_super_total_bytes(fs_info->super_copy,
2623 round_down(orig_super_total_bytes + device->total_bytes,
2624 fs_info->sectorsize));
2625
2626 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2627 btrfs_set_super_num_devices(fs_info->super_copy,
2628 orig_super_num_devices + 1);
2629
2630 /*
2631 * we've got more storage, clear any full flags on the space
2632 * infos
2633 */
2634 btrfs_clear_space_info_full(fs_info);
2635
2636 mutex_unlock(&fs_info->chunk_mutex);
2637
2638 /* Add sysfs device entry */
2639 btrfs_sysfs_add_device(device);
2640
2641 mutex_unlock(&fs_devices->device_list_mutex);
2642
2643 if (seeding_dev) {
2644 mutex_lock(&fs_info->chunk_mutex);
2645 ret = init_first_rw_device(trans);
2646 mutex_unlock(&fs_info->chunk_mutex);
2647 if (ret) {
2648 btrfs_abort_transaction(trans, ret);
2649 goto error_sysfs;
2650 }
2651 }
2652
2653 ret = btrfs_add_dev_item(trans, device);
2654 if (ret) {
2655 btrfs_abort_transaction(trans, ret);
2656 goto error_sysfs;
2657 }
2658
2659 if (seeding_dev) {
2660 ret = btrfs_finish_sprout(trans);
2661 if (ret) {
2662 btrfs_abort_transaction(trans, ret);
2663 goto error_sysfs;
2664 }
2665
2666 /*
2667 * fs_devices now represents the newly sprouted filesystem and
2668 * its fsid has been changed by btrfs_prepare_sprout
2669 */
2670 btrfs_sysfs_update_sprout_fsid(fs_devices);
2671 }
2672
2673 ret = btrfs_commit_transaction(trans);
2674
2675 if (seeding_dev) {
2676 mutex_unlock(&uuid_mutex);
2677 up_write(&sb->s_umount);
2678 locked = false;
2679
2680 if (ret) /* transaction commit */
2681 return ret;
2682
2683 ret = btrfs_relocate_sys_chunks(fs_info);
2684 if (ret < 0)
2685 btrfs_handle_fs_error(fs_info, ret,
2686 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2687 trans = btrfs_attach_transaction(root);
2688 if (IS_ERR(trans)) {
2689 if (PTR_ERR(trans) == -ENOENT)
2690 return 0;
2691 ret = PTR_ERR(trans);
2692 trans = NULL;
2693 goto error_sysfs;
2694 }
2695 ret = btrfs_commit_transaction(trans);
2696 }
2697
2698 /*
2699 * Now that we have written a new super block to this device, check all
2700 * other fs_devices lists to see whether device_path alienates any other
2701 * scanned device.
2702 * We can ignore the return value as it typically returns -EINVAL and
2703 * only succeeds if the device was an alien.
2704 */
2705 btrfs_forget_devices(device_path);
2706
2707 /* Update ctime/mtime for blkid or udev */
2708 update_dev_time(device_path);
2709
2710 return ret;
2711
2712 error_sysfs:
2713 btrfs_sysfs_remove_device(device);
2714 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2715 mutex_lock(&fs_info->chunk_mutex);
2716 list_del_rcu(&device->dev_list);
2717 list_del(&device->dev_alloc_list);
2718 fs_info->fs_devices->num_devices--;
2719 fs_info->fs_devices->open_devices--;
2720 fs_info->fs_devices->rw_devices--;
2721 fs_info->fs_devices->total_devices--;
2722 fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2723 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2724 btrfs_set_super_total_bytes(fs_info->super_copy,
2725 orig_super_total_bytes);
2726 btrfs_set_super_num_devices(fs_info->super_copy,
2727 orig_super_num_devices);
2728 mutex_unlock(&fs_info->chunk_mutex);
2729 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2730 error_trans:
2731 if (seeding_dev)
2732 sb->s_flags |= SB_RDONLY;
2733 if (trans)
2734 btrfs_end_transaction(trans);
2735 error_free_device:
2736 btrfs_free_device(device);
2737 error:
2738 blkdev_put(bdev, FMODE_EXCL);
2739 if (locked) {
2740 mutex_unlock(&uuid_mutex);
2741 up_write(&sb->s_umount);
2742 }
2743 return ret;
2744 }
2745
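/*
 * Write the current in-memory geometry of @device (sizes and bytes used)
 * back into its DEV_ITEM in the chunk tree.
 */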
2746 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2747 struct btrfs_device *device)
2748 {
2749 int ret;
2750 struct btrfs_path *path;
2751 struct btrfs_root *root = device->fs_info->chunk_root;
2752 struct btrfs_dev_item *dev_item;
2753 struct extent_buffer *leaf;
2754 struct btrfs_key key;
2755
2756 path = btrfs_alloc_path();
2757 if (!path)
2758 return -ENOMEM;
2759
2760 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2761 key.type = BTRFS_DEV_ITEM_KEY;
2762 key.offset = device->devid;
2763
2764 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2765 if (ret < 0)
2766 goto out;
2767
2768 if (ret > 0) {
2769 ret = -ENOENT;
2770 goto out;
2771 }
2772
2773 leaf = path->nodes[0];
2774 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2775
2776 btrfs_set_device_id(leaf, dev_item, device->devid);
2777 btrfs_set_device_type(leaf, dev_item, device->type);
2778 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2779 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2780 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2781 btrfs_set_device_total_bytes(leaf, dev_item,
2782 btrfs_device_get_disk_total_bytes(device));
2783 btrfs_set_device_bytes_used(leaf, dev_item,
2784 btrfs_device_get_bytes_used(device));
2785 btrfs_mark_buffer_dirty(leaf);
2786
2787 out:
2788 btrfs_free_path(path);
2789 return ret;
2790 }
2791
2792 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2793 struct btrfs_device *device, u64 new_size)
2794 {
2795 struct btrfs_fs_info *fs_info = device->fs_info;
2796 struct btrfs_super_block *super_copy = fs_info->super_copy;
2797 u64 old_total;
2798 u64 diff;
2799
2800 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2801 return -EACCES;
2802
2803 new_size = round_down(new_size, fs_info->sectorsize);
2804
2805 mutex_lock(&fs_info->chunk_mutex);
2806 old_total = btrfs_super_total_bytes(super_copy);
2807 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2808
2809 if (new_size <= device->total_bytes ||
2810 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2811 mutex_unlock(&fs_info->chunk_mutex);
2812 return -EINVAL;
2813 }
2814
2815 btrfs_set_super_total_bytes(super_copy,
2816 round_down(old_total + diff, fs_info->sectorsize));
2817 device->fs_devices->total_rw_bytes += diff;
2818
2819 btrfs_device_set_total_bytes(device, new_size);
2820 btrfs_device_set_disk_total_bytes(device, new_size);
2821 btrfs_clear_space_info_full(device->fs_info);
2822 if (list_empty(&device->post_commit_list))
2823 list_add_tail(&device->post_commit_list,
2824 &trans->transaction->dev_update_list);
2825 mutex_unlock(&fs_info->chunk_mutex);
2826
2827 return btrfs_update_device(trans, device);
2828 }
2829
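/*
 * Delete the CHUNK_ITEM at @chunk_offset from the chunk tree.
 */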
2830 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2831 {
2832 struct btrfs_fs_info *fs_info = trans->fs_info;
2833 struct btrfs_root *root = fs_info->chunk_root;
2834 int ret;
2835 struct btrfs_path *path;
2836 struct btrfs_key key;
2837
2838 path = btrfs_alloc_path();
2839 if (!path)
2840 return -ENOMEM;
2841
2842 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2843 key.offset = chunk_offset;
2844 key.type = BTRFS_CHUNK_ITEM_KEY;
2845
2846 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2847 if (ret < 0)
2848 goto out;
2849 else if (ret > 0) { /* Logic error or corruption */
2850 btrfs_handle_fs_error(fs_info, -ENOENT,
2851 "Failed lookup while freeing chunk.");
2852 ret = -ENOENT;
2853 goto out;
2854 }
2855
2856 ret = btrfs_del_item(trans, root, path);
2857 if (ret < 0)
2858 btrfs_handle_fs_error(fs_info, ret,
2859 "Failed to delete chunk item.");
2860 out:
2861 btrfs_free_path(path);
2862 return ret;
2863 }
2864
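/*
 * Remove the entry for the chunk at @chunk_offset from the superblock's
 * sys_chunk_array.  The array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk + stripes) pairs, so removal
 * is done by memmove()ing the tail over the matching entry and shrinking
 * the recorded array size.
 */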
2865 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2866 {
2867 struct btrfs_super_block *super_copy = fs_info->super_copy;
2868 struct btrfs_disk_key *disk_key;
2869 struct btrfs_chunk *chunk;
2870 u8 *ptr;
2871 int ret = 0;
2872 u32 num_stripes;
2873 u32 array_size;
2874 u32 len = 0;
2875 u32 cur;
2876 struct btrfs_key key;
2877
2878 mutex_lock(&fs_info->chunk_mutex);
2879 array_size = btrfs_super_sys_array_size(super_copy);
2880
2881 ptr = super_copy->sys_chunk_array;
2882 cur = 0;
2883
2884 while (cur < array_size) {
2885 disk_key = (struct btrfs_disk_key *)ptr;
2886 btrfs_disk_key_to_cpu(&key, disk_key);
2887
2888 len = sizeof(*disk_key);
2889
2890 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2891 chunk = (struct btrfs_chunk *)(ptr + len);
2892 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2893 len += btrfs_chunk_item_size(num_stripes);
2894 } else {
2895 ret = -EIO;
2896 break;
2897 }
2898 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2899 key.offset == chunk_offset) {
2900 memmove(ptr, ptr + len, array_size - (cur + len));
2901 array_size -= len;
2902 btrfs_set_super_sys_array_size(super_copy, array_size);
2903 } else {
2904 ptr += len;
2905 cur += len;
2906 }
2907 }
2908 mutex_unlock(&fs_info->chunk_mutex);
2909 return ret;
2910 }
2911
2912 /*
2913 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2914 * @logical: Logical block offset in bytes.
2915 * @length: Length of extent in bytes.
2916 *
2917 * Return: Chunk mapping or ERR_PTR.
2918 */
2919 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2920 u64 logical, u64 length)
2921 {
2922 struct extent_map_tree *em_tree;
2923 struct extent_map *em;
2924
2925 em_tree = &fs_info->mapping_tree;
2926 read_lock(&em_tree->lock);
2927 em = lookup_extent_mapping(em_tree, logical, length);
2928 read_unlock(&em_tree->lock);
2929
2930 if (!em) {
2931 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2932 logical, length);
2933 return ERR_PTR(-EINVAL);
2934 }
2935
2936 if (em->start > logical || em->start + em->len < logical) {
2937 btrfs_crit(fs_info,
2938 "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2939 logical, length, em->start, em->start + em->len);
2940 free_extent_map(em);
2941 return ERR_PTR(-EINVAL);
2942 }
2943
2944 /* callers are responsible for dropping em's ref. */
2945 return em;
2946 }
2947
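/*
 * Remove the chunk at @chunk_offset: free the dev extent of every stripe,
 * update the affected devices' DEV_ITEMs, delete the CHUNK_ITEM (and the
 * sys_chunk_array entry for SYSTEM chunks) and finally remove the now
 * empty block group.
 */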
2948 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2949 {
2950 struct btrfs_fs_info *fs_info = trans->fs_info;
2951 struct extent_map *em;
2952 struct map_lookup *map;
2953 u64 dev_extent_len = 0;
2954 int i, ret = 0;
2955 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2956
2957 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2958 if (IS_ERR(em)) {
2959 /*
2960 * This is a logic error, but we don't want to just rely on the
2961 * user having built with ASSERT enabled, so if ASSERT doesn't
2962 * do anything we still error out.
2963 */
2964 ASSERT(0);
2965 return PTR_ERR(em);
2966 }
2967 map = em->map_lookup;
2968 mutex_lock(&fs_info->chunk_mutex);
2969 check_system_chunk(trans, map->type);
2970 mutex_unlock(&fs_info->chunk_mutex);
2971
2972 /*
2973 * Take the device list mutex to prevent races with the final phase of
2974 * a device replace operation that replaces the device object associated
2975 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2976 */
2977 mutex_lock(&fs_devices->device_list_mutex);
2978 for (i = 0; i < map->num_stripes; i++) {
2979 struct btrfs_device *device = map->stripes[i].dev;
2980 ret = btrfs_free_dev_extent(trans, device,
2981 map->stripes[i].physical,
2982 &dev_extent_len);
2983 if (ret) {
2984 mutex_unlock(&fs_devices->device_list_mutex);
2985 btrfs_abort_transaction(trans, ret);
2986 goto out;
2987 }
2988
2989 if (device->bytes_used > 0) {
2990 mutex_lock(&fs_info->chunk_mutex);
2991 btrfs_device_set_bytes_used(device,
2992 device->bytes_used - dev_extent_len);
2993 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2994 btrfs_clear_space_info_full(fs_info);
2995 mutex_unlock(&fs_info->chunk_mutex);
2996 }
2997
2998 ret = btrfs_update_device(trans, device);
2999 if (ret) {
3000 mutex_unlock(&fs_devices->device_list_mutex);
3001 btrfs_abort_transaction(trans, ret);
3002 goto out;
3003 }
3004 }
3005 mutex_unlock(&fs_devices->device_list_mutex);
3006
3007 ret = btrfs_free_chunk(trans, chunk_offset);
3008 if (ret) {
3009 btrfs_abort_transaction(trans, ret);
3010 goto out;
3011 }
3012
3013 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3014
3015 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3016 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3017 if (ret) {
3018 btrfs_abort_transaction(trans, ret);
3019 goto out;
3020 }
3021 }
3022
3023 ret = btrfs_remove_block_group(trans, chunk_offset, em);
3024 if (ret) {
3025 btrfs_abort_transaction(trans, ret);
3026 goto out;
3027 }
3028
3029 out:
3030 /* once for us */
3031 free_extent_map(em);
3032 return ret;
3033 }
3034
3035 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3036 {
3037 struct btrfs_root *root = fs_info->chunk_root;
3038 struct btrfs_trans_handle *trans;
3039 struct btrfs_block_group *block_group;
3040 int ret;
3041
3042 /*
3043 * Prevent races with automatic removal of unused block groups.
3044 * After we relocate and before we remove the chunk with offset
3045 * chunk_offset, automatic removal of the block group can kick in,
3046 * resulting in a failure when calling btrfs_remove_chunk() below.
3047 *
3048 * Make sure to acquire this mutex before doing a tree search (dev
3049 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3050 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3051 * we release the path used to search the chunk/dev tree and before
3052 * the current task acquires this mutex and calls us.
3053 */
3054 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3055
3056 /* step one, relocate all the extents inside this chunk */
3057 btrfs_scrub_pause(fs_info);
3058 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3059 btrfs_scrub_continue(fs_info);
3060 if (ret)
3061 return ret;
3062
3063 block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3064 if (!block_group)
3065 return -ENOENT;
3066 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3067 btrfs_put_block_group(block_group);
3068
3069 trans = btrfs_start_trans_remove_block_group(root->fs_info,
3070 chunk_offset);
3071 if (IS_ERR(trans)) {
3072 ret = PTR_ERR(trans);
3073 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3074 return ret;
3075 }
3076
3077 /*
3078 * step two, delete the device extents and the
3079 * chunk tree entries
3080 */
3081 ret = btrfs_remove_chunk(trans, chunk_offset);
3082 btrfs_end_transaction(trans);
3083 return ret;
3084 }
3085
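/*
 * Relocate all SYSTEM chunks.  Chunks that fail with -ENOSPC are retried
 * once after the first full pass; any that still fail make the whole call
 * return -ENOSPC.  Used after sprouting a seed filesystem (see
 * btrfs_init_new_device()).
 */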
3086 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3087 {
3088 struct btrfs_root *chunk_root = fs_info->chunk_root;
3089 struct btrfs_path *path;
3090 struct extent_buffer *leaf;
3091 struct btrfs_chunk *chunk;
3092 struct btrfs_key key;
3093 struct btrfs_key found_key;
3094 u64 chunk_type;
3095 bool retried = false;
3096 int failed = 0;
3097 int ret;
3098
3099 path = btrfs_alloc_path();
3100 if (!path)
3101 return -ENOMEM;
3102
3103 again:
3104 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3105 key.offset = (u64)-1;
3106 key.type = BTRFS_CHUNK_ITEM_KEY;
3107
3108 while (1) {
3109 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3110 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3111 if (ret < 0) {
3112 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3113 goto error;
3114 }
3115 BUG_ON(ret == 0); /* Corruption */
3116
3117 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3118 key.type);
3119 if (ret)
3120 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3121 if (ret < 0)
3122 goto error;
3123 if (ret > 0)
3124 break;
3125
3126 leaf = path->nodes[0];
3127 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3128
3129 chunk = btrfs_item_ptr(leaf, path->slots[0],
3130 struct btrfs_chunk);
3131 chunk_type = btrfs_chunk_type(leaf, chunk);
3132 btrfs_release_path(path);
3133
3134 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3135 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3136 if (ret == -ENOSPC)
3137 failed++;
3138 else
3139 BUG_ON(ret);
3140 }
3141 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3142
3143 if (found_key.offset == 0)
3144 break;
3145 key.offset = found_key.offset - 1;
3146 }
3147 ret = 0;
3148 if (failed && !retried) {
3149 failed = 0;
3150 retried = true;
3151 goto again;
3152 } else if (WARN_ON(failed && retried)) {
3153 ret = -ENOSPC;
3154 }
3155 error:
3156 btrfs_free_path(path);
3157 return ret;
3158 }
3159
3160 /*
3161 * return 1 : a data chunk was allocated successfully,
3162 * return <0: error while allocating a data chunk,
3163 * return 0 : no need to allocate a data chunk.
3164 */
3165 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3166 u64 chunk_offset)
3167 {
3168 struct btrfs_block_group *cache;
3169 u64 bytes_used;
3170 u64 chunk_type;
3171
3172 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3173 ASSERT(cache);
3174 chunk_type = cache->flags;
3175 btrfs_put_block_group(cache);
3176
3177 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3178 return 0;
3179
3180 spin_lock(&fs_info->data_sinfo->lock);
3181 bytes_used = fs_info->data_sinfo->bytes_used;
3182 spin_unlock(&fs_info->data_sinfo->lock);
3183
3184 if (!bytes_used) {
3185 struct btrfs_trans_handle *trans;
3186 int ret;
3187
3188 trans = btrfs_join_transaction(fs_info->tree_root);
3189 if (IS_ERR(trans))
3190 return PTR_ERR(trans);
3191
3192 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3193 btrfs_end_transaction(trans);
3194 if (ret < 0)
3195 return ret;
3196 return 1;
3197 }
3198
3199 return 0;
3200 }
3201
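/*
 * Persist the balance arguments as the BALANCE_OBJECTID temporary item in
 * the tree root so that an interrupted balance can later be resumed (see
 * the resume heuristics in update_balance_args() below).  del_balance_item()
 * removes the item again once balance finishes or is canceled.
 */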
3202 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3203 struct btrfs_balance_control *bctl)
3204 {
3205 struct btrfs_root *root = fs_info->tree_root;
3206 struct btrfs_trans_handle *trans;
3207 struct btrfs_balance_item *item;
3208 struct btrfs_disk_balance_args disk_bargs;
3209 struct btrfs_path *path;
3210 struct extent_buffer *leaf;
3211 struct btrfs_key key;
3212 int ret, err;
3213
3214 path = btrfs_alloc_path();
3215 if (!path)
3216 return -ENOMEM;
3217
3218 trans = btrfs_start_transaction(root, 0);
3219 if (IS_ERR(trans)) {
3220 btrfs_free_path(path);
3221 return PTR_ERR(trans);
3222 }
3223
3224 key.objectid = BTRFS_BALANCE_OBJECTID;
3225 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3226 key.offset = 0;
3227
3228 ret = btrfs_insert_empty_item(trans, root, path, &key,
3229 sizeof(*item));
3230 if (ret)
3231 goto out;
3232
3233 leaf = path->nodes[0];
3234 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3235
3236 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3237
3238 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3239 btrfs_set_balance_data(leaf, item, &disk_bargs);
3240 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3241 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3242 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3243 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3244
3245 btrfs_set_balance_flags(leaf, item, bctl->flags);
3246
3247 btrfs_mark_buffer_dirty(leaf);
3248 out:
3249 btrfs_free_path(path);
3250 err = btrfs_commit_transaction(trans);
3251 if (err && !ret)
3252 ret = err;
3253 return ret;
3254 }
3255
3256 static int del_balance_item(struct btrfs_fs_info *fs_info)
3257 {
3258 struct btrfs_root *root = fs_info->tree_root;
3259 struct btrfs_trans_handle *trans;
3260 struct btrfs_path *path;
3261 struct btrfs_key key;
3262 int ret, err;
3263
3264 path = btrfs_alloc_path();
3265 if (!path)
3266 return -ENOMEM;
3267
3268 trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3269 if (IS_ERR(trans)) {
3270 btrfs_free_path(path);
3271 return PTR_ERR(trans);
3272 }
3273
3274 key.objectid = BTRFS_BALANCE_OBJECTID;
3275 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3276 key.offset = 0;
3277
3278 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3279 if (ret < 0)
3280 goto out;
3281 if (ret > 0) {
3282 ret = -ENOENT;
3283 goto out;
3284 }
3285
3286 ret = btrfs_del_item(trans, root, path);
3287 out:
3288 btrfs_free_path(path);
3289 err = btrfs_commit_transaction(trans);
3290 if (err && !ret)
3291 ret = err;
3292 return ret;
3293 }
3294
3295 /*
3296 * This is a heuristic used to reduce the number of chunks balanced on
3297 * resume after balance was interrupted.
3298 */
3299 static void update_balance_args(struct btrfs_balance_control *bctl)
3300 {
3301 /*
3302 * Turn on soft mode for chunk types that were being converted.
3303 */
3304 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3305 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3306 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3307 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3308 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3309 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3310
3311 /*
3312 * Turn on the usage filter if it is not already used. The idea is
3313 * that chunks that we have already balanced should be
3314 * reasonably full. Don't do it for chunks that are being
3315 * converted - that will keep us from relocating unconverted
3316 * (albeit full) chunks.
3317 */
3318 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3319 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3320 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3321 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3322 bctl->data.usage = 90;
3323 }
3324 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3325 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3326 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3327 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3328 bctl->sys.usage = 90;
3329 }
3330 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3331 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3332 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3333 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3334 bctl->meta.usage = 90;
3335 }
3336 }
3337
3338 /*
3339 * Clear the balance status in fs_info and delete the balance item from disk.
3340 */
3341 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3342 {
3343 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3344 int ret;
3345
3346 BUG_ON(!fs_info->balance_ctl);
3347
3348 spin_lock(&fs_info->balance_lock);
3349 fs_info->balance_ctl = NULL;
3350 spin_unlock(&fs_info->balance_lock);
3351
3352 kfree(bctl);
3353 ret = del_balance_item(fs_info);
3354 if (ret)
3355 btrfs_handle_fs_error(fs_info, ret, NULL);
3356 }
3357
3358 /*
3359 * Balance filters. Return 1 if chunk should be filtered out
3360 * (should not be balanced).
3361 */
3362 static int chunk_profiles_filter(u64 chunk_type,
3363 struct btrfs_balance_args *bargs)
3364 {
3365 chunk_type = chunk_to_extended(chunk_type) &
3366 BTRFS_EXTENDED_PROFILE_MASK;
3367
3368 if (bargs->profiles & chunk_type)
3369 return 0;
3370
3371 return 1;
3372 }
3373
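/*
 * Usage-range filter: keep the chunk (return 0) when its used bytes fall
 * within [usage_min%, usage_max%) of the block group length.  usage_max == 0
 * is treated as "only completely empty chunks", while usage_max > 100 caps
 * the upper threshold at the full block group length.
 */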
3374 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3375 struct btrfs_balance_args *bargs)
3376 {
3377 struct btrfs_block_group *cache;
3378 u64 chunk_used;
3379 u64 user_thresh_min;
3380 u64 user_thresh_max;
3381 int ret = 1;
3382
3383 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3384 chunk_used = cache->used;
3385
3386 if (bargs->usage_min == 0)
3387 user_thresh_min = 0;
3388 else
3389 user_thresh_min = div_factor_fine(cache->length,
3390 bargs->usage_min);
3391
3392 if (bargs->usage_max == 0)
3393 user_thresh_max = 1;
3394 else if (bargs->usage_max > 100)
3395 user_thresh_max = cache->length;
3396 else
3397 user_thresh_max = div_factor_fine(cache->length,
3398 bargs->usage_max);
3399
3400 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3401 ret = 0;
3402
3403 btrfs_put_block_group(cache);
3404 return ret;
3405 }
3406
3407 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3408 u64 chunk_offset, struct btrfs_balance_args *bargs)
3409 {
3410 struct btrfs_block_group *cache;
3411 u64 chunk_used, user_thresh;
3412 int ret = 1;
3413
3414 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3415 chunk_used = cache->used;
3416
3417 if (bargs->usage_min == 0)
3418 user_thresh = 1;
3419 else if (bargs->usage > 100)
3420 user_thresh = cache->length;
3421 else
3422 user_thresh = div_factor_fine(cache->length, bargs->usage);
3423
3424 if (chunk_used < user_thresh)
3425 ret = 0;
3426
3427 btrfs_put_block_group(cache);
3428 return ret;
3429 }
3430
3431 static int chunk_devid_filter(struct extent_buffer *leaf,
3432 struct btrfs_chunk *chunk,
3433 struct btrfs_balance_args *bargs)
3434 {
3435 struct btrfs_stripe *stripe;
3436 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3437 int i;
3438
3439 for (i = 0; i < num_stripes; i++) {
3440 stripe = btrfs_stripe_nr(chunk, i);
3441 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3442 return 0;
3443 }
3444
3445 return 1;
3446 }
3447
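/*
 * Number of stripes in a chunk that carry data (as opposed to copies or
 * parity).  For parity profiles this is num_stripes - nparity (e.g. a
 * 6-stripe RAID6 chunk has 4 data stripes); for mirrored/striped profiles
 * it is num_stripes / ncopies (e.g. a 4-stripe RAID10 chunk has 2).
 */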
3448 static u64 calc_data_stripes(u64 type, int num_stripes)
3449 {
3450 const int index = btrfs_bg_flags_to_raid_index(type);
3451 const int ncopies = btrfs_raid_array[index].ncopies;
3452 const int nparity = btrfs_raid_array[index].nparity;
3453
3454 if (nparity)
3455 return num_stripes - nparity;
3456 else
3457 return num_stripes / ncopies;
3458 }
3459
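/*
 * Physical device-range filter: keep the chunk (return 0) if any of its
 * stripes on bargs->devid overlaps the physical byte range
 * [bargs->pstart, bargs->pend).  The per-device stripe length is the chunk
 * length divided by the number of data stripes.  Only meaningful together
 * with the devid filter.
 */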
3460 /* [pstart, pend) */
3461 static int chunk_drange_filter(struct extent_buffer *leaf,
3462 struct btrfs_chunk *chunk,
3463 struct btrfs_balance_args *bargs)
3464 {
3465 struct btrfs_stripe *stripe;
3466 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3467 u64 stripe_offset;
3468 u64 stripe_length;
3469 u64 type;
3470 int factor;
3471 int i;
3472
3473 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3474 return 0;
3475
3476 type = btrfs_chunk_type(leaf, chunk);
3477 factor = calc_data_stripes(type, num_stripes);
3478
3479 for (i = 0; i < num_stripes; i++) {
3480 stripe = btrfs_stripe_nr(chunk, i);
3481 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3482 continue;
3483
3484 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3485 stripe_length = btrfs_chunk_length(leaf, chunk);
3486 stripe_length = div_u64(stripe_length, factor);
3487
3488 if (stripe_offset < bargs->pend &&
3489 stripe_offset + stripe_length > bargs->pstart)
3490 return 0;
3491 }
3492
3493 return 1;
3494 }
3495
3496 /* [vstart, vend) */
3497 static int chunk_vrange_filter(struct extent_buffer *leaf,
3498 struct btrfs_chunk *chunk,
3499 u64 chunk_offset,
3500 struct btrfs_balance_args *bargs)
3501 {
3502 if (chunk_offset < bargs->vend &&
3503 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3504 /* at least part of the chunk is inside this vrange */
3505 return 0;
3506
3507 return 1;
3508 }
3509
3510 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3511 struct btrfs_chunk *chunk,
3512 struct btrfs_balance_args *bargs)
3513 {
3514 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3515
3516 if (bargs->stripes_min <= num_stripes
3517 && num_stripes <= bargs->stripes_max)
3518 return 0;
3519
3520 return 1;
3521 }
3522
3523 static int chunk_soft_convert_filter(u64 chunk_type,
3524 struct btrfs_balance_args *bargs)
3525 {
3526 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3527 return 0;
3528
3529 chunk_type = chunk_to_extended(chunk_type) &
3530 BTRFS_EXTENDED_PROFILE_MASK;
3531
3532 if (bargs->target == chunk_type)
3533 return 1;
3534
3535 return 0;
3536 }
3537
3538 static int should_balance_chunk(struct extent_buffer *leaf,
3539 struct btrfs_chunk *chunk, u64 chunk_offset)
3540 {
3541 struct btrfs_fs_info *fs_info = leaf->fs_info;
3542 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3543 struct btrfs_balance_args *bargs = NULL;
3544 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3545
3546 /* type filter */
3547 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3548 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3549 return 0;
3550 }
3551
3552 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3553 bargs = &bctl->data;
3554 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3555 bargs = &bctl->sys;
3556 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3557 bargs = &bctl->meta;
3558
3559 /* profiles filter */
3560 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3561 chunk_profiles_filter(chunk_type, bargs)) {
3562 return 0;
3563 }
3564
3565 /* usage filter */
3566 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3567 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3568 return 0;
3569 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3570 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3571 return 0;
3572 }
3573
3574 /* devid filter */
3575 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3576 chunk_devid_filter(leaf, chunk, bargs)) {
3577 return 0;
3578 }
3579
3580 /* drange filter, makes sense only with devid filter */
3581 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3582 chunk_drange_filter(leaf, chunk, bargs)) {
3583 return 0;
3584 }
3585
3586 /* vrange filter */
3587 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3588 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3589 return 0;
3590 }
3591
3592 /* stripes filter */
3593 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3594 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3595 return 0;
3596 }
3597
3598 /* soft profile changing mode */
3599 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3600 chunk_soft_convert_filter(chunk_type, bargs)) {
3601 return 0;
3602 }
3603
3604 /*
3605 * limited by count, must be the last filter
3606 */
3607 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3608 if (bargs->limit == 0)
3609 return 0;
3610 else
3611 bargs->limit--;
3612 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3613 /*
3614 * Same logic as the 'limit' filter; the minimum cannot be
3615 * determined here because we do not have the global information
3616 * about the count of all chunks that satisfy the filters.
3617 */
3618 if (bargs->limit_max == 0)
3619 return 0;
3620 else
3621 bargs->limit_max--;
3622 }
3623
3624 return 1;
3625 }
3626
3627 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3628 {
3629 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3630 struct btrfs_root *chunk_root = fs_info->chunk_root;
3631 u64 chunk_type;
3632 struct btrfs_chunk *chunk;
3633 struct btrfs_path *path = NULL;
3634 struct btrfs_key key;
3635 struct btrfs_key found_key;
3636 struct extent_buffer *leaf;
3637 int slot;
3638 int ret;
3639 int enospc_errors = 0;
3640 bool counting = true;
3641 /* The single value limit and the min/max limits share the same bytes in the balance args, save the originals here */
3642 u64 limit_data = bctl->data.limit;
3643 u64 limit_meta = bctl->meta.limit;
3644 u64 limit_sys = bctl->sys.limit;
3645 u32 count_data = 0;
3646 u32 count_meta = 0;
3647 u32 count_sys = 0;
3648 int chunk_reserved = 0;
3649
3650 path = btrfs_alloc_path();
3651 if (!path) {
3652 ret = -ENOMEM;
3653 goto error;
3654 }
3655
3656 /* zero out stat counters */
3657 spin_lock(&fs_info->balance_lock);
3658 memset(&bctl->stat, 0, sizeof(bctl->stat));
3659 spin_unlock(&fs_info->balance_lock);
3660 again:
3661 if (!counting) {
3662 /*
3663 * The single value limit and the min/max limits share the same bytes
3664 * in the balance args union; restore the values the counting pass consumed.
3665 */
3666 bctl->data.limit = limit_data;
3667 bctl->meta.limit = limit_meta;
3668 bctl->sys.limit = limit_sys;
3669 }
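/*
 * Sketch of the two-pass limit handling, assuming a data filter with
 * limit=2 (values below are illustrative):
 *
 *   counting pass:  should_balance_chunk() decrements bctl->data.limit
 *                   from 2 to 0 while only counting matching chunks;
 *   restore above:  bctl->data.limit = limit_data;   (back to 2)
 *   second pass:    the first two matching chunks are relocated, the
 *                   third sees limit == 0 and is skipped.
 *
 * 'limit' shares its bytes with 'limit_min'/'limit_max' in
 * struct btrfs_balance_args, which is why the plain u64 copies saved
 * before the loop are enough to restore either form.
 */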
3670 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3671 key.offset = (u64)-1;
3672 key.type = BTRFS_CHUNK_ITEM_KEY;
3673
3674 while (1) {
3675 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3676 atomic_read(&fs_info->balance_cancel_req)) {
3677 ret = -ECANCELED;
3678 goto error;
3679 }
3680
3681 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3682 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3683 if (ret < 0) {
3684 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3685 goto error;
3686 }
3687
3688 /*
3689 * this shouldn't happen, it means the last relocate
3690 * failed
3691 */
3692 if (ret == 0)
3693 BUG(); /* FIXME break ? */
3694
3695 ret = btrfs_previous_item(chunk_root, path, 0,
3696 BTRFS_CHUNK_ITEM_KEY);
3697 if (ret) {
3698 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3699 ret = 0;
3700 break;
3701 }
3702
3703 leaf = path->nodes[0];
3704 slot = path->slots[0];
3705 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3706
3707 if (found_key.objectid != key.objectid) {
3708 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3709 break;
3710 }
3711
3712 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3713 chunk_type = btrfs_chunk_type(leaf, chunk);
3714
3715 if (!counting) {
3716 spin_lock(&fs_info->balance_lock);
3717 bctl->stat.considered++;
3718 spin_unlock(&fs_info->balance_lock);
3719 }
3720
3721 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3722
3723 btrfs_release_path(path);
3724 if (!ret) {
3725 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3726 goto loop;
3727 }
3728
3729 if (counting) {
3730 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3731 spin_lock(&fs_info->balance_lock);
3732 bctl->stat.expected++;
3733 spin_unlock(&fs_info->balance_lock);
3734
3735 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3736 count_data++;
3737 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3738 count_sys++;
3739 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3740 count_meta++;
3741
3742 goto loop;
3743 }
3744
3745 /*
3746 * Apply the limit_min filter; there is no need to check whether the
3747 * LIMIT filters are set because limit_min is 0 by default.
3748 */
3749 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3750 count_data < bctl->data.limit_min)
3751 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3752 count_meta < bctl->meta.limit_min)
3753 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3754 count_sys < bctl->sys.limit_min)) {
3755 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3756 goto loop;
3757 }
3758
3759 if (!chunk_reserved) {
3760 /*
3761 * We may be relocating the only data chunk we have,
3762 * which could potentially end up losing the data's
3763 * raid profile, so let's allocate an empty one in
3764 * advance.
3765 */
3766 ret = btrfs_may_alloc_data_chunk(fs_info,
3767 found_key.offset);
3768 if (ret < 0) {
3769 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3770 goto error;
3771 } else if (ret == 1) {
3772 chunk_reserved = 1;
3773 }
3774 }
3775
3776 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3777 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3778 if (ret == -ENOSPC) {
3779 enospc_errors++;
3780 } else if (ret == -ETXTBSY) {
3781 btrfs_info(fs_info,
3782 "skipping relocation of block group %llu due to active swapfile",
3783 found_key.offset);
3784 ret = 0;
3785 } else if (ret) {
3786 goto error;
3787 } else {
3788 spin_lock(&fs_info->balance_lock);
3789 bctl->stat.completed++;
3790 spin_unlock(&fs_info->balance_lock);
3791 }
3792 loop:
3793 if (found_key.offset == 0)
3794 break;
3795 key.offset = found_key.offset - 1;
3796 }
3797
3798 if (counting) {
3799 btrfs_release_path(path);
3800 counting = false;
3801 goto again;
3802 }
3803 error:
3804 btrfs_free_path(path);
3805 if (enospc_errors) {
3806 btrfs_info(fs_info, "%d enospc errors during balance",
3807 enospc_errors);
3808 if (!ret)
3809 ret = -ENOSPC;
3810 }
3811
3812 return ret;
3813 }
3814
3815 /**
3816 * alloc_profile_is_valid - see if a given profile is valid and reduced
3817 * @flags: profile to validate
3818 * @extended: if true @flags is treated as an extended profile
3819 */
3820 static int alloc_profile_is_valid(u64 flags, int extended)
3821 {
3822 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3823 BTRFS_BLOCK_GROUP_PROFILE_MASK);
3824
3825 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3826
3827 /* 1) check that all other bits are zeroed */
3828 if (flags & ~mask)
3829 return 0;
3830
3831 /* 2) see if profile is reduced */
3832 if (flags == 0)
3833 return !extended; /* "0" is valid for usual profiles */
3834
3835 return has_single_bit_set(flags);
3836 }
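/*
 * Examples of the check above (illustrative flag combinations):
 *   flags = BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1
 *       -> valid: one profile bit remains after masking out the type.
 *   flags = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10
 *       -> invalid: two profile bits set, the profile is not reduced.
 *   flags = 0 (plain SINGLE)
 *       -> valid only when @extended is false; extended profiles carry
 *          BTRFS_AVAIL_ALLOC_BIT_SINGLE instead.
 */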
3837
3838 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3839 {
3840 /* cancel requested || normal exit path */
3841 return atomic_read(&fs_info->balance_cancel_req) ||
3842 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3843 atomic_read(&fs_info->balance_cancel_req) == 0);
3844 }
3845
3846 /*
3847 * Validate target profile against allowed profiles and return true if it's OK.
3848 * Otherwise print the error message and return false.
3849 */
3850 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3851 const struct btrfs_balance_args *bargs,
3852 u64 allowed, const char *type)
3853 {
3854 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3855 return true;
3856
3857 /* Profile is valid and does not have bits outside of the allowed set */
3858 if (alloc_profile_is_valid(bargs->target, 1) &&
3859 (bargs->target & ~allowed) == 0)
3860 return true;
3861
3862 btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3863 type, btrfs_bg_type_to_raid_name(bargs->target));
3864 return false;
3865 }
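/*
 * Example of a rejected conversion (assumed device count): with only two
 * writable devices, btrfs_balance() below builds @allowed from profiles
 * whose devs_min <= 2, so "convert=raid1c3" (devs_min = 3) fails the
 * (bargs->target & ~allowed) == 0 test and the balance returns -EINVAL
 * after logging "balance: invalid convert metadata profile raid1c3".
 */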
3866
3867 /*
3868 * Fill @buf with textual description of balance filter flags @bargs, up to
3869 * @size_buf including the terminating null. The output may be trimmed if it
3870 * does not fit into the provided buffer.
3871 */
3872 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3873 u32 size_buf)
3874 {
3875 int ret;
3876 u32 size_bp = size_buf;
3877 char *bp = buf;
3878 u64 flags = bargs->flags;
3879 char tmp_buf[128] = {'\0'};
3880
3881 if (!flags)
3882 return;
3883
3884 #define CHECK_APPEND_NOARG(a) \
3885 do { \
3886 ret = snprintf(bp, size_bp, (a)); \
3887 if (ret < 0 || ret >= size_bp) \
3888 goto out_overflow; \
3889 size_bp -= ret; \
3890 bp += ret; \
3891 } while (0)
3892
3893 #define CHECK_APPEND_1ARG(a, v1) \
3894 do { \
3895 ret = snprintf(bp, size_bp, (a), (v1)); \
3896 if (ret < 0 || ret >= size_bp) \
3897 goto out_overflow; \
3898 size_bp -= ret; \
3899 bp += ret; \
3900 } while (0)
3901
3902 #define CHECK_APPEND_2ARG(a, v1, v2) \
3903 do { \
3904 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
3905 if (ret < 0 || ret >= size_bp) \
3906 goto out_overflow; \
3907 size_bp -= ret; \
3908 bp += ret; \
3909 } while (0)
3910
3911 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3912 CHECK_APPEND_1ARG("convert=%s,",
3913 btrfs_bg_type_to_raid_name(bargs->target));
3914
3915 if (flags & BTRFS_BALANCE_ARGS_SOFT)
3916 CHECK_APPEND_NOARG("soft,");
3917
3918 if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3919 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3920 sizeof(tmp_buf));
3921 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3922 }
3923
3924 if (flags & BTRFS_BALANCE_ARGS_USAGE)
3925 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3926
3927 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3928 CHECK_APPEND_2ARG("usage=%u..%u,",
3929 bargs->usage_min, bargs->usage_max);
3930
3931 if (flags & BTRFS_BALANCE_ARGS_DEVID)
3932 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3933
3934 if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3935 CHECK_APPEND_2ARG("drange=%llu..%llu,",
3936 bargs->pstart, bargs->pend);
3937
3938 if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3939 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3940 bargs->vstart, bargs->vend);
3941
3942 if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3943 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3944
3945 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3946 CHECK_APPEND_2ARG("limit=%u..%u,",
3947 bargs->limit_min, bargs->limit_max);
3948
3949 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3950 CHECK_APPEND_2ARG("stripes=%u..%u,",
3951 bargs->stripes_min, bargs->stripes_max);
3952
3953 #undef CHECK_APPEND_2ARG
3954 #undef CHECK_APPEND_1ARG
3955 #undef CHECK_APPEND_NOARG
3956
3957 out_overflow:
3958
3959 if (size_bp < size_buf)
3960 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3961 else
3962 buf[0] = '\0';
3963 }
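/*
 * Example of the resulting string (illustrative argument values): for a
 * filter with BTRFS_BALANCE_ARGS_CONVERT (target raid1),
 * BTRFS_BALANCE_ARGS_SOFT, BTRFS_BALANCE_ARGS_USAGE_RANGE (5..20) and
 * BTRFS_BALANCE_ARGS_LIMIT_RANGE (2..10), @buf ends up as
 *
 *   "convert=raid1,soft,usage=5..20,limit=2..10"
 *
 * with the trailing comma replaced by the terminating NUL above.
 */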
3964
3965 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3966 {
3967 u32 size_buf = 1024;
3968 char tmp_buf[192] = {'\0'};
3969 char *buf;
3970 char *bp;
3971 u32 size_bp = size_buf;
3972 int ret;
3973 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3974
3975 buf = kzalloc(size_buf, GFP_KERNEL);
3976 if (!buf)
3977 return;
3978
3979 bp = buf;
3980
3981 #define CHECK_APPEND_1ARG(a, v1) \
3982 do { \
3983 ret = snprintf(bp, size_bp, (a), (v1)); \
3984 if (ret < 0 || ret >= size_bp) \
3985 goto out_overflow; \
3986 size_bp -= ret; \
3987 bp += ret; \
3988 } while (0)
3989
3990 if (bctl->flags & BTRFS_BALANCE_FORCE)
3991 CHECK_APPEND_1ARG("%s", "-f ");
3992
3993 if (bctl->flags & BTRFS_BALANCE_DATA) {
3994 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
3995 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
3996 }
3997
3998 if (bctl->flags & BTRFS_BALANCE_METADATA) {
3999 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4000 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4001 }
4002
4003 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4004 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4005 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4006 }
4007
4008 #undef CHECK_APPEND_1ARG
4009
4010 out_overflow:
4011
4012 if (size_bp < size_buf)
4013 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4014 btrfs_info(fs_info, "balance: %s %s",
4015 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4016 "resume" : "start", buf);
4017
4018 kfree(buf);
4019 }
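/*
 * Example message (assumed balance options): a balance started with only
 * the data type selected and a usage=50 filter logs
 *
 *   "balance: start -dusage=50"
 *
 * while the same control resumed after a pause logs "balance: resume ..."
 * instead.
 */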
4020
4021 /*
4022 * Should be called with the balance mutex held
4023 */
4024 int btrfs_balance(struct btrfs_fs_info *fs_info,
4025 struct btrfs_balance_control *bctl,
4026 struct btrfs_ioctl_balance_args *bargs)
4027 {
4028 u64 meta_target, data_target;
4029 u64 allowed;
4030 int mixed = 0;
4031 int ret;
4032 u64 num_devices;
4033 unsigned seq;
4034 bool reducing_redundancy;
4035 int i;
4036
4037 if (btrfs_fs_closing(fs_info) ||
4038 atomic_read(&fs_info->balance_pause_req) ||
4039 btrfs_should_cancel_balance(fs_info)) {
4040 ret = -EINVAL;
4041 goto out;
4042 }
4043
4044 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4045 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4046 mixed = 1;
4047
4048 /*
4049 * In case of mixed groups both data and meta should be picked,
4050 * and identical options should be given for both of them.
4051 */
4052 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4053 if (mixed && (bctl->flags & allowed)) {
4054 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4055 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4056 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4057 btrfs_err(fs_info,
4058 "balance: mixed groups data and metadata options must be the same");
4059 ret = -EINVAL;
4060 goto out;
4061 }
4062 }
4063
4064 /*
4065 * rw_devices will not change at the moment, device add/delete/replace
4066 * are exclusive
4067 */
4068 num_devices = fs_info->fs_devices->rw_devices;
4069
4070 /*
4071 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4072 * special bit for it, to make it easier to distinguish. Thus we need
4073 * to set it manually, or balance would refuse the profile.
4074 */
4075 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4076 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4077 if (num_devices >= btrfs_raid_array[i].devs_min)
4078 allowed |= btrfs_raid_array[i].bg_flag;
4079
4080 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4081 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4082 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
4083 ret = -EINVAL;
4084 goto out;
4085 }
4086
4087 /*
4088 * Allow reducing metadata or system integrity only if force is set,
4089 * for profiles with redundancy (copies, parity)
4090 */
4091 allowed = 0;
4092 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4093 if (btrfs_raid_array[i].ncopies >= 2 ||
4094 btrfs_raid_array[i].tolerated_failures >= 1)
4095 allowed |= btrfs_raid_array[i].bg_flag;
4096 }
4097 do {
4098 seq = read_seqbegin(&fs_info->profiles_lock);
4099
4100 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4101 (fs_info->avail_system_alloc_bits & allowed) &&
4102 !(bctl->sys.target & allowed)) ||
4103 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4104 (fs_info->avail_metadata_alloc_bits & allowed) &&
4105 !(bctl->meta.target & allowed)))
4106 reducing_redundancy = true;
4107 else
4108 reducing_redundancy = false;
4109
4110 /* if we're not converting, the target field is uninitialized */
4111 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4112 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4113 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4114 bctl->data.target : fs_info->avail_data_alloc_bits;
4115 } while (read_seqretry(&fs_info->profiles_lock, seq));
4116
4117 if (reducing_redundancy) {
4118 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4119 btrfs_info(fs_info,
4120 "balance: force reducing metadata redundancy");
4121 } else {
4122 btrfs_err(fs_info,
4123 "balance: reduces metadata redundancy, use --force if you want this");
4124 ret = -EINVAL;
4125 goto out;
4126 }
4127 }
4128
4129 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4130 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4131 btrfs_warn(fs_info,
4132 "balance: metadata profile %s has lower redundancy than data profile %s",
4133 btrfs_bg_type_to_raid_name(meta_target),
4134 btrfs_bg_type_to_raid_name(data_target));
4135 }
4136
4137 if (fs_info->send_in_progress) {
4138 btrfs_warn_rl(fs_info,
4139 "cannot run balance while send operations are in progress (%d in progress)",
4140 fs_info->send_in_progress);
4141 ret = -EAGAIN;
4142 goto out;
4143 }
4144
4145 ret = insert_balance_item(fs_info, bctl);
4146 if (ret && ret != -EEXIST)
4147 goto out;
4148
4149 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4150 BUG_ON(ret == -EEXIST);
4151 BUG_ON(fs_info->balance_ctl);
4152 spin_lock(&fs_info->balance_lock);
4153 fs_info->balance_ctl = bctl;
4154 spin_unlock(&fs_info->balance_lock);
4155 } else {
4156 BUG_ON(ret != -EEXIST);
4157 spin_lock(&fs_info->balance_lock);
4158 update_balance_args(bctl);
4159 spin_unlock(&fs_info->balance_lock);
4160 }
4161
4162 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4163 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4164 describe_balance_start_or_resume(fs_info);
4165 mutex_unlock(&fs_info->balance_mutex);
4166
4167 ret = __btrfs_balance(fs_info);
4168
4169 mutex_lock(&fs_info->balance_mutex);
4170 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4171 btrfs_info(fs_info, "balance: paused");
4172 /*
4173 * Balance can be canceled by:
4174 *
4175 * - Regular cancel request
4176 * Then ret == -ECANCELED and balance_cancel_req > 0
4177 *
4178 * - Fatal signal to "btrfs" process
4179 * Either the signal is caught by wait_reserve_ticket() and the
4180 * callers get -EINTR, or it is caught by
4181 * btrfs_should_cancel_balance() and they get -ECANCELED.
4182 * Either way, in this case balance_cancel_req = 0, and
4183 * ret == -EINTR or ret == -ECANCELED.
4184 *
4185 * So here we only check the return value to catch canceled balance.
4186 */
4187 else if (ret == -ECANCELED || ret == -EINTR)
4188 btrfs_info(fs_info, "balance: canceled");
4189 else
4190 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4191
4192 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4193
4194 if (bargs) {
4195 memset(bargs, 0, sizeof(*bargs));
4196 btrfs_update_ioctl_balance_args(fs_info, bargs);
4197 }
4198
4199 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4200 balance_need_close(fs_info)) {
4201 reset_balance_state(fs_info);
4202 btrfs_exclop_finish(fs_info);
4203 }
4204
4205 wake_up(&fs_info->balance_wait_q);
4206
4207 return ret;
4208 out:
4209 if (bctl->flags & BTRFS_BALANCE_RESUME)
4210 reset_balance_state(fs_info);
4211 else
4212 kfree(bctl);
4213 btrfs_exclop_finish(fs_info);
4214
4215 return ret;
4216 }
4217
4218 static int balance_kthread(void *data)
4219 {
4220 struct btrfs_fs_info *fs_info = data;
4221 int ret = 0;
4222
4223 mutex_lock(&fs_info->balance_mutex);
4224 if (fs_info->balance_ctl)
4225 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4226 mutex_unlock(&fs_info->balance_mutex);
4227
4228 return ret;
4229 }
4230
4231 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4232 {
4233 struct task_struct *tsk;
4234
4235 mutex_lock(&fs_info->balance_mutex);
4236 if (!fs_info->balance_ctl) {
4237 mutex_unlock(&fs_info->balance_mutex);
4238 return 0;
4239 }
4240 mutex_unlock(&fs_info->balance_mutex);
4241
4242 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4243 btrfs_info(fs_info, "balance: resume skipped");
4244 return 0;
4245 }
4246
4247 /*
4248 * A ro->rw remount sequence should continue with the paused balance
4249 * regardless of who paused it (the system or the user), so set
4250 * the resume flag.
4251 */
4252 spin_lock(&fs_info->balance_lock);
4253 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4254 spin_unlock(&fs_info->balance_lock);
4255
4256 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4257 return PTR_ERR_OR_ZERO(tsk);
4258 }
4259
4260 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4261 {
4262 struct btrfs_balance_control *bctl;
4263 struct btrfs_balance_item *item;
4264 struct btrfs_disk_balance_args disk_bargs;
4265 struct btrfs_path *path;
4266 struct extent_buffer *leaf;
4267 struct btrfs_key key;
4268 int ret;
4269
4270 path = btrfs_alloc_path();
4271 if (!path)
4272 return -ENOMEM;
4273
4274 key.objectid = BTRFS_BALANCE_OBJECTID;
4275 key.type = BTRFS_TEMPORARY_ITEM_KEY;
4276 key.offset = 0;
4277
4278 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4279 if (ret < 0)
4280 goto out;
4281 if (ret > 0) { /* ret = -ENOENT; */
4282 ret = 0;
4283 goto out;
4284 }
4285
4286 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4287 if (!bctl) {
4288 ret = -ENOMEM;
4289 goto out;
4290 }
4291
4292 leaf = path->nodes[0];
4293 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4294
4295 bctl->flags = btrfs_balance_flags(leaf, item);
4296 bctl->flags |= BTRFS_BALANCE_RESUME;
4297
4298 btrfs_balance_data(leaf, item, &disk_bargs);
4299 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4300 btrfs_balance_meta(leaf, item, &disk_bargs);
4301 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4302 btrfs_balance_sys(leaf, item, &disk_bargs);
4303 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4304
4305 /*
4306 * This should never happen, as the paused balance state is recovered
4307 * during mount without any chance of other exclusive ops to collide.
4308 *
4309 * This gives the exclusive op status to balance and keeps in paused
4310 * state until user intervention (cancel or umount). If the ownership
4311 * cannot be assigned, show a message but do not fail. The balance
4312 * is in a paused state and must have fs_info::balance_ctl properly
4313 * set up.
4314 */
4315 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4316 btrfs_warn(fs_info,
4317 "balance: cannot set exclusive op status, resume manually");
4318
4319 btrfs_release_path(path);
4320
4321 mutex_lock(&fs_info->balance_mutex);
4322 BUG_ON(fs_info->balance_ctl);
4323 spin_lock(&fs_info->balance_lock);
4324 fs_info->balance_ctl = bctl;
4325 spin_unlock(&fs_info->balance_lock);
4326 mutex_unlock(&fs_info->balance_mutex);
4327 out:
4328 btrfs_free_path(path);
4329 return ret;
4330 }
4331
4332 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4333 {
4334 int ret = 0;
4335
4336 mutex_lock(&fs_info->balance_mutex);
4337 if (!fs_info->balance_ctl) {
4338 mutex_unlock(&fs_info->balance_mutex);
4339 return -ENOTCONN;
4340 }
4341
4342 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4343 atomic_inc(&fs_info->balance_pause_req);
4344 mutex_unlock(&fs_info->balance_mutex);
4345
4346 wait_event(fs_info->balance_wait_q,
4347 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4348
4349 mutex_lock(&fs_info->balance_mutex);
4350 /* we are good with balance_ctl ripped off from under us */
4351 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4352 atomic_dec(&fs_info->balance_pause_req);
4353 } else {
4354 ret = -ENOTCONN;
4355 }
4356
4357 mutex_unlock(&fs_info->balance_mutex);
4358 return ret;
4359 }
4360
4361 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4362 {
4363 mutex_lock(&fs_info->balance_mutex);
4364 if (!fs_info->balance_ctl) {
4365 mutex_unlock(&fs_info->balance_mutex);
4366 return -ENOTCONN;
4367 }
4368
4369 /*
4370 * A paused balance with the item stored on disk can be resumed at
4371 * mount time if the mount is read-write. Otherwise it's still paused
4372 * and we must not allow cancelling as it deletes the item.
4373 */
4374 if (sb_rdonly(fs_info->sb)) {
4375 mutex_unlock(&fs_info->balance_mutex);
4376 return -EROFS;
4377 }
4378
4379 atomic_inc(&fs_info->balance_cancel_req);
4380 /*
4381 * If balance is running, just wait for it to finish and return; the
4382 * balance item is deleted by btrfs_balance() in that case.
4383 */
4384 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4385 mutex_unlock(&fs_info->balance_mutex);
4386 wait_event(fs_info->balance_wait_q,
4387 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4388 mutex_lock(&fs_info->balance_mutex);
4389 } else {
4390 mutex_unlock(&fs_info->balance_mutex);
4391 /*
4392 * Lock released to allow other waiters to continue, we'll
4393 * reexamine the status again.
4394 */
4395 mutex_lock(&fs_info->balance_mutex);
4396
4397 if (fs_info->balance_ctl) {
4398 reset_balance_state(fs_info);
4399 btrfs_exclop_finish(fs_info);
4400 btrfs_info(fs_info, "balance: canceled");
4401 }
4402 }
4403
4404 BUG_ON(fs_info->balance_ctl ||
4405 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4406 atomic_dec(&fs_info->balance_cancel_req);
4407 mutex_unlock(&fs_info->balance_mutex);
4408 return 0;
4409 }
4410
4411 int btrfs_uuid_scan_kthread(void *data)
4412 {
4413 struct btrfs_fs_info *fs_info = data;
4414 struct btrfs_root *root = fs_info->tree_root;
4415 struct btrfs_key key;
4416 struct btrfs_path *path = NULL;
4417 int ret = 0;
4418 struct extent_buffer *eb;
4419 int slot;
4420 struct btrfs_root_item root_item;
4421 u32 item_size;
4422 struct btrfs_trans_handle *trans = NULL;
4423 bool closing = false;
4424
4425 path = btrfs_alloc_path();
4426 if (!path) {
4427 ret = -ENOMEM;
4428 goto out;
4429 }
4430
4431 key.objectid = 0;
4432 key.type = BTRFS_ROOT_ITEM_KEY;
4433 key.offset = 0;
4434
4435 while (1) {
4436 if (btrfs_fs_closing(fs_info)) {
4437 closing = true;
4438 break;
4439 }
4440 ret = btrfs_search_forward(root, &key, path,
4441 BTRFS_OLDEST_GENERATION);
4442 if (ret) {
4443 if (ret > 0)
4444 ret = 0;
4445 break;
4446 }
4447
4448 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4449 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4450 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4451 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4452 goto skip;
4453
4454 eb = path->nodes[0];
4455 slot = path->slots[0];
4456 item_size = btrfs_item_size_nr(eb, slot);
4457 if (item_size < sizeof(root_item))
4458 goto skip;
4459
4460 read_extent_buffer(eb, &root_item,
4461 btrfs_item_ptr_offset(eb, slot),
4462 (int)sizeof(root_item));
4463 if (btrfs_root_refs(&root_item) == 0)
4464 goto skip;
4465
4466 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4467 !btrfs_is_empty_uuid(root_item.received_uuid)) {
4468 if (trans)
4469 goto update_tree;
4470
4471 btrfs_release_path(path);
4472 /*
4473 * 1 - subvol uuid item
4474 * 1 - received_subvol uuid item
4475 */
4476 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4477 if (IS_ERR(trans)) {
4478 ret = PTR_ERR(trans);
4479 break;
4480 }
4481 continue;
4482 } else {
4483 goto skip;
4484 }
4485 update_tree:
4486 btrfs_release_path(path);
4487 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4488 ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4489 BTRFS_UUID_KEY_SUBVOL,
4490 key.objectid);
4491 if (ret < 0) {
4492 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4493 ret);
4494 break;
4495 }
4496 }
4497
4498 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4499 ret = btrfs_uuid_tree_add(trans,
4500 root_item.received_uuid,
4501 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4502 key.objectid);
4503 if (ret < 0) {
4504 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4505 ret);
4506 break;
4507 }
4508 }
4509
4510 skip:
4511 btrfs_release_path(path);
4512 if (trans) {
4513 ret = btrfs_end_transaction(trans);
4514 trans = NULL;
4515 if (ret)
4516 break;
4517 }
4518
4519 if (key.offset < (u64)-1) {
4520 key.offset++;
4521 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4522 key.offset = 0;
4523 key.type = BTRFS_ROOT_ITEM_KEY;
4524 } else if (key.objectid < (u64)-1) {
4525 key.offset = 0;
4526 key.type = BTRFS_ROOT_ITEM_KEY;
4527 key.objectid++;
4528 } else {
4529 break;
4530 }
4531 cond_resched();
4532 }
4533
4534 out:
4535 btrfs_free_path(path);
4536 if (trans && !IS_ERR(trans))
4537 btrfs_end_transaction(trans);
4538 if (ret)
4539 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4540 else if (!closing)
4541 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4542 up(&fs_info->uuid_tree_rescan_sem);
4543 return 0;
4544 }
4545
4546 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4547 {
4548 struct btrfs_trans_handle *trans;
4549 struct btrfs_root *tree_root = fs_info->tree_root;
4550 struct btrfs_root *uuid_root;
4551 struct task_struct *task;
4552 int ret;
4553
4554 /*
4555 * 1 - root node
4556 * 1 - root item
4557 */
4558 trans = btrfs_start_transaction(tree_root, 2);
4559 if (IS_ERR(trans))
4560 return PTR_ERR(trans);
4561
4562 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4563 if (IS_ERR(uuid_root)) {
4564 ret = PTR_ERR(uuid_root);
4565 btrfs_abort_transaction(trans, ret);
4566 btrfs_end_transaction(trans);
4567 return ret;
4568 }
4569
4570 fs_info->uuid_root = uuid_root;
4571
4572 ret = btrfs_commit_transaction(trans);
4573 if (ret)
4574 return ret;
4575
4576 down(&fs_info->uuid_tree_rescan_sem);
4577 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4578 if (IS_ERR(task)) {
4579 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4580 btrfs_warn(fs_info, "failed to start uuid_scan task");
4581 up(&fs_info->uuid_tree_rescan_sem);
4582 return PTR_ERR(task);
4583 }
4584
4585 return 0;
4586 }
4587
4588 /*
4589 * shrinking a device means finding all of the device extents past
4590 * the new size, and then following the back refs to the chunks.
4591 * The chunk relocation code actually frees the device extents.
4592 */
4593 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4594 {
4595 struct btrfs_fs_info *fs_info = device->fs_info;
4596 struct btrfs_root *root = fs_info->dev_root;
4597 struct btrfs_trans_handle *trans;
4598 struct btrfs_dev_extent *dev_extent = NULL;
4599 struct btrfs_path *path;
4600 u64 length;
4601 u64 chunk_offset;
4602 int ret;
4603 int slot;
4604 int failed = 0;
4605 bool retried = false;
4606 struct extent_buffer *l;
4607 struct btrfs_key key;
4608 struct btrfs_super_block *super_copy = fs_info->super_copy;
4609 u64 old_total = btrfs_super_total_bytes(super_copy);
4610 u64 old_size = btrfs_device_get_total_bytes(device);
4611 u64 diff;
4612 u64 start;
4613
4614 new_size = round_down(new_size, fs_info->sectorsize);
4615 start = new_size;
4616 diff = round_down(old_size - new_size, fs_info->sectorsize);
4617
4618 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4619 return -EINVAL;
4620
4621 path = btrfs_alloc_path();
4622 if (!path)
4623 return -ENOMEM;
4624
4625 path->reada = READA_BACK;
4626
4627 trans = btrfs_start_transaction(root, 0);
4628 if (IS_ERR(trans)) {
4629 btrfs_free_path(path);
4630 return PTR_ERR(trans);
4631 }
4632
4633 mutex_lock(&fs_info->chunk_mutex);
4634
4635 btrfs_device_set_total_bytes(device, new_size);
4636 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4637 device->fs_devices->total_rw_bytes -= diff;
4638 atomic64_sub(diff, &fs_info->free_chunk_space);
4639 }
4640
4641 /*
4642 * Once the device's size has been set to the new size, ensure all
4643 * in-memory chunks are synced to disk so that the loop below sees them
4644 * and relocates them accordingly.
4645 */
4646 if (contains_pending_extent(device, &start, diff)) {
4647 mutex_unlock(&fs_info->chunk_mutex);
4648 ret = btrfs_commit_transaction(trans);
4649 if (ret)
4650 goto done;
4651 } else {
4652 mutex_unlock(&fs_info->chunk_mutex);
4653 btrfs_end_transaction(trans);
4654 }
4655
4656 again:
4657 key.objectid = device->devid;
4658 key.offset = (u64)-1;
4659 key.type = BTRFS_DEV_EXTENT_KEY;
4660
4661 do {
4662 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4663 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4664 if (ret < 0) {
4665 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4666 goto done;
4667 }
4668
4669 ret = btrfs_previous_item(root, path, 0, key.type);
4670 if (ret)
4671 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4672 if (ret < 0)
4673 goto done;
4674 if (ret) {
4675 ret = 0;
4676 btrfs_release_path(path);
4677 break;
4678 }
4679
4680 l = path->nodes[0];
4681 slot = path->slots[0];
4682 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4683
4684 if (key.objectid != device->devid) {
4685 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4686 btrfs_release_path(path);
4687 break;
4688 }
4689
4690 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4691 length = btrfs_dev_extent_length(l, dev_extent);
4692
4693 if (key.offset + length <= new_size) {
4694 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4695 btrfs_release_path(path);
4696 break;
4697 }
4698
4699 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4700 btrfs_release_path(path);
4701
4702 /*
4703 * We may be relocating the only data chunk we have,
4704 * which could potentially end up losing the data's
4705 * raid profile, so let's allocate an empty one in
4706 * advance.
4707 */
4708 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4709 if (ret < 0) {
4710 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4711 goto done;
4712 }
4713
4714 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4715 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4716 if (ret == -ENOSPC) {
4717 failed++;
4718 } else if (ret) {
4719 if (ret == -ETXTBSY) {
4720 btrfs_warn(fs_info,
4721 "could not shrink block group %llu due to active swapfile",
4722 chunk_offset);
4723 }
4724 goto done;
4725 }
4726 } while (key.offset-- > 0);
4727
4728 if (failed && !retried) {
4729 failed = 0;
4730 retried = true;
4731 goto again;
4732 } else if (failed && retried) {
4733 ret = -ENOSPC;
4734 goto done;
4735 }
4736
4737 /* Shrinking succeeded, else we would be at "done". */
4738 trans = btrfs_start_transaction(root, 0);
4739 if (IS_ERR(trans)) {
4740 ret = PTR_ERR(trans);
4741 goto done;
4742 }
4743
4744 mutex_lock(&fs_info->chunk_mutex);
4745 /* Clear all state bits beyond the shrunk device size */
4746 clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4747 CHUNK_STATE_MASK);
4748
4749 btrfs_device_set_disk_total_bytes(device, new_size);
4750 if (list_empty(&device->post_commit_list))
4751 list_add_tail(&device->post_commit_list,
4752 &trans->transaction->dev_update_list);
4753
4754 WARN_ON(diff > old_total);
4755 btrfs_set_super_total_bytes(super_copy,
4756 round_down(old_total - diff, fs_info->sectorsize));
4757 mutex_unlock(&fs_info->chunk_mutex);
4758
4759 /* Now btrfs_update_device() will change the on-disk size. */
4760 ret = btrfs_update_device(trans, device);
4761 if (ret < 0) {
4762 btrfs_abort_transaction(trans, ret);
4763 btrfs_end_transaction(trans);
4764 } else {
4765 ret = btrfs_commit_transaction(trans);
4766 }
4767 done:
4768 btrfs_free_path(path);
4769 if (ret) {
4770 mutex_lock(&fs_info->chunk_mutex);
4771 btrfs_device_set_total_bytes(device, old_size);
4772 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4773 device->fs_devices->total_rw_bytes += diff;
4774 atomic64_add(diff, &fs_info->free_chunk_space);
4775 mutex_unlock(&fs_info->chunk_mutex);
4776 }
4777 return ret;
4778 }
4779
4780 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4781 struct btrfs_key *key,
4782 struct btrfs_chunk *chunk, int item_size)
4783 {
4784 struct btrfs_super_block *super_copy = fs_info->super_copy;
4785 struct btrfs_disk_key disk_key;
4786 u32 array_size;
4787 u8 *ptr;
4788
4789 mutex_lock(&fs_info->chunk_mutex);
4790 array_size = btrfs_super_sys_array_size(super_copy);
4791 if (array_size + item_size + sizeof(disk_key)
4792 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4793 mutex_unlock(&fs_info->chunk_mutex);
4794 return -EFBIG;
4795 }
4796
4797 ptr = super_copy->sys_chunk_array + array_size;
4798 btrfs_cpu_key_to_disk(&disk_key, key);
4799 memcpy(ptr, &disk_key, sizeof(disk_key));
4800 ptr += sizeof(disk_key);
4801 memcpy(ptr, chunk, item_size);
4802 item_size += sizeof(disk_key);
4803 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4804 mutex_unlock(&fs_info->chunk_mutex);
4805
4806 return 0;
4807 }
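/*
 * Layout sketch of super_copy->sys_chunk_array after the append above
 * (entry sizes depend on the number of stripes in each chunk):
 *
 *   [btrfs_disk_key 0][btrfs_chunk 0 + stripes][btrfs_disk_key 1][...]
 *
 * Each entry is a packed (key, chunk item) pair and
 * btrfs_super_sys_array_size() tracks the total number of bytes used,
 * capped at BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */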
4808
4809 /*
4810 * sort the devices in descending order by max_avail, total_avail
4811 */
4812 static int btrfs_cmp_device_info(const void *a, const void *b)
4813 {
4814 const struct btrfs_device_info *di_a = a;
4815 const struct btrfs_device_info *di_b = b;
4816
4817 if (di_a->max_avail > di_b->max_avail)
4818 return -1;
4819 if (di_a->max_avail < di_b->max_avail)
4820 return 1;
4821 if (di_a->total_avail > di_b->total_avail)
4822 return -1;
4823 if (di_a->total_avail < di_b->total_avail)
4824 return 1;
4825 return 0;
4826 }
4827
4828 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4829 {
4830 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4831 return;
4832
4833 btrfs_set_fs_incompat(info, RAID56);
4834 }
4835
4836 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4837 {
4838 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4839 return;
4840
4841 btrfs_set_fs_incompat(info, RAID1C34);
4842 }
4843
4844 /*
4845 * Structure used internally by btrfs_alloc_chunk() and its helpers.
4846 * Wraps needed parameters.
4847 */
4848 struct alloc_chunk_ctl {
4849 u64 start;
4850 u64 type;
4851 /* Total number of stripes to allocate */
4852 int num_stripes;
4853 /* sub_stripes info for map */
4854 int sub_stripes;
4855 /* Stripes per device */
4856 int dev_stripes;
4857 /* Maximum number of devices to use */
4858 int devs_max;
4859 /* Minimum number of devices to use */
4860 int devs_min;
4861 /* ndevs has to be a multiple of this */
4862 int devs_increment;
4863 /* Number of copies */
4864 int ncopies;
4865 /* Number of stripes worth of bytes to store parity information */
4866 int nparity;
4867 u64 max_stripe_size;
4868 u64 max_chunk_size;
4869 u64 dev_extent_min;
4870 u64 stripe_size;
4871 u64 chunk_size;
4872 int ndevs;
4873 };
4874
4875 static void init_alloc_chunk_ctl_policy_regular(
4876 struct btrfs_fs_devices *fs_devices,
4877 struct alloc_chunk_ctl *ctl)
4878 {
4879 u64 type = ctl->type;
4880
4881 if (type & BTRFS_BLOCK_GROUP_DATA) {
4882 ctl->max_stripe_size = SZ_1G;
4883 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4884 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4885 /* For larger filesystems, use larger metadata chunks */
4886 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4887 ctl->max_stripe_size = SZ_1G;
4888 else
4889 ctl->max_stripe_size = SZ_256M;
4890 ctl->max_chunk_size = ctl->max_stripe_size;
4891 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4892 ctl->max_stripe_size = SZ_32M;
4893 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4894 ctl->devs_max = min_t(int, ctl->devs_max,
4895 BTRFS_MAX_DEVS_SYS_CHUNK);
4896 } else {
4897 BUG();
4898 }
4899
4900 /* We don't want a chunk larger than 10% of writable space */
4901 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4902 ctl->max_chunk_size);
4903 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4904 }
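/*
 * Example of the 10% clamp above (assumed filesystem size): div_factor(x, 1)
 * computes x * 1 / 10, so on a filesystem with 40 GiB of writable space a
 * data chunk is limited to 4 GiB even though BTRFS_MAX_DATA_CHUNK_SIZE
 * would allow 10 GiB, while metadata on the same filesystem keeps its
 * 256 MiB max_chunk_size (total_rw_bytes is below 50 GiB).
 */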
4905
4906 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4907 struct alloc_chunk_ctl *ctl)
4908 {
4909 int index = btrfs_bg_flags_to_raid_index(ctl->type);
4910
4911 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4912 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4913 ctl->devs_max = btrfs_raid_array[index].devs_max;
4914 if (!ctl->devs_max)
4915 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4916 ctl->devs_min = btrfs_raid_array[index].devs_min;
4917 ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4918 ctl->ncopies = btrfs_raid_array[index].ncopies;
4919 ctl->nparity = btrfs_raid_array[index].nparity;
4920 ctl->ndevs = 0;
4921
4922 switch (fs_devices->chunk_alloc_policy) {
4923 case BTRFS_CHUNK_ALLOC_REGULAR:
4924 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4925 break;
4926 default:
4927 BUG();
4928 }
4929 }
4930
4931 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4932 struct alloc_chunk_ctl *ctl,
4933 struct btrfs_device_info *devices_info)
4934 {
4935 struct btrfs_fs_info *info = fs_devices->fs_info;
4936 struct btrfs_device *device;
4937 u64 total_avail;
4938 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4939 int ret;
4940 int ndevs = 0;
4941 u64 max_avail;
4942 u64 dev_offset;
4943
4944 /*
4945 * in the first pass through the devices list, we gather information
4946 * about the available holes on each device.
4947 */
4948 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4949 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4950 WARN(1, KERN_ERR
4951 "BTRFS: read-only device in alloc_list\n");
4952 continue;
4953 }
4954
4955 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4956 &device->dev_state) ||
4957 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4958 continue;
4959
4960 if (device->total_bytes > device->bytes_used)
4961 total_avail = device->total_bytes - device->bytes_used;
4962 else
4963 total_avail = 0;
4964
4965 /* If there is no space on this device, skip it. */
4966 if (total_avail < ctl->dev_extent_min)
4967 continue;
4968
4969 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
4970 &max_avail);
4971 if (ret && ret != -ENOSPC)
4972 return ret;
4973
4974 if (ret == 0)
4975 max_avail = dev_extent_want;
4976
4977 if (max_avail < ctl->dev_extent_min) {
4978 if (btrfs_test_opt(info, ENOSPC_DEBUG))
4979 btrfs_debug(info,
4980 "%s: devid %llu has no free space, have=%llu want=%llu",
4981 __func__, device->devid, max_avail,
4982 ctl->dev_extent_min);
4983 continue;
4984 }
4985
4986 if (ndevs == fs_devices->rw_devices) {
4987 WARN(1, "%s: found more than %llu devices\n",
4988 __func__, fs_devices->rw_devices);
4989 break;
4990 }
4991 devices_info[ndevs].dev_offset = dev_offset;
4992 devices_info[ndevs].max_avail = max_avail;
4993 devices_info[ndevs].total_avail = total_avail;
4994 devices_info[ndevs].dev = device;
4995 ++ndevs;
4996 }
4997 ctl->ndevs = ndevs;
4998
4999 /*
5000 * now sort the devices by hole size / available space
5001 */
5002 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5003 btrfs_cmp_device_info, NULL);
5004
5005 return 0;
5006 }
5007
5008 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5009 struct btrfs_device_info *devices_info)
5010 {
5011 /* Number of stripes that count for block group size */
5012 int data_stripes;
5013
5014 /*
5015 * The primary goal is to maximize the number of stripes, so use as
5016 * many devices as possible, even if the stripes are not maximum sized.
5017 *
5018 * The DUP profile stores more than one stripe per device; the
5019 * max_avail is the total size, so we have to adjust.
5020 */
5021 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5022 ctl->dev_stripes);
5023 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5024
5025 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5026 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5027
5028 /*
5029 * Use the number of data stripes to figure out how big this chunk is
5030 * really going to be in terms of logical address space, and compare
5031 * that answer with the max chunk size. If it's higher, we try to
5032 * reduce stripe_size.
5033 */
5034 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5035 /*
5036 * Reduce stripe_size, round it up to a 16MB boundary again and
5037 * then use it, unless it ends up being even bigger than the
5038 * previous value we had already.
5039 */
5040 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5041 data_stripes), SZ_16M),
5042 ctl->stripe_size);
5043 }
5044
5045 /* Align to BTRFS_STRIPE_LEN */
5046 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5047 ctl->chunk_size = ctl->stripe_size * data_stripes;
5048
5049 return 0;
5050 }
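/*
 * Worked example for the sizing above (assumed devices and profile):
 * a metadata DUP chunk on a single device whose largest hole is 300 MiB.
 * gather_device_info() reported max_avail = 300 MiB and dev_stripes = 2, so
 *
 *   stripe_size  = 300 MiB / 2 = 150 MiB
 *   num_stripes  = 1 * 2       = 2
 *   data_stripes = (2 - 0) / 2 = 1
 *
 * 150 MiB * 1 is below the 256 MiB max_chunk_size, so no reduction is
 * needed; after rounding down to BTRFS_STRIPE_LEN the chunk provides
 * 150 MiB of logical space backed by two 150 MiB stripes on one device.
 */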
5051
5052 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5053 struct alloc_chunk_ctl *ctl,
5054 struct btrfs_device_info *devices_info)
5055 {
5056 struct btrfs_fs_info *info = fs_devices->fs_info;
5057
5058 /*
5059 * Round down to the number of usable devices; devs_increment can be
5060 * any number, so we can't use round_down(), which requires a power of
5061 * 2, while rounddown() is safe.
5062 */
5063 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5064
5065 if (ctl->ndevs < ctl->devs_min) {
5066 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5067 btrfs_debug(info,
5068 "%s: not enough devices with free space: have=%d minimum required=%d",
5069 __func__, ctl->ndevs, ctl->devs_min);
5070 }
5071 return -ENOSPC;
5072 }
5073
5074 ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5075
5076 switch (fs_devices->chunk_alloc_policy) {
5077 case BTRFS_CHUNK_ALLOC_REGULAR:
5078 return decide_stripe_size_regular(ctl, devices_info);
5079 default:
5080 BUG();
5081 }
5082 }
5083
5084 static int create_chunk(struct btrfs_trans_handle *trans,
5085 struct alloc_chunk_ctl *ctl,
5086 struct btrfs_device_info *devices_info)
5087 {
5088 struct btrfs_fs_info *info = trans->fs_info;
5089 struct map_lookup *map = NULL;
5090 struct extent_map_tree *em_tree;
5091 struct extent_map *em;
5092 u64 start = ctl->start;
5093 u64 type = ctl->type;
5094 int ret;
5095 int i;
5096 int j;
5097
5098 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5099 if (!map)
5100 return -ENOMEM;
5101 map->num_stripes = ctl->num_stripes;
5102
5103 for (i = 0; i < ctl->ndevs; ++i) {
5104 for (j = 0; j < ctl->dev_stripes; ++j) {
5105 int s = i * ctl->dev_stripes + j;
5106 map->stripes[s].dev = devices_info[i].dev;
5107 map->stripes[s].physical = devices_info[i].dev_offset +
5108 j * ctl->stripe_size;
5109 }
5110 }
5111 map->stripe_len = BTRFS_STRIPE_LEN;
5112 map->io_align = BTRFS_STRIPE_LEN;
5113 map->io_width = BTRFS_STRIPE_LEN;
5114 map->type = type;
5115 map->sub_stripes = ctl->sub_stripes;
5116
5117 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5118
5119 em = alloc_extent_map();
5120 if (!em) {
5121 kfree(map);
5122 return -ENOMEM;
5123 }
5124 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5125 em->map_lookup = map;
5126 em->start = start;
5127 em->len = ctl->chunk_size;
5128 em->block_start = 0;
5129 em->block_len = em->len;
5130 em->orig_block_len = ctl->stripe_size;
5131
5132 em_tree = &info->mapping_tree;
5133 write_lock(&em_tree->lock);
5134 ret = add_extent_mapping(em_tree, em, 0);
5135 if (ret) {
5136 write_unlock(&em_tree->lock);
5137 free_extent_map(em);
5138 return ret;
5139 }
5140 write_unlock(&em_tree->lock);
5141
5142 ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5143 if (ret)
5144 goto error_del_extent;
5145
5146 for (i = 0; i < map->num_stripes; i++) {
5147 struct btrfs_device *dev = map->stripes[i].dev;
5148
5149 btrfs_device_set_bytes_used(dev,
5150 dev->bytes_used + ctl->stripe_size);
5151 if (list_empty(&dev->post_commit_list))
5152 list_add_tail(&dev->post_commit_list,
5153 &trans->transaction->dev_update_list);
5154 }
5155
5156 atomic64_sub(ctl->stripe_size * map->num_stripes,
5157 &info->free_chunk_space);
5158
5159 free_extent_map(em);
5160 check_raid56_incompat_flag(info, type);
5161 check_raid1c34_incompat_flag(info, type);
5162
5163 return 0;
5164
5165 error_del_extent:
5166 write_lock(&em_tree->lock);
5167 remove_extent_mapping(em_tree, em);
5168 write_unlock(&em_tree->lock);
5169
5170 /* One for our allocation */
5171 free_extent_map(em);
5172 /* One for the tree reference */
5173 free_extent_map(em);
5174
5175 return ret;
5176 }
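/*
 * Example of the stripe placement computed in the nested loop above
 * (assumed DUP chunk, dev_stripes = 2, one device): both stripes land on
 * the same device back to back,
 *
 *   stripes[0].physical = dev_offset
 *   stripes[1].physical = dev_offset + stripe_size
 *
 * whereas a RAID1 chunk (dev_stripes = 1, ndevs = 2) places one stripe at
 * devices_info[0].dev_offset and the other at devices_info[1].dev_offset.
 */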
5177
5178 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5179 {
5180 struct btrfs_fs_info *info = trans->fs_info;
5181 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5182 struct btrfs_device_info *devices_info = NULL;
5183 struct alloc_chunk_ctl ctl;
5184 int ret;
5185
5186 lockdep_assert_held(&info->chunk_mutex);
5187
5188 if (!alloc_profile_is_valid(type, 0)) {
5189 ASSERT(0);
5190 return -EINVAL;
5191 }
5192
5193 if (list_empty(&fs_devices->alloc_list)) {
5194 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5195 btrfs_debug(info, "%s: no writable device", __func__);
5196 return -ENOSPC;
5197 }
5198
5199 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5200 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5201 ASSERT(0);
5202 return -EINVAL;
5203 }
5204
5205 ctl.start = find_next_chunk(info);
5206 ctl.type = type;
5207 init_alloc_chunk_ctl(fs_devices, &ctl);
5208
5209 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5210 GFP_NOFS);
5211 if (!devices_info)
5212 return -ENOMEM;
5213
5214 ret = gather_device_info(fs_devices, &ctl, devices_info);
5215 if (ret < 0)
5216 goto out;
5217
5218 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5219 if (ret < 0)
5220 goto out;
5221
5222 ret = create_chunk(trans, &ctl, devices_info);
5223
5224 out:
5225 kfree(devices_info);
5226 return ret;
5227 }
5228
5229 /*
5230 * Chunk allocation falls into two parts. The first part does work
5231 * that makes the new allocated chunk usable, but does not do any operation
5232 * that modifies the chunk tree. The second part does the work that
5233 * requires modifying the chunk tree. This division is important for the
5234 * bootstrap process of adding storage to a seed btrfs.
5235 */
5236 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5237 u64 chunk_offset, u64 chunk_size)
5238 {
5239 struct btrfs_fs_info *fs_info = trans->fs_info;
5240 struct btrfs_root *extent_root = fs_info->extent_root;
5241 struct btrfs_root *chunk_root = fs_info->chunk_root;
5242 struct btrfs_key key;
5243 struct btrfs_device *device;
5244 struct btrfs_chunk *chunk;
5245 struct btrfs_stripe *stripe;
5246 struct extent_map *em;
5247 struct map_lookup *map;
5248 size_t item_size;
5249 u64 dev_offset;
5250 u64 stripe_size;
5251 int i = 0;
5252 int ret = 0;
5253
5254 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5255 if (IS_ERR(em))
5256 return PTR_ERR(em);
5257
5258 map = em->map_lookup;
5259 item_size = btrfs_chunk_item_size(map->num_stripes);
5260 stripe_size = em->orig_block_len;
5261
5262 chunk = kzalloc(item_size, GFP_NOFS);
5263 if (!chunk) {
5264 ret = -ENOMEM;
5265 goto out;
5266 }
5267
5268 /*
5269 * Take the device list mutex to prevent races with the final phase of
5270 * a device replace operation that replaces the device object associated
5271 * with the map's stripes, because the device object's id can change
5272 * at any time during that final phase of the device replace operation
5273 * (dev-replace.c:btrfs_dev_replace_finishing()).
5274 */
5275 mutex_lock(&fs_info->fs_devices->device_list_mutex);
5276 for (i = 0; i < map->num_stripes; i++) {
5277 device = map->stripes[i].dev;
5278 dev_offset = map->stripes[i].physical;
5279
5280 ret = btrfs_update_device(trans, device);
5281 if (ret)
5282 break;
5283 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5284 dev_offset, stripe_size);
5285 if (ret)
5286 break;
5287 }
5288 if (ret) {
5289 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5290 goto out;
5291 }
5292
5293 stripe = &chunk->stripe;
5294 for (i = 0; i < map->num_stripes; i++) {
5295 device = map->stripes[i].dev;
5296 dev_offset = map->stripes[i].physical;
5297
5298 btrfs_set_stack_stripe_devid(stripe, device->devid);
5299 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5300 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5301 stripe++;
5302 }
5303 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5304
5305 btrfs_set_stack_chunk_length(chunk, chunk_size);
5306 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5307 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5308 btrfs_set_stack_chunk_type(chunk, map->type);
5309 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5310 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5311 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5312 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5313 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5314
5315 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5316 key.type = BTRFS_CHUNK_ITEM_KEY;
5317 key.offset = chunk_offset;
5318
5319 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5320 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5321 /*
5322 * TODO: Cleanup of inserted chunk root in case of
5323 * failure.
5324 */
5325 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5326 }
5327
5328 out:
5329 kfree(chunk);
5330 free_extent_map(em);
5331 return ret;
5332 }
5333
5334 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5335 {
5336 struct btrfs_fs_info *fs_info = trans->fs_info;
5337 u64 alloc_profile;
5338 int ret;
5339
5340 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5341 ret = btrfs_alloc_chunk(trans, alloc_profile);
5342 if (ret)
5343 return ret;
5344
5345 alloc_profile = btrfs_system_alloc_profile(fs_info);
5346 ret = btrfs_alloc_chunk(trans, alloc_profile);
5347 return ret;
5348 }
5349
5350 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5351 {
5352 const int index = btrfs_bg_flags_to_raid_index(map->type);
5353
5354 return btrfs_raid_array[index].tolerated_failures;
5355 }
5356
5357 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5358 {
5359 struct extent_map *em;
5360 struct map_lookup *map;
5361 int readonly = 0;
5362 int miss_ndevs = 0;
5363 int i;
5364
5365 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5366 if (IS_ERR(em))
5367 return 1;
5368
5369 map = em->map_lookup;
5370 for (i = 0; i < map->num_stripes; i++) {
5371 if (test_bit(BTRFS_DEV_STATE_MISSING,
5372 &map->stripes[i].dev->dev_state)) {
5373 miss_ndevs++;
5374 continue;
5375 }
5376 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5377 &map->stripes[i].dev->dev_state)) {
5378 readonly = 1;
5379 goto end;
5380 }
5381 }
5382
5383 /*
5384 * If the number of missing devices is larger than max errors,
5385 * we cannot write the data into that chunk successfully, so
5386 * set it readonly.
5387 */
5388 if (miss_ndevs > btrfs_chunk_max_errors(map))
5389 readonly = 1;
5390 end:
5391 free_extent_map(em);
5392 return readonly;
5393 }
5394
5395 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5396 {
5397 struct extent_map *em;
5398
5399 while (1) {
5400 write_lock(&tree->lock);
5401 em = lookup_extent_mapping(tree, 0, (u64)-1);
5402 if (em)
5403 remove_extent_mapping(tree, em);
5404 write_unlock(&tree->lock);
5405 if (!em)
5406 break;
5407 /* once for us */
5408 free_extent_map(em);
5409 /* once for the tree */
5410 free_extent_map(em);
5411 }
5412 }
5413
5414 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5415 {
5416 struct extent_map *em;
5417 struct map_lookup *map;
5418 int ret;
5419
5420 em = btrfs_get_chunk_map(fs_info, logical, len);
5421 if (IS_ERR(em))
5422 /*
5423 * We could return errors for these cases, but that could get
5424 * ugly and we'd probably do the same thing which is just not do
5425 * anything else and exit, so return 1 so the callers don't try
5426 * to use other copies.
5427 */
5428 return 1;
5429
5430 map = em->map_lookup;
5431 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5432 ret = map->num_stripes;
5433 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5434 ret = map->sub_stripes;
5435 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5436 ret = 2;
5437 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5438 /*
5439 * There could be two corrupted data stripes, we need
5440 * to loop retry in order to rebuild the correct data.
5441 *
5442 * Fail a stripe at a time on every retry except the
5443 * stripe under reconstruction.
5444 */
5445 ret = map->num_stripes;
5446 else
5447 ret = 1;
5448 free_extent_map(em);
5449
5450 down_read(&fs_info->dev_replace.rwsem);
5451 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5452 fs_info->dev_replace.tgtdev)
5453 ret++;
5454 up_read(&fs_info->dev_replace.rwsem);
5455
5456 return ret;
5457 }
5458
5459 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5460 u64 logical)
5461 {
5462 struct extent_map *em;
5463 struct map_lookup *map;
5464 unsigned long len = fs_info->sectorsize;
5465
5466 em = btrfs_get_chunk_map(fs_info, logical, len);
5467
5468 if (!WARN_ON(IS_ERR(em))) {
5469 map = em->map_lookup;
5470 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5471 len = map->stripe_len * nr_data_stripes(map);
5472 free_extent_map(em);
5473 }
5474 return len;
5475 }
5476
5477 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5478 {
5479 struct extent_map *em;
5480 struct map_lookup *map;
5481 int ret = 0;
5482
5483 em = btrfs_get_chunk_map(fs_info, logical, len);
5484
5485 if (!WARN_ON(IS_ERR(em))) {
5486 map = em->map_lookup;
5487 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5488 ret = 1;
5489 free_extent_map(em);
5490 }
5491 return ret;
5492 }
5493
5494 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5495 struct map_lookup *map, int first,
5496 int dev_replace_is_ongoing)
5497 {
5498 int i;
5499 int num_stripes;
5500 int preferred_mirror;
5501 int tolerance;
5502 struct btrfs_device *srcdev;
5503
5504 ASSERT((map->type &
5505 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5506
5507 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5508 num_stripes = map->sub_stripes;
5509 else
5510 num_stripes = map->num_stripes;
5511
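/*
 * Illustrative note: current->pid % num_stripes picks a per-task
 * "preferred" stripe so that concurrent readers are spread across the
 * mirrors, e.g. with num_stripes = 2 and first = 0, even PIDs prefer
 * stripe 0 and odd PIDs prefer stripe 1.
 */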
5512 preferred_mirror = first + current->pid % num_stripes;
5513
5514 if (dev_replace_is_ongoing &&
5515 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5516 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5517 srcdev = fs_info->dev_replace.srcdev;
5518 else
5519 srcdev = NULL;
5520
5521 /*
5522 * try to avoid the drive that is the source drive for a
5523 * dev-replace procedure, only choose it if no other non-missing
5524 * mirror is available
5525 */
5526 for (tolerance = 0; tolerance < 2; tolerance++) {
5527 if (map->stripes[preferred_mirror].dev->bdev &&
5528 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5529 return preferred_mirror;
5530 for (i = first; i < first + num_stripes; i++) {
5531 if (map->stripes[i].dev->bdev &&
5532 (tolerance || map->stripes[i].dev != srcdev))
5533 return i;
5534 }
5535 }
5536
5537 /* we couldn't find one that doesn't fail. Just return something
5538 * and the io error handling code will clean up eventually
5539 */
5540 return preferred_mirror;
5541 }
5542
5543 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5544 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5545 {
5546 int i;
5547 int again = 1;
5548
5549 while (again) {
5550 again = 0;
5551 for (i = 0; i < num_stripes - 1; i++) {
5552 /* Swap if parity is on a smaller index */
5553 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5554 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5555 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5556 again = 1;
5557 }
5558 }
5559 }
5560 }
5561
5562 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5563 {
5564 struct btrfs_bio *bbio = kzalloc(
5565 /* the size of the btrfs_bio */
5566 sizeof(struct btrfs_bio) +
5567 /* plus the variable array for the stripes */
5568 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5569 /* plus the variable array for the tgt dev */
5570 sizeof(int) * (real_stripes) +
5571 /*
5572 * plus the raid_map, which includes both the tgt dev
5573 * and the stripes
5574 */
5575 sizeof(u64) * (total_stripes),
5576 GFP_NOFS|__GFP_NOFAIL);
5577
5578 atomic_set(&bbio->error, 0);
5579 refcount_set(&bbio->refs, 1);
5580
5581 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5582 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
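/*
 * Illustrative layout of the single allocation above:
 *
 *   [struct btrfs_bio + stripes[total_stripes]][tgtdev_map[real_stripes]][raid_map[total_stripes]]
 *
 * which is why tgtdev_map starts right after the stripe array and
 * raid_map right after tgtdev_map.
 */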
5583
5584 return bbio;
5585 }
5586
5587 void btrfs_get_bbio(struct btrfs_bio *bbio)
5588 {
5589 WARN_ON(!refcount_read(&bbio->refs));
5590 refcount_inc(&bbio->refs);
5591 }
5592
5593 void btrfs_put_bbio(struct btrfs_bio *bbio)
5594 {
5595 if (!bbio)
5596 return;
5597 if (refcount_dec_and_test(&bbio->refs))
5598 kfree(bbio);
5599 }
5600
5601 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5602 /*
5603 * Note that discard won't be sent to the target device of device
5604 * replace.
5605 */
5606 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5607 u64 logical, u64 *length_ret,
5608 struct btrfs_bio **bbio_ret)
5609 {
5610 struct extent_map *em;
5611 struct map_lookup *map;
5612 struct btrfs_bio *bbio;
5613 u64 length = *length_ret;
5614 u64 offset;
5615 u64 stripe_nr;
5616 u64 stripe_nr_end;
5617 u64 stripe_end_offset;
5618 u64 stripe_cnt;
5619 u64 stripe_len;
5620 u64 stripe_offset;
5621 u64 num_stripes;
5622 u32 stripe_index;
5623 u32 factor = 0;
5624 u32 sub_stripes = 0;
5625 u64 stripes_per_dev = 0;
5626 u32 remaining_stripes = 0;
5627 u32 last_stripe = 0;
5628 int ret = 0;
5629 int i;
5630
5631 /* discard always returns a bbio */
5632 ASSERT(bbio_ret);
5633
5634 em = btrfs_get_chunk_map(fs_info, logical, length);
5635 if (IS_ERR(em))
5636 return PTR_ERR(em);
5637
5638 map = em->map_lookup;
5639 /* we don't discard raid56 yet */
5640 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5641 ret = -EOPNOTSUPP;
5642 goto out;
5643 }
5644
5645 offset = logical - em->start;
5646 length = min_t(u64, em->start + em->len - logical, length);
5647 *length_ret = length;
5648
5649 stripe_len = map->stripe_len;
5650 /*
5651 * stripe_nr counts the total number of stripes we have to stride
5652 * to get to this block
5653 */
5654 stripe_nr = div64_u64(offset, stripe_len);
5655
5656 /* stripe_offset is the offset of this block in its stripe */
5657 stripe_offset = offset - stripe_nr * stripe_len;
5658
5659 stripe_nr_end = round_up(offset + length, map->stripe_len);
5660 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5661 stripe_cnt = stripe_nr_end - stripe_nr;
5662 stripe_end_offset = stripe_nr_end * map->stripe_len -
5663 (offset + length);
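/*
 * Illustrative example, assuming a 64K stripe_len: for offset = 200K and
 * length = 100K we get stripe_nr = 3, stripe_offset = 8K,
 * stripe_nr_end = 5, stripe_cnt = 2 and stripe_end_offset = 20K.
 */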
5664 /*
5665 * after this, stripe_nr is the number of stripes on this
5666 * device we have to walk to find the data, and stripe_index is
5667 * the number of our device in the stripe array
5668 */
5669 num_stripes = 1;
5670 stripe_index = 0;
5671 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5672 BTRFS_BLOCK_GROUP_RAID10)) {
5673 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5674 sub_stripes = 1;
5675 else
5676 sub_stripes = map->sub_stripes;
5677
5678 factor = map->num_stripes / sub_stripes;
5679 num_stripes = min_t(u64, map->num_stripes,
5680 sub_stripes * stripe_cnt);
5681 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5682 stripe_index *= sub_stripes;
5683 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5684 &remaining_stripes);
5685 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5686 last_stripe *= sub_stripes;
5687 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5688 BTRFS_BLOCK_GROUP_DUP)) {
5689 num_stripes = map->num_stripes;
5690 } else {
5691 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5692 &stripe_index);
5693 }
5694
5695 bbio = alloc_btrfs_bio(num_stripes, 0);
5696 if (!bbio) {
5697 ret = -ENOMEM;
5698 goto out;
5699 }
5700
5701 for (i = 0; i < num_stripes; i++) {
5702 bbio->stripes[i].physical =
5703 map->stripes[stripe_index].physical +
5704 stripe_offset + stripe_nr * map->stripe_len;
5705 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5706
5707 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5708 BTRFS_BLOCK_GROUP_RAID10)) {
5709 bbio->stripes[i].length = stripes_per_dev *
5710 map->stripe_len;
5711
5712 if (i / sub_stripes < remaining_stripes)
5713 bbio->stripes[i].length +=
5714 map->stripe_len;
5715
5716 /*
5717 * Special for the first stripe and
5718 * the last stripe:
5719 *
5720 * |-------|...|-------|
5721 * |----------|
5722 * off end_off
5723 */
5724 if (i < sub_stripes)
5725 bbio->stripes[i].length -=
5726 stripe_offset;
5727
5728 if (stripe_index >= last_stripe &&
5729 stripe_index <= (last_stripe +
5730 sub_stripes - 1))
5731 bbio->stripes[i].length -=
5732 stripe_end_offset;
5733
5734 if (i == sub_stripes - 1)
5735 stripe_offset = 0;
5736 } else {
5737 bbio->stripes[i].length = length;
5738 }
5739
5740 stripe_index++;
5741 if (stripe_index == map->num_stripes) {
5742 stripe_index = 0;
5743 stripe_nr++;
5744 }
5745 }
5746
5747 *bbio_ret = bbio;
5748 bbio->map_type = map->type;
5749 bbio->num_stripes = num_stripes;
5750 out:
5751 free_extent_map(em);
5752 return ret;
5753 }
5754
5755 /*
5756 * In dev-replace case, for repair case (that's the only case where the mirror
5757 * is selected explicitly when calling btrfs_map_block), blocks left of the
5758 * left cursor can also be read from the target drive.
5759 *
5760 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5761 * array of stripes.
5762 * For READ, it also needs to be supported using the same mirror number.
5763 *
5764 * If the requested block is not left of the left cursor, EIO is returned. This
5765 * can happen because btrfs_num_copies() returns one more in the dev-replace
5766 * case.
5767 */
5768 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5769 u64 logical, u64 length,
5770 u64 srcdev_devid, int *mirror_num,
5771 u64 *physical)
5772 {
5773 struct btrfs_bio *bbio = NULL;
5774 int num_stripes;
5775 int index_srcdev = 0;
5776 int found = 0;
5777 u64 physical_of_found = 0;
5778 int i;
5779 int ret = 0;
5780
5781 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5782 logical, &length, &bbio, 0, 0);
5783 if (ret) {
5784 ASSERT(bbio == NULL);
5785 return ret;
5786 }
5787
5788 num_stripes = bbio->num_stripes;
5789 if (*mirror_num > num_stripes) {
5790 /*
5791 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5792 * that means that the requested area is not left of the left
5793 * cursor
5794 */
5795 btrfs_put_bbio(bbio);
5796 return -EIO;
5797 }
5798
5799 /*
5800 * process the rest of the function using the mirror_num of the source
5801 * drive. Therefore look it up first. At the end, patch the device
5802 * pointer to the one of the target drive.
5803 */
5804 for (i = 0; i < num_stripes; i++) {
5805 if (bbio->stripes[i].dev->devid != srcdev_devid)
5806 continue;
5807
5808 /*
5809 * In case of DUP, in order to keep it simple, only add the
5810 * mirror with the lowest physical address
5811 */
5812 if (found &&
5813 physical_of_found <= bbio->stripes[i].physical)
5814 continue;
5815
5816 index_srcdev = i;
5817 found = 1;
5818 physical_of_found = bbio->stripes[i].physical;
5819 }
5820
5821 btrfs_put_bbio(bbio);
5822
5823 ASSERT(found);
5824 if (!found)
5825 return -EIO;
5826
5827 *mirror_num = index_srcdev + 1;
5828 *physical = physical_of_found;
5829 return ret;
5830 }
5831
5832 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5833 struct btrfs_bio **bbio_ret,
5834 struct btrfs_dev_replace *dev_replace,
5835 int *num_stripes_ret, int *max_errors_ret)
5836 {
5837 struct btrfs_bio *bbio = *bbio_ret;
5838 u64 srcdev_devid = dev_replace->srcdev->devid;
5839 int tgtdev_indexes = 0;
5840 int num_stripes = *num_stripes_ret;
5841 int max_errors = *max_errors_ret;
5842 int i;
5843
5844 if (op == BTRFS_MAP_WRITE) {
5845 int index_where_to_add;
5846
5847 /*
5848 * duplicate the write operations while the dev replace
5849 * procedure is running. Since the copying of the old disk to
5850 * the new disk takes place at run time while the filesystem is
5851 * mounted writable, the regular write operations to the old
5852 * disk have to be duplicated to go to the new disk as well.
5853 *
5854 * Note that device->missing is handled by the caller, and that
5855 * the write to the old disk is already set up in the stripes
5856 * array.
5857 */
5858 index_where_to_add = num_stripes;
5859 for (i = 0; i < num_stripes; i++) {
5860 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5861 /* write to new disk, too */
5862 struct btrfs_bio_stripe *new =
5863 bbio->stripes + index_where_to_add;
5864 struct btrfs_bio_stripe *old =
5865 bbio->stripes + i;
5866
5867 new->physical = old->physical;
5868 new->length = old->length;
5869 new->dev = dev_replace->tgtdev;
5870 bbio->tgtdev_map[i] = index_where_to_add;
5871 index_where_to_add++;
5872 max_errors++;
5873 tgtdev_indexes++;
5874 }
5875 }
5876 num_stripes = index_where_to_add;
5877 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5878 int index_srcdev = 0;
5879 int found = 0;
5880 u64 physical_of_found = 0;
5881
5882 /*
5883 * During the dev-replace procedure, the target drive can also
5884 * be used to read data in case it is needed to repair a corrupt
5885 * block elsewhere. This is possible if the requested area is
5886 * left of the left cursor. In this area, the target drive is a
5887 * full copy of the source drive.
5888 */
5889 for (i = 0; i < num_stripes; i++) {
5890 if (bbio->stripes[i].dev->devid == srcdev_devid) {
5891 /*
5892 * In case of DUP, in order to keep it simple,
5893 * only add the mirror with the lowest physical
5894 * address
5895 */
5896 if (found &&
5897 physical_of_found <=
5898 bbio->stripes[i].physical)
5899 continue;
5900 index_srcdev = i;
5901 found = 1;
5902 physical_of_found = bbio->stripes[i].physical;
5903 }
5904 }
5905 if (found) {
5906 struct btrfs_bio_stripe *tgtdev_stripe =
5907 bbio->stripes + num_stripes;
5908
5909 tgtdev_stripe->physical = physical_of_found;
5910 tgtdev_stripe->length =
5911 bbio->stripes[index_srcdev].length;
5912 tgtdev_stripe->dev = dev_replace->tgtdev;
5913 bbio->tgtdev_map[index_srcdev] = num_stripes;
5914
5915 tgtdev_indexes++;
5916 num_stripes++;
5917 }
5918 }
5919
5920 *num_stripes_ret = num_stripes;
5921 *max_errors_ret = max_errors;
5922 bbio->num_tgtdevs = tgtdev_indexes;
5923 *bbio_ret = bbio;
5924 }
5925
5926 static bool need_full_stripe(enum btrfs_map_op op)
5927 {
5928 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5929 }
5930
5931 /*
5932 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5933 * tuple. This information is used to calculate how big a
5934 * particular bio can get before it straddles a stripe.
5935 *
5936 * @fs_info - the filesystem
5937 * @logical - address that we want to figure out the geometry of
5938 * @len - the length of IO we are going to perform, starting at @logical
5939 * @op - type of operation - write or read
5940 * @io_geom - pointer used to return values
5941 *
5942 * Returns < 0 in case a chunk for the given logical address cannot be found,
5943 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5944 */
5945 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5946 u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5947 {
5948 struct extent_map *em;
5949 struct map_lookup *map;
5950 u64 offset;
5951 u64 stripe_offset;
5952 u64 stripe_nr;
5953 u64 stripe_len;
5954 u64 raid56_full_stripe_start = (u64)-1;
5955 int data_stripes;
5956 int ret = 0;
5957
5958 ASSERT(op != BTRFS_MAP_DISCARD);
5959
5960 em = btrfs_get_chunk_map(fs_info, logical, len);
5961 if (IS_ERR(em))
5962 return PTR_ERR(em);
5963
5964 map = em->map_lookup;
5965 /* Offset of this logical address in the chunk */
5966 offset = logical - em->start;
5967 /* Len of a stripe in a chunk */
5968 stripe_len = map->stripe_len;
5969 /* Stripe this block falls in */
5970 stripe_nr = div64_u64(offset, stripe_len);
5971 /* Offset of stripe in the chunk */
5972 stripe_offset = stripe_nr * stripe_len;
5973 if (offset < stripe_offset) {
5974 btrfs_crit(fs_info,
5975 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5976 stripe_offset, offset, em->start, logical, stripe_len);
5977 ret = -EINVAL;
5978 goto out;
5979 }
5980
5981 /* stripe_offset is the offset of this block in its stripe */
5982 stripe_offset = offset - stripe_offset;
5983 data_stripes = nr_data_stripes(map);
5984
5985 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5986 u64 max_len = stripe_len - stripe_offset;
5987
5988 /*
5989 * In case of raid56, we need to know the stripe aligned start
5990 */
5991 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5992 unsigned long full_stripe_len = stripe_len * data_stripes;
5993 raid56_full_stripe_start = offset;
5994
5995 /*
5996 * Allow a write of a full stripe, but make sure we
5997 * don't allow straddling of stripes
5998 */
5999 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6000 full_stripe_len);
6001 raid56_full_stripe_start *= full_stripe_len;
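/*
 * Illustrative example: with stripe_len = 64K and two data
 * stripes (RAID5 over three devices) the full stripe is 128K,
 * so an offset of 192K into the chunk rounds down to a full
 * stripe start of 128K.
 */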
6002
6003 /*
6004 * For writes to RAID[56], allow a full stripeset across
6005 * all disks. For other RAID types and for RAID[56]
6006 * reads, just allow a single stripe (on a single disk).
6007 */
6008 if (op == BTRFS_MAP_WRITE) {
6009 max_len = stripe_len * data_stripes -
6010 (offset - raid56_full_stripe_start);
6011 }
6012 }
6013 len = min_t(u64, em->len - offset, max_len);
6014 } else {
6015 len = em->len - offset;
6016 }
6017
6018 io_geom->len = len;
6019 io_geom->offset = offset;
6020 io_geom->stripe_len = stripe_len;
6021 io_geom->stripe_nr = stripe_nr;
6022 io_geom->stripe_offset = stripe_offset;
6023 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6024
6025 out:
6026 /* once for us */
6027 free_extent_map(em);
6028 return ret;
6029 }
6030
6031 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6032 enum btrfs_map_op op,
6033 u64 logical, u64 *length,
6034 struct btrfs_bio **bbio_ret,
6035 int mirror_num, int need_raid_map)
6036 {
6037 struct extent_map *em;
6038 struct map_lookup *map;
6039 u64 stripe_offset;
6040 u64 stripe_nr;
6041 u64 stripe_len;
6042 u32 stripe_index;
6043 int data_stripes;
6044 int i;
6045 int ret = 0;
6046 int num_stripes;
6047 int max_errors = 0;
6048 int tgtdev_indexes = 0;
6049 struct btrfs_bio *bbio = NULL;
6050 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6051 int dev_replace_is_ongoing = 0;
6052 int num_alloc_stripes;
6053 int patch_the_first_stripe_for_dev_replace = 0;
6054 u64 physical_to_patch_in_first_stripe = 0;
6055 u64 raid56_full_stripe_start = (u64)-1;
6056 struct btrfs_io_geometry geom;
6057
6058 ASSERT(bbio_ret);
6059 ASSERT(op != BTRFS_MAP_DISCARD);
6060
6061 ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6062 if (ret < 0)
6063 return ret;
6064
6065 em = btrfs_get_chunk_map(fs_info, logical, *length);
6066 ASSERT(!IS_ERR(em));
6067 map = em->map_lookup;
6068
6069 *length = geom.len;
6070 stripe_len = geom.stripe_len;
6071 stripe_nr = geom.stripe_nr;
6072 stripe_offset = geom.stripe_offset;
6073 raid56_full_stripe_start = geom.raid56_stripe_offset;
6074 data_stripes = nr_data_stripes(map);
6075
6076 down_read(&dev_replace->rwsem);
6077 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6078 /*
6079 * Hold the semaphore for read during the whole operation, write is
6080 * requested at commit time but must wait.
6081 */
6082 if (!dev_replace_is_ongoing)
6083 up_read(&dev_replace->rwsem);
6084
6085 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6086 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6087 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6088 dev_replace->srcdev->devid,
6089 &mirror_num,
6090 &physical_to_patch_in_first_stripe);
6091 if (ret)
6092 goto out;
6093 else
6094 patch_the_first_stripe_for_dev_replace = 1;
6095 } else if (mirror_num > map->num_stripes) {
6096 mirror_num = 0;
6097 }
6098
6099 num_stripes = 1;
6100 stripe_index = 0;
6101 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6102 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6103 &stripe_index);
6104 if (!need_full_stripe(op))
6105 mirror_num = 1;
6106 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6107 if (need_full_stripe(op))
6108 num_stripes = map->num_stripes;
6109 else if (mirror_num)
6110 stripe_index = mirror_num - 1;
6111 else {
6112 stripe_index = find_live_mirror(fs_info, map, 0,
6113 dev_replace_is_ongoing);
6114 mirror_num = stripe_index + 1;
6115 }
6116
6117 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6118 if (need_full_stripe(op)) {
6119 num_stripes = map->num_stripes;
6120 } else if (mirror_num) {
6121 stripe_index = mirror_num - 1;
6122 } else {
6123 mirror_num = 1;
6124 }
6125
6126 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6127 u32 factor = map->num_stripes / map->sub_stripes;
6128
6129 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6130 stripe_index *= map->sub_stripes;
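/*
 * Illustrative example: RAID10 over four devices has sub_stripes = 2,
 * so factor = 2; the remainder of stripe_nr selects one of the two
 * mirror pairs and stripe_index points at the first copy in that pair.
 */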
6131
6132 if (need_full_stripe(op))
6133 num_stripes = map->sub_stripes;
6134 else if (mirror_num)
6135 stripe_index += mirror_num - 1;
6136 else {
6137 int old_stripe_index = stripe_index;
6138 stripe_index = find_live_mirror(fs_info, map,
6139 stripe_index,
6140 dev_replace_is_ongoing);
6141 mirror_num = stripe_index - old_stripe_index + 1;
6142 }
6143
6144 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6145 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6146 /* push stripe_nr back to the start of the full stripe */
6147 stripe_nr = div64_u64(raid56_full_stripe_start,
6148 stripe_len * data_stripes);
6149
6150 /* RAID[56] write or recovery. Return all stripes */
6151 num_stripes = map->num_stripes;
6152 max_errors = nr_parity_stripes(map);
6153
6154 *length = map->stripe_len;
6155 stripe_index = 0;
6156 stripe_offset = 0;
6157 } else {
6158 /*
6159 * Mirror #0 or #1 means the original data block.
6160 * Mirror #2 is RAID5 parity block.
6161 * Mirror #3 is RAID6 Q block.
6162 */
6163 stripe_nr = div_u64_rem(stripe_nr,
6164 data_stripes, &stripe_index);
6165 if (mirror_num > 1)
6166 stripe_index = data_stripes + mirror_num - 2;
6167
6168 /* We distribute the parity blocks across stripes */
6169 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6170 &stripe_index);
6171 if (!need_full_stripe(op) && mirror_num <= 1)
6172 mirror_num = 1;
6173 }
6174 } else {
6175 /*
6176 * after this, stripe_nr is the number of stripes on this
6177 * device we have to walk to find the data, and stripe_index is
6178 * the number of our device in the stripe array
6179 */
6180 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6181 &stripe_index);
6182 mirror_num = stripe_index + 1;
6183 }
6184 if (stripe_index >= map->num_stripes) {
6185 btrfs_crit(fs_info,
6186 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6187 stripe_index, map->num_stripes);
6188 ret = -EINVAL;
6189 goto out;
6190 }
6191
6192 num_alloc_stripes = num_stripes;
6193 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6194 if (op == BTRFS_MAP_WRITE)
6195 num_alloc_stripes <<= 1;
6196 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6197 num_alloc_stripes++;
6198 tgtdev_indexes = num_stripes;
6199 }
6200
6201 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6202 if (!bbio) {
6203 ret = -ENOMEM;
6204 goto out;
6205 }
6206
6207 for (i = 0; i < num_stripes; i++) {
6208 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6209 stripe_offset + stripe_nr * map->stripe_len;
6210 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6211 stripe_index++;
6212 }
6213
6214 /* build raid_map */
6215 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6216 (need_full_stripe(op) || mirror_num > 1)) {
6217 u64 tmp;
6218 unsigned rot;
6219
6220 /* Work out the disk rotation on this stripe-set */
6221 div_u64_rem(stripe_nr, num_stripes, &rot);
6222
6223 /* Fill in the logical address of each stripe */
6224 tmp = stripe_nr * data_stripes;
6225 for (i = 0; i < data_stripes; i++)
6226 bbio->raid_map[(i+rot) % num_stripes] =
6227 em->start + (tmp + i) * map->stripe_len;
6228
6229 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6230 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6231 bbio->raid_map[(i+rot+1) % num_stripes] =
6232 RAID6_Q_STRIPE;
6233
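/*
 * Illustrative example: for RAID5 over three devices with full-stripe
 * number 1, rot = 1, so RAID5_P_STRIPE (a huge sentinel value) initially
 * lands at raid_map index 0; the sort below then moves the
 * parity/syndrome stripes to the end of the stripe array.
 */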
6234 sort_parity_stripes(bbio, num_stripes);
6235 }
6236
6237 if (need_full_stripe(op))
6238 max_errors = btrfs_chunk_max_errors(map);
6239
6240 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6241 need_full_stripe(op)) {
6242 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6243 &max_errors);
6244 }
6245
6246 *bbio_ret = bbio;
6247 bbio->map_type = map->type;
6248 bbio->num_stripes = num_stripes;
6249 bbio->max_errors = max_errors;
6250 bbio->mirror_num = mirror_num;
6251
6252 /*
6253 * This handles the case where REQ_READ && dev_replace_is_ongoing &&
6254 * mirror_num == num_stripes + 1 && dev_replace target drive is
6255 * available as a mirror
6256 */
6257 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6258 WARN_ON(num_stripes > 1);
6259 bbio->stripes[0].dev = dev_replace->tgtdev;
6260 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6261 bbio->mirror_num = map->num_stripes + 1;
6262 }
6263 out:
6264 if (dev_replace_is_ongoing) {
6265 lockdep_assert_held(&dev_replace->rwsem);
6266 /* Unlock and let waiting writers proceed */
6267 up_read(&dev_replace->rwsem);
6268 }
6269 free_extent_map(em);
6270 return ret;
6271 }
6272
6273 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6274 u64 logical, u64 *length,
6275 struct btrfs_bio **bbio_ret, int mirror_num)
6276 {
6277 if (op == BTRFS_MAP_DISCARD)
6278 return __btrfs_map_block_for_discard(fs_info, logical,
6279 length, bbio_ret);
6280
6281 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6282 mirror_num, 0);
6283 }
6284
6285 /* For Scrub/replace */
6286 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6287 u64 logical, u64 *length,
6288 struct btrfs_bio **bbio_ret)
6289 {
6290 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6291 }
6292
6293 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6294 {
6295 bio->bi_private = bbio->private;
6296 bio->bi_end_io = bbio->end_io;
6297 bio_endio(bio);
6298
6299 btrfs_put_bbio(bbio);
6300 }
6301
6302 static void btrfs_end_bio(struct bio *bio)
6303 {
6304 struct btrfs_bio *bbio = bio->bi_private;
6305 int is_orig_bio = 0;
6306
6307 if (bio->bi_status) {
6308 atomic_inc(&bbio->error);
6309 if (bio->bi_status == BLK_STS_IOERR ||
6310 bio->bi_status == BLK_STS_TARGET) {
6311 struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6312
6313 ASSERT(dev->bdev);
6314 if (bio_op(bio) == REQ_OP_WRITE)
6315 btrfs_dev_stat_inc_and_print(dev,
6316 BTRFS_DEV_STAT_WRITE_ERRS);
6317 else if (!(bio->bi_opf & REQ_RAHEAD))
6318 btrfs_dev_stat_inc_and_print(dev,
6319 BTRFS_DEV_STAT_READ_ERRS);
6320 if (bio->bi_opf & REQ_PREFLUSH)
6321 btrfs_dev_stat_inc_and_print(dev,
6322 BTRFS_DEV_STAT_FLUSH_ERRS);
6323 }
6324 }
6325
6326 if (bio == bbio->orig_bio)
6327 is_orig_bio = 1;
6328
6329 btrfs_bio_counter_dec(bbio->fs_info);
6330
6331 if (atomic_dec_and_test(&bbio->stripes_pending)) {
6332 if (!is_orig_bio) {
6333 bio_put(bio);
6334 bio = bbio->orig_bio;
6335 }
6336
6337 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6338 /* only send an error to the higher layers if it is
6339 * beyond the tolerance of the btrfs bio
6340 */
6341 if (atomic_read(&bbio->error) > bbio->max_errors) {
6342 bio->bi_status = BLK_STS_IOERR;
6343 } else {
6344 /*
6345 * this bio is actually up to date, we didn't
6346 * go over the max number of errors
6347 */
6348 bio->bi_status = BLK_STS_OK;
6349 }
6350
6351 btrfs_end_bbio(bbio, bio);
6352 } else if (!is_orig_bio) {
6353 bio_put(bio);
6354 }
6355 }
6356
6357 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6358 u64 physical, struct btrfs_device *dev)
6359 {
6360 struct btrfs_fs_info *fs_info = bbio->fs_info;
6361
6362 bio->bi_private = bbio;
6363 btrfs_io_bio(bio)->device = dev;
6364 bio->bi_end_io = btrfs_end_bio;
6365 bio->bi_iter.bi_sector = physical >> 9;
6366 btrfs_debug_in_rcu(fs_info,
6367 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6368 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6369 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6370 dev->devid, bio->bi_iter.bi_size);
6371 bio_set_dev(bio, dev->bdev);
6372
6373 btrfs_bio_counter_inc_noblocked(fs_info);
6374
6375 btrfsic_submit_bio(bio);
6376 }
6377
6378 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6379 {
6380 atomic_inc(&bbio->error);
6381 if (atomic_dec_and_test(&bbio->stripes_pending)) {
6382 /* Should be the original bio. */
6383 WARN_ON(bio != bbio->orig_bio);
6384
6385 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6386 bio->bi_iter.bi_sector = logical >> 9;
6387 if (atomic_read(&bbio->error) > bbio->max_errors)
6388 bio->bi_status = BLK_STS_IOERR;
6389 else
6390 bio->bi_status = BLK_STS_OK;
6391 btrfs_end_bbio(bbio, bio);
6392 }
6393 }
6394
6395 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6396 int mirror_num)
6397 {
6398 struct btrfs_device *dev;
6399 struct bio *first_bio = bio;
6400 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6401 u64 length = 0;
6402 u64 map_length;
6403 int ret;
6404 int dev_nr;
6405 int total_devs;
6406 struct btrfs_bio *bbio = NULL;
6407
6408 length = bio->bi_iter.bi_size;
6409 map_length = length;
6410
6411 btrfs_bio_counter_inc_blocked(fs_info);
6412 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6413 &map_length, &bbio, mirror_num, 1);
6414 if (ret) {
6415 btrfs_bio_counter_dec(fs_info);
6416 return errno_to_blk_status(ret);
6417 }
6418
6419 total_devs = bbio->num_stripes;
6420 bbio->orig_bio = first_bio;
6421 bbio->private = first_bio->bi_private;
6422 bbio->end_io = first_bio->bi_end_io;
6423 bbio->fs_info = fs_info;
6424 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6425
6426 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6427 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6428 /* In this case, map_length has been set to the length of
6429 a single stripe; not the whole write */
6430 if (bio_op(bio) == REQ_OP_WRITE) {
6431 ret = raid56_parity_write(fs_info, bio, bbio,
6432 map_length);
6433 } else {
6434 ret = raid56_parity_recover(fs_info, bio, bbio,
6435 map_length, mirror_num, 1);
6436 }
6437
6438 btrfs_bio_counter_dec(fs_info);
6439 return errno_to_blk_status(ret);
6440 }
6441
6442 if (map_length < length) {
6443 btrfs_crit(fs_info,
6444 "mapping failed logical %llu bio len %llu len %llu",
6445 logical, length, map_length);
6446 BUG();
6447 }
6448
6449 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6450 dev = bbio->stripes[dev_nr].dev;
6451 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6452 &dev->dev_state) ||
6453 (bio_op(first_bio) == REQ_OP_WRITE &&
6454 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6455 bbio_error(bbio, first_bio, logical);
6456 continue;
6457 }
6458
6459 if (dev_nr < total_devs - 1)
6460 bio = btrfs_bio_clone(first_bio);
6461 else
6462 bio = first_bio;
6463
6464 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6465 }
6466 btrfs_bio_counter_dec(fs_info);
6467 return BLK_STS_OK;
6468 }
6469
6470 /*
6471 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6472 * return NULL.
6473 *
6474 * If devid and uuid are both specified, the match must be exact, otherwise
6475 * only devid is used.
6476 *
6477 * If @seed is true, traverse through the seed devices.
6478 */
6479 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6480 u64 devid, u8 *uuid, u8 *fsid,
6481 bool seed)
6482 {
6483 struct btrfs_device *device;
6484 struct btrfs_fs_devices *seed_devs;
6485
6486 if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6487 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6488 if (device->devid == devid &&
6489 (!uuid || memcmp(device->uuid, uuid,
6490 BTRFS_UUID_SIZE) == 0))
6491 return device;
6492 }
6493 }
6494
6495 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6496 if (!fsid ||
6497 !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6498 list_for_each_entry(device, &seed_devs->devices,
6499 dev_list) {
6500 if (device->devid == devid &&
6501 (!uuid || memcmp(device->uuid, uuid,
6502 BTRFS_UUID_SIZE) == 0))
6503 return device;
6504 }
6505 }
6506 }
6507
6508 return NULL;
6509 }
6510
6511 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6512 u64 devid, u8 *dev_uuid)
6513 {
6514 struct btrfs_device *device;
6515 unsigned int nofs_flag;
6516
6517 /*
6518 * We call this under the chunk_mutex, so we want to use NOFS for this
6519 * allocation, however we don't want to change btrfs_alloc_device() to
6520 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6521 * places.
6522 */
6523 nofs_flag = memalloc_nofs_save();
6524 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6525 memalloc_nofs_restore(nofs_flag);
6526 if (IS_ERR(device))
6527 return device;
6528
6529 list_add(&device->dev_list, &fs_devices->devices);
6530 device->fs_devices = fs_devices;
6531 fs_devices->num_devices++;
6532
6533 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6534 fs_devices->missing_devices++;
6535
6536 return device;
6537 }
6538
6539 /**
6540 * btrfs_alloc_device - allocate struct btrfs_device
6541 * @fs_info: used only for generating a new devid, can be NULL if
6542 * devid is provided (i.e. @devid != NULL).
6543 * @devid: a pointer to devid for this device. If NULL a new devid
6544 * is generated.
6545 * @uuid: a pointer to UUID for this device. If NULL a new UUID
6546 * is generated.
6547 *
6548 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6549 * on error. Returned struct is not linked onto any lists and must be
6550 * destroyed with btrfs_free_device.
6551 */
6552 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6553 const u64 *devid,
6554 const u8 *uuid)
6555 {
6556 struct btrfs_device *dev;
6557 u64 tmp;
6558
6559 if (WARN_ON(!devid && !fs_info))
6560 return ERR_PTR(-EINVAL);
6561
6562 dev = __alloc_device(fs_info);
6563 if (IS_ERR(dev))
6564 return dev;
6565
6566 if (devid)
6567 tmp = *devid;
6568 else {
6569 int ret;
6570
6571 ret = find_next_devid(fs_info, &tmp);
6572 if (ret) {
6573 btrfs_free_device(dev);
6574 return ERR_PTR(ret);
6575 }
6576 }
6577 dev->devid = tmp;
6578
6579 if (uuid)
6580 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6581 else
6582 generate_random_uuid(dev->uuid);
6583
6584 return dev;
6585 }
6586
6587 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6588 u64 devid, u8 *uuid, bool error)
6589 {
6590 if (error)
6591 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6592 devid, uuid);
6593 else
6594 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6595 devid, uuid);
6596 }
6597
6598 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6599 {
6600 int index = btrfs_bg_flags_to_raid_index(type);
6601 int ncopies = btrfs_raid_array[index].ncopies;
6602 const int nparity = btrfs_raid_array[index].nparity;
6603 int data_stripes;
6604
6605 if (nparity)
6606 data_stripes = num_stripes - nparity;
6607 else
6608 data_stripes = num_stripes / ncopies;
6609
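/*
 * Illustrative example: a 4GiB RAID6 chunk with num_stripes = 6 has
 * nparity = 2, so data_stripes = 4 and each stripe is 1GiB long; a RAID1
 * chunk (ncopies = 2, nparity = 0) with num_stripes = 2 has
 * data_stripes = 1, i.e. the stripe length equals the chunk length.
 */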
6610 return div_u64(chunk_len, data_stripes);
6611 }
6612
6613 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6614 struct btrfs_chunk *chunk)
6615 {
6616 struct btrfs_fs_info *fs_info = leaf->fs_info;
6617 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6618 struct map_lookup *map;
6619 struct extent_map *em;
6620 u64 logical;
6621 u64 length;
6622 u64 devid;
6623 u8 uuid[BTRFS_UUID_SIZE];
6624 int num_stripes;
6625 int ret;
6626 int i;
6627
6628 logical = key->offset;
6629 length = btrfs_chunk_length(leaf, chunk);
6630 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6631
6632 /*
6633 * Only need to verify chunk item if we're reading from sys chunk array,
6634 * as chunk item in tree block is already verified by tree-checker.
6635 */
6636 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6637 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6638 if (ret)
6639 return ret;
6640 }
6641
6642 read_lock(&map_tree->lock);
6643 em = lookup_extent_mapping(map_tree, logical, 1);
6644 read_unlock(&map_tree->lock);
6645
6646 /* already mapped? */
6647 if (em && em->start <= logical && em->start + em->len > logical) {
6648 free_extent_map(em);
6649 return 0;
6650 } else if (em) {
6651 free_extent_map(em);
6652 }
6653
6654 em = alloc_extent_map();
6655 if (!em)
6656 return -ENOMEM;
6657 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6658 if (!map) {
6659 free_extent_map(em);
6660 return -ENOMEM;
6661 }
6662
6663 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6664 em->map_lookup = map;
6665 em->start = logical;
6666 em->len = length;
6667 em->orig_start = 0;
6668 em->block_start = 0;
6669 em->block_len = em->len;
6670
6671 map->num_stripes = num_stripes;
6672 map->io_width = btrfs_chunk_io_width(leaf, chunk);
6673 map->io_align = btrfs_chunk_io_align(leaf, chunk);
6674 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6675 map->type = btrfs_chunk_type(leaf, chunk);
6676 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6677 map->verified_stripes = 0;
6678 em->orig_block_len = calc_stripe_length(map->type, em->len,
6679 map->num_stripes);
6680 for (i = 0; i < num_stripes; i++) {
6681 map->stripes[i].physical =
6682 btrfs_stripe_offset_nr(leaf, chunk, i);
6683 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6684 read_extent_buffer(leaf, uuid, (unsigned long)
6685 btrfs_stripe_dev_uuid_nr(chunk, i),
6686 BTRFS_UUID_SIZE);
6687 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6688 devid, uuid, NULL, true);
6689 if (!map->stripes[i].dev &&
6690 !btrfs_test_opt(fs_info, DEGRADED)) {
6691 free_extent_map(em);
6692 btrfs_report_missing_device(fs_info, devid, uuid, true);
6693 return -ENOENT;
6694 }
6695 if (!map->stripes[i].dev) {
6696 map->stripes[i].dev =
6697 add_missing_dev(fs_info->fs_devices, devid,
6698 uuid);
6699 if (IS_ERR(map->stripes[i].dev)) {
6700 free_extent_map(em);
6701 btrfs_err(fs_info,
6702 "failed to init missing dev %llu: %ld",
6703 devid, PTR_ERR(map->stripes[i].dev));
6704 return PTR_ERR(map->stripes[i].dev);
6705 }
6706 btrfs_report_missing_device(fs_info, devid, uuid, false);
6707 }
6708 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6709 &(map->stripes[i].dev->dev_state));
6710
6711 }
6712
6713 write_lock(&map_tree->lock);
6714 ret = add_extent_mapping(map_tree, em, 0);
6715 write_unlock(&map_tree->lock);
6716 if (ret < 0) {
6717 btrfs_err(fs_info,
6718 "failed to add chunk map, start=%llu len=%llu: %d",
6719 em->start, em->len, ret);
6720 }
6721 free_extent_map(em);
6722
6723 return ret;
6724 }
6725
6726 static void fill_device_from_item(struct extent_buffer *leaf,
6727 struct btrfs_dev_item *dev_item,
6728 struct btrfs_device *device)
6729 {
6730 unsigned long ptr;
6731
6732 device->devid = btrfs_device_id(leaf, dev_item);
6733 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6734 device->total_bytes = device->disk_total_bytes;
6735 device->commit_total_bytes = device->disk_total_bytes;
6736 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6737 device->commit_bytes_used = device->bytes_used;
6738 device->type = btrfs_device_type(leaf, dev_item);
6739 device->io_align = btrfs_device_io_align(leaf, dev_item);
6740 device->io_width = btrfs_device_io_width(leaf, dev_item);
6741 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6742 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6743 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6744
6745 ptr = btrfs_device_uuid(dev_item);
6746 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6747 }
6748
6749 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6750 u8 *fsid)
6751 {
6752 struct btrfs_fs_devices *fs_devices;
6753 int ret;
6754
6755 lockdep_assert_held(&uuid_mutex);
6756 ASSERT(fsid);
6757
6758 /* This will match only for multi-device seed fs */
6759 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6760 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6761 return fs_devices;
6762
6763
6764 fs_devices = find_fsid(fsid, NULL);
6765 if (!fs_devices) {
6766 if (!btrfs_test_opt(fs_info, DEGRADED))
6767 return ERR_PTR(-ENOENT);
6768
6769 fs_devices = alloc_fs_devices(fsid, NULL);
6770 if (IS_ERR(fs_devices))
6771 return fs_devices;
6772
6773 fs_devices->seeding = true;
6774 fs_devices->opened = 1;
6775 return fs_devices;
6776 }
6777
6778 /*
6779 * Upon first call for a seed fs fsid, just create a private copy of the
6780 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6781 */
6782 fs_devices = clone_fs_devices(fs_devices);
6783 if (IS_ERR(fs_devices))
6784 return fs_devices;
6785
6786 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6787 if (ret) {
6788 free_fs_devices(fs_devices);
6789 return ERR_PTR(ret);
6790 }
6791
6792 if (!fs_devices->seeding) {
6793 close_fs_devices(fs_devices);
6794 free_fs_devices(fs_devices);
6795 return ERR_PTR(-EINVAL);
6796 }
6797
6798 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6799
6800 return fs_devices;
6801 }
6802
6803 static int read_one_dev(struct extent_buffer *leaf,
6804 struct btrfs_dev_item *dev_item)
6805 {
6806 struct btrfs_fs_info *fs_info = leaf->fs_info;
6807 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6808 struct btrfs_device *device;
6809 u64 devid;
6810 int ret;
6811 u8 fs_uuid[BTRFS_FSID_SIZE];
6812 u8 dev_uuid[BTRFS_UUID_SIZE];
6813
6814 devid = btrfs_device_id(leaf, dev_item);
6815 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6816 BTRFS_UUID_SIZE);
6817 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6818 BTRFS_FSID_SIZE);
6819
6820 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6821 fs_devices = open_seed_devices(fs_info, fs_uuid);
6822 if (IS_ERR(fs_devices))
6823 return PTR_ERR(fs_devices);
6824 }
6825
6826 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6827 fs_uuid, true);
6828 if (!device) {
6829 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6830 btrfs_report_missing_device(fs_info, devid,
6831 dev_uuid, true);
6832 return -ENOENT;
6833 }
6834
6835 device = add_missing_dev(fs_devices, devid, dev_uuid);
6836 if (IS_ERR(device)) {
6837 btrfs_err(fs_info,
6838 "failed to add missing dev %llu: %ld",
6839 devid, PTR_ERR(device));
6840 return PTR_ERR(device);
6841 }
6842 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6843 } else {
6844 if (!device->bdev) {
6845 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6846 btrfs_report_missing_device(fs_info,
6847 devid, dev_uuid, true);
6848 return -ENOENT;
6849 }
6850 btrfs_report_missing_device(fs_info, devid,
6851 dev_uuid, false);
6852 }
6853
6854 if (!device->bdev &&
6855 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6856 /*
6857 * This happens when a device that was properly set up
6858 * in the device info lists suddenly goes bad.
6859 * device->bdev is NULL, and so we have to set the
6860 * BTRFS_DEV_STATE_MISSING bit here.
6861 */
6862 device->fs_devices->missing_devices++;
6863 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6864 }
6865
6866 /* Move the device to its own fs_devices */
6867 if (device->fs_devices != fs_devices) {
6868 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6869 &device->dev_state));
6870
6871 list_move(&device->dev_list, &fs_devices->devices);
6872 device->fs_devices->num_devices--;
6873 fs_devices->num_devices++;
6874
6875 device->fs_devices->missing_devices--;
6876 fs_devices->missing_devices++;
6877
6878 device->fs_devices = fs_devices;
6879 }
6880 }
6881
6882 if (device->fs_devices != fs_info->fs_devices) {
6883 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6884 if (device->generation !=
6885 btrfs_device_generation(leaf, dev_item))
6886 return -EINVAL;
6887 }
6888
6889 fill_device_from_item(leaf, dev_item, device);
6890 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6891 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6892 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6893 device->fs_devices->total_rw_bytes += device->total_bytes;
6894 atomic64_add(device->total_bytes - device->bytes_used,
6895 &fs_info->free_chunk_space);
6896 }
6897 ret = 0;
6898 return ret;
6899 }
6900
6901 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6902 {
6903 struct btrfs_root *root = fs_info->tree_root;
6904 struct btrfs_super_block *super_copy = fs_info->super_copy;
6905 struct extent_buffer *sb;
6906 struct btrfs_disk_key *disk_key;
6907 struct btrfs_chunk *chunk;
6908 u8 *array_ptr;
6909 unsigned long sb_array_offset;
6910 int ret = 0;
6911 u32 num_stripes;
6912 u32 array_size;
6913 u32 len = 0;
6914 u32 cur_offset;
6915 u64 type;
6916 struct btrfs_key key;
6917
6918 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6919 /*
6920 * This will create an extent buffer of nodesize; the superblock size is
6921 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6922 * overallocate but we can keep it as-is, only the first page is used.
6923 */
6924 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6925 if (IS_ERR(sb))
6926 return PTR_ERR(sb);
6927 set_extent_buffer_uptodate(sb);
6928 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6929 /*
6930 * The sb extent buffer is artificial and just used to read the system array.
6931 * set_extent_buffer_uptodate() call does not properly mark all its
6932 * pages up-to-date when the page is larger: extent does not cover the
6933 * whole page and consequently check_page_uptodate does not find all
6934 * the page's extents up-to-date (the hole beyond sb),
6935 * write_extent_buffer then triggers a WARN_ON.
6936 *
6937 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6938 * but sb spans only this function. Add an explicit SetPageUptodate call
6939 * to silence the warning eg. on PowerPC 64.
6940 */
6941 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6942 SetPageUptodate(sb->pages[0]);
6943
6944 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6945 array_size = btrfs_super_sys_array_size(super_copy);
6946
6947 array_ptr = super_copy->sys_chunk_array;
6948 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6949 cur_offset = 0;
6950
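/*
 * The sys_chunk_array is a packed sequence of pairs, illustratively:
 *
 *   [disk_key][chunk header + stripes][disk_key][chunk header + stripes]...
 *
 * so each iteration below consumes one key followed by one chunk item
 * whose size depends on its stripe count.
 */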
6951 while (cur_offset < array_size) {
6952 disk_key = (struct btrfs_disk_key *)array_ptr;
6953 len = sizeof(*disk_key);
6954 if (cur_offset + len > array_size)
6955 goto out_short_read;
6956
6957 btrfs_disk_key_to_cpu(&key, disk_key);
6958
6959 array_ptr += len;
6960 sb_array_offset += len;
6961 cur_offset += len;
6962
6963 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
6964 btrfs_err(fs_info,
6965 "unexpected item type %u in sys_array at offset %u",
6966 (u32)key.type, cur_offset);
6967 ret = -EIO;
6968 break;
6969 }
6970
6971 chunk = (struct btrfs_chunk *)sb_array_offset;
6972 /*
6973 * At least one btrfs_chunk with one stripe must be present,
6974 * exact stripe count check comes afterwards
6975 */
6976 len = btrfs_chunk_item_size(1);
6977 if (cur_offset + len > array_size)
6978 goto out_short_read;
6979
6980 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6981 if (!num_stripes) {
6982 btrfs_err(fs_info,
6983 "invalid number of stripes %u in sys_array at offset %u",
6984 num_stripes, cur_offset);
6985 ret = -EIO;
6986 break;
6987 }
6988
6989 type = btrfs_chunk_type(sb, chunk);
6990 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6991 btrfs_err(fs_info,
6992 "invalid chunk type %llu in sys_array at offset %u",
6993 type, cur_offset);
6994 ret = -EIO;
6995 break;
6996 }
6997
6998 len = btrfs_chunk_item_size(num_stripes);
6999 if (cur_offset + len > array_size)
7000 goto out_short_read;
7001
7002 ret = read_one_chunk(&key, sb, chunk);
7003 if (ret)
7004 break;
7005
7006 array_ptr += len;
7007 sb_array_offset += len;
7008 cur_offset += len;
7009 }
7010 clear_extent_buffer_uptodate(sb);
7011 free_extent_buffer_stale(sb);
7012 return ret;
7013
7014 out_short_read:
7015 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7016 len, cur_offset);
7017 clear_extent_buffer_uptodate(sb);
7018 free_extent_buffer_stale(sb);
7019 return -EIO;
7020 }
7021
7022 /*
7023 * Check if all chunks in the fs are OK for read-write degraded mount
7024 *
7025 * If the @failing_dev is specified, it's accounted as missing.
7026 *
7027 * Return true if all chunks meet the minimal RW mount requirements.
7028 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7029 */
7030 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7031 struct btrfs_device *failing_dev)
7032 {
7033 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7034 struct extent_map *em;
7035 u64 next_start = 0;
7036 bool ret = true;
7037
7038 read_lock(&map_tree->lock);
7039 em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7040 read_unlock(&map_tree->lock);
7041 /* No chunk at all? Return false anyway */
7042 if (!em) {
7043 ret = false;
7044 goto out;
7045 }
7046 while (em) {
7047 struct map_lookup *map;
7048 int missing = 0;
7049 int max_tolerated;
7050 int i;
7051
7052 map = em->map_lookup;
7053 max_tolerated =
7054 btrfs_get_num_tolerated_disk_barrier_failures(
7055 map->type);
7056 for (i = 0; i < map->num_stripes; i++) {
7057 struct btrfs_device *dev = map->stripes[i].dev;
7058
7059 if (!dev || !dev->bdev ||
7060 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7061 dev->last_flush_error)
7062 missing++;
7063 else if (failing_dev && failing_dev == dev)
7064 missing++;
7065 }
7066 if (missing > max_tolerated) {
7067 if (!failing_dev)
7068 btrfs_warn(fs_info,
7069 "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7070 em->start, missing, max_tolerated);
7071 free_extent_map(em);
7072 ret = false;
7073 goto out;
7074 }
7075 next_start = extent_map_end(em);
7076 free_extent_map(em);
7077
7078 read_lock(&map_tree->lock);
7079 em = lookup_extent_mapping(map_tree, next_start,
7080 (u64)(-1) - next_start);
7081 read_unlock(&map_tree->lock);
7082 }
7083 out:
7084 return ret;
7085 }
7086
7087 static void readahead_tree_node_children(struct extent_buffer *node)
7088 {
7089 int i;
7090 const int nr_items = btrfs_header_nritems(node);
7091
7092 for (i = 0; i < nr_items; i++) {
7093 u64 start;
7094
7095 start = btrfs_node_blockptr(node, i);
7096 readahead_tree_block(node->fs_info, start);
7097 }
7098 }
7099
7100 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7101 {
7102 struct btrfs_root *root = fs_info->chunk_root;
7103 struct btrfs_path *path;
7104 struct extent_buffer *leaf;
7105 struct btrfs_key key;
7106 struct btrfs_key found_key;
7107 int ret;
7108 int slot;
7109 u64 total_dev = 0;
7110 u64 last_ra_node = 0;
7111
7112 path = btrfs_alloc_path();
7113 if (!path)
7114 return -ENOMEM;
7115
7116 /*
7117 * uuid_mutex is needed only if we are mounting a sprout FS, i.e. one
7118 * that still references a seed filesystem's devices.
7119 */
7120 mutex_lock(&uuid_mutex);
7121
7122 /*
7123 * It is possible for mount and umount to race in such a way that
7124 * we execute this code path, but open_fs_devices failed to clear
7125 * total_rw_bytes. We certainly want it cleared before reading the
7126 * device items, so clear it here.
7127 */
7128 fs_info->fs_devices->total_rw_bytes = 0;
7129
7130 /*
7131 * Read all device items, and then all the chunk items. All
7132 * device items are found before any chunk item (their object id
7133 * is smaller than the lowest possible object id for a chunk
7134 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7135 */
7136 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7137 key.offset = 0;
7138 key.type = 0;
7139 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7140 if (ret < 0)
7141 goto error;
7142 while (1) {
7143 struct extent_buffer *node;
7144
7145 leaf = path->nodes[0];
7146 slot = path->slots[0];
7147 if (slot >= btrfs_header_nritems(leaf)) {
7148 ret = btrfs_next_leaf(root, path);
7149 if (ret == 0)
7150 continue;
7151 if (ret < 0)
7152 goto error;
7153 break;
7154 }
7155 /*
7156 * The nodes on level 1 are not locked but we don't need to do
7157 * that during mount time as nothing else can access the tree
7158 */
7159 node = path->nodes[1];
7160 if (node) {
7161 if (last_ra_node != node->start) {
7162 readahead_tree_node_children(node);
7163 last_ra_node = node->start;
7164 }
7165 }
7166 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7167 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7168 struct btrfs_dev_item *dev_item;
7169 dev_item = btrfs_item_ptr(leaf, slot,
7170 struct btrfs_dev_item);
7171 ret = read_one_dev(leaf, dev_item);
7172 if (ret)
7173 goto error;
7174 total_dev++;
7175 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7176 struct btrfs_chunk *chunk;
7177 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7178 mutex_lock(&fs_info->chunk_mutex);
7179 ret = read_one_chunk(&found_key, leaf, chunk);
7180 mutex_unlock(&fs_info->chunk_mutex);
7181 if (ret)
7182 goto error;
7183 }
7184 path->slots[0]++;
7185 }
7186
7187 /*
7188 * After loading chunk tree, we've got all device information,
7189 * do another round of validation checks.
7190 */
7191 if (total_dev != fs_info->fs_devices->total_devices) {
7192 btrfs_err(fs_info,
7193 "super_num_devices %llu mismatch with num_devices %llu found here",
7194 btrfs_super_num_devices(fs_info->super_copy),
7195 total_dev);
7196 ret = -EINVAL;
7197 goto error;
7198 }
7199 if (btrfs_super_total_bytes(fs_info->super_copy) <
7200 fs_info->fs_devices->total_rw_bytes) {
7201 btrfs_err(fs_info,
7202 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7203 btrfs_super_total_bytes(fs_info->super_copy),
7204 fs_info->fs_devices->total_rw_bytes);
7205 ret = -EINVAL;
7206 goto error;
7207 }
7208 ret = 0;
7209 error:
7210 mutex_unlock(&uuid_mutex);
7211
7212 btrfs_free_path(path);
7213 return ret;
7214 }
7215
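/*
 * Associate every known device, including the devices of all seed
 * filesystems, with the given fs_info once it is available.
 */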
7216 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7217 {
7218 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7219 struct btrfs_device *device;
7220
7221 fs_devices->fs_info = fs_info;
7222
7223 mutex_lock(&fs_devices->device_list_mutex);
7224 list_for_each_entry(device, &fs_devices->devices, dev_list)
7225 device->fs_info = fs_info;
7226
7227 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7228 list_for_each_entry(device, &seed_devs->devices, dev_list)
7229 device->fs_info = fs_info;
7230
7231 seed_devs->fs_info = fs_info;
7232 }
7233 mutex_unlock(&fs_devices->device_list_mutex);
7234 }
7235
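/*
 * Read the index-th u64 counter of a dev stats item straight from the extent
 * buffer; the setter below is the write-side counterpart.
 */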
7236 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7237 const struct btrfs_dev_stats_item *ptr,
7238 int index)
7239 {
7240 u64 val;
7241
7242 read_extent_buffer(eb, &val,
7243 offsetof(struct btrfs_dev_stats_item, values) +
7244 ((unsigned long)ptr) + (index * sizeof(u64)),
7245 sizeof(val));
7246 return val;
7247 }
7248
7249 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7250 struct btrfs_dev_stats_item *ptr,
7251 int index, u64 val)
7252 {
7253 write_extent_buffer(eb, &val,
7254 offsetof(struct btrfs_dev_stats_item, values) +
7255 ((unsigned long)ptr) + (index * sizeof(u64)),
7256 sizeof(val));
7257 }
7258
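/*
 * Load the persistent statistics item of a device from the device tree. If no
 * item exists (or the lookup fails) all counters are zeroed; counters missing
 * from an older, smaller item are zeroed as well.
 */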
7259 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7260 struct btrfs_path *path)
7261 {
7262 struct btrfs_dev_stats_item *ptr;
7263 struct extent_buffer *eb;
7264 struct btrfs_key key;
7265 int item_size;
7266 int i, ret, slot;
7267
7268 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7269 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7270 key.offset = device->devid;
7271 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7272 if (ret) {
7273 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7274 btrfs_dev_stat_set(device, i, 0);
7275 device->dev_stats_valid = 1;
7276 btrfs_release_path(path);
7277 return ret < 0 ? ret : 0;
7278 }
7279 slot = path->slots[0];
7280 eb = path->nodes[0];
7281 item_size = btrfs_item_size_nr(eb, slot);
7282
7283 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7284
7285 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7286 if (item_size >= (1 + i) * sizeof(__le64))
7287 btrfs_dev_stat_set(device, i,
7288 btrfs_dev_stats_value(eb, ptr, i));
7289 else
7290 btrfs_dev_stat_set(device, i, 0);
7291 }
7292
7293 device->dev_stats_valid = 1;
7294 btrfs_dev_stat_print_on_load(device);
7295 btrfs_release_path(path);
7296
7297 return 0;
7298 }
7299
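/*
 * Initialize the statistics of every device in the filesystem, including the
 * devices of all seed filesystems.
 */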
7300 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7301 {
7302 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7303 struct btrfs_device *device;
7304 struct btrfs_path *path = NULL;
7305 int ret = 0;
7306
7307 path = btrfs_alloc_path();
7308 if (!path)
7309 return -ENOMEM;
7310
7311 mutex_lock(&fs_devices->device_list_mutex);
7312 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7313 ret = btrfs_device_init_dev_stats(device, path);
7314 if (ret)
7315 goto out;
7316 }
7317 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7318 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7319 ret = btrfs_device_init_dev_stats(device, path);
7320 if (ret)
7321 goto out;
7322 }
7323 }
7324 out:
7325 mutex_unlock(&fs_devices->device_list_mutex);
7326
7327 btrfs_free_path(path);
7328 return ret;
7329 }
7330
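/*
 * Write the in-memory statistics counters of a device into its dev stats item
 * in the device tree, recreating the item if it is missing or too small.
 */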
7331 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7332 struct btrfs_device *device)
7333 {
7334 struct btrfs_fs_info *fs_info = trans->fs_info;
7335 struct btrfs_root *dev_root = fs_info->dev_root;
7336 struct btrfs_path *path;
7337 struct btrfs_key key;
7338 struct extent_buffer *eb;
7339 struct btrfs_dev_stats_item *ptr;
7340 int ret;
7341 int i;
7342
7343 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7344 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7345 key.offset = device->devid;
7346
7347 path = btrfs_alloc_path();
7348 if (!path)
7349 return -ENOMEM;
7350 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7351 if (ret < 0) {
7352 btrfs_warn_in_rcu(fs_info,
7353 "error %d while searching for dev_stats item for device %s",
7354 ret, rcu_str_deref(device->name));
7355 goto out;
7356 }
7357
7358 if (ret == 0 &&
7359 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7360 /* need to delete old one and insert a new one */
7361 ret = btrfs_del_item(trans, dev_root, path);
7362 if (ret != 0) {
7363 btrfs_warn_in_rcu(fs_info,
7364 "delete too small dev_stats item for device %s failed %d",
7365 rcu_str_deref(device->name), ret);
7366 goto out;
7367 }
7368 ret = 1;
7369 }
7370
7371 if (ret == 1) {
7372 /* need to insert a new item */
7373 btrfs_release_path(path);
7374 ret = btrfs_insert_empty_item(trans, dev_root, path,
7375 &key, sizeof(*ptr));
7376 if (ret < 0) {
7377 btrfs_warn_in_rcu(fs_info,
7378 "insert dev_stats item for device %s failed %d",
7379 rcu_str_deref(device->name), ret);
7380 goto out;
7381 }
7382 }
7383
7384 eb = path->nodes[0];
7385 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7386 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7387 btrfs_set_dev_stats_value(eb, ptr, i,
7388 btrfs_dev_stat_read(device, i));
7389 btrfs_mark_buffer_dirty(eb);
7390
7391 out:
7392 btrfs_free_path(path);
7393 return ret;
7394 }
7395
7396 /*
7397 * Called from commit_transaction(). Writes all changed device stats to disk.
7398 */
7399 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7400 {
7401 struct btrfs_fs_info *fs_info = trans->fs_info;
7402 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7403 struct btrfs_device *device;
7404 int stats_cnt;
7405 int ret = 0;
7406
7407 mutex_lock(&fs_devices->device_list_mutex);
7408 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7409 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7410 if (!device->dev_stats_valid || stats_cnt == 0)
7411 continue;
7412
7414 /*
7415 * There is a LOAD-LOAD control dependency between the value of
7416 * dev_stats_ccnt and updating the on-disk values which requires
7417 * reading the in-memory counters. Such control dependencies
7418 * require explicit read memory barriers.
7419 *
7420 * This memory barrier pairs with smp_mb__before_atomic in
7421 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7422 * barrier implied by atomic_xchg in
7423 * btrfs_dev_stats_read_and_reset.
7424 */
7425 smp_rmb();
7426
7427 ret = update_dev_stat_item(trans, device);
7428 if (!ret)
7429 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7430 }
7431 mutex_unlock(&fs_devices->device_list_mutex);
7432
7433 return ret;
7434 }
7435
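/*
 * Bump one statistics counter and emit a rate-limited message with the
 * current counter values.
 */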
7436 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7437 {
7438 btrfs_dev_stat_inc(dev, index);
7439 btrfs_dev_stat_print_on_error(dev);
7440 }
7441
7442 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7443 {
7444 if (!dev->dev_stats_valid)
7445 return;
7446 btrfs_err_rl_in_rcu(dev->fs_info,
7447 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7448 rcu_str_deref(dev->name),
7449 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7450 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7451 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7452 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7453 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7454 }
7455
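/*
 * Print the counters loaded from disk at mount time, unless they are all
 * zero.
 */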
7456 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7457 {
7458 int i;
7459
7460 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7461 if (btrfs_dev_stat_read(dev, i) != 0)
7462 break;
7463 if (i == BTRFS_DEV_STAT_VALUES_MAX)
7464 return; /* all values == 0, suppress message */
7465
7466 btrfs_info_in_rcu(dev->fs_info,
7467 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7468 rcu_str_deref(dev->name),
7469 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7470 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7471 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7472 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7473 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7474 }
7475
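/*
 * Copy the statistics counters of the requested device into the ioctl
 * structure. With BTRFS_DEV_STATS_RESET set, the counters are returned and
 * reset in one step.
 */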
7476 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7477 struct btrfs_ioctl_get_dev_stats *stats)
7478 {
7479 struct btrfs_device *dev;
7480 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7481 int i;
7482
7483 mutex_lock(&fs_devices->device_list_mutex);
7484 dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7485 true);
7486 mutex_unlock(&fs_devices->device_list_mutex);
7487
7488 if (!dev) {
7489 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7490 return -ENODEV;
7491 } else if (!dev->dev_stats_valid) {
7492 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7493 return -ENODEV;
7494 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7495 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7496 if (stats->nr_items > i)
7497 stats->values[i] =
7498 btrfs_dev_stat_read_and_reset(dev, i);
7499 else
7500 btrfs_dev_stat_set(dev, i, 0);
7501 }
7502 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7503 current->comm, task_pid_nr(current));
7504 } else {
7505 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7506 if (stats->nr_items > i)
7507 stats->values[i] = btrfs_dev_stat_read(dev, i);
7508 }
7509 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7510 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7511 return 0;
7512 }
7513
7514 /*
7515 * Update the size and bytes used for each device where it changed. This is
7516 * delayed since we would otherwise get errors while writing out the
7517 * superblocks.
7518 *
7519 * Must be invoked during transaction commit.
7520 */
7521 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7522 {
7523 struct btrfs_device *curr, *next;
7524
7525 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7526
7527 if (list_empty(&trans->dev_update_list))
7528 return;
7529
7530 /*
7531 * We don't need the device_list_mutex here. This list is owned by the
7532 * transaction and the transaction must complete before the device is
7533 * released.
7534 */
7535 mutex_lock(&trans->fs_info->chunk_mutex);
7536 list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7537 post_commit_list) {
7538 list_del_init(&curr->post_commit_list);
7539 curr->commit_total_bytes = curr->disk_total_bytes;
7540 curr->commit_bytes_used = curr->bytes_used;
7541 }
7542 mutex_unlock(&trans->fs_info->chunk_mutex);
7543 }
7544
7545 /*
7546 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7547 */
7548 int btrfs_bg_type_to_factor(u64 flags)
7549 {
7550 const int index = btrfs_bg_flags_to_raid_index(flags);
7551
7552 return btrfs_raid_array[index].ncopies;
7553 }
7554
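/*
 * Verify a single dev extent against the cached chunk mapping: the chunk must
 * exist, the extent length must match the stripe length, the (devid, physical)
 * pair must belong to one of the chunk's stripes, and the extent must not
 * reach beyond the end of the device.
 */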
7557 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7558 u64 chunk_offset, u64 devid,
7559 u64 physical_offset, u64 physical_len)
7560 {
7561 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7562 struct extent_map *em;
7563 struct map_lookup *map;
7564 struct btrfs_device *dev;
7565 u64 stripe_len;
7566 bool found = false;
7567 int ret = 0;
7568 int i;
7569
7570 read_lock(&em_tree->lock);
7571 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7572 read_unlock(&em_tree->lock);
7573
7574 if (!em) {
7575 btrfs_err(fs_info,
7576 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7577 physical_offset, devid);
7578 ret = -EUCLEAN;
7579 goto out;
7580 }
7581
7582 map = em->map_lookup;
7583 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7584 if (physical_len != stripe_len) {
7585 btrfs_err(fs_info,
7586 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7587 physical_offset, devid, em->start, physical_len,
7588 stripe_len);
7589 ret = -EUCLEAN;
7590 goto out;
7591 }
7592
7593 for (i = 0; i < map->num_stripes; i++) {
7594 if (map->stripes[i].dev->devid == devid &&
7595 map->stripes[i].physical == physical_offset) {
7596 found = true;
7597 if (map->verified_stripes >= map->num_stripes) {
7598 btrfs_err(fs_info,
7599 "too many dev extents for chunk %llu found",
7600 em->start);
7601 ret = -EUCLEAN;
7602 goto out;
7603 }
7604 map->verified_stripes++;
7605 break;
7606 }
7607 }
7608 if (!found) {
7609 btrfs_err(fs_info,
7610 "dev extent physical offset %llu devid %llu has no corresponding chunk",
7611 physical_offset, devid);
7612 ret = -EUCLEAN;
7613 }
7614
7615 /* Make sure no dev extent is beyond device boundary */
7616 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7617 if (!dev) {
7618 btrfs_err(fs_info, "failed to find devid %llu", devid);
7619 ret = -EUCLEAN;
7620 goto out;
7621 }
7622
7623 /* It's possible this device is a dummy for seed device */
7624 if (dev->disk_total_bytes == 0) {
7625 struct btrfs_fs_devices *devs;
7626
7627 devs = list_first_entry(&fs_info->fs_devices->seed_list,
7628 struct btrfs_fs_devices, seed_list);
7629 dev = btrfs_find_device(devs, devid, NULL, NULL, false);
7630 if (!dev) {
7631 btrfs_err(fs_info, "failed to find seed devid %llu",
7632 devid);
7633 ret = -EUCLEAN;
7634 goto out;
7635 }
7636 }
7637
7638 if (physical_offset + physical_len > dev->disk_total_bytes) {
7639 btrfs_err(fs_info,
7640 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7641 devid, physical_offset, physical_len,
7642 dev->disk_total_bytes);
7643 ret = -EUCLEAN;
7644 goto out;
7645 }
7646 out:
7647 free_extent_map(em);
7648 return ret;
7649 }
7650
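/*
 * Make sure every cached chunk mapping has seen as many dev extents as it has
 * stripes, i.e. no dev extent is missing.
 */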
7651 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7652 {
7653 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7654 struct extent_map *em;
7655 struct rb_node *node;
7656 int ret = 0;
7657
7658 read_lock(&em_tree->lock);
7659 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7660 em = rb_entry(node, struct extent_map, rb_node);
7661 if (em->map_lookup->num_stripes !=
7662 em->map_lookup->verified_stripes) {
7663 btrfs_err(fs_info,
7664 "chunk %llu has missing dev extent, have %d expect %d",
7665 em->start, em->map_lookup->verified_stripes,
7666 em->map_lookup->num_stripes);
7667 ret = -EUCLEAN;
7668 goto out;
7669 }
7670 }
7671 out:
7672 read_unlock(&em_tree->lock);
7673 return ret;
7674 }
7675
7676 /*
7677 * Ensure that all dev extents are mapped to correct chunk, otherwise
7678 * later chunk allocation/free would cause unexpected behavior.
7679 *
7680 * NOTE: This will iterate through the whole device tree, which should be
7681 * about the same size as the chunk tree. This slightly increases mount time.
7682 */
7683 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7684 {
7685 struct btrfs_path *path;
7686 struct btrfs_root *root = fs_info->dev_root;
7687 struct btrfs_key key;
7688 u64 prev_devid = 0;
7689 u64 prev_dev_ext_end = 0;
7690 int ret = 0;
7691
7692 key.objectid = 1;
7693 key.type = BTRFS_DEV_EXTENT_KEY;
7694 key.offset = 0;
7695
7696 path = btrfs_alloc_path();
7697 if (!path)
7698 return -ENOMEM;
7699
7700 path->reada = READA_FORWARD;
7701 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7702 if (ret < 0)
7703 goto out;
7704
7705 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7706 ret = btrfs_next_item(root, path);
7707 if (ret < 0)
7708 goto out;
7709 /* No dev extents at all? Not good */
7710 if (ret > 0) {
7711 ret = -EUCLEAN;
7712 goto out;
7713 }
7714 }
7715 while (1) {
7716 struct extent_buffer *leaf = path->nodes[0];
7717 struct btrfs_dev_extent *dext;
7718 int slot = path->slots[0];
7719 u64 chunk_offset;
7720 u64 physical_offset;
7721 u64 physical_len;
7722 u64 devid;
7723
7724 btrfs_item_key_to_cpu(leaf, &key, slot);
7725 if (key.type != BTRFS_DEV_EXTENT_KEY)
7726 break;
7727 devid = key.objectid;
7728 physical_offset = key.offset;
7729
7730 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7731 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7732 physical_len = btrfs_dev_extent_length(leaf, dext);
7733
7734 /* Check if this dev extent overlaps with the previous one */
7735 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7736 btrfs_err(fs_info,
7737 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7738 devid, physical_offset, prev_dev_ext_end);
7739 ret = -EUCLEAN;
7740 goto out;
7741 }
7742
7743 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7744 physical_offset, physical_len);
7745 if (ret < 0)
7746 goto out;
7747 prev_devid = devid;
7748 prev_dev_ext_end = physical_offset + physical_len;
7749
7750 ret = btrfs_next_item(root, path);
7751 if (ret < 0)
7752 goto out;
7753 if (ret > 0) {
7754 ret = 0;
7755 break;
7756 }
7757 }
7758
7759 /* Ensure all chunks have corresponding dev extents */
7760 ret = verify_chunk_dev_extent_mapping(fs_info);
7761 out:
7762 btrfs_free_path(path);
7763 return ret;
7764 }
7765
7766 /*
7767 * Check whether the given block group or device is pinned by any inode being
7768 * used as a swapfile.
7769 */
7770 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7771 {
7772 struct btrfs_swapfile_pin *sp;
7773 struct rb_node *node;
7774
7775 spin_lock(&fs_info->swapfile_pins_lock);
7776 node = fs_info->swapfile_pins.rb_node;
7777 while (node) {
7778 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7779 if (ptr < sp->ptr)
7780 node = node->rb_left;
7781 else if (ptr > sp->ptr)
7782 node = node->rb_right;
7783 else
7784 break;
7785 }
7786 spin_unlock(&fs_info->swapfile_pins_lock);
7787 return node != NULL;
7788 }
7789