// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif

static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
{
	/* The meta_write_pointer is available only on the zoned setup. */
	if (!btrfs_is_zoned(block_group->fs_info))
		return false;

	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		return false;

	return block_group->start + block_group->alloc_offset >
		block_group->meta_write_pointer;
}

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
{
	const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format.  If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	/* Select the highest-redundancy RAID level. */
	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
		allowed = BTRFS_BLOCK_GROUP_DUP;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

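/*
 * Return the allocation profile to use for the block group type in @orig_flags
 * (data, metadata or system), merging in the currently available profile bits
 * sampled under the profiles seqlock and reducing them to a single profile.
 */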
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}

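/* Take a reference on a block group, to be dropped with btrfs_put_block_group(). */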
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		kfree(cache->free_space_ctl);
		btrfs_free_chunk_map(cache->physical_map);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	write_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:       The filesystem information object.
 * @bytenr:        Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and if we can,
 * increment the number of NOCOW writers in the block group that contains the
 * extent, as long as the block group exists and it's currently not in
 * read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 *          is responsible for calling btrfs_dec_nocow_writers() later.
 *
 *          Or NULL if we cannot do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}

/*
 * Decrement the number of NOCOW writers in a block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it
 * wants to use it, then it should get a reference on it before calling this
 * function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}

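/* Wait until a block group has no more active NOCOW writers. */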
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

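/*
 * Drop one reservation from the block group containing @start and wake up any
 * waiters once the reservation count reaches zero.
 */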
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have already allocated an extent from it, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

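/*
 * Return the caching control of a block group with an extra reference taken,
 * or NULL if the block group is not currently being cached.
 */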
struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once.  So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;
	int progress;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	/*
	 * We've already failed to allocate from this block group, so even if
	 * there's enough space in the block group it isn't contiguous enough to
	 * allow for an allocation, so wait for at least the next wakeup tick,
	 * or for the thing to be done.
	 */
	progress = atomic_read(&caching_ctl->progress);

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (progress != atomic_read(&caching_ctl->progress) &&
		    (cache->free_space_ctl->free_space >= num_bytes)));

	btrfs_put_caching_control(caching_ctl);
}

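/*
 * Wait until caching of the block group has finished and return -EIO if it
 * ended in the error state, 0 otherwise.
 */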
static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
}

static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
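/*
 * Debug helper for the free space fragmenting mount options: remove every
 * other chunk sized range (nodesize for metadata, sectorsize for data) from
 * the free space cache, leaving the block group's free space heavily
 * fragmented.
 */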
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * Add a free space range to the in memory free space cache of a block group.
 * This checks if the range contains super block locations and any such
 * locations are not added to the free space cache.
 *
 * @block_group:      The target block group.
 * @start:            Start offset of the range.
 * @end:              End offset of the range (exclusive).
 * @total_added_ret:  Optional pointer to return the total amount of space
 *                    added to the block group's free space cache.
 *
 * Returns 0 on success or < 0 on error.
 */
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
			     u64 end, u64 *total_added_ret)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size;
	int ret;

	if (total_added_ret)
		*total_added_ret = 0;

	while (start < end) {
		if (!find_first_extent_bit(&info->excluded_extents, start,
					   &extent_start, &extent_end,
					   EXTENT_DIRTY | EXTENT_UPTODATE,
					   NULL))
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			if (ret)
				return ret;
			if (total_added_ret)
				*total_added_ret += size;
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		if (ret)
			return ret;
		if (total_added_ret)
			*total_added_ret += size;
	}

	return 0;
}

/*
 * Get an arbitrary extent item index / max_index through the block group
 *
 * @block_group:  the block group to sample from
 * @index:        the integral step through the block group to grab from
 * @max_index:    the granularity of the sampling
 * @key:          return value parameter for the item we find
 *
 * Pre-conditions on indices:
 * 0 <= index <= max_index
 * 0 < max_index
 *
 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 * error code on error.
 */
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
					  struct btrfs_block_group *block_group,
					  int index, int max_index,
					  struct btrfs_key *found_key)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	u64 search_offset;
	u64 search_end = block_group->start + block_group->length;
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret = 0;

	ASSERT(index >= 0);
	ASSERT(index <= max_index);
	ASSERT(max_index > 0);
	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
						       BTRFS_SUPER_INFO_OFFSET));

	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	search_offset = index * div_u64(block_group->length, max_index);
	search_key.objectid = block_group->start + search_offset;
	search_key.type = BTRFS_EXTENT_ITEM_KEY;
	search_key.offset = 0;

	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
		/* Success; sampled an extent item in the block group */
		if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
		    found_key->objectid >= block_group->start &&
		    found_key->objectid + found_key->offset <= search_end)
			break;

		/* We can't possibly find a valid extent item anymore */
		if (found_key->objectid >= search_end) {
			ret = 1;
			break;
		}
	}

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	btrfs_free_path(path);
	return ret;
}

/*
 * Best effort attempt to compute a block group's size class while caching it.
 *
 * @block_group: the block group we are caching
 *
 * We cannot infer the size class while adding free space extents, because that
 * logic doesn't care about contiguous file extents (it doesn't differentiate
 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 * file extent items. Reading all of them is quite wasteful, because usually
 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 * them at even steps through the block group and pick the smallest size class
 * we see. Since size class is best effort, and not guaranteed in general,
 * inaccuracy is acceptable.
 *
 * To be more explicit about why this algorithm makes sense:
 *
 * If we are caching in a block group from disk, then there are three major cases
 * to consider:
 * 1. the block group is well behaved and all extents in it are the same size
 *    class.
 * 2. the block group is mostly one size class with rare exceptions for last
 *    ditch allocations
 * 3. the block group was populated before size classes and can have a totally
 *    arbitrary mix of size classes.
 *
 * In case 1, looking at any extent in the block group will yield the correct
 * result. For the mixed cases, taking the minimum size class seems like a good
 * approximation, since gaps from frees will be usable to the size class. For
 * 2., a small handful of file extents is likely to yield the right answer. For
 * 3, we can either read every file extent, or admit that this is best effort
 * anyway and try to stay fast.
 *
 * Returns: 0 on success, negative error code on error.
 */
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
				       struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_key key;
	int i;
	u64 min_size = block_group->length;
	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
	int ret;

	if (!btrfs_block_group_should_use_size_class(block_group))
		return 0;

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	for (i = 0; i < 5; ++i) {
		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
		if (ret < 0)
			goto out;
		if (ret > 0)
			continue;
		min_size = min_t(u64, min_size, key.offset);
		size_class = btrfs_calc_block_group_size_class(min_size);
	}
	if (size_class != BTRFS_BG_SZ_NONE) {
		spin_lock(&block_group->lock);
		block_group->size_class = size_class;
		spin_unlock(&block_group->lock);
	}
out:
	return ret;
}

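/*
 * Slow path for caching a block group: walk the extent tree in the commit
 * root and add the gaps between allocated extents to the free space cache,
 * waking up waiters and yielding the commit_root_sem from time to time.
 */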
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			u64 space_added;

			ret = btrfs_add_new_free_space(block_group, last,
						       key.objectid, &space_added);
			if (ret)
				goto out;
			total_found += space_added;
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup) {
					atomic_inc(&caching_ctl->progress);
					wake_up(&caching_ctl->wait);
				}
			}
		}
		path->slots[0]++;
	}

	ret = btrfs_add_new_free_space(block_group, last,
				       block_group->start + block_group->length,
				       NULL);
out:
	btrfs_free_path(path);
	return ret;
}

static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
	clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
			  bg->start + bg->length - 1, EXTENT_UPTODATE);
}

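/*
 * Work function that populates the free space cache of a block group, either
 * from the on-disk free space cache, from the free space tree or by scanning
 * the extent tree, and then marks the block group as cached (or errored).
 */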
static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	load_block_group_size_class(caching_ctl, block_group);
	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	/*
	 * If we are in the transaction that populated the free space tree we
	 * can't actually cache from the free space tree as our commit root and
	 * real root are the same, so we could change the contents of the blocks
	 * while caching.  Instead do the slow caching in this case, and after
	 * the transaction has committed we will be safe.
	 */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

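/*
 * Start caching the free space of a block group by queueing a caching_thread
 * work item, unless caching has already been started or finished. If @wait is
 * true, also wait for the caching to complete. Zoned filesystems return
 * immediately since their allocator does not use the cache.
 */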
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	/* Allocator for zoned filesystems does not use the cache at all */
	if (btrfs_is_zoned(fs_info))
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	refcount_set(&caching_ctl->count, 2);
	atomic_set(&caching_ctl->progress, 0);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}

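/*
 * Remove the profile bits in @flags from the available allocation bits of the
 * matching block group type(s), under the profiles seqlock.
 */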
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

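/*
 * Return the root that holds block group items: the dedicated block group
 * tree if the BLOCK_GROUP_TREE compat_ro feature is enabled, otherwise the
 * extent root for offset 0.
 */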
static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
		return fs_info->block_group_root;
	return btrfs_extent_root(fs_info, 0);
}

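/*
 * Delete a block group's item, keyed by its start offset and length, from the
 * root returned by btrfs_block_group_root().
 */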
static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = btrfs_block_group_root(fs_info);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}

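/*
 * Remove a read-only block group: drop its free space inode and in-memory
 * free space cache, unlink it from the block group rbtree and its space_info
 * lists, delete its entries in the free space tree and its block group item,
 * and finally drop the chunk map unless the block group is still frozen.
 */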
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_chunk_map *map)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_map;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, map->start);
	if (!block_group)
		return -ENOENT;

	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);

	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
						    -block_group->zone_unusable);
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	/*
	 * Hitting this WARN means we removed a block group with an unwritten
	 * region. It will cause "unable to find chunk map for logical" errors.
	 */
	if (WARN_ON(has_unwritten_metadata(block_group)))
		btrfs_warn(fs_info,
			   "block group %llu is removed before metadata write out",
			   block_group->start);

	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the chunk map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_map = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_map)
		btrfs_remove_chunk_map(fs_info, map);

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct btrfs_chunk_map *map;
	unsigned int num_items;

	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
	ASSERT(map != NULL);
	ASSERT(map->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	num_items = 3 + map->num_stripes;
	btrfs_free_chunk_map(map);

	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
								    -cache->zone_unusable);
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}

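/*
 * Clear a block group's ranges from the pinned extents of the current and, if
 * still around, the previous transaction, so the block group can be removed
 * without racing with btrfs_finish_extent_commit(). Returns true on success.
 */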
clean_pinned_extents(struct btrfs_trans_handle * trans,const struct btrfs_block_group * bg)1439 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1440 				 const struct btrfs_block_group *bg)
1441 {
1442 	struct btrfs_fs_info *fs_info = trans->fs_info;
1443 	struct btrfs_transaction *prev_trans = NULL;
1444 	const u64 start = bg->start;
1445 	const u64 end = start + bg->length - 1;
1446 	int ret;
1447 
1448 	spin_lock(&fs_info->trans_lock);
1449 	if (trans->transaction->list.prev != &fs_info->trans_list) {
1450 		prev_trans = list_last_entry(&trans->transaction->list,
1451 					     struct btrfs_transaction, list);
1452 		refcount_inc(&prev_trans->use_count);
1453 	}
1454 	spin_unlock(&fs_info->trans_lock);
1455 
1456 	/*
1457 	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1458 	 * btrfs_finish_extent_commit(). If we are at transaction N, another
1459 	 * task might be running finish_extent_commit() for the previous
1460 	 * transaction N - 1, and have seen a range belonging to the block
1461 	 * group in pinned_extents before we were able to clear the whole block
1462 	 * group range from pinned_extents. This means that task can lookup for
1463 	 * the block group after we unpinned it from pinned_extents and removed
1464 	 * it, leading to an error at unpin_extent_range().
1465 	 */
1466 	mutex_lock(&fs_info->unused_bg_unpin_mutex);
1467 	if (prev_trans) {
1468 		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1469 					EXTENT_DIRTY);
1470 		if (ret)
1471 			goto out;
1472 	}
1473 
1474 	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1475 				EXTENT_DIRTY);
1476 out:
1477 	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1478 	if (prev_trans)
1479 		btrfs_put_transaction(prev_trans);
1480 
1481 	return ret == 0;
1482 }
1483 
1484 /*
1485  * Link the block_group to a list via bg_list.
1486  *
1487  * @bg:       The block_group to link to the list.
1488  * @list:     The list to link it to.
1489  *
1490  * Use this rather than list_add_tail() directly to ensure proper respect
1491  * to locking and refcounting.
1492  *
1493  * Returns: true if the bg was linked with a refcount bump and false otherwise.
1494  */
btrfs_link_bg_list(struct btrfs_block_group * bg,struct list_head * list)1495 static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
1496 {
1497 	struct btrfs_fs_info *fs_info = bg->fs_info;
1498 	bool added = false;
1499 
1500 	spin_lock(&fs_info->unused_bgs_lock);
1501 	if (list_empty(&bg->bg_list)) {
1502 		btrfs_get_block_group(bg);
1503 		list_add_tail(&bg->bg_list, list);
1504 		added = true;
1505 	}
1506 	spin_unlock(&fs_info->unused_bgs_lock);
1507 	return added;
1508 }
1509 
1510 /*
1511  * Process the unused_bgs list and remove any that don't have any allocated
1512  * space inside of them.
1513  */
btrfs_delete_unused_bgs(struct btrfs_fs_info * fs_info)1514 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1515 {
1516 	LIST_HEAD(retry_list);
1517 	struct btrfs_block_group *block_group;
1518 	struct btrfs_space_info *space_info;
1519 	struct btrfs_trans_handle *trans;
1520 	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1521 	int ret = 0;
1522 
1523 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1524 		return;
1525 
1526 	if (btrfs_fs_closing(fs_info))
1527 		return;
1528 
1529 	/*
1530 	 * Long running balances can keep us blocked here for eternity, so
1531 	 * simply skip deletion if we're unable to get the mutex.
1532 	 */
1533 	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1534 		return;
1535 
1536 	spin_lock(&fs_info->unused_bgs_lock);
1537 	while (!list_empty(&fs_info->unused_bgs)) {
1538 		u64 used;
1539 		int trimming;
1540 
1541 		block_group = list_first_entry(&fs_info->unused_bgs,
1542 					       struct btrfs_block_group,
1543 					       bg_list);
1544 		list_del_init(&block_group->bg_list);
1545 
1546 		space_info = block_group->space_info;
1547 
1548 		if (ret || btrfs_mixed_space_info(space_info)) {
1549 			btrfs_put_block_group(block_group);
1550 			continue;
1551 		}
1552 		spin_unlock(&fs_info->unused_bgs_lock);
1553 
1554 		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1555 
1556 		/* Don't want to race with allocators so take the groups_sem */
1557 		down_write(&space_info->groups_sem);
1558 
1559 		/*
1560 		 * Async discard moves the final block group discard to be prior
1561 		 * to the unused_bgs code path.  Therefore, if it's not fully
1562 		 * trimmed, punt it back to the async discard lists.
1563 		 */
1564 		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1565 		    !btrfs_is_free_space_trimmed(block_group)) {
1566 			trace_btrfs_skip_unused_block_group(block_group);
1567 			up_write(&space_info->groups_sem);
1568 			/* Requeue if we failed because of async discard */
1569 			btrfs_discard_queue_work(&fs_info->discard_ctl,
1570 						 block_group);
1571 			goto next;
1572 		}
1573 
1574 		spin_lock(&space_info->lock);
1575 		spin_lock(&block_group->lock);
1576 		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
1577 		    list_is_singular(&block_group->list)) {
1578 			/*
1579 			 * We want to bail if we made new allocations or have
1580 			 * outstanding allocations in this block group.  We do
1581 			 * the ro check in case balance is currently acting on
1582 			 * this block group.
1583 			 *
1584 			 * Also bail out if this is the only block group for its
1585 			 * type, because otherwise we would lose profile
1586 			 * information from fs_info->avail_*_alloc_bits and the
1587 			 * next block group of this type would be created with a
1588 			 * "single" profile (even if we're in a raid fs) because
1589 			 * fs_info->avail_*_alloc_bits would be 0.
1590 			 */
1591 			trace_btrfs_skip_unused_block_group(block_group);
1592 			spin_unlock(&block_group->lock);
1593 			spin_unlock(&space_info->lock);
1594 			up_write(&space_info->groups_sem);
1595 			goto next;
1596 		}
1597 
1598 		/*
1599 		 * The block group may be unused but there may still be space
1600 		 * reservations accounted to it, that is,
1601 		 * space_info->bytes_may_use was incremented by a task but no
1602 		 * space was yet allocated from the block group by that task.
1603 		 * That space may or may not be allocated, as we are generally
1604 		 * pessimistic about space reservation for metadata as well as
1605 		 * for data when using compression (as we reserve space based on
1606 		 * the worst case, when data can't be compressed, and before
1607 		 * actually attempting compression, before starting writeback).
1608 		 *
1609 		 * So check if the total space of the space_info minus the size
1610 		 * of this block group is less than the used space of the
1611 		 * space_info - if that's the case, then it means we have tasks
1612 		 * that might be relying on the block group in order to allocate
1613 		 * extents, and add back the block group to the unused list when
1614 		 * we finish, so that we retry later in case no tasks ended up
1615 		 * needing to allocate extents from the block group.
1616 		 */
1617 		used = btrfs_space_info_used(space_info, true);
1618 		if ((space_info->total_bytes - block_group->length < used &&
1619 		     block_group->zone_unusable < block_group->length) ||
1620 		    has_unwritten_metadata(block_group)) {
1621 			/*
1622 			 * Add a reference for the list, compensate for the ref
1623 			 * drop under the "next" label for the
1624 			 * fs_info->unused_bgs list.
1625 			 */
1626 			btrfs_link_bg_list(block_group, &retry_list);
1627 
1628 			trace_btrfs_skip_unused_block_group(block_group);
1629 			spin_unlock(&block_group->lock);
1630 			spin_unlock(&space_info->lock);
1631 			up_write(&space_info->groups_sem);
1632 			goto next;
1633 		}
1634 
1635 		spin_unlock(&block_group->lock);
1636 		spin_unlock(&space_info->lock);
1637 
1638 		/* We don't want to force the issue, only flip if it's ok. */
1639 		ret = inc_block_group_ro(block_group, 0);
1640 		up_write(&space_info->groups_sem);
1641 		if (ret < 0) {
1642 			ret = 0;
1643 			goto next;
1644 		}
1645 
1646 		ret = btrfs_zone_finish(block_group);
1647 		if (ret < 0) {
1648 			btrfs_dec_block_group_ro(block_group);
1649 			if (ret == -EAGAIN) {
1650 				btrfs_link_bg_list(block_group, &retry_list);
1651 				ret = 0;
1652 			}
1653 			goto next;
1654 		}
1655 
1656 		/*
1657 		 * Want to do this before we do anything else so we can recover
1658 		 * properly if we fail to join the transaction.
1659 		 */
1660 		trans = btrfs_start_trans_remove_block_group(fs_info,
1661 						     block_group->start);
1662 		if (IS_ERR(trans)) {
1663 			btrfs_dec_block_group_ro(block_group);
1664 			ret = PTR_ERR(trans);
1665 			goto next;
1666 		}
1667 
1668 		/*
1669 		 * We could have pending pinned extents for this block group,
1670 		 * just delete them, we don't care about them anymore.
1671 		 */
1672 		if (!clean_pinned_extents(trans, block_group)) {
1673 			btrfs_dec_block_group_ro(block_group);
1674 			goto end_trans;
1675 		}
1676 
1677 		/*
1678 		 * At this point, the block_group is read only and should fail
1679 		 * new allocations.  However, btrfs_finish_extent_commit() can
1680 		 * cause this block_group to be placed back on the discard
1681 		 * lists because now the block_group isn't fully discarded.
1682 		 * Bail here and try again later after discarding everything.
1683 		 */
1684 		spin_lock(&fs_info->discard_ctl.lock);
1685 		if (!list_empty(&block_group->discard_list)) {
1686 			spin_unlock(&fs_info->discard_ctl.lock);
1687 			btrfs_dec_block_group_ro(block_group);
1688 			btrfs_discard_queue_work(&fs_info->discard_ctl,
1689 						 block_group);
1690 			goto end_trans;
1691 		}
1692 		spin_unlock(&fs_info->discard_ctl.lock);
1693 
1694 		/* Reset pinned so btrfs_put_block_group doesn't complain */
1695 		spin_lock(&space_info->lock);
1696 		spin_lock(&block_group->lock);
1697 
1698 		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1699 						     -block_group->pinned);
1700 		space_info->bytes_readonly += block_group->pinned;
1701 		block_group->pinned = 0;
1702 
1703 		spin_unlock(&block_group->lock);
1704 		spin_unlock(&space_info->lock);
1705 
1706 		/*
1707 		 * On the normal path an unused block group is passed here and
1708 		 * trimming is then handled in the transaction commit path.
1709 		 * Async discard interposes before this to do the trimming
1710 		 * before coming down the unused block group path as trimming
1711 		 * will no longer be done later in the transaction commit path.
1712 		 */
1713 		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1714 			goto flip_async;
1715 
1716 		/*
1717 		 * DISCARD can flip during remount. On zoned filesystems, we
1718 		 * need to reset sequential-required zones.
1719 		 */
1720 		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1721 				btrfs_is_zoned(fs_info);
1722 
1723 		/* Implicit trim during transaction commit. */
1724 		if (trimming)
1725 			btrfs_freeze_block_group(block_group);
1726 
1727 		/*
1728 		 * Btrfs_remove_chunk will abort the transaction if things go
1729 		 * horribly wrong.
1730 		 */
1731 		ret = btrfs_remove_chunk(trans, block_group->start);
1732 
1733 		if (ret) {
1734 			if (trimming)
1735 				btrfs_unfreeze_block_group(block_group);
1736 			goto end_trans;
1737 		}
1738 
1739 		/*
1740 		 * If we're not mounted with -odiscard, we can just forget
1741 		 * about this block group. Otherwise we'll need to wait
1742 		 * until transaction commit to do the actual discard.
1743 		 */
1744 		if (trimming) {
1745 			spin_lock(&fs_info->unused_bgs_lock);
1746 			/*
1747 			 * A concurrent scrub might have added us to the list
1748 			 * fs_info->unused_bgs, so use a list_move operation
1749 			 * to add the block group to the deleted_bgs list.
1750 			 */
1751 			list_move(&block_group->bg_list,
1752 				  &trans->transaction->deleted_bgs);
1753 			spin_unlock(&fs_info->unused_bgs_lock);
1754 			btrfs_get_block_group(block_group);
1755 		}
1756 end_trans:
1757 		btrfs_end_transaction(trans);
1758 next:
1759 		btrfs_put_block_group(block_group);
1760 		spin_lock(&fs_info->unused_bgs_lock);
1761 	}
1762 	list_splice_tail(&retry_list, &fs_info->unused_bgs);
1763 	spin_unlock(&fs_info->unused_bgs_lock);
1764 	mutex_unlock(&fs_info->reclaim_bgs_lock);
1765 	return;
1766 
1767 flip_async:
1768 	btrfs_end_transaction(trans);
1769 	spin_lock(&fs_info->unused_bgs_lock);
1770 	list_splice_tail(&retry_list, &fs_info->unused_bgs);
1771 	spin_unlock(&fs_info->unused_bgs_lock);
1772 	mutex_unlock(&fs_info->reclaim_bgs_lock);
1773 	btrfs_put_block_group(block_group);
1774 	btrfs_discard_punt_unused_bgs_list(fs_info);
1775 }
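
/*
 * A minimal sketch (not from the original file) of the list-drain pattern
 * used by btrfs_delete_unused_bgs() above: pop one entry while holding
 * fs_info->unused_bgs_lock, drop the lock for the heavy work, and retake it
 * before testing the list again.  Groups that cannot be processed yet are
 * parked on a local retry list and spliced back at the end.  Local
 * declarations are assumed and try_to_delete() is a hypothetical helper.
 *
 *	spin_lock(&fs_info->unused_bgs_lock);
 *	while (!list_empty(&fs_info->unused_bgs)) {
 *		bg = list_first_entry(&fs_info->unused_bgs,
 *				      struct btrfs_block_group, bg_list);
 *		list_del_init(&bg->bg_list);
 *		spin_unlock(&fs_info->unused_bgs_lock);
 *
 *		if (!try_to_delete(bg))
 *			btrfs_link_bg_list(bg, &retry_list);
 *		btrfs_put_block_group(bg);
 *
 *		spin_lock(&fs_info->unused_bgs_lock);
 *	}
 *	list_splice_tail(&retry_list, &fs_info->unused_bgs);
 *	spin_unlock(&fs_info->unused_bgs_lock);
 */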
1776 
1777 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1778 {
1779 	struct btrfs_fs_info *fs_info = bg->fs_info;
1780 
1781 	spin_lock(&fs_info->unused_bgs_lock);
1782 	if (list_empty(&bg->bg_list)) {
1783 		btrfs_get_block_group(bg);
1784 		trace_btrfs_add_unused_block_group(bg);
1785 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1786 	} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
1787 		/* Pull out the block group from the reclaim_bgs list. */
1788 		trace_btrfs_add_unused_block_group(bg);
1789 		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
1790 	}
1791 	spin_unlock(&fs_info->unused_bgs_lock);
1792 }
1793 
1794 /*
1795  * We want block groups with a low number of used bytes to be in the beginning
1796  * of the list, so they will get reclaimed first.
1797  */
1798 static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1799 			   const struct list_head *b)
1800 {
1801 	const struct btrfs_block_group *bg1, *bg2;
1802 
1803 	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1804 	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1805 
1806 	return bg1->used > bg2->used;
1807 }
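
/*
 * For clarity (sketch, not original code): list_sort() treats a positive
 * return from the comparator as "a sorts after b", so reclaim_bgs_cmp()
 * orders block groups by ascending ->used, putting the emptiest first.
 * This is exactly how it is used further below:
 *
 *	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
 *
 * e.g. groups with used bytes of 3G, 1G and 2G end up ordered 1G, 2G, 3G.
 */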
1808 
1809 static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
1810 {
1811 	if (btrfs_is_zoned(fs_info))
1812 		return btrfs_zoned_should_reclaim(fs_info);
1813 	return true;
1814 }
1815 
1816 static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
1817 {
1818 	const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
1819 	u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
1820 	const u64 new_val = bg->used;
1821 	const u64 old_val = new_val + bytes_freed;
1822 
1823 	if (thresh_bytes == 0)
1824 		return false;
1825 
1826 	/*
1827 	 * If we were below the threshold before, don't reclaim: we are likely a
1828 	 * brand new block group and we don't want to relocate new block groups.
1829 	 */
1830 	if (old_val < thresh_bytes)
1831 		return false;
1832 	if (new_val >= thresh_bytes)
1833 		return false;
1834 	return true;
1835 }
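
/*
 * Worked example (illustrative, assuming a 75% reclaim threshold): for a
 * 1GiB block group, thresh_bytes is 768MiB.  If the group held 800MiB and a
 * free drops it to 700MiB, then old_val (800MiB) >= thresh_bytes while
 * new_val (700MiB) < thresh_bytes, so the group qualifies for reclaim.  A
 * group that never reached 768MiB is treated as brand new and left alone,
 * and a threshold of 0 disables reclaim entirely.
 */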
1836 
1837 void btrfs_reclaim_bgs_work(struct work_struct *work)
1838 {
1839 	struct btrfs_fs_info *fs_info =
1840 		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1841 	struct btrfs_block_group *bg;
1842 	struct btrfs_space_info *space_info;
1843 	LIST_HEAD(retry_list);
1844 
1845 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1846 		return;
1847 
1848 	if (btrfs_fs_closing(fs_info))
1849 		return;
1850 
1851 	if (!btrfs_should_reclaim(fs_info))
1852 		return;
1853 
1854 	sb_start_write(fs_info->sb);
1855 
1856 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1857 		sb_end_write(fs_info->sb);
1858 		return;
1859 	}
1860 
1861 	/*
1862 	 * Long running balances can keep us blocked here for eternity, so
1863 	 * simply skip reclaim if we're unable to get the mutex.
1864 	 */
1865 	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1866 		btrfs_exclop_finish(fs_info);
1867 		sb_end_write(fs_info->sb);
1868 		return;
1869 	}
1870 
1871 	spin_lock(&fs_info->unused_bgs_lock);
1872 	/*
1873 	 * Sort happens under lock because we can't simply splice it and sort.
1874 	 * The block groups might still be in use and reachable via bg_list,
1875 	 * and their presence in the reclaim_bgs list must be preserved.
1876 	 */
1877 	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1878 	while (!list_empty(&fs_info->reclaim_bgs)) {
1879 		u64 zone_unusable;
1880 		u64 reclaimed;
1881 		int ret = 0;
1882 
1883 		bg = list_first_entry(&fs_info->reclaim_bgs,
1884 				      struct btrfs_block_group,
1885 				      bg_list);
1886 		list_del_init(&bg->bg_list);
1887 
1888 		space_info = bg->space_info;
1889 		spin_unlock(&fs_info->unused_bgs_lock);
1890 
1891 		/* Don't race with allocators so take the groups_sem */
1892 		down_write(&space_info->groups_sem);
1893 
1894 		spin_lock(&space_info->lock);
1895 		spin_lock(&bg->lock);
1896 		if (bg->reserved || bg->pinned || bg->ro) {
1897 			/*
1898 			 * We want to bail if we made new allocations or have
1899 			 * outstanding allocations in this block group.  We do
1900 			 * the ro check in case balance is currently acting on
1901 			 * this block group.
1902 			 */
1903 			spin_unlock(&bg->lock);
1904 			spin_unlock(&space_info->lock);
1905 			up_write(&space_info->groups_sem);
1906 			goto next;
1907 		}
1908 		if (bg->used == 0) {
1909 			/*
1910 			 * It is possible that we trigger relocation on a block
1911 			 * group as its extents are deleted and it first goes
1912 			 * below the threshold, then shortly after goes empty.
1913 			 *
1914 			 * In this case, relocating it does delete it, but has
1915 			 * some overhead in relocation specific metadata, looking
1916 			 * for the non-existent extents and running some extra
1917 			 * transactions, which we can avoid by using one of the
1918 			 * other mechanisms for dealing with empty block groups.
1919 			 */
1920 			if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1921 				btrfs_mark_bg_unused(bg);
1922 			spin_unlock(&bg->lock);
1923 			spin_unlock(&space_info->lock);
1924 			up_write(&space_info->groups_sem);
1925 			goto next;
1926 
1927 		}
1928 		/*
1929 		 * The block group might no longer meet the reclaim condition by
1930 		 * the time we get around to reclaiming it, so skip overly full
1931 		 * block groups rather than reclaiming them.
1932 		 *
1933 		 * Since the decision making process also depends on the amount
1934 		 * being freed, pass in a fake giant value to skip that extra
1935 		 * check, which is more meaningful when adding to the list in
1936 		 * the first place.
1937 		 */
1938 		if (!should_reclaim_block_group(bg, bg->length)) {
1939 			spin_unlock(&bg->lock);
1940 			spin_unlock(&space_info->lock);
1941 			up_write(&space_info->groups_sem);
1942 			goto next;
1943 		}
1944 
1945 		/*
1946 		 * Cache the zone_unusable value before turning the block group
1947 		 * read-only. As soon as the block group is read-only, its
1948 		 * zone_unusable value gets moved to the block group's read-only
1949 		 * bytes and isn't available for calculations anymore. We also
1950 		 * cache it before unlocking the block group, to prevent races
1951 		 * (reports from KCSAN and such tools) with tasks updating it.
1952 		 */
1953 		zone_unusable = bg->zone_unusable;
1954 
1955 		spin_unlock(&bg->lock);
1956 		spin_unlock(&space_info->lock);
1957 
1958 		/*
1959 		 * Get out fast, in case we're read-only or unmounting the
1960 		 * filesystem. It is OK to drop block groups from the list even
1961 		 * for the read-only case. As we did sb_start_write(),
1962 		 * "mount -o remount,ro" won't happen and read-only filesystem
1963 		 * means it is forced read-only due to a fatal error. So, it
1964 		 * never gets back to read-write to let us reclaim again.
1965 		 */
1966 		if (btrfs_need_cleaner_sleep(fs_info)) {
1967 			up_write(&space_info->groups_sem);
1968 			goto next;
1969 		}
1970 
1971 		ret = inc_block_group_ro(bg, 0);
1972 		up_write(&space_info->groups_sem);
1973 		if (ret < 0)
1974 			goto next;
1975 
1976 		btrfs_info(fs_info,
1977 			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
1978 				bg->start,
1979 				div64_u64(bg->used * 100, bg->length),
1980 				div64_u64(zone_unusable * 100, bg->length));
1981 		trace_btrfs_reclaim_block_group(bg);
1982 		reclaimed = bg->used;
1983 		ret = btrfs_relocate_chunk(fs_info, bg->start);
1984 		if (ret) {
1985 			btrfs_dec_block_group_ro(bg);
1986 			btrfs_err(fs_info, "error relocating chunk %llu",
1987 				  bg->start);
1988 			reclaimed = 0;
1989 			spin_lock(&space_info->lock);
1990 			space_info->reclaim_errors++;
1991 			if (READ_ONCE(space_info->periodic_reclaim))
1992 				space_info->periodic_reclaim_ready = false;
1993 			spin_unlock(&space_info->lock);
1994 		}
1995 		spin_lock(&space_info->lock);
1996 		space_info->reclaim_count++;
1997 		space_info->reclaim_bytes += reclaimed;
1998 		spin_unlock(&space_info->lock);
1999 
2000 next:
2001 		if (ret && !READ_ONCE(space_info->periodic_reclaim))
2002 			btrfs_link_bg_list(bg, &retry_list);
2003 		btrfs_put_block_group(bg);
2004 
2005 		mutex_unlock(&fs_info->reclaim_bgs_lock);
2006 		/*
2007 		 * Reclaiming all the block groups in the list can take really
2008 		 * long.  Prioritize cleaning up unused block groups.
2009 		 */
2010 		btrfs_delete_unused_bgs(fs_info);
2011 		/*
2012 		 * If we are interrupted by a balance, we can just bail out. The
2013 		 * cleaner thread will restart the work again if necessary.
2014 		 */
2015 		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
2016 			goto end;
2017 		spin_lock(&fs_info->unused_bgs_lock);
2018 	}
2019 	spin_unlock(&fs_info->unused_bgs_lock);
2020 	mutex_unlock(&fs_info->reclaim_bgs_lock);
2021 end:
2022 	spin_lock(&fs_info->unused_bgs_lock);
2023 	list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
2024 	spin_unlock(&fs_info->unused_bgs_lock);
2025 	btrfs_exclop_finish(fs_info);
2026 	sb_end_write(fs_info->sb);
2027 }
2028 
2029 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
2030 {
2031 	btrfs_reclaim_sweep(fs_info);
2032 	spin_lock(&fs_info->unused_bgs_lock);
2033 	if (!list_empty(&fs_info->reclaim_bgs))
2034 		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
2035 	spin_unlock(&fs_info->unused_bgs_lock);
2036 }
2037 
2038 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
2039 {
2040 	struct btrfs_fs_info *fs_info = bg->fs_info;
2041 
2042 	if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
2043 		trace_btrfs_add_reclaim_block_group(bg);
2044 }
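
/*
 * Hedged usage sketch (not part of the original file): a caller that just
 * freed @bytes_freed from a block group can combine the helpers above.  In
 * the real code the cleaner thread later kicks the worker through
 * btrfs_reclaim_bgs(); it is shown inline here only for illustration.
 *
 *	if (should_reclaim_block_group(bg, bytes_freed))
 *		btrfs_mark_bg_to_reclaim(bg);
 *	btrfs_reclaim_bgs(bg->fs_info);
 */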
2045 
2046 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
2047 			   const struct btrfs_path *path)
2048 {
2049 	struct btrfs_chunk_map *map;
2050 	struct btrfs_block_group_item bg;
2051 	struct extent_buffer *leaf;
2052 	int slot;
2053 	u64 flags;
2054 	int ret = 0;
2055 
2056 	slot = path->slots[0];
2057 	leaf = path->nodes[0];
2058 
2059 	map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
2060 	if (!map) {
2061 		btrfs_err(fs_info,
2062 			  "logical %llu len %llu found bg but no related chunk",
2063 			  key->objectid, key->offset);
2064 		return -ENOENT;
2065 	}
2066 
2067 	if (map->start != key->objectid || map->chunk_len != key->offset) {
2068 		btrfs_err(fs_info,
2069 			"block group %llu len %llu mismatch with chunk %llu len %llu",
2070 			  key->objectid, key->offset, map->start, map->chunk_len);
2071 		ret = -EUCLEAN;
2072 		goto out_free_map;
2073 	}
2074 
2075 	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
2076 			   sizeof(bg));
2077 	flags = btrfs_stack_block_group_flags(&bg) &
2078 		BTRFS_BLOCK_GROUP_TYPE_MASK;
2079 
2080 	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
2081 		btrfs_err(fs_info,
2082 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
2083 			  key->objectid, key->offset, flags,
2084 			  (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
2085 		ret = -EUCLEAN;
2086 	}
2087 
2088 out_free_map:
2089 	btrfs_free_chunk_map(map);
2090 	return ret;
2091 }
2092 
2093 static int find_first_block_group(struct btrfs_fs_info *fs_info,
2094 				  struct btrfs_path *path,
2095 				  const struct btrfs_key *key)
2096 {
2097 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2098 	int ret;
2099 	struct btrfs_key found_key;
2100 
2101 	btrfs_for_each_slot(root, key, &found_key, path, ret) {
2102 		if (found_key.objectid >= key->objectid &&
2103 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
2104 			return read_bg_from_eb(fs_info, &found_key, path);
2105 		}
2106 	}
2107 	return ret;
2108 }
2109 
2110 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2111 {
2112 	u64 extra_flags = chunk_to_extended(flags) &
2113 				BTRFS_EXTENDED_PROFILE_MASK;
2114 
2115 	write_seqlock(&fs_info->profiles_lock);
2116 	if (flags & BTRFS_BLOCK_GROUP_DATA)
2117 		fs_info->avail_data_alloc_bits |= extra_flags;
2118 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
2119 		fs_info->avail_metadata_alloc_bits |= extra_flags;
2120 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2121 		fs_info->avail_system_alloc_bits |= extra_flags;
2122 	write_sequnlock(&fs_info->profiles_lock);
2123 }
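
/*
 * For context (sketch, not original code): readers of the avail_*_alloc_bits
 * pair the write_seqlock() above with a lockless retry loop along these
 * lines:
 *
 *	unsigned int seq;
 *	u64 avail;
 *
 *	do {
 *		seq = read_seqbegin(&fs_info->profiles_lock);
 *		avail = fs_info->avail_data_alloc_bits;
 *	} while (read_seqretry(&fs_info->profiles_lock, seq));
 */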
2124 
2125 /*
2126  * Map a physical disk address to a list of logical addresses.
2127  *
2128  * @fs_info:       the filesystem
2129  * @chunk_start:   logical address of block group
2130  * @physical:	   physical address to map to logical addresses
2131  * @logical:	   return array of logical addresses which map to @physical
2132  * @naddrs:	   length of @logical
2133  * @stripe_len:    size of IO stripe for the given block group
2134  *
2135  * Maps a particular @physical disk address to a list of @logical addresses.
2136  * Used primarily to exclude those portions of a block group that contain super
2137  * block copies.
2138  */
2139 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
2140 		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
2141 {
2142 	struct btrfs_chunk_map *map;
2143 	u64 *buf;
2144 	u64 bytenr;
2145 	u64 data_stripe_length;
2146 	u64 io_stripe_size;
2147 	int i, nr = 0;
2148 	int ret = 0;
2149 
2150 	map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
2151 	if (IS_ERR(map))
2152 		return -EIO;
2153 
2154 	data_stripe_length = map->stripe_size;
2155 	io_stripe_size = BTRFS_STRIPE_LEN;
2156 	chunk_start = map->start;
2157 
2158 	/* For RAID5/6 adjust to a full IO stripe length */
2159 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2160 		io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2161 
2162 	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
2163 	if (!buf) {
2164 		ret = -ENOMEM;
2165 		goto out;
2166 	}
2167 
2168 	for (i = 0; i < map->num_stripes; i++) {
2169 		bool already_inserted = false;
2170 		u32 stripe_nr;
2171 		u32 offset;
2172 		int j;
2173 
2174 		if (!in_range(physical, map->stripes[i].physical,
2175 			      data_stripe_length))
2176 			continue;
2177 
2178 		stripe_nr = (physical - map->stripes[i].physical) >>
2179 			    BTRFS_STRIPE_LEN_SHIFT;
2180 		offset = (physical - map->stripes[i].physical) &
2181 			 BTRFS_STRIPE_LEN_MASK;
2182 
2183 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2184 				 BTRFS_BLOCK_GROUP_RAID10))
2185 			stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2186 					    map->sub_stripes);
2187 		/*
2188 		 * The remaining case would be for RAID56, multiply by
2189 		 * nr_data_stripes().  Alternatively, just use rmap_len below
2190 		 * instead of map->stripe_len
2191 		 */
2192 		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
2193 
2194 		/* Ensure we don't add duplicate addresses */
2195 		for (j = 0; j < nr; j++) {
2196 			if (buf[j] == bytenr) {
2197 				already_inserted = true;
2198 				break;
2199 			}
2200 		}
2201 
2202 		if (!already_inserted)
2203 			buf[nr++] = bytenr;
2204 	}
2205 
2206 	*logical = buf;
2207 	*naddrs = nr;
2208 	*stripe_len = io_stripe_size;
2209 out:
2210 	btrfs_free_chunk_map(map);
2211 	return ret;
2212 }
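
/*
 * Worked example (illustrative): for a RAID0 chunk with two stripes
 * (sub_stripes == 1) and the 64K BTRFS_STRIPE_LEN, a physical address that
 * is 68K into the device extent of stripe i == 1 gives stripe_nr = 1 and
 * offset = 4K.  The RAID0/RAID10 adjustment turns stripe_nr into
 * (1 * 2 + 1) / 1 = 3, so the logical address reported in *logical is
 * chunk_start + 3 * 64K + 4K.
 */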
2213 
2214 static int exclude_super_stripes(struct btrfs_block_group *cache)
2215 {
2216 	struct btrfs_fs_info *fs_info = cache->fs_info;
2217 	const bool zoned = btrfs_is_zoned(fs_info);
2218 	u64 bytenr;
2219 	u64 *logical;
2220 	int stripe_len;
2221 	int i, nr, ret;
2222 
2223 	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2224 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2225 		cache->bytes_super += stripe_len;
2226 		ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
2227 				     cache->start + stripe_len - 1,
2228 				     EXTENT_UPTODATE, NULL);
2229 		if (ret)
2230 			return ret;
2231 	}
2232 
2233 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2234 		bytenr = btrfs_sb_offset(i);
2235 		ret = btrfs_rmap_block(fs_info, cache->start,
2236 				       bytenr, &logical, &nr, &stripe_len);
2237 		if (ret)
2238 			return ret;
2239 
2240 		/* Shouldn't have super stripes in sequential zones */
2241 		if (zoned && nr) {
2242 			kfree(logical);
2243 			btrfs_err(fs_info,
2244 			"zoned: block group %llu must not contain super block",
2245 				  cache->start);
2246 			return -EUCLEAN;
2247 		}
2248 
2249 		while (nr--) {
2250 			u64 len = min_t(u64, stripe_len,
2251 				cache->start + cache->length - logical[nr]);
2252 
2253 			cache->bytes_super += len;
2254 			ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
2255 					     logical[nr] + len - 1,
2256 					     EXTENT_UPTODATE, NULL);
2257 			if (ret) {
2258 				kfree(logical);
2259 				return ret;
2260 			}
2261 		}
2262 
2263 		kfree(logical);
2264 	}
2265 	return 0;
2266 }
2267 
2268 static struct btrfs_block_group *btrfs_create_block_group_cache(
2269 		struct btrfs_fs_info *fs_info, u64 start)
2270 {
2271 	struct btrfs_block_group *cache;
2272 
2273 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
2274 	if (!cache)
2275 		return NULL;
2276 
2277 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2278 					GFP_NOFS);
2279 	if (!cache->free_space_ctl) {
2280 		kfree(cache);
2281 		return NULL;
2282 	}
2283 
2284 	cache->start = start;
2285 
2286 	cache->fs_info = fs_info;
2287 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
2288 
2289 	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2290 
2291 	refcount_set(&cache->refs, 1);
2292 	spin_lock_init(&cache->lock);
2293 	init_rwsem(&cache->data_rwsem);
2294 	INIT_LIST_HEAD(&cache->list);
2295 	INIT_LIST_HEAD(&cache->cluster_list);
2296 	INIT_LIST_HEAD(&cache->bg_list);
2297 	INIT_LIST_HEAD(&cache->ro_list);
2298 	INIT_LIST_HEAD(&cache->discard_list);
2299 	INIT_LIST_HEAD(&cache->dirty_list);
2300 	INIT_LIST_HEAD(&cache->io_list);
2301 	INIT_LIST_HEAD(&cache->active_bg_list);
2302 	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
2303 	atomic_set(&cache->frozen, 0);
2304 	mutex_init(&cache->free_space_lock);
2305 
2306 	return cache;
2307 }
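
/*
 * Sketch of the expected calling pattern (not original code): the cache is
 * returned with a single reference, so any failure after a successful call
 * must drop it with btrfs_put_block_group(), as read_one_block_group() and
 * btrfs_make_block_group() below do.  do_more_setup() is a hypothetical
 * stand-in for the follow-up initialization.
 *
 *	cache = btrfs_create_block_group_cache(fs_info, start);
 *	if (!cache)
 *		return -ENOMEM;
 *	ret = do_more_setup(cache);
 *	if (ret) {
 *		btrfs_put_block_group(cache);
 *		return ret;
 *	}
 */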
2308 
2309 /*
2310  * Iterate all chunks and verify that each of them has the corresponding block
2311  * group
2312  */
2313 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2314 {
2315 	u64 start = 0;
2316 	int ret = 0;
2317 
2318 	while (1) {
2319 		struct btrfs_chunk_map *map;
2320 		struct btrfs_block_group *bg;
2321 
2322 		/*
2323 		 * btrfs_find_chunk_map() will return the first chunk map
2324 		 * intersecting the range, so setting @length to 1 is enough to
2325 		 * get the first chunk.
2326 		 */
2327 		map = btrfs_find_chunk_map(fs_info, start, 1);
2328 		if (!map)
2329 			break;
2330 
2331 		bg = btrfs_lookup_block_group(fs_info, map->start);
2332 		if (!bg) {
2333 			btrfs_err(fs_info,
2334 	"chunk start=%llu len=%llu doesn't have corresponding block group",
2335 				     map->start, map->chunk_len);
2336 			ret = -EUCLEAN;
2337 			btrfs_free_chunk_map(map);
2338 			break;
2339 		}
2340 		if (bg->start != map->start || bg->length != map->chunk_len ||
2341 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2342 		    (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
2343 			btrfs_err(fs_info,
2344 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2345 				map->start, map->chunk_len,
2346 				map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2347 				bg->start, bg->length,
2348 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2349 			ret = -EUCLEAN;
2350 			btrfs_free_chunk_map(map);
2351 			btrfs_put_block_group(bg);
2352 			break;
2353 		}
2354 		start = map->start + map->chunk_len;
2355 		btrfs_free_chunk_map(map);
2356 		btrfs_put_block_group(bg);
2357 	}
2358 	return ret;
2359 }
2360 
2361 static int read_one_block_group(struct btrfs_fs_info *info,
2362 				struct btrfs_block_group_item *bgi,
2363 				const struct btrfs_key *key,
2364 				int need_clear)
2365 {
2366 	struct btrfs_block_group *cache;
2367 	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2368 	int ret;
2369 
2370 	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2371 
2372 	cache = btrfs_create_block_group_cache(info, key->objectid);
2373 	if (!cache)
2374 		return -ENOMEM;
2375 
2376 	cache->length = key->offset;
2377 	cache->used = btrfs_stack_block_group_used(bgi);
2378 	cache->commit_used = cache->used;
2379 	cache->flags = btrfs_stack_block_group_flags(bgi);
2380 	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2381 
2382 	set_free_space_tree_thresholds(cache);
2383 
2384 	if (need_clear) {
2385 		/*
2386 		 * When we mount with old space cache, we need to
2387 		 * set BTRFS_DC_CLEAR and set dirty flag.
2388 		 *
2389 		 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2390 		 *    truncate the old free space cache inode and
2391 		 *    setup a new one.
2392 		 * b) Setting 'dirty flag' makes sure that we flush
2393 		 *    the new space cache info onto disk.
2394 		 */
2395 		if (btrfs_test_opt(info, SPACE_CACHE))
2396 			cache->disk_cache_state = BTRFS_DC_CLEAR;
2397 	}
2398 	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2399 	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2400 		btrfs_err(info,
2401 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2402 			  cache->start);
2403 		ret = -EINVAL;
2404 		goto error;
2405 	}
2406 
2407 	ret = btrfs_load_block_group_zone_info(cache, false);
2408 	if (ret) {
2409 		btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2410 			  cache->start);
2411 		goto error;
2412 	}
2413 
2414 	/*
2415 	 * We need to exclude the super stripes now so that the space info has
2416 	 * super bytes accounted for, otherwise we'll think we have more space
2417 	 * than we actually do.
2418 	 */
2419 	ret = exclude_super_stripes(cache);
2420 	if (ret) {
2421 		/* We may have excluded something, so call this just in case. */
2422 		btrfs_free_excluded_extents(cache);
2423 		goto error;
2424 	}
2425 
2426 	/*
2427 	 * For zoned filesystem, space after the allocation offset is the only
2428 	 * free space for a block group. So, we don't need any caching work.
2429 	 * btrfs_calc_zone_unusable() will set the amount of free space and
2430 	 * zone_unusable space.
2431 	 *
2432 	 * For regular filesystem, check for two cases, either we are full, and
2433 	 * therefore don't need to bother with the caching work since we won't
2434 	 * find any space, or we are empty, and we can just add all the space
2435 	 * in and be done with it.  This saves us _a_lot_ of time, particularly
2436 	 * in the full case.
2437 	 */
2438 	if (btrfs_is_zoned(info)) {
2439 		btrfs_calc_zone_unusable(cache);
2440 		/* Should not have any excluded extents. Just in case, though. */
2441 		btrfs_free_excluded_extents(cache);
2442 	} else if (cache->length == cache->used) {
2443 		cache->cached = BTRFS_CACHE_FINISHED;
2444 		btrfs_free_excluded_extents(cache);
2445 	} else if (cache->used == 0) {
2446 		cache->cached = BTRFS_CACHE_FINISHED;
2447 		ret = btrfs_add_new_free_space(cache, cache->start,
2448 					       cache->start + cache->length, NULL);
2449 		btrfs_free_excluded_extents(cache);
2450 		if (ret)
2451 			goto error;
2452 	}
2453 
2454 	ret = btrfs_add_block_group_cache(info, cache);
2455 	if (ret) {
2456 		btrfs_remove_free_space_cache(cache);
2457 		goto error;
2458 	}
2459 	trace_btrfs_add_block_group(info, cache, 0);
2460 	btrfs_add_bg_to_space_info(info, cache);
2461 
2462 	set_avail_alloc_bits(info, cache->flags);
2463 	if (btrfs_chunk_writeable(info, cache->start)) {
2464 		if (cache->used == 0) {
2465 			ASSERT(list_empty(&cache->bg_list));
2466 			if (btrfs_test_opt(info, DISCARD_ASYNC))
2467 				btrfs_discard_queue_work(&info->discard_ctl, cache);
2468 			else
2469 				btrfs_mark_bg_unused(cache);
2470 		}
2471 	} else {
2472 		inc_block_group_ro(cache, 1);
2473 	}
2474 
2475 	return 0;
2476 error:
2477 	btrfs_put_block_group(cache);
2478 	return ret;
2479 }
2480 
2481 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2482 {
2483 	struct rb_node *node;
2484 	int ret = 0;
2485 
2486 	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2487 		struct btrfs_chunk_map *map;
2488 		struct btrfs_block_group *bg;
2489 
2490 		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2491 		bg = btrfs_create_block_group_cache(fs_info, map->start);
2492 		if (!bg) {
2493 			ret = -ENOMEM;
2494 			break;
2495 		}
2496 
2497 		/* Fill dummy cache as FULL */
2498 		bg->length = map->chunk_len;
2499 		bg->flags = map->type;
2500 		bg->cached = BTRFS_CACHE_FINISHED;
2501 		bg->used = map->chunk_len;
2503 		ret = btrfs_add_block_group_cache(fs_info, bg);
2504 		/*
2505 		 * We may have some valid block group cache added already, in
2506 		 * that case we skip to the next one.
2507 		 */
2508 		if (ret == -EEXIST) {
2509 			ret = 0;
2510 			btrfs_put_block_group(bg);
2511 			continue;
2512 		}
2513 
2514 		if (ret) {
2515 			btrfs_remove_free_space_cache(bg);
2516 			btrfs_put_block_group(bg);
2517 			break;
2518 		}
2519 
2520 		btrfs_add_bg_to_space_info(fs_info, bg);
2521 
2522 		set_avail_alloc_bits(fs_info, bg->flags);
2523 	}
2524 	if (!ret)
2525 		btrfs_init_global_block_rsv(fs_info);
2526 	return ret;
2527 }
2528 
2529 int btrfs_read_block_groups(struct btrfs_fs_info *info)
2530 {
2531 	struct btrfs_root *root = btrfs_block_group_root(info);
2532 	struct btrfs_path *path;
2533 	int ret;
2534 	struct btrfs_block_group *cache;
2535 	struct btrfs_space_info *space_info;
2536 	struct btrfs_key key;
2537 	int need_clear = 0;
2538 	u64 cache_gen;
2539 
2540 	/*
2541 	 * Either no extent root (with ibadroots rescue option) or we have
2542 	 * unsupported RO options. The fs can never be mounted read-write, so no
2543 	 * need to waste time searching block group items.
2544 	 *
2545 	 * This also allows new extent tree related changes to be RO compat,
2546 	 * no need for a full incompat flag.
2547 	 */
2548 	if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2549 		      ~BTRFS_FEATURE_COMPAT_RO_SUPP))
2550 		return fill_dummy_bgs(info);
2551 
2552 	key.objectid = 0;
2553 	key.offset = 0;
2554 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2555 	path = btrfs_alloc_path();
2556 	if (!path)
2557 		return -ENOMEM;
2558 
2559 	cache_gen = btrfs_super_cache_generation(info->super_copy);
2560 	if (btrfs_test_opt(info, SPACE_CACHE) &&
2561 	    btrfs_super_generation(info->super_copy) != cache_gen)
2562 		need_clear = 1;
2563 	if (btrfs_test_opt(info, CLEAR_CACHE))
2564 		need_clear = 1;
2565 
2566 	while (1) {
2567 		struct btrfs_block_group_item bgi;
2568 		struct extent_buffer *leaf;
2569 		int slot;
2570 
2571 		ret = find_first_block_group(info, path, &key);
2572 		if (ret > 0)
2573 			break;
2574 		if (ret != 0)
2575 			goto error;
2576 
2577 		leaf = path->nodes[0];
2578 		slot = path->slots[0];
2579 
2580 		read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2581 				   sizeof(bgi));
2582 
2583 		btrfs_item_key_to_cpu(leaf, &key, slot);
2584 		btrfs_release_path(path);
2585 		ret = read_one_block_group(info, &bgi, &key, need_clear);
2586 		if (ret < 0)
2587 			goto error;
2588 		key.objectid += key.offset;
2589 		key.offset = 0;
2590 	}
2591 	btrfs_release_path(path);
2592 
2593 	list_for_each_entry(space_info, &info->space_info, list) {
2594 		int i;
2595 
2596 		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2597 			if (list_empty(&space_info->block_groups[i]))
2598 				continue;
2599 			cache = list_first_entry(&space_info->block_groups[i],
2600 						 struct btrfs_block_group,
2601 						 list);
2602 			btrfs_sysfs_add_block_group_type(cache);
2603 		}
2604 
2605 		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2606 		      (BTRFS_BLOCK_GROUP_RAID10 |
2607 		       BTRFS_BLOCK_GROUP_RAID1_MASK |
2608 		       BTRFS_BLOCK_GROUP_RAID56_MASK |
2609 		       BTRFS_BLOCK_GROUP_DUP)))
2610 			continue;
2611 		/*
2612 		 * Avoid allocating from un-mirrored block group if there are
2613 		 * mirrored block groups.
2614 		 */
2615 		list_for_each_entry(cache,
2616 				&space_info->block_groups[BTRFS_RAID_RAID0],
2617 				list)
2618 			inc_block_group_ro(cache, 1);
2619 		list_for_each_entry(cache,
2620 				&space_info->block_groups[BTRFS_RAID_SINGLE],
2621 				list)
2622 			inc_block_group_ro(cache, 1);
2623 	}
2624 
2625 	btrfs_init_global_block_rsv(info);
2626 	ret = check_chunk_block_group_mappings(info);
2627 error:
2628 	btrfs_free_path(path);
2629 	/*
2630 	 * We've hit some error while reading the extent tree, and have the
2631 	 * rescue=ibadroots mount option.
2632 	 * Try to fill the tree using dummy block groups so that the user can
2633 	 * continue to mount and grab their data.
2634 	 */
2635 	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2636 		ret = fill_dummy_bgs(info);
2637 	return ret;
2638 }
2639 
2640 /*
2641  * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2642  * allocation.
2643  *
2644  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2645  * phases.
2646  */
2647 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2648 				   struct btrfs_block_group *block_group)
2649 {
2650 	struct btrfs_fs_info *fs_info = trans->fs_info;
2651 	struct btrfs_block_group_item bgi;
2652 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2653 	struct btrfs_key key;
2654 	u64 old_commit_used;
2655 	int ret;
2656 
2657 	spin_lock(&block_group->lock);
2658 	btrfs_set_stack_block_group_used(&bgi, block_group->used);
2659 	btrfs_set_stack_block_group_chunk_objectid(&bgi,
2660 						   block_group->global_root_id);
2661 	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2662 	old_commit_used = block_group->commit_used;
2663 	block_group->commit_used = block_group->used;
2664 	key.objectid = block_group->start;
2665 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2666 	key.offset = block_group->length;
2667 	spin_unlock(&block_group->lock);
2668 
2669 	ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2670 	if (ret < 0) {
2671 		spin_lock(&block_group->lock);
2672 		block_group->commit_used = old_commit_used;
2673 		spin_unlock(&block_group->lock);
2674 	}
2675 
2676 	return ret;
2677 }
2678 
2679 static int insert_dev_extent(struct btrfs_trans_handle *trans,
2680 			     const struct btrfs_device *device, u64 chunk_offset,
2681 			     u64 start, u64 num_bytes)
2682 {
2683 	struct btrfs_fs_info *fs_info = device->fs_info;
2684 	struct btrfs_root *root = fs_info->dev_root;
2685 	struct btrfs_path *path;
2686 	struct btrfs_dev_extent *extent;
2687 	struct extent_buffer *leaf;
2688 	struct btrfs_key key;
2689 	int ret;
2690 
2691 	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2692 	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2693 	path = btrfs_alloc_path();
2694 	if (!path)
2695 		return -ENOMEM;
2696 
2697 	key.objectid = device->devid;
2698 	key.type = BTRFS_DEV_EXTENT_KEY;
2699 	key.offset = start;
2700 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2701 	if (ret)
2702 		goto out;
2703 
2704 	leaf = path->nodes[0];
2705 	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2706 	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2707 	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2708 					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2709 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2710 
2711 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2712 	btrfs_mark_buffer_dirty(trans, leaf);
2713 out:
2714 	btrfs_free_path(path);
2715 	return ret;
2716 }
2717 
2718 /*
2719  * This function belongs to phase 2.
2720  *
2721  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2722  * phases.
2723  */
2724 static int insert_dev_extents(struct btrfs_trans_handle *trans,
2725 				   u64 chunk_offset, u64 chunk_size)
2726 {
2727 	struct btrfs_fs_info *fs_info = trans->fs_info;
2728 	struct btrfs_device *device;
2729 	struct btrfs_chunk_map *map;
2730 	u64 dev_offset;
2731 	int i;
2732 	int ret = 0;
2733 
2734 	map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2735 	if (IS_ERR(map))
2736 		return PTR_ERR(map);
2737 
2738 	/*
2739 	 * Take the device list mutex to prevent races with the final phase of
2740 	 * a device replace operation that replaces the device object associated
2741 	 * with the map's stripes, because the device object's id can change
2742 	 * at any time during that final phase of the device replace operation
2743 	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2744 	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2745 	 * resulting in persisting a device extent item with such ID.
2746 	 */
2747 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2748 	for (i = 0; i < map->num_stripes; i++) {
2749 		device = map->stripes[i].dev;
2750 		dev_offset = map->stripes[i].physical;
2751 
2752 		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2753 					map->stripe_size);
2754 		if (ret)
2755 			break;
2756 	}
2757 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2758 
2759 	btrfs_free_chunk_map(map);
2760 	return ret;
2761 }
2762 
2763 /*
2764  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2765  * chunk allocation.
2766  *
2767  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2768  * phases.
2769  */
2770 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2771 {
2772 	struct btrfs_fs_info *fs_info = trans->fs_info;
2773 	struct btrfs_block_group *block_group;
2774 	int ret = 0;
2775 
2776 	while (!list_empty(&trans->new_bgs)) {
2777 		int index;
2778 
2779 		block_group = list_first_entry(&trans->new_bgs,
2780 					       struct btrfs_block_group,
2781 					       bg_list);
2782 		if (ret)
2783 			goto next;
2784 
2785 		index = btrfs_bg_flags_to_raid_index(block_group->flags);
2786 
2787 		ret = insert_block_group_item(trans, block_group);
2788 		if (ret)
2789 			btrfs_abort_transaction(trans, ret);
2790 		if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2791 			      &block_group->runtime_flags)) {
2792 			mutex_lock(&fs_info->chunk_mutex);
2793 			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2794 			mutex_unlock(&fs_info->chunk_mutex);
2795 			if (ret)
2796 				btrfs_abort_transaction(trans, ret);
2797 		}
2798 		ret = insert_dev_extents(trans, block_group->start,
2799 					 block_group->length);
2800 		if (ret)
2801 			btrfs_abort_transaction(trans, ret);
2802 		add_block_group_free_space(trans, block_group);
2803 
2804 		/*
2805 		 * If we restriped during balance, we may have added a new raid
2806 		 * type, so now add the sysfs entries when it is safe to do so.
2807 		 * We don't have to worry about locking here as it's handled in
2808 		 * btrfs_sysfs_add_block_group_type.
2809 		 */
2810 		if (block_group->space_info->block_group_kobjs[index] == NULL)
2811 			btrfs_sysfs_add_block_group_type(block_group);
2812 
2813 		/* Already aborted the transaction if it failed. */
2814 next:
2815 		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2816 
2817 		spin_lock(&fs_info->unused_bgs_lock);
2818 		list_del_init(&block_group->bg_list);
2819 		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
2820 		btrfs_put_block_group(block_group);
2821 		spin_unlock(&fs_info->unused_bgs_lock);
2822 
2823 		/*
2824 		 * If the block group is still unused, add it to the list of
2825 		 * unused block groups. The block group may have been created in
2826 		 * order to satisfy a space reservation, in which case the
2827 		 * extent allocation only happens later. But often we don't
2828 		 * actually need to allocate space that we previously reserved,
2829 		 * so the block group may become unused for a long time. For
2830 		 * example for metadata we generally reserve space for a worst
2831 		 * possible scenario, but then don't end up allocating all that
2832 		 * space or none at all (due to no need to COW, extent buffers
2833 		 * were already COWed in the current transaction and still
2834 		 * unwritten, tree heights lower than the maximum possible
2835 		 * height, etc). For data we generally reserve the exact amount
2836 		 * of space we are going to allocate later, the exception is
2837 		 * when using compression, as we must reserve space based on the
2838 		 * uncompressed data size, because the compression is only done
2839 		 * when writeback is triggered and we don't know how much space we
2840 		 * are actually going to need, so we reserve the uncompressed
2841 		 * size because the data may be uncompressible in the worst case.
2842 		 */
2843 		if (ret == 0) {
2844 			bool used;
2845 
2846 			spin_lock(&block_group->lock);
2847 			used = btrfs_is_block_group_used(block_group);
2848 			spin_unlock(&block_group->lock);
2849 
2850 			if (!used)
2851 				btrfs_mark_bg_unused(block_group);
2852 		}
2853 	}
2854 	btrfs_trans_release_chunk_metadata(trans);
2855 }
2856 
2857 /*
2858  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2859  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2860  */
2861 static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
2862 {
2863 	u64 div = SZ_1G;
2864 	u64 index;
2865 
2866 	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2867 		return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2868 
2869 	/* For a smaller filesystem (<= 10GiB), index based on 128MiB instead of 1GiB. */
2870 	if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2871 		div = SZ_128M;
2872 
2873 	offset = div64_u64(offset, div);
2874 	div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2875 	return index;
2876 }
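
/*
 * Worked example (illustrative): on an extent-tree-v2 filesystem larger than
 * 10GiB with fs_info->nr_global_roots == 4, a block group starting at 5GiB
 * gives div64_u64(5GiB, 1GiB) == 5 and 5 % 4 == 1, so it is tied to global
 * root 1.  On a filesystem of 10GiB or less the same calculation is done in
 * 128MiB units instead.
 */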
2877 
2878 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2879 						 u64 type,
2880 						 u64 chunk_offset, u64 size)
2881 {
2882 	struct btrfs_fs_info *fs_info = trans->fs_info;
2883 	struct btrfs_block_group *cache;
2884 	int ret;
2885 
2886 	btrfs_set_log_full_commit(trans);
2887 
2888 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2889 	if (!cache)
2890 		return ERR_PTR(-ENOMEM);
2891 
2892 	/*
2893 	 * Mark it as new before adding it to the rbtree of block groups or any
2894 	 * list, so that no other task finds it and calls btrfs_mark_bg_unused()
2895 	 * before the new flag is set.
2896 	 */
2897 	set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
2898 
2899 	cache->length = size;
2900 	set_free_space_tree_thresholds(cache);
2901 	cache->flags = type;
2902 	cache->cached = BTRFS_CACHE_FINISHED;
2903 	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2904 
2905 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2906 		set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
2907 
2908 	ret = btrfs_load_block_group_zone_info(cache, true);
2909 	if (ret) {
2910 		btrfs_put_block_group(cache);
2911 		return ERR_PTR(ret);
2912 	}
2913 
2914 	ret = exclude_super_stripes(cache);
2915 	if (ret) {
2916 		/* We may have excluded something, so call this just in case */
2917 		btrfs_free_excluded_extents(cache);
2918 		btrfs_put_block_group(cache);
2919 		return ERR_PTR(ret);
2920 	}
2921 
2922 	ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
2923 	btrfs_free_excluded_extents(cache);
2924 	if (ret) {
2925 		btrfs_put_block_group(cache);
2926 		return ERR_PTR(ret);
2927 	}
2928 
2929 	/*
2930 	 * Ensure the corresponding space_info object is created and
2931 	 * assigned to our block group. We want our bg to be added to the rbtree
2932 	 * with its ->space_info set.
2933 	 */
2934 	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2935 	ASSERT(cache->space_info);
2936 
2937 	ret = btrfs_add_block_group_cache(fs_info, cache);
2938 	if (ret) {
2939 		btrfs_remove_free_space_cache(cache);
2940 		btrfs_put_block_group(cache);
2941 		return ERR_PTR(ret);
2942 	}
2943 
2944 	/*
2945 	 * Now that our block group has its ->space_info set and is inserted in
2946 	 * the rbtree, update the space info's counters.
2947 	 */
2948 	trace_btrfs_add_block_group(fs_info, cache, 1);
2949 	btrfs_add_bg_to_space_info(fs_info, cache);
2950 	btrfs_update_global_block_rsv(fs_info);
2951 
2952 #ifdef CONFIG_BTRFS_DEBUG
2953 	if (btrfs_should_fragment_free_space(cache)) {
2954 		cache->space_info->bytes_used += size >> 1;
2955 		fragment_free_space(cache);
2956 	}
2957 #endif
2958 
2959 	btrfs_link_bg_list(cache, &trans->new_bgs);
2960 	btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
2961 
2962 	set_avail_alloc_bits(fs_info, type);
2963 	return cache;
2964 }
2965 
2966 /*
2967  * Mark one block group RO, can be called several times for the same block
2968  * group.
2969  *
2970  * @cache:		the destination block group
2971  * @do_chunk_alloc:	whether we need to do chunk pre-allocation, this is to
2972  * 			ensure we still have some free space after marking this
2973  * 			block group RO.
2974  */
2975 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2976 			     bool do_chunk_alloc)
2977 {
2978 	struct btrfs_fs_info *fs_info = cache->fs_info;
2979 	struct btrfs_trans_handle *trans;
2980 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2981 	u64 alloc_flags;
2982 	int ret;
2983 	bool dirty_bg_running;
2984 
2985 	/*
2986 	 * This can only happen when we are doing read-only scrub on read-only
2987 	 * mount.
2988 	 * In that case we should not start a new transaction on read-only fs.
2989 	 * Thus here we skip all chunk allocations.
2990 	 */
2991 	if (sb_rdonly(fs_info->sb)) {
2992 		mutex_lock(&fs_info->ro_block_group_mutex);
2993 		ret = inc_block_group_ro(cache, 0);
2994 		mutex_unlock(&fs_info->ro_block_group_mutex);
2995 		return ret;
2996 	}
2997 
2998 	do {
2999 		trans = btrfs_join_transaction(root);
3000 		if (IS_ERR(trans))
3001 			return PTR_ERR(trans);
3002 
3003 		dirty_bg_running = false;
3004 
3005 		/*
3006 		 * We're not allowed to set block groups readonly after the dirty
3007 		 * block group cache has started writing.  If it already started,
3008 		 * back off and let this transaction commit.
3009 		 */
3010 		mutex_lock(&fs_info->ro_block_group_mutex);
3011 		if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
3012 			u64 transid = trans->transid;
3013 
3014 			mutex_unlock(&fs_info->ro_block_group_mutex);
3015 			btrfs_end_transaction(trans);
3016 
3017 			ret = btrfs_wait_for_commit(fs_info, transid);
3018 			if (ret)
3019 				return ret;
3020 			dirty_bg_running = true;
3021 		}
3022 	} while (dirty_bg_running);
3023 
3024 	if (do_chunk_alloc) {
3025 		/*
3026 		 * If we are changing raid levels, try to allocate a
3027 		 * corresponding block group with the new raid level.
3028 		 */
3029 		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3030 		if (alloc_flags != cache->flags) {
3031 			ret = btrfs_chunk_alloc(trans, alloc_flags,
3032 						CHUNK_ALLOC_FORCE);
3033 			/*
3034 			 * ENOSPC is allowed here, we may have enough space
3035 			 * already allocated at the new raid level to carry on
3036 			 */
3037 			if (ret == -ENOSPC)
3038 				ret = 0;
3039 			if (ret < 0)
3040 				goto out;
3041 		}
3042 	}
3043 
3044 	ret = inc_block_group_ro(cache, 0);
3045 	if (!ret)
3046 		goto out;
3047 	if (ret == -ETXTBSY)
3048 		goto unlock_out;
3049 
3050 	/*
3051 	 * Skip chunk allocation if the bg is SYSTEM, to avoid a system chunk
3052 	 * allocation storm exhausting the system chunk array.  Otherwise
3053 	 * we still want to try our best to mark the block group read-only.
3054 	 */
3055 	if (!do_chunk_alloc && ret == -ENOSPC &&
3056 	    (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
3057 		goto unlock_out;
3058 
3059 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
3060 	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3061 	if (ret < 0)
3062 		goto out;
3063 	/*
3064 	 * We have allocated a new chunk. We also need to activate that chunk to
3065 	 * grant metadata tickets for zoned filesystem.
3066 	 */
3067 	ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
3068 	if (ret < 0)
3069 		goto out;
3070 
3071 	ret = inc_block_group_ro(cache, 0);
3072 	if (ret == -ETXTBSY)
3073 		goto unlock_out;
3074 out:
3075 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3076 		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3077 		mutex_lock(&fs_info->chunk_mutex);
3078 		check_system_chunk(trans, alloc_flags);
3079 		mutex_unlock(&fs_info->chunk_mutex);
3080 	}
3081 unlock_out:
3082 	mutex_unlock(&fs_info->ro_block_group_mutex);
3083 
3084 	btrfs_end_transaction(trans);
3085 	return ret;
3086 }
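
/*
 * Hedged usage sketch (not part of the original file): callers such as scrub
 * bracket their work with the RO helpers; the error handling of the real
 * callers is elided here.
 *
 *	ret = btrfs_inc_block_group_ro(cache, true);
 *	if (ret)
 *		return ret;
 *	// ... operate on the block group while new allocations are blocked ...
 *	btrfs_dec_block_group_ro(cache);
 */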
3087 
3088 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
3089 {
3090 	struct btrfs_space_info *sinfo = cache->space_info;
3091 	u64 num_bytes;
3092 
3093 	BUG_ON(!cache->ro);
3094 
3095 	spin_lock(&sinfo->lock);
3096 	spin_lock(&cache->lock);
3097 	if (!--cache->ro) {
3098 		if (btrfs_is_zoned(cache->fs_info)) {
3099 			/* Migrate zone_unusable bytes back */
3100 			cache->zone_unusable =
3101 				(cache->alloc_offset - cache->used - cache->pinned -
3102 				 cache->reserved) +
3103 				(cache->length - cache->zone_capacity);
3104 			btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
3105 								    cache->zone_unusable);
3106 			sinfo->bytes_readonly -= cache->zone_unusable;
3107 		}
3108 		num_bytes = cache->length - cache->reserved -
3109 			    cache->pinned - cache->bytes_super -
3110 			    cache->zone_unusable - cache->used;
3111 		sinfo->bytes_readonly -= num_bytes;
3112 		list_del_init(&cache->ro_list);
3113 	}
3114 	spin_unlock(&cache->lock);
3115 	spin_unlock(&sinfo->lock);
3116 }
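
/*
 * Worked example for the zoned branch above (illustrative): a 256MiB block
 * group with zone_capacity == 256MiB, alloc_offset == 100MiB, used == 60MiB
 * and nothing pinned or reserved gets zone_unusable = (100MiB - 60MiB) +
 * (256MiB - 256MiB) = 40MiB back when it leaves read-only mode; that amount
 * is subtracted from sinfo->bytes_readonly and accounted as zone unusable
 * again.
 */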
3117 
3118 static int update_block_group_item(struct btrfs_trans_handle *trans,
3119 				   struct btrfs_path *path,
3120 				   struct btrfs_block_group *cache)
3121 {
3122 	struct btrfs_fs_info *fs_info = trans->fs_info;
3123 	int ret;
3124 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
3125 	unsigned long bi;
3126 	struct extent_buffer *leaf;
3127 	struct btrfs_block_group_item bgi;
3128 	struct btrfs_key key;
3129 	u64 old_commit_used;
3130 	u64 used;
3131 
3132 	/*
3133 	 * Block group items update can be triggered out of commit transaction
3134 	 * critical section, thus we need a consistent view of used bytes.
3135 	 * We cannot use cache->used directly outside of the spin lock, as it
3136 	 * may be changed.
3137 	 */
3138 	spin_lock(&cache->lock);
3139 	old_commit_used = cache->commit_used;
3140 	used = cache->used;
3141 	/* No change in used bytes, can safely skip it. */
3142 	if (cache->commit_used == used) {
3143 		spin_unlock(&cache->lock);
3144 		return 0;
3145 	}
3146 	cache->commit_used = used;
3147 	spin_unlock(&cache->lock);
3148 
3149 	key.objectid = cache->start;
3150 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
3151 	key.offset = cache->length;
3152 
3153 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3154 	if (ret) {
3155 		if (ret > 0)
3156 			ret = -ENOENT;
3157 		goto fail;
3158 	}
3159 
3160 	leaf = path->nodes[0];
3161 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3162 	btrfs_set_stack_block_group_used(&bgi, used);
3163 	btrfs_set_stack_block_group_chunk_objectid(&bgi,
3164 						   cache->global_root_id);
3165 	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
3166 	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
3167 	btrfs_mark_buffer_dirty(trans, leaf);
3168 fail:
3169 	btrfs_release_path(path);
3170 	/*
3171 	 * If we didn't update the block group item, we need to revert commit_used,
3172 	 * unless the block group item didn't exist yet - this is to prevent a
3173 	 * race with a concurrent insertion of the block group item, with
3174 	 * insert_block_group_item(), that happened just after we attempted to
3175 	 * update. In that case we would reset commit_used to 0 just after the
3176 	 * insertion set it to a value greater than 0 - if the block group later
3177 	 * ends up with 0 used bytes, we would incorrectly skip its update.
3178 	 */
3179 	if (ret < 0 && ret != -ENOENT) {
3180 		spin_lock(&cache->lock);
3181 		cache->commit_used = old_commit_used;
3182 		spin_unlock(&cache->lock);
3183 	}
3184 	return ret;
3185 
3186 }
3187 
3188 static int cache_save_setup(struct btrfs_block_group *block_group,
3189 			    struct btrfs_trans_handle *trans,
3190 			    struct btrfs_path *path)
3191 {
3192 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3193 	struct inode *inode = NULL;
3194 	struct extent_changeset *data_reserved = NULL;
3195 	u64 alloc_hint = 0;
3196 	int dcs = BTRFS_DC_ERROR;
3197 	u64 cache_size = 0;
3198 	int retries = 0;
3199 	int ret = 0;
3200 
3201 	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3202 		return 0;
3203 
3204 	/*
3205 	 * If this block group is smaller than 100 megs don't bother caching the
3206 	 * block group.
3207 	 */
3208 	if (block_group->length < (100 * SZ_1M)) {
3209 		spin_lock(&block_group->lock);
3210 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3211 		spin_unlock(&block_group->lock);
3212 		return 0;
3213 	}
3214 
3215 	if (TRANS_ABORTED(trans))
3216 		return 0;
3217 again:
3218 	inode = lookup_free_space_inode(block_group, path);
3219 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3220 		ret = PTR_ERR(inode);
3221 		btrfs_release_path(path);
3222 		goto out;
3223 	}
3224 
3225 	if (IS_ERR(inode)) {
3226 		BUG_ON(retries);
3227 		retries++;
3228 
3229 		if (block_group->ro)
3230 			goto out_free;
3231 
3232 		ret = create_free_space_inode(trans, block_group, path);
3233 		if (ret)
3234 			goto out_free;
3235 		goto again;
3236 	}
3237 
3238 	/*
3239 	 * We want to set the generation to 0, that way if anything goes wrong
3240 	 * from here on out we know not to trust this cache when we load up next
3241 	 * time.
3242 	 */
3243 	BTRFS_I(inode)->generation = 0;
3244 	ret = btrfs_update_inode(trans, BTRFS_I(inode));
3245 	if (ret) {
3246 		/*
3247 		 * So theoretically we could recover from this, simply set the
3248 		 * super cache generation to 0 so we know to invalidate the
3249 		 * cache, but then we'd have to keep track of the block groups
3250 		 * that fail this way so we know we _have_ to reset this cache
3251 		 * before the next commit or risk reading stale cache.  So to
3252 		 * limit our exposure to horrible edge cases let's just abort the
3253 		 * transaction; this only happens in really bad situations
3254 		 * anyway.
3255 		 */
3256 		btrfs_abort_transaction(trans, ret);
3257 		goto out_put;
3258 	}
3259 	WARN_ON(ret);
3260 
3261 	/* We've already setup this transaction, go ahead and exit */
3262 	if (block_group->cache_generation == trans->transid &&
3263 	    i_size_read(inode)) {
3264 		dcs = BTRFS_DC_SETUP;
3265 		goto out_put;
3266 	}
3267 
3268 	if (i_size_read(inode) > 0) {
3269 		ret = btrfs_check_trunc_cache_free_space(fs_info,
3270 					&fs_info->global_block_rsv);
3271 		if (ret)
3272 			goto out_put;
3273 
3274 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3275 		if (ret)
3276 			goto out_put;
3277 	}
3278 
3279 	spin_lock(&block_group->lock);
3280 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3281 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3282 		/*
3283 		 * Don't bother trying to write stuff out _if_
3284 		 * a) we're not cached,
3285 		 * b) we're mounted with the nospace_cache option,
3286 		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3287 		 */
3288 		dcs = BTRFS_DC_WRITTEN;
3289 		spin_unlock(&block_group->lock);
3290 		goto out_put;
3291 	}
3292 	spin_unlock(&block_group->lock);
3293 
3294 	/*
3295 	 * We hit an ENOSPC when setting up the cache in this transaction, so
3296 	 * just skip doing the setup; we've already cleared the cache so we're safe.
3297 	 */
3298 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3299 		ret = -ENOSPC;
3300 		goto out_put;
3301 	}
3302 
3303 	/*
3304 	 * Try to preallocate enough space based on how big the block group is.
3305 	 * Keep in mind this has to include any pinned space which could end up
3306 	 * taking up quite a bit since it's not folded into the other space
3307 	 * cache.
3308 	 */
3309 	cache_size = div_u64(block_group->length, SZ_256M);
3310 	if (!cache_size)
3311 		cache_size = 1;
3312 
3313 	cache_size *= 16;
3314 	cache_size *= fs_info->sectorsize;
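	/*
	 * Worked example of the sizing above (illustrative numbers only, not
	 * from any particular filesystem): for a 1 GiB block group with a
	 * 4 KiB sectorsize we get
	 *
	 *	cache_size = div_u64(SZ_1G, SZ_256M);	-> 4
	 *	cache_size *= 16;			-> 64
	 *	cache_size *= SZ_4K;			-> 262144 bytes (256 KiB)
	 *
	 * so roughly 256 KiB are preallocated for the free space cache file.
	 */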
3315 
3316 	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
3317 					  cache_size, false);
3318 	if (ret)
3319 		goto out_put;
3320 
3321 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3322 					      cache_size, cache_size,
3323 					      &alloc_hint);
3324 	/*
3325 	 * Our cache requires contiguous chunks so that we don't modify a bunch
3326 	 * of metadata or split extents when writing the cache out, which means
3327 	 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3328 	 * out of space conditions.  So if we hit this just skip setting up any
3329 	 * other block groups for this transaction, maybe we'll unpin enough
3330 	 * space the next time around.
3331 	 */
3332 	if (!ret)
3333 		dcs = BTRFS_DC_SETUP;
3334 	else if (ret == -ENOSPC)
3335 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3336 
3337 out_put:
3338 	iput(inode);
3339 out_free:
3340 	btrfs_release_path(path);
3341 out:
3342 	spin_lock(&block_group->lock);
3343 	if (!ret && dcs == BTRFS_DC_SETUP)
3344 		block_group->cache_generation = trans->transid;
3345 	block_group->disk_cache_state = dcs;
3346 	spin_unlock(&block_group->lock);
3347 
3348 	extent_changeset_free(data_reserved);
3349 	return ret;
3350 }
3351 
3352 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3353 {
3354 	struct btrfs_fs_info *fs_info = trans->fs_info;
3355 	struct btrfs_block_group *cache, *tmp;
3356 	struct btrfs_transaction *cur_trans = trans->transaction;
3357 	struct btrfs_path *path;
3358 
3359 	if (list_empty(&cur_trans->dirty_bgs) ||
3360 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
3361 		return 0;
3362 
3363 	path = btrfs_alloc_path();
3364 	if (!path)
3365 		return -ENOMEM;
3366 
3367 	/* Could add new block groups, use _safe just in case */
3368 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3369 				 dirty_list) {
3370 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3371 			cache_save_setup(cache, trans, path);
3372 	}
3373 
3374 	btrfs_free_path(path);
3375 	return 0;
3376 }
3377 
3378 /*
3379  * Transaction commit does final block group cache writeback during a critical
3380  * section where nothing is allowed to change the FS.  This is required in
3381  * order for the cache to actually match the block group, but can introduce a
3382  * lot of latency into the commit.
3383  *
3384  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3385  * There's a chance we'll have to redo some of it if the block group changes
3386  * again during the commit, but it greatly reduces the commit latency by
3387  * getting rid of the easy block groups while we're still allowing others to
3388  * join the commit.
3389  */
3390 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3391 {
3392 	struct btrfs_fs_info *fs_info = trans->fs_info;
3393 	struct btrfs_block_group *cache;
3394 	struct btrfs_transaction *cur_trans = trans->transaction;
3395 	int ret = 0;
3396 	int should_put;
3397 	struct btrfs_path *path = NULL;
3398 	LIST_HEAD(dirty);
3399 	struct list_head *io = &cur_trans->io_bgs;
3400 	int loops = 0;
3401 
3402 	spin_lock(&cur_trans->dirty_bgs_lock);
3403 	if (list_empty(&cur_trans->dirty_bgs)) {
3404 		spin_unlock(&cur_trans->dirty_bgs_lock);
3405 		return 0;
3406 	}
3407 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3408 	spin_unlock(&cur_trans->dirty_bgs_lock);
3409 
3410 again:
3411 	/* Make sure all the block groups on our dirty list actually exist */
3412 	btrfs_create_pending_block_groups(trans);
3413 
3414 	if (!path) {
3415 		path = btrfs_alloc_path();
3416 		if (!path) {
3417 			ret = -ENOMEM;
3418 			goto out;
3419 		}
3420 	}
3421 
3422 	/*
3423 	 * cache_write_mutex is here only to save us from balance or automatic
3424 	 * removal of empty block groups deleting this block group while we are
3425 	 * writing out the cache
3426 	 */
3427 	mutex_lock(&trans->transaction->cache_write_mutex);
3428 	while (!list_empty(&dirty)) {
3429 		bool drop_reserve = true;
3430 
3431 		cache = list_first_entry(&dirty, struct btrfs_block_group,
3432 					 dirty_list);
3433 		/*
3434 		 * This can happen if something re-dirties a block group that
3435 		 * is already under IO.  Just wait for it to finish and then do
3436 		 * it all again
3437 		 */
3438 		if (!list_empty(&cache->io_list)) {
3439 			list_del_init(&cache->io_list);
3440 			btrfs_wait_cache_io(trans, cache, path);
3441 			btrfs_put_block_group(cache);
3442 		}
3443 
3444 
3445 		/*
3446 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
3447 		 * it should update the cache_state.  Don't delete until after
3448 		 * we wait.
3449 		 *
3450 		 * Since we're not running in the commit critical section
3451 		 * we need the dirty_bgs_lock to protect from update_block_group
3452 		 */
3453 		spin_lock(&cur_trans->dirty_bgs_lock);
3454 		list_del_init(&cache->dirty_list);
3455 		spin_unlock(&cur_trans->dirty_bgs_lock);
3456 
3457 		should_put = 1;
3458 
3459 		cache_save_setup(cache, trans, path);
3460 
3461 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3462 			cache->io_ctl.inode = NULL;
3463 			ret = btrfs_write_out_cache(trans, cache, path);
3464 			if (ret == 0 && cache->io_ctl.inode) {
3465 				should_put = 0;
3466 
3467 				/*
3468 				 * The cache_write_mutex is protecting the
3469 				 * io_list; also refer to the definition of
3470 				 * btrfs_transaction::io_bgs for more details.
3471 				 */
3472 				list_add_tail(&cache->io_list, io);
3473 			} else {
3474 				/*
3475 				 * If we failed to write the cache, the
3476 				 * generation will be bad and life goes on
3477 				 */
3478 				ret = 0;
3479 			}
3480 		}
3481 		if (!ret) {
3482 			ret = update_block_group_item(trans, path, cache);
3483 			/*
3484 			 * Our block group might still be attached to the list
3485 			 * of new block groups in the transaction handle of some
3486 			 * other task (struct btrfs_trans_handle->new_bgs). This
3487 			 * means its block group item isn't yet in the extent
3488 			 * tree. If this happens ignore the error, as we will
3489 			 * try again later in the critical section of the
3490 			 * transaction commit.
3491 			 */
3492 			if (ret == -ENOENT) {
3493 				ret = 0;
3494 				spin_lock(&cur_trans->dirty_bgs_lock);
3495 				if (list_empty(&cache->dirty_list)) {
3496 					list_add_tail(&cache->dirty_list,
3497 						      &cur_trans->dirty_bgs);
3498 					btrfs_get_block_group(cache);
3499 					drop_reserve = false;
3500 				}
3501 				spin_unlock(&cur_trans->dirty_bgs_lock);
3502 			} else if (ret) {
3503 				btrfs_abort_transaction(trans, ret);
3504 			}
3505 		}
3506 
3507 		/* If it's not on the io list, we need to put the block group */
3508 		if (should_put)
3509 			btrfs_put_block_group(cache);
3510 		if (drop_reserve)
3511 			btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3512 		/*
3513 		 * Avoid blocking other tasks for too long. It might even save
3514 		 * us from writing caches for block groups that are going to be
3515 		 * removed.
3516 		 */
3517 		mutex_unlock(&trans->transaction->cache_write_mutex);
3518 		if (ret)
3519 			goto out;
3520 		mutex_lock(&trans->transaction->cache_write_mutex);
3521 	}
3522 	mutex_unlock(&trans->transaction->cache_write_mutex);
3523 
3524 	/*
3525 	 * Go through delayed refs for all the stuff we've just kicked off
3526 	 * and then loop back (just once)
3527 	 */
3528 	if (!ret)
3529 		ret = btrfs_run_delayed_refs(trans, 0);
3530 	if (!ret && loops == 0) {
3531 		loops++;
3532 		spin_lock(&cur_trans->dirty_bgs_lock);
3533 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3534 		/*
3535 		 * dirty_bgs_lock protects us from concurrent block group
3536 		 * deletes too (not just cache_write_mutex).
3537 		 */
3538 		if (!list_empty(&dirty)) {
3539 			spin_unlock(&cur_trans->dirty_bgs_lock);
3540 			goto again;
3541 		}
3542 		spin_unlock(&cur_trans->dirty_bgs_lock);
3543 	}
3544 out:
3545 	if (ret < 0) {
3546 		spin_lock(&cur_trans->dirty_bgs_lock);
3547 		list_splice_init(&dirty, &cur_trans->dirty_bgs);
3548 		spin_unlock(&cur_trans->dirty_bgs_lock);
3549 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3550 	}
3551 
3552 	btrfs_free_path(path);
3553 	return ret;
3554 }
3555 
3556 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3557 {
3558 	struct btrfs_fs_info *fs_info = trans->fs_info;
3559 	struct btrfs_block_group *cache;
3560 	struct btrfs_transaction *cur_trans = trans->transaction;
3561 	int ret = 0;
3562 	int should_put;
3563 	struct btrfs_path *path;
3564 	struct list_head *io = &cur_trans->io_bgs;
3565 
3566 	path = btrfs_alloc_path();
3567 	if (!path)
3568 		return -ENOMEM;
3569 
3570 	/*
3571 	 * Even though we are in the critical section of the transaction commit,
3572 	 * we can still have concurrent tasks adding elements to this
3573 	 * transaction's list of dirty block groups. These tasks correspond to
3574 	 * endio free space workers started when writeback finishes for a
3575 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3576 	 * allocate new block groups as a result of COWing nodes of the root
3577 	 * tree when updating the free space inode. The writeback for the space
3578 	 * caches is triggered by an earlier call to
3579 	 * btrfs_start_dirty_block_groups() and iterations of the following
3580 	 * loop.
3581 	 * Also we want to do the cache_save_setup first and then run the
3582 	 * delayed refs to make sure we have the best chance at doing this all
3583 	 * in one shot.
3584 	 */
3585 	spin_lock(&cur_trans->dirty_bgs_lock);
3586 	while (!list_empty(&cur_trans->dirty_bgs)) {
3587 		cache = list_first_entry(&cur_trans->dirty_bgs,
3588 					 struct btrfs_block_group,
3589 					 dirty_list);
3590 
3591 		/*
3592 		 * This can happen if cache_save_setup re-dirties a block group
3593 		 * that is already under IO.  Just wait for it to finish and
3594 		 * then do it all again
3595 		 */
3596 		if (!list_empty(&cache->io_list)) {
3597 			spin_unlock(&cur_trans->dirty_bgs_lock);
3598 			list_del_init(&cache->io_list);
3599 			btrfs_wait_cache_io(trans, cache, path);
3600 			btrfs_put_block_group(cache);
3601 			spin_lock(&cur_trans->dirty_bgs_lock);
3602 		}
3603 
3604 		/*
3605 		 * Don't remove from the dirty list until after we've waited on
3606 		 * any pending IO
3607 		 */
3608 		list_del_init(&cache->dirty_list);
3609 		spin_unlock(&cur_trans->dirty_bgs_lock);
3610 		should_put = 1;
3611 
3612 		cache_save_setup(cache, trans, path);
3613 
3614 		if (!ret)
3615 			ret = btrfs_run_delayed_refs(trans, U64_MAX);
3616 
3617 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3618 			cache->io_ctl.inode = NULL;
3619 			ret = btrfs_write_out_cache(trans, cache, path);
3620 			if (ret == 0 && cache->io_ctl.inode) {
3621 				should_put = 0;
3622 				list_add_tail(&cache->io_list, io);
3623 			} else {
3624 				/*
3625 				 * If we failed to write the cache, the
3626 				 * generation will be bad and life goes on
3627 				 */
3628 				ret = 0;
3629 			}
3630 		}
3631 		if (!ret) {
3632 			ret = update_block_group_item(trans, path, cache);
3633 			/*
3634 			 * One of the free space endio workers might have
3635 			 * created a new block group while updating a free space
3636 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3637 			 * and hasn't released its transaction handle yet, in
3638 			 * which case the new block group is still attached to
3639 			 * its transaction handle and its creation has not
3640 			 * finished yet (no block group item in the extent tree
3641 			 * yet, etc). If this is the case, wait for all free
3642 			 * space endio workers to finish and retry. This is a
3643 			 * very rare case so no need for a more efficient and
3644 			 * complex approach.
3645 			 */
3646 			if (ret == -ENOENT) {
3647 				wait_event(cur_trans->writer_wait,
3648 				   atomic_read(&cur_trans->num_writers) == 1);
3649 				ret = update_block_group_item(trans, path, cache);
3650 			}
3651 			if (ret)
3652 				btrfs_abort_transaction(trans, ret);
3653 		}
3654 
3655 		/* If it's not on the io list, we need to put the block group */
3656 		if (should_put)
3657 			btrfs_put_block_group(cache);
3658 		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3659 		spin_lock(&cur_trans->dirty_bgs_lock);
3660 	}
3661 	spin_unlock(&cur_trans->dirty_bgs_lock);
3662 
3663 	/*
3664 	 * Refer to the definition of the io_bgs member for details on why it's
3665 	 * safe to use it without any locking.
3666 	 */
3667 	while (!list_empty(io)) {
3668 		cache = list_first_entry(io, struct btrfs_block_group,
3669 					 io_list);
3670 		list_del_init(&cache->io_list);
3671 		btrfs_wait_cache_io(trans, cache, path);
3672 		btrfs_put_block_group(cache);
3673 	}
3674 
3675 	btrfs_free_path(path);
3676 	return ret;
3677 }
3678 
3679 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3680 			     u64 bytenr, u64 num_bytes, bool alloc)
3681 {
3682 	struct btrfs_fs_info *info = trans->fs_info;
3683 	struct btrfs_space_info *space_info;
3684 	struct btrfs_block_group *cache;
3685 	u64 old_val;
3686 	bool reclaim = false;
3687 	bool bg_already_dirty = true;
3688 	int factor;
3689 
3690 	/* Block accounting for super block */
3691 	spin_lock(&info->delalloc_root_lock);
3692 	old_val = btrfs_super_bytes_used(info->super_copy);
3693 	if (alloc)
3694 		old_val += num_bytes;
3695 	else
3696 		old_val -= num_bytes;
3697 	btrfs_set_super_bytes_used(info->super_copy, old_val);
3698 	spin_unlock(&info->delalloc_root_lock);
3699 
3700 	cache = btrfs_lookup_block_group(info, bytenr);
3701 	if (!cache)
3702 		return -ENOENT;
3703 
3704 	/* An extent can not span multiple block groups. */
3705 	ASSERT(bytenr + num_bytes <= cache->start + cache->length);
3706 
3707 	space_info = cache->space_info;
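	/*
	 * The factor below is the number of copies the block group profile
	 * stores on disk, e.g. 2 for DUP or RAID1, so disk_used below is
	 * adjusted by num_bytes * factor.
	 */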
3708 	factor = btrfs_bg_type_to_factor(cache->flags);
3709 
3710 	/*
3711 	 * If this block group has free space cache written out, we need to make
3712 	 * sure to load it if we are removing space.  This is because we need
3713 	 * the unpinning stage to actually add the space back to the block group,
3714 	 * otherwise we will leak space.
3715 	 */
3716 	if (!alloc && !btrfs_block_group_done(cache))
3717 		btrfs_cache_block_group(cache, true);
3718 
3719 	spin_lock(&space_info->lock);
3720 	spin_lock(&cache->lock);
3721 
3722 	if (btrfs_test_opt(info, SPACE_CACHE) &&
3723 	    cache->disk_cache_state < BTRFS_DC_CLEAR)
3724 		cache->disk_cache_state = BTRFS_DC_CLEAR;
3725 
3726 	old_val = cache->used;
3727 	if (alloc) {
3728 		old_val += num_bytes;
3729 		cache->used = old_val;
3730 		cache->reserved -= num_bytes;
3731 		cache->reclaim_mark = 0;
3732 		space_info->bytes_reserved -= num_bytes;
3733 		space_info->bytes_used += num_bytes;
3734 		space_info->disk_used += num_bytes * factor;
3735 		if (READ_ONCE(space_info->periodic_reclaim))
3736 			btrfs_space_info_update_reclaimable(space_info, -num_bytes);
3737 		spin_unlock(&cache->lock);
3738 		spin_unlock(&space_info->lock);
3739 	} else {
3740 		old_val -= num_bytes;
3741 		cache->used = old_val;
3742 		cache->pinned += num_bytes;
3743 		btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
3744 		space_info->bytes_used -= num_bytes;
3745 		space_info->disk_used -= num_bytes * factor;
3746 		if (READ_ONCE(space_info->periodic_reclaim))
3747 			btrfs_space_info_update_reclaimable(space_info, num_bytes);
3748 		else
3749 			reclaim = should_reclaim_block_group(cache, num_bytes);
3750 
3751 		spin_unlock(&cache->lock);
3752 		spin_unlock(&space_info->lock);
3753 
3754 		set_extent_bit(&trans->transaction->pinned_extents, bytenr,
3755 			       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
3756 	}
3757 
3758 	spin_lock(&trans->transaction->dirty_bgs_lock);
3759 	if (list_empty(&cache->dirty_list)) {
3760 		list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
3761 		bg_already_dirty = false;
3762 		btrfs_get_block_group(cache);
3763 	}
3764 	spin_unlock(&trans->transaction->dirty_bgs_lock);
3765 
3766 	/*
3767 	 * No longer have used bytes in this block group, queue it for deletion.
3768 	 * We do this after adding the block group to the dirty list to avoid
3769 	 * races between cleaner kthread and space cache writeout.
3770 	 */
3771 	if (!alloc && old_val == 0) {
3772 		if (!btrfs_test_opt(info, DISCARD_ASYNC))
3773 			btrfs_mark_bg_unused(cache);
3774 	} else if (!alloc && reclaim) {
3775 		btrfs_mark_bg_to_reclaim(cache);
3776 	}
3777 
3778 	btrfs_put_block_group(cache);
3779 
3780 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
3781 	if (!bg_already_dirty)
3782 		btrfs_inc_delayed_refs_rsv_bg_updates(info);
3783 
3784 	return 0;
3785 }
3786 
3787 /*
3788  * Update the block_group and space info counters.
3789  *
3790  * @cache:	The cache we are manipulating
3791  * @ram_bytes:  The number of bytes of file content, which will be the same
3792  *              as @num_bytes except for the compression path.
3793  * @num_bytes:	The number of bytes in question
3794  * @delalloc:   Whether the blocks are allocated for a delalloc write
3795  *
3796  * This is called by the allocator when it reserves space. If the block group
3797  * has become read-only we cannot make the reservation and return -EAGAIN;
3798  * otherwise this function always succeeds.
3799  */
3800 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3801 			     u64 ram_bytes, u64 num_bytes, int delalloc,
3802 			     bool force_wrong_size_class)
3803 {
3804 	struct btrfs_space_info *space_info = cache->space_info;
3805 	enum btrfs_block_group_size_class size_class;
3806 	int ret = 0;
3807 
3808 	spin_lock(&space_info->lock);
3809 	spin_lock(&cache->lock);
3810 	if (cache->ro) {
3811 		ret = -EAGAIN;
3812 		goto out;
3813 	}
3814 
3815 	if (btrfs_block_group_should_use_size_class(cache)) {
3816 		size_class = btrfs_calc_block_group_size_class(num_bytes);
3817 		ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3818 		if (ret)
3819 			goto out;
3820 	}
3821 	cache->reserved += num_bytes;
3822 	space_info->bytes_reserved += num_bytes;
3823 	trace_btrfs_space_reservation(cache->fs_info, "space_info",
3824 				      space_info->flags, num_bytes, 1);
3825 	btrfs_space_info_update_bytes_may_use(cache->fs_info,
3826 					      space_info, -ram_bytes);
3827 	if (delalloc)
3828 		cache->delalloc_bytes += num_bytes;
3829 
3830 	/*
3831 	 * Compression can use less space than we reserved, so wake tickets if
3832 	 * that happens.
3833 	 */
3834 	if (num_bytes < ram_bytes)
3835 		btrfs_try_granting_tickets(cache->fs_info, space_info);
3836 out:
3837 	spin_unlock(&cache->lock);
3838 	spin_unlock(&space_info->lock);
3839 	return ret;
3840 }
3841 
3842 /*
3843  * Update the block_group and space info counters.
3844  *
3845  * @cache:      The cache we are manipulating
3846  * @num_bytes:  The number of bytes in question
3847  * @delalloc:   Whether the blocks are allocated for a delalloc write
3848  *
3849  * This is called by somebody who is freeing space that was never actually used
3850  * on disk.  For example, if you reserve some space for a new leaf in transaction
3851  * A and before transaction A commits you free that leaf, you call this to
3852  * clear the reservation.
3853  */
3854 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3855 			       u64 num_bytes, int delalloc)
3856 {
3857 	struct btrfs_space_info *space_info = cache->space_info;
3858 
3859 	spin_lock(&space_info->lock);
3860 	spin_lock(&cache->lock);
3861 	if (cache->ro)
3862 		space_info->bytes_readonly += num_bytes;
3863 	else if (btrfs_is_zoned(cache->fs_info))
3864 		space_info->bytes_zone_unusable += num_bytes;
3865 	cache->reserved -= num_bytes;
3866 	space_info->bytes_reserved -= num_bytes;
3867 	space_info->max_extent_size = 0;
3868 
3869 	if (delalloc)
3870 		cache->delalloc_bytes -= num_bytes;
3871 	spin_unlock(&cache->lock);
3872 
3873 	btrfs_try_granting_tickets(cache->fs_info, space_info);
3874 	spin_unlock(&space_info->lock);
3875 }
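/*
 * Illustrative pairing only (a sketch, not taken from any real caller): space
 * reserved with btrfs_add_reserved_bytes() that ends up never being written
 * must be returned with btrfs_free_reserved_bytes(), e.g.:
 *
 *	ret = btrfs_add_reserved_bytes(cache, num_bytes, num_bytes, 0, false);
 *	...
 *	if (allocation_was_abandoned)
 *		btrfs_free_reserved_bytes(cache, num_bytes, 0);
 *
 * where 'allocation_was_abandoned' is a hypothetical condition standing in
 * for the real callers' logic.
 */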
3876 
3877 static void force_metadata_allocation(struct btrfs_fs_info *info)
3878 {
3879 	struct list_head *head = &info->space_info;
3880 	struct btrfs_space_info *found;
3881 
3882 	list_for_each_entry(found, head, list) {
3883 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3884 			found->force_alloc = CHUNK_ALLOC_FORCE;
3885 	}
3886 }
3887 
3888 static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
3889 			      const struct btrfs_space_info *sinfo, int force)
3890 {
3891 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
3892 	u64 thresh;
3893 
3894 	if (force == CHUNK_ALLOC_FORCE)
3895 		return 1;
3896 
3897 	/*
3898 	 * In limited mode, we want to have some free space, up to
3899 	 * about 1% of the FS size.
3900 	 */
3901 	if (force == CHUNK_ALLOC_LIMITED) {
3902 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
3903 		thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3904 
3905 		if (sinfo->total_bytes - bytes_used < thresh)
3906 			return 1;
3907 	}
3908 
3909 	if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3910 		return 0;
3911 	return 1;
3912 }
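/*
 * Worked example for the thresholds above (illustrative numbers only): with
 * sinfo->total_bytes at 1 GiB, CHUNK_ALLOC_LIMITED triggers an allocation once
 * (total_bytes - bytes_used) drops below max(64 MiB, 1% of the superblock's
 * total bytes), while the default path only allocates once bytes_used + 2 MiB
 * crosses 80% of total_bytes, i.e. roughly 819 MiB used.
 */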
3913 
3914 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3915 {
3916 	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3917 
3918 	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3919 }
3920 
3921 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3922 {
3923 	struct btrfs_block_group *bg;
3924 	int ret;
3925 
3926 	/*
3927 	 * Check if we have enough space in the system space info because we
3928 	 * will need to update device items in the chunk btree and insert a new
3929 	 * chunk item in the chunk btree as well. This will allocate a new
3930 	 * system block group if needed.
3931 	 */
3932 	check_system_chunk(trans, flags);
3933 
3934 	bg = btrfs_create_chunk(trans, flags);
3935 	if (IS_ERR(bg)) {
3936 		ret = PTR_ERR(bg);
3937 		goto out;
3938 	}
3939 
3940 	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3941 	/*
3942 	 * Normally we are not expected to fail with -ENOSPC here, since we have
3943 	 * previously reserved space in the system space_info and allocated one
3944 	 * new system chunk if necessary. However there are three exceptions:
3945 	 *
3946 	 * 1) We may have enough free space in the system space_info but all the
3947 	 *    existing system block groups have a profile which can not be used
3948 	 *    for extent allocation.
3949 	 *
3950 	 *    This happens when mounting in degraded mode. For example we have a
3951 	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
3952 	 *    using the other device in degraded mode. If we then allocate a chunk,
3953 	 *    we may have enough free space in the existing system space_info, but
3954 	 *    none of the block groups can be used for extent allocation since they
3955 	 *    have a RAID1 profile, and because we are in degraded mode with a
3956 	 *    single device, we are forced to allocate a new system chunk with a
3957 	 *    SINGLE profile. Making check_system_chunk() iterate over all system
3958 	 *    block groups and check if they have a usable profile and enough space
3959 	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
3960 	 *    try again after forcing allocation of a new system chunk. Like this
3961 	 *    we avoid paying the cost of that search in normal circumstances, when
3962 	 *    we were not mounted in degraded mode;
3963 	 *
3964 	 * 2) We had enough free space in the system space_info, and one suitable
3965 	 *    block group to allocate from when we called check_system_chunk()
3966 	 *    above. However right after we called it, the only system block group
3967 	 *    with enough free space got turned into RO mode by a running scrub,
3968 	 *    and in this case we have to allocate a new one and retry. We only
3969 	 *    need to do this allocation and retry once, since we have a transaction
3970 	 *    handle and scrub uses the commit root to search for block groups;
3971 	 *
3972 	 * 3) We had one system block group with enough free space when we called
3973 	 *    check_system_chunk(), but after that, right before we tried to
3974 	 *    allocate the last extent buffer we needed, a discard operation came
3975 	 *    in and it temporarily removed the last free space entry from the
3976 	 *    block group (discard removes a free space entry, discards it, and
3977 	 *    then adds back the entry to the block group cache).
3978 	 */
3979 	if (ret == -ENOSPC) {
3980 		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3981 		struct btrfs_block_group *sys_bg;
3982 
3983 		sys_bg = btrfs_create_chunk(trans, sys_flags);
3984 		if (IS_ERR(sys_bg)) {
3985 			ret = PTR_ERR(sys_bg);
3986 			btrfs_abort_transaction(trans, ret);
3987 			goto out;
3988 		}
3989 
3990 		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3991 		if (ret) {
3992 			btrfs_abort_transaction(trans, ret);
3993 			goto out;
3994 		}
3995 
3996 		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3997 		if (ret) {
3998 			btrfs_abort_transaction(trans, ret);
3999 			goto out;
4000 		}
4001 	} else if (ret) {
4002 		btrfs_abort_transaction(trans, ret);
4003 		goto out;
4004 	}
4005 out:
4006 	btrfs_trans_release_chunk_metadata(trans);
4007 
4008 	if (ret)
4009 		return ERR_PTR(ret);
4010 
4011 	btrfs_get_block_group(bg);
4012 	return bg;
4013 }
4014 
4015 /*
4016  * Chunk allocation is done in 2 phases:
4017  *
4018  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
4019  *    the chunk, the chunk mapping, create its block group and add the items
4020  *    that belong in the chunk btree to it - more specifically, we need to
4021  *    update device items in the chunk btree and add a new chunk item to it.
4022  *
4023  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
4024  *    group item to the extent btree and the device extent items to the devices
4025  *    btree.
4026  *
4027  * This is done to prevent deadlocks. For example when COWing a node from the
4028  * extent btree we are holding a write lock on the node's parent and if we
4029  * trigger chunk allocation and attempt to insert the new block group item
4030  * in the extent btree right away, we could deadlock because the path for the
4031  * insertion can include that parent node. At first glance it seems impossible
4032  * to trigger chunk allocation after starting a transaction since tasks should
4033  * reserve enough transaction units (metadata space), however while that is true
4034  * most of the time, chunk allocation may still be triggered for several reasons:
4035  *
4036  * 1) When reserving metadata, we check if there is enough free space in the
4037  *    metadata space_info and therefore don't trigger allocation of a new chunk.
4038  *    However later when the task actually tries to COW an extent buffer from
4039  *    the extent btree or from the device btree for example, it is forced to
4040  *    allocate a new block group (chunk) because the only one that had enough
4041  *    free space was just turned to RO mode by a running scrub for example (or
4042  *    device replace, block group reclaim thread, etc), so we can not use it
4043  *    for allocating an extent and end up being forced to allocate a new one;
4044  *
4045  * 2) Because we only check that the metadata space_info has enough free bytes,
4046  *    we end up not allocating a new metadata chunk in that case. However if
4047  *    the filesystem was mounted in degraded mode, none of the existing block
4048  *    groups might be suitable for extent allocation due to their incompatible
4049  *    profile (e.g. mounting a 2-device filesystem, where all block groups
4050  *    use a RAID1 profile, in degraded mode using a single device). In this case
4051  *    when the task attempts to COW some extent buffer of the extent btree for
4052  *    example, it will trigger allocation of a new metadata block group with a
4053  *    suitable profile (SINGLE profile in the example of the degraded mount of
4054  *    the RAID1 filesystem);
4055  *
4056  * 3) The task has reserved enough transaction units / metadata space, but when
4057  *    it attempts to COW an extent buffer from the extent or device btree for
4058  *    example, it does not find any free extent in any metadata block group,
4059  *    therefore forced to try to allocate a new metadata block group.
4060  *    This is because some other task allocated all available extents in the
4061  *    meantime - this typically happens with tasks that don't reserve space
4062  *    properly, either intentionally or as a bug. One example where this is
4063  *    done intentionally is fsync, as it does not reserve any transaction units
4064  *    and ends up allocating a variable number of metadata extents for log
4065  *    tree extent buffers;
4066  *
4067  * 4) The task has reserved enough transaction units / metadata space, but right
4068  *    before it tries to allocate the last extent buffer it needs, a discard
4069  *    operation comes in and, temporarily, removes the last free space entry from
4070  *    the only metadata block group that had free space (discard starts by
4071  *    removing a free space entry from a block group, then does the discard
4072  *    operation and, once it's done, it adds back the free space entry to the
4073  *    block group).
4074  *
4075  * We also need this two-phase setup when adding a device to a filesystem with
4076  * a seed device - we must create new metadata and system chunks without adding
4077  * any of the block group items to the chunk, extent and device btrees. If we
4078  * did not do it this way, we would get ENOSPC when attempting to update those
4079  * btrees, since all the chunks from the seed device are read-only.
4080  *
4081  * Phase 1 does the updates and insertions to the chunk btree because if we had
4082  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
4083  * parallel, we risk having too many system chunks allocated by many tasks if
4084  * many tasks reach phase 1 without the previous ones completing phase 2. In the
4085  * extreme case this leads to exhaustion of the system chunk array in the
4086  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
4087  * and with RAID filesystems (so we have more device items in the chunk btree).
4088  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
4089  * the system chunk array due to concurrent allocations") provides more details.
4090  *
4091  * Allocation of system chunks does not happen through this function. A task that
4092  * needs to update the chunk btree (the only btree that uses system chunks), must
4093  * preallocate chunk space by calling either check_system_chunk() or
4094  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
4095  * metadata chunk or when removing a chunk, while the latter is used before doing
4096  * a modification to the chunk btree - use cases for the latter are adding,
4097  * removing and resizing a device as well as relocation of a system chunk.
4098  * See the comment below for more details.
4099  *
4100  * The reservation of system space, done through check_system_chunk(), as well
4101  * as all the updates and insertions into the chunk btree must be done while
4102  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
4103  * an extent buffer from the chunks btree we never trigger allocation of a new
4104  * system chunk, which would result in a deadlock (trying to lock twice an
4105  * extent buffer of the chunk btree, first time before triggering the chunk
4106  * allocation and the second time during chunk allocation while attempting to
4107  * update the chunks btree). The system chunk array is also updated while holding
4108  * that mutex. The same logic applies to removing chunks - we must reserve system
4109  * space, update the chunk btree and the system chunk array in the superblock
4110  * while holding fs_info->chunk_mutex.
4111  *
4112  * This function, btrfs_chunk_alloc(), belongs to phase 1.
4113  *
4114  * If @force is CHUNK_ALLOC_FORCE:
4115  *    - return 1 if it successfully allocates a chunk,
4116  *    - return errors including -ENOSPC otherwise.
4117  * If @force is NOT CHUNK_ALLOC_FORCE:
4118  *    - return 0 if it doesn't need to allocate a new chunk,
4119  *    - return 1 if it successfully allocates a chunk,
4120  *    - return errors including -ENOSPC otherwise.
4121  */
4122 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4123 		      enum btrfs_chunk_alloc_enum force)
4124 {
4125 	struct btrfs_fs_info *fs_info = trans->fs_info;
4126 	struct btrfs_space_info *space_info;
4127 	struct btrfs_block_group *ret_bg;
4128 	bool wait_for_alloc = false;
4129 	bool should_alloc = false;
4130 	bool from_extent_allocation = false;
4131 	int ret = 0;
4132 
4133 	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
4134 		from_extent_allocation = true;
4135 		force = CHUNK_ALLOC_FORCE;
4136 	}
4137 
4138 	/* Don't re-enter if we're already allocating a chunk */
4139 	if (trans->allocating_chunk)
4140 		return -ENOSPC;
4141 	/*
4142 	 * Allocation of system chunks can not happen through this path, as we
4143 	 * could end up in a deadlock if we are allocating a data or metadata
4144 	 * chunk and there is another task modifying the chunk btree.
4145 	 *
4146 	 * This is because while we are holding the chunk mutex, we will attempt
4147 	 * to add the new chunk item to the chunk btree or update an existing
4148 	 * device item in the chunk btree, while the other task that is modifying
4149 	 * the chunk btree is attempting to COW an extent buffer while holding a
4150 	 * lock on it and on its parent - if the COW operation triggers a system
4151 	 * chunk allocation, then we can deadlock because we are holding the
4152 	 * chunk mutex and we may need to access that extent buffer or its parent
4153 	 * in order to add the chunk item or update a device item.
4154 	 *
4155 	 * Tasks that want to modify the chunk tree should reserve system space
4156 	 * before updating the chunk btree, by calling either
4157 	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
4158 	 * It's possible that after a task reserves the space, it still ends up
4159 	 * here - this happens in the cases described above at do_chunk_alloc().
4160 	 * The task will have to either retry or fail.
4161 	 */
4162 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4163 		return -ENOSPC;
4164 
4165 	space_info = btrfs_find_space_info(fs_info, flags);
4166 	ASSERT(space_info);
4167 
4168 	do {
4169 		spin_lock(&space_info->lock);
4170 		if (force < space_info->force_alloc)
4171 			force = space_info->force_alloc;
4172 		should_alloc = should_alloc_chunk(fs_info, space_info, force);
4173 		if (space_info->full) {
4174 			/* No more free physical space */
4175 			if (should_alloc)
4176 				ret = -ENOSPC;
4177 			else
4178 				ret = 0;
4179 			spin_unlock(&space_info->lock);
4180 			return ret;
4181 		} else if (!should_alloc) {
4182 			spin_unlock(&space_info->lock);
4183 			return 0;
4184 		} else if (space_info->chunk_alloc) {
4185 			/*
4186 			 * Someone is already allocating, so we need to block
4187 			 * until this someone is finished and then loop to
4188 			 * recheck if we should continue with our allocation
4189 			 * attempt.
4190 			 */
4191 			wait_for_alloc = true;
4192 			force = CHUNK_ALLOC_NO_FORCE;
4193 			spin_unlock(&space_info->lock);
4194 			mutex_lock(&fs_info->chunk_mutex);
4195 			mutex_unlock(&fs_info->chunk_mutex);
4196 		} else {
4197 			/* Proceed with allocation */
4198 			space_info->chunk_alloc = 1;
4199 			wait_for_alloc = false;
4200 			spin_unlock(&space_info->lock);
4201 		}
4202 
4203 		cond_resched();
4204 	} while (wait_for_alloc);
4205 
4206 	mutex_lock(&fs_info->chunk_mutex);
4207 	trans->allocating_chunk = true;
4208 
4209 	/*
4210 	 * If we have mixed data/metadata chunks we want to make sure we keep
4211 	 * allocating mixed chunks instead of individual chunks.
4212 	 */
4213 	if (btrfs_mixed_space_info(space_info))
4214 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4215 
4216 	/*
4217 	 * if we're doing a data chunk, go ahead and make sure that
4218 	 * we keep a reasonable number of metadata chunks allocated in the
4219 	 * FS as well.
4220 	 */
4221 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4222 		fs_info->data_chunk_allocations++;
4223 		if (!(fs_info->data_chunk_allocations %
4224 		      fs_info->metadata_ratio))
4225 			force_metadata_allocation(fs_info);
4226 	}
4227 
4228 	ret_bg = do_chunk_alloc(trans, flags);
4229 	trans->allocating_chunk = false;
4230 
4231 	if (IS_ERR(ret_bg)) {
4232 		ret = PTR_ERR(ret_bg);
4233 	} else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
4234 		/*
4235 		 * New block group is likely to be used soon. Try to activate
4236 		 * it now. Failure is OK for now.
4237 		 */
4238 		btrfs_zone_activate(ret_bg);
4239 	}
4240 
4241 	if (!ret)
4242 		btrfs_put_block_group(ret_bg);
4243 
4244 	spin_lock(&space_info->lock);
4245 	if (ret < 0) {
4246 		if (ret == -ENOSPC)
4247 			space_info->full = 1;
4248 		else
4249 			goto out;
4250 	} else {
4251 		ret = 1;
4252 		space_info->max_extent_size = 0;
4253 	}
4254 
4255 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4256 out:
4257 	space_info->chunk_alloc = 0;
4258 	spin_unlock(&space_info->lock);
4259 	mutex_unlock(&fs_info->chunk_mutex);
4260 
4261 	return ret;
4262 }
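/*
 * Minimal caller sketch (illustrative only, mirroring the pattern used earlier
 * in this file when marking a block group read-only): force-allocate a chunk
 * for a space_info's current profile from an existing transaction handle,
 * where 'fs_info', 'space_info' and 'trans' are assumed to be in scope:
 *
 *	u64 alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
 *	int ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 *
 *	if (ret < 0)
 *		return ret;
 *	// With CHUNK_ALLOC_FORCE, ret == 1 means a chunk was allocated.
 */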
4263 
4264 static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
4265 {
4266 	u64 num_dev;
4267 
4268 	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4269 	if (!num_dev)
4270 		num_dev = fs_info->fs_devices->rw_devices;
4271 
4272 	return num_dev;
4273 }
4274 
4275 static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4276 				u64 bytes,
4277 				u64 type)
4278 {
4279 	struct btrfs_fs_info *fs_info = trans->fs_info;
4280 	struct btrfs_space_info *info;
4281 	u64 left;
4282 	int ret = 0;
4283 
4284 	/*
4285 	 * Needed because we can end up allocating a system chunk and we need an
4286 	 * atomic and race-free space reservation in the chunk block reserve.
4287 	 */
4288 	lockdep_assert_held(&fs_info->chunk_mutex);
4289 
4290 	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4291 	spin_lock(&info->lock);
4292 	left = info->total_bytes - btrfs_space_info_used(info, true);
4293 	spin_unlock(&info->lock);
4294 
4295 	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4296 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4297 			   left, bytes, type);
4298 		btrfs_dump_space_info(fs_info, info, 0, 0);
4299 	}
4300 
4301 	if (left < bytes) {
4302 		u64 flags = btrfs_system_alloc_profile(fs_info);
4303 		struct btrfs_block_group *bg;
4304 
4305 		/*
4306 		 * Ignore failure to create system chunk. We might end up not
4307 		 * needing it, as we might not need to COW all nodes/leafs from
4308 		 * the paths we visit in the chunk tree (they were already COWed
4309 		 * or created in the current transaction for example).
4310 		 */
4311 		bg = btrfs_create_chunk(trans, flags);
4312 		if (IS_ERR(bg)) {
4313 			ret = PTR_ERR(bg);
4314 		} else {
4315 			/*
4316 			 * We have a new chunk. We also need to activate it on a
4317 			 * zoned filesystem.
4318 			 */
4319 			ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
4320 			if (ret < 0)
4321 				return;
4322 
4323 			/*
4324 			 * If we fail to add the chunk item here, we end up
4325 			 * trying again at phase 2 of chunk allocation, at
4326 			 * btrfs_create_pending_block_groups(). So ignore
4327 			 * any error here. An ENOSPC here could happen, due to
4328 			 * the cases described at do_chunk_alloc() - the system
4329 			 * block group we just created was just turned into RO
4330 			 * mode by a scrub for example, or a running discard
4331 			 * temporarily removed its free space entries, etc.
4332 			 */
4333 			btrfs_chunk_alloc_add_chunk_item(trans, bg);
4334 		}
4335 	}
4336 
4337 	if (!ret) {
4338 		ret = btrfs_block_rsv_add(fs_info,
4339 					  &fs_info->chunk_block_rsv,
4340 					  bytes, BTRFS_RESERVE_NO_FLUSH);
4341 		if (!ret)
4342 			trans->chunk_bytes_reserved += bytes;
4343 	}
4344 }
4345 
4346 /*
4347  * Reserve space in the system space for allocating or removing a chunk.
4348  * The caller must be holding fs_info->chunk_mutex.
4349  */
4350 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4351 {
4352 	struct btrfs_fs_info *fs_info = trans->fs_info;
4353 	const u64 num_devs = get_profile_num_devs(fs_info, type);
4354 	u64 bytes;
4355 
4356 	/* num_devs device items to update and 1 chunk item to add or remove. */
4357 	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4358 		btrfs_calc_insert_metadata_size(fs_info, 1);
4359 
4360 	reserve_chunk_space(trans, bytes, type);
4361 }
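/*
 * Example of the sizing above (illustrative, assuming a RAID1 profile whose
 * devs_max is 2): we reserve enough metadata space for updating 2 device items
 * plus inserting 1 chunk item in the chunk btree. For striped profiles with a
 * devs_max of 0, get_profile_num_devs() falls back to the number of rw devices
 * instead.
 */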
4362 
4363 /*
4364  * Reserve space in the system space, if needed, for doing a modification to the
4365  * chunk btree.
4366  *
4367  * @trans:		A transaction handle.
4368  * @is_item_insertion:	Indicates whether the modification is for inserting a new item
4369  *			in the chunk btree or if it's for the deletion or update
4370  *			of an existing item.
4371  *
4372  * This is used in a context where we need to update the chunk btree outside
4373  * block group allocation and removal, to avoid a deadlock with a concurrent
4374  * task that is allocating a metadata or data block group and therefore needs to
4375  * update the chunk btree while holding the chunk mutex. After the update to the
4376  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4377  *
4378  */
4379 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4380 				  bool is_item_insertion)
4381 {
4382 	struct btrfs_fs_info *fs_info = trans->fs_info;
4383 	u64 bytes;
4384 
4385 	if (is_item_insertion)
4386 		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4387 	else
4388 		bytes = btrfs_calc_metadata_size(fs_info, 1);
4389 
4390 	mutex_lock(&fs_info->chunk_mutex);
4391 	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4392 	mutex_unlock(&fs_info->chunk_mutex);
4393 }
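/*
 * Minimal usage sketch (illustrative only), following the rule stated above
 * that btrfs_trans_release_chunk_metadata() is called once the chunk btree
 * modification is done:
 *
 *	btrfs_reserve_chunk_metadata(trans, true);
 *	// ... insert the new item in the chunk btree ...
 *	btrfs_trans_release_chunk_metadata(trans);
 */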
4394 
4395 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4396 {
4397 	struct btrfs_block_group *block_group;
4398 
4399 	block_group = btrfs_lookup_first_block_group(info, 0);
4400 	while (block_group) {
4401 		btrfs_wait_block_group_cache_done(block_group);
4402 		spin_lock(&block_group->lock);
4403 		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4404 				       &block_group->runtime_flags)) {
4405 			struct btrfs_inode *inode = block_group->inode;
4406 
4407 			block_group->inode = NULL;
4408 			spin_unlock(&block_group->lock);
4409 
4410 			ASSERT(block_group->io_ctl.inode == NULL);
4411 			iput(&inode->vfs_inode);
4412 		} else {
4413 			spin_unlock(&block_group->lock);
4414 		}
4415 		block_group = btrfs_next_block_group(block_group);
4416 	}
4417 }
4418 
4419 /*
4420  * Must be called only after stopping all workers, since we could have block
4421  * group caching kthreads running, and therefore they could race with us if we
4422  * freed the block groups before stopping them.
4423  */
4424 int btrfs_free_block_groups(struct btrfs_fs_info *info)
4425 {
4426 	struct btrfs_block_group *block_group;
4427 	struct btrfs_space_info *space_info;
4428 	struct btrfs_caching_control *caching_ctl;
4429 	struct rb_node *n;
4430 
4431 	if (btrfs_is_zoned(info)) {
4432 		if (info->active_meta_bg) {
4433 			btrfs_put_block_group(info->active_meta_bg);
4434 			info->active_meta_bg = NULL;
4435 		}
4436 		if (info->active_system_bg) {
4437 			btrfs_put_block_group(info->active_system_bg);
4438 			info->active_system_bg = NULL;
4439 		}
4440 	}
4441 
4442 	write_lock(&info->block_group_cache_lock);
4443 	while (!list_empty(&info->caching_block_groups)) {
4444 		caching_ctl = list_entry(info->caching_block_groups.next,
4445 					 struct btrfs_caching_control, list);
4446 		list_del(&caching_ctl->list);
4447 		btrfs_put_caching_control(caching_ctl);
4448 	}
4449 	write_unlock(&info->block_group_cache_lock);
4450 
4451 	spin_lock(&info->unused_bgs_lock);
4452 	while (!list_empty(&info->unused_bgs)) {
4453 		block_group = list_first_entry(&info->unused_bgs,
4454 					       struct btrfs_block_group,
4455 					       bg_list);
4456 		list_del_init(&block_group->bg_list);
4457 		btrfs_put_block_group(block_group);
4458 	}
4459 
4460 	while (!list_empty(&info->reclaim_bgs)) {
4461 		block_group = list_first_entry(&info->reclaim_bgs,
4462 					       struct btrfs_block_group,
4463 					       bg_list);
4464 		list_del_init(&block_group->bg_list);
4465 		btrfs_put_block_group(block_group);
4466 	}
4467 	spin_unlock(&info->unused_bgs_lock);
4468 
4469 	spin_lock(&info->zone_active_bgs_lock);
4470 	while (!list_empty(&info->zone_active_bgs)) {
4471 		block_group = list_first_entry(&info->zone_active_bgs,
4472 					       struct btrfs_block_group,
4473 					       active_bg_list);
4474 		list_del_init(&block_group->active_bg_list);
4475 		btrfs_put_block_group(block_group);
4476 	}
4477 	spin_unlock(&info->zone_active_bgs_lock);
4478 
4479 	write_lock(&info->block_group_cache_lock);
4480 	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4481 		block_group = rb_entry(n, struct btrfs_block_group,
4482 				       cache_node);
4483 		rb_erase_cached(&block_group->cache_node,
4484 				&info->block_group_cache_tree);
4485 		RB_CLEAR_NODE(&block_group->cache_node);
4486 		write_unlock(&info->block_group_cache_lock);
4487 
4488 		down_write(&block_group->space_info->groups_sem);
4489 		list_del(&block_group->list);
4490 		up_write(&block_group->space_info->groups_sem);
4491 
4492 		/*
4493 		 * We haven't cached this block group, which means we could
4494 		 * possibly have excluded extents on this block group.
4495 		 */
4496 		if (block_group->cached == BTRFS_CACHE_NO ||
4497 		    block_group->cached == BTRFS_CACHE_ERROR)
4498 			btrfs_free_excluded_extents(block_group);
4499 
4500 		btrfs_remove_free_space_cache(block_group);
4501 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4502 		ASSERT(list_empty(&block_group->dirty_list));
4503 		ASSERT(list_empty(&block_group->io_list));
4504 		ASSERT(list_empty(&block_group->bg_list));
4505 		ASSERT(refcount_read(&block_group->refs) == 1);
4506 		ASSERT(block_group->swap_extents == 0);
4507 		btrfs_put_block_group(block_group);
4508 
4509 		write_lock(&info->block_group_cache_lock);
4510 	}
4511 	write_unlock(&info->block_group_cache_lock);
4512 
4513 	btrfs_release_global_block_rsv(info);
4514 
4515 	while (!list_empty(&info->space_info)) {
4516 		space_info = list_entry(info->space_info.next,
4517 					struct btrfs_space_info,
4518 					list);
4519 
4520 		/*
4521 		 * Do not hide this behind enospc_debug, this is actually
4522 		 * important and indicates a real bug if this happens.
4523 		 */
4524 		if (WARN_ON(space_info->bytes_pinned > 0 ||
4525 			    space_info->bytes_may_use > 0))
4526 			btrfs_dump_space_info(info, space_info, 0, 0);
4527 
4528 		/*
4529 		 * If there was a failure to cleanup a log tree, very likely due
4530 		 * to an IO failure on a writeback attempt of one or more of its
4531 		 * extent buffers, we could not do proper (and cheap) unaccounting
4532 		 * of their reserved space, so don't warn on bytes_reserved > 0 in
4533 		 * that case.
4534 		 */
4535 		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4536 		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4537 			if (WARN_ON(space_info->bytes_reserved > 0))
4538 				btrfs_dump_space_info(info, space_info, 0, 0);
4539 		}
4540 
4541 		WARN_ON(space_info->reclaim_size > 0);
4542 		list_del(&space_info->list);
4543 		btrfs_sysfs_remove_space_info(space_info);
4544 	}
4545 	return 0;
4546 }
4547 
4548 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4549 {
4550 	atomic_inc(&cache->frozen);
4551 }
4552 
4553 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4554 {
4555 	struct btrfs_fs_info *fs_info = block_group->fs_info;
4556 	bool cleanup;
4557 
4558 	spin_lock(&block_group->lock);
4559 	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4560 		   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4561 	spin_unlock(&block_group->lock);
4562 
4563 	if (cleanup) {
4564 		struct btrfs_chunk_map *map;
4565 
4566 		map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
4567 		/* Logic error, can't happen. */
4568 		ASSERT(map);
4569 
4570 		btrfs_remove_chunk_map(fs_info, map);
4571 
4572 		/* Once for our lookup reference. */
4573 		btrfs_free_chunk_map(map);
4574 
4575 		/*
4576 		 * We may have left one free space entry, and other tasks
4577 		 * trimming this block group may have left one entry each.
4578 		 * Free them if any.
4579 		 */
4580 		btrfs_remove_free_space_cache(block_group);
4581 	}
4582 }
4583 
4584 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4585 {
4586 	bool ret = true;
4587 
4588 	spin_lock(&bg->lock);
4589 	if (bg->ro)
4590 		ret = false;
4591 	else
4592 		bg->swap_extents++;
4593 	spin_unlock(&bg->lock);
4594 
4595 	return ret;
4596 }
4597 
4598 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4599 {
4600 	spin_lock(&bg->lock);
4601 	ASSERT(!bg->ro);
4602 	ASSERT(bg->swap_extents >= amount);
4603 	bg->swap_extents -= amount;
4604 	spin_unlock(&bg->lock);
4605 }
4606 
4607 enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4608 {
4609 	if (size <= SZ_128K)
4610 		return BTRFS_BG_SZ_SMALL;
4611 	if (size <= SZ_8M)
4612 		return BTRFS_BG_SZ_MEDIUM;
4613 	return BTRFS_BG_SZ_LARGE;
4614 }
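/*
 * For illustration, per the SZ_128K and SZ_8M cutoffs above: a 64 KiB
 * allocation maps to BTRFS_BG_SZ_SMALL, a 1 MiB allocation to
 * BTRFS_BG_SZ_MEDIUM and a 16 MiB allocation to BTRFS_BG_SZ_LARGE.
 */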
4615 
4616 /*
4617  * Handle a block group allocating an extent in a size class
4618  *
4619  * @bg:				The block group we allocated in.
4620  * @size_class:			The size class of the allocation.
4621  * @force_wrong_size_class:	Whether we are desperate enough to allow
4622  *				mismatched size classes.
4623  *
4624  * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4625  * case of a race that leads to the wrong size class without
4626  * force_wrong_size_class set.
4627  *
4628  * find_free_extent will skip block groups with a mismatched size class until
4629  * it really needs to avoid ENOSPC. In that case it will set
4630  * force_wrong_size_class. However, if a block group is newly allocated and
4631  * doesn't yet have a size class, then it is possible for two allocations of
4632  * different sizes to race and both try to use it. The loser is caught here and
4633  * has to retry.
4634  */
4635 int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4636 				     enum btrfs_block_group_size_class size_class,
4637 				     bool force_wrong_size_class)
4638 {
4639 	ASSERT(size_class != BTRFS_BG_SZ_NONE);
4640 
4641 	/* The new allocation is in the right size class, do nothing */
4642 	if (bg->size_class == size_class)
4643 		return 0;
4644 	/*
4645 	 * The new allocation is in a mismatched size class.
4646 	 * This means one of two things:
4647 	 *
4648 	 * 1. Two tasks in find_free_extent for different size_classes raced
4649 	 *    and hit the same empty block_group. Make the loser try again.
4650 	 * 2. A call to find_free_extent got desperate enough to set
4651 	 *    'force_wrong_size_class'. Don't change the size_class, but allow the
4652 	 *    allocation.
4653 	 */
4654 	if (bg->size_class != BTRFS_BG_SZ_NONE) {
4655 		if (force_wrong_size_class)
4656 			return 0;
4657 		return -EAGAIN;
4658 	}
4659 	/*
4660 	 * The happy new block group case: the new allocation is the first
4661 	 * one in the block_group so we set size_class.
4662 	 */
4663 	bg->size_class = size_class;
4664 
4665 	return 0;
4666 }
4667 
4668 bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
4669 {
4670 	if (btrfs_is_zoned(bg->fs_info))
4671 		return false;
4672 	if (!btrfs_is_block_group_data_only(bg))
4673 		return false;
4674 	return true;
4675 }
4676