• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "misc.h"
4 #include "ctree.h"
5 #include "block-group.h"
6 #include "space-info.h"
7 #include "disk-io.h"
8 #include "free-space-cache.h"
9 #include "free-space-tree.h"
10 #include "disk-io.h"
11 #include "volumes.h"
12 #include "transaction.h"
13 #include "ref-verify.h"
14 #include "sysfs.h"
15 #include "tree-log.h"
16 #include "delalloc-space.h"
17 
18 /*
19  * Return target flags in extended format or 0 if restripe for this chunk_type
20  * is not in progress
21  *
22  * Should be called with balance_lock held
23  */
get_restripe_target(struct btrfs_fs_info * fs_info,u64 flags)24 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
25 {
26 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
27 	u64 target = 0;
28 
29 	if (!bctl)
30 		return 0;
31 
32 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
33 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
34 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
35 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
36 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
37 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
38 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
39 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
40 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
41 	}
42 
43 	return target;
44 }
45 
46 /*
47  * @flags: available profiles in extended format (see ctree.h)
48  *
49  * Return reduced profile in chunk format.  If profile changing is in progress
50  * (either running or paused) picks the target profile (if it's already
51  * available), otherwise falls back to plain reducing.
52  */
btrfs_reduce_alloc_profile(struct btrfs_fs_info * fs_info,u64 flags)53 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
54 {
55 	u64 num_devices = fs_info->fs_devices->rw_devices;
56 	u64 target;
57 	u64 raid_type;
58 	u64 allowed = 0;
59 
60 	/*
61 	 * See if restripe for this chunk_type is in progress, if so try to
62 	 * reduce to the target profile
63 	 */
64 	spin_lock(&fs_info->balance_lock);
65 	target = get_restripe_target(fs_info, flags);
66 	if (target) {
67 		/* Pick target profile only if it's already available */
68 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
69 			spin_unlock(&fs_info->balance_lock);
70 			return extended_to_chunk(target);
71 		}
72 	}
73 	spin_unlock(&fs_info->balance_lock);
74 
75 	/* First, mask out the RAID levels which aren't possible */
76 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
77 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
78 			allowed |= btrfs_raid_array[raid_type].bg_flag;
79 	}
80 	allowed &= flags;
81 
82 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
83 		allowed = BTRFS_BLOCK_GROUP_RAID6;
84 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
85 		allowed = BTRFS_BLOCK_GROUP_RAID5;
86 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
87 		allowed = BTRFS_BLOCK_GROUP_RAID10;
88 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
89 		allowed = BTRFS_BLOCK_GROUP_RAID1;
90 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
91 		allowed = BTRFS_BLOCK_GROUP_RAID0;
92 
93 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
94 
95 	return extended_to_chunk(flags | allowed);
96 }
97 
/*
 * Combine @orig_flags with the currently available allocation bits of its
 * block group type (DATA, SYSTEM or METADATA, checked in that order) and
 * reduce the result to a single profile.
 *
 * The avail_*_alloc_bits are sampled under the profiles_lock seqlock and the
 * read is retried if a writer raced with us.
 */
static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 avail;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);
		avail = 0;

		if (orig_flags & BTRFS_BLOCK_GROUP_DATA)
			avail = fs_info->avail_data_alloc_bits;
		else if (orig_flags & BTRFS_BLOCK_GROUP_SYSTEM)
			avail = fs_info->avail_system_alloc_bits;
		else if (orig_flags & BTRFS_BLOCK_GROUP_METADATA)
			avail = fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, orig_flags | avail);
}
117 
/*
 * Exported wrapper around get_alloc_profile(): returns the reduced allocation
 * profile for @orig_flags.
 */
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	const u64 profile = get_alloc_profile(fs_info, orig_flags);

	return profile;
}
122 
btrfs_get_block_group(struct btrfs_block_group_cache * cache)123 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
124 {
125 	atomic_inc(&cache->count);
126 }
127 
btrfs_put_block_group(struct btrfs_block_group_cache * cache)128 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
129 {
130 	if (atomic_dec_and_test(&cache->count)) {
131 		WARN_ON(cache->pinned > 0);
132 		WARN_ON(cache->reserved > 0);
133 
134 		/*
135 		 * If not empty, someone is still holding mutex of
136 		 * full_stripe_lock, which can only be released by caller.
137 		 * And it will definitely cause use-after-free when caller
138 		 * tries to release full stripe lock.
139 		 *
140 		 * No better way to resolve, but only to warn.
141 		 */
142 		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
143 		kfree(cache->free_space_ctl);
144 		kfree(cache);
145 	}
146 }
147 
148 /*
149  * This adds the block group to the fs_info rb tree for the block group cache
150  */
btrfs_add_block_group_cache(struct btrfs_fs_info * info,struct btrfs_block_group_cache * block_group)151 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
152 				struct btrfs_block_group_cache *block_group)
153 {
154 	struct rb_node **p;
155 	struct rb_node *parent = NULL;
156 	struct btrfs_block_group_cache *cache;
157 
158 	spin_lock(&info->block_group_cache_lock);
159 	p = &info->block_group_cache_tree.rb_node;
160 
161 	while (*p) {
162 		parent = *p;
163 		cache = rb_entry(parent, struct btrfs_block_group_cache,
164 				 cache_node);
165 		if (block_group->key.objectid < cache->key.objectid) {
166 			p = &(*p)->rb_left;
167 		} else if (block_group->key.objectid > cache->key.objectid) {
168 			p = &(*p)->rb_right;
169 		} else {
170 			spin_unlock(&info->block_group_cache_lock);
171 			return -EEXIST;
172 		}
173 	}
174 
175 	rb_link_node(&block_group->cache_node, parent, p);
176 	rb_insert_color(&block_group->cache_node,
177 			&info->block_group_cache_tree);
178 
179 	if (info->first_logical_byte > block_group->key.objectid)
180 		info->first_logical_byte = block_group->key.objectid;
181 
182 	spin_unlock(&info->block_group_cache_lock);
183 
184 	return 0;
185 }
186 
187 /*
188  * This will return the block group at or after bytenr if contains is 0, else
189  * it will return the block group that contains the bytenr
190  */
block_group_cache_tree_search(struct btrfs_fs_info * info,u64 bytenr,int contains)191 static struct btrfs_block_group_cache *block_group_cache_tree_search(
192 		struct btrfs_fs_info *info, u64 bytenr, int contains)
193 {
194 	struct btrfs_block_group_cache *cache, *ret = NULL;
195 	struct rb_node *n;
196 	u64 end, start;
197 
198 	spin_lock(&info->block_group_cache_lock);
199 	n = info->block_group_cache_tree.rb_node;
200 
201 	while (n) {
202 		cache = rb_entry(n, struct btrfs_block_group_cache,
203 				 cache_node);
204 		end = cache->key.objectid + cache->key.offset - 1;
205 		start = cache->key.objectid;
206 
207 		if (bytenr < start) {
208 			if (!contains && (!ret || start < ret->key.objectid))
209 				ret = cache;
210 			n = n->rb_left;
211 		} else if (bytenr > start) {
212 			if (contains && bytenr <= end) {
213 				ret = cache;
214 				break;
215 			}
216 			n = n->rb_right;
217 		} else {
218 			ret = cache;
219 			break;
220 		}
221 	}
222 	if (ret) {
223 		btrfs_get_block_group(ret);
224 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
225 			info->first_logical_byte = ret->key.objectid;
226 	}
227 	spin_unlock(&info->block_group_cache_lock);
228 
229 	return ret;
230 }
231 
232 /*
233  * Return the block group that starts at or after bytenr
234  */
btrfs_lookup_first_block_group(struct btrfs_fs_info * info,u64 bytenr)235 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
236 		struct btrfs_fs_info *info, u64 bytenr)
237 {
238 	return block_group_cache_tree_search(info, bytenr, 0);
239 }
240 
241 /*
242  * Return the block group that contains the given bytenr
243  */
btrfs_lookup_block_group(struct btrfs_fs_info * info,u64 bytenr)244 struct btrfs_block_group_cache *btrfs_lookup_block_group(
245 		struct btrfs_fs_info *info, u64 bytenr)
246 {
247 	return block_group_cache_tree_search(info, bytenr, 1);
248 }
249 
btrfs_next_block_group(struct btrfs_block_group_cache * cache)250 struct btrfs_block_group_cache *btrfs_next_block_group(
251 		struct btrfs_block_group_cache *cache)
252 {
253 	struct btrfs_fs_info *fs_info = cache->fs_info;
254 	struct rb_node *node;
255 
256 	spin_lock(&fs_info->block_group_cache_lock);
257 
258 	/* If our block group was removed, we need a full search. */
259 	if (RB_EMPTY_NODE(&cache->cache_node)) {
260 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
261 
262 		spin_unlock(&fs_info->block_group_cache_lock);
263 		btrfs_put_block_group(cache);
264 		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
265 	}
266 	node = rb_next(&cache->cache_node);
267 	btrfs_put_block_group(cache);
268 	if (node) {
269 		cache = rb_entry(node, struct btrfs_block_group_cache,
270 				 cache_node);
271 		btrfs_get_block_group(cache);
272 	} else
273 		cache = NULL;
274 	spin_unlock(&fs_info->block_group_cache_lock);
275 	return cache;
276 }
277 
btrfs_inc_nocow_writers(struct btrfs_fs_info * fs_info,u64 bytenr)278 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
279 {
280 	struct btrfs_block_group_cache *bg;
281 	bool ret = true;
282 
283 	bg = btrfs_lookup_block_group(fs_info, bytenr);
284 	if (!bg)
285 		return false;
286 
287 	spin_lock(&bg->lock);
288 	if (bg->ro)
289 		ret = false;
290 	else
291 		atomic_inc(&bg->nocow_writers);
292 	spin_unlock(&bg->lock);
293 
294 	/* No put on block group, done by btrfs_dec_nocow_writers */
295 	if (!ret)
296 		btrfs_put_block_group(bg);
297 
298 	return ret;
299 }
300 
/*
 * Undo a successful btrfs_inc_nocow_writers() for the block group that
 * contains @bytenr and wake up waiters once the counter reaches zero.
 */
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *bg =
		btrfs_lookup_block_group(fs_info, bytenr);

	ASSERT(bg);

	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* Drop the reference taken by the lookup above... */
	btrfs_put_block_group(bg);
	/* ...and the one kept by the earlier btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}
316 
btrfs_wait_nocow_writers(struct btrfs_block_group_cache * bg)317 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
318 {
319 	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
320 }
321 
btrfs_dec_block_group_reservations(struct btrfs_fs_info * fs_info,const u64 start)322 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
323 					const u64 start)
324 {
325 	struct btrfs_block_group_cache *bg;
326 
327 	bg = btrfs_lookup_block_group(fs_info, start);
328 	ASSERT(bg);
329 	if (atomic_dec_and_test(&bg->reservations))
330 		wake_up_var(&bg->reservations);
331 	btrfs_put_block_group(bg);
332 }
333 
btrfs_wait_block_group_reservations(struct btrfs_block_group_cache * bg)334 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
335 {
336 	struct btrfs_space_info *space_info = bg->space_info;
337 
338 	ASSERT(bg->ro);
339 
340 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
341 		return;
342 
343 	/*
344 	 * Our block group is read only but before we set it to read only,
345 	 * some task might have had allocated an extent from it already, but it
346 	 * has not yet created a respective ordered extent (and added it to a
347 	 * root's list of ordered extents).
348 	 * Therefore wait for any task currently allocating extents, since the
349 	 * block group's reservations counter is incremented while a read lock
350 	 * on the groups' semaphore is held and decremented after releasing
351 	 * the read access on that semaphore and creating the ordered extent.
352 	 */
353 	down_write(&space_info->groups_sem);
354 	up_write(&space_info->groups_sem);
355 
356 	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
357 }
358 
btrfs_get_caching_control(struct btrfs_block_group_cache * cache)359 struct btrfs_caching_control *btrfs_get_caching_control(
360 		struct btrfs_block_group_cache *cache)
361 {
362 	struct btrfs_caching_control *ctl;
363 
364 	spin_lock(&cache->lock);
365 	if (!cache->caching_ctl) {
366 		spin_unlock(&cache->lock);
367 		return NULL;
368 	}
369 
370 	ctl = cache->caching_ctl;
371 	refcount_inc(&ctl->count);
372 	spin_unlock(&cache->lock);
373 	return ctl;
374 }
375 
btrfs_put_caching_control(struct btrfs_caching_control * ctl)376 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
377 {
378 	if (refcount_dec_and_test(&ctl->count))
379 		kfree(ctl);
380 }
381 
382 /*
383  * When we wait for progress in the block group caching, its because our
384  * allocation attempt failed at least once.  So, we must sleep and let some
385  * progress happen before we try again.
386  *
387  * This function will sleep at least once waiting for new free space to show
388  * up, and then it will check the block group free space numbers for our min
389  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
390  * a free extent of a given size, but this is a good start.
391  *
392  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
393  * any of the information in this block group.
394  */
btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache * cache,u64 num_bytes)395 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
396 					   u64 num_bytes)
397 {
398 	struct btrfs_caching_control *caching_ctl;
399 
400 	caching_ctl = btrfs_get_caching_control(cache);
401 	if (!caching_ctl)
402 		return;
403 
404 	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
405 		   (cache->free_space_ctl->free_space >= num_bytes));
406 
407 	btrfs_put_caching_control(caching_ctl);
408 }
409 
btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache * cache)410 int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
411 {
412 	struct btrfs_caching_control *caching_ctl;
413 	int ret = 0;
414 
415 	caching_ctl = btrfs_get_caching_control(cache);
416 	if (!caching_ctl)
417 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
418 
419 	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
420 	if (cache->cached == BTRFS_CACHE_ERROR)
421 		ret = -EIO;
422 	btrfs_put_caching_control(caching_ctl);
423 	return ret;
424 }
425 
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper: walk the block group in steps of two units and remove the
 * free space of the first unit of each step.  The unit is nodesize for
 * block groups with the METADATA flag and sectorsize otherwise.
 */
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 unit = (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ?
		fs_info->nodesize : fs_info->sectorsize;
	const u64 stride = unit << 1;
	u64 pos = block_group->key.objectid;
	u64 remaining = block_group->key.offset;

	while (remaining > unit) {
		btrfs_remove_free_space(block_group, pos, unit);
		pos += stride;
		/* Clamp at zero instead of letting the u64 wrap around. */
		remaining = (remaining < stride) ? 0 : remaining - stride;
	}
}
#endif
446 
447 /*
448  * This is only called by btrfs_cache_block_group, since we could have freed
449  * extents we need to check the pinned_extents for any extents that can't be
450  * used yet since their free space will be released as soon as the transaction
451  * commits.
452  */
add_new_free_space(struct btrfs_block_group_cache * block_group,u64 start,u64 end)453 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
454 		       u64 start, u64 end)
455 {
456 	struct btrfs_fs_info *info = block_group->fs_info;
457 	u64 extent_start, extent_end, size, total_added = 0;
458 	int ret;
459 
460 	while (start < end) {
461 		ret = find_first_extent_bit(info->pinned_extents, start,
462 					    &extent_start, &extent_end,
463 					    EXTENT_DIRTY | EXTENT_UPTODATE,
464 					    NULL);
465 		if (ret)
466 			break;
467 
468 		if (extent_start <= start) {
469 			start = extent_end + 1;
470 		} else if (extent_start > start && extent_start < end) {
471 			size = extent_start - start;
472 			total_added += size;
473 			ret = btrfs_add_free_space(block_group, start,
474 						   size);
475 			BUG_ON(ret); /* -ENOMEM or logic error */
476 			start = extent_end + 1;
477 		} else {
478 			break;
479 		}
480 	}
481 
482 	if (start < end) {
483 		size = end - start;
484 		total_added += size;
485 		ret = btrfs_add_free_space(block_group, start, size);
486 		BUG_ON(ret); /* -ENOMEM or logic error */
487 	}
488 
489 	return total_added;
490 }
491 
load_extent_tree_free(struct btrfs_caching_control * caching_ctl)492 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
493 {
494 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
495 	struct btrfs_fs_info *fs_info = block_group->fs_info;
496 	struct btrfs_root *extent_root = fs_info->extent_root;
497 	struct btrfs_path *path;
498 	struct extent_buffer *leaf;
499 	struct btrfs_key key;
500 	u64 total_found = 0;
501 	u64 last = 0;
502 	u32 nritems;
503 	int ret;
504 	bool wakeup = true;
505 
506 	path = btrfs_alloc_path();
507 	if (!path)
508 		return -ENOMEM;
509 
510 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
511 
512 #ifdef CONFIG_BTRFS_DEBUG
513 	/*
514 	 * If we're fragmenting we don't want to make anybody think we can
515 	 * allocate from this block group until we've had a chance to fragment
516 	 * the free space.
517 	 */
518 	if (btrfs_should_fragment_free_space(block_group))
519 		wakeup = false;
520 #endif
521 	/*
522 	 * We don't want to deadlock with somebody trying to allocate a new
523 	 * extent for the extent root while also trying to search the extent
524 	 * root to add free space.  So we skip locking and search the commit
525 	 * root, since its read-only
526 	 */
527 	path->skip_locking = 1;
528 	path->search_commit_root = 1;
529 	path->reada = READA_FORWARD;
530 
531 	key.objectid = last;
532 	key.offset = 0;
533 	key.type = BTRFS_EXTENT_ITEM_KEY;
534 
535 next:
536 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
537 	if (ret < 0)
538 		goto out;
539 
540 	leaf = path->nodes[0];
541 	nritems = btrfs_header_nritems(leaf);
542 
543 	while (1) {
544 		if (btrfs_fs_closing(fs_info) > 1) {
545 			last = (u64)-1;
546 			break;
547 		}
548 
549 		if (path->slots[0] < nritems) {
550 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
551 		} else {
552 			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
553 			if (ret)
554 				break;
555 
556 			if (need_resched() ||
557 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
558 				if (wakeup)
559 					caching_ctl->progress = last;
560 				btrfs_release_path(path);
561 				up_read(&fs_info->commit_root_sem);
562 				mutex_unlock(&caching_ctl->mutex);
563 				cond_resched();
564 				mutex_lock(&caching_ctl->mutex);
565 				down_read(&fs_info->commit_root_sem);
566 				goto next;
567 			}
568 
569 			ret = btrfs_next_leaf(extent_root, path);
570 			if (ret < 0)
571 				goto out;
572 			if (ret)
573 				break;
574 			leaf = path->nodes[0];
575 			nritems = btrfs_header_nritems(leaf);
576 			continue;
577 		}
578 
579 		if (key.objectid < last) {
580 			key.objectid = last;
581 			key.offset = 0;
582 			key.type = BTRFS_EXTENT_ITEM_KEY;
583 
584 			if (wakeup)
585 				caching_ctl->progress = last;
586 			btrfs_release_path(path);
587 			goto next;
588 		}
589 
590 		if (key.objectid < block_group->key.objectid) {
591 			path->slots[0]++;
592 			continue;
593 		}
594 
595 		if (key.objectid >= block_group->key.objectid +
596 		    block_group->key.offset)
597 			break;
598 
599 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
600 		    key.type == BTRFS_METADATA_ITEM_KEY) {
601 			total_found += add_new_free_space(block_group, last,
602 							  key.objectid);
603 			if (key.type == BTRFS_METADATA_ITEM_KEY)
604 				last = key.objectid +
605 					fs_info->nodesize;
606 			else
607 				last = key.objectid + key.offset;
608 
609 			if (total_found > CACHING_CTL_WAKE_UP) {
610 				total_found = 0;
611 				if (wakeup)
612 					wake_up(&caching_ctl->wait);
613 			}
614 		}
615 		path->slots[0]++;
616 	}
617 	ret = 0;
618 
619 	total_found += add_new_free_space(block_group, last,
620 					  block_group->key.objectid +
621 					  block_group->key.offset);
622 	caching_ctl->progress = (u64)-1;
623 
624 out:
625 	btrfs_free_path(path);
626 	return ret;
627 }
628 
caching_thread(struct btrfs_work * work)629 static noinline void caching_thread(struct btrfs_work *work)
630 {
631 	struct btrfs_block_group_cache *block_group;
632 	struct btrfs_fs_info *fs_info;
633 	struct btrfs_caching_control *caching_ctl;
634 	int ret;
635 
636 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
637 	block_group = caching_ctl->block_group;
638 	fs_info = block_group->fs_info;
639 
640 	mutex_lock(&caching_ctl->mutex);
641 	down_read(&fs_info->commit_root_sem);
642 
643 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
644 		ret = load_free_space_tree(caching_ctl);
645 	else
646 		ret = load_extent_tree_free(caching_ctl);
647 
648 	spin_lock(&block_group->lock);
649 	block_group->caching_ctl = NULL;
650 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
651 	spin_unlock(&block_group->lock);
652 
653 #ifdef CONFIG_BTRFS_DEBUG
654 	if (btrfs_should_fragment_free_space(block_group)) {
655 		u64 bytes_used;
656 
657 		spin_lock(&block_group->space_info->lock);
658 		spin_lock(&block_group->lock);
659 		bytes_used = block_group->key.offset -
660 			btrfs_block_group_used(&block_group->item);
661 		block_group->space_info->bytes_used += bytes_used >> 1;
662 		spin_unlock(&block_group->lock);
663 		spin_unlock(&block_group->space_info->lock);
664 		fragment_free_space(block_group);
665 	}
666 #endif
667 
668 	caching_ctl->progress = (u64)-1;
669 
670 	up_read(&fs_info->commit_root_sem);
671 	btrfs_free_excluded_extents(block_group);
672 	mutex_unlock(&caching_ctl->mutex);
673 
674 	wake_up(&caching_ctl->wait);
675 
676 	btrfs_put_caching_control(caching_ctl);
677 	btrfs_put_block_group(block_group);
678 }
679 
btrfs_cache_block_group(struct btrfs_block_group_cache * cache,int load_cache_only)680 int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
681 			    int load_cache_only)
682 {
683 	DEFINE_WAIT(wait);
684 	struct btrfs_fs_info *fs_info = cache->fs_info;
685 	struct btrfs_caching_control *caching_ctl;
686 	int ret = 0;
687 
688 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
689 	if (!caching_ctl)
690 		return -ENOMEM;
691 
692 	INIT_LIST_HEAD(&caching_ctl->list);
693 	mutex_init(&caching_ctl->mutex);
694 	init_waitqueue_head(&caching_ctl->wait);
695 	caching_ctl->block_group = cache;
696 	caching_ctl->progress = cache->key.objectid;
697 	refcount_set(&caching_ctl->count, 1);
698 	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
699 
700 	spin_lock(&cache->lock);
701 	/*
702 	 * This should be a rare occasion, but this could happen I think in the
703 	 * case where one thread starts to load the space cache info, and then
704 	 * some other thread starts a transaction commit which tries to do an
705 	 * allocation while the other thread is still loading the space cache
706 	 * info.  The previous loop should have kept us from choosing this block
707 	 * group, but if we've moved to the state where we will wait on caching
708 	 * block groups we need to first check if we're doing a fast load here,
709 	 * so we can wait for it to finish, otherwise we could end up allocating
710 	 * from a block group who's cache gets evicted for one reason or
711 	 * another.
712 	 */
713 	while (cache->cached == BTRFS_CACHE_FAST) {
714 		struct btrfs_caching_control *ctl;
715 
716 		ctl = cache->caching_ctl;
717 		refcount_inc(&ctl->count);
718 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
719 		spin_unlock(&cache->lock);
720 
721 		schedule();
722 
723 		finish_wait(&ctl->wait, &wait);
724 		btrfs_put_caching_control(ctl);
725 		spin_lock(&cache->lock);
726 	}
727 
728 	if (cache->cached != BTRFS_CACHE_NO) {
729 		spin_unlock(&cache->lock);
730 		kfree(caching_ctl);
731 		return 0;
732 	}
733 	WARN_ON(cache->caching_ctl);
734 	cache->caching_ctl = caching_ctl;
735 	cache->cached = BTRFS_CACHE_FAST;
736 	spin_unlock(&cache->lock);
737 
738 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
739 		mutex_lock(&caching_ctl->mutex);
740 		ret = load_free_space_cache(cache);
741 
742 		spin_lock(&cache->lock);
743 		if (ret == 1) {
744 			cache->caching_ctl = NULL;
745 			cache->cached = BTRFS_CACHE_FINISHED;
746 			cache->last_byte_to_unpin = (u64)-1;
747 			caching_ctl->progress = (u64)-1;
748 		} else {
749 			if (load_cache_only) {
750 				cache->caching_ctl = NULL;
751 				cache->cached = BTRFS_CACHE_NO;
752 			} else {
753 				cache->cached = BTRFS_CACHE_STARTED;
754 				cache->has_caching_ctl = 1;
755 			}
756 		}
757 		spin_unlock(&cache->lock);
758 #ifdef CONFIG_BTRFS_DEBUG
759 		if (ret == 1 &&
760 		    btrfs_should_fragment_free_space(cache)) {
761 			u64 bytes_used;
762 
763 			spin_lock(&cache->space_info->lock);
764 			spin_lock(&cache->lock);
765 			bytes_used = cache->key.offset -
766 				btrfs_block_group_used(&cache->item);
767 			cache->space_info->bytes_used += bytes_used >> 1;
768 			spin_unlock(&cache->lock);
769 			spin_unlock(&cache->space_info->lock);
770 			fragment_free_space(cache);
771 		}
772 #endif
773 		mutex_unlock(&caching_ctl->mutex);
774 
775 		wake_up(&caching_ctl->wait);
776 		if (ret == 1) {
777 			btrfs_put_caching_control(caching_ctl);
778 			btrfs_free_excluded_extents(cache);
779 			return 0;
780 		}
781 	} else {
782 		/*
783 		 * We're either using the free space tree or no caching at all.
784 		 * Set cached to the appropriate value and wakeup any waiters.
785 		 */
786 		spin_lock(&cache->lock);
787 		if (load_cache_only) {
788 			cache->caching_ctl = NULL;
789 			cache->cached = BTRFS_CACHE_NO;
790 		} else {
791 			cache->cached = BTRFS_CACHE_STARTED;
792 			cache->has_caching_ctl = 1;
793 		}
794 		spin_unlock(&cache->lock);
795 		wake_up(&caching_ctl->wait);
796 	}
797 
798 	if (load_cache_only) {
799 		btrfs_put_caching_control(caching_ctl);
800 		return 0;
801 	}
802 
803 	down_write(&fs_info->commit_root_sem);
804 	refcount_inc(&caching_ctl->count);
805 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
806 	up_write(&fs_info->commit_root_sem);
807 
808 	btrfs_get_block_group(cache);
809 
810 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
811 
812 	return ret;
813 }
814 
/*
 * Clear the extended profile bits of @flags from the avail_*_alloc_bits of
 * every block group type (DATA, METADATA, SYSTEM) that is set in @flags.
 * The update is done under the profiles_lock seqlock.
 */
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	const u64 profile_bits = chunk_to_extended(flags) &
				 BTRFS_EXTENDED_PROFILE_MASK;
	const u64 keep_mask = ~profile_bits;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= keep_mask;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= keep_mask;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= keep_mask;
	write_sequnlock(&fs_info->profiles_lock);
}
829 
830 /*
831  * Clear incompat bits for the following feature(s):
832  *
833  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
834  *            in the whole filesystem
835  */
clear_incompat_bg_bits(struct btrfs_fs_info * fs_info,u64 flags)836 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
837 {
838 	if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
839 		struct list_head *head = &fs_info->space_info;
840 		struct btrfs_space_info *sinfo;
841 
842 		list_for_each_entry_rcu(sinfo, head, list) {
843 			bool found = false;
844 
845 			down_read(&sinfo->groups_sem);
846 			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
847 				found = true;
848 			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
849 				found = true;
850 			up_read(&sinfo->groups_sem);
851 
852 			if (found)
853 				return;
854 		}
855 		btrfs_clear_fs_incompat(fs_info, RAID56);
856 	}
857 }
858 
btrfs_remove_block_group(struct btrfs_trans_handle * trans,u64 group_start,struct extent_map * em)859 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
860 			     u64 group_start, struct extent_map *em)
861 {
862 	struct btrfs_fs_info *fs_info = trans->fs_info;
863 	struct btrfs_root *root = fs_info->extent_root;
864 	struct btrfs_path *path;
865 	struct btrfs_block_group_cache *block_group;
866 	struct btrfs_free_cluster *cluster;
867 	struct btrfs_root *tree_root = fs_info->tree_root;
868 	struct btrfs_key key;
869 	struct inode *inode;
870 	struct kobject *kobj = NULL;
871 	int ret;
872 	int index;
873 	int factor;
874 	struct btrfs_caching_control *caching_ctl = NULL;
875 	bool remove_em;
876 	bool remove_rsv = false;
877 
878 	block_group = btrfs_lookup_block_group(fs_info, group_start);
879 	BUG_ON(!block_group);
880 	BUG_ON(!block_group->ro);
881 
882 	trace_btrfs_remove_block_group(block_group);
883 	/*
884 	 * Free the reserved super bytes from this block group before
885 	 * remove it.
886 	 */
887 	btrfs_free_excluded_extents(block_group);
888 	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
889 				  block_group->key.offset);
890 
891 	memcpy(&key, &block_group->key, sizeof(key));
892 	index = btrfs_bg_flags_to_raid_index(block_group->flags);
893 	factor = btrfs_bg_type_to_factor(block_group->flags);
894 
895 	/* make sure this block group isn't part of an allocation cluster */
896 	cluster = &fs_info->data_alloc_cluster;
897 	spin_lock(&cluster->refill_lock);
898 	btrfs_return_cluster_to_free_space(block_group, cluster);
899 	spin_unlock(&cluster->refill_lock);
900 
901 	/*
902 	 * make sure this block group isn't part of a metadata
903 	 * allocation cluster
904 	 */
905 	cluster = &fs_info->meta_alloc_cluster;
906 	spin_lock(&cluster->refill_lock);
907 	btrfs_return_cluster_to_free_space(block_group, cluster);
908 	spin_unlock(&cluster->refill_lock);
909 
910 	path = btrfs_alloc_path();
911 	if (!path) {
912 		ret = -ENOMEM;
913 		goto out;
914 	}
915 
916 	/*
917 	 * get the inode first so any iput calls done for the io_list
918 	 * aren't the final iput (no unlinks allowed now)
919 	 */
920 	inode = lookup_free_space_inode(block_group, path);
921 
922 	mutex_lock(&trans->transaction->cache_write_mutex);
923 	/*
924 	 * Make sure our free space cache IO is done before removing the
925 	 * free space inode
926 	 */
927 	spin_lock(&trans->transaction->dirty_bgs_lock);
928 	if (!list_empty(&block_group->io_list)) {
929 		list_del_init(&block_group->io_list);
930 
931 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
932 
933 		spin_unlock(&trans->transaction->dirty_bgs_lock);
934 		btrfs_wait_cache_io(trans, block_group, path);
935 		btrfs_put_block_group(block_group);
936 		spin_lock(&trans->transaction->dirty_bgs_lock);
937 	}
938 
939 	if (!list_empty(&block_group->dirty_list)) {
940 		list_del_init(&block_group->dirty_list);
941 		remove_rsv = true;
942 		btrfs_put_block_group(block_group);
943 	}
944 	spin_unlock(&trans->transaction->dirty_bgs_lock);
945 	mutex_unlock(&trans->transaction->cache_write_mutex);
946 
947 	if (!IS_ERR(inode)) {
948 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
949 		if (ret) {
950 			btrfs_add_delayed_iput(inode);
951 			goto out;
952 		}
953 		clear_nlink(inode);
954 		/* One for the block groups ref */
955 		spin_lock(&block_group->lock);
956 		if (block_group->iref) {
957 			block_group->iref = 0;
958 			block_group->inode = NULL;
959 			spin_unlock(&block_group->lock);
960 			iput(inode);
961 		} else {
962 			spin_unlock(&block_group->lock);
963 		}
964 		/* One for our lookup ref */
965 		btrfs_add_delayed_iput(inode);
966 	}
967 
968 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
969 	key.offset = block_group->key.objectid;
970 	key.type = 0;
971 
972 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
973 	if (ret < 0)
974 		goto out;
975 	if (ret > 0)
976 		btrfs_release_path(path);
977 	if (ret == 0) {
978 		ret = btrfs_del_item(trans, tree_root, path);
979 		if (ret)
980 			goto out;
981 		btrfs_release_path(path);
982 	}
983 
984 	spin_lock(&fs_info->block_group_cache_lock);
985 	rb_erase(&block_group->cache_node,
986 		 &fs_info->block_group_cache_tree);
987 	RB_CLEAR_NODE(&block_group->cache_node);
988 
989 	if (fs_info->first_logical_byte == block_group->key.objectid)
990 		fs_info->first_logical_byte = (u64)-1;
991 	spin_unlock(&fs_info->block_group_cache_lock);
992 
993 	down_write(&block_group->space_info->groups_sem);
994 	/*
995 	 * we must use list_del_init so people can check to see if they
996 	 * are still on the list after taking the semaphore
997 	 */
998 	list_del_init(&block_group->list);
999 	if (list_empty(&block_group->space_info->block_groups[index])) {
1000 		kobj = block_group->space_info->block_group_kobjs[index];
1001 		block_group->space_info->block_group_kobjs[index] = NULL;
1002 		clear_avail_alloc_bits(fs_info, block_group->flags);
1003 	}
1004 	up_write(&block_group->space_info->groups_sem);
1005 	clear_incompat_bg_bits(fs_info, block_group->flags);
1006 	if (kobj) {
1007 		kobject_del(kobj);
1008 		kobject_put(kobj);
1009 	}
1010 
1011 	if (block_group->has_caching_ctl)
1012 		caching_ctl = btrfs_get_caching_control(block_group);
1013 	if (block_group->cached == BTRFS_CACHE_STARTED)
1014 		btrfs_wait_block_group_cache_done(block_group);
1015 	if (block_group->has_caching_ctl) {
1016 		down_write(&fs_info->commit_root_sem);
1017 		if (!caching_ctl) {
1018 			struct btrfs_caching_control *ctl;
1019 
1020 			list_for_each_entry(ctl,
1021 				    &fs_info->caching_block_groups, list)
1022 				if (ctl->block_group == block_group) {
1023 					caching_ctl = ctl;
1024 					refcount_inc(&caching_ctl->count);
1025 					break;
1026 				}
1027 		}
1028 		if (caching_ctl)
1029 			list_del_init(&caching_ctl->list);
1030 		up_write(&fs_info->commit_root_sem);
1031 		if (caching_ctl) {
1032 			/* Once for the caching bgs list and once for us. */
1033 			btrfs_put_caching_control(caching_ctl);
1034 			btrfs_put_caching_control(caching_ctl);
1035 		}
1036 	}
1037 
1038 	spin_lock(&trans->transaction->dirty_bgs_lock);
1039 	WARN_ON(!list_empty(&block_group->dirty_list));
1040 	WARN_ON(!list_empty(&block_group->io_list));
1041 	spin_unlock(&trans->transaction->dirty_bgs_lock);
1042 
1043 	btrfs_remove_free_space_cache(block_group);
1044 
1045 	spin_lock(&block_group->space_info->lock);
1046 	list_del_init(&block_group->ro_list);
1047 
1048 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1049 		WARN_ON(block_group->space_info->total_bytes
1050 			< block_group->key.offset);
1051 		WARN_ON(block_group->space_info->bytes_readonly
1052 			< block_group->key.offset);
1053 		WARN_ON(block_group->space_info->disk_total
1054 			< block_group->key.offset * factor);
1055 	}
1056 	block_group->space_info->total_bytes -= block_group->key.offset;
1057 	block_group->space_info->bytes_readonly -= block_group->key.offset;
1058 	block_group->space_info->disk_total -= block_group->key.offset * factor;
1059 
1060 	spin_unlock(&block_group->space_info->lock);
1061 
1062 	memcpy(&key, &block_group->key, sizeof(key));
1063 
1064 	mutex_lock(&fs_info->chunk_mutex);
1065 	spin_lock(&block_group->lock);
1066 	block_group->removed = 1;
1067 	/*
1068 	 * At this point trimming can't start on this block group, because we
1069 	 * removed the block group from the tree fs_info->block_group_cache_tree
1070 	 * so no one can't find it anymore and even if someone already got this
1071 	 * block group before we removed it from the rbtree, they have already
1072 	 * incremented block_group->trimming - if they didn't, they won't find
1073 	 * any free space entries because we already removed them all when we
1074 	 * called btrfs_remove_free_space_cache().
1075 	 *
1076 	 * And we must not remove the extent map from the fs_info->mapping_tree
1077 	 * to prevent the same logical address range and physical device space
1078 	 * ranges from being reused for a new block group. This is because our
1079 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1080 	 * completely transactionless, so while it is trimming a range the
1081 	 * currently running transaction might finish and a new one start,
1082 	 * allowing for new block groups to be created that can reuse the same
1083 	 * physical device locations unless we take this special care.
1084 	 *
1085 	 * There may also be an implicit trim operation if the file system
1086 	 * is mounted with -odiscard. The same protections must remain
1087 	 * in place until the extents have been discarded completely when
1088 	 * the transaction commit has completed.
1089 	 */
1090 	remove_em = (atomic_read(&block_group->trimming) == 0);
1091 	spin_unlock(&block_group->lock);
1092 
1093 	mutex_unlock(&fs_info->chunk_mutex);
1094 
1095 	ret = remove_block_group_free_space(trans, block_group);
1096 	if (ret)
1097 		goto out;
1098 
1099 	btrfs_put_block_group(block_group);
1100 	btrfs_put_block_group(block_group);
1101 
1102 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1103 	if (ret > 0)
1104 		ret = -EIO;
1105 	if (ret < 0)
1106 		goto out;
1107 
1108 	ret = btrfs_del_item(trans, root, path);
1109 	if (ret)
1110 		goto out;
1111 
1112 	if (remove_em) {
1113 		struct extent_map_tree *em_tree;
1114 
1115 		em_tree = &fs_info->mapping_tree;
1116 		write_lock(&em_tree->lock);
1117 		remove_extent_mapping(em_tree, em);
1118 		write_unlock(&em_tree->lock);
1119 		/* once for the tree */
1120 		free_extent_map(em);
1121 	}
1122 out:
1123 	if (remove_rsv)
1124 		btrfs_delayed_refs_rsv_release(fs_info, 1);
1125 	btrfs_free_path(path);
1126 	return ret;
1127 }
1128 
btrfs_start_trans_remove_block_group(struct btrfs_fs_info * fs_info,const u64 chunk_offset)1129 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1130 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1131 {
1132 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1133 	struct extent_map *em;
1134 	struct map_lookup *map;
1135 	unsigned int num_items;
1136 
1137 	read_lock(&em_tree->lock);
1138 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1139 	read_unlock(&em_tree->lock);
1140 	ASSERT(em && em->start == chunk_offset);
1141 
1142 	/*
1143 	 * We need to reserve 3 + N units from the metadata space info in order
1144 	 * to remove a block group (done at btrfs_remove_chunk() and at
1145 	 * btrfs_remove_block_group()), which are used for:
1146 	 *
1147 	 * 1 unit for adding the free space inode's orphan (located in the tree
1148 	 * of tree roots).
1149 	 * 1 unit for deleting the block group item (located in the extent
1150 	 * tree).
1151 	 * 1 unit for deleting the free space item (located in tree of tree
1152 	 * roots).
1153 	 * N units for deleting N device extent items corresponding to each
1154 	 * stripe (located in the device tree).
1155 	 *
1156 	 * In order to remove a block group we also need to reserve units in the
1157 	 * system space info in order to update the chunk tree (update one or
1158 	 * more device items and remove one chunk item), but this is done at
1159 	 * btrfs_remove_chunk() through a call to check_system_chunk().
1160 	 */
1161 	map = em->map_lookup;
1162 	num_items = 3 + map->num_stripes;
1163 	free_extent_map(em);
1164 
1165 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1166 							   num_items, 1);
1167 }
1168 
1169 /*
1170  * Mark block group @cache read-only, so later write won't happen to block
1171  * group @cache.
1172  *
1173  * If @force is not set, this function will only mark the block group readonly
1174  * if we have enough free space (1M) in other metadata/system block groups.
1175  * If @force is not set, this function will mark the block group readonly
1176  * without checking free space.
1177  *
1178  * NOTE: This function doesn't care if other block groups can contain all the
1179  * data in this block group. That check should be done by relocation routine,
1180  * not this function.
1181  */
inc_block_group_ro(struct btrfs_block_group_cache * cache,int force)1182 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
1183 {
1184 	struct btrfs_space_info *sinfo = cache->space_info;
1185 	u64 num_bytes;
1186 	u64 sinfo_used;
1187 	u64 min_allocable_bytes;
1188 	int ret = -ENOSPC;
1189 
1190 	/*
1191 	 * We need some metadata space and system metadata space for
1192 	 * allocating chunks in some corner cases until we force to set
1193 	 * it to be readonly.
1194 	 */
1195 	if ((sinfo->flags &
1196 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
1197 	    !force)
1198 		min_allocable_bytes = SZ_1M;
1199 	else
1200 		min_allocable_bytes = 0;
1201 
1202 	spin_lock(&sinfo->lock);
1203 	spin_lock(&cache->lock);
1204 
1205 	if (cache->ro) {
1206 		cache->ro++;
1207 		ret = 0;
1208 		goto out;
1209 	}
1210 
1211 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
1212 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
1213 	sinfo_used = btrfs_space_info_used(sinfo, true);
1214 
1215 	/*
1216 	 * sinfo_used + num_bytes should always <= sinfo->total_bytes.
1217 	 *
1218 	 * Here we make sure if we mark this bg RO, we still have enough
1219 	 * free space as buffer (if min_allocable_bytes is not 0).
1220 	 */
1221 	if (sinfo_used + num_bytes + min_allocable_bytes <=
1222 	    sinfo->total_bytes) {
1223 		sinfo->bytes_readonly += num_bytes;
1224 		cache->ro++;
1225 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1226 		ret = 0;
1227 	}
1228 out:
1229 	spin_unlock(&cache->lock);
1230 	spin_unlock(&sinfo->lock);
1231 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1232 		btrfs_info(cache->fs_info,
1233 			"unable to make block group %llu ro",
1234 			cache->key.objectid);
1235 		btrfs_info(cache->fs_info,
1236 			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
1237 			sinfo_used, num_bytes, min_allocable_bytes);
1238 		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1239 	}
1240 	return ret;
1241 }
1242 
1243 /*
1244  * Process the unused_bgs list and remove any that don't have any allocated
1245  * space inside of them.
1246  */
btrfs_delete_unused_bgs(struct btrfs_fs_info * fs_info)1247 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1248 {
1249 	struct btrfs_block_group_cache *block_group;
1250 	struct btrfs_space_info *space_info;
1251 	struct btrfs_trans_handle *trans;
1252 	int ret = 0;
1253 
1254 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1255 		return;
1256 
1257 	spin_lock(&fs_info->unused_bgs_lock);
1258 	while (!list_empty(&fs_info->unused_bgs)) {
1259 		u64 start, end;
1260 		int trimming;
1261 
1262 		block_group = list_first_entry(&fs_info->unused_bgs,
1263 					       struct btrfs_block_group_cache,
1264 					       bg_list);
1265 		list_del_init(&block_group->bg_list);
1266 
1267 		space_info = block_group->space_info;
1268 
1269 		if (ret || btrfs_mixed_space_info(space_info)) {
1270 			btrfs_put_block_group(block_group);
1271 			continue;
1272 		}
1273 		spin_unlock(&fs_info->unused_bgs_lock);
1274 
1275 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
1276 
1277 		/* Don't want to race with allocators so take the groups_sem */
1278 		down_write(&space_info->groups_sem);
1279 		spin_lock(&block_group->lock);
1280 		if (block_group->reserved || block_group->pinned ||
1281 		    btrfs_block_group_used(&block_group->item) ||
1282 		    block_group->ro ||
1283 		    list_is_singular(&block_group->list)) {
1284 			/*
1285 			 * We want to bail if we made new allocations or have
1286 			 * outstanding allocations in this block group.  We do
1287 			 * the ro check in case balance is currently acting on
1288 			 * this block group.
1289 			 */
1290 			trace_btrfs_skip_unused_block_group(block_group);
1291 			spin_unlock(&block_group->lock);
1292 			up_write(&space_info->groups_sem);
1293 			goto next;
1294 		}
1295 		spin_unlock(&block_group->lock);
1296 
1297 		/* We don't want to force the issue, only flip if it's ok. */
1298 		ret = inc_block_group_ro(block_group, 0);
1299 		up_write(&space_info->groups_sem);
1300 		if (ret < 0) {
1301 			ret = 0;
1302 			goto next;
1303 		}
1304 
1305 		/*
1306 		 * Want to do this before we do anything else so we can recover
1307 		 * properly if we fail to join the transaction.
1308 		 */
1309 		trans = btrfs_start_trans_remove_block_group(fs_info,
1310 						     block_group->key.objectid);
1311 		if (IS_ERR(trans)) {
1312 			btrfs_dec_block_group_ro(block_group);
1313 			ret = PTR_ERR(trans);
1314 			goto next;
1315 		}
1316 
1317 		/*
1318 		 * We could have pending pinned extents for this block group,
1319 		 * just delete them, we don't care about them anymore.
1320 		 */
1321 		start = block_group->key.objectid;
1322 		end = start + block_group->key.offset - 1;
1323 		/*
1324 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1325 		 * btrfs_finish_extent_commit(). If we are at transaction N,
1326 		 * another task might be running finish_extent_commit() for the
1327 		 * previous transaction N - 1, and have seen a range belonging
1328 		 * to the block group in freed_extents[] before we were able to
1329 		 * clear the whole block group range from freed_extents[]. This
1330 		 * means that task can lookup for the block group after we
1331 		 * unpinned it from freed_extents[] and removed it, leading to
1332 		 * a BUG_ON() at btrfs_unpin_extent_range().
1333 		 */
1334 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
1335 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
1336 				  EXTENT_DIRTY);
1337 		if (ret) {
1338 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1339 			btrfs_dec_block_group_ro(block_group);
1340 			goto end_trans;
1341 		}
1342 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
1343 				  EXTENT_DIRTY);
1344 		if (ret) {
1345 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1346 			btrfs_dec_block_group_ro(block_group);
1347 			goto end_trans;
1348 		}
1349 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1350 
1351 		/* Reset pinned so btrfs_put_block_group doesn't complain */
1352 		spin_lock(&space_info->lock);
1353 		spin_lock(&block_group->lock);
1354 
1355 		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1356 						     -block_group->pinned);
1357 		space_info->bytes_readonly += block_group->pinned;
1358 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
1359 				   -block_group->pinned,
1360 				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
1361 		block_group->pinned = 0;
1362 
1363 		spin_unlock(&block_group->lock);
1364 		spin_unlock(&space_info->lock);
1365 
1366 		/* DISCARD can flip during remount */
1367 		trimming = btrfs_test_opt(fs_info, DISCARD);
1368 
1369 		/* Implicit trim during transaction commit. */
1370 		if (trimming)
1371 			btrfs_get_block_group_trimming(block_group);
1372 
1373 		/*
1374 		 * Btrfs_remove_chunk will abort the transaction if things go
1375 		 * horribly wrong.
1376 		 */
1377 		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
1378 
1379 		if (ret) {
1380 			if (trimming)
1381 				btrfs_put_block_group_trimming(block_group);
1382 			goto end_trans;
1383 		}
1384 
1385 		/*
1386 		 * If we're not mounted with -odiscard, we can just forget
1387 		 * about this block group. Otherwise we'll need to wait
1388 		 * until transaction commit to do the actual discard.
1389 		 */
1390 		if (trimming) {
1391 			spin_lock(&fs_info->unused_bgs_lock);
1392 			/*
1393 			 * A concurrent scrub might have added us to the list
1394 			 * fs_info->unused_bgs, so use a list_move operation
1395 			 * to add the block group to the deleted_bgs list.
1396 			 */
1397 			list_move(&block_group->bg_list,
1398 				  &trans->transaction->deleted_bgs);
1399 			spin_unlock(&fs_info->unused_bgs_lock);
1400 			btrfs_get_block_group(block_group);
1401 		}
1402 end_trans:
1403 		btrfs_end_transaction(trans);
1404 next:
1405 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1406 		btrfs_put_block_group(block_group);
1407 		spin_lock(&fs_info->unused_bgs_lock);
1408 	}
1409 	spin_unlock(&fs_info->unused_bgs_lock);
1410 }
1411 
btrfs_mark_bg_unused(struct btrfs_block_group_cache * bg)1412 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
1413 {
1414 	struct btrfs_fs_info *fs_info = bg->fs_info;
1415 
1416 	spin_lock(&fs_info->unused_bgs_lock);
1417 	if (list_empty(&bg->bg_list)) {
1418 		btrfs_get_block_group(bg);
1419 		trace_btrfs_add_unused_block_group(bg);
1420 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1421 	}
1422 	spin_unlock(&fs_info->unused_bgs_lock);
1423 }
1424 
find_first_block_group(struct btrfs_fs_info * fs_info,struct btrfs_path * path,struct btrfs_key * key)1425 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1426 				  struct btrfs_path *path,
1427 				  struct btrfs_key *key)
1428 {
1429 	struct btrfs_root *root = fs_info->extent_root;
1430 	int ret = 0;
1431 	struct btrfs_key found_key;
1432 	struct extent_buffer *leaf;
1433 	struct btrfs_block_group_item bg;
1434 	u64 flags;
1435 	int slot;
1436 
1437 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1438 	if (ret < 0)
1439 		goto out;
1440 
1441 	while (1) {
1442 		slot = path->slots[0];
1443 		leaf = path->nodes[0];
1444 		if (slot >= btrfs_header_nritems(leaf)) {
1445 			ret = btrfs_next_leaf(root, path);
1446 			if (ret == 0)
1447 				continue;
1448 			if (ret < 0)
1449 				goto out;
1450 			break;
1451 		}
1452 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
1453 
1454 		if (found_key.objectid >= key->objectid &&
1455 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1456 			struct extent_map_tree *em_tree;
1457 			struct extent_map *em;
1458 
1459 			em_tree = &root->fs_info->mapping_tree;
1460 			read_lock(&em_tree->lock);
1461 			em = lookup_extent_mapping(em_tree, found_key.objectid,
1462 						   found_key.offset);
1463 			read_unlock(&em_tree->lock);
1464 			if (!em) {
1465 				btrfs_err(fs_info,
1466 			"logical %llu len %llu found bg but no related chunk",
1467 					  found_key.objectid, found_key.offset);
1468 				ret = -ENOENT;
1469 			} else if (em->start != found_key.objectid ||
1470 				   em->len != found_key.offset) {
1471 				btrfs_err(fs_info,
1472 		"block group %llu len %llu mismatch with chunk %llu len %llu",
1473 					  found_key.objectid, found_key.offset,
1474 					  em->start, em->len);
1475 				ret = -EUCLEAN;
1476 			} else {
1477 				read_extent_buffer(leaf, &bg,
1478 					btrfs_item_ptr_offset(leaf, slot),
1479 					sizeof(bg));
1480 				flags = btrfs_block_group_flags(&bg) &
1481 					BTRFS_BLOCK_GROUP_TYPE_MASK;
1482 
1483 				if (flags != (em->map_lookup->type &
1484 					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1485 					btrfs_err(fs_info,
1486 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1487 						found_key.objectid,
1488 						found_key.offset, flags,
1489 						(BTRFS_BLOCK_GROUP_TYPE_MASK &
1490 						 em->map_lookup->type));
1491 					ret = -EUCLEAN;
1492 				} else {
1493 					ret = 0;
1494 				}
1495 			}
1496 			free_extent_map(em);
1497 			goto out;
1498 		}
1499 		path->slots[0]++;
1500 	}
1501 out:
1502 	return ret;
1503 }
1504 
/*
 * Record the profile bits of @flags, in extended format, as available for
 * each block group type (data, metadata, system) that is set in @flags.
 */
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	const u64 profile = BTRFS_EXTENDED_PROFILE_MASK &
			    chunk_to_extended(flags);
	const bool is_data = flags & BTRFS_BLOCK_GROUP_DATA;
	const bool is_meta = flags & BTRFS_BLOCK_GROUP_METADATA;
	const bool is_sys = flags & BTRFS_BLOCK_GROUP_SYSTEM;

	/* All three masks are updated under the profiles seqlock. */
	write_seqlock(&fs_info->profiles_lock);
	if (is_data)
		fs_info->avail_data_alloc_bits |= profile;
	if (is_meta)
		fs_info->avail_metadata_alloc_bits |= profile;
	if (is_sys)
		fs_info->avail_system_alloc_bits |= profile;
	write_sequnlock(&fs_info->profiles_lock);
}
1519 
exclude_super_stripes(struct btrfs_block_group_cache * cache)1520 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
1521 {
1522 	struct btrfs_fs_info *fs_info = cache->fs_info;
1523 	u64 bytenr;
1524 	u64 *logical;
1525 	int stripe_len;
1526 	int i, nr, ret;
1527 
1528 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
1529 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
1530 		cache->bytes_super += stripe_len;
1531 		ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid,
1532 						stripe_len);
1533 		if (ret)
1534 			return ret;
1535 	}
1536 
1537 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1538 		bytenr = btrfs_sb_offset(i);
1539 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
1540 				       bytenr, &logical, &nr, &stripe_len);
1541 		if (ret)
1542 			return ret;
1543 
1544 		while (nr--) {
1545 			u64 start, len;
1546 
1547 			if (logical[nr] > cache->key.objectid +
1548 			    cache->key.offset)
1549 				continue;
1550 
1551 			if (logical[nr] + stripe_len <= cache->key.objectid)
1552 				continue;
1553 
1554 			start = logical[nr];
1555 			if (start < cache->key.objectid) {
1556 				start = cache->key.objectid;
1557 				len = (logical[nr] + stripe_len) - start;
1558 			} else {
1559 				len = min_t(u64, stripe_len,
1560 					    cache->key.objectid +
1561 					    cache->key.offset - start);
1562 			}
1563 
1564 			cache->bytes_super += len;
1565 			ret = btrfs_add_excluded_extent(fs_info, start, len);
1566 			if (ret) {
1567 				kfree(logical);
1568 				return ret;
1569 			}
1570 		}
1571 
1572 		kfree(logical);
1573 	}
1574 	return 0;
1575 }
1576 
link_block_group(struct btrfs_block_group_cache * cache)1577 static void link_block_group(struct btrfs_block_group_cache *cache)
1578 {
1579 	struct btrfs_space_info *space_info = cache->space_info;
1580 	int index = btrfs_bg_flags_to_raid_index(cache->flags);
1581 	bool first = false;
1582 
1583 	down_write(&space_info->groups_sem);
1584 	if (list_empty(&space_info->block_groups[index]))
1585 		first = true;
1586 	list_add_tail(&cache->list, &space_info->block_groups[index]);
1587 	up_write(&space_info->groups_sem);
1588 
1589 	if (first)
1590 		btrfs_sysfs_add_block_group_type(cache);
1591 }
1592 
/*
 * Allocate and initialize the in-memory representation of the block group
 * that covers @size bytes starting at logical address @start.
 *
 * Returns the new block group cache with a reference count of 1, or NULL if
 * one of the allocations failed.
 */
static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
		struct btrfs_fs_info *fs_info, u64 start, u64 size)
{
	struct btrfs_block_group_cache *bg;

	bg = kzalloc(sizeof(*bg), GFP_NOFS);
	if (!bg)
		return NULL;

	bg->free_space_ctl = kzalloc(sizeof(*bg->free_space_ctl), GFP_NOFS);
	if (!bg->free_space_ctl)
		goto free_bg;

	/* Identity of the block group. */
	bg->key.objectid = start;
	bg->key.offset = size;
	bg->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;

	bg->fs_info = fs_info;
	bg->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
	set_free_space_tree_thresholds(bg);

	/* Counters and locks. */
	atomic_set(&bg->count, 1);
	atomic_set(&bg->trimming, 0);
	spin_lock_init(&bg->lock);
	init_rwsem(&bg->data_rwsem);
	mutex_init(&bg->free_space_lock);

	/* None of the list entries is linked anywhere yet. */
	INIT_LIST_HEAD(&bg->list);
	INIT_LIST_HEAD(&bg->cluster_list);
	INIT_LIST_HEAD(&bg->bg_list);
	INIT_LIST_HEAD(&bg->ro_list);
	INIT_LIST_HEAD(&bg->dirty_list);
	INIT_LIST_HEAD(&bg->io_list);

	btrfs_init_free_space_ctl(bg);
	btrfs_init_full_stripe_locks_tree(&bg->full_stripe_locks_root);

	return bg;

free_bg:
	kfree(bg);
	return NULL;
}
1633 
1634 /*
1635  * Iterate all chunks and verify that each of them has the corresponding block
1636  * group
1637  */
check_chunk_block_group_mappings(struct btrfs_fs_info * fs_info)1638 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1639 {
1640 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1641 	struct extent_map *em;
1642 	struct btrfs_block_group_cache *bg;
1643 	u64 start = 0;
1644 	int ret = 0;
1645 
1646 	while (1) {
1647 		read_lock(&map_tree->lock);
1648 		/*
1649 		 * lookup_extent_mapping will return the first extent map
1650 		 * intersecting the range, so setting @len to 1 is enough to
1651 		 * get the first chunk.
1652 		 */
1653 		em = lookup_extent_mapping(map_tree, start, 1);
1654 		read_unlock(&map_tree->lock);
1655 		if (!em)
1656 			break;
1657 
1658 		bg = btrfs_lookup_block_group(fs_info, em->start);
1659 		if (!bg) {
1660 			btrfs_err(fs_info,
1661 	"chunk start=%llu len=%llu doesn't have corresponding block group",
1662 				     em->start, em->len);
1663 			ret = -EUCLEAN;
1664 			free_extent_map(em);
1665 			break;
1666 		}
1667 		if (bg->key.objectid != em->start ||
1668 		    bg->key.offset != em->len ||
1669 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1670 		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1671 			btrfs_err(fs_info,
1672 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1673 				em->start, em->len,
1674 				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1675 				bg->key.objectid, bg->key.offset,
1676 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1677 			ret = -EUCLEAN;
1678 			free_extent_map(em);
1679 			btrfs_put_block_group(bg);
1680 			break;
1681 		}
1682 		start = em->start + em->len;
1683 		free_extent_map(em);
1684 		btrfs_put_block_group(bg);
1685 	}
1686 	return ret;
1687 }
1688 
btrfs_read_block_groups(struct btrfs_fs_info * info)1689 int btrfs_read_block_groups(struct btrfs_fs_info *info)
1690 {
1691 	struct btrfs_path *path;
1692 	int ret;
1693 	struct btrfs_block_group_cache *cache;
1694 	struct btrfs_space_info *space_info;
1695 	struct btrfs_key key;
1696 	struct btrfs_key found_key;
1697 	struct extent_buffer *leaf;
1698 	int need_clear = 0;
1699 	u64 cache_gen;
1700 	u64 feature;
1701 	int mixed;
1702 
1703 	feature = btrfs_super_incompat_flags(info->super_copy);
1704 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
1705 
1706 	key.objectid = 0;
1707 	key.offset = 0;
1708 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1709 	path = btrfs_alloc_path();
1710 	if (!path)
1711 		return -ENOMEM;
1712 	path->reada = READA_FORWARD;
1713 
1714 	cache_gen = btrfs_super_cache_generation(info->super_copy);
1715 	if (btrfs_test_opt(info, SPACE_CACHE) &&
1716 	    btrfs_super_generation(info->super_copy) != cache_gen)
1717 		need_clear = 1;
1718 	if (btrfs_test_opt(info, CLEAR_CACHE))
1719 		need_clear = 1;
1720 
1721 	while (1) {
1722 		ret = find_first_block_group(info, path, &key);
1723 		if (ret > 0)
1724 			break;
1725 		if (ret != 0)
1726 			goto error;
1727 
1728 		leaf = path->nodes[0];
1729 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1730 
1731 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
1732 						       found_key.offset);
1733 		if (!cache) {
1734 			ret = -ENOMEM;
1735 			goto error;
1736 		}
1737 
1738 		if (need_clear) {
1739 			/*
1740 			 * When we mount with old space cache, we need to
1741 			 * set BTRFS_DC_CLEAR and set dirty flag.
1742 			 *
1743 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1744 			 *    truncate the old free space cache inode and
1745 			 *    setup a new one.
1746 			 * b) Setting 'dirty flag' makes sure that we flush
1747 			 *    the new space cache info onto disk.
1748 			 */
1749 			if (btrfs_test_opt(info, SPACE_CACHE))
1750 				cache->disk_cache_state = BTRFS_DC_CLEAR;
1751 		}
1752 
1753 		read_extent_buffer(leaf, &cache->item,
1754 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
1755 				   sizeof(cache->item));
1756 		cache->flags = btrfs_block_group_flags(&cache->item);
1757 		if (!mixed &&
1758 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1759 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1760 			btrfs_err(info,
1761 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1762 				  cache->key.objectid);
1763 			btrfs_put_block_group(cache);
1764 			ret = -EINVAL;
1765 			goto error;
1766 		}
1767 
1768 		key.objectid = found_key.objectid + found_key.offset;
1769 		btrfs_release_path(path);
1770 
1771 		/*
1772 		 * We need to exclude the super stripes now so that the space
1773 		 * info has super bytes accounted for, otherwise we'll think
1774 		 * we have more space than we actually do.
1775 		 */
1776 		ret = exclude_super_stripes(cache);
1777 		if (ret) {
1778 			/*
1779 			 * We may have excluded something, so call this just in
1780 			 * case.
1781 			 */
1782 			btrfs_free_excluded_extents(cache);
1783 			btrfs_put_block_group(cache);
1784 			goto error;
1785 		}
1786 
1787 		/*
1788 		 * Check for two cases, either we are full, and therefore
1789 		 * don't need to bother with the caching work since we won't
1790 		 * find any space, or we are empty, and we can just add all
1791 		 * the space in and be done with it.  This saves us _a_lot_ of
1792 		 * time, particularly in the full case.
1793 		 */
1794 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
1795 			cache->last_byte_to_unpin = (u64)-1;
1796 			cache->cached = BTRFS_CACHE_FINISHED;
1797 			btrfs_free_excluded_extents(cache);
1798 		} else if (btrfs_block_group_used(&cache->item) == 0) {
1799 			cache->last_byte_to_unpin = (u64)-1;
1800 			cache->cached = BTRFS_CACHE_FINISHED;
1801 			add_new_free_space(cache, found_key.objectid,
1802 					   found_key.objectid +
1803 					   found_key.offset);
1804 			btrfs_free_excluded_extents(cache);
1805 		}
1806 
1807 		ret = btrfs_add_block_group_cache(info, cache);
1808 		if (ret) {
1809 			btrfs_remove_free_space_cache(cache);
1810 			btrfs_put_block_group(cache);
1811 			goto error;
1812 		}
1813 
1814 		trace_btrfs_add_block_group(info, cache, 0);
1815 		btrfs_update_space_info(info, cache->flags, found_key.offset,
1816 					btrfs_block_group_used(&cache->item),
1817 					cache->bytes_super, &space_info);
1818 
1819 		cache->space_info = space_info;
1820 
1821 		link_block_group(cache);
1822 
1823 		set_avail_alloc_bits(info, cache->flags);
1824 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
1825 			inc_block_group_ro(cache, 1);
1826 		} else if (btrfs_block_group_used(&cache->item) == 0) {
1827 			ASSERT(list_empty(&cache->bg_list));
1828 			btrfs_mark_bg_unused(cache);
1829 		}
1830 	}
1831 
1832 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
1833 		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
1834 		      (BTRFS_BLOCK_GROUP_RAID10 |
1835 		       BTRFS_BLOCK_GROUP_RAID1_MASK |
1836 		       BTRFS_BLOCK_GROUP_RAID56_MASK |
1837 		       BTRFS_BLOCK_GROUP_DUP)))
1838 			continue;
1839 		/*
1840 		 * Avoid allocating from un-mirrored block group if there are
1841 		 * mirrored block groups.
1842 		 */
1843 		list_for_each_entry(cache,
1844 				&space_info->block_groups[BTRFS_RAID_RAID0],
1845 				list)
1846 			inc_block_group_ro(cache, 1);
1847 		list_for_each_entry(cache,
1848 				&space_info->block_groups[BTRFS_RAID_SINGLE],
1849 				list)
1850 			inc_block_group_ro(cache, 1);
1851 	}
1852 
1853 	btrfs_init_global_block_rsv(info);
1854 	ret = check_chunk_block_group_mappings(info);
1855 error:
1856 	btrfs_free_path(path);
1857 	return ret;
1858 }
1859 
btrfs_create_pending_block_groups(struct btrfs_trans_handle * trans)1860 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
1861 {
1862 	struct btrfs_fs_info *fs_info = trans->fs_info;
1863 	struct btrfs_block_group_cache *block_group;
1864 	struct btrfs_root *extent_root = fs_info->extent_root;
1865 	struct btrfs_block_group_item item;
1866 	struct btrfs_key key;
1867 	int ret = 0;
1868 
1869 	if (!trans->can_flush_pending_bgs)
1870 		return;
1871 
1872 	while (!list_empty(&trans->new_bgs)) {
1873 		block_group = list_first_entry(&trans->new_bgs,
1874 					       struct btrfs_block_group_cache,
1875 					       bg_list);
1876 		if (ret)
1877 			goto next;
1878 
1879 		spin_lock(&block_group->lock);
1880 		memcpy(&item, &block_group->item, sizeof(item));
1881 		memcpy(&key, &block_group->key, sizeof(key));
1882 		spin_unlock(&block_group->lock);
1883 
1884 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
1885 					sizeof(item));
1886 		if (ret)
1887 			btrfs_abort_transaction(trans, ret);
1888 		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
1889 		if (ret)
1890 			btrfs_abort_transaction(trans, ret);
1891 		add_block_group_free_space(trans, block_group);
1892 		/* Already aborted the transaction if it failed. */
1893 next:
1894 		btrfs_delayed_refs_rsv_release(fs_info, 1);
1895 		list_del_init(&block_group->bg_list);
1896 	}
1897 	btrfs_trans_release_chunk_metadata(trans);
1898 }
1899 
/*
 * Create the in-memory block group for a chunk and queue it on the
 * transaction handle's new_bgs list.
 *
 * @trans:        transaction handle the block group gets attached to
 * @bytes_used:   value stored as the used bytes of the block group item
 * @type:         block group flags
 * @chunk_offset: start of the range covered by the block group
 * @size:         length of the range covered by the block group
 *
 * Returns 0 on success, -ENOMEM when the block group cannot be allocated, or
 * the error returned by exclude_super_stripes() or
 * btrfs_add_block_group_cache().
 */
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
			   u64 type, u64 chunk_offset, u64 size)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *bg;
	int err;

	btrfs_set_log_full_commit(trans);

	bg = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
	if (!bg)
		return -ENOMEM;

	/* Initialize the block group item. */
	btrfs_set_block_group_used(&bg->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&bg->item,
					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_block_group_flags(&bg->item, type);

	/* Initialize the in-memory state. */
	bg->flags = type;
	bg->last_byte_to_unpin = (u64)-1;
	bg->cached = BTRFS_CACHE_FINISHED;
	bg->needs_free_space = 1;

	err = exclude_super_stripes(bg);
	if (err) {
		/* Something may have been excluded already, undo it. */
		btrfs_free_excluded_extents(bg);
		btrfs_put_block_group(bg);
		return err;
	}

	add_new_free_space(bg, chunk_offset, chunk_offset + size);
	btrfs_free_excluded_extents(bg);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(bg)) {
		/* Account half of the unused bytes as used. */
		bytes_used += (size - bytes_used) >> 1;
		fragment_free_space(bg);
	}
#endif

	/*
	 * The space_info must be assigned before the block group is inserted
	 * into the rbtree, so that it is never visible there without it.
	 */
	bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
	ASSERT(bg->space_info);

	err = btrfs_add_block_group_cache(fs_info, bg);
	if (err) {
		btrfs_remove_free_space_cache(bg);
		btrfs_put_block_group(bg);
		return err;
	}

	/*
	 * The block group is in the rbtree with its ->space_info set, the
	 * counters of the space info can be updated now.
	 */
	trace_btrfs_add_block_group(fs_info, bg, 1);
	btrfs_update_space_info(fs_info, bg->flags, size, bytes_used,
				bg->bytes_super, &bg->space_info);
	btrfs_update_global_block_rsv(fs_info);

	link_block_group(bg);

	/* Queue the block group so that its item gets created later. */
	list_add_tail(&bg->bg_list, &trans->new_bgs);
	trans->delayed_ref_updates++;
	btrfs_update_delayed_refs_rsv(trans);

	set_avail_alloc_bits(fs_info, type);
	return 0;
}
1975 
update_block_group_flags(struct btrfs_fs_info * fs_info,u64 flags)1976 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
1977 {
1978 	u64 num_devices;
1979 	u64 stripped;
1980 
1981 	/*
1982 	 * if restripe for this chunk_type is on pick target profile and
1983 	 * return, otherwise do the usual balance
1984 	 */
1985 	stripped = get_restripe_target(fs_info, flags);
1986 	if (stripped)
1987 		return extended_to_chunk(stripped);
1988 
1989 	num_devices = fs_info->fs_devices->rw_devices;
1990 
1991 	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
1992 		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
1993 
1994 	if (num_devices == 1) {
1995 		stripped |= BTRFS_BLOCK_GROUP_DUP;
1996 		stripped = flags & ~stripped;
1997 
1998 		/* turn raid0 into single device chunks */
1999 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
2000 			return stripped;
2001 
2002 		/* turn mirroring into duplication */
2003 		if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
2004 			     BTRFS_BLOCK_GROUP_RAID10))
2005 			return stripped | BTRFS_BLOCK_GROUP_DUP;
2006 	} else {
2007 		/* they already had raid on here, just return */
2008 		if (flags & stripped)
2009 			return flags;
2010 
2011 		stripped |= BTRFS_BLOCK_GROUP_DUP;
2012 		stripped = flags & ~stripped;
2013 
2014 		/* switch duplicated blocks with raid1 */
2015 		if (flags & BTRFS_BLOCK_GROUP_DUP)
2016 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
2017 
2018 		/* this is drive concat, leave it alone */
2019 	}
2020 
2021 	return flags;
2022 }
2023 
btrfs_inc_block_group_ro(struct btrfs_block_group_cache * cache)2024 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
2025 
2026 {
2027 	struct btrfs_fs_info *fs_info = cache->fs_info;
2028 	struct btrfs_trans_handle *trans;
2029 	u64 alloc_flags;
2030 	int ret;
2031 
2032 again:
2033 	trans = btrfs_join_transaction(fs_info->extent_root);
2034 	if (IS_ERR(trans))
2035 		return PTR_ERR(trans);
2036 
2037 	/*
2038 	 * we're not allowed to set block groups readonly after the dirty
2039 	 * block groups cache has started writing.  If it already started,
2040 	 * back off and let this transaction commit
2041 	 */
2042 	mutex_lock(&fs_info->ro_block_group_mutex);
2043 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2044 		u64 transid = trans->transid;
2045 
2046 		mutex_unlock(&fs_info->ro_block_group_mutex);
2047 		btrfs_end_transaction(trans);
2048 
2049 		ret = btrfs_wait_for_commit(fs_info, transid);
2050 		if (ret)
2051 			return ret;
2052 		goto again;
2053 	}
2054 
2055 	/*
2056 	 * if we are changing raid levels, try to allocate a corresponding
2057 	 * block group with the new raid level.
2058 	 */
2059 	alloc_flags = update_block_group_flags(fs_info, cache->flags);
2060 	if (alloc_flags != cache->flags) {
2061 		ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2062 		/*
2063 		 * ENOSPC is allowed here, we may have enough space
2064 		 * already allocated at the new raid level to
2065 		 * carry on
2066 		 */
2067 		if (ret == -ENOSPC)
2068 			ret = 0;
2069 		if (ret < 0)
2070 			goto out;
2071 	}
2072 
2073 	ret = inc_block_group_ro(cache, 0);
2074 	if (!ret)
2075 		goto out;
2076 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2077 	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2078 	if (ret < 0)
2079 		goto out;
2080 	ret = inc_block_group_ro(cache, 0);
2081 out:
2082 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2083 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
2084 		mutex_lock(&fs_info->chunk_mutex);
2085 		check_system_chunk(trans, alloc_flags);
2086 		mutex_unlock(&fs_info->chunk_mutex);
2087 	}
2088 	mutex_unlock(&fs_info->ro_block_group_mutex);
2089 
2090 	btrfs_end_transaction(trans);
2091 	return ret;
2092 }
2093 
btrfs_dec_block_group_ro(struct btrfs_block_group_cache * cache)2094 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
2095 {
2096 	struct btrfs_space_info *sinfo = cache->space_info;
2097 	u64 num_bytes;
2098 
2099 	BUG_ON(!cache->ro);
2100 
2101 	spin_lock(&sinfo->lock);
2102 	spin_lock(&cache->lock);
2103 	if (!--cache->ro) {
2104 		num_bytes = cache->key.offset - cache->reserved -
2105 			    cache->pinned - cache->bytes_super -
2106 			    btrfs_block_group_used(&cache->item);
2107 		sinfo->bytes_readonly -= num_bytes;
2108 		list_del_init(&cache->ro_list);
2109 	}
2110 	spin_unlock(&cache->lock);
2111 	spin_unlock(&sinfo->lock);
2112 }
2113 
write_one_cache_group(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_block_group_cache * cache)2114 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2115 				 struct btrfs_path *path,
2116 				 struct btrfs_block_group_cache *cache)
2117 {
2118 	struct btrfs_fs_info *fs_info = trans->fs_info;
2119 	int ret;
2120 	struct btrfs_root *extent_root = fs_info->extent_root;
2121 	unsigned long bi;
2122 	struct extent_buffer *leaf;
2123 
2124 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2125 	if (ret) {
2126 		if (ret > 0)
2127 			ret = -ENOENT;
2128 		goto fail;
2129 	}
2130 
2131 	leaf = path->nodes[0];
2132 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2133 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2134 	btrfs_mark_buffer_dirty(leaf);
2135 fail:
2136 	btrfs_release_path(path);
2137 	return ret;
2138 
2139 }
2140 
cache_save_setup(struct btrfs_block_group_cache * block_group,struct btrfs_trans_handle * trans,struct btrfs_path * path)2141 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2142 			    struct btrfs_trans_handle *trans,
2143 			    struct btrfs_path *path)
2144 {
2145 	struct btrfs_fs_info *fs_info = block_group->fs_info;
2146 	struct btrfs_root *root = fs_info->tree_root;
2147 	struct inode *inode = NULL;
2148 	struct extent_changeset *data_reserved = NULL;
2149 	u64 alloc_hint = 0;
2150 	int dcs = BTRFS_DC_ERROR;
2151 	u64 num_pages = 0;
2152 	int retries = 0;
2153 	int ret = 0;
2154 
2155 	/*
2156 	 * If this block group is smaller than 100 megs don't bother caching the
2157 	 * block group.
2158 	 */
2159 	if (block_group->key.offset < (100 * SZ_1M)) {
2160 		spin_lock(&block_group->lock);
2161 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2162 		spin_unlock(&block_group->lock);
2163 		return 0;
2164 	}
2165 
2166 	if (trans->aborted)
2167 		return 0;
2168 again:
2169 	inode = lookup_free_space_inode(block_group, path);
2170 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2171 		ret = PTR_ERR(inode);
2172 		btrfs_release_path(path);
2173 		goto out;
2174 	}
2175 
2176 	if (IS_ERR(inode)) {
2177 		BUG_ON(retries);
2178 		retries++;
2179 
2180 		if (block_group->ro)
2181 			goto out_free;
2182 
2183 		ret = create_free_space_inode(trans, block_group, path);
2184 		if (ret)
2185 			goto out_free;
2186 		goto again;
2187 	}
2188 
2189 	/*
2190 	 * We want to set the generation to 0, that way if anything goes wrong
2191 	 * from here on out we know not to trust this cache when we load up next
2192 	 * time.
2193 	 */
2194 	BTRFS_I(inode)->generation = 0;
2195 	ret = btrfs_update_inode(trans, root, inode);
2196 	if (ret) {
2197 		/*
2198 		 * So theoretically we could recover from this, simply set the
2199 		 * super cache generation to 0 so we know to invalidate the
2200 		 * cache, but then we'd have to keep track of the block groups
2201 		 * that fail this way so we know we _have_ to reset this cache
2202 		 * before the next commit or risk reading stale cache.  So to
2203 		 * limit our exposure to horrible edge cases lets just abort the
2204 		 * transaction, this only happens in really bad situations
2205 		 * anyway.
2206 		 */
2207 		btrfs_abort_transaction(trans, ret);
2208 		goto out_put;
2209 	}
2210 	WARN_ON(ret);
2211 
2212 	/* We've already setup this transaction, go ahead and exit */
2213 	if (block_group->cache_generation == trans->transid &&
2214 	    i_size_read(inode)) {
2215 		dcs = BTRFS_DC_SETUP;
2216 		goto out_put;
2217 	}
2218 
2219 	if (i_size_read(inode) > 0) {
2220 		ret = btrfs_check_trunc_cache_free_space(fs_info,
2221 					&fs_info->global_block_rsv);
2222 		if (ret)
2223 			goto out_put;
2224 
2225 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2226 		if (ret)
2227 			goto out_put;
2228 	}
2229 
2230 	spin_lock(&block_group->lock);
2231 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
2232 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2233 		/*
2234 		 * don't bother trying to write stuff out _if_
2235 		 * a) we're not cached,
2236 		 * b) we're with nospace_cache mount option,
2237 		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
2238 		 */
2239 		dcs = BTRFS_DC_WRITTEN;
2240 		spin_unlock(&block_group->lock);
2241 		goto out_put;
2242 	}
2243 	spin_unlock(&block_group->lock);
2244 
2245 	/*
2246 	 * We hit an ENOSPC when setting up the cache in this transaction, just
2247 	 * skip doing the setup, we've already cleared the cache so we're safe.
2248 	 */
2249 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2250 		ret = -ENOSPC;
2251 		goto out_put;
2252 	}
2253 
2254 	/*
2255 	 * Try to preallocate enough space based on how big the block group is.
2256 	 * Keep in mind this has to include any pinned space which could end up
2257 	 * taking up quite a bit since it's not folded into the other space
2258 	 * cache.
2259 	 */
2260 	num_pages = div_u64(block_group->key.offset, SZ_256M);
2261 	if (!num_pages)
2262 		num_pages = 1;
2263 
2264 	num_pages *= 16;
2265 	num_pages *= PAGE_SIZE;
2266 
2267 	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
2268 	if (ret)
2269 		goto out_put;
2270 
2271 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2272 					      num_pages, num_pages,
2273 					      &alloc_hint);
2274 	/*
2275 	 * Our cache requires contiguous chunks so that we don't modify a bunch
2276 	 * of metadata or split extents when writing the cache out, which means
2277 	 * we can enospc if we are heavily fragmented in addition to just normal
2278 	 * out of space conditions.  So if we hit this just skip setting up any
2279 	 * other block groups for this transaction, maybe we'll unpin enough
2280 	 * space the next time around.
2281 	 */
2282 	if (!ret)
2283 		dcs = BTRFS_DC_SETUP;
2284 	else if (ret == -ENOSPC)
2285 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2286 
2287 out_put:
2288 	iput(inode);
2289 out_free:
2290 	btrfs_release_path(path);
2291 out:
2292 	spin_lock(&block_group->lock);
2293 	if (!ret && dcs == BTRFS_DC_SETUP)
2294 		block_group->cache_generation = trans->transid;
2295 	block_group->disk_cache_state = dcs;
2296 	spin_unlock(&block_group->lock);
2297 
2298 	extent_changeset_free(data_reserved);
2299 	return ret;
2300 }
2301 
btrfs_setup_space_cache(struct btrfs_trans_handle * trans)2302 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2303 {
2304 	struct btrfs_fs_info *fs_info = trans->fs_info;
2305 	struct btrfs_block_group_cache *cache, *tmp;
2306 	struct btrfs_transaction *cur_trans = trans->transaction;
2307 	struct btrfs_path *path;
2308 
2309 	if (list_empty(&cur_trans->dirty_bgs) ||
2310 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
2311 		return 0;
2312 
2313 	path = btrfs_alloc_path();
2314 	if (!path)
2315 		return -ENOMEM;
2316 
2317 	/* Could add new block groups, use _safe just in case */
2318 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2319 				 dirty_list) {
2320 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2321 			cache_save_setup(cache, trans, path);
2322 	}
2323 
2324 	btrfs_free_path(path);
2325 	return 0;
2326 }
2327 
2328 /*
2329  * Transaction commit does final block group cache writeback during a critical
2330  * section where nothing is allowed to change the FS.  This is required in
2331  * order for the cache to actually match the block group, but can introduce a
2332  * lot of latency into the commit.
2333  *
2334  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2335  * There's a chance we'll have to redo some of it if the block group changes
2336  * again during the commit, but it greatly reduces the commit latency by
2337  * getting rid of the easy block groups while we're still allowing others to
2338  * join the commit.
2339  */
btrfs_start_dirty_block_groups(struct btrfs_trans_handle * trans)2340 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2341 {
2342 	struct btrfs_fs_info *fs_info = trans->fs_info;
2343 	struct btrfs_block_group_cache *cache;
2344 	struct btrfs_transaction *cur_trans = trans->transaction;
2345 	int ret = 0;
2346 	int should_put;
2347 	struct btrfs_path *path = NULL;
2348 	LIST_HEAD(dirty);
2349 	struct list_head *io = &cur_trans->io_bgs;
2350 	int num_started = 0;
2351 	int loops = 0;
2352 
2353 	spin_lock(&cur_trans->dirty_bgs_lock);
2354 	if (list_empty(&cur_trans->dirty_bgs)) {
2355 		spin_unlock(&cur_trans->dirty_bgs_lock);
2356 		return 0;
2357 	}
2358 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
2359 	spin_unlock(&cur_trans->dirty_bgs_lock);
2360 
2361 again:
2362 	/* Make sure all the block groups on our dirty list actually exist */
2363 	btrfs_create_pending_block_groups(trans);
2364 
2365 	if (!path) {
2366 		path = btrfs_alloc_path();
2367 		if (!path)
2368 			return -ENOMEM;
2369 	}
2370 
2371 	/*
2372 	 * cache_write_mutex is here only to save us from balance or automatic
2373 	 * removal of empty block groups deleting this block group while we are
2374 	 * writing out the cache
2375 	 */
2376 	mutex_lock(&trans->transaction->cache_write_mutex);
2377 	while (!list_empty(&dirty)) {
2378 		bool drop_reserve = true;
2379 
2380 		cache = list_first_entry(&dirty,
2381 					 struct btrfs_block_group_cache,
2382 					 dirty_list);
2383 		/*
2384 		 * This can happen if something re-dirties a block group that
2385 		 * is already under IO.  Just wait for it to finish and then do
2386 		 * it all again
2387 		 */
2388 		if (!list_empty(&cache->io_list)) {
2389 			list_del_init(&cache->io_list);
2390 			btrfs_wait_cache_io(trans, cache, path);
2391 			btrfs_put_block_group(cache);
2392 		}
2393 
2394 
2395 		/*
2396 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2397 		 * it should update the cache_state.  Don't delete until after
2398 		 * we wait.
2399 		 *
2400 		 * Since we're not running in the commit critical section
2401 		 * we need the dirty_bgs_lock to protect from update_block_group
2402 		 */
2403 		spin_lock(&cur_trans->dirty_bgs_lock);
2404 		list_del_init(&cache->dirty_list);
2405 		spin_unlock(&cur_trans->dirty_bgs_lock);
2406 
2407 		should_put = 1;
2408 
2409 		cache_save_setup(cache, trans, path);
2410 
2411 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2412 			cache->io_ctl.inode = NULL;
2413 			ret = btrfs_write_out_cache(trans, cache, path);
2414 			if (ret == 0 && cache->io_ctl.inode) {
2415 				num_started++;
2416 				should_put = 0;
2417 
2418 				/*
2419 				 * The cache_write_mutex is protecting the
2420 				 * io_list, also refer to the definition of
2421 				 * btrfs_transaction::io_bgs for more details
2422 				 */
2423 				list_add_tail(&cache->io_list, io);
2424 			} else {
2425 				/*
2426 				 * If we failed to write the cache, the
2427 				 * generation will be bad and life goes on
2428 				 */
2429 				ret = 0;
2430 			}
2431 		}
2432 		if (!ret) {
2433 			ret = write_one_cache_group(trans, path, cache);
2434 			/*
2435 			 * Our block group might still be attached to the list
2436 			 * of new block groups in the transaction handle of some
2437 			 * other task (struct btrfs_trans_handle->new_bgs). This
2438 			 * means its block group item isn't yet in the extent
2439 			 * tree. If this happens ignore the error, as we will
2440 			 * try again later in the critical section of the
2441 			 * transaction commit.
2442 			 */
2443 			if (ret == -ENOENT) {
2444 				ret = 0;
2445 				spin_lock(&cur_trans->dirty_bgs_lock);
2446 				if (list_empty(&cache->dirty_list)) {
2447 					list_add_tail(&cache->dirty_list,
2448 						      &cur_trans->dirty_bgs);
2449 					btrfs_get_block_group(cache);
2450 					drop_reserve = false;
2451 				}
2452 				spin_unlock(&cur_trans->dirty_bgs_lock);
2453 			} else if (ret) {
2454 				btrfs_abort_transaction(trans, ret);
2455 			}
2456 		}
2457 
2458 		/* If it's not on the io list, we need to put the block group */
2459 		if (should_put)
2460 			btrfs_put_block_group(cache);
2461 		if (drop_reserve)
2462 			btrfs_delayed_refs_rsv_release(fs_info, 1);
2463 
2464 		if (ret)
2465 			break;
2466 
2467 		/*
2468 		 * Avoid blocking other tasks for too long. It might even save
2469 		 * us from writing caches for block groups that are going to be
2470 		 * removed.
2471 		 */
2472 		mutex_unlock(&trans->transaction->cache_write_mutex);
2473 		mutex_lock(&trans->transaction->cache_write_mutex);
2474 	}
2475 	mutex_unlock(&trans->transaction->cache_write_mutex);
2476 
2477 	/*
2478 	 * Go through delayed refs for all the stuff we've just kicked off
2479 	 * and then loop back (just once)
2480 	 */
2481 	ret = btrfs_run_delayed_refs(trans, 0);
2482 	if (!ret && loops == 0) {
2483 		loops++;
2484 		spin_lock(&cur_trans->dirty_bgs_lock);
2485 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
2486 		/*
2487 		 * dirty_bgs_lock protects us from concurrent block group
2488 		 * deletes too (not just cache_write_mutex).
2489 		 */
2490 		if (!list_empty(&dirty)) {
2491 			spin_unlock(&cur_trans->dirty_bgs_lock);
2492 			goto again;
2493 		}
2494 		spin_unlock(&cur_trans->dirty_bgs_lock);
2495 	} else if (ret < 0) {
2496 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2497 	}
2498 
2499 	btrfs_free_path(path);
2500 	return ret;
2501 }
2502 
btrfs_write_dirty_block_groups(struct btrfs_trans_handle * trans)2503 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2504 {
2505 	struct btrfs_fs_info *fs_info = trans->fs_info;
2506 	struct btrfs_block_group_cache *cache;
2507 	struct btrfs_transaction *cur_trans = trans->transaction;
2508 	int ret = 0;
2509 	int should_put;
2510 	struct btrfs_path *path;
2511 	struct list_head *io = &cur_trans->io_bgs;
2512 	int num_started = 0;
2513 
2514 	path = btrfs_alloc_path();
2515 	if (!path)
2516 		return -ENOMEM;
2517 
2518 	/*
2519 	 * Even though we are in the critical section of the transaction commit,
2520 	 * we can still have concurrent tasks adding elements to this
2521 	 * transaction's list of dirty block groups. These tasks correspond to
2522 	 * endio free space workers started when writeback finishes for a
2523 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2524 	 * allocate new block groups as a result of COWing nodes of the root
2525 	 * tree when updating the free space inode. The writeback for the space
2526 	 * caches is triggered by an earlier call to
2527 	 * btrfs_start_dirty_block_groups() and iterations of the following
2528 	 * loop.
2529 	 * Also we want to do the cache_save_setup first and then run the
2530 	 * delayed refs to make sure we have the best chance at doing this all
2531 	 * in one shot.
2532 	 */
2533 	spin_lock(&cur_trans->dirty_bgs_lock);
2534 	while (!list_empty(&cur_trans->dirty_bgs)) {
2535 		cache = list_first_entry(&cur_trans->dirty_bgs,
2536 					 struct btrfs_block_group_cache,
2537 					 dirty_list);
2538 
2539 		/*
2540 		 * This can happen if cache_save_setup re-dirties a block group
2541 		 * that is already under IO.  Just wait for it to finish and
2542 		 * then do it all again
2543 		 */
2544 		if (!list_empty(&cache->io_list)) {
2545 			spin_unlock(&cur_trans->dirty_bgs_lock);
2546 			list_del_init(&cache->io_list);
2547 			btrfs_wait_cache_io(trans, cache, path);
2548 			btrfs_put_block_group(cache);
2549 			spin_lock(&cur_trans->dirty_bgs_lock);
2550 		}
2551 
2552 		/*
2553 		 * Don't remove from the dirty list until after we've waited on
2554 		 * any pending IO
2555 		 */
2556 		list_del_init(&cache->dirty_list);
2557 		spin_unlock(&cur_trans->dirty_bgs_lock);
2558 		should_put = 1;
2559 
2560 		cache_save_setup(cache, trans, path);
2561 
2562 		if (!ret)
2563 			ret = btrfs_run_delayed_refs(trans,
2564 						     (unsigned long) -1);
2565 
2566 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2567 			cache->io_ctl.inode = NULL;
2568 			ret = btrfs_write_out_cache(trans, cache, path);
2569 			if (ret == 0 && cache->io_ctl.inode) {
2570 				num_started++;
2571 				should_put = 0;
2572 				list_add_tail(&cache->io_list, io);
2573 			} else {
2574 				/*
2575 				 * If we failed to write the cache, the
2576 				 * generation will be bad and life goes on
2577 				 */
2578 				ret = 0;
2579 			}
2580 		}
2581 		if (!ret) {
2582 			ret = write_one_cache_group(trans, path, cache);
2583 			/*
2584 			 * One of the free space endio workers might have
2585 			 * created a new block group while updating a free space
2586 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
2587 			 * and hasn't released its transaction handle yet, in
2588 			 * which case the new block group is still attached to
2589 			 * its transaction handle and its creation has not
2590 			 * finished yet (no block group item in the extent tree
2591 			 * yet, etc). If this is the case, wait for all free
2592 			 * space endio workers to finish and retry. This is a
2593 			 * a very rare case so no need for a more efficient and
2594 			 * complex approach.
2595 			 */
2596 			if (ret == -ENOENT) {
2597 				wait_event(cur_trans->writer_wait,
2598 				   atomic_read(&cur_trans->num_writers) == 1);
2599 				ret = write_one_cache_group(trans, path, cache);
2600 			}
2601 			if (ret)
2602 				btrfs_abort_transaction(trans, ret);
2603 		}
2604 
2605 		/* If its not on the io list, we need to put the block group */
2606 		if (should_put)
2607 			btrfs_put_block_group(cache);
2608 		btrfs_delayed_refs_rsv_release(fs_info, 1);
2609 		spin_lock(&cur_trans->dirty_bgs_lock);
2610 	}
2611 	spin_unlock(&cur_trans->dirty_bgs_lock);
2612 
2613 	/*
2614 	 * Refer to the definition of io_bgs member for details why it's safe
2615 	 * to use it without any locking
2616 	 */
2617 	while (!list_empty(io)) {
2618 		cache = list_first_entry(io, struct btrfs_block_group_cache,
2619 					 io_list);
2620 		list_del_init(&cache->io_list);
2621 		btrfs_wait_cache_io(trans, cache, path);
2622 		btrfs_put_block_group(cache);
2623 	}
2624 
2625 	btrfs_free_path(path);
2626 	return ret;
2627 }
2628 
/*
 * Account the allocation or the freeing of the range [@bytenr, @bytenr +
 * @num_bytes) in the super block, the block groups covering the range and
 * their space infos.
 *
 * @trans:     transaction handle
 * @bytenr:    start of the range
 * @num_bytes: length of the range
 * @alloc:     non-zero to account an allocation, zero to account a free
 *
 * Every block group that is touched is put on the transaction's list of dirty
 * block groups.  Freed ranges are accounted as pinned.
 *
 * Returns 0 on success or -ENOENT when no block group is found for a part of
 * the range.  The super block accounting has been done at that point already.
 */
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, int alloc)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_block_group_cache *bg = NULL;
	struct btrfs_space_info *sinfo;
	u64 remaining = num_bytes;
	u64 sb_used;
	u64 bg_used;
	u64 offset_in_bg;
	u64 len;
	int factor;
	int ret = 0;

	/* Block accounting for super block */
	spin_lock(&info->delalloc_root_lock);
	sb_used = btrfs_super_bytes_used(info->super_copy);
	if (alloc)
		sb_used += num_bytes;
	else
		sb_used -= num_bytes;
	btrfs_set_super_bytes_used(info->super_copy, sb_used);
	spin_unlock(&info->delalloc_root_lock);

	while (remaining) {
		bg = btrfs_lookup_block_group(info, bytenr);
		if (!bg) {
			ret = -ENOENT;
			break;
		}
		factor = btrfs_bg_type_to_factor(bg->flags);

		/*
		 * When space is removed from a block group whose free space
		 * cache is written out, the cache has to be loaded.  The
		 * unpinning stage needs it to add the space back to the block
		 * group, otherwise space would leak.
		 */
		if (!alloc && !btrfs_block_group_cache_done(bg))
			btrfs_cache_block_group(bg, 1);

		offset_in_bg = bytenr - bg->key.objectid;
		WARN_ON(offset_in_bg > bg->key.offset);

		sinfo = bg->space_info;
		spin_lock(&sinfo->lock);
		spin_lock(&bg->lock);

		if (btrfs_test_opt(info, SPACE_CACHE) &&
		    bg->disk_cache_state < BTRFS_DC_CLEAR)
			bg->disk_cache_state = BTRFS_DC_CLEAR;

		bg_used = btrfs_block_group_used(&bg->item);
		/* Only the part of the range inside this block group. */
		len = min(remaining, bg->key.offset - offset_in_bg);
		if (alloc) {
			bg_used += len;
			btrfs_set_block_group_used(&bg->item, bg_used);
			bg->reserved -= len;
			sinfo->bytes_reserved -= len;
			sinfo->bytes_used += len;
			sinfo->disk_used += len * factor;
		} else {
			bg_used -= len;
			btrfs_set_block_group_used(&bg->item, bg_used);
			bg->pinned += len;
			btrfs_space_info_update_bytes_pinned(info, sinfo, len);
			sinfo->bytes_used -= len;
			sinfo->disk_used -= len * factor;
		}
		spin_unlock(&bg->lock);
		spin_unlock(&sinfo->lock);

		if (!alloc) {
			/* Done without the locks held. */
			percpu_counter_add_batch(&sinfo->total_bytes_pinned,
						 len,
						 BTRFS_TOTAL_BYTES_PINNED_BATCH);
			set_extent_dirty(info->pinned_extents,
					 bytenr, bytenr + len - 1,
					 GFP_NOFS | __GFP_NOFAIL);
		}

		/* The dirty list holds its own reference on the block group. */
		spin_lock(&trans->transaction->dirty_bgs_lock);
		if (list_empty(&bg->dirty_list)) {
			list_add_tail(&bg->dirty_list,
				      &trans->transaction->dirty_bgs);
			trans->delayed_ref_updates++;
			btrfs_get_block_group(bg);
		}
		spin_unlock(&trans->transaction->dirty_bgs_lock);

		/*
		 * No used bytes are left in this block group, queue it for
		 * deletion.  This is done after adding the block group to the
		 * dirty list to avoid races between the cleaner kthread and
		 * the space cache writeout.
		 */
		if (!alloc && bg_used == 0)
			btrfs_mark_bg_unused(bg);

		btrfs_put_block_group(bg);
		remaining -= len;
		bytenr += len;
	}

	/* Modified block groups are accounted for in the delayed_refs_rsv. */
	btrfs_update_delayed_refs_rsv(trans);
	return ret;
}
2735 
2736 /**
2737  * btrfs_add_reserved_bytes - update the block_group and space info counters
2738  * @cache:	The cache we are manipulating
2739  * @ram_bytes:  The number of bytes of file content, and will be same to
2740  *              @num_bytes except for the compress path.
2741  * @num_bytes:	The number of bytes in question
2742  * @delalloc:   The blocks are allocated for the delalloc write
2743  *
2744  * This is called by the allocator when it reserves space. If this is a
2745  * reservation and the block group has become read only we cannot make the
2746  * reservation and return -EAGAIN, otherwise this function always succeeds.
2747  */
btrfs_add_reserved_bytes(struct btrfs_block_group_cache * cache,u64 ram_bytes,u64 num_bytes,int delalloc)2748 int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
2749 			     u64 ram_bytes, u64 num_bytes, int delalloc)
2750 {
2751 	struct btrfs_space_info *space_info = cache->space_info;
2752 	int ret = 0;
2753 
2754 	spin_lock(&space_info->lock);
2755 	spin_lock(&cache->lock);
2756 	if (cache->ro) {
2757 		ret = -EAGAIN;
2758 	} else {
2759 		cache->reserved += num_bytes;
2760 		space_info->bytes_reserved += num_bytes;
2761 		trace_btrfs_space_reservation(cache->fs_info, "space_info",
2762 					      space_info->flags, num_bytes, 1);
2763 		btrfs_space_info_update_bytes_may_use(cache->fs_info,
2764 						      space_info, -ram_bytes);
2765 		if (delalloc)
2766 			cache->delalloc_bytes += num_bytes;
2767 	}
2768 	spin_unlock(&cache->lock);
2769 	spin_unlock(&space_info->lock);
2770 	return ret;
2771 }
2772 
2773 /**
2774  * btrfs_free_reserved_bytes - update the block_group and space info counters
2775  * @cache:      The cache we are manipulating
2776  * @num_bytes:  The number of bytes in question
2777  * @delalloc:   The blocks are allocated for the delalloc write
2778  *
2779  * This is called by somebody who is freeing space that was never actually used
2780  * on disk.  For example if you reserve some space for a new leaf in transaction
2781  * A and before transaction A commits you free that leaf, you call this with
2782  * reserve set to 0 in order to clear the reservation.
2783  */
btrfs_free_reserved_bytes(struct btrfs_block_group_cache * cache,u64 num_bytes,int delalloc)2784 void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
2785 			       u64 num_bytes, int delalloc)
2786 {
2787 	struct btrfs_space_info *space_info = cache->space_info;
2788 
2789 	spin_lock(&space_info->lock);
2790 	spin_lock(&cache->lock);
2791 	if (cache->ro)
2792 		space_info->bytes_readonly += num_bytes;
2793 	cache->reserved -= num_bytes;
2794 	space_info->bytes_reserved -= num_bytes;
2795 	space_info->max_extent_size = 0;
2796 
2797 	if (delalloc)
2798 		cache->delalloc_bytes -= num_bytes;
2799 	spin_unlock(&cache->lock);
2800 	spin_unlock(&space_info->lock);
2801 }
2802 
force_metadata_allocation(struct btrfs_fs_info * info)2803 static void force_metadata_allocation(struct btrfs_fs_info *info)
2804 {
2805 	struct list_head *head = &info->space_info;
2806 	struct btrfs_space_info *found;
2807 
2808 	rcu_read_lock();
2809 	list_for_each_entry_rcu(found, head, list) {
2810 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2811 			found->force_alloc = CHUNK_ALLOC_FORCE;
2812 	}
2813 	rcu_read_unlock();
2814 }
2815 
should_alloc_chunk(struct btrfs_fs_info * fs_info,struct btrfs_space_info * sinfo,int force)2816 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
2817 			      struct btrfs_space_info *sinfo, int force)
2818 {
2819 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
2820 	u64 thresh;
2821 
2822 	if (force == CHUNK_ALLOC_FORCE)
2823 		return 1;
2824 
2825 	/*
2826 	 * in limited mode, we want to have some free space up to
2827 	 * about 1% of the FS size.
2828 	 */
2829 	if (force == CHUNK_ALLOC_LIMITED) {
2830 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
2831 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
2832 
2833 		if (sinfo->total_bytes - bytes_used < thresh)
2834 			return 1;
2835 	}
2836 
2837 	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
2838 		return 0;
2839 	return 1;
2840 }
2841 
/*
 * Unconditionally request a new chunk of the given @type.  The allocation
 * profile is looked up with btrfs_get_alloc_profile() and the request is
 * passed to btrfs_chunk_alloc() with CHUNK_ALLOC_FORCE; its return value is
 * returned unchanged.
 */
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 profile = btrfs_get_alloc_profile(fs_info, type);

	return btrfs_chunk_alloc(trans, profile, CHUNK_ALLOC_FORCE);
}
2848 
2849 /*
2850  * If force is CHUNK_ALLOC_FORCE:
2851  *    - return 1 if it successfully allocates a chunk,
2852  *    - return errors including -ENOSPC otherwise.
2853  * If force is NOT CHUNK_ALLOC_FORCE:
2854  *    - return 0 if it doesn't need to allocate a new chunk,
2855  *    - return 1 if it successfully allocates a chunk,
2856  *    - return errors including -ENOSPC otherwise.
2857  */
btrfs_chunk_alloc(struct btrfs_trans_handle * trans,u64 flags,enum btrfs_chunk_alloc_enum force)2858 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
2859 		      enum btrfs_chunk_alloc_enum force)
2860 {
2861 	struct btrfs_fs_info *fs_info = trans->fs_info;
2862 	struct btrfs_space_info *space_info;
2863 	bool wait_for_alloc = false;
2864 	bool should_alloc = false;
2865 	int ret = 0;
2866 
2867 	/* Don't re-enter if we're already allocating a chunk */
2868 	if (trans->allocating_chunk)
2869 		return -ENOSPC;
2870 
2871 	space_info = btrfs_find_space_info(fs_info, flags);
2872 	ASSERT(space_info);
2873 
2874 	do {
2875 		spin_lock(&space_info->lock);
2876 		if (force < space_info->force_alloc)
2877 			force = space_info->force_alloc;
2878 		should_alloc = should_alloc_chunk(fs_info, space_info, force);
2879 		if (space_info->full) {
2880 			/* No more free physical space */
2881 			if (should_alloc)
2882 				ret = -ENOSPC;
2883 			else
2884 				ret = 0;
2885 			spin_unlock(&space_info->lock);
2886 			return ret;
2887 		} else if (!should_alloc) {
2888 			spin_unlock(&space_info->lock);
2889 			return 0;
2890 		} else if (space_info->chunk_alloc) {
2891 			/*
2892 			 * Someone is already allocating, so we need to block
2893 			 * until this someone is finished and then loop to
2894 			 * recheck if we should continue with our allocation
2895 			 * attempt.
2896 			 */
2897 			wait_for_alloc = true;
2898 			spin_unlock(&space_info->lock);
2899 			mutex_lock(&fs_info->chunk_mutex);
2900 			mutex_unlock(&fs_info->chunk_mutex);
2901 		} else {
2902 			/* Proceed with allocation */
2903 			space_info->chunk_alloc = 1;
2904 			wait_for_alloc = false;
2905 			spin_unlock(&space_info->lock);
2906 		}
2907 
2908 		cond_resched();
2909 	} while (wait_for_alloc);
2910 
2911 	mutex_lock(&fs_info->chunk_mutex);
2912 	trans->allocating_chunk = true;
2913 
2914 	/*
2915 	 * If we have mixed data/metadata chunks we want to make sure we keep
2916 	 * allocating mixed chunks instead of individual chunks.
2917 	 */
2918 	if (btrfs_mixed_space_info(space_info))
2919 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
2920 
2921 	/*
2922 	 * if we're doing a data chunk, go ahead and make sure that
2923 	 * we keep a reasonable number of metadata chunks allocated in the
2924 	 * FS as well.
2925 	 */
2926 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2927 		fs_info->data_chunk_allocations++;
2928 		if (!(fs_info->data_chunk_allocations %
2929 		      fs_info->metadata_ratio))
2930 			force_metadata_allocation(fs_info);
2931 	}
2932 
2933 	/*
2934 	 * Check if we have enough space in SYSTEM chunk because we may need
2935 	 * to update devices.
2936 	 */
2937 	check_system_chunk(trans, flags);
2938 
2939 	ret = btrfs_alloc_chunk(trans, flags);
2940 	trans->allocating_chunk = false;
2941 
2942 	spin_lock(&space_info->lock);
2943 	if (ret < 0) {
2944 		if (ret == -ENOSPC)
2945 			space_info->full = 1;
2946 		else
2947 			goto out;
2948 	} else {
2949 		ret = 1;
2950 		space_info->max_extent_size = 0;
2951 	}
2952 
2953 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
2954 out:
2955 	space_info->chunk_alloc = 0;
2956 	spin_unlock(&space_info->lock);
2957 	mutex_unlock(&fs_info->chunk_mutex);
2958 	/*
2959 	 * When we allocate a new chunk we reserve space in the chunk block
2960 	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
2961 	 * add new nodes/leafs to it if we end up needing to do it when
2962 	 * inserting the chunk item and updating device items as part of the
2963 	 * second phase of chunk allocation, performed by
2964 	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
2965 	 * large number of new block groups to create in our transaction
2966 	 * handle's new_bgs list to avoid exhausting the chunk block reserve
2967 	 * in extreme cases - like having a single transaction create many new
2968 	 * block groups when starting to write out the free space caches of all
2969 	 * the block groups that were made dirty during the lifetime of the
2970 	 * transaction.
2971 	 */
2972 	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
2973 		btrfs_create_pending_block_groups(trans);
2974 
2975 	return ret;
2976 }
2977 
/*
 * Return the devs_max value of the RAID profile selected by @type, or the
 * number of rw devices of the filesystem when the profile's devs_max is 0.
 */
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
	u64 devs = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;

	return devs ? devs : fs_info->fs_devices->rw_devices;
}
2988 
2989 /*
2990  * If @is_allocation is true, reserve space in the system space info necessary
2991  * for allocating a chunk, otherwise if it's false, reserve space necessary for
2992  * removing a chunk.
2993  */
check_system_chunk(struct btrfs_trans_handle * trans,u64 type)2994 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
2995 {
2996 	struct btrfs_fs_info *fs_info = trans->fs_info;
2997 	struct btrfs_space_info *info;
2998 	u64 left;
2999 	u64 thresh;
3000 	int ret = 0;
3001 	u64 num_devs;
3002 
3003 	/*
3004 	 * Needed because we can end up allocating a system chunk and for an
3005 	 * atomic and race free space reservation in the chunk block reserve.
3006 	 */
3007 	lockdep_assert_held(&fs_info->chunk_mutex);
3008 
3009 	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3010 	spin_lock(&info->lock);
3011 	left = info->total_bytes - btrfs_space_info_used(info, true);
3012 	spin_unlock(&info->lock);
3013 
3014 	num_devs = get_profile_num_devs(fs_info, type);
3015 
3016 	/* num_devs device items to update and 1 chunk item to add or remove */
3017 	thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3018 		btrfs_calc_insert_metadata_size(fs_info, 1);
3019 
3020 	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3021 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3022 			   left, thresh, type);
3023 		btrfs_dump_space_info(fs_info, info, 0, 0);
3024 	}
3025 
3026 	if (left < thresh) {
3027 		u64 flags = btrfs_system_alloc_profile(fs_info);
3028 
3029 		/*
3030 		 * Ignore failure to create system chunk. We might end up not
3031 		 * needing it, as we might not need to COW all nodes/leafs from
3032 		 * the paths we visit in the chunk tree (they were already COWed
3033 		 * or created in the current transaction for example).
3034 		 */
3035 		ret = btrfs_alloc_chunk(trans, flags);
3036 	}
3037 
3038 	if (!ret) {
3039 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
3040 					  &fs_info->chunk_block_rsv,
3041 					  thresh, BTRFS_RESERVE_NO_FLUSH);
3042 		if (!ret)
3043 			trans->chunk_bytes_reserved += thresh;
3044 	}
3045 }
3046 
btrfs_put_block_group_cache(struct btrfs_fs_info * info)3047 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3048 {
3049 	struct btrfs_block_group_cache *block_group;
3050 	u64 last = 0;
3051 
3052 	while (1) {
3053 		struct inode *inode;
3054 
3055 		block_group = btrfs_lookup_first_block_group(info, last);
3056 		while (block_group) {
3057 			btrfs_wait_block_group_cache_done(block_group);
3058 			spin_lock(&block_group->lock);
3059 			if (block_group->iref)
3060 				break;
3061 			spin_unlock(&block_group->lock);
3062 			block_group = btrfs_next_block_group(block_group);
3063 		}
3064 		if (!block_group) {
3065 			if (last == 0)
3066 				break;
3067 			last = 0;
3068 			continue;
3069 		}
3070 
3071 		inode = block_group->inode;
3072 		block_group->iref = 0;
3073 		block_group->inode = NULL;
3074 		spin_unlock(&block_group->lock);
3075 		ASSERT(block_group->io_ctl.inode == NULL);
3076 		iput(inode);
3077 		last = block_group->key.objectid + block_group->key.offset;
3078 		btrfs_put_block_group(block_group);
3079 	}
3080 }
3081 
3082 /*
3083  * Must be called only after stopping all workers, since we could have block
3084  * group caching kthreads running, and therefore they could race with us if we
3085  * freed the block groups before stopping them.
3086  */
btrfs_free_block_groups(struct btrfs_fs_info * info)3087 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3088 {
3089 	struct btrfs_block_group_cache *block_group;
3090 	struct btrfs_space_info *space_info;
3091 	struct btrfs_caching_control *caching_ctl;
3092 	struct rb_node *n;
3093 
3094 	down_write(&info->commit_root_sem);
3095 	while (!list_empty(&info->caching_block_groups)) {
3096 		caching_ctl = list_entry(info->caching_block_groups.next,
3097 					 struct btrfs_caching_control, list);
3098 		list_del(&caching_ctl->list);
3099 		btrfs_put_caching_control(caching_ctl);
3100 	}
3101 	up_write(&info->commit_root_sem);
3102 
3103 	spin_lock(&info->unused_bgs_lock);
3104 	while (!list_empty(&info->unused_bgs)) {
3105 		block_group = list_first_entry(&info->unused_bgs,
3106 					       struct btrfs_block_group_cache,
3107 					       bg_list);
3108 		list_del_init(&block_group->bg_list);
3109 		btrfs_put_block_group(block_group);
3110 	}
3111 	spin_unlock(&info->unused_bgs_lock);
3112 
3113 	spin_lock(&info->block_group_cache_lock);
3114 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3115 		block_group = rb_entry(n, struct btrfs_block_group_cache,
3116 				       cache_node);
3117 		rb_erase(&block_group->cache_node,
3118 			 &info->block_group_cache_tree);
3119 		RB_CLEAR_NODE(&block_group->cache_node);
3120 		spin_unlock(&info->block_group_cache_lock);
3121 
3122 		down_write(&block_group->space_info->groups_sem);
3123 		list_del(&block_group->list);
3124 		up_write(&block_group->space_info->groups_sem);
3125 
3126 		/*
3127 		 * We haven't cached this block group, which means we could
3128 		 * possibly have excluded extents on this block group.
3129 		 */
3130 		if (block_group->cached == BTRFS_CACHE_NO ||
3131 		    block_group->cached == BTRFS_CACHE_ERROR)
3132 			btrfs_free_excluded_extents(block_group);
3133 
3134 		btrfs_remove_free_space_cache(block_group);
3135 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3136 		ASSERT(list_empty(&block_group->dirty_list));
3137 		ASSERT(list_empty(&block_group->io_list));
3138 		ASSERT(list_empty(&block_group->bg_list));
3139 		ASSERT(atomic_read(&block_group->count) == 1);
3140 		btrfs_put_block_group(block_group);
3141 
3142 		spin_lock(&info->block_group_cache_lock);
3143 	}
3144 	spin_unlock(&info->block_group_cache_lock);
3145 
3146 	/*
3147 	 * Now that all the block groups are freed, go through and free all the
3148 	 * space_info structs.  This is only called during the final stages of
3149 	 * unmount, and so we know nobody is using them.  We call
3150 	 * synchronize_rcu() once before we start, just to be on the safe side.
3151 	 */
3152 	synchronize_rcu();
3153 
3154 	btrfs_release_global_block_rsv(info);
3155 
3156 	while (!list_empty(&info->space_info)) {
3157 		space_info = list_entry(info->space_info.next,
3158 					struct btrfs_space_info,
3159 					list);
3160 
3161 		/*
3162 		 * Do not hide this behind enospc_debug, this is actually
3163 		 * important and indicates a real bug if this happens.
3164 		 */
3165 		if (WARN_ON(space_info->bytes_pinned > 0 ||
3166 			    space_info->bytes_reserved > 0 ||
3167 			    space_info->bytes_may_use > 0))
3168 			btrfs_dump_space_info(info, space_info, 0, 0);
3169 		list_del(&space_info->list);
3170 		btrfs_sysfs_remove_space_info(space_info);
3171 	}
3172 	return 0;
3173 }
3174