1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "misc.h"
4 #include "ctree.h"
5 #include "block-group.h"
6 #include "space-info.h"
7 #include "disk-io.h"
8 #include "free-space-cache.h"
9 #include "free-space-tree.h"
11 #include "volumes.h"
12 #include "transaction.h"
13 #include "ref-verify.h"
14 #include "sysfs.h"
15 #include "tree-log.h"
16 #include "delalloc-space.h"
17 
18 /*
19  * Return target flags in extended format or 0 if restripe for this chunk_type
20  * is not in progress
21  *
22  * Should be called with balance_lock held
23  */
24 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
25 {
26 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
27 	u64 target = 0;
28 
29 	if (!bctl)
30 		return 0;
31 
32 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
33 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
34 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
35 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
36 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
37 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
38 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
39 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
40 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
41 	}
42 
43 	return target;
44 }
45 
46 /*
47  * @flags: available profiles in extended format (see ctree.h)
48  *
49  * Return reduced profile in chunk format.  If profile changing is in progress
50  * (either running or paused) picks the target profile (if it's already
51  * available), otherwise falls back to plain reducing.
52  */
53 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
54 {
55 	u64 num_devices = fs_info->fs_devices->rw_devices;
56 	u64 target;
57 	u64 raid_type;
58 	u64 allowed = 0;
59 
60 	/*
61 	 * See if restripe for this chunk_type is in progress, if so try to
62 	 * reduce to the target profile
63 	 */
64 	spin_lock(&fs_info->balance_lock);
65 	target = get_restripe_target(fs_info, flags);
66 	if (target) {
67 		/* Pick target profile only if it's already available */
68 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
69 			spin_unlock(&fs_info->balance_lock);
70 			return extended_to_chunk(target);
71 		}
72 	}
73 	spin_unlock(&fs_info->balance_lock);
74 
75 	/* First, mask out the RAID levels which aren't possible */
76 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
77 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
78 			allowed |= btrfs_raid_array[raid_type].bg_flag;
79 	}
80 	allowed &= flags;
81 
82 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
83 		allowed = BTRFS_BLOCK_GROUP_RAID6;
84 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
85 		allowed = BTRFS_BLOCK_GROUP_RAID5;
86 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
87 		allowed = BTRFS_BLOCK_GROUP_RAID10;
88 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
89 		allowed = BTRFS_BLOCK_GROUP_RAID1;
90 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
91 		allowed = BTRFS_BLOCK_GROUP_RAID0;
92 
93 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
94 
95 	return extended_to_chunk(flags | allowed);
96 }
97 
98 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
99 {
100 	unsigned seq;
101 	u64 flags;
102 
103 	do {
104 		flags = orig_flags;
105 		seq = read_seqbegin(&fs_info->profiles_lock);
106 
107 		if (flags & BTRFS_BLOCK_GROUP_DATA)
108 			flags |= fs_info->avail_data_alloc_bits;
109 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
110 			flags |= fs_info->avail_system_alloc_bits;
111 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
112 			flags |= fs_info->avail_metadata_alloc_bits;
113 	} while (read_seqretry(&fs_info->profiles_lock, seq));
114 
115 	return btrfs_reduce_alloc_profile(fs_info, flags);
116 }
117 
118 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
119 {
120 	return get_alloc_profile(fs_info, orig_flags);
121 }
122 
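/*
 * Reference counting helpers for the in-memory block group cache:
 * btrfs_get_block_group() takes a reference and btrfs_put_block_group()
 * drops one, freeing the structure once the last reference is gone.
 */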
123 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
124 {
125 	atomic_inc(&cache->count);
126 }
127 
128 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
129 {
130 	if (atomic_dec_and_test(&cache->count)) {
131 		WARN_ON(cache->pinned > 0);
132 		WARN_ON(cache->reserved > 0);
133 
134 		/*
135 		 * If not empty, someone is still holding the mutex of a
136 		 * full_stripe_lock, which can only be released by its caller,
137 		 * and it will definitely cause a use-after-free when the caller
138 		 * tries to release the full stripe lock.
139 		 *
140 		 * There is no better way to resolve this, so just warn.
141 		 */
142 		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
143 		kfree(cache->free_space_ctl);
144 		kfree(cache);
145 	}
146 }
147 
148 /*
149  * This adds the block group to the fs_info rb tree for the block group cache
150  */
151 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
152 				struct btrfs_block_group_cache *block_group)
153 {
154 	struct rb_node **p;
155 	struct rb_node *parent = NULL;
156 	struct btrfs_block_group_cache *cache;
157 
158 	spin_lock(&info->block_group_cache_lock);
159 	p = &info->block_group_cache_tree.rb_node;
160 
161 	while (*p) {
162 		parent = *p;
163 		cache = rb_entry(parent, struct btrfs_block_group_cache,
164 				 cache_node);
165 		if (block_group->key.objectid < cache->key.objectid) {
166 			p = &(*p)->rb_left;
167 		} else if (block_group->key.objectid > cache->key.objectid) {
168 			p = &(*p)->rb_right;
169 		} else {
170 			spin_unlock(&info->block_group_cache_lock);
171 			return -EEXIST;
172 		}
173 	}
174 
175 	rb_link_node(&block_group->cache_node, parent, p);
176 	rb_insert_color(&block_group->cache_node,
177 			&info->block_group_cache_tree);
178 
179 	if (info->first_logical_byte > block_group->key.objectid)
180 		info->first_logical_byte = block_group->key.objectid;
181 
182 	spin_unlock(&info->block_group_cache_lock);
183 
184 	return 0;
185 }
186 
187 /*
188  * This will return the block group at or after bytenr if contains is 0, else
189  * it will return the block group that contains the bytenr
190  */
191 static struct btrfs_block_group_cache *block_group_cache_tree_search(
192 		struct btrfs_fs_info *info, u64 bytenr, int contains)
193 {
194 	struct btrfs_block_group_cache *cache, *ret = NULL;
195 	struct rb_node *n;
196 	u64 end, start;
197 
198 	spin_lock(&info->block_group_cache_lock);
199 	n = info->block_group_cache_tree.rb_node;
200 
201 	while (n) {
202 		cache = rb_entry(n, struct btrfs_block_group_cache,
203 				 cache_node);
204 		end = cache->key.objectid + cache->key.offset - 1;
205 		start = cache->key.objectid;
206 
207 		if (bytenr < start) {
208 			if (!contains && (!ret || start < ret->key.objectid))
209 				ret = cache;
210 			n = n->rb_left;
211 		} else if (bytenr > start) {
212 			if (contains && bytenr <= end) {
213 				ret = cache;
214 				break;
215 			}
216 			n = n->rb_right;
217 		} else {
218 			ret = cache;
219 			break;
220 		}
221 	}
222 	if (ret) {
223 		btrfs_get_block_group(ret);
224 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
225 			info->first_logical_byte = ret->key.objectid;
226 	}
227 	spin_unlock(&info->block_group_cache_lock);
228 
229 	return ret;
230 }
231 
232 /*
233  * Return the block group that starts at or after bytenr
234  */
235 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
236 		struct btrfs_fs_info *info, u64 bytenr)
237 {
238 	return block_group_cache_tree_search(info, bytenr, 0);
239 }
240 
241 /*
242  * Return the block group that contains the given bytenr
243  */
244 struct btrfs_block_group_cache *btrfs_lookup_block_group(
245 		struct btrfs_fs_info *info, u64 bytenr)
246 {
247 	return block_group_cache_tree_search(info, bytenr, 1);
248 }
249 
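/*
 * Return the block group that logically follows @cache.  The caller's
 * reference on @cache is dropped and a reference is taken on the returned
 * block group; NULL is returned when there are no more block groups.
 */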
250 struct btrfs_block_group_cache *btrfs_next_block_group(
251 		struct btrfs_block_group_cache *cache)
252 {
253 	struct btrfs_fs_info *fs_info = cache->fs_info;
254 	struct rb_node *node;
255 
256 	spin_lock(&fs_info->block_group_cache_lock);
257 
258 	/* If our block group was removed, we need a full search. */
259 	if (RB_EMPTY_NODE(&cache->cache_node)) {
260 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
261 
262 		spin_unlock(&fs_info->block_group_cache_lock);
263 		btrfs_put_block_group(cache);
264 		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
		return cache;
265 	}
266 	node = rb_next(&cache->cache_node);
267 	btrfs_put_block_group(cache);
268 	if (node) {
269 		cache = rb_entry(node, struct btrfs_block_group_cache,
270 				 cache_node);
271 		btrfs_get_block_group(cache);
272 	} else
273 		cache = NULL;
274 	spin_unlock(&fs_info->block_group_cache_lock);
275 	return cache;
276 }
277 
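/*
 * Account a nocow writer in the block group containing @bytenr.  Returns
 * false if the block group is read-only or does not exist; on success the
 * block group reference is kept until btrfs_dec_nocow_writers() is called.
 */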
278 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
279 {
280 	struct btrfs_block_group_cache *bg;
281 	bool ret = true;
282 
283 	bg = btrfs_lookup_block_group(fs_info, bytenr);
284 	if (!bg)
285 		return false;
286 
287 	spin_lock(&bg->lock);
288 	if (bg->ro)
289 		ret = false;
290 	else
291 		atomic_inc(&bg->nocow_writers);
292 	spin_unlock(&bg->lock);
293 
294 	/* No put on block group, done by btrfs_dec_nocow_writers */
295 	if (!ret)
296 		btrfs_put_block_group(bg);
297 
298 	return ret;
299 }
300 
301 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
302 {
303 	struct btrfs_block_group_cache *bg;
304 
305 	bg = btrfs_lookup_block_group(fs_info, bytenr);
306 	ASSERT(bg);
307 	if (atomic_dec_and_test(&bg->nocow_writers))
308 		wake_up_var(&bg->nocow_writers);
309 	/*
310 	 * Once for our lookup and once for the lookup done by a previous call
311 	 * to btrfs_inc_nocow_writers()
312 	 */
313 	btrfs_put_block_group(bg);
314 	btrfs_put_block_group(bg);
315 }
316 
317 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
318 {
319 	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
320 }
321 
322 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
323 					const u64 start)
324 {
325 	struct btrfs_block_group_cache *bg;
326 
327 	bg = btrfs_lookup_block_group(fs_info, start);
328 	ASSERT(bg);
329 	if (atomic_dec_and_test(&bg->reservations))
330 		wake_up_var(&bg->reservations);
331 	btrfs_put_block_group(bg);
332 }
333 
334 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
335 {
336 	struct btrfs_space_info *space_info = bg->space_info;
337 
338 	ASSERT(bg->ro);
339 
340 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
341 		return;
342 
343 	/*
344 	 * Our block group is read only but before we set it to read only,
345 	 * some task might have allocated an extent from it already, but it
346 	 * has not yet created a respective ordered extent (and added it to a
347 	 * root's list of ordered extents).
348 	 * Therefore wait for any task currently allocating extents, since the
349 	 * block group's reservations counter is incremented while a read lock
350 	 * on the groups' semaphore is held and decremented after releasing
351 	 * the read access on that semaphore and creating the ordered extent.
352 	 */
353 	down_write(&space_info->groups_sem);
354 	up_write(&space_info->groups_sem);
355 
356 	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
357 }
358 
359 struct btrfs_caching_control *btrfs_get_caching_control(
360 		struct btrfs_block_group_cache *cache)
361 {
362 	struct btrfs_caching_control *ctl;
363 
364 	spin_lock(&cache->lock);
365 	if (!cache->caching_ctl) {
366 		spin_unlock(&cache->lock);
367 		return NULL;
368 	}
369 
370 	ctl = cache->caching_ctl;
371 	refcount_inc(&ctl->count);
372 	spin_unlock(&cache->lock);
373 	return ctl;
374 }
375 
376 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
377 {
378 	if (refcount_dec_and_test(&ctl->count))
379 		kfree(ctl);
380 }
381 
382 /*
383  * When we wait for progress in the block group caching, it's because our
384  * allocation attempt failed at least once.  So, we must sleep and let some
385  * progress happen before we try again.
386  *
387  * This function will sleep at least once waiting for new free space to show
388  * up, and then it will check the block group free space numbers for our min
389  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
390  * a free extent of a given size, but this is a good start.
391  *
392  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
393  * any of the information in this block group.
394  */
395 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
396 					   u64 num_bytes)
397 {
398 	struct btrfs_caching_control *caching_ctl;
399 
400 	caching_ctl = btrfs_get_caching_control(cache);
401 	if (!caching_ctl)
402 		return;
403 
404 	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
405 		   (cache->free_space_ctl->free_space >= num_bytes));
406 
407 	btrfs_put_caching_control(caching_ctl);
408 }
409 
410 int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
411 {
412 	struct btrfs_caching_control *caching_ctl;
413 	int ret = 0;
414 
415 	caching_ctl = btrfs_get_caching_control(cache);
416 	if (!caching_ctl)
417 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
418 
419 	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
420 	if (cache->cached == BTRFS_CACHE_ERROR)
421 		ret = -EIO;
422 	btrfs_put_caching_control(caching_ctl);
423 	return ret;
424 }
425 
426 #ifdef CONFIG_BTRFS_DEBUG
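/*
 * Debug helper: punch chunk sized holes into the block group's free space so
 * that allocations from it are forced to be fragmented.
 */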
427 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
428 {
429 	struct btrfs_fs_info *fs_info = block_group->fs_info;
430 	u64 start = block_group->key.objectid;
431 	u64 len = block_group->key.offset;
432 	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
433 		fs_info->nodesize : fs_info->sectorsize;
434 	u64 step = chunk << 1;
435 
436 	while (len > chunk) {
437 		btrfs_remove_free_space(block_group, start, chunk);
438 		start += step;
439 		if (len < step)
440 			len = 0;
441 		else
442 			len -= step;
443 	}
444 }
445 #endif
446 
447 /*
448  * This is only called by btrfs_cache_block_group.  Since we could have freed
449  * extents, we need to check the pinned_extents for any extents that can't be
450  * used yet, as their free space will be released as soon as the transaction
451  * commits.
452  */
453 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
454 		       u64 start, u64 end)
455 {
456 	struct btrfs_fs_info *info = block_group->fs_info;
457 	u64 extent_start, extent_end, size, total_added = 0;
458 	int ret;
459 
460 	while (start < end) {
461 		ret = find_first_extent_bit(info->pinned_extents, start,
462 					    &extent_start, &extent_end,
463 					    EXTENT_DIRTY | EXTENT_UPTODATE,
464 					    NULL);
465 		if (ret)
466 			break;
467 
468 		if (extent_start <= start) {
469 			start = extent_end + 1;
470 		} else if (extent_start > start && extent_start < end) {
471 			size = extent_start - start;
472 			total_added += size;
473 			ret = btrfs_add_free_space(block_group, start,
474 						   size);
475 			BUG_ON(ret); /* -ENOMEM or logic error */
476 			start = extent_end + 1;
477 		} else {
478 			break;
479 		}
480 	}
481 
482 	if (start < end) {
483 		size = end - start;
484 		total_added += size;
485 		ret = btrfs_add_free_space(block_group, start, size);
486 		BUG_ON(ret); /* -ENOMEM or logic error */
487 	}
488 
489 	return total_added;
490 }
491 
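/*
 * Slow caching path: walk the extent tree (via the commit root) and add every
 * hole between allocated extents to the block group's free space cache.
 */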
492 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
493 {
494 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
495 	struct btrfs_fs_info *fs_info = block_group->fs_info;
496 	struct btrfs_root *extent_root = fs_info->extent_root;
497 	struct btrfs_path *path;
498 	struct extent_buffer *leaf;
499 	struct btrfs_key key;
500 	u64 total_found = 0;
501 	u64 last = 0;
502 	u32 nritems;
503 	int ret;
504 	bool wakeup = true;
505 
506 	path = btrfs_alloc_path();
507 	if (!path)
508 		return -ENOMEM;
509 
510 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
511 
512 #ifdef CONFIG_BTRFS_DEBUG
513 	/*
514 	 * If we're fragmenting we don't want to make anybody think we can
515 	 * allocate from this block group until we've had a chance to fragment
516 	 * the free space.
517 	 */
518 	if (btrfs_should_fragment_free_space(block_group))
519 		wakeup = false;
520 #endif
521 	/*
522 	 * We don't want to deadlock with somebody trying to allocate a new
523 	 * extent for the extent root while also trying to search the extent
524 	 * root to add free space.  So we skip locking and search the commit
525 	 * root, since it's read-only.
526 	 */
527 	path->skip_locking = 1;
528 	path->search_commit_root = 1;
529 	path->reada = READA_FORWARD;
530 
531 	key.objectid = last;
532 	key.offset = 0;
533 	key.type = BTRFS_EXTENT_ITEM_KEY;
534 
535 next:
536 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
537 	if (ret < 0)
538 		goto out;
539 
540 	leaf = path->nodes[0];
541 	nritems = btrfs_header_nritems(leaf);
542 
543 	while (1) {
544 		if (btrfs_fs_closing(fs_info) > 1) {
545 			last = (u64)-1;
546 			break;
547 		}
548 
549 		if (path->slots[0] < nritems) {
550 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
551 		} else {
552 			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
553 			if (ret)
554 				break;
555 
556 			if (need_resched() ||
557 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
558 				if (wakeup)
559 					caching_ctl->progress = last;
560 				btrfs_release_path(path);
561 				up_read(&fs_info->commit_root_sem);
562 				mutex_unlock(&caching_ctl->mutex);
563 				cond_resched();
564 				mutex_lock(&caching_ctl->mutex);
565 				down_read(&fs_info->commit_root_sem);
566 				goto next;
567 			}
568 
569 			ret = btrfs_next_leaf(extent_root, path);
570 			if (ret < 0)
571 				goto out;
572 			if (ret)
573 				break;
574 			leaf = path->nodes[0];
575 			nritems = btrfs_header_nritems(leaf);
576 			continue;
577 		}
578 
579 		if (key.objectid < last) {
580 			key.objectid = last;
581 			key.offset = 0;
582 			key.type = BTRFS_EXTENT_ITEM_KEY;
583 
584 			if (wakeup)
585 				caching_ctl->progress = last;
586 			btrfs_release_path(path);
587 			goto next;
588 		}
589 
590 		if (key.objectid < block_group->key.objectid) {
591 			path->slots[0]++;
592 			continue;
593 		}
594 
595 		if (key.objectid >= block_group->key.objectid +
596 		    block_group->key.offset)
597 			break;
598 
599 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
600 		    key.type == BTRFS_METADATA_ITEM_KEY) {
601 			total_found += add_new_free_space(block_group, last,
602 							  key.objectid);
603 			if (key.type == BTRFS_METADATA_ITEM_KEY)
604 				last = key.objectid +
605 					fs_info->nodesize;
606 			else
607 				last = key.objectid + key.offset;
608 
609 			if (total_found > CACHING_CTL_WAKE_UP) {
610 				total_found = 0;
611 				if (wakeup)
612 					wake_up(&caching_ctl->wait);
613 			}
614 		}
615 		path->slots[0]++;
616 	}
617 	ret = 0;
618 
619 	total_found += add_new_free_space(block_group, last,
620 					  block_group->key.objectid +
621 					  block_group->key.offset);
622 	caching_ctl->progress = (u64)-1;
623 
624 out:
625 	btrfs_free_path(path);
626 	return ret;
627 }
628 
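/*
 * Worker that populates a block group's free space cache, either from the
 * free space tree or by scanning the extent tree, and then marks the block
 * group as cached (or as errored if loading failed).
 */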
629 static noinline void caching_thread(struct btrfs_work *work)
630 {
631 	struct btrfs_block_group_cache *block_group;
632 	struct btrfs_fs_info *fs_info;
633 	struct btrfs_caching_control *caching_ctl;
634 	int ret;
635 
636 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
637 	block_group = caching_ctl->block_group;
638 	fs_info = block_group->fs_info;
639 
640 	mutex_lock(&caching_ctl->mutex);
641 	down_read(&fs_info->commit_root_sem);
642 
643 	/*
644 	 * If we are in the transaction that populated the free space tree we
645 	 * can't actually cache from the free space tree as our commit root and
646 	 * real root are the same, so we could change the contents of the blocks
647 	 * while caching.  Instead do the slow caching in this case, and after
648 	 * the transaction has committed we will be safe.
649 	 */
650 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
651 	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
652 		ret = load_free_space_tree(caching_ctl);
653 	else
654 		ret = load_extent_tree_free(caching_ctl);
655 
656 	spin_lock(&block_group->lock);
657 	block_group->caching_ctl = NULL;
658 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
659 	spin_unlock(&block_group->lock);
660 
661 #ifdef CONFIG_BTRFS_DEBUG
662 	if (btrfs_should_fragment_free_space(block_group)) {
663 		u64 bytes_used;
664 
665 		spin_lock(&block_group->space_info->lock);
666 		spin_lock(&block_group->lock);
667 		bytes_used = block_group->key.offset -
668 			btrfs_block_group_used(&block_group->item);
669 		block_group->space_info->bytes_used += bytes_used >> 1;
670 		spin_unlock(&block_group->lock);
671 		spin_unlock(&block_group->space_info->lock);
672 		fragment_free_space(block_group);
673 	}
674 #endif
675 
676 	caching_ctl->progress = (u64)-1;
677 
678 	up_read(&fs_info->commit_root_sem);
679 	btrfs_free_excluded_extents(block_group);
680 	mutex_unlock(&caching_ctl->mutex);
681 
682 	wake_up(&caching_ctl->wait);
683 
684 	btrfs_put_caching_control(caching_ctl);
685 	btrfs_put_block_group(block_group);
686 }
687 
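/*
 * Start caching the free space of @cache.  With the space cache enabled we
 * first attempt a fast synchronous load from the on-disk cache; otherwise,
 * unless @load_cache_only is set, the work is handed off to caching_thread().
 */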
688 int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
689 			    int load_cache_only)
690 {
691 	DEFINE_WAIT(wait);
692 	struct btrfs_fs_info *fs_info = cache->fs_info;
693 	struct btrfs_caching_control *caching_ctl;
694 	int ret = 0;
695 
696 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
697 	if (!caching_ctl)
698 		return -ENOMEM;
699 
700 	INIT_LIST_HEAD(&caching_ctl->list);
701 	mutex_init(&caching_ctl->mutex);
702 	init_waitqueue_head(&caching_ctl->wait);
703 	caching_ctl->block_group = cache;
704 	caching_ctl->progress = cache->key.objectid;
705 	refcount_set(&caching_ctl->count, 1);
706 	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
707 
708 	spin_lock(&cache->lock);
709 	/*
710 	 * This should be a rare occasion, but this could happen I think in the
711 	 * case where one thread starts to load the space cache info, and then
712 	 * some other thread starts a transaction commit which tries to do an
713 	 * allocation while the other thread is still loading the space cache
714 	 * info.  The previous loop should have kept us from choosing this block
715 	 * group, but if we've moved to the state where we will wait on caching
716 	 * block groups we need to first check if we're doing a fast load here,
717 	 * so we can wait for it to finish, otherwise we could end up allocating
718 	 * from a block group whose cache gets evicted for one reason or
719 	 * another.
720 	 */
721 	while (cache->cached == BTRFS_CACHE_FAST) {
722 		struct btrfs_caching_control *ctl;
723 
724 		ctl = cache->caching_ctl;
725 		refcount_inc(&ctl->count);
726 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
727 		spin_unlock(&cache->lock);
728 
729 		schedule();
730 
731 		finish_wait(&ctl->wait, &wait);
732 		btrfs_put_caching_control(ctl);
733 		spin_lock(&cache->lock);
734 	}
735 
736 	if (cache->cached != BTRFS_CACHE_NO) {
737 		spin_unlock(&cache->lock);
738 		kfree(caching_ctl);
739 		return 0;
740 	}
741 	WARN_ON(cache->caching_ctl);
742 	cache->caching_ctl = caching_ctl;
743 	cache->cached = BTRFS_CACHE_FAST;
744 	spin_unlock(&cache->lock);
745 
746 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
747 		mutex_lock(&caching_ctl->mutex);
748 		ret = load_free_space_cache(cache);
749 
750 		spin_lock(&cache->lock);
751 		if (ret == 1) {
752 			cache->caching_ctl = NULL;
753 			cache->cached = BTRFS_CACHE_FINISHED;
754 			cache->last_byte_to_unpin = (u64)-1;
755 			caching_ctl->progress = (u64)-1;
756 		} else {
757 			if (load_cache_only) {
758 				cache->caching_ctl = NULL;
759 				cache->cached = BTRFS_CACHE_NO;
760 			} else {
761 				cache->cached = BTRFS_CACHE_STARTED;
762 				cache->has_caching_ctl = 1;
763 			}
764 		}
765 		spin_unlock(&cache->lock);
766 #ifdef CONFIG_BTRFS_DEBUG
767 		if (ret == 1 &&
768 		    btrfs_should_fragment_free_space(cache)) {
769 			u64 bytes_used;
770 
771 			spin_lock(&cache->space_info->lock);
772 			spin_lock(&cache->lock);
773 			bytes_used = cache->key.offset -
774 				btrfs_block_group_used(&cache->item);
775 			cache->space_info->bytes_used += bytes_used >> 1;
776 			spin_unlock(&cache->lock);
777 			spin_unlock(&cache->space_info->lock);
778 			fragment_free_space(cache);
779 		}
780 #endif
781 		mutex_unlock(&caching_ctl->mutex);
782 
783 		wake_up(&caching_ctl->wait);
784 		if (ret == 1) {
785 			btrfs_put_caching_control(caching_ctl);
786 			btrfs_free_excluded_extents(cache);
787 			return 0;
788 		}
789 	} else {
790 		/*
791 		 * We're either using the free space tree or no caching at all.
792 		 * Set cached to the appropriate value and wakeup any waiters.
793 		 */
794 		spin_lock(&cache->lock);
795 		if (load_cache_only) {
796 			cache->caching_ctl = NULL;
797 			cache->cached = BTRFS_CACHE_NO;
798 		} else {
799 			cache->cached = BTRFS_CACHE_STARTED;
800 			cache->has_caching_ctl = 1;
801 		}
802 		spin_unlock(&cache->lock);
803 		wake_up(&caching_ctl->wait);
804 	}
805 
806 	if (load_cache_only) {
807 		btrfs_put_caching_control(caching_ctl);
808 		return 0;
809 	}
810 
811 	down_write(&fs_info->commit_root_sem);
812 	refcount_inc(&caching_ctl->count);
813 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
814 	up_write(&fs_info->commit_root_sem);
815 
816 	btrfs_get_block_group(cache);
817 
818 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
819 
820 	return ret;
821 }
822 
823 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
824 {
825 	u64 extra_flags = chunk_to_extended(flags) &
826 				BTRFS_EXTENDED_PROFILE_MASK;
827 
828 	write_seqlock(&fs_info->profiles_lock);
829 	if (flags & BTRFS_BLOCK_GROUP_DATA)
830 		fs_info->avail_data_alloc_bits &= ~extra_flags;
831 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
832 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
833 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
834 		fs_info->avail_system_alloc_bits &= ~extra_flags;
835 	write_sequnlock(&fs_info->profiles_lock);
836 }
837 
838 /*
839  * Clear incompat bits for the following feature(s):
840  *
841  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
842  *            in the whole filesystem
843  */
844 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
845 {
846 	if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
847 		struct list_head *head = &fs_info->space_info;
848 		struct btrfs_space_info *sinfo;
849 
850 		list_for_each_entry_rcu(sinfo, head, list) {
851 			bool found = false;
852 
853 			down_read(&sinfo->groups_sem);
854 			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
855 				found = true;
856 			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
857 				found = true;
858 			up_read(&sinfo->groups_sem);
859 
860 			if (found)
861 				return;
862 		}
863 		btrfs_clear_fs_incompat(fs_info, RAID56);
864 	}
865 }
866 
867 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
868 			     u64 group_start, struct extent_map *em)
869 {
870 	struct btrfs_fs_info *fs_info = trans->fs_info;
871 	struct btrfs_root *root = fs_info->extent_root;
872 	struct btrfs_path *path;
873 	struct btrfs_block_group_cache *block_group;
874 	struct btrfs_free_cluster *cluster;
875 	struct btrfs_root *tree_root = fs_info->tree_root;
876 	struct btrfs_key key;
877 	struct inode *inode;
878 	struct kobject *kobj = NULL;
879 	int ret;
880 	int index;
881 	int factor;
882 	struct btrfs_caching_control *caching_ctl = NULL;
883 	bool remove_em;
884 	bool remove_rsv = false;
885 
886 	block_group = btrfs_lookup_block_group(fs_info, group_start);
887 	BUG_ON(!block_group);
888 	BUG_ON(!block_group->ro);
889 
890 	trace_btrfs_remove_block_group(block_group);
891 	/*
892 	 * Free the reserved super bytes from this block group before
893 	 * removing it.
894 	 */
895 	btrfs_free_excluded_extents(block_group);
896 	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
897 				  block_group->key.offset);
898 
899 	memcpy(&key, &block_group->key, sizeof(key));
900 	index = btrfs_bg_flags_to_raid_index(block_group->flags);
901 	factor = btrfs_bg_type_to_factor(block_group->flags);
902 
903 	/* make sure this block group isn't part of an allocation cluster */
904 	cluster = &fs_info->data_alloc_cluster;
905 	spin_lock(&cluster->refill_lock);
906 	btrfs_return_cluster_to_free_space(block_group, cluster);
907 	spin_unlock(&cluster->refill_lock);
908 
909 	/*
910 	 * make sure this block group isn't part of a metadata
911 	 * allocation cluster
912 	 */
913 	cluster = &fs_info->meta_alloc_cluster;
914 	spin_lock(&cluster->refill_lock);
915 	btrfs_return_cluster_to_free_space(block_group, cluster);
916 	spin_unlock(&cluster->refill_lock);
917 
918 	path = btrfs_alloc_path();
919 	if (!path) {
920 		ret = -ENOMEM;
921 		goto out;
922 	}
923 
924 	/*
925 	 * get the inode first so any iput calls done for the io_list
926 	 * aren't the final iput (no unlinks allowed now)
927 	 */
928 	inode = lookup_free_space_inode(block_group, path);
929 
930 	mutex_lock(&trans->transaction->cache_write_mutex);
931 	/*
932 	 * Make sure our free space cache IO is done before removing the
933 	 * free space inode
934 	 */
935 	spin_lock(&trans->transaction->dirty_bgs_lock);
936 	if (!list_empty(&block_group->io_list)) {
937 		list_del_init(&block_group->io_list);
938 
939 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
940 
941 		spin_unlock(&trans->transaction->dirty_bgs_lock);
942 		btrfs_wait_cache_io(trans, block_group, path);
943 		btrfs_put_block_group(block_group);
944 		spin_lock(&trans->transaction->dirty_bgs_lock);
945 	}
946 
947 	if (!list_empty(&block_group->dirty_list)) {
948 		list_del_init(&block_group->dirty_list);
949 		remove_rsv = true;
950 		btrfs_put_block_group(block_group);
951 	}
952 	spin_unlock(&trans->transaction->dirty_bgs_lock);
953 	mutex_unlock(&trans->transaction->cache_write_mutex);
954 
955 	if (!IS_ERR(inode)) {
956 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
957 		if (ret) {
958 			btrfs_add_delayed_iput(inode);
959 			goto out;
960 		}
961 		clear_nlink(inode);
962 		/* One for the block groups ref */
963 		spin_lock(&block_group->lock);
964 		if (block_group->iref) {
965 			block_group->iref = 0;
966 			block_group->inode = NULL;
967 			spin_unlock(&block_group->lock);
968 			iput(inode);
969 		} else {
970 			spin_unlock(&block_group->lock);
971 		}
972 		/* One for our lookup ref */
973 		btrfs_add_delayed_iput(inode);
974 	}
975 
976 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
977 	key.offset = block_group->key.objectid;
978 	key.type = 0;
979 
980 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
981 	if (ret < 0)
982 		goto out;
983 	if (ret > 0)
984 		btrfs_release_path(path);
985 	if (ret == 0) {
986 		ret = btrfs_del_item(trans, tree_root, path);
987 		if (ret)
988 			goto out;
989 		btrfs_release_path(path);
990 	}
991 
992 	spin_lock(&fs_info->block_group_cache_lock);
993 	rb_erase(&block_group->cache_node,
994 		 &fs_info->block_group_cache_tree);
995 	RB_CLEAR_NODE(&block_group->cache_node);
996 
997 	/* Once for the block groups rbtree */
998 	btrfs_put_block_group(block_group);
999 
1000 	if (fs_info->first_logical_byte == block_group->key.objectid)
1001 		fs_info->first_logical_byte = (u64)-1;
1002 	spin_unlock(&fs_info->block_group_cache_lock);
1003 
1004 	down_write(&block_group->space_info->groups_sem);
1005 	/*
1006 	 * we must use list_del_init so people can check to see if they
1007 	 * are still on the list after taking the semaphore
1008 	 */
1009 	list_del_init(&block_group->list);
1010 	if (list_empty(&block_group->space_info->block_groups[index])) {
1011 		kobj = block_group->space_info->block_group_kobjs[index];
1012 		block_group->space_info->block_group_kobjs[index] = NULL;
1013 		clear_avail_alloc_bits(fs_info, block_group->flags);
1014 	}
1015 	up_write(&block_group->space_info->groups_sem);
1016 	clear_incompat_bg_bits(fs_info, block_group->flags);
1017 	if (kobj) {
1018 		kobject_del(kobj);
1019 		kobject_put(kobj);
1020 	}
1021 
1022 	if (block_group->has_caching_ctl)
1023 		caching_ctl = btrfs_get_caching_control(block_group);
1024 	if (block_group->cached == BTRFS_CACHE_STARTED)
1025 		btrfs_wait_block_group_cache_done(block_group);
1026 	if (block_group->has_caching_ctl) {
1027 		down_write(&fs_info->commit_root_sem);
1028 		if (!caching_ctl) {
1029 			struct btrfs_caching_control *ctl;
1030 
1031 			list_for_each_entry(ctl,
1032 				    &fs_info->caching_block_groups, list)
1033 				if (ctl->block_group == block_group) {
1034 					caching_ctl = ctl;
1035 					refcount_inc(&caching_ctl->count);
1036 					break;
1037 				}
1038 		}
1039 		if (caching_ctl)
1040 			list_del_init(&caching_ctl->list);
1041 		up_write(&fs_info->commit_root_sem);
1042 		if (caching_ctl) {
1043 			/* Once for the caching bgs list and once for us. */
1044 			btrfs_put_caching_control(caching_ctl);
1045 			btrfs_put_caching_control(caching_ctl);
1046 		}
1047 	}
1048 
1049 	spin_lock(&trans->transaction->dirty_bgs_lock);
1050 	WARN_ON(!list_empty(&block_group->dirty_list));
1051 	WARN_ON(!list_empty(&block_group->io_list));
1052 	spin_unlock(&trans->transaction->dirty_bgs_lock);
1053 
1054 	btrfs_remove_free_space_cache(block_group);
1055 
1056 	spin_lock(&block_group->space_info->lock);
1057 	list_del_init(&block_group->ro_list);
1058 
1059 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1060 		WARN_ON(block_group->space_info->total_bytes
1061 			< block_group->key.offset);
1062 		WARN_ON(block_group->space_info->bytes_readonly
1063 			< block_group->key.offset);
1064 		WARN_ON(block_group->space_info->disk_total
1065 			< block_group->key.offset * factor);
1066 	}
1067 	block_group->space_info->total_bytes -= block_group->key.offset;
1068 	block_group->space_info->bytes_readonly -= block_group->key.offset;
1069 	block_group->space_info->disk_total -= block_group->key.offset * factor;
1070 
1071 	spin_unlock(&block_group->space_info->lock);
1072 
1073 	memcpy(&key, &block_group->key, sizeof(key));
1074 
1075 	mutex_lock(&fs_info->chunk_mutex);
1076 	spin_lock(&block_group->lock);
1077 	block_group->removed = 1;
1078 	/*
1079 	 * At this point trimming can't start on this block group, because we
1080 	 * removed the block group from the tree fs_info->block_group_cache_tree
1081 	 * so no one can find it anymore, and even if someone already got this
1082 	 * block group before we removed it from the rbtree, they have already
1083 	 * incremented block_group->trimming - if they didn't, they won't find
1084 	 * any free space entries because we already removed them all when we
1085 	 * called btrfs_remove_free_space_cache().
1086 	 *
1087 	 * And we must not remove the extent map from the fs_info->mapping_tree
1088 	 * to prevent the same logical address range and physical device space
1089 	 * ranges from being reused for a new block group. This is because our
1090 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1091 	 * completely transactionless, so while it is trimming a range the
1092 	 * currently running transaction might finish and a new one start,
1093 	 * allowing for new block groups to be created that can reuse the same
1094 	 * physical device locations unless we take this special care.
1095 	 *
1096 	 * There may also be an implicit trim operation if the file system
1097 	 * is mounted with -odiscard. The same protections must remain
1098 	 * in place until the extents have been discarded completely when
1099 	 * the transaction commit has completed.
1100 	 */
1101 	remove_em = (atomic_read(&block_group->trimming) == 0);
1102 	spin_unlock(&block_group->lock);
1103 
1104 	mutex_unlock(&fs_info->chunk_mutex);
1105 
1106 	ret = remove_block_group_free_space(trans, block_group);
1107 	if (ret)
1108 		goto out;
1109 
1110 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111 	if (ret > 0)
1112 		ret = -EIO;
1113 	if (ret < 0)
1114 		goto out;
1115 
1116 	ret = btrfs_del_item(trans, root, path);
1117 	if (ret)
1118 		goto out;
1119 
1120 	if (remove_em) {
1121 		struct extent_map_tree *em_tree;
1122 
1123 		em_tree = &fs_info->mapping_tree;
1124 		write_lock(&em_tree->lock);
1125 		remove_extent_mapping(em_tree, em);
1126 		write_unlock(&em_tree->lock);
1127 		/* once for the tree */
1128 		free_extent_map(em);
1129 	}
1130 
1131 out:
1132 	/* Once for the lookup reference */
1133 	btrfs_put_block_group(block_group);
1134 	if (remove_rsv)
1135 		btrfs_delayed_refs_rsv_release(fs_info, 1);
1136 	btrfs_free_path(path);
1137 	return ret;
1138 }
1139 
1140 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1141 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1142 {
1143 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1144 	struct extent_map *em;
1145 	struct map_lookup *map;
1146 	unsigned int num_items;
1147 
1148 	read_lock(&em_tree->lock);
1149 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1150 	read_unlock(&em_tree->lock);
1151 	ASSERT(em && em->start == chunk_offset);
1152 
1153 	/*
1154 	 * We need to reserve 3 + N units from the metadata space info in order
1155 	 * to remove a block group (done at btrfs_remove_chunk() and at
1156 	 * btrfs_remove_block_group()), which are used for:
1157 	 *
1158 	 * 1 unit for adding the free space inode's orphan (located in the tree
1159 	 * of tree roots).
1160 	 * 1 unit for deleting the block group item (located in the extent
1161 	 * tree).
1162 	 * 1 unit for deleting the free space item (located in tree of tree
1163 	 * roots).
1164 	 * N units for deleting N device extent items corresponding to each
1165 	 * stripe (located in the device tree).
1166 	 *
1167 	 * In order to remove a block group we also need to reserve units in the
1168 	 * system space info in order to update the chunk tree (update one or
1169 	 * more device items and remove one chunk item), but this is done at
1170 	 * btrfs_remove_chunk() through a call to check_system_chunk().
1171 	 */
1172 	map = em->map_lookup;
1173 	num_items = 3 + map->num_stripes;
1174 	free_extent_map(em);
1175 
1176 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1177 							   num_items);
1178 }
1179 
1180 /*
1181  * Mark block group @cache read-only, so later writes won't happen to block
1182  * group @cache.
1183  *
1184  * If @force is not set, this function will only mark the block group readonly
1185  * if we have enough free space (1M) in other metadata/system block groups.
1186  * If @force is set, this function will mark the block group readonly
1187  * without checking free space.
1188  *
1189  * NOTE: This function doesn't care if other block groups can contain all the
1190  * data in this block group. That check should be done by relocation routine,
1191  * not this function.
1192  */
1193 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
1194 {
1195 	struct btrfs_space_info *sinfo = cache->space_info;
1196 	u64 num_bytes;
1197 	u64 min_allocable_bytes;
1198 	int ret = -ENOSPC;
1199 
1200 	/*
1201 	 * We need some metadata space and system metadata space for
1202 	 * allocating chunks in some corner cases, so keep a minimum free
1203 	 * space reserve unless we are forced to mark it read-only.
1204 	 */
1205 	if ((sinfo->flags &
1206 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
1207 	    !force)
1208 		min_allocable_bytes = SZ_1M;
1209 	else
1210 		min_allocable_bytes = 0;
1211 
1212 	spin_lock(&sinfo->lock);
1213 	spin_lock(&cache->lock);
1214 
1215 	if (cache->ro) {
1216 		cache->ro++;
1217 		ret = 0;
1218 		goto out;
1219 	}
1220 
1221 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
1222 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
1223 
1224 	/*
1225 	 * Data never overcommits, even in mixed mode, so do just the straight
1226 	 * check of left over space in how much we have allocated.
1227 	 */
1228 	if (force) {
1229 		ret = 0;
1230 	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1231 		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1232 
1233 		/*
1234 		 * Here we make sure if we mark this bg RO, we still have enough
1235 		 * free space as buffer.
1236 		 */
1237 		if (sinfo_used + num_bytes <= sinfo->total_bytes)
1238 			ret = 0;
1239 	} else {
1240 		/*
1241 		 * We overcommit metadata, so we need to do the
1242 		 * btrfs_can_overcommit check here, and we need to pass in
1243 		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1244 		 * leeway to allow us to mark this block group as read only.
1245 		 */
1246 		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1247 					 BTRFS_RESERVE_NO_FLUSH))
1248 			ret = 0;
1249 	}
1250 
1251 	if (!ret) {
1252 		sinfo->bytes_readonly += num_bytes;
1253 		cache->ro++;
1254 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1255 	}
1256 out:
1257 	spin_unlock(&cache->lock);
1258 	spin_unlock(&sinfo->lock);
1259 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1260 		btrfs_info(cache->fs_info,
1261 			"unable to make block group %llu ro",
1262 			cache->key.objectid);
1263 		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1264 	}
1265 	return ret;
1266 }
1267 
1268 /*
1269  * Process the unused_bgs list and remove any that don't have any allocated
1270  * space inside of them.
1271  */
1272 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1273 {
1274 	struct btrfs_block_group_cache *block_group;
1275 	struct btrfs_space_info *space_info;
1276 	struct btrfs_trans_handle *trans;
1277 	int ret = 0;
1278 
1279 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1280 		return;
1281 
1282 	spin_lock(&fs_info->unused_bgs_lock);
1283 	while (!list_empty(&fs_info->unused_bgs)) {
1284 		u64 start, end;
1285 		int trimming;
1286 
1287 		block_group = list_first_entry(&fs_info->unused_bgs,
1288 					       struct btrfs_block_group_cache,
1289 					       bg_list);
1290 		list_del_init(&block_group->bg_list);
1291 
1292 		space_info = block_group->space_info;
1293 
1294 		if (ret || btrfs_mixed_space_info(space_info)) {
1295 			btrfs_put_block_group(block_group);
1296 			continue;
1297 		}
1298 		spin_unlock(&fs_info->unused_bgs_lock);
1299 
1300 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
1301 
1302 		/* Don't want to race with allocators so take the groups_sem */
1303 		down_write(&space_info->groups_sem);
1304 		spin_lock(&block_group->lock);
1305 		if (block_group->reserved || block_group->pinned ||
1306 		    btrfs_block_group_used(&block_group->item) ||
1307 		    block_group->ro ||
1308 		    list_is_singular(&block_group->list)) {
1309 			/*
1310 			 * We want to bail if we made new allocations or have
1311 			 * outstanding allocations in this block group.  We do
1312 			 * the ro check in case balance is currently acting on
1313 			 * this block group.
1314 			 */
1315 			trace_btrfs_skip_unused_block_group(block_group);
1316 			spin_unlock(&block_group->lock);
1317 			up_write(&space_info->groups_sem);
1318 			goto next;
1319 		}
1320 		spin_unlock(&block_group->lock);
1321 
1322 		/* We don't want to force the issue, only flip if it's ok. */
1323 		ret = inc_block_group_ro(block_group, 0);
1324 		up_write(&space_info->groups_sem);
1325 		if (ret < 0) {
1326 			ret = 0;
1327 			goto next;
1328 		}
1329 
1330 		/*
1331 		 * Want to do this before we do anything else so we can recover
1332 		 * properly if we fail to join the transaction.
1333 		 */
1334 		trans = btrfs_start_trans_remove_block_group(fs_info,
1335 						     block_group->key.objectid);
1336 		if (IS_ERR(trans)) {
1337 			btrfs_dec_block_group_ro(block_group);
1338 			ret = PTR_ERR(trans);
1339 			goto next;
1340 		}
1341 
1342 		/*
1343 		 * We could have pending pinned extents for this block group,
1344 		 * just delete them, we don't care about them anymore.
1345 		 */
1346 		start = block_group->key.objectid;
1347 		end = start + block_group->key.offset - 1;
1348 		/*
1349 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1350 		 * btrfs_finish_extent_commit(). If we are at transaction N,
1351 		 * another task might be running finish_extent_commit() for the
1352 		 * previous transaction N - 1, and have seen a range belonging
1353 		 * to the block group in freed_extents[] before we were able to
1354 		 * clear the whole block group range from freed_extents[]. This
1355 		 * means that task can lookup for the block group after we
1356 		 * unpinned it from freed_extents[] and removed it, leading to
1357 		 * a BUG_ON() at btrfs_unpin_extent_range().
1358 		 */
1359 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
1360 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
1361 				  EXTENT_DIRTY);
1362 		if (ret) {
1363 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1364 			btrfs_dec_block_group_ro(block_group);
1365 			goto end_trans;
1366 		}
1367 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
1368 				  EXTENT_DIRTY);
1369 		if (ret) {
1370 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1371 			btrfs_dec_block_group_ro(block_group);
1372 			goto end_trans;
1373 		}
1374 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1375 
1376 		/* Reset pinned so btrfs_put_block_group doesn't complain */
1377 		spin_lock(&space_info->lock);
1378 		spin_lock(&block_group->lock);
1379 
1380 		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1381 						     -block_group->pinned);
1382 		space_info->bytes_readonly += block_group->pinned;
1383 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
1384 				   -block_group->pinned,
1385 				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
1386 		block_group->pinned = 0;
1387 
1388 		spin_unlock(&block_group->lock);
1389 		spin_unlock(&space_info->lock);
1390 
1391 		/* DISCARD can flip during remount */
1392 		trimming = btrfs_test_opt(fs_info, DISCARD);
1393 
1394 		/* Implicit trim during transaction commit. */
1395 		if (trimming)
1396 			btrfs_get_block_group_trimming(block_group);
1397 
1398 		/*
1399 		 * btrfs_remove_chunk() will abort the transaction if things go
1400 		 * horribly wrong.
1401 		 */
1402 		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
1403 
1404 		if (ret) {
1405 			if (trimming)
1406 				btrfs_put_block_group_trimming(block_group);
1407 			goto end_trans;
1408 		}
1409 
1410 		/*
1411 		 * If we're not mounted with -odiscard, we can just forget
1412 		 * about this block group. Otherwise we'll need to wait
1413 		 * until transaction commit to do the actual discard.
1414 		 */
1415 		if (trimming) {
1416 			spin_lock(&fs_info->unused_bgs_lock);
1417 			/*
1418 			 * A concurrent scrub might have added us to the list
1419 			 * fs_info->unused_bgs, so use a list_move operation
1420 			 * to add the block group to the deleted_bgs list.
1421 			 */
1422 			list_move(&block_group->bg_list,
1423 				  &trans->transaction->deleted_bgs);
1424 			spin_unlock(&fs_info->unused_bgs_lock);
1425 			btrfs_get_block_group(block_group);
1426 		}
1427 end_trans:
1428 		btrfs_end_transaction(trans);
1429 next:
1430 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1431 		btrfs_put_block_group(block_group);
1432 		spin_lock(&fs_info->unused_bgs_lock);
1433 	}
1434 	spin_unlock(&fs_info->unused_bgs_lock);
1435 }
1436 
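/*
 * Put @bg on the fs_info->unused_bgs list (taking a reference) so that
 * btrfs_delete_unused_bgs() can later consider removing it.
 */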
1437 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
1438 {
1439 	struct btrfs_fs_info *fs_info = bg->fs_info;
1440 
1441 	spin_lock(&fs_info->unused_bgs_lock);
1442 	if (list_empty(&bg->bg_list)) {
1443 		btrfs_get_block_group(bg);
1444 		trace_btrfs_add_unused_block_group(bg);
1445 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1446 	}
1447 	spin_unlock(&fs_info->unused_bgs_lock);
1448 }
1449 
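/*
 * Find the first block group item at or after @key and verify that it matches
 * its chunk mapping.  On success the path points at the found item.
 */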
1450 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1451 				  struct btrfs_path *path,
1452 				  struct btrfs_key *key)
1453 {
1454 	struct btrfs_root *root = fs_info->extent_root;
1455 	int ret = 0;
1456 	struct btrfs_key found_key;
1457 	struct extent_buffer *leaf;
1458 	struct btrfs_block_group_item bg;
1459 	u64 flags;
1460 	int slot;
1461 
1462 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1463 	if (ret < 0)
1464 		goto out;
1465 
1466 	while (1) {
1467 		slot = path->slots[0];
1468 		leaf = path->nodes[0];
1469 		if (slot >= btrfs_header_nritems(leaf)) {
1470 			ret = btrfs_next_leaf(root, path);
1471 			if (ret == 0)
1472 				continue;
1473 			if (ret < 0)
1474 				goto out;
1475 			break;
1476 		}
1477 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
1478 
1479 		if (found_key.objectid >= key->objectid &&
1480 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1481 			struct extent_map_tree *em_tree;
1482 			struct extent_map *em;
1483 
1484 			em_tree = &root->fs_info->mapping_tree;
1485 			read_lock(&em_tree->lock);
1486 			em = lookup_extent_mapping(em_tree, found_key.objectid,
1487 						   found_key.offset);
1488 			read_unlock(&em_tree->lock);
1489 			if (!em) {
1490 				btrfs_err(fs_info,
1491 			"logical %llu len %llu found bg but no related chunk",
1492 					  found_key.objectid, found_key.offset);
1493 				ret = -ENOENT;
1494 			} else if (em->start != found_key.objectid ||
1495 				   em->len != found_key.offset) {
1496 				btrfs_err(fs_info,
1497 		"block group %llu len %llu mismatch with chunk %llu len %llu",
1498 					  found_key.objectid, found_key.offset,
1499 					  em->start, em->len);
1500 				ret = -EUCLEAN;
1501 			} else {
1502 				read_extent_buffer(leaf, &bg,
1503 					btrfs_item_ptr_offset(leaf, slot),
1504 					sizeof(bg));
1505 				flags = btrfs_block_group_flags(&bg) &
1506 					BTRFS_BLOCK_GROUP_TYPE_MASK;
1507 
1508 				if (flags != (em->map_lookup->type &
1509 					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1510 					btrfs_err(fs_info,
1511 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1512 						found_key.objectid,
1513 						found_key.offset, flags,
1514 						(BTRFS_BLOCK_GROUP_TYPE_MASK &
1515 						 em->map_lookup->type));
1516 					ret = -EUCLEAN;
1517 				} else {
1518 					ret = 0;
1519 				}
1520 			}
1521 			free_extent_map(em);
1522 			goto out;
1523 		}
1524 		path->slots[0]++;
1525 	}
1526 out:
1527 	return ret;
1528 }
1529 
1530 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1531 {
1532 	u64 extra_flags = chunk_to_extended(flags) &
1533 				BTRFS_EXTENDED_PROFILE_MASK;
1534 
1535 	write_seqlock(&fs_info->profiles_lock);
1536 	if (flags & BTRFS_BLOCK_GROUP_DATA)
1537 		fs_info->avail_data_alloc_bits |= extra_flags;
1538 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
1539 		fs_info->avail_metadata_alloc_bits |= extra_flags;
1540 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1541 		fs_info->avail_system_alloc_bits |= extra_flags;
1542 	write_sequnlock(&fs_info->profiles_lock);
1543 }
1544 
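/*
 * Exclude the ranges covered by superblock mirrors from this block group's
 * usable space and account them in cache->bytes_super.
 */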
1545 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
1546 {
1547 	struct btrfs_fs_info *fs_info = cache->fs_info;
1548 	u64 bytenr;
1549 	u64 *logical;
1550 	int stripe_len;
1551 	int i, nr, ret;
1552 
1553 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
1554 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
1555 		cache->bytes_super += stripe_len;
1556 		ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid,
1557 						stripe_len);
1558 		if (ret)
1559 			return ret;
1560 	}
1561 
1562 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1563 		bytenr = btrfs_sb_offset(i);
1564 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
1565 				       bytenr, &logical, &nr, &stripe_len);
1566 		if (ret)
1567 			return ret;
1568 
1569 		while (nr--) {
1570 			u64 start, len;
1571 
1572 			if (logical[nr] > cache->key.objectid +
1573 			    cache->key.offset)
1574 				continue;
1575 
1576 			if (logical[nr] + stripe_len <= cache->key.objectid)
1577 				continue;
1578 
1579 			start = logical[nr];
1580 			if (start < cache->key.objectid) {
1581 				start = cache->key.objectid;
1582 				len = (logical[nr] + stripe_len) - start;
1583 			} else {
1584 				len = min_t(u64, stripe_len,
1585 					    cache->key.objectid +
1586 					    cache->key.offset - start);
1587 			}
1588 
1589 			cache->bytes_super += len;
1590 			ret = btrfs_add_excluded_extent(fs_info, start, len);
1591 			if (ret) {
1592 				kfree(logical);
1593 				return ret;
1594 			}
1595 		}
1596 
1597 		kfree(logical);
1598 	}
1599 	return 0;
1600 }
1601 
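/*
 * Add the block group to its space_info's list for the group's RAID index,
 * creating the sysfs entry for that RAID type if this is the first such group.
 */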
1602 static void link_block_group(struct btrfs_block_group_cache *cache)
1603 {
1604 	struct btrfs_space_info *space_info = cache->space_info;
1605 	int index = btrfs_bg_flags_to_raid_index(cache->flags);
1606 	bool first = false;
1607 
1608 	down_write(&space_info->groups_sem);
1609 	if (list_empty(&space_info->block_groups[index]))
1610 		first = true;
1611 	list_add_tail(&cache->list, &space_info->block_groups[index]);
1612 	up_write(&space_info->groups_sem);
1613 
1614 	if (first)
1615 		btrfs_sysfs_add_block_group_type(cache);
1616 }
1617 
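/*
 * Allocate and initialize an in-memory block group cache structure covering
 * the logical range [start, start + size).
 */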
1618 static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
1619 		struct btrfs_fs_info *fs_info, u64 start, u64 size)
1620 {
1621 	struct btrfs_block_group_cache *cache;
1622 
1623 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
1624 	if (!cache)
1625 		return NULL;
1626 
1627 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1628 					GFP_NOFS);
1629 	if (!cache->free_space_ctl) {
1630 		kfree(cache);
1631 		return NULL;
1632 	}
1633 
1634 	cache->key.objectid = start;
1635 	cache->key.offset = size;
1636 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1637 
1638 	cache->fs_info = fs_info;
1639 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1640 	set_free_space_tree_thresholds(cache);
1641 
1642 	atomic_set(&cache->count, 1);
1643 	spin_lock_init(&cache->lock);
1644 	init_rwsem(&cache->data_rwsem);
1645 	INIT_LIST_HEAD(&cache->list);
1646 	INIT_LIST_HEAD(&cache->cluster_list);
1647 	INIT_LIST_HEAD(&cache->bg_list);
1648 	INIT_LIST_HEAD(&cache->ro_list);
1649 	INIT_LIST_HEAD(&cache->dirty_list);
1650 	INIT_LIST_HEAD(&cache->io_list);
1651 	btrfs_init_free_space_ctl(cache);
1652 	atomic_set(&cache->trimming, 0);
1653 	mutex_init(&cache->free_space_lock);
1654 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1655 
1656 	return cache;
1657 }
1658 
1659 /*
1660  * Iterate all chunks and verify that each of them has the corresponding block
1661  * group
1662  */
1663 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1664 {
1665 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1666 	struct extent_map *em;
1667 	struct btrfs_block_group_cache *bg;
1668 	u64 start = 0;
1669 	int ret = 0;
1670 
1671 	while (1) {
1672 		read_lock(&map_tree->lock);
1673 		/*
1674 		 * lookup_extent_mapping will return the first extent map
1675 		 * intersecting the range, so setting @len to 1 is enough to
1676 		 * get the first chunk.
1677 		 */
1678 		em = lookup_extent_mapping(map_tree, start, 1);
1679 		read_unlock(&map_tree->lock);
1680 		if (!em)
1681 			break;
1682 
1683 		bg = btrfs_lookup_block_group(fs_info, em->start);
1684 		if (!bg) {
1685 			btrfs_err(fs_info,
1686 	"chunk start=%llu len=%llu doesn't have corresponding block group",
1687 				     em->start, em->len);
1688 			ret = -EUCLEAN;
1689 			free_extent_map(em);
1690 			break;
1691 		}
1692 		if (bg->key.objectid != em->start ||
1693 		    bg->key.offset != em->len ||
1694 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1695 		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1696 			btrfs_err(fs_info,
1697 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1698 				em->start, em->len,
1699 				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1700 				bg->key.objectid, bg->key.offset,
1701 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1702 			ret = -EUCLEAN;
1703 			free_extent_map(em);
1704 			btrfs_put_block_group(bg);
1705 			break;
1706 		}
1707 		start = em->start + em->len;
1708 		free_extent_map(em);
1709 		btrfs_put_block_group(bg);
1710 	}
1711 	return ret;
1712 }
1713 
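/*
 * Called at mount time: walk all BTRFS_BLOCK_GROUP_ITEM_KEY items, create the
 * in-memory block groups for them, hook them up to their space_infos and mark
 * read-only chunks and empty block groups accordingly.  Finally verify that
 * every chunk has a matching block group.
 */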
1714 int btrfs_read_block_groups(struct btrfs_fs_info *info)
1715 {
1716 	struct btrfs_path *path;
1717 	int ret;
1718 	struct btrfs_block_group_cache *cache;
1719 	struct btrfs_space_info *space_info;
1720 	struct btrfs_key key;
1721 	struct btrfs_key found_key;
1722 	struct extent_buffer *leaf;
1723 	int need_clear = 0;
1724 	u64 cache_gen;
1725 	u64 feature;
1726 	int mixed;
1727 
1728 	feature = btrfs_super_incompat_flags(info->super_copy);
1729 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
1730 
1731 	key.objectid = 0;
1732 	key.offset = 0;
1733 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1734 	path = btrfs_alloc_path();
1735 	if (!path)
1736 		return -ENOMEM;
1737 	path->reada = READA_FORWARD;
1738 
1739 	cache_gen = btrfs_super_cache_generation(info->super_copy);
1740 	if (btrfs_test_opt(info, SPACE_CACHE) &&
1741 	    btrfs_super_generation(info->super_copy) != cache_gen)
1742 		need_clear = 1;
1743 	if (btrfs_test_opt(info, CLEAR_CACHE))
1744 		need_clear = 1;
1745 
1746 	while (1) {
1747 		ret = find_first_block_group(info, path, &key);
1748 		if (ret > 0)
1749 			break;
1750 		if (ret != 0)
1751 			goto error;
1752 
1753 		leaf = path->nodes[0];
1754 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1755 
1756 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
1757 						       found_key.offset);
1758 		if (!cache) {
1759 			ret = -ENOMEM;
1760 			goto error;
1761 		}
1762 
1763 		if (need_clear) {
1764 			/*
1765 			 * When we mount with an old space cache, we need to
1766 			 * set BTRFS_DC_CLEAR and set the dirty flag.
1767 			 *
1768 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1769 			 *    truncate the old free space cache inode and
1770 			 *    set up a new one.
1771 			 * b) Setting the 'dirty' flag makes sure that we flush
1772 			 *    the new space cache info onto disk.
1773 			 */
1774 			if (btrfs_test_opt(info, SPACE_CACHE))
1775 				cache->disk_cache_state = BTRFS_DC_CLEAR;
1776 		}
1777 
1778 		read_extent_buffer(leaf, &cache->item,
1779 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
1780 				   sizeof(cache->item));
1781 		cache->flags = btrfs_block_group_flags(&cache->item);
1782 		if (!mixed &&
1783 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1784 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1785 			btrfs_err(info,
1786 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1787 				  cache->key.objectid);
1788 			btrfs_put_block_group(cache);
1789 			ret = -EINVAL;
1790 			goto error;
1791 		}
1792 
1793 		key.objectid = found_key.objectid + found_key.offset;
1794 		btrfs_release_path(path);
1795 
1796 		/*
1797 		 * We need to exclude the super stripes now so that the space
1798 		 * info has super bytes accounted for, otherwise we'll think
1799 		 * we have more space than we actually do.
1800 		 */
1801 		ret = exclude_super_stripes(cache);
1802 		if (ret) {
1803 			/*
1804 			 * We may have excluded something, so call this just in
1805 			 * case.
1806 			 */
1807 			btrfs_free_excluded_extents(cache);
1808 			btrfs_put_block_group(cache);
1809 			goto error;
1810 		}
1811 
1812 		/*
1813 		 * Check for two cases: either we are full, and therefore
1814 		 * don't need to bother with the caching work since we won't
1815 		 * find any space, or we are empty, and we can just add all
1816 		 * the space in and be done with it.  This saves us _a_lot_ of
1817 		 * time, particularly in the full case.
1818 		 */
1819 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
1820 			cache->last_byte_to_unpin = (u64)-1;
1821 			cache->cached = BTRFS_CACHE_FINISHED;
1822 			btrfs_free_excluded_extents(cache);
1823 		} else if (btrfs_block_group_used(&cache->item) == 0) {
1824 			cache->last_byte_to_unpin = (u64)-1;
1825 			cache->cached = BTRFS_CACHE_FINISHED;
1826 			add_new_free_space(cache, found_key.objectid,
1827 					   found_key.objectid +
1828 					   found_key.offset);
1829 			btrfs_free_excluded_extents(cache);
1830 		}
1831 
1832 		ret = btrfs_add_block_group_cache(info, cache);
1833 		if (ret) {
1834 			btrfs_remove_free_space_cache(cache);
1835 			btrfs_put_block_group(cache);
1836 			goto error;
1837 		}
1838 
1839 		trace_btrfs_add_block_group(info, cache, 0);
1840 		btrfs_update_space_info(info, cache->flags, found_key.offset,
1841 					btrfs_block_group_used(&cache->item),
1842 					cache->bytes_super, &space_info);
1843 
1844 		cache->space_info = space_info;
1845 
1846 		link_block_group(cache);
1847 
1848 		set_avail_alloc_bits(info, cache->flags);
1849 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
1850 			inc_block_group_ro(cache, 1);
1851 		} else if (btrfs_block_group_used(&cache->item) == 0) {
1852 			ASSERT(list_empty(&cache->bg_list));
1853 			btrfs_mark_bg_unused(cache);
1854 		}
1855 	}
1856 
1857 	rcu_read_lock();
1858 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
1859 		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
1860 		      (BTRFS_BLOCK_GROUP_RAID10 |
1861 		       BTRFS_BLOCK_GROUP_RAID1_MASK |
1862 		       BTRFS_BLOCK_GROUP_RAID56_MASK |
1863 		       BTRFS_BLOCK_GROUP_DUP)))
1864 			continue;
1865 		/*
1866 		 * Avoid allocating from un-mirrored block group if there are
1867 		 * mirrored block groups.
1868 		 */
1869 		list_for_each_entry(cache,
1870 				&space_info->block_groups[BTRFS_RAID_RAID0],
1871 				list)
1872 			inc_block_group_ro(cache, 1);
1873 		list_for_each_entry(cache,
1874 				&space_info->block_groups[BTRFS_RAID_SINGLE],
1875 				list)
1876 			inc_block_group_ro(cache, 1);
1877 	}
1878 	rcu_read_unlock();
1879 
1880 	btrfs_init_global_block_rsv(info);
1881 	ret = check_chunk_block_group_mappings(info);
1882 error:
1883 	btrfs_free_path(path);
1884 	return ret;
1885 }
1886 
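/*
 * Insert the block group items and finish the chunk allocation for every
 * block group created earlier in this transaction (trans->new_bgs).  Any
 * failure aborts the transaction, and the list is always drained here.
 */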
1887 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
1888 {
1889 	struct btrfs_fs_info *fs_info = trans->fs_info;
1890 	struct btrfs_block_group_cache *block_group;
1891 	struct btrfs_root *extent_root = fs_info->extent_root;
1892 	struct btrfs_block_group_item item;
1893 	struct btrfs_key key;
1894 	int ret = 0;
1895 
1896 	if (!trans->can_flush_pending_bgs)
1897 		return;
1898 
1899 	while (!list_empty(&trans->new_bgs)) {
1900 		block_group = list_first_entry(&trans->new_bgs,
1901 					       struct btrfs_block_group_cache,
1902 					       bg_list);
1903 		if (ret)
1904 			goto next;
1905 
1906 		spin_lock(&block_group->lock);
1907 		memcpy(&item, &block_group->item, sizeof(item));
1908 		memcpy(&key, &block_group->key, sizeof(key));
1909 		spin_unlock(&block_group->lock);
1910 
1911 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
1912 					sizeof(item));
1913 		if (ret)
1914 			btrfs_abort_transaction(trans, ret);
1915 		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
1916 		if (ret)
1917 			btrfs_abort_transaction(trans, ret);
1918 		add_block_group_free_space(trans, block_group);
1919 		/* Already aborted the transaction if it failed. */
1920 next:
1921 		btrfs_delayed_refs_rsv_release(fs_info, 1);
1922 		list_del_init(&block_group->bg_list);
1923 	}
1924 	btrfs_trans_release_chunk_metadata(trans);
1925 }
1926 
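/*
 * Create a new block group for a freshly allocated chunk: set up the
 * in-memory structure, exclude the super mirrors, add its free space, insert
 * it into the rbtree and its space_info, and queue it on the transaction's
 * new_bgs list so btrfs_create_pending_block_groups() can persist it.
 */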
1927 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
1928 			   u64 type, u64 chunk_offset, u64 size)
1929 {
1930 	struct btrfs_fs_info *fs_info = trans->fs_info;
1931 	struct btrfs_block_group_cache *cache;
1932 	int ret;
1933 
1934 	btrfs_set_log_full_commit(trans);
1935 
1936 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
1937 	if (!cache)
1938 		return -ENOMEM;
1939 
1940 	btrfs_set_block_group_used(&cache->item, bytes_used);
1941 	btrfs_set_block_group_chunk_objectid(&cache->item,
1942 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1943 	btrfs_set_block_group_flags(&cache->item, type);
1944 
1945 	cache->flags = type;
1946 	cache->last_byte_to_unpin = (u64)-1;
1947 	cache->cached = BTRFS_CACHE_FINISHED;
1948 	cache->needs_free_space = 1;
1949 	ret = exclude_super_stripes(cache);
1950 	if (ret) {
1951 		/* We may have excluded something, so call this just in case */
1952 		btrfs_free_excluded_extents(cache);
1953 		btrfs_put_block_group(cache);
1954 		return ret;
1955 	}
1956 
1957 	add_new_free_space(cache, chunk_offset, chunk_offset + size);
1958 
1959 	btrfs_free_excluded_extents(cache);
1960 
1961 #ifdef CONFIG_BTRFS_DEBUG
1962 	if (btrfs_should_fragment_free_space(cache)) {
1963 		u64 new_bytes_used = size - bytes_used;
1964 
1965 		bytes_used += new_bytes_used >> 1;
1966 		fragment_free_space(cache);
1967 	}
1968 #endif
1969 	/*
1970 	 * Ensure the corresponding space_info object is created and
1971 	 * assigned to our block group. We want our bg to be added to the rbtree
1972 	 * with its ->space_info set.
1973 	 */
1974 	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
1975 	ASSERT(cache->space_info);
1976 
1977 	ret = btrfs_add_block_group_cache(fs_info, cache);
1978 	if (ret) {
1979 		btrfs_remove_free_space_cache(cache);
1980 		btrfs_put_block_group(cache);
1981 		return ret;
1982 	}
1983 
1984 	/*
1985 	 * Now that our block group has its ->space_info set and is inserted in
1986 	 * the rbtree, update the space info's counters.
1987 	 */
1988 	trace_btrfs_add_block_group(fs_info, cache, 1);
1989 	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
1990 				cache->bytes_super, &cache->space_info);
1991 	btrfs_update_global_block_rsv(fs_info);
1992 
1993 	link_block_group(cache);
1994 
1995 	list_add_tail(&cache->bg_list, &trans->new_bgs);
1996 	trans->delayed_ref_updates++;
1997 	btrfs_update_delayed_refs_rsv(trans);
1998 
1999 	set_avail_alloc_bits(fs_info, type);
2000 	return 0;
2001 }
2002 
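/*
 * Work out which profile new chunks should use in place of @flags given the
 * current number of writable devices: with a single device mirroring is
 * turned into DUP, with several devices DUP is turned into RAID1, and a
 * pending restripe target always takes precedence.
 */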
2003 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
2004 {
2005 	u64 num_devices;
2006 	u64 stripped;
2007 
2008 	/*
2009 	 * If restripe for this chunk_type is in progress, pick the target
2010 	 * profile and return, otherwise do the usual balance.
2011 	 */
2012 	stripped = get_restripe_target(fs_info, flags);
2013 	if (stripped)
2014 		return extended_to_chunk(stripped);
2015 
2016 	num_devices = fs_info->fs_devices->rw_devices;
2017 
2018 	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
2019 		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
2020 
2021 	if (num_devices == 1) {
2022 		stripped |= BTRFS_BLOCK_GROUP_DUP;
2023 		stripped = flags & ~stripped;
2024 
2025 		/* turn raid0 into single device chunks */
2026 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
2027 			return stripped;
2028 
2029 		/* turn mirroring into duplication */
2030 		if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
2031 			     BTRFS_BLOCK_GROUP_RAID10))
2032 			return stripped | BTRFS_BLOCK_GROUP_DUP;
2033 	} else {
2034 		/* They already had RAID on here, just return */
2035 		if (flags & stripped)
2036 			return flags;
2037 
2038 		stripped |= BTRFS_BLOCK_GROUP_DUP;
2039 		stripped = flags & ~stripped;
2040 
2041 		/* switch duplicated blocks with raid1 */
2042 		if (flags & BTRFS_BLOCK_GROUP_DUP)
2043 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
2044 
2045 		/* this is drive concat, leave it alone */
2046 	}
2047 
2048 	return flags;
2049 }
2050 
2051 /*
2052  * Mark one block group RO; this can be called several times for the same
2053  * block group.
2054  *
2055  * @cache:		the destination block group
2056  * @do_chunk_alloc:	whether we need to do chunk pre-allocation; this is to
2057  * 			ensure we still have some free space after marking this
2058  * 			block group RO.
2059  */
2060 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache,
2061 			     bool do_chunk_alloc)
2062 {
2063 	struct btrfs_fs_info *fs_info = cache->fs_info;
2064 	struct btrfs_trans_handle *trans;
2065 	u64 alloc_flags;
2066 	int ret;
2067 
2068 again:
2069 	trans = btrfs_join_transaction(fs_info->extent_root);
2070 	if (IS_ERR(trans))
2071 		return PTR_ERR(trans);
2072 
2073 	/*
2074 	 * we're not allowed to set block groups readonly after the dirty
2075 	 * block groups cache has started writing.  If it already started,
2076 	 * back off and let this transaction commit
2077 	 */
2078 	mutex_lock(&fs_info->ro_block_group_mutex);
2079 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2080 		u64 transid = trans->transid;
2081 
2082 		mutex_unlock(&fs_info->ro_block_group_mutex);
2083 		btrfs_end_transaction(trans);
2084 
2085 		ret = btrfs_wait_for_commit(fs_info, transid);
2086 		if (ret)
2087 			return ret;
2088 		goto again;
2089 	}
2090 
2091 	if (do_chunk_alloc) {
2092 		/*
2093 		 * If we are changing raid levels, try to allocate a
2094 		 * corresponding block group with the new raid level.
2095 		 */
2096 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
2097 		if (alloc_flags != cache->flags) {
2098 			ret = btrfs_chunk_alloc(trans, alloc_flags,
2099 						CHUNK_ALLOC_FORCE);
2100 			/*
2101 			 * ENOSPC is allowed here, we may have enough space
2102 			 * already allocated at the new raid level to carry on
2103 			 */
2104 			if (ret == -ENOSPC)
2105 				ret = 0;
2106 			if (ret < 0)
2107 				goto out;
2108 		}
2109 	}
2110 
2111 	ret = inc_block_group_ro(cache, !do_chunk_alloc);
2112 	if (!do_chunk_alloc)
2113 		goto unlock_out;
2114 	if (!ret)
2115 		goto out;
2116 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2117 	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2118 	if (ret < 0)
2119 		goto out;
2120 	ret = inc_block_group_ro(cache, 0);
2121 out:
2122 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2123 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
2124 		mutex_lock(&fs_info->chunk_mutex);
2125 		check_system_chunk(trans, alloc_flags);
2126 		mutex_unlock(&fs_info->chunk_mutex);
2127 	}
2128 unlock_out:
2129 	mutex_unlock(&fs_info->ro_block_group_mutex);
2130 
2131 	btrfs_end_transaction(trans);
2132 	return ret;
2133 }
2134 
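/*
 * Undo one btrfs_inc_block_group_ro() reference; when the last one is dropped
 * the block group's unused bytes stop being counted as read-only in its
 * space_info and it is removed from the ro_list.
 */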
2135 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
2136 {
2137 	struct btrfs_space_info *sinfo = cache->space_info;
2138 	u64 num_bytes;
2139 
2140 	BUG_ON(!cache->ro);
2141 
2142 	spin_lock(&sinfo->lock);
2143 	spin_lock(&cache->lock);
2144 	if (!--cache->ro) {
2145 		num_bytes = cache->key.offset - cache->reserved -
2146 			    cache->pinned - cache->bytes_super -
2147 			    btrfs_block_group_used(&cache->item);
2148 		sinfo->bytes_readonly -= num_bytes;
2149 		list_del_init(&cache->ro_list);
2150 	}
2151 	spin_unlock(&cache->lock);
2152 	spin_unlock(&sinfo->lock);
2153 }
2154 
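/*
 * Write the current in-memory block group item back to its slot in the
 * extent tree.  Returns -ENOENT if the item does not exist yet, which the
 * callers handle for block groups whose creation is still pending.
 */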
2155 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2156 				 struct btrfs_path *path,
2157 				 struct btrfs_block_group_cache *cache)
2158 {
2159 	struct btrfs_fs_info *fs_info = trans->fs_info;
2160 	int ret;
2161 	struct btrfs_root *extent_root = fs_info->extent_root;
2162 	unsigned long bi;
2163 	struct extent_buffer *leaf;
2164 
2165 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2166 	if (ret) {
2167 		if (ret > 0)
2168 			ret = -ENOENT;
2169 		goto fail;
2170 	}
2171 
2172 	leaf = path->nodes[0];
2173 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2174 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2175 	btrfs_mark_buffer_dirty(leaf);
2176 fail:
2177 	btrfs_release_path(path);
2178 	return ret;
2179 
2180 }
2181 
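/*
 * Prepare the free space cache inode of @block_group for this transaction:
 * create or truncate the inode and preallocate room for the cache, then
 * record the resulting disk_cache_state (BTRFS_DC_SETUP, BTRFS_DC_WRITTEN or
 * BTRFS_DC_ERROR) on the block group.
 */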
2182 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2183 			    struct btrfs_trans_handle *trans,
2184 			    struct btrfs_path *path)
2185 {
2186 	struct btrfs_fs_info *fs_info = block_group->fs_info;
2187 	struct btrfs_root *root = fs_info->tree_root;
2188 	struct inode *inode = NULL;
2189 	struct extent_changeset *data_reserved = NULL;
2190 	u64 alloc_hint = 0;
2191 	int dcs = BTRFS_DC_ERROR;
2192 	u64 num_pages = 0;
2193 	int retries = 0;
2194 	int ret = 0;
2195 
2196 	/*
2197 	 * If this block group is smaller than 100 MiB, don't bother caching
2198 	 * the block group.
2199 	 */
2200 	if (block_group->key.offset < (100 * SZ_1M)) {
2201 		spin_lock(&block_group->lock);
2202 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2203 		spin_unlock(&block_group->lock);
2204 		return 0;
2205 	}
2206 
2207 	if (TRANS_ABORTED(trans))
2208 		return 0;
2209 again:
2210 	inode = lookup_free_space_inode(block_group, path);
2211 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2212 		ret = PTR_ERR(inode);
2213 		btrfs_release_path(path);
2214 		goto out;
2215 	}
2216 
2217 	if (IS_ERR(inode)) {
2218 		BUG_ON(retries);
2219 		retries++;
2220 
2221 		if (block_group->ro)
2222 			goto out_free;
2223 
2224 		ret = create_free_space_inode(trans, block_group, path);
2225 		if (ret)
2226 			goto out_free;
2227 		goto again;
2228 	}
2229 
2230 	/*
2231 	 * We want to set the generation to 0, that way if anything goes wrong
2232 	 * from here on out we know not to trust this cache when we load up next
2233 	 * time.
2234 	 */
2235 	BTRFS_I(inode)->generation = 0;
2236 	ret = btrfs_update_inode(trans, root, inode);
2237 	if (ret) {
2238 		/*
2239 		 * So theoretically we could recover from this, simply set the
2240 		 * super cache generation to 0 so we know to invalidate the
2241 		 * cache, but then we'd have to keep track of the block groups
2242 		 * that fail this way so we know we _have_ to reset this cache
2243 		 * before the next commit or risk reading stale cache.  So to
2244 		 * limit our exposure to horrible edge cases, let's just abort the
2245 		 * transaction, this only happens in really bad situations
2246 		 * anyway.
2247 		 */
2248 		btrfs_abort_transaction(trans, ret);
2249 		goto out_put;
2250 	}
2251 	WARN_ON(ret);
2252 
2253 	/* We've already setup this transaction, go ahead and exit */
2254 	if (block_group->cache_generation == trans->transid &&
2255 	    i_size_read(inode)) {
2256 		dcs = BTRFS_DC_SETUP;
2257 		goto out_put;
2258 	}
2259 
2260 	if (i_size_read(inode) > 0) {
2261 		ret = btrfs_check_trunc_cache_free_space(fs_info,
2262 					&fs_info->global_block_rsv);
2263 		if (ret)
2264 			goto out_put;
2265 
2266 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2267 		if (ret)
2268 			goto out_put;
2269 	}
2270 
2271 	spin_lock(&block_group->lock);
2272 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
2273 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2274 		/*
2275 		 * don't bother trying to write stuff out _if_
2276 		 * a) we're not cached,
2277 		 * b) we're mounted with the nospace_cache option,
2278 		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
2279 		 */
2280 		dcs = BTRFS_DC_WRITTEN;
2281 		spin_unlock(&block_group->lock);
2282 		goto out_put;
2283 	}
2284 	spin_unlock(&block_group->lock);
2285 
2286 	/*
2287 	 * We hit an ENOSPC when setting up the cache in this transaction, just
2288 	 * skip doing the setup, we've already cleared the cache so we're safe.
2289 	 */
2290 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2291 		ret = -ENOSPC;
2292 		goto out_put;
2293 	}
2294 
2295 	/*
2296 	 * Try to preallocate enough space based on how big the block group is.
2297 	 * Keep in mind this has to include any pinned space which could end up
2298 	 * taking up quite a bit since it's not folded into the other space
2299 	 * cache.
2300 	 */
2301 	num_pages = div_u64(block_group->key.offset, SZ_256M);
2302 	if (!num_pages)
2303 		num_pages = 1;
2304 
2305 	num_pages *= 16;
2306 	num_pages *= PAGE_SIZE;
2307 
2308 	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
2309 	if (ret)
2310 		goto out_put;
2311 
2312 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2313 					      num_pages, num_pages,
2314 					      &alloc_hint);
2315 	/*
2316 	 * Our cache requires contiguous chunks so that we don't modify a bunch
2317 	 * of metadata or split extents when writing the cache out, which means
2318 	 * we can enospc if we are heavily fragmented in addition to just normal
2319 	 * out of space conditions.  So if we hit this just skip setting up any
2320 	 * other block groups for this transaction, maybe we'll unpin enough
2321 	 * space the next time around.
2322 	 */
2323 	if (!ret)
2324 		dcs = BTRFS_DC_SETUP;
2325 	else if (ret == -ENOSPC)
2326 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2327 
2328 out_put:
2329 	iput(inode);
2330 out_free:
2331 	btrfs_release_path(path);
2332 out:
2333 	spin_lock(&block_group->lock);
2334 	if (!ret && dcs == BTRFS_DC_SETUP)
2335 		block_group->cache_generation = trans->transid;
2336 	block_group->disk_cache_state = dcs;
2337 	spin_unlock(&block_group->lock);
2338 
2339 	extent_changeset_free(data_reserved);
2340 	return ret;
2341 }
2342 
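/*
 * Run cache_save_setup() for every dirty block group whose space cache still
 * needs to be cleared (BTRFS_DC_CLEAR) so their free space cache inodes are
 * set up early; this is a no-op without the v1 space cache.
 */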
2343 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2344 {
2345 	struct btrfs_fs_info *fs_info = trans->fs_info;
2346 	struct btrfs_block_group_cache *cache, *tmp;
2347 	struct btrfs_transaction *cur_trans = trans->transaction;
2348 	struct btrfs_path *path;
2349 
2350 	if (list_empty(&cur_trans->dirty_bgs) ||
2351 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
2352 		return 0;
2353 
2354 	path = btrfs_alloc_path();
2355 	if (!path)
2356 		return -ENOMEM;
2357 
2358 	/* Could add new block groups, use _safe just in case */
2359 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2360 				 dirty_list) {
2361 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2362 			cache_save_setup(cache, trans, path);
2363 	}
2364 
2365 	btrfs_free_path(path);
2366 	return 0;
2367 }
2368 
2369 /*
2370  * Transaction commit does final block group cache writeback during a critical
2371  * section where nothing is allowed to change the FS.  This is required in
2372  * order for the cache to actually match the block group, but can introduce a
2373  * lot of latency into the commit.
2374  *
2375  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2376  * There's a chance we'll have to redo some of it if the block group changes
2377  * again during the commit, but it greatly reduces the commit latency by
2378  * getting rid of the easy block groups while we're still allowing others to
2379  * join the commit.
2380  */
2381 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2382 {
2383 	struct btrfs_fs_info *fs_info = trans->fs_info;
2384 	struct btrfs_block_group_cache *cache;
2385 	struct btrfs_transaction *cur_trans = trans->transaction;
2386 	int ret = 0;
2387 	int should_put;
2388 	struct btrfs_path *path = NULL;
2389 	LIST_HEAD(dirty);
2390 	struct list_head *io = &cur_trans->io_bgs;
2391 	int loops = 0;
2392 
2393 	spin_lock(&cur_trans->dirty_bgs_lock);
2394 	if (list_empty(&cur_trans->dirty_bgs)) {
2395 		spin_unlock(&cur_trans->dirty_bgs_lock);
2396 		return 0;
2397 	}
2398 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
2399 	spin_unlock(&cur_trans->dirty_bgs_lock);
2400 
2401 again:
2402 	/* Make sure all the block groups on our dirty list actually exist */
2403 	btrfs_create_pending_block_groups(trans);
2404 
2405 	if (!path) {
2406 		path = btrfs_alloc_path();
2407 		if (!path) {
2408 			ret = -ENOMEM;
2409 			goto out;
2410 		}
2411 	}
2412 
2413 	/*
2414 	 * cache_write_mutex is here only to save us from balance or automatic
2415 	 * removal of empty block groups deleting this block group while we are
2416 	 * writing out the cache
2417 	 */
2418 	mutex_lock(&trans->transaction->cache_write_mutex);
2419 	while (!list_empty(&dirty)) {
2420 		bool drop_reserve = true;
2421 
2422 		cache = list_first_entry(&dirty,
2423 					 struct btrfs_block_group_cache,
2424 					 dirty_list);
2425 		/*
2426 		 * This can happen if something re-dirties a block group that
2427 		 * is already under IO.  Just wait for it to finish and then do
2428 		 * it all again
2429 		 */
2430 		if (!list_empty(&cache->io_list)) {
2431 			list_del_init(&cache->io_list);
2432 			btrfs_wait_cache_io(trans, cache, path);
2433 			btrfs_put_block_group(cache);
2434 		}
2435 
2436 
2437 		/*
2438 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2439 		 * it should update the cache_state.  Don't delete until after
2440 		 * we wait.
2441 		 *
2442 		 * Since we're not running in the commit critical section
2443 		 * we need the dirty_bgs_lock to protect from update_block_group
2444 		 */
2445 		spin_lock(&cur_trans->dirty_bgs_lock);
2446 		list_del_init(&cache->dirty_list);
2447 		spin_unlock(&cur_trans->dirty_bgs_lock);
2448 
2449 		should_put = 1;
2450 
2451 		cache_save_setup(cache, trans, path);
2452 
2453 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2454 			cache->io_ctl.inode = NULL;
2455 			ret = btrfs_write_out_cache(trans, cache, path);
2456 			if (ret == 0 && cache->io_ctl.inode) {
2457 				should_put = 0;
2458 
2459 				/*
2460 				 * The cache_write_mutex is protecting the
2461 				 * io_list, also refer to the definition of
2462 				 * btrfs_transaction::io_bgs for more details
2463 				 */
2464 				list_add_tail(&cache->io_list, io);
2465 			} else {
2466 				/*
2467 				 * If we failed to write the cache, the
2468 				 * generation will be bad and life goes on
2469 				 */
2470 				ret = 0;
2471 			}
2472 		}
2473 		if (!ret) {
2474 			ret = write_one_cache_group(trans, path, cache);
2475 			/*
2476 			 * Our block group might still be attached to the list
2477 			 * of new block groups in the transaction handle of some
2478 			 * other task (struct btrfs_trans_handle->new_bgs). This
2479 			 * means its block group item isn't yet in the extent
2480 			 * tree. If this happens ignore the error, as we will
2481 			 * try again later in the critical section of the
2482 			 * transaction commit.
2483 			 */
2484 			if (ret == -ENOENT) {
2485 				ret = 0;
2486 				spin_lock(&cur_trans->dirty_bgs_lock);
2487 				if (list_empty(&cache->dirty_list)) {
2488 					list_add_tail(&cache->dirty_list,
2489 						      &cur_trans->dirty_bgs);
2490 					btrfs_get_block_group(cache);
2491 					drop_reserve = false;
2492 				}
2493 				spin_unlock(&cur_trans->dirty_bgs_lock);
2494 			} else if (ret) {
2495 				btrfs_abort_transaction(trans, ret);
2496 			}
2497 		}
2498 
2499 		/* If it's not on the io list, we need to put the block group */
2500 		if (should_put)
2501 			btrfs_put_block_group(cache);
2502 		if (drop_reserve)
2503 			btrfs_delayed_refs_rsv_release(fs_info, 1);
2504 		/*
2505 		 * Avoid blocking other tasks for too long. It might even save
2506 		 * us from writing caches for block groups that are going to be
2507 		 * removed.
2508 		 */
2509 		mutex_unlock(&trans->transaction->cache_write_mutex);
2510 		if (ret)
2511 			goto out;
2512 		mutex_lock(&trans->transaction->cache_write_mutex);
2513 	}
2514 	mutex_unlock(&trans->transaction->cache_write_mutex);
2515 
2516 	/*
2517 	 * Go through delayed refs for all the stuff we've just kicked off
2518 	 * and then loop back (just once)
2519 	 */
2520 	if (!ret)
2521 		ret = btrfs_run_delayed_refs(trans, 0);
2522 	if (!ret && loops == 0) {
2523 		loops++;
2524 		spin_lock(&cur_trans->dirty_bgs_lock);
2525 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
2526 		/*
2527 		 * dirty_bgs_lock protects us from concurrent block group
2528 		 * deletes too (not just cache_write_mutex).
2529 		 */
2530 		if (!list_empty(&dirty)) {
2531 			spin_unlock(&cur_trans->dirty_bgs_lock);
2532 			goto again;
2533 		}
2534 		spin_unlock(&cur_trans->dirty_bgs_lock);
2535 	}
2536 out:
2537 	if (ret < 0) {
2538 		spin_lock(&cur_trans->dirty_bgs_lock);
2539 		list_splice_init(&dirty, &cur_trans->dirty_bgs);
2540 		spin_unlock(&cur_trans->dirty_bgs_lock);
2541 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2542 	}
2543 
2544 	btrfs_free_path(path);
2545 	return ret;
2546 }
2547 
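/*
 * Final write-out of dirty block groups, run during the critical section of
 * the transaction commit: set up and write the remaining space caches, update
 * the block group items in the extent tree and wait for any outstanding cache
 * IO before the commit proceeds.
 */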
2548 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2549 {
2550 	struct btrfs_fs_info *fs_info = trans->fs_info;
2551 	struct btrfs_block_group_cache *cache;
2552 	struct btrfs_transaction *cur_trans = trans->transaction;
2553 	int ret = 0;
2554 	int should_put;
2555 	struct btrfs_path *path;
2556 	struct list_head *io = &cur_trans->io_bgs;
2557 
2558 	path = btrfs_alloc_path();
2559 	if (!path)
2560 		return -ENOMEM;
2561 
2562 	/*
2563 	 * Even though we are in the critical section of the transaction commit,
2564 	 * we can still have concurrent tasks adding elements to this
2565 	 * transaction's list of dirty block groups. These tasks correspond to
2566 	 * endio free space workers started when writeback finishes for a
2567 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2568 	 * allocate new block groups as a result of COWing nodes of the root
2569 	 * tree when updating the free space inode. The writeback for the space
2570 	 * caches is triggered by an earlier call to
2571 	 * btrfs_start_dirty_block_groups() and iterations of the following
2572 	 * loop.
2573 	 * Also we want to do the cache_save_setup first and then run the
2574 	 * delayed refs to make sure we have the best chance at doing this all
2575 	 * in one shot.
2576 	 */
2577 	spin_lock(&cur_trans->dirty_bgs_lock);
2578 	while (!list_empty(&cur_trans->dirty_bgs)) {
2579 		cache = list_first_entry(&cur_trans->dirty_bgs,
2580 					 struct btrfs_block_group_cache,
2581 					 dirty_list);
2582 
2583 		/*
2584 		 * This can happen if cache_save_setup re-dirties a block group
2585 		 * that is already under IO.  Just wait for it to finish and
2586 		 * then do it all again
2587 		 */
2588 		if (!list_empty(&cache->io_list)) {
2589 			spin_unlock(&cur_trans->dirty_bgs_lock);
2590 			list_del_init(&cache->io_list);
2591 			btrfs_wait_cache_io(trans, cache, path);
2592 			btrfs_put_block_group(cache);
2593 			spin_lock(&cur_trans->dirty_bgs_lock);
2594 		}
2595 
2596 		/*
2597 		 * Don't remove from the dirty list until after we've waited on
2598 		 * any pending IO
2599 		 */
2600 		list_del_init(&cache->dirty_list);
2601 		spin_unlock(&cur_trans->dirty_bgs_lock);
2602 		should_put = 1;
2603 
2604 		cache_save_setup(cache, trans, path);
2605 
2606 		if (!ret)
2607 			ret = btrfs_run_delayed_refs(trans,
2608 						     (unsigned long) -1);
2609 
2610 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2611 			cache->io_ctl.inode = NULL;
2612 			ret = btrfs_write_out_cache(trans, cache, path);
2613 			if (ret == 0 && cache->io_ctl.inode) {
2614 				should_put = 0;
2615 				list_add_tail(&cache->io_list, io);
2616 			} else {
2617 				/*
2618 				 * If we failed to write the cache, the
2619 				 * generation will be bad and life goes on
2620 				 */
2621 				ret = 0;
2622 			}
2623 		}
2624 		if (!ret) {
2625 			ret = write_one_cache_group(trans, path, cache);
2626 			/*
2627 			 * One of the free space endio workers might have
2628 			 * created a new block group while updating a free space
2629 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
2630 			 * and hasn't released its transaction handle yet, in
2631 			 * which case the new block group is still attached to
2632 			 * its transaction handle and its creation has not
2633 			 * finished yet (no block group item in the extent tree
2634 			 * yet, etc). If this is the case, wait for all free
2635 			 * space endio workers to finish and retry. This is a
2636 			 * very rare case so no need for a more efficient and
2637 			 * complex approach.
2638 			 */
2639 			if (ret == -ENOENT) {
2640 				wait_event(cur_trans->writer_wait,
2641 				   atomic_read(&cur_trans->num_writers) == 1);
2642 				ret = write_one_cache_group(trans, path, cache);
2643 			}
2644 			if (ret)
2645 				btrfs_abort_transaction(trans, ret);
2646 		}
2647 
2648 		/* If it's not on the io list, we need to put the block group */
2649 		if (should_put)
2650 			btrfs_put_block_group(cache);
2651 		btrfs_delayed_refs_rsv_release(fs_info, 1);
2652 		spin_lock(&cur_trans->dirty_bgs_lock);
2653 	}
2654 	spin_unlock(&cur_trans->dirty_bgs_lock);
2655 
2656 	/*
2657 	 * Refer to the definition of io_bgs member for details why it's safe
2658 	 * to use it without any locking
2659 	 */
2660 	while (!list_empty(io)) {
2661 		cache = list_first_entry(io, struct btrfs_block_group_cache,
2662 					 io_list);
2663 		list_del_init(&cache->io_list);
2664 		btrfs_wait_cache_io(trans, cache, path);
2665 		btrfs_put_block_group(cache);
2666 	}
2667 
2668 	btrfs_free_path(path);
2669 	return ret;
2670 }
2671 
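/*
 * Adjust the used-bytes accounting of the block group(s) covering
 * [bytenr, bytenr + num_bytes) after an extent allocation (@alloc != 0) or
 * free (@alloc == 0), update the super block byte count and the owning
 * space_info, and mark the affected block groups dirty.  Freed space is
 * pinned until the transaction commits.
 */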
2672 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
2673 			     u64 bytenr, u64 num_bytes, int alloc)
2674 {
2675 	struct btrfs_fs_info *info = trans->fs_info;
2676 	struct btrfs_block_group_cache *cache = NULL;
2677 	u64 total = num_bytes;
2678 	u64 old_val;
2679 	u64 byte_in_group;
2680 	int factor;
2681 	int ret = 0;
2682 
2683 	/* Block accounting for super block */
2684 	spin_lock(&info->delalloc_root_lock);
2685 	old_val = btrfs_super_bytes_used(info->super_copy);
2686 	if (alloc)
2687 		old_val += num_bytes;
2688 	else
2689 		old_val -= num_bytes;
2690 	btrfs_set_super_bytes_used(info->super_copy, old_val);
2691 	spin_unlock(&info->delalloc_root_lock);
2692 
2693 	while (total) {
2694 		cache = btrfs_lookup_block_group(info, bytenr);
2695 		if (!cache) {
2696 			ret = -ENOENT;
2697 			break;
2698 		}
2699 		factor = btrfs_bg_type_to_factor(cache->flags);
2700 
2701 		/*
2702 		 * If this block group has free space cache written out, we
2703 		 * need to make sure to load it if we are removing space.  This
2704 		 * is because we need the unpinning stage to actually add the
2705 		 * space back to the block group, otherwise we will leak space.
2706 		 */
2707 		if (!alloc && !btrfs_block_group_cache_done(cache))
2708 			btrfs_cache_block_group(cache, 1);
2709 
2710 		byte_in_group = bytenr - cache->key.objectid;
2711 		WARN_ON(byte_in_group > cache->key.offset);
2712 
2713 		spin_lock(&cache->space_info->lock);
2714 		spin_lock(&cache->lock);
2715 
2716 		if (btrfs_test_opt(info, SPACE_CACHE) &&
2717 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
2718 			cache->disk_cache_state = BTRFS_DC_CLEAR;
2719 
2720 		old_val = btrfs_block_group_used(&cache->item);
2721 		num_bytes = min(total, cache->key.offset - byte_in_group);
2722 		if (alloc) {
2723 			old_val += num_bytes;
2724 			btrfs_set_block_group_used(&cache->item, old_val);
2725 			cache->reserved -= num_bytes;
2726 			cache->space_info->bytes_reserved -= num_bytes;
2727 			cache->space_info->bytes_used += num_bytes;
2728 			cache->space_info->disk_used += num_bytes * factor;
2729 			spin_unlock(&cache->lock);
2730 			spin_unlock(&cache->space_info->lock);
2731 		} else {
2732 			old_val -= num_bytes;
2733 			btrfs_set_block_group_used(&cache->item, old_val);
2734 			cache->pinned += num_bytes;
2735 			btrfs_space_info_update_bytes_pinned(info,
2736 					cache->space_info, num_bytes);
2737 			cache->space_info->bytes_used -= num_bytes;
2738 			cache->space_info->disk_used -= num_bytes * factor;
2739 			spin_unlock(&cache->lock);
2740 			spin_unlock(&cache->space_info->lock);
2741 
2742 			percpu_counter_add_batch(
2743 					&cache->space_info->total_bytes_pinned,
2744 					num_bytes,
2745 					BTRFS_TOTAL_BYTES_PINNED_BATCH);
2746 			set_extent_dirty(info->pinned_extents,
2747 					 bytenr, bytenr + num_bytes - 1,
2748 					 GFP_NOFS | __GFP_NOFAIL);
2749 		}
2750 
2751 		spin_lock(&trans->transaction->dirty_bgs_lock);
2752 		if (list_empty(&cache->dirty_list)) {
2753 			list_add_tail(&cache->dirty_list,
2754 				      &trans->transaction->dirty_bgs);
2755 			trans->delayed_ref_updates++;
2756 			btrfs_get_block_group(cache);
2757 		}
2758 		spin_unlock(&trans->transaction->dirty_bgs_lock);
2759 
2760 		/*
2761 		 * No longer have used bytes in this block group, queue it for
2762 		 * deletion. We do this after adding the block group to the
2763 		 * dirty list to avoid races between cleaner kthread and space
2764 		 * cache writeout.
2765 		 */
2766 		if (!alloc && old_val == 0)
2767 			btrfs_mark_bg_unused(cache);
2768 
2769 		btrfs_put_block_group(cache);
2770 		total -= num_bytes;
2771 		bytenr += num_bytes;
2772 	}
2773 
2774 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
2775 	btrfs_update_delayed_refs_rsv(trans);
2776 	return ret;
2777 }
2778 
2779 /**
2780  * btrfs_add_reserved_bytes - update the block_group and space info counters
2781  * @cache:	The cache we are manipulating
2782  * @ram_bytes:  The number of bytes of file content, which will be the same
2783  *              as @num_bytes except for the compression path.
2784  * @num_bytes:	The number of bytes in question
2785  * @delalloc:   The blocks are allocated for the delalloc write
2786  *
2787  * This is called by the allocator when it reserves space. If this is a
2788  * reservation and the block group has become read only we cannot make the
2789  * reservation and return -EAGAIN, otherwise this function always succeeds.
2790  */
2791 int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
2792 			     u64 ram_bytes, u64 num_bytes, int delalloc)
2793 {
2794 	struct btrfs_space_info *space_info = cache->space_info;
2795 	int ret = 0;
2796 
2797 	spin_lock(&space_info->lock);
2798 	spin_lock(&cache->lock);
2799 	if (cache->ro) {
2800 		ret = -EAGAIN;
2801 	} else {
2802 		cache->reserved += num_bytes;
2803 		space_info->bytes_reserved += num_bytes;
2804 		trace_btrfs_space_reservation(cache->fs_info, "space_info",
2805 					      space_info->flags, num_bytes, 1);
2806 		btrfs_space_info_update_bytes_may_use(cache->fs_info,
2807 						      space_info, -ram_bytes);
2808 		if (delalloc)
2809 			cache->delalloc_bytes += num_bytes;
2810 	}
2811 	spin_unlock(&cache->lock);
2812 	spin_unlock(&space_info->lock);
2813 	return ret;
2814 }
2815 
2816 /**
2817  * btrfs_free_reserved_bytes - update the block_group and space info counters
2818  * @cache:      The cache we are manipulating
2819  * @num_bytes:  The number of bytes in question
2820  * @delalloc:   The blocks are allocated for the delalloc write
2821  *
2822  * This is called by somebody who is freeing space that was never actually used
2823  * on disk.  For example if you reserve some space for a new leaf in transaction
2824  * A and before transaction A commits you free that leaf, you call this to
2825  * clear the reservation.
2826  */
2827 void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
2828 			       u64 num_bytes, int delalloc)
2829 {
2830 	struct btrfs_space_info *space_info = cache->space_info;
2831 
2832 	spin_lock(&space_info->lock);
2833 	spin_lock(&cache->lock);
2834 	if (cache->ro)
2835 		space_info->bytes_readonly += num_bytes;
2836 	cache->reserved -= num_bytes;
2837 	space_info->bytes_reserved -= num_bytes;
2838 	space_info->max_extent_size = 0;
2839 
2840 	if (delalloc)
2841 		cache->delalloc_bytes -= num_bytes;
2842 	spin_unlock(&cache->lock);
2843 	spin_unlock(&space_info->lock);
2844 }
2845 
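/*
 * Set force_alloc on every metadata space_info so the next call to
 * btrfs_chunk_alloc() allocates a metadata chunk regardless of usage.
 */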
2846 static void force_metadata_allocation(struct btrfs_fs_info *info)
2847 {
2848 	struct list_head *head = &info->space_info;
2849 	struct btrfs_space_info *found;
2850 
2851 	rcu_read_lock();
2852 	list_for_each_entry_rcu(found, head, list) {
2853 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2854 			found->force_alloc = CHUNK_ALLOC_FORCE;
2855 	}
2856 	rcu_read_unlock();
2857 }
2858 
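/*
 * Decide whether a new chunk should be allocated for @sinfo: always for
 * CHUNK_ALLOC_FORCE, when less than ~1% of the filesystem size is free in
 * limited mode, or when the space info is roughly 80% full otherwise.
 */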
2859 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
2860 			      struct btrfs_space_info *sinfo, int force)
2861 {
2862 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
2863 	u64 thresh;
2864 
2865 	if (force == CHUNK_ALLOC_FORCE)
2866 		return 1;
2867 
2868 	/*
2869 	 * in limited mode, we want to have some free space up to
2870 	 * about 1% of the FS size.
2871 	 */
2872 	if (force == CHUNK_ALLOC_LIMITED) {
2873 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
2874 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
2875 
2876 		if (sinfo->total_bytes - bytes_used < thresh)
2877 			return 1;
2878 	}
2879 
2880 	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
2881 		return 0;
2882 	return 1;
2883 }
2884 
2885 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
2886 {
2887 	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
2888 
2889 	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2890 }
2891 
2892 /*
2893  * If force is CHUNK_ALLOC_FORCE:
2894  *    - return 1 if it successfully allocates a chunk,
2895  *    - return errors including -ENOSPC otherwise.
2896  * If force is NOT CHUNK_ALLOC_FORCE:
2897  *    - return 0 if it doesn't need to allocate a new chunk,
2898  *    - return 1 if it successfully allocates a chunk,
2899  *    - return errors including -ENOSPC otherwise.
2900  */
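/*
 * A minimal caller sketch, mirroring the pattern used by
 * btrfs_inc_block_group_ro() earlier in this file, where -ENOSPC can be
 * tolerated because enough space may already exist at the target profile:
 *
 *	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 *	if (ret == -ENOSPC)
 *		ret = 0;
 *	if (ret < 0)
 *		goto out;
 */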
2901 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
2902 		      enum btrfs_chunk_alloc_enum force)
2903 {
2904 	struct btrfs_fs_info *fs_info = trans->fs_info;
2905 	struct btrfs_space_info *space_info;
2906 	bool wait_for_alloc = false;
2907 	bool should_alloc = false;
2908 	int ret = 0;
2909 
2910 	/* Don't re-enter if we're already allocating a chunk */
2911 	if (trans->allocating_chunk)
2912 		return -ENOSPC;
2913 
2914 	space_info = btrfs_find_space_info(fs_info, flags);
2915 	ASSERT(space_info);
2916 
2917 	do {
2918 		spin_lock(&space_info->lock);
2919 		if (force < space_info->force_alloc)
2920 			force = space_info->force_alloc;
2921 		should_alloc = should_alloc_chunk(fs_info, space_info, force);
2922 		if (space_info->full) {
2923 			/* No more free physical space */
2924 			if (should_alloc)
2925 				ret = -ENOSPC;
2926 			else
2927 				ret = 0;
2928 			spin_unlock(&space_info->lock);
2929 			return ret;
2930 		} else if (!should_alloc) {
2931 			spin_unlock(&space_info->lock);
2932 			return 0;
2933 		} else if (space_info->chunk_alloc) {
2934 			/*
2935 			 * Someone is already allocating, so we need to block
2936 			 * until this someone is finished and then loop to
2937 			 * recheck if we should continue with our allocation
2938 			 * attempt.
2939 			 */
2940 			wait_for_alloc = true;
2941 			force = CHUNK_ALLOC_NO_FORCE;
2942 			spin_unlock(&space_info->lock);
2943 			mutex_lock(&fs_info->chunk_mutex);
2944 			mutex_unlock(&fs_info->chunk_mutex);
2945 		} else {
2946 			/* Proceed with allocation */
2947 			space_info->chunk_alloc = 1;
2948 			wait_for_alloc = false;
2949 			spin_unlock(&space_info->lock);
2950 		}
2951 
2952 		cond_resched();
2953 	} while (wait_for_alloc);
2954 
2955 	mutex_lock(&fs_info->chunk_mutex);
2956 	trans->allocating_chunk = true;
2957 
2958 	/*
2959 	 * If we have mixed data/metadata chunks we want to make sure we keep
2960 	 * allocating mixed chunks instead of individual chunks.
2961 	 */
2962 	if (btrfs_mixed_space_info(space_info))
2963 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
2964 
2965 	/*
2966 	 * if we're doing a data chunk, go ahead and make sure that
2967 	 * we keep a reasonable number of metadata chunks allocated in the
2968 	 * FS as well.
2969 	 */
2970 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2971 		fs_info->data_chunk_allocations++;
2972 		if (!(fs_info->data_chunk_allocations %
2973 		      fs_info->metadata_ratio))
2974 			force_metadata_allocation(fs_info);
2975 	}
2976 
2977 	/*
2978 	 * Check if we have enough space in SYSTEM chunk because we may need
2979 	 * to update devices.
2980 	 */
2981 	check_system_chunk(trans, flags);
2982 
2983 	ret = btrfs_alloc_chunk(trans, flags);
2984 	trans->allocating_chunk = false;
2985 
2986 	spin_lock(&space_info->lock);
2987 	if (ret < 0) {
2988 		if (ret == -ENOSPC)
2989 			space_info->full = 1;
2990 		else
2991 			goto out;
2992 	} else {
2993 		ret = 1;
2994 		space_info->max_extent_size = 0;
2995 	}
2996 
2997 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
2998 out:
2999 	space_info->chunk_alloc = 0;
3000 	spin_unlock(&space_info->lock);
3001 	mutex_unlock(&fs_info->chunk_mutex);
3002 	/*
3003 	 * When we allocate a new chunk we reserve space in the chunk block
3004 	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
3005 	 * add new nodes/leafs to it if we end up needing to do it when
3006 	 * inserting the chunk item and updating device items as part of the
3007 	 * second phase of chunk allocation, performed by
3008 	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
3009 	 * large number of new block groups to create in our transaction
3010 	 * handle's new_bgs list to avoid exhausting the chunk block reserve
3011 	 * in extreme cases - like having a single transaction create many new
3012 	 * block groups when starting to write out the free space caches of all
3013 	 * the block groups that were made dirty during the lifetime of the
3014 	 * transaction.
3015 	 */
3016 	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
3017 		btrfs_create_pending_block_groups(trans);
3018 
3019 	return ret;
3020 }
3021 
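/*
 * Number of devices a chunk of the given @type spans: the profile's devs_max
 * if it is bounded, otherwise every writable device in the filesystem.
 */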
3022 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3023 {
3024 	u64 num_dev;
3025 
3026 	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3027 	if (!num_dev)
3028 		num_dev = fs_info->fs_devices->rw_devices;
3029 
3030 	return num_dev;
3031 }
3032 
3033 /*
3034  * Reserve space in the SYSTEM space info necessary for updating the device
3035  * items and inserting or removing the chunk item, which is needed when
3036  * allocating or removing a chunk.
3037  */
3038 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3039 {
3040 	struct btrfs_fs_info *fs_info = trans->fs_info;
3041 	struct btrfs_space_info *info;
3042 	u64 left;
3043 	u64 thresh;
3044 	int ret = 0;
3045 	u64 num_devs;
3046 
3047 	/*
3048 	 * Needed because we can end up allocating a system chunk and need an
3049 	 * atomic and race-free space reservation in the chunk block reserve.
3050 	 */
3051 	lockdep_assert_held(&fs_info->chunk_mutex);
3052 
3053 	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3054 	spin_lock(&info->lock);
3055 	left = info->total_bytes - btrfs_space_info_used(info, true);
3056 	spin_unlock(&info->lock);
3057 
3058 	num_devs = get_profile_num_devs(fs_info, type);
3059 
3060 	/* num_devs device items to update and 1 chunk item to add or remove */
3061 	thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3062 		btrfs_calc_insert_metadata_size(fs_info, 1);
3063 
3064 	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3065 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3066 			   left, thresh, type);
3067 		btrfs_dump_space_info(fs_info, info, 0, 0);
3068 	}
3069 
3070 	if (left < thresh) {
3071 		u64 flags = btrfs_system_alloc_profile(fs_info);
3072 
3073 		/*
3074 		 * Ignore failure to create system chunk. We might end up not
3075 		 * needing it, as we might not need to COW all nodes/leafs from
3076 		 * the paths we visit in the chunk tree (they were already COWed
3077 		 * or created in the current transaction for example).
3078 		 */
3079 		ret = btrfs_alloc_chunk(trans, flags);
3080 	}
3081 
3082 	if (!ret) {
3083 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
3084 					  &fs_info->chunk_block_rsv,
3085 					  thresh, BTRFS_RESERVE_NO_FLUSH);
3086 		if (!ret)
3087 			trans->chunk_bytes_reserved += thresh;
3088 	}
3089 }
3090 
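/*
 * Drop the iref references that block groups hold on their free space cache
 * inodes (block_group->inode), releasing those inodes so the block groups can
 * be freed later.
 */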
3091 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3092 {
3093 	struct btrfs_block_group_cache *block_group;
3094 	u64 last = 0;
3095 
3096 	while (1) {
3097 		struct inode *inode;
3098 
3099 		block_group = btrfs_lookup_first_block_group(info, last);
3100 		while (block_group) {
3101 			btrfs_wait_block_group_cache_done(block_group);
3102 			spin_lock(&block_group->lock);
3103 			if (block_group->iref)
3104 				break;
3105 			spin_unlock(&block_group->lock);
3106 			block_group = btrfs_next_block_group(block_group);
3107 		}
3108 		if (!block_group) {
3109 			if (last == 0)
3110 				break;
3111 			last = 0;
3112 			continue;
3113 		}
3114 
3115 		inode = block_group->inode;
3116 		block_group->iref = 0;
3117 		block_group->inode = NULL;
3118 		spin_unlock(&block_group->lock);
3119 		ASSERT(block_group->io_ctl.inode == NULL);
3120 		iput(inode);
3121 		last = block_group->key.objectid + block_group->key.offset;
3122 		btrfs_put_block_group(block_group);
3123 	}
3124 }
3125 
3126 /*
3127  * Must be called only after stopping all workers, since we could have block
3128  * group caching kthreads running, and therefore they could race with us if we
3129  * freed the block groups before stopping them.
3130  */
3131 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3132 {
3133 	struct btrfs_block_group_cache *block_group;
3134 	struct btrfs_space_info *space_info;
3135 	struct btrfs_caching_control *caching_ctl;
3136 	struct rb_node *n;
3137 
3138 	down_write(&info->commit_root_sem);
3139 	while (!list_empty(&info->caching_block_groups)) {
3140 		caching_ctl = list_entry(info->caching_block_groups.next,
3141 					 struct btrfs_caching_control, list);
3142 		list_del(&caching_ctl->list);
3143 		btrfs_put_caching_control(caching_ctl);
3144 	}
3145 	up_write(&info->commit_root_sem);
3146 
3147 	spin_lock(&info->unused_bgs_lock);
3148 	while (!list_empty(&info->unused_bgs)) {
3149 		block_group = list_first_entry(&info->unused_bgs,
3150 					       struct btrfs_block_group_cache,
3151 					       bg_list);
3152 		list_del_init(&block_group->bg_list);
3153 		btrfs_put_block_group(block_group);
3154 	}
3155 	spin_unlock(&info->unused_bgs_lock);
3156 
3157 	spin_lock(&info->block_group_cache_lock);
3158 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3159 		block_group = rb_entry(n, struct btrfs_block_group_cache,
3160 				       cache_node);
3161 		rb_erase(&block_group->cache_node,
3162 			 &info->block_group_cache_tree);
3163 		RB_CLEAR_NODE(&block_group->cache_node);
3164 		spin_unlock(&info->block_group_cache_lock);
3165 
3166 		down_write(&block_group->space_info->groups_sem);
3167 		list_del(&block_group->list);
3168 		up_write(&block_group->space_info->groups_sem);
3169 
3170 		/*
3171 		 * We haven't cached this block group, which means we could
3172 		 * possibly have excluded extents on this block group.
3173 		 */
3174 		if (block_group->cached == BTRFS_CACHE_NO ||
3175 		    block_group->cached == BTRFS_CACHE_ERROR)
3176 			btrfs_free_excluded_extents(block_group);
3177 
3178 		btrfs_remove_free_space_cache(block_group);
3179 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3180 		ASSERT(list_empty(&block_group->dirty_list));
3181 		ASSERT(list_empty(&block_group->io_list));
3182 		ASSERT(list_empty(&block_group->bg_list));
3183 		ASSERT(atomic_read(&block_group->count) == 1);
3184 		btrfs_put_block_group(block_group);
3185 
3186 		spin_lock(&info->block_group_cache_lock);
3187 	}
3188 	spin_unlock(&info->block_group_cache_lock);
3189 
3190 	/*
3191 	 * Now that all the block groups are freed, go through and free all the
3192 	 * space_info structs.  This is only called during the final stages of
3193 	 * unmount, and so we know nobody is using them.  We call
3194 	 * synchronize_rcu() once before we start, just to be on the safe side.
3195 	 */
3196 	synchronize_rcu();
3197 
3198 	btrfs_release_global_block_rsv(info);
3199 
3200 	while (!list_empty(&info->space_info)) {
3201 		space_info = list_entry(info->space_info.next,
3202 					struct btrfs_space_info,
3203 					list);
3204 
3205 		/*
3206 		 * Do not hide this behind enospc_debug, this is actually
3207 		 * important and indicates a real bug if this happens.
3208 		 */
3209 		if (WARN_ON(space_info->bytes_pinned > 0 ||
3210 			    space_info->bytes_reserved > 0 ||
3211 			    space_info->bytes_may_use > 0))
3212 			btrfs_dump_space_info(info, space_info, 0, 0);
3213 		list_del(&space_info->list);
3214 		btrfs_sysfs_remove_space_info(space_info);
3215 	}
3216 	return 0;
3217 }
3218