1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
33 #include "discard.h"
34 #include "zoned.h"
35 #include "dev-replace.h"
36 #include "fs.h"
37 #include "accessors.h"
38 #include "root-tree.h"
39 #include "file-item.h"
40 #include "orphan.h"
41 #include "tree-checker.h"
42 #include "raid-stripe-tree.h"
43 
44 #undef SCRAMBLE_DELAYED_REFS
45 
46 
47 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
48 			       struct btrfs_delayed_ref_head *href,
49 			       struct btrfs_delayed_ref_node *node,
50 			       struct btrfs_delayed_extent_op *extra_op);
51 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52 				    struct extent_buffer *leaf,
53 				    struct btrfs_extent_item *ei);
54 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
55 				      u64 parent, u64 root_objectid,
56 				      u64 flags, u64 owner, u64 offset,
57 				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
58 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
59 				     struct btrfs_delayed_ref_node *node,
60 				     struct btrfs_delayed_extent_op *extent_op);
61 static int find_next_key(struct btrfs_path *path, int level,
62 			 struct btrfs_key *key);
63 
block_group_bits(struct btrfs_block_group * cache,u64 bits)64 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
65 {
66 	return (cache->flags & bits) == bits;
67 }
68 
69 /* simple helper to search for an existing data extent at a given offset */
btrfs_lookup_data_extent(struct btrfs_fs_info * fs_info,u64 start,u64 len)70 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
71 {
72 	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
73 	int ret;
74 	struct btrfs_key key;
75 	struct btrfs_path *path;
76 
77 	path = btrfs_alloc_path();
78 	if (!path)
79 		return -ENOMEM;
80 
81 	key.objectid = start;
82 	key.offset = len;
83 	key.type = BTRFS_EXTENT_ITEM_KEY;
84 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
85 	btrfs_free_path(path);
86 	return ret;
87 }
88 
89 /*
90  * helper function to lookup reference count and flags of a tree block.
91  *
92  * the head node for delayed ref is used to store the sum of all the
93  * reference count modifications queued up in the rbtree. the head
94  * node may also store the extent flags to set. This way you can check
95  * to see what the reference count and extent flags would be if all of
96  * the delayed refs are not processed.
97  */
btrfs_lookup_extent_info(struct btrfs_trans_handle * trans,struct btrfs_fs_info * fs_info,u64 bytenr,u64 offset,int metadata,u64 * refs,u64 * flags,u64 * owning_root)98 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
99 			     struct btrfs_fs_info *fs_info, u64 bytenr,
100 			     u64 offset, int metadata, u64 *refs, u64 *flags,
101 			     u64 *owning_root)
102 {
103 	struct btrfs_root *extent_root;
104 	struct btrfs_delayed_ref_head *head;
105 	struct btrfs_delayed_ref_root *delayed_refs;
106 	struct btrfs_path *path;
107 	struct btrfs_key key;
108 	u64 num_refs;
109 	u64 extent_flags;
110 	u64 owner = 0;
111 	int ret;
112 
113 	/*
114 	 * If we don't have skinny metadata, don't bother doing anything
115 	 * different
116 	 */
117 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
118 		offset = fs_info->nodesize;
119 		metadata = 0;
120 	}
121 
122 	path = btrfs_alloc_path();
123 	if (!path)
124 		return -ENOMEM;
125 
126 search_again:
127 	key.objectid = bytenr;
128 	key.offset = offset;
129 	if (metadata)
130 		key.type = BTRFS_METADATA_ITEM_KEY;
131 	else
132 		key.type = BTRFS_EXTENT_ITEM_KEY;
133 
134 	extent_root = btrfs_extent_root(fs_info, bytenr);
135 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
136 	if (ret < 0)
137 		goto out_free;
138 
139 	if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
140 		if (path->slots[0]) {
141 			path->slots[0]--;
142 			btrfs_item_key_to_cpu(path->nodes[0], &key,
143 					      path->slots[0]);
144 			if (key.objectid == bytenr &&
145 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
146 			    key.offset == fs_info->nodesize)
147 				ret = 0;
148 		}
149 	}
150 
151 	if (ret == 0) {
152 		struct extent_buffer *leaf = path->nodes[0];
153 		struct btrfs_extent_item *ei;
154 		const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
155 
156 		if (unlikely(item_size < sizeof(*ei))) {
157 			ret = -EUCLEAN;
158 			btrfs_err(fs_info,
159 			"unexpected extent item size, has %u expect >= %zu",
160 				  item_size, sizeof(*ei));
161 			btrfs_abort_transaction(trans, ret);
162 			goto out_free;
163 		}
164 
165 		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
166 		num_refs = btrfs_extent_refs(leaf, ei);
167 		if (unlikely(num_refs == 0)) {
168 			ret = -EUCLEAN;
169 			btrfs_err(fs_info,
170 		"unexpected zero reference count for extent item (%llu %u %llu)",
171 				  key.objectid, key.type, key.offset);
172 			btrfs_abort_transaction(trans, ret);
173 			goto out_free;
174 		}
175 		extent_flags = btrfs_extent_flags(leaf, ei);
176 		owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
177 	} else {
178 		num_refs = 0;
179 		extent_flags = 0;
180 		ret = 0;
181 	}
182 
183 	delayed_refs = &trans->transaction->delayed_refs;
184 	spin_lock(&delayed_refs->lock);
185 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
186 	if (head) {
187 		if (!mutex_trylock(&head->mutex)) {
188 			refcount_inc(&head->refs);
189 			spin_unlock(&delayed_refs->lock);
190 
191 			btrfs_release_path(path);
192 
193 			/*
194 			 * Mutex was contended, block until it's released and try
195 			 * again
196 			 */
197 			mutex_lock(&head->mutex);
198 			mutex_unlock(&head->mutex);
199 			btrfs_put_delayed_ref_head(head);
200 			goto search_again;
201 		}
202 		spin_lock(&head->lock);
203 		if (head->extent_op && head->extent_op->update_flags)
204 			extent_flags |= head->extent_op->flags_to_set;
205 
206 		num_refs += head->ref_mod;
207 		spin_unlock(&head->lock);
208 		mutex_unlock(&head->mutex);
209 	}
210 	spin_unlock(&delayed_refs->lock);
211 
212 	WARN_ON(num_refs == 0);
213 	if (refs)
214 		*refs = num_refs;
215 	if (flags)
216 		*flags = extent_flags;
217 	if (owning_root)
218 		*owning_root = owner;
219 out_free:
220 	btrfs_free_path(path);
221 	return ret;
222 }
223 
224 /*
225  * Back reference rules.  Back refs have three main goals:
226  *
227  * 1) differentiate between all holders of references to an extent so that
228  *    when a reference is dropped we can make sure it was a valid reference
229  *    before freeing the extent.
230  *
231  * 2) Provide enough information to quickly find the holders of an extent
232  *    if we notice a given block is corrupted or bad.
233  *
234  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
235  *    maintenance.  This is actually the same as #2, but with a slightly
236  *    different use case.
237  *
238  * There are two kinds of back refs. The implicit back refs is optimized
239  * for pointers in non-shared tree blocks. For a given pointer in a block,
240  * back refs of this kind provide information about the block's owner tree
241  * and the pointer's key. These information allow us to find the block by
242  * b-tree searching. The full back refs is for pointers in tree blocks not
243  * referenced by their owner trees. The location of tree block is recorded
244  * in the back refs. Actually the full back refs is generic, and can be
245  * used in all cases the implicit back refs is used. The major shortcoming
246  * of the full back refs is its overhead. Every time a tree block gets
247  * COWed, we have to update back refs entry for all pointers in it.
248  *
249  * For a newly allocated tree block, we use implicit back refs for
250  * pointers in it. This means most tree related operations only involve
251  * implicit back refs. For a tree block created in old transaction, the
252  * only way to drop a reference to it is COW it. So we can detect the
253  * event that tree block loses its owner tree's reference and do the
254  * back refs conversion.
255  *
256  * When a tree block is COWed through a tree, there are four cases:
257  *
258  * The reference count of the block is one and the tree is the block's
259  * owner tree. Nothing to do in this case.
260  *
261  * The reference count of the block is one and the tree is not the
262  * block's owner tree. In this case, full back refs is used for pointers
263  * in the block. Remove these full back refs, add implicit back refs for
264  * every pointers in the new block.
265  *
266  * The reference count of the block is greater than one and the tree is
267  * the block's owner tree. In this case, implicit back refs is used for
268  * pointers in the block. Add full back refs for every pointers in the
269  * block, increase lower level extents' reference counts. The original
270  * implicit back refs are entailed to the new block.
271  *
272  * The reference count of the block is greater than one and the tree is
273  * not the block's owner tree. Add implicit back refs for every pointer in
274  * the new block, increase lower level extents' reference count.
275  *
276  * Back Reference Key composing:
277  *
278  * The key objectid corresponds to the first byte in the extent,
279  * The key type is used to differentiate between types of back refs.
280  * There are different meanings of the key offset for different types
281  * of back refs.
282  *
283  * File extents can be referenced by:
284  *
285  * - multiple snapshots, subvolumes, or different generations in one subvol
286  * - different files inside a single subvolume
287  * - different offsets inside a file (bookend extents in file.c)
288  *
289  * The extent ref structure for the implicit back refs has fields for:
290  *
291  * - Objectid of the subvolume root
292  * - objectid of the file holding the reference
293  * - original offset in the file
294  * - how many bookend extents
295  *
296  * The key offset for the implicit back refs is hash of the first
297  * three fields.
298  *
299  * The extent ref structure for the full back refs has field for:
300  *
301  * - number of pointers in the tree leaf
302  *
303  * The key offset for the implicit back refs is the first byte of
304  * the tree leaf
305  *
306  * When a file extent is allocated, The implicit back refs is used.
307  * the fields are filled in:
308  *
309  *     (root_key.objectid, inode objectid, offset in file, 1)
310  *
311  * When a file extent is removed file truncation, we find the
312  * corresponding implicit back refs and check the following fields:
313  *
314  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
315  *
316  * Btree extents can be referenced by:
317  *
318  * - Different subvolumes
319  *
320  * Both the implicit back refs and the full back refs for tree blocks
321  * only consist of key. The key offset for the implicit back refs is
322  * objectid of block's owner tree. The key offset for the full back refs
323  * is the first byte of parent block.
324  *
325  * When implicit back refs is used, information about the lowest key and
326  * level of the tree block are required. These information are stored in
327  * tree block info structure.
328  */
329 
330 /*
331  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
332  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
333  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
334  */
btrfs_get_extent_inline_ref_type(const struct extent_buffer * eb,struct btrfs_extent_inline_ref * iref,enum btrfs_inline_ref_type is_data)335 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
336 				     struct btrfs_extent_inline_ref *iref,
337 				     enum btrfs_inline_ref_type is_data)
338 {
339 	struct btrfs_fs_info *fs_info = eb->fs_info;
340 	int type = btrfs_extent_inline_ref_type(eb, iref);
341 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
342 
343 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
344 		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
345 		return type;
346 	}
347 
348 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
349 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
350 	    type == BTRFS_SHARED_DATA_REF_KEY ||
351 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
352 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
353 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
354 				return type;
355 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
356 				ASSERT(fs_info);
357 				/*
358 				 * Every shared one has parent tree block,
359 				 * which must be aligned to sector size.
360 				 */
361 				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
362 					return type;
363 			}
364 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
365 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
366 				return type;
367 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
368 				ASSERT(fs_info);
369 				/*
370 				 * Every shared one has parent tree block,
371 				 * which must be aligned to sector size.
372 				 */
373 				if (offset &&
374 				    IS_ALIGNED(offset, fs_info->sectorsize))
375 					return type;
376 			}
377 		} else {
378 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
379 			return type;
380 		}
381 	}
382 
383 	WARN_ON(1);
384 	btrfs_print_leaf(eb);
385 	btrfs_err(fs_info,
386 		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
387 		  eb->start, (unsigned long)iref, type);
388 
389 	return BTRFS_REF_TYPE_INVALID;
390 }
391 
hash_extent_data_ref(u64 root_objectid,u64 owner,u64 offset)392 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
393 {
394 	u32 high_crc = ~(u32)0;
395 	u32 low_crc = ~(u32)0;
396 	__le64 lenum;
397 
398 	lenum = cpu_to_le64(root_objectid);
399 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
400 	lenum = cpu_to_le64(owner);
401 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
402 	lenum = cpu_to_le64(offset);
403 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
404 
405 	return ((u64)high_crc << 31) ^ (u64)low_crc;
406 }
407 
hash_extent_data_ref_item(struct extent_buffer * leaf,struct btrfs_extent_data_ref * ref)408 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
409 				     struct btrfs_extent_data_ref *ref)
410 {
411 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
412 				    btrfs_extent_data_ref_objectid(leaf, ref),
413 				    btrfs_extent_data_ref_offset(leaf, ref));
414 }
415 
match_extent_data_ref(struct extent_buffer * leaf,struct btrfs_extent_data_ref * ref,u64 root_objectid,u64 owner,u64 offset)416 static int match_extent_data_ref(struct extent_buffer *leaf,
417 				 struct btrfs_extent_data_ref *ref,
418 				 u64 root_objectid, u64 owner, u64 offset)
419 {
420 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
421 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
422 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
423 		return 0;
424 	return 1;
425 }
426 
lookup_extent_data_ref(struct btrfs_trans_handle * trans,struct btrfs_path * path,u64 bytenr,u64 parent,u64 root_objectid,u64 owner,u64 offset)427 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
428 					   struct btrfs_path *path,
429 					   u64 bytenr, u64 parent,
430 					   u64 root_objectid,
431 					   u64 owner, u64 offset)
432 {
433 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
434 	struct btrfs_key key;
435 	struct btrfs_extent_data_ref *ref;
436 	struct extent_buffer *leaf;
437 	u32 nritems;
438 	int recow;
439 	int ret;
440 
441 	key.objectid = bytenr;
442 	if (parent) {
443 		key.type = BTRFS_SHARED_DATA_REF_KEY;
444 		key.offset = parent;
445 	} else {
446 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
447 		key.offset = hash_extent_data_ref(root_objectid,
448 						  owner, offset);
449 	}
450 again:
451 	recow = 0;
452 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
453 	if (ret < 0)
454 		return ret;
455 
456 	if (parent) {
457 		if (ret)
458 			return -ENOENT;
459 		return 0;
460 	}
461 
462 	ret = -ENOENT;
463 	leaf = path->nodes[0];
464 	nritems = btrfs_header_nritems(leaf);
465 	while (1) {
466 		if (path->slots[0] >= nritems) {
467 			ret = btrfs_next_leaf(root, path);
468 			if (ret) {
469 				if (ret > 0)
470 					return -ENOENT;
471 				return ret;
472 			}
473 
474 			leaf = path->nodes[0];
475 			nritems = btrfs_header_nritems(leaf);
476 			recow = 1;
477 		}
478 
479 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
480 		if (key.objectid != bytenr ||
481 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
482 			goto fail;
483 
484 		ref = btrfs_item_ptr(leaf, path->slots[0],
485 				     struct btrfs_extent_data_ref);
486 
487 		if (match_extent_data_ref(leaf, ref, root_objectid,
488 					  owner, offset)) {
489 			if (recow) {
490 				btrfs_release_path(path);
491 				goto again;
492 			}
493 			ret = 0;
494 			break;
495 		}
496 		path->slots[0]++;
497 	}
498 fail:
499 	return ret;
500 }
501 
insert_extent_data_ref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_delayed_ref_node * node,u64 bytenr)502 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
503 					   struct btrfs_path *path,
504 					   struct btrfs_delayed_ref_node *node,
505 					   u64 bytenr)
506 {
507 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
508 	struct btrfs_key key;
509 	struct extent_buffer *leaf;
510 	u64 owner = btrfs_delayed_ref_owner(node);
511 	u64 offset = btrfs_delayed_ref_offset(node);
512 	u32 size;
513 	u32 num_refs;
514 	int ret;
515 
516 	key.objectid = bytenr;
517 	if (node->parent) {
518 		key.type = BTRFS_SHARED_DATA_REF_KEY;
519 		key.offset = node->parent;
520 		size = sizeof(struct btrfs_shared_data_ref);
521 	} else {
522 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
523 		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
524 		size = sizeof(struct btrfs_extent_data_ref);
525 	}
526 
527 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
528 	if (ret && ret != -EEXIST)
529 		goto fail;
530 
531 	leaf = path->nodes[0];
532 	if (node->parent) {
533 		struct btrfs_shared_data_ref *ref;
534 		ref = btrfs_item_ptr(leaf, path->slots[0],
535 				     struct btrfs_shared_data_ref);
536 		if (ret == 0) {
537 			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
538 		} else {
539 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
540 			num_refs += node->ref_mod;
541 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
542 		}
543 	} else {
544 		struct btrfs_extent_data_ref *ref;
545 		while (ret == -EEXIST) {
546 			ref = btrfs_item_ptr(leaf, path->slots[0],
547 					     struct btrfs_extent_data_ref);
548 			if (match_extent_data_ref(leaf, ref, node->ref_root,
549 						  owner, offset))
550 				break;
551 			btrfs_release_path(path);
552 			key.offset++;
553 			ret = btrfs_insert_empty_item(trans, root, path, &key,
554 						      size);
555 			if (ret && ret != -EEXIST)
556 				goto fail;
557 
558 			leaf = path->nodes[0];
559 		}
560 		ref = btrfs_item_ptr(leaf, path->slots[0],
561 				     struct btrfs_extent_data_ref);
562 		if (ret == 0) {
563 			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
564 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
565 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
566 			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
567 		} else {
568 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
569 			num_refs += node->ref_mod;
570 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
571 		}
572 	}
573 	btrfs_mark_buffer_dirty(trans, leaf);
574 	ret = 0;
575 fail:
576 	btrfs_release_path(path);
577 	return ret;
578 }
579 
remove_extent_data_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int refs_to_drop)580 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
581 					   struct btrfs_root *root,
582 					   struct btrfs_path *path,
583 					   int refs_to_drop)
584 {
585 	struct btrfs_key key;
586 	struct btrfs_extent_data_ref *ref1 = NULL;
587 	struct btrfs_shared_data_ref *ref2 = NULL;
588 	struct extent_buffer *leaf;
589 	u32 num_refs = 0;
590 	int ret = 0;
591 
592 	leaf = path->nodes[0];
593 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
594 
595 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
596 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
597 				      struct btrfs_extent_data_ref);
598 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
599 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
600 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
601 				      struct btrfs_shared_data_ref);
602 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
603 	} else {
604 		btrfs_err(trans->fs_info,
605 			  "unrecognized backref key (%llu %u %llu)",
606 			  key.objectid, key.type, key.offset);
607 		btrfs_abort_transaction(trans, -EUCLEAN);
608 		return -EUCLEAN;
609 	}
610 
611 	BUG_ON(num_refs < refs_to_drop);
612 	num_refs -= refs_to_drop;
613 
614 	if (num_refs == 0) {
615 		ret = btrfs_del_item(trans, root, path);
616 	} else {
617 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
618 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
619 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
620 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
621 		btrfs_mark_buffer_dirty(trans, leaf);
622 	}
623 	return ret;
624 }
625 
extent_data_ref_count(struct btrfs_path * path,struct btrfs_extent_inline_ref * iref)626 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
627 					  struct btrfs_extent_inline_ref *iref)
628 {
629 	struct btrfs_key key;
630 	struct extent_buffer *leaf;
631 	struct btrfs_extent_data_ref *ref1;
632 	struct btrfs_shared_data_ref *ref2;
633 	u32 num_refs = 0;
634 	int type;
635 
636 	leaf = path->nodes[0];
637 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
638 
639 	if (iref) {
640 		/*
641 		 * If type is invalid, we should have bailed out earlier than
642 		 * this call.
643 		 */
644 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
645 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
646 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
647 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
648 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649 		} else {
650 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
651 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
652 		}
653 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
654 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
655 				      struct btrfs_extent_data_ref);
656 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
657 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
658 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
659 				      struct btrfs_shared_data_ref);
660 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
661 	} else {
662 		WARN_ON(1);
663 	}
664 	return num_refs;
665 }
666 
lookup_tree_block_ref(struct btrfs_trans_handle * trans,struct btrfs_path * path,u64 bytenr,u64 parent,u64 root_objectid)667 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
668 					  struct btrfs_path *path,
669 					  u64 bytenr, u64 parent,
670 					  u64 root_objectid)
671 {
672 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
673 	struct btrfs_key key;
674 	int ret;
675 
676 	key.objectid = bytenr;
677 	if (parent) {
678 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
679 		key.offset = parent;
680 	} else {
681 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
682 		key.offset = root_objectid;
683 	}
684 
685 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
686 	if (ret > 0)
687 		ret = -ENOENT;
688 	return ret;
689 }
690 
insert_tree_block_ref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_delayed_ref_node * node,u64 bytenr)691 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
692 					  struct btrfs_path *path,
693 					  struct btrfs_delayed_ref_node *node,
694 					  u64 bytenr)
695 {
696 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
697 	struct btrfs_key key;
698 	int ret;
699 
700 	key.objectid = bytenr;
701 	if (node->parent) {
702 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
703 		key.offset = node->parent;
704 	} else {
705 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
706 		key.offset = node->ref_root;
707 	}
708 
709 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
710 	btrfs_release_path(path);
711 	return ret;
712 }
713 
extent_ref_type(u64 parent,u64 owner)714 static inline int extent_ref_type(u64 parent, u64 owner)
715 {
716 	int type;
717 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
718 		if (parent > 0)
719 			type = BTRFS_SHARED_BLOCK_REF_KEY;
720 		else
721 			type = BTRFS_TREE_BLOCK_REF_KEY;
722 	} else {
723 		if (parent > 0)
724 			type = BTRFS_SHARED_DATA_REF_KEY;
725 		else
726 			type = BTRFS_EXTENT_DATA_REF_KEY;
727 	}
728 	return type;
729 }
730 
find_next_key(struct btrfs_path * path,int level,struct btrfs_key * key)731 static int find_next_key(struct btrfs_path *path, int level,
732 			 struct btrfs_key *key)
733 
734 {
735 	for (; level < BTRFS_MAX_LEVEL; level++) {
736 		if (!path->nodes[level])
737 			break;
738 		if (path->slots[level] + 1 >=
739 		    btrfs_header_nritems(path->nodes[level]))
740 			continue;
741 		if (level == 0)
742 			btrfs_item_key_to_cpu(path->nodes[level], key,
743 					      path->slots[level] + 1);
744 		else
745 			btrfs_node_key_to_cpu(path->nodes[level], key,
746 					      path->slots[level] + 1);
747 		return 0;
748 	}
749 	return 1;
750 }
751 
752 /*
753  * look for inline back ref. if back ref is found, *ref_ret is set
754  * to the address of inline back ref, and 0 is returned.
755  *
756  * if back ref isn't found, *ref_ret is set to the address where it
757  * should be inserted, and -ENOENT is returned.
758  *
759  * if insert is true and there are too many inline back refs, the path
760  * points to the extent item, and -EAGAIN is returned.
761  *
762  * NOTE: inline back refs are ordered in the same way that back ref
763  *	 items in the tree are ordered.
764  */
765 static noinline_for_stack
lookup_inline_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_extent_inline_ref ** ref_ret,u64 bytenr,u64 num_bytes,u64 parent,u64 root_objectid,u64 owner,u64 offset,int insert)766 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
767 				 struct btrfs_path *path,
768 				 struct btrfs_extent_inline_ref **ref_ret,
769 				 u64 bytenr, u64 num_bytes,
770 				 u64 parent, u64 root_objectid,
771 				 u64 owner, u64 offset, int insert)
772 {
773 	struct btrfs_fs_info *fs_info = trans->fs_info;
774 	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
775 	struct btrfs_key key;
776 	struct extent_buffer *leaf;
777 	struct btrfs_extent_item *ei;
778 	struct btrfs_extent_inline_ref *iref;
779 	u64 flags;
780 	u64 item_size;
781 	unsigned long ptr;
782 	unsigned long end;
783 	int extra_size;
784 	int type;
785 	int want;
786 	int ret;
787 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
788 	int needed;
789 
790 	key.objectid = bytenr;
791 	key.type = BTRFS_EXTENT_ITEM_KEY;
792 	key.offset = num_bytes;
793 
794 	want = extent_ref_type(parent, owner);
795 	if (insert) {
796 		extra_size = btrfs_extent_inline_ref_size(want);
797 		path->search_for_extension = 1;
798 		path->keep_locks = 1;
799 	} else
800 		extra_size = -1;
801 
802 	/*
803 	 * Owner is our level, so we can just add one to get the level for the
804 	 * block we are interested in.
805 	 */
806 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
807 		key.type = BTRFS_METADATA_ITEM_KEY;
808 		key.offset = owner;
809 	}
810 
811 again:
812 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
813 	if (ret < 0)
814 		goto out;
815 
816 	/*
817 	 * We may be a newly converted file system which still has the old fat
818 	 * extent entries for metadata, so try and see if we have one of those.
819 	 */
820 	if (ret > 0 && skinny_metadata) {
821 		skinny_metadata = false;
822 		if (path->slots[0]) {
823 			path->slots[0]--;
824 			btrfs_item_key_to_cpu(path->nodes[0], &key,
825 					      path->slots[0]);
826 			if (key.objectid == bytenr &&
827 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
828 			    key.offset == num_bytes)
829 				ret = 0;
830 		}
831 		if (ret) {
832 			key.objectid = bytenr;
833 			key.type = BTRFS_EXTENT_ITEM_KEY;
834 			key.offset = num_bytes;
835 			btrfs_release_path(path);
836 			goto again;
837 		}
838 	}
839 
840 	if (ret && !insert) {
841 		ret = -ENOENT;
842 		goto out;
843 	} else if (WARN_ON(ret)) {
844 		btrfs_print_leaf(path->nodes[0]);
845 		btrfs_err(fs_info,
846 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
847 			  bytenr, num_bytes, parent, root_objectid, owner,
848 			  offset);
849 		ret = -EUCLEAN;
850 		goto out;
851 	}
852 
853 	leaf = path->nodes[0];
854 	item_size = btrfs_item_size(leaf, path->slots[0]);
855 	if (unlikely(item_size < sizeof(*ei))) {
856 		ret = -EUCLEAN;
857 		btrfs_err(fs_info,
858 			  "unexpected extent item size, has %llu expect >= %zu",
859 			  item_size, sizeof(*ei));
860 		btrfs_abort_transaction(trans, ret);
861 		goto out;
862 	}
863 
864 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
865 	flags = btrfs_extent_flags(leaf, ei);
866 
867 	ptr = (unsigned long)(ei + 1);
868 	end = (unsigned long)ei + item_size;
869 
870 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
871 		ptr += sizeof(struct btrfs_tree_block_info);
872 		BUG_ON(ptr > end);
873 	}
874 
875 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
876 		needed = BTRFS_REF_TYPE_DATA;
877 	else
878 		needed = BTRFS_REF_TYPE_BLOCK;
879 
880 	ret = -ENOENT;
881 	while (ptr < end) {
882 		iref = (struct btrfs_extent_inline_ref *)ptr;
883 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
884 		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
885 			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
886 			ptr += btrfs_extent_inline_ref_size(type);
887 			continue;
888 		}
889 		if (type == BTRFS_REF_TYPE_INVALID) {
890 			ret = -EUCLEAN;
891 			goto out;
892 		}
893 
894 		if (want < type)
895 			break;
896 		if (want > type) {
897 			ptr += btrfs_extent_inline_ref_size(type);
898 			continue;
899 		}
900 
901 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
902 			struct btrfs_extent_data_ref *dref;
903 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
904 			if (match_extent_data_ref(leaf, dref, root_objectid,
905 						  owner, offset)) {
906 				ret = 0;
907 				break;
908 			}
909 			if (hash_extent_data_ref_item(leaf, dref) <
910 			    hash_extent_data_ref(root_objectid, owner, offset))
911 				break;
912 		} else {
913 			u64 ref_offset;
914 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
915 			if (parent > 0) {
916 				if (parent == ref_offset) {
917 					ret = 0;
918 					break;
919 				}
920 				if (ref_offset < parent)
921 					break;
922 			} else {
923 				if (root_objectid == ref_offset) {
924 					ret = 0;
925 					break;
926 				}
927 				if (ref_offset < root_objectid)
928 					break;
929 			}
930 		}
931 		ptr += btrfs_extent_inline_ref_size(type);
932 	}
933 
934 	if (unlikely(ptr > end)) {
935 		ret = -EUCLEAN;
936 		btrfs_print_leaf(path->nodes[0]);
937 		btrfs_crit(fs_info,
938 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
939 			   path->slots[0], root_objectid, owner, offset, parent);
940 		goto out;
941 	}
942 
943 	if (ret == -ENOENT && insert) {
944 		if (item_size + extra_size >=
945 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
946 			ret = -EAGAIN;
947 			goto out;
948 		}
949 		/*
950 		 * To add new inline back ref, we have to make sure
951 		 * there is no corresponding back ref item.
952 		 * For simplicity, we just do not add new inline back
953 		 * ref if there is any kind of item for this block
954 		 */
955 		if (find_next_key(path, 0, &key) == 0 &&
956 		    key.objectid == bytenr &&
957 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
958 			ret = -EAGAIN;
959 			goto out;
960 		}
961 	}
962 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963 out:
964 	if (insert) {
965 		path->keep_locks = 0;
966 		path->search_for_extension = 0;
967 		btrfs_unlock_up_safe(path, 1);
968 	}
969 	return ret;
970 }
971 
972 /*
973  * helper to add new inline back ref
974  */
975 static noinline_for_stack
setup_inline_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_extent_inline_ref * iref,u64 parent,u64 root_objectid,u64 owner,u64 offset,int refs_to_add,struct btrfs_delayed_extent_op * extent_op)976 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
977 				 struct btrfs_path *path,
978 				 struct btrfs_extent_inline_ref *iref,
979 				 u64 parent, u64 root_objectid,
980 				 u64 owner, u64 offset, int refs_to_add,
981 				 struct btrfs_delayed_extent_op *extent_op)
982 {
983 	struct extent_buffer *leaf;
984 	struct btrfs_extent_item *ei;
985 	unsigned long ptr;
986 	unsigned long end;
987 	unsigned long item_offset;
988 	u64 refs;
989 	int size;
990 	int type;
991 
992 	leaf = path->nodes[0];
993 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994 	item_offset = (unsigned long)iref - (unsigned long)ei;
995 
996 	type = extent_ref_type(parent, owner);
997 	size = btrfs_extent_inline_ref_size(type);
998 
999 	btrfs_extend_item(trans, path, size);
1000 
1001 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002 	refs = btrfs_extent_refs(leaf, ei);
1003 	refs += refs_to_add;
1004 	btrfs_set_extent_refs(leaf, ei, refs);
1005 	if (extent_op)
1006 		__run_delayed_extent_op(extent_op, leaf, ei);
1007 
1008 	ptr = (unsigned long)ei + item_offset;
1009 	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1010 	if (ptr < end - size)
1011 		memmove_extent_buffer(leaf, ptr + size, ptr,
1012 				      end - size - ptr);
1013 
1014 	iref = (struct btrfs_extent_inline_ref *)ptr;
1015 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017 		struct btrfs_extent_data_ref *dref;
1018 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024 		struct btrfs_shared_data_ref *sref;
1025 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030 	} else {
1031 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032 	}
1033 	btrfs_mark_buffer_dirty(trans, leaf);
1034 }
1035 
lookup_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_extent_inline_ref ** ref_ret,u64 bytenr,u64 num_bytes,u64 parent,u64 root_objectid,u64 owner,u64 offset)1036 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1037 				 struct btrfs_path *path,
1038 				 struct btrfs_extent_inline_ref **ref_ret,
1039 				 u64 bytenr, u64 num_bytes, u64 parent,
1040 				 u64 root_objectid, u64 owner, u64 offset)
1041 {
1042 	int ret;
1043 
1044 	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045 					   num_bytes, parent, root_objectid,
1046 					   owner, offset, 0);
1047 	if (ret != -ENOENT)
1048 		return ret;
1049 
1050 	btrfs_release_path(path);
1051 	*ref_ret = NULL;
1052 
1053 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054 		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055 					    root_objectid);
1056 	} else {
1057 		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058 					     root_objectid, owner, offset);
1059 	}
1060 	return ret;
1061 }
1062 
1063 /*
1064  * helper to update/remove inline back ref
1065  */
update_inline_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_extent_inline_ref * iref,int refs_to_mod,struct btrfs_delayed_extent_op * extent_op)1066 static noinline_for_stack int update_inline_extent_backref(
1067 				  struct btrfs_trans_handle *trans,
1068 				  struct btrfs_path *path,
1069 				  struct btrfs_extent_inline_ref *iref,
1070 				  int refs_to_mod,
1071 				  struct btrfs_delayed_extent_op *extent_op)
1072 {
1073 	struct extent_buffer *leaf = path->nodes[0];
1074 	struct btrfs_fs_info *fs_info = leaf->fs_info;
1075 	struct btrfs_extent_item *ei;
1076 	struct btrfs_extent_data_ref *dref = NULL;
1077 	struct btrfs_shared_data_ref *sref = NULL;
1078 	unsigned long ptr;
1079 	unsigned long end;
1080 	u32 item_size;
1081 	int size;
1082 	int type;
1083 	u64 refs;
1084 
1085 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086 	refs = btrfs_extent_refs(leaf, ei);
1087 	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1088 		struct btrfs_key key;
1089 		u32 extent_size;
1090 
1091 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1093 			extent_size = fs_info->nodesize;
1094 		else
1095 			extent_size = key.offset;
1096 		btrfs_print_leaf(leaf);
1097 		btrfs_err(fs_info,
1098 	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1099 			  key.objectid, extent_size, refs_to_mod, refs);
1100 		return -EUCLEAN;
1101 	}
1102 	refs += refs_to_mod;
1103 	btrfs_set_extent_refs(leaf, ei, refs);
1104 	if (extent_op)
1105 		__run_delayed_extent_op(extent_op, leaf, ei);
1106 
1107 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1108 	/*
1109 	 * Function btrfs_get_extent_inline_ref_type() has already printed
1110 	 * error messages.
1111 	 */
1112 	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1113 		return -EUCLEAN;
1114 
1115 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1116 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1117 		refs = btrfs_extent_data_ref_count(leaf, dref);
1118 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1119 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1120 		refs = btrfs_shared_data_ref_count(leaf, sref);
1121 	} else {
1122 		refs = 1;
1123 		/*
1124 		 * For tree blocks we can only drop one ref for it, and tree
1125 		 * blocks should not have refs > 1.
1126 		 *
1127 		 * Furthermore if we're inserting a new inline backref, we
1128 		 * won't reach this path either. That would be
1129 		 * setup_inline_extent_backref().
1130 		 */
1131 		if (unlikely(refs_to_mod != -1)) {
1132 			struct btrfs_key key;
1133 
1134 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 
1136 			btrfs_print_leaf(leaf);
1137 			btrfs_err(fs_info,
1138 			"invalid refs_to_mod for tree block %llu, has %d expect -1",
1139 				  key.objectid, refs_to_mod);
1140 			return -EUCLEAN;
1141 		}
1142 	}
1143 
1144 	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1145 		struct btrfs_key key;
1146 		u32 extent_size;
1147 
1148 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1149 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1150 			extent_size = fs_info->nodesize;
1151 		else
1152 			extent_size = key.offset;
1153 		btrfs_print_leaf(leaf);
1154 		btrfs_err(fs_info,
1155 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1156 			  (unsigned long)iref, key.objectid, extent_size,
1157 			  refs_to_mod, refs);
1158 		return -EUCLEAN;
1159 	}
1160 	refs += refs_to_mod;
1161 
1162 	if (refs > 0) {
1163 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1164 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1165 		else
1166 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1167 	} else {
1168 		size =  btrfs_extent_inline_ref_size(type);
1169 		item_size = btrfs_item_size(leaf, path->slots[0]);
1170 		ptr = (unsigned long)iref;
1171 		end = (unsigned long)ei + item_size;
1172 		if (ptr + size < end)
1173 			memmove_extent_buffer(leaf, ptr, ptr + size,
1174 					      end - ptr - size);
1175 		item_size -= size;
1176 		btrfs_truncate_item(trans, path, item_size, 1);
1177 	}
1178 	btrfs_mark_buffer_dirty(trans, leaf);
1179 	return 0;
1180 }
1181 
1182 static noinline_for_stack
insert_inline_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_path * path,u64 bytenr,u64 num_bytes,u64 parent,u64 root_objectid,u64 owner,u64 offset,int refs_to_add,struct btrfs_delayed_extent_op * extent_op)1183 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1184 				 struct btrfs_path *path,
1185 				 u64 bytenr, u64 num_bytes, u64 parent,
1186 				 u64 root_objectid, u64 owner,
1187 				 u64 offset, int refs_to_add,
1188 				 struct btrfs_delayed_extent_op *extent_op)
1189 {
1190 	struct btrfs_extent_inline_ref *iref;
1191 	int ret;
1192 
1193 	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1194 					   num_bytes, parent, root_objectid,
1195 					   owner, offset, 1);
1196 	if (ret == 0) {
1197 		/*
1198 		 * We're adding refs to a tree block we already own, this
1199 		 * should not happen at all.
1200 		 */
1201 		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1202 			btrfs_print_leaf(path->nodes[0]);
1203 			btrfs_crit(trans->fs_info,
1204 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1205 				   bytenr, num_bytes, root_objectid, path->slots[0]);
1206 			return -EUCLEAN;
1207 		}
1208 		ret = update_inline_extent_backref(trans, path, iref,
1209 						   refs_to_add, extent_op);
1210 	} else if (ret == -ENOENT) {
1211 		setup_inline_extent_backref(trans, path, iref, parent,
1212 					    root_objectid, owner, offset,
1213 					    refs_to_add, extent_op);
1214 		ret = 0;
1215 	}
1216 	return ret;
1217 }
1218 
remove_extent_backref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_extent_inline_ref * iref,int refs_to_drop,int is_data)1219 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1220 				 struct btrfs_root *root,
1221 				 struct btrfs_path *path,
1222 				 struct btrfs_extent_inline_ref *iref,
1223 				 int refs_to_drop, int is_data)
1224 {
1225 	int ret = 0;
1226 
1227 	BUG_ON(!is_data && refs_to_drop != 1);
1228 	if (iref)
1229 		ret = update_inline_extent_backref(trans, path, iref,
1230 						   -refs_to_drop, NULL);
1231 	else if (is_data)
1232 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1233 	else
1234 		ret = btrfs_del_item(trans, root, path);
1235 	return ret;
1236 }
1237 
btrfs_issue_discard(struct block_device * bdev,u64 start,u64 len,u64 * discarded_bytes)1238 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1239 			       u64 *discarded_bytes)
1240 {
1241 	int j, ret = 0;
1242 	u64 bytes_left, end;
1243 	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1244 
1245 	/* Adjust the range to be aligned to 512B sectors if necessary. */
1246 	if (start != aligned_start) {
1247 		len -= aligned_start - start;
1248 		len = round_down(len, 1 << SECTOR_SHIFT);
1249 		start = aligned_start;
1250 	}
1251 
1252 	*discarded_bytes = 0;
1253 
1254 	if (!len)
1255 		return 0;
1256 
1257 	end = start + len;
1258 	bytes_left = len;
1259 
1260 	/* Skip any superblocks on this device. */
1261 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262 		u64 sb_start = btrfs_sb_offset(j);
1263 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264 		u64 size = sb_start - start;
1265 
1266 		if (!in_range(sb_start, start, bytes_left) &&
1267 		    !in_range(sb_end, start, bytes_left) &&
1268 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269 			continue;
1270 
1271 		/*
1272 		 * Superblock spans beginning of range.  Adjust start and
1273 		 * try again.
1274 		 */
1275 		if (sb_start <= start) {
1276 			start += sb_end - start;
1277 			if (start > end) {
1278 				bytes_left = 0;
1279 				break;
1280 			}
1281 			bytes_left = end - start;
1282 			continue;
1283 		}
1284 
1285 		if (size) {
1286 			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1287 						   size >> SECTOR_SHIFT,
1288 						   GFP_NOFS);
1289 			if (!ret)
1290 				*discarded_bytes += size;
1291 			else if (ret != -EOPNOTSUPP)
1292 				return ret;
1293 		}
1294 
1295 		start = sb_end;
1296 		if (start > end) {
1297 			bytes_left = 0;
1298 			break;
1299 		}
1300 		bytes_left = end - start;
1301 	}
1302 
1303 	while (bytes_left) {
1304 		u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left);
1305 
1306 		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1307 					   bytes_to_discard >> SECTOR_SHIFT,
1308 					   GFP_NOFS);
1309 
1310 		if (ret) {
1311 			if (ret != -EOPNOTSUPP)
1312 				break;
1313 			continue;
1314 		}
1315 
1316 		start += bytes_to_discard;
1317 		bytes_left -= bytes_to_discard;
1318 		*discarded_bytes += bytes_to_discard;
1319 
1320 		if (btrfs_trim_interrupted()) {
1321 			ret = -ERESTARTSYS;
1322 			break;
1323 		}
1324 	}
1325 
1326 	return ret;
1327 }
1328 
do_discard_extent(struct btrfs_discard_stripe * stripe,u64 * bytes)1329 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1330 {
1331 	struct btrfs_device *dev = stripe->dev;
1332 	struct btrfs_fs_info *fs_info = dev->fs_info;
1333 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1334 	u64 phys = stripe->physical;
1335 	u64 len = stripe->length;
1336 	u64 discarded = 0;
1337 	int ret = 0;
1338 
1339 	/* Zone reset on a zoned filesystem */
1340 	if (btrfs_can_zone_reset(dev, phys, len)) {
1341 		u64 src_disc;
1342 
1343 		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1344 		if (ret)
1345 			goto out;
1346 
1347 		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1348 		    dev != dev_replace->srcdev)
1349 			goto out;
1350 
1351 		src_disc = discarded;
1352 
1353 		/* Send to replace target as well */
1354 		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1355 					      &discarded);
1356 		discarded += src_disc;
1357 	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1358 		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1359 	} else {
1360 		ret = 0;
1361 		*bytes = 0;
1362 	}
1363 
1364 out:
1365 	*bytes = discarded;
1366 	return ret;
1367 }
1368 
btrfs_discard_extent(struct btrfs_fs_info * fs_info,u64 bytenr,u64 num_bytes,u64 * actual_bytes)1369 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1370 			 u64 num_bytes, u64 *actual_bytes)
1371 {
1372 	int ret = 0;
1373 	u64 discarded_bytes = 0;
1374 	u64 end = bytenr + num_bytes;
1375 	u64 cur = bytenr;
1376 
1377 	/*
1378 	 * Avoid races with device replace and make sure the devices in the
1379 	 * stripes don't go away while we are discarding.
1380 	 */
1381 	btrfs_bio_counter_inc_blocked(fs_info);
1382 	while (cur < end) {
1383 		struct btrfs_discard_stripe *stripes;
1384 		unsigned int num_stripes;
1385 		int i;
1386 
1387 		num_bytes = end - cur;
1388 		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1389 		if (IS_ERR(stripes)) {
1390 			ret = PTR_ERR(stripes);
1391 			if (ret == -EOPNOTSUPP)
1392 				ret = 0;
1393 			break;
1394 		}
1395 
1396 		for (i = 0; i < num_stripes; i++) {
1397 			struct btrfs_discard_stripe *stripe = stripes + i;
1398 			u64 bytes;
1399 
1400 			if (!stripe->dev->bdev) {
1401 				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1402 				continue;
1403 			}
1404 
1405 			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1406 					&stripe->dev->dev_state))
1407 				continue;
1408 
1409 			ret = do_discard_extent(stripe, &bytes);
1410 			if (ret) {
1411 				/*
1412 				 * Keep going if discard is not supported by the
1413 				 * device.
1414 				 */
1415 				if (ret != -EOPNOTSUPP)
1416 					break;
1417 				ret = 0;
1418 			} else {
1419 				discarded_bytes += bytes;
1420 			}
1421 		}
1422 		kfree(stripes);
1423 		if (ret)
1424 			break;
1425 		cur += num_bytes;
1426 	}
1427 	btrfs_bio_counter_dec(fs_info);
1428 	if (actual_bytes)
1429 		*actual_bytes = discarded_bytes;
1430 	return ret;
1431 }
1432 
1433 /* Can return -ENOMEM */
btrfs_inc_extent_ref(struct btrfs_trans_handle * trans,struct btrfs_ref * generic_ref)1434 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1435 			 struct btrfs_ref *generic_ref)
1436 {
1437 	struct btrfs_fs_info *fs_info = trans->fs_info;
1438 	int ret;
1439 
1440 	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1441 	       generic_ref->action);
1442 	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1443 	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
1444 
1445 	if (generic_ref->type == BTRFS_REF_METADATA)
1446 		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1447 	else
1448 		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1449 
1450 	btrfs_ref_tree_mod(fs_info, generic_ref);
1451 
1452 	return ret;
1453 }
1454 
1455 /*
1456  * Insert backreference for a given extent.
1457  *
1458  * The counterpart is in __btrfs_free_extent(), with examples and more details
1459  * how it works.
1460  *
1461  * @trans:	    Handle of transaction
1462  *
1463  * @node:	    The delayed ref node used to get the bytenr/length for
1464  *		    extent whose references are incremented.
1465  *
1466  * @extent_op       Pointer to a structure, holding information necessary when
1467  *                  updating a tree block's flags
1468  *
1469  */
__btrfs_inc_extent_ref(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op)1470 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1471 				  struct btrfs_delayed_ref_node *node,
1472 				  struct btrfs_delayed_extent_op *extent_op)
1473 {
1474 	struct btrfs_path *path;
1475 	struct extent_buffer *leaf;
1476 	struct btrfs_extent_item *item;
1477 	struct btrfs_key key;
1478 	u64 bytenr = node->bytenr;
1479 	u64 num_bytes = node->num_bytes;
1480 	u64 owner = btrfs_delayed_ref_owner(node);
1481 	u64 offset = btrfs_delayed_ref_offset(node);
1482 	u64 refs;
1483 	int refs_to_add = node->ref_mod;
1484 	int ret;
1485 
1486 	path = btrfs_alloc_path();
1487 	if (!path)
1488 		return -ENOMEM;
1489 
1490 	/* this will setup the path even if it fails to insert the back ref */
1491 	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1492 					   node->parent, node->ref_root, owner,
1493 					   offset, refs_to_add, extent_op);
1494 	if ((ret < 0 && ret != -EAGAIN) || !ret)
1495 		goto out;
1496 
1497 	/*
1498 	 * Ok we had -EAGAIN which means we didn't have space to insert and
1499 	 * inline extent ref, so just update the reference count and add a
1500 	 * normal backref.
1501 	 */
1502 	leaf = path->nodes[0];
1503 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1504 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1505 	refs = btrfs_extent_refs(leaf, item);
1506 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1507 	if (extent_op)
1508 		__run_delayed_extent_op(extent_op, leaf, item);
1509 
1510 	btrfs_mark_buffer_dirty(trans, leaf);
1511 	btrfs_release_path(path);
1512 
1513 	/* now insert the actual backref */
1514 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1515 		ret = insert_tree_block_ref(trans, path, node, bytenr);
1516 	else
1517 		ret = insert_extent_data_ref(trans, path, node, bytenr);
1518 
1519 	if (ret)
1520 		btrfs_abort_transaction(trans, ret);
1521 out:
1522 	btrfs_free_path(path);
1523 	return ret;
1524 }
1525 
free_head_ref_squota_rsv(struct btrfs_fs_info * fs_info,struct btrfs_delayed_ref_head * href)1526 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1527 				     struct btrfs_delayed_ref_head *href)
1528 {
1529 	u64 root = href->owning_root;
1530 
1531 	/*
1532 	 * Don't check must_insert_reserved, as this is called from contexts
1533 	 * where it has already been unset.
1534 	 */
1535 	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1536 	    !href->is_data || !is_fstree(root))
1537 		return;
1538 
1539 	btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1540 				  BTRFS_QGROUP_RSV_DATA);
1541 }
1542 
run_delayed_data_ref(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * href,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op,bool insert_reserved)1543 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1544 				struct btrfs_delayed_ref_head *href,
1545 				struct btrfs_delayed_ref_node *node,
1546 				struct btrfs_delayed_extent_op *extent_op,
1547 				bool insert_reserved)
1548 {
1549 	int ret = 0;
1550 	u64 parent = 0;
1551 	u64 flags = 0;
1552 
1553 	trace_run_delayed_data_ref(trans->fs_info, node);
1554 
1555 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1556 		parent = node->parent;
1557 
1558 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1559 		struct btrfs_key key;
1560 		struct btrfs_squota_delta delta = {
1561 			.root = href->owning_root,
1562 			.num_bytes = node->num_bytes,
1563 			.is_data = true,
1564 			.is_inc	= true,
1565 			.generation = trans->transid,
1566 		};
1567 		u64 owner = btrfs_delayed_ref_owner(node);
1568 		u64 offset = btrfs_delayed_ref_offset(node);
1569 
1570 		if (extent_op)
1571 			flags |= extent_op->flags_to_set;
1572 
1573 		key.objectid = node->bytenr;
1574 		key.type = BTRFS_EXTENT_ITEM_KEY;
1575 		key.offset = node->num_bytes;
1576 
1577 		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
1578 						 flags, owner, offset, &key,
1579 						 node->ref_mod,
1580 						 href->owning_root);
1581 		free_head_ref_squota_rsv(trans->fs_info, href);
1582 		if (!ret)
1583 			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
1584 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1585 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1586 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1587 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1588 	} else {
1589 		BUG();
1590 	}
1591 	return ret;
1592 }
1593 
__run_delayed_extent_op(struct btrfs_delayed_extent_op * extent_op,struct extent_buffer * leaf,struct btrfs_extent_item * ei)1594 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1595 				    struct extent_buffer *leaf,
1596 				    struct btrfs_extent_item *ei)
1597 {
1598 	u64 flags = btrfs_extent_flags(leaf, ei);
1599 	if (extent_op->update_flags) {
1600 		flags |= extent_op->flags_to_set;
1601 		btrfs_set_extent_flags(leaf, ei, flags);
1602 	}
1603 
1604 	if (extent_op->update_key) {
1605 		struct btrfs_tree_block_info *bi;
1606 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1607 		bi = (struct btrfs_tree_block_info *)(ei + 1);
1608 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1609 	}
1610 }
1611 
run_delayed_extent_op(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * head,struct btrfs_delayed_extent_op * extent_op)1612 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1613 				 struct btrfs_delayed_ref_head *head,
1614 				 struct btrfs_delayed_extent_op *extent_op)
1615 {
1616 	struct btrfs_fs_info *fs_info = trans->fs_info;
1617 	struct btrfs_root *root;
1618 	struct btrfs_key key;
1619 	struct btrfs_path *path;
1620 	struct btrfs_extent_item *ei;
1621 	struct extent_buffer *leaf;
1622 	u32 item_size;
1623 	int ret;
1624 	int metadata = 1;
1625 
1626 	if (TRANS_ABORTED(trans))
1627 		return 0;
1628 
1629 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1630 		metadata = 0;
1631 
1632 	path = btrfs_alloc_path();
1633 	if (!path)
1634 		return -ENOMEM;
1635 
1636 	key.objectid = head->bytenr;
1637 
1638 	if (metadata) {
1639 		key.type = BTRFS_METADATA_ITEM_KEY;
1640 		key.offset = head->level;
1641 	} else {
1642 		key.type = BTRFS_EXTENT_ITEM_KEY;
1643 		key.offset = head->num_bytes;
1644 	}
1645 
1646 	root = btrfs_extent_root(fs_info, key.objectid);
1647 again:
1648 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1649 	if (ret < 0) {
1650 		goto out;
1651 	} else if (ret > 0) {
1652 		if (metadata) {
1653 			if (path->slots[0] > 0) {
1654 				path->slots[0]--;
1655 				btrfs_item_key_to_cpu(path->nodes[0], &key,
1656 						      path->slots[0]);
1657 				if (key.objectid == head->bytenr &&
1658 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1659 				    key.offset == head->num_bytes)
1660 					ret = 0;
1661 			}
1662 			if (ret > 0) {
1663 				btrfs_release_path(path);
1664 				metadata = 0;
1665 
1666 				key.objectid = head->bytenr;
1667 				key.offset = head->num_bytes;
1668 				key.type = BTRFS_EXTENT_ITEM_KEY;
1669 				goto again;
1670 			}
1671 		} else {
1672 			ret = -EUCLEAN;
1673 			btrfs_err(fs_info,
1674 		  "missing extent item for extent %llu num_bytes %llu level %d",
1675 				  head->bytenr, head->num_bytes, head->level);
1676 			goto out;
1677 		}
1678 	}
1679 
1680 	leaf = path->nodes[0];
1681 	item_size = btrfs_item_size(leaf, path->slots[0]);
1682 
1683 	if (unlikely(item_size < sizeof(*ei))) {
1684 		ret = -EUCLEAN;
1685 		btrfs_err(fs_info,
1686 			  "unexpected extent item size, has %u expect >= %zu",
1687 			  item_size, sizeof(*ei));
1688 		btrfs_abort_transaction(trans, ret);
1689 		goto out;
1690 	}
1691 
1692 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1693 	__run_delayed_extent_op(extent_op, leaf, ei);
1694 
1695 	btrfs_mark_buffer_dirty(trans, leaf);
1696 out:
1697 	btrfs_free_path(path);
1698 	return ret;
1699 }
1700 
run_delayed_tree_ref(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * href,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op,bool insert_reserved)1701 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1702 				struct btrfs_delayed_ref_head *href,
1703 				struct btrfs_delayed_ref_node *node,
1704 				struct btrfs_delayed_extent_op *extent_op,
1705 				bool insert_reserved)
1706 {
1707 	int ret = 0;
1708 	struct btrfs_fs_info *fs_info = trans->fs_info;
1709 	u64 parent = 0;
1710 	u64 ref_root = 0;
1711 
1712 	trace_run_delayed_tree_ref(trans->fs_info, node);
1713 
1714 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1715 		parent = node->parent;
1716 	ref_root = node->ref_root;
1717 
1718 	if (unlikely(node->ref_mod != 1)) {
1719 		btrfs_err(trans->fs_info,
1720 	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1721 			  node->bytenr, node->ref_mod, node->action, ref_root,
1722 			  parent);
1723 		return -EUCLEAN;
1724 	}
1725 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1726 		struct btrfs_squota_delta delta = {
1727 			.root = href->owning_root,
1728 			.num_bytes = fs_info->nodesize,
1729 			.is_data = false,
1730 			.is_inc = true,
1731 			.generation = trans->transid,
1732 		};
1733 
1734 		ret = alloc_reserved_tree_block(trans, node, extent_op);
1735 		if (!ret)
1736 			btrfs_record_squota_delta(fs_info, &delta);
1737 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1738 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1739 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1740 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1741 	} else {
1742 		BUG();
1743 	}
1744 	return ret;
1745 }
1746 
1747 /* helper function to actually process a single delayed ref entry */
run_one_delayed_ref(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * href,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op,bool insert_reserved)1748 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1749 			       struct btrfs_delayed_ref_head *href,
1750 			       struct btrfs_delayed_ref_node *node,
1751 			       struct btrfs_delayed_extent_op *extent_op,
1752 			       bool insert_reserved)
1753 {
1754 	int ret = 0;
1755 
1756 	if (TRANS_ABORTED(trans)) {
1757 		if (insert_reserved) {
1758 			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1759 			free_head_ref_squota_rsv(trans->fs_info, href);
1760 		}
1761 		return 0;
1762 	}
1763 
1764 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1765 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1766 		ret = run_delayed_tree_ref(trans, href, node, extent_op,
1767 					   insert_reserved);
1768 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1769 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1770 		ret = run_delayed_data_ref(trans, href, node, extent_op,
1771 					   insert_reserved);
1772 	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1773 		ret = 0;
1774 	else
1775 		BUG();
1776 	if (ret && insert_reserved)
1777 		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1778 	if (ret < 0)
1779 		btrfs_err(trans->fs_info,
1780 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1781 			  node->bytenr, node->num_bytes, node->type,
1782 			  node->action, node->ref_mod, ret);
1783 	return ret;
1784 }
1785 
1786 static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head * head)1787 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1788 {
1789 	struct btrfs_delayed_ref_node *ref;
1790 
1791 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1792 		return NULL;
1793 
1794 	/*
1795 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1796 	 * This is to prevent a ref count from going down to zero, which deletes
1797 	 * the extent item from the extent tree, when there still are references
1798 	 * to add, which would fail because they would not find the extent item.
1799 	 */
1800 	if (!list_empty(&head->ref_add_list))
1801 		return list_first_entry(&head->ref_add_list,
1802 				struct btrfs_delayed_ref_node, add_list);
1803 
1804 	ref = rb_entry(rb_first_cached(&head->ref_tree),
1805 		       struct btrfs_delayed_ref_node, ref_node);
1806 	ASSERT(list_empty(&ref->add_list));
1807 	return ref;
1808 }
1809 
unselect_delayed_ref_head(struct btrfs_delayed_ref_root * delayed_refs,struct btrfs_delayed_ref_head * head)1810 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1811 				      struct btrfs_delayed_ref_head *head)
1812 {
1813 	spin_lock(&delayed_refs->lock);
1814 	head->processing = false;
1815 	delayed_refs->num_heads_ready++;
1816 	spin_unlock(&delayed_refs->lock);
1817 	btrfs_delayed_ref_unlock(head);
1818 }
1819 
cleanup_extent_op(struct btrfs_delayed_ref_head * head)1820 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1821 				struct btrfs_delayed_ref_head *head)
1822 {
1823 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1824 
1825 	if (!extent_op)
1826 		return NULL;
1827 
1828 	if (head->must_insert_reserved) {
1829 		head->extent_op = NULL;
1830 		btrfs_free_delayed_extent_op(extent_op);
1831 		return NULL;
1832 	}
1833 	return extent_op;
1834 }
1835 
run_and_cleanup_extent_op(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * head)1836 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1837 				     struct btrfs_delayed_ref_head *head)
1838 {
1839 	struct btrfs_delayed_extent_op *extent_op;
1840 	int ret;
1841 
1842 	extent_op = cleanup_extent_op(head);
1843 	if (!extent_op)
1844 		return 0;
1845 	head->extent_op = NULL;
1846 	spin_unlock(&head->lock);
1847 	ret = run_delayed_extent_op(trans, head, extent_op);
1848 	btrfs_free_delayed_extent_op(extent_op);
1849 	return ret ? ret : 1;
1850 }
1851 
btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info * fs_info,struct btrfs_delayed_ref_root * delayed_refs,struct btrfs_delayed_ref_head * head)1852 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1853 				  struct btrfs_delayed_ref_root *delayed_refs,
1854 				  struct btrfs_delayed_ref_head *head)
1855 {
1856 	u64 ret = 0;
1857 
1858 	/*
1859 	 * We had csum deletions accounted for in our delayed refs rsv, we need
1860 	 * to drop the csum leaves for this update from our delayed_refs_rsv.
1861 	 */
1862 	if (head->total_ref_mod < 0 && head->is_data) {
1863 		int nr_csums;
1864 
1865 		spin_lock(&delayed_refs->lock);
1866 		delayed_refs->pending_csums -= head->num_bytes;
1867 		spin_unlock(&delayed_refs->lock);
1868 		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1869 
1870 		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
1871 
1872 		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
1873 	}
1874 	/* must_insert_reserved can be set only if we didn't run the head ref. */
1875 	if (head->must_insert_reserved)
1876 		free_head_ref_squota_rsv(fs_info, head);
1877 
1878 	return ret;
1879 }
1880 
cleanup_ref_head(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * head,u64 * bytes_released)1881 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1882 			    struct btrfs_delayed_ref_head *head,
1883 			    u64 *bytes_released)
1884 {
1885 
1886 	struct btrfs_fs_info *fs_info = trans->fs_info;
1887 	struct btrfs_delayed_ref_root *delayed_refs;
1888 	int ret;
1889 
1890 	delayed_refs = &trans->transaction->delayed_refs;
1891 
1892 	ret = run_and_cleanup_extent_op(trans, head);
1893 	if (ret < 0) {
1894 		unselect_delayed_ref_head(delayed_refs, head);
1895 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1896 		return ret;
1897 	} else if (ret) {
1898 		return ret;
1899 	}
1900 
1901 	/*
1902 	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1903 	 * and then re-check to make sure nobody got added.
1904 	 */
1905 	spin_unlock(&head->lock);
1906 	spin_lock(&delayed_refs->lock);
1907 	spin_lock(&head->lock);
1908 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1909 		spin_unlock(&head->lock);
1910 		spin_unlock(&delayed_refs->lock);
1911 		return 1;
1912 	}
1913 	btrfs_delete_ref_head(delayed_refs, head);
1914 	spin_unlock(&head->lock);
1915 	spin_unlock(&delayed_refs->lock);
1916 
1917 	if (head->must_insert_reserved) {
1918 		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1919 		if (head->is_data) {
1920 			struct btrfs_root *csum_root;
1921 
1922 			csum_root = btrfs_csum_root(fs_info, head->bytenr);
1923 			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1924 					      head->num_bytes);
1925 		}
1926 	}
1927 
1928 	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1929 
1930 	trace_run_delayed_ref_head(fs_info, head, 0);
1931 	btrfs_delayed_ref_unlock(head);
1932 	btrfs_put_delayed_ref_head(head);
1933 	return ret;
1934 }
1935 
btrfs_obtain_ref_head(struct btrfs_trans_handle * trans)1936 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1937 					struct btrfs_trans_handle *trans)
1938 {
1939 	struct btrfs_delayed_ref_root *delayed_refs =
1940 		&trans->transaction->delayed_refs;
1941 	struct btrfs_delayed_ref_head *head = NULL;
1942 	int ret;
1943 
1944 	spin_lock(&delayed_refs->lock);
1945 	head = btrfs_select_ref_head(delayed_refs);
1946 	if (!head) {
1947 		spin_unlock(&delayed_refs->lock);
1948 		return head;
1949 	}
1950 
1951 	/*
1952 	 * Grab the lock that says we are going to process all the refs for
1953 	 * this head
1954 	 */
1955 	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1956 	spin_unlock(&delayed_refs->lock);
1957 
1958 	/*
1959 	 * We may have dropped the spin lock to get the head mutex lock, and
1960 	 * that might have given someone else time to free the head.  If that's
1961 	 * true, it has been removed from our list and we can move on.
1962 	 */
1963 	if (ret == -EAGAIN)
1964 		head = ERR_PTR(-EAGAIN);
1965 
1966 	return head;
1967 }
1968 
btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * locked_ref,u64 * bytes_released)1969 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1970 					   struct btrfs_delayed_ref_head *locked_ref,
1971 					   u64 *bytes_released)
1972 {
1973 	struct btrfs_fs_info *fs_info = trans->fs_info;
1974 	struct btrfs_delayed_ref_root *delayed_refs;
1975 	struct btrfs_delayed_extent_op *extent_op;
1976 	struct btrfs_delayed_ref_node *ref;
1977 	bool must_insert_reserved;
1978 	int ret;
1979 
1980 	delayed_refs = &trans->transaction->delayed_refs;
1981 
1982 	lockdep_assert_held(&locked_ref->mutex);
1983 	lockdep_assert_held(&locked_ref->lock);
1984 
1985 	while ((ref = select_delayed_ref(locked_ref))) {
1986 		if (ref->seq &&
1987 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1988 			spin_unlock(&locked_ref->lock);
1989 			unselect_delayed_ref_head(delayed_refs, locked_ref);
1990 			return -EAGAIN;
1991 		}
1992 
1993 		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1994 		RB_CLEAR_NODE(&ref->ref_node);
1995 		if (!list_empty(&ref->add_list))
1996 			list_del(&ref->add_list);
1997 		/*
1998 		 * When we play the delayed ref, also correct the ref_mod on
1999 		 * head
2000 		 */
2001 		switch (ref->action) {
2002 		case BTRFS_ADD_DELAYED_REF:
2003 		case BTRFS_ADD_DELAYED_EXTENT:
2004 			locked_ref->ref_mod -= ref->ref_mod;
2005 			break;
2006 		case BTRFS_DROP_DELAYED_REF:
2007 			locked_ref->ref_mod += ref->ref_mod;
2008 			break;
2009 		default:
2010 			WARN_ON(1);
2011 		}
2012 		atomic_dec(&delayed_refs->num_entries);
2013 
2014 		/*
2015 		 * Record the must_insert_reserved flag before we drop the
2016 		 * spin lock.
2017 		 */
2018 		must_insert_reserved = locked_ref->must_insert_reserved;
2019 		/*
2020 		 * Unsetting this on the head ref relinquishes ownership of
2021 		 * the rsv_bytes, so it is critical that every possible code
2022 		 * path from here forward frees all reserves including qgroup
2023 		 * reserve.
2024 		 */
2025 		locked_ref->must_insert_reserved = false;
2026 
2027 		extent_op = locked_ref->extent_op;
2028 		locked_ref->extent_op = NULL;
2029 		spin_unlock(&locked_ref->lock);
2030 
2031 		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
2032 					  must_insert_reserved);
2033 		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
2034 		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
2035 
2036 		btrfs_free_delayed_extent_op(extent_op);
2037 		if (ret) {
2038 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2039 			btrfs_put_delayed_ref(ref);
2040 			return ret;
2041 		}
2042 
2043 		btrfs_put_delayed_ref(ref);
2044 		cond_resched();
2045 
2046 		spin_lock(&locked_ref->lock);
2047 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2048 	}
2049 
2050 	return 0;
2051 }
2052 
2053 /*
2054  * Returns 0 on success or if called with an already aborted transaction.
2055  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2056  */
__btrfs_run_delayed_refs(struct btrfs_trans_handle * trans,u64 min_bytes)2057 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2058 					     u64 min_bytes)
2059 {
2060 	struct btrfs_fs_info *fs_info = trans->fs_info;
2061 	struct btrfs_delayed_ref_root *delayed_refs;
2062 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2063 	int ret;
2064 	unsigned long count = 0;
2065 	unsigned long max_count = 0;
2066 	u64 bytes_processed = 0;
2067 
2068 	delayed_refs = &trans->transaction->delayed_refs;
2069 	if (min_bytes == 0) {
2070 		max_count = delayed_refs->num_heads_ready;
2071 		min_bytes = U64_MAX;
2072 	}
2073 
2074 	do {
2075 		if (!locked_ref) {
2076 			locked_ref = btrfs_obtain_ref_head(trans);
2077 			if (IS_ERR_OR_NULL(locked_ref)) {
2078 				if (PTR_ERR(locked_ref) == -EAGAIN) {
2079 					continue;
2080 				} else {
2081 					break;
2082 				}
2083 			}
2084 			count++;
2085 		}
2086 		/*
2087 		 * We need to try and merge add/drops of the same ref since we
2088 		 * can run into issues with relocate dropping the implicit ref
2089 		 * and then it being added back again before the drop can
2090 		 * finish.  If we merged anything we need to re-loop so we can
2091 		 * get a good ref.
2092 		 * Or we can get node references of the same type that weren't
2093 		 * merged when created due to bumps in the tree mod seq, and
2094 		 * we need to merge them to prevent adding an inline extent
2095 		 * backref before dropping it (triggering a BUG_ON at
2096 		 * insert_inline_extent_backref()).
2097 		 */
2098 		spin_lock(&locked_ref->lock);
2099 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2100 
2101 		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
2102 		if (ret < 0 && ret != -EAGAIN) {
2103 			/*
2104 			 * Error, btrfs_run_delayed_refs_for_head already
2105 			 * unlocked everything so just bail out
2106 			 */
2107 			return ret;
2108 		} else if (!ret) {
2109 			/*
2110 			 * Success, perform the usual cleanup of a processed
2111 			 * head
2112 			 */
2113 			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
2114 			if (ret > 0 ) {
2115 				/* We dropped our lock, we need to loop. */
2116 				ret = 0;
2117 				continue;
2118 			} else if (ret) {
2119 				return ret;
2120 			}
2121 		}
2122 
2123 		/*
2124 		 * Either success case or btrfs_run_delayed_refs_for_head
2125 		 * returned -EAGAIN, meaning we need to select another head
2126 		 */
2127 
2128 		locked_ref = NULL;
2129 		cond_resched();
2130 	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2131 		 (max_count > 0 && count < max_count) ||
2132 		 locked_ref);
2133 
2134 	return 0;
2135 }
2136 
2137 #ifdef SCRAMBLE_DELAYED_REFS
2138 /*
2139  * Normally delayed refs get processed in ascending bytenr order. This
2140  * correlates in most cases to the order added. To expose dependencies on this
2141  * order, we start to process the tree in the middle instead of the beginning
2142  */
find_middle(struct rb_root * root)2143 static u64 find_middle(struct rb_root *root)
2144 {
2145 	struct rb_node *n = root->rb_node;
2146 	struct btrfs_delayed_ref_node *entry;
2147 	int alt = 1;
2148 	u64 middle;
2149 	u64 first = 0, last = 0;
2150 
2151 	n = rb_first(root);
2152 	if (n) {
2153 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2154 		first = entry->bytenr;
2155 	}
2156 	n = rb_last(root);
2157 	if (n) {
2158 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2159 		last = entry->bytenr;
2160 	}
2161 	n = root->rb_node;
2162 
2163 	while (n) {
2164 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2165 		WARN_ON(!entry->in_tree);
2166 
2167 		middle = entry->bytenr;
2168 
2169 		if (alt)
2170 			n = n->rb_left;
2171 		else
2172 			n = n->rb_right;
2173 
2174 		alt = 1 - alt;
2175 	}
2176 	return middle;
2177 }
2178 #endif
2179 
2180 /*
2181  * Start processing the delayed reference count updates and extent insertions
2182  * we have queued up so far.
2183  *
2184  * @trans:	Transaction handle.
2185  * @min_bytes:	How many bytes of delayed references to process. After this
2186  *		many bytes we stop processing delayed references if there are
2187  *		any more. If 0 it means to run all existing delayed references,
2188  *		but not new ones added after running all existing ones.
2189  *		Use (u64)-1 (U64_MAX) to run all existing delayed references
2190  *		plus any new ones that are added.
2191  *
2192  * Returns 0 on success or if called with an aborted transaction
2193  * Returns <0 on error and aborts the transaction
2194  */
btrfs_run_delayed_refs(struct btrfs_trans_handle * trans,u64 min_bytes)2195 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
2196 {
2197 	struct btrfs_fs_info *fs_info = trans->fs_info;
2198 	struct btrfs_delayed_ref_root *delayed_refs;
2199 	int ret;
2200 
2201 	/* We'll clean this up in btrfs_cleanup_transaction */
2202 	if (TRANS_ABORTED(trans))
2203 		return 0;
2204 
2205 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2206 		return 0;
2207 
2208 	delayed_refs = &trans->transaction->delayed_refs;
2209 again:
2210 #ifdef SCRAMBLE_DELAYED_REFS
2211 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2212 #endif
2213 	ret = __btrfs_run_delayed_refs(trans, min_bytes);
2214 	if (ret < 0) {
2215 		btrfs_abort_transaction(trans, ret);
2216 		return ret;
2217 	}
2218 
2219 	if (min_bytes == U64_MAX) {
2220 		btrfs_create_pending_block_groups(trans);
2221 
2222 		spin_lock(&delayed_refs->lock);
2223 		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2224 			spin_unlock(&delayed_refs->lock);
2225 			return 0;
2226 		}
2227 		spin_unlock(&delayed_refs->lock);
2228 
2229 		cond_resched();
2230 		goto again;
2231 	}
2232 
2233 	return 0;
2234 }
2235 
btrfs_set_disk_extent_flags(struct btrfs_trans_handle * trans,struct extent_buffer * eb,u64 flags)2236 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2237 				struct extent_buffer *eb, u64 flags)
2238 {
2239 	struct btrfs_delayed_extent_op *extent_op;
2240 	int ret;
2241 
2242 	extent_op = btrfs_alloc_delayed_extent_op();
2243 	if (!extent_op)
2244 		return -ENOMEM;
2245 
2246 	extent_op->flags_to_set = flags;
2247 	extent_op->update_flags = true;
2248 	extent_op->update_key = false;
2249 
2250 	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
2251 					  btrfs_header_level(eb), extent_op);
2252 	if (ret)
2253 		btrfs_free_delayed_extent_op(extent_op);
2254 	return ret;
2255 }
2256 
check_delayed_ref(struct btrfs_root * root,struct btrfs_path * path,u64 objectid,u64 offset,u64 bytenr)2257 static noinline int check_delayed_ref(struct btrfs_root *root,
2258 				      struct btrfs_path *path,
2259 				      u64 objectid, u64 offset, u64 bytenr)
2260 {
2261 	struct btrfs_delayed_ref_head *head;
2262 	struct btrfs_delayed_ref_node *ref;
2263 	struct btrfs_delayed_ref_root *delayed_refs;
2264 	struct btrfs_transaction *cur_trans;
2265 	struct rb_node *node;
2266 	int ret = 0;
2267 
2268 	spin_lock(&root->fs_info->trans_lock);
2269 	cur_trans = root->fs_info->running_transaction;
2270 	if (cur_trans)
2271 		refcount_inc(&cur_trans->use_count);
2272 	spin_unlock(&root->fs_info->trans_lock);
2273 	if (!cur_trans)
2274 		return 0;
2275 
2276 	delayed_refs = &cur_trans->delayed_refs;
2277 	spin_lock(&delayed_refs->lock);
2278 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2279 	if (!head) {
2280 		spin_unlock(&delayed_refs->lock);
2281 		btrfs_put_transaction(cur_trans);
2282 		return 0;
2283 	}
2284 
2285 	if (!mutex_trylock(&head->mutex)) {
2286 		if (path->nowait) {
2287 			spin_unlock(&delayed_refs->lock);
2288 			btrfs_put_transaction(cur_trans);
2289 			return -EAGAIN;
2290 		}
2291 
2292 		refcount_inc(&head->refs);
2293 		spin_unlock(&delayed_refs->lock);
2294 
2295 		btrfs_release_path(path);
2296 
2297 		/*
2298 		 * Mutex was contended, block until it's released and let
2299 		 * caller try again
2300 		 */
2301 		mutex_lock(&head->mutex);
2302 		mutex_unlock(&head->mutex);
2303 		btrfs_put_delayed_ref_head(head);
2304 		btrfs_put_transaction(cur_trans);
2305 		return -EAGAIN;
2306 	}
2307 	spin_unlock(&delayed_refs->lock);
2308 
2309 	spin_lock(&head->lock);
2310 	/*
2311 	 * XXX: We should replace this with a proper search function in the
2312 	 * future.
2313 	 */
2314 	for (node = rb_first_cached(&head->ref_tree); node;
2315 	     node = rb_next(node)) {
2316 		u64 ref_owner;
2317 		u64 ref_offset;
2318 
2319 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2320 		/* If it's a shared ref we know a cross reference exists */
2321 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2322 			ret = 1;
2323 			break;
2324 		}
2325 
2326 		ref_owner = btrfs_delayed_ref_owner(ref);
2327 		ref_offset = btrfs_delayed_ref_offset(ref);
2328 
2329 		/*
2330 		 * If our ref doesn't match the one we're currently looking at
2331 		 * then we have a cross reference.
2332 		 */
2333 		if (ref->ref_root != btrfs_root_id(root) ||
2334 		    ref_owner != objectid || ref_offset != offset) {
2335 			ret = 1;
2336 			break;
2337 		}
2338 	}
2339 	spin_unlock(&head->lock);
2340 	mutex_unlock(&head->mutex);
2341 	btrfs_put_transaction(cur_trans);
2342 	return ret;
2343 }
2344 
check_committed_ref(struct btrfs_root * root,struct btrfs_path * path,u64 objectid,u64 offset,u64 bytenr,bool strict)2345 static noinline int check_committed_ref(struct btrfs_root *root,
2346 					struct btrfs_path *path,
2347 					u64 objectid, u64 offset, u64 bytenr,
2348 					bool strict)
2349 {
2350 	struct btrfs_fs_info *fs_info = root->fs_info;
2351 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2352 	struct extent_buffer *leaf;
2353 	struct btrfs_extent_data_ref *ref;
2354 	struct btrfs_extent_inline_ref *iref;
2355 	struct btrfs_extent_item *ei;
2356 	struct btrfs_key key;
2357 	u32 item_size;
2358 	u32 expected_size;
2359 	int type;
2360 	int ret;
2361 
2362 	key.objectid = bytenr;
2363 	key.offset = (u64)-1;
2364 	key.type = BTRFS_EXTENT_ITEM_KEY;
2365 
2366 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2367 	if (ret < 0)
2368 		goto out;
2369 	if (ret == 0) {
2370 		/*
2371 		 * Key with offset -1 found, there would have to exist an extent
2372 		 * item with such offset, but this is out of the valid range.
2373 		 */
2374 		ret = -EUCLEAN;
2375 		goto out;
2376 	}
2377 
2378 	ret = -ENOENT;
2379 	if (path->slots[0] == 0)
2380 		goto out;
2381 
2382 	path->slots[0]--;
2383 	leaf = path->nodes[0];
2384 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2385 
2386 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2387 		goto out;
2388 
2389 	ret = 1;
2390 	item_size = btrfs_item_size(leaf, path->slots[0]);
2391 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2392 	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2393 
2394 	/* No inline refs; we need to bail before checking for owner ref. */
2395 	if (item_size == sizeof(*ei))
2396 		goto out;
2397 
2398 	/* Check for an owner ref; skip over it to the real inline refs. */
2399 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2400 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2401 	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2402 		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2403 		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2404 	}
2405 
2406 	/* If extent item has more than 1 inline ref then it's shared */
2407 	if (item_size != expected_size)
2408 		goto out;
2409 
2410 	/*
2411 	 * If extent created before last snapshot => it's shared unless the
2412 	 * snapshot has been deleted. Use the heuristic if strict is false.
2413 	 */
2414 	if (!strict &&
2415 	    (btrfs_extent_generation(leaf, ei) <=
2416 	     btrfs_root_last_snapshot(&root->root_item)))
2417 		goto out;
2418 
2419 	/* If this extent has SHARED_DATA_REF then it's shared */
2420 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2421 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2422 		goto out;
2423 
2424 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2425 	if (btrfs_extent_refs(leaf, ei) !=
2426 	    btrfs_extent_data_ref_count(leaf, ref) ||
2427 	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
2428 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2429 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2430 		goto out;
2431 
2432 	ret = 0;
2433 out:
2434 	return ret;
2435 }
2436 
btrfs_cross_ref_exist(struct btrfs_root * root,u64 objectid,u64 offset,u64 bytenr,bool strict,struct btrfs_path * path)2437 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2438 			  u64 bytenr, bool strict, struct btrfs_path *path)
2439 {
2440 	int ret;
2441 
2442 	do {
2443 		ret = check_committed_ref(root, path, objectid,
2444 					  offset, bytenr, strict);
2445 		if (ret && ret != -ENOENT)
2446 			goto out;
2447 
2448 		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2449 	} while (ret == -EAGAIN && !path->nowait);
2450 
2451 out:
2452 	btrfs_release_path(path);
2453 	if (btrfs_is_data_reloc_root(root))
2454 		WARN_ON(ret > 0);
2455 	return ret;
2456 }
2457 
__btrfs_mod_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct extent_buffer * buf,int full_backref,int inc)2458 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2459 			   struct btrfs_root *root,
2460 			   struct extent_buffer *buf,
2461 			   int full_backref, int inc)
2462 {
2463 	struct btrfs_fs_info *fs_info = root->fs_info;
2464 	u64 parent;
2465 	u64 ref_root;
2466 	u32 nritems;
2467 	struct btrfs_key key;
2468 	struct btrfs_file_extent_item *fi;
2469 	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2470 	int i;
2471 	int action;
2472 	int level;
2473 	int ret = 0;
2474 
2475 	if (btrfs_is_testing(fs_info))
2476 		return 0;
2477 
2478 	ref_root = btrfs_header_owner(buf);
2479 	nritems = btrfs_header_nritems(buf);
2480 	level = btrfs_header_level(buf);
2481 
2482 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2483 		return 0;
2484 
2485 	if (full_backref)
2486 		parent = buf->start;
2487 	else
2488 		parent = 0;
2489 	if (inc)
2490 		action = BTRFS_ADD_DELAYED_REF;
2491 	else
2492 		action = BTRFS_DROP_DELAYED_REF;
2493 
2494 	for (i = 0; i < nritems; i++) {
2495 		struct btrfs_ref ref = {
2496 			.action = action,
2497 			.parent = parent,
2498 			.ref_root = ref_root,
2499 		};
2500 
2501 		if (level == 0) {
2502 			btrfs_item_key_to_cpu(buf, &key, i);
2503 			if (key.type != BTRFS_EXTENT_DATA_KEY)
2504 				continue;
2505 			fi = btrfs_item_ptr(buf, i,
2506 					    struct btrfs_file_extent_item);
2507 			if (btrfs_file_extent_type(buf, fi) ==
2508 			    BTRFS_FILE_EXTENT_INLINE)
2509 				continue;
2510 			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2511 			if (ref.bytenr == 0)
2512 				continue;
2513 
2514 			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2515 			ref.owning_root = ref_root;
2516 
2517 			key.offset -= btrfs_file_extent_offset(buf, fi);
2518 			btrfs_init_data_ref(&ref, key.objectid, key.offset,
2519 					    btrfs_root_id(root), for_reloc);
2520 			if (inc)
2521 				ret = btrfs_inc_extent_ref(trans, &ref);
2522 			else
2523 				ret = btrfs_free_extent(trans, &ref);
2524 			if (ret)
2525 				goto fail;
2526 		} else {
2527 			/* We don't know the owning_root, leave as 0. */
2528 			ref.bytenr = btrfs_node_blockptr(buf, i);
2529 			ref.num_bytes = fs_info->nodesize;
2530 
2531 			btrfs_init_tree_ref(&ref, level - 1,
2532 					    btrfs_root_id(root), for_reloc);
2533 			if (inc)
2534 				ret = btrfs_inc_extent_ref(trans, &ref);
2535 			else
2536 				ret = btrfs_free_extent(trans, &ref);
2537 			if (ret)
2538 				goto fail;
2539 		}
2540 	}
2541 	return 0;
2542 fail:
2543 	return ret;
2544 }
2545 
btrfs_inc_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct extent_buffer * buf,int full_backref)2546 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2547 		  struct extent_buffer *buf, int full_backref)
2548 {
2549 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2550 }
2551 
btrfs_dec_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct extent_buffer * buf,int full_backref)2552 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2553 		  struct extent_buffer *buf, int full_backref)
2554 {
2555 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2556 }
2557 
get_alloc_profile_by_root(struct btrfs_root * root,int data)2558 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2559 {
2560 	struct btrfs_fs_info *fs_info = root->fs_info;
2561 	u64 flags;
2562 	u64 ret;
2563 
2564 	if (data)
2565 		flags = BTRFS_BLOCK_GROUP_DATA;
2566 	else if (root == fs_info->chunk_root)
2567 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2568 	else
2569 		flags = BTRFS_BLOCK_GROUP_METADATA;
2570 
2571 	ret = btrfs_get_alloc_profile(fs_info, flags);
2572 	return ret;
2573 }
2574 
first_logical_byte(struct btrfs_fs_info * fs_info)2575 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2576 {
2577 	struct rb_node *leftmost;
2578 	u64 bytenr = 0;
2579 
2580 	read_lock(&fs_info->block_group_cache_lock);
2581 	/* Get the block group with the lowest logical start address. */
2582 	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2583 	if (leftmost) {
2584 		struct btrfs_block_group *bg;
2585 
2586 		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2587 		bytenr = bg->start;
2588 	}
2589 	read_unlock(&fs_info->block_group_cache_lock);
2590 
2591 	return bytenr;
2592 }
2593 
pin_down_extent(struct btrfs_trans_handle * trans,struct btrfs_block_group * cache,u64 bytenr,u64 num_bytes,int reserved)2594 static int pin_down_extent(struct btrfs_trans_handle *trans,
2595 			   struct btrfs_block_group *cache,
2596 			   u64 bytenr, u64 num_bytes, int reserved)
2597 {
2598 	struct btrfs_fs_info *fs_info = cache->fs_info;
2599 
2600 	spin_lock(&cache->space_info->lock);
2601 	spin_lock(&cache->lock);
2602 	cache->pinned += num_bytes;
2603 	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2604 					     num_bytes);
2605 	if (reserved) {
2606 		cache->reserved -= num_bytes;
2607 		cache->space_info->bytes_reserved -= num_bytes;
2608 	}
2609 	spin_unlock(&cache->lock);
2610 	spin_unlock(&cache->space_info->lock);
2611 
2612 	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2613 		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2614 	return 0;
2615 }
2616 
btrfs_pin_extent(struct btrfs_trans_handle * trans,u64 bytenr,u64 num_bytes,int reserved)2617 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2618 		     u64 bytenr, u64 num_bytes, int reserved)
2619 {
2620 	struct btrfs_block_group *cache;
2621 
2622 	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2623 	BUG_ON(!cache); /* Logic error */
2624 
2625 	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2626 
2627 	btrfs_put_block_group(cache);
2628 	return 0;
2629 }
2630 
btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle * trans,const struct extent_buffer * eb)2631 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2632 				    const struct extent_buffer *eb)
2633 {
2634 	struct btrfs_block_group *cache;
2635 	int ret;
2636 
2637 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2638 	if (!cache)
2639 		return -EINVAL;
2640 
2641 	/*
2642 	 * Fully cache the free space first so that our pin removes the free space
2643 	 * from the cache.
2644 	 */
2645 	ret = btrfs_cache_block_group(cache, true);
2646 	if (ret)
2647 		goto out;
2648 
2649 	pin_down_extent(trans, cache, eb->start, eb->len, 0);
2650 
2651 	/* remove us from the free space cache (if we're there at all) */
2652 	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2653 out:
2654 	btrfs_put_block_group(cache);
2655 	return ret;
2656 }
2657 
__exclude_logged_extent(struct btrfs_fs_info * fs_info,u64 start,u64 num_bytes)2658 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2659 				   u64 start, u64 num_bytes)
2660 {
2661 	int ret;
2662 	struct btrfs_block_group *block_group;
2663 
2664 	block_group = btrfs_lookup_block_group(fs_info, start);
2665 	if (!block_group)
2666 		return -EINVAL;
2667 
2668 	ret = btrfs_cache_block_group(block_group, true);
2669 	if (ret)
2670 		goto out;
2671 
2672 	ret = btrfs_remove_free_space(block_group, start, num_bytes);
2673 out:
2674 	btrfs_put_block_group(block_group);
2675 	return ret;
2676 }
2677 
btrfs_exclude_logged_extents(struct extent_buffer * eb)2678 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2679 {
2680 	struct btrfs_fs_info *fs_info = eb->fs_info;
2681 	struct btrfs_file_extent_item *item;
2682 	struct btrfs_key key;
2683 	int found_type;
2684 	int i;
2685 	int ret = 0;
2686 
2687 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2688 		return 0;
2689 
2690 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2691 		btrfs_item_key_to_cpu(eb, &key, i);
2692 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2693 			continue;
2694 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2695 		found_type = btrfs_file_extent_type(eb, item);
2696 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2697 			continue;
2698 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2699 			continue;
2700 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2701 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2702 		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2703 		if (ret)
2704 			break;
2705 	}
2706 
2707 	return ret;
2708 }
2709 
2710 static void
btrfs_inc_block_group_reservations(struct btrfs_block_group * bg)2711 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2712 {
2713 	atomic_inc(&bg->reservations);
2714 }
2715 
2716 /*
2717  * Returns the free cluster for the given space info and sets empty_cluster to
2718  * what it should be based on the mount options.
2719  */
2720 static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info * fs_info,struct btrfs_space_info * space_info,u64 * empty_cluster)2721 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2722 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2723 {
2724 	struct btrfs_free_cluster *ret = NULL;
2725 
2726 	*empty_cluster = 0;
2727 	if (btrfs_mixed_space_info(space_info))
2728 		return ret;
2729 
2730 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2731 		ret = &fs_info->meta_alloc_cluster;
2732 		if (btrfs_test_opt(fs_info, SSD))
2733 			*empty_cluster = SZ_2M;
2734 		else
2735 			*empty_cluster = SZ_64K;
2736 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2737 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2738 		*empty_cluster = SZ_2M;
2739 		ret = &fs_info->data_alloc_cluster;
2740 	}
2741 
2742 	return ret;
2743 }
2744 
unpin_extent_range(struct btrfs_fs_info * fs_info,u64 start,u64 end,const bool return_free_space)2745 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2746 			      u64 start, u64 end,
2747 			      const bool return_free_space)
2748 {
2749 	struct btrfs_block_group *cache = NULL;
2750 	struct btrfs_space_info *space_info;
2751 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2752 	struct btrfs_free_cluster *cluster = NULL;
2753 	u64 len;
2754 	u64 total_unpinned = 0;
2755 	u64 empty_cluster = 0;
2756 	bool readonly;
2757 	int ret = 0;
2758 
2759 	while (start <= end) {
2760 		readonly = false;
2761 		if (!cache ||
2762 		    start >= cache->start + cache->length) {
2763 			if (cache)
2764 				btrfs_put_block_group(cache);
2765 			total_unpinned = 0;
2766 			cache = btrfs_lookup_block_group(fs_info, start);
2767 			if (cache == NULL) {
2768 				/* Logic error, something removed the block group. */
2769 				ret = -EUCLEAN;
2770 				goto out;
2771 			}
2772 
2773 			cluster = fetch_cluster_info(fs_info,
2774 						     cache->space_info,
2775 						     &empty_cluster);
2776 			empty_cluster <<= 1;
2777 		}
2778 
2779 		len = cache->start + cache->length - start;
2780 		len = min(len, end + 1 - start);
2781 
2782 		if (return_free_space)
2783 			btrfs_add_free_space(cache, start, len);
2784 
2785 		start += len;
2786 		total_unpinned += len;
2787 		space_info = cache->space_info;
2788 
2789 		/*
2790 		 * If this space cluster has been marked as fragmented and we've
2791 		 * unpinned enough in this block group to potentially allow a
2792 		 * cluster to be created inside of it go ahead and clear the
2793 		 * fragmented check.
2794 		 */
2795 		if (cluster && cluster->fragmented &&
2796 		    total_unpinned > empty_cluster) {
2797 			spin_lock(&cluster->lock);
2798 			cluster->fragmented = 0;
2799 			spin_unlock(&cluster->lock);
2800 		}
2801 
2802 		spin_lock(&space_info->lock);
2803 		spin_lock(&cache->lock);
2804 		cache->pinned -= len;
2805 		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2806 		space_info->max_extent_size = 0;
2807 		if (cache->ro) {
2808 			space_info->bytes_readonly += len;
2809 			readonly = true;
2810 		} else if (btrfs_is_zoned(fs_info)) {
2811 			/* Need reset before reusing in a zoned block group */
2812 			btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2813 								    len);
2814 			readonly = true;
2815 		}
2816 		spin_unlock(&cache->lock);
2817 		if (!readonly && return_free_space &&
2818 		    global_rsv->space_info == space_info) {
2819 			spin_lock(&global_rsv->lock);
2820 			if (!global_rsv->full) {
2821 				u64 to_add = min(len, global_rsv->size -
2822 						      global_rsv->reserved);
2823 
2824 				global_rsv->reserved += to_add;
2825 				btrfs_space_info_update_bytes_may_use(fs_info,
2826 						space_info, to_add);
2827 				if (global_rsv->reserved >= global_rsv->size)
2828 					global_rsv->full = 1;
2829 				len -= to_add;
2830 			}
2831 			spin_unlock(&global_rsv->lock);
2832 		}
2833 		/* Add to any tickets we may have */
2834 		if (!readonly && return_free_space && len)
2835 			btrfs_try_granting_tickets(fs_info, space_info);
2836 		spin_unlock(&space_info->lock);
2837 	}
2838 
2839 	if (cache)
2840 		btrfs_put_block_group(cache);
2841 out:
2842 	return ret;
2843 }
2844 
btrfs_finish_extent_commit(struct btrfs_trans_handle * trans)2845 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2846 {
2847 	struct btrfs_fs_info *fs_info = trans->fs_info;
2848 	struct btrfs_block_group *block_group, *tmp;
2849 	struct list_head *deleted_bgs;
2850 	struct extent_io_tree *unpin;
2851 	u64 start;
2852 	u64 end;
2853 	int ret;
2854 
2855 	unpin = &trans->transaction->pinned_extents;
2856 
2857 	while (!TRANS_ABORTED(trans)) {
2858 		struct extent_state *cached_state = NULL;
2859 
2860 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2861 		if (!find_first_extent_bit(unpin, 0, &start, &end,
2862 					   EXTENT_DIRTY, &cached_state)) {
2863 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2864 			break;
2865 		}
2866 
2867 		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2868 			ret = btrfs_discard_extent(fs_info, start,
2869 						   end + 1 - start, NULL);
2870 
2871 		clear_extent_dirty(unpin, start, end, &cached_state);
2872 		ret = unpin_extent_range(fs_info, start, end, true);
2873 		BUG_ON(ret);
2874 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2875 		free_extent_state(cached_state);
2876 		cond_resched();
2877 	}
2878 
2879 	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2880 		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2881 		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2882 	}
2883 
2884 	/*
2885 	 * Transaction is finished.  We don't need the lock anymore.  We
2886 	 * do need to clean up the block groups in case of a transaction
2887 	 * abort.
2888 	 */
2889 	deleted_bgs = &trans->transaction->deleted_bgs;
2890 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2891 		u64 trimmed = 0;
2892 
2893 		ret = -EROFS;
2894 		if (!TRANS_ABORTED(trans))
2895 			ret = btrfs_discard_extent(fs_info,
2896 						   block_group->start,
2897 						   block_group->length,
2898 						   &trimmed);
2899 
2900 		/*
2901 		 * Not strictly necessary to lock, as the block_group should be
2902 		 * read-only from btrfs_delete_unused_bgs().
2903 		 */
2904 		ASSERT(block_group->ro);
2905 		spin_lock(&fs_info->unused_bgs_lock);
2906 		list_del_init(&block_group->bg_list);
2907 		spin_unlock(&fs_info->unused_bgs_lock);
2908 
2909 		btrfs_unfreeze_block_group(block_group);
2910 		btrfs_put_block_group(block_group);
2911 
2912 		if (ret) {
2913 			const char *errstr = btrfs_decode_error(ret);
2914 			btrfs_warn(fs_info,
2915 			   "discard failed while removing blockgroup: errno=%d %s",
2916 				   ret, errstr);
2917 		}
2918 	}
2919 
2920 	return 0;
2921 }
2922 
2923 /*
2924  * Parse an extent item's inline extents looking for a simple quotas owner ref.
2925  *
2926  * @fs_info:	the btrfs_fs_info for this mount
2927  * @leaf:	a leaf in the extent tree containing the extent item
2928  * @slot:	the slot in the leaf where the extent item is found
2929  *
2930  * Returns the objectid of the root that originally allocated the extent item
2931  * if the inline owner ref is expected and present, otherwise 0.
2932  *
2933  * If an extent item has an owner ref item, it will be the first inline ref
2934  * item. Therefore the logic is to check whether there are any inline ref
2935  * items, then check the type of the first one.
2936  */
btrfs_get_extent_owner_root(struct btrfs_fs_info * fs_info,struct extent_buffer * leaf,int slot)2937 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2938 				struct extent_buffer *leaf, int slot)
2939 {
2940 	struct btrfs_extent_item *ei;
2941 	struct btrfs_extent_inline_ref *iref;
2942 	struct btrfs_extent_owner_ref *oref;
2943 	unsigned long ptr;
2944 	unsigned long end;
2945 	int type;
2946 
2947 	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2948 		return 0;
2949 
2950 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2951 	ptr = (unsigned long)(ei + 1);
2952 	end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2953 
2954 	/* No inline ref items of any kind, can't check type. */
2955 	if (ptr == end)
2956 		return 0;
2957 
2958 	iref = (struct btrfs_extent_inline_ref *)ptr;
2959 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2960 
2961 	/* We found an owner ref, get the root out of it. */
2962 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2963 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2964 		return btrfs_extent_owner_ref_root_id(leaf, oref);
2965 	}
2966 
2967 	/* We have inline refs, but not an owner ref. */
2968 	return 0;
2969 }
2970 
do_free_extent_accounting(struct btrfs_trans_handle * trans,u64 bytenr,struct btrfs_squota_delta * delta)2971 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2972 				     u64 bytenr, struct btrfs_squota_delta *delta)
2973 {
2974 	int ret;
2975 	u64 num_bytes = delta->num_bytes;
2976 
2977 	if (delta->is_data) {
2978 		struct btrfs_root *csum_root;
2979 
2980 		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2981 		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2982 		if (ret) {
2983 			btrfs_abort_transaction(trans, ret);
2984 			return ret;
2985 		}
2986 
2987 		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2988 		if (ret) {
2989 			btrfs_abort_transaction(trans, ret);
2990 			return ret;
2991 		}
2992 	}
2993 
2994 	ret = btrfs_record_squota_delta(trans->fs_info, delta);
2995 	if (ret) {
2996 		btrfs_abort_transaction(trans, ret);
2997 		return ret;
2998 	}
2999 
3000 	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3001 	if (ret) {
3002 		btrfs_abort_transaction(trans, ret);
3003 		return ret;
3004 	}
3005 
3006 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
3007 	if (ret)
3008 		btrfs_abort_transaction(trans, ret);
3009 
3010 	return ret;
3011 }
3012 
3013 #define abort_and_dump(trans, path, fmt, args...)	\
3014 ({							\
3015 	btrfs_abort_transaction(trans, -EUCLEAN);	\
3016 	btrfs_print_leaf(path->nodes[0]);		\
3017 	btrfs_crit(trans->fs_info, fmt, ##args);	\
3018 })
3019 
3020 /*
3021  * Drop one or more refs of @node.
3022  *
3023  * 1. Locate the extent refs.
3024  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3025  *    Locate it, then reduce the refs number or remove the ref line completely.
3026  *
3027  * 2. Update the refs count in EXTENT/METADATA_ITEM
3028  *
3029  * Inline backref case:
3030  *
3031  * in extent tree we have:
3032  *
3033  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3034  *		refs 2 gen 6 flags DATA
3035  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3036  *		extent data backref root FS_TREE objectid 257 offset 0 count 1
3037  *
3038  * This function gets called with:
3039  *
3040  *    node->bytenr = 13631488
3041  *    node->num_bytes = 1048576
3042  *    root_objectid = FS_TREE
3043  *    owner_objectid = 257
3044  *    owner_offset = 0
3045  *    refs_to_drop = 1
3046  *
3047  * Then we should get some like:
3048  *
3049  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3050  *		refs 1 gen 6 flags DATA
3051  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3052  *
3053  * Keyed backref case:
3054  *
3055  * in extent tree we have:
3056  *
3057  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3058  *		refs 754 gen 6 flags DATA
3059  *	[...]
3060  *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3061  *		extent data backref root FS_TREE objectid 866 offset 0 count 1
3062  *
3063  * This function get called with:
3064  *
3065  *    node->bytenr = 13631488
3066  *    node->num_bytes = 1048576
3067  *    root_objectid = FS_TREE
3068  *    owner_objectid = 866
3069  *    owner_offset = 0
3070  *    refs_to_drop = 1
3071  *
3072  * Then we should get some like:
3073  *
3074  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3075  *		refs 753 gen 6 flags DATA
3076  *
3077  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3078  */
__btrfs_free_extent(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_head * href,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op)3079 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3080 			       struct btrfs_delayed_ref_head *href,
3081 			       struct btrfs_delayed_ref_node *node,
3082 			       struct btrfs_delayed_extent_op *extent_op)
3083 {
3084 	struct btrfs_fs_info *info = trans->fs_info;
3085 	struct btrfs_key key;
3086 	struct btrfs_path *path;
3087 	struct btrfs_root *extent_root;
3088 	struct extent_buffer *leaf;
3089 	struct btrfs_extent_item *ei;
3090 	struct btrfs_extent_inline_ref *iref;
3091 	int ret;
3092 	int is_data;
3093 	int extent_slot = 0;
3094 	int found_extent = 0;
3095 	int num_to_del = 1;
3096 	int refs_to_drop = node->ref_mod;
3097 	u32 item_size;
3098 	u64 refs;
3099 	u64 bytenr = node->bytenr;
3100 	u64 num_bytes = node->num_bytes;
3101 	u64 owner_objectid = btrfs_delayed_ref_owner(node);
3102 	u64 owner_offset = btrfs_delayed_ref_offset(node);
3103 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3104 	u64 delayed_ref_root = href->owning_root;
3105 
3106 	extent_root = btrfs_extent_root(info, bytenr);
3107 	ASSERT(extent_root);
3108 
3109 	path = btrfs_alloc_path();
3110 	if (!path)
3111 		return -ENOMEM;
3112 
3113 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3114 
3115 	if (!is_data && refs_to_drop != 1) {
3116 		btrfs_crit(info,
3117 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3118 			   node->bytenr, refs_to_drop);
3119 		ret = -EINVAL;
3120 		btrfs_abort_transaction(trans, ret);
3121 		goto out;
3122 	}
3123 
3124 	if (is_data)
3125 		skinny_metadata = false;
3126 
3127 	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3128 				    node->parent, node->ref_root, owner_objectid,
3129 				    owner_offset);
3130 	if (ret == 0) {
3131 		/*
3132 		 * Either the inline backref or the SHARED_DATA_REF/
3133 		 * SHARED_BLOCK_REF is found
3134 		 *
3135 		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3136 		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3137 		 */
3138 		extent_slot = path->slots[0];
3139 		while (extent_slot >= 0) {
3140 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3141 					      extent_slot);
3142 			if (key.objectid != bytenr)
3143 				break;
3144 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3145 			    key.offset == num_bytes) {
3146 				found_extent = 1;
3147 				break;
3148 			}
3149 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3150 			    key.offset == owner_objectid) {
3151 				found_extent = 1;
3152 				break;
3153 			}
3154 
3155 			/* Quick path didn't find the EXTEMT/METADATA_ITEM */
3156 			if (path->slots[0] - extent_slot > 5)
3157 				break;
3158 			extent_slot--;
3159 		}
3160 
3161 		if (!found_extent) {
3162 			if (iref) {
3163 				abort_and_dump(trans, path,
3164 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3165 					   path->slots[0]);
3166 				ret = -EUCLEAN;
3167 				goto out;
3168 			}
3169 			/* Must be SHARED_* item, remove the backref first */
3170 			ret = remove_extent_backref(trans, extent_root, path,
3171 						    NULL, refs_to_drop, is_data);
3172 			if (ret) {
3173 				btrfs_abort_transaction(trans, ret);
3174 				goto out;
3175 			}
3176 			btrfs_release_path(path);
3177 
3178 			/* Slow path to locate EXTENT/METADATA_ITEM */
3179 			key.objectid = bytenr;
3180 			key.type = BTRFS_EXTENT_ITEM_KEY;
3181 			key.offset = num_bytes;
3182 
3183 			if (!is_data && skinny_metadata) {
3184 				key.type = BTRFS_METADATA_ITEM_KEY;
3185 				key.offset = owner_objectid;
3186 			}
3187 
3188 			ret = btrfs_search_slot(trans, extent_root,
3189 						&key, path, -1, 1);
3190 			if (ret > 0 && skinny_metadata && path->slots[0]) {
3191 				/*
3192 				 * Couldn't find our skinny metadata item,
3193 				 * see if we have ye olde extent item.
3194 				 */
3195 				path->slots[0]--;
3196 				btrfs_item_key_to_cpu(path->nodes[0], &key,
3197 						      path->slots[0]);
3198 				if (key.objectid == bytenr &&
3199 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3200 				    key.offset == num_bytes)
3201 					ret = 0;
3202 			}
3203 
3204 			if (ret > 0 && skinny_metadata) {
3205 				skinny_metadata = false;
3206 				key.objectid = bytenr;
3207 				key.type = BTRFS_EXTENT_ITEM_KEY;
3208 				key.offset = num_bytes;
3209 				btrfs_release_path(path);
3210 				ret = btrfs_search_slot(trans, extent_root,
3211 							&key, path, -1, 1);
3212 			}
3213 
3214 			if (ret) {
3215 				if (ret > 0)
3216 					btrfs_print_leaf(path->nodes[0]);
3217 				btrfs_err(info,
3218 			"umm, got %d back from search, was looking for %llu, slot %d",
3219 					  ret, bytenr, path->slots[0]);
3220 			}
3221 			if (ret < 0) {
3222 				btrfs_abort_transaction(trans, ret);
3223 				goto out;
3224 			}
3225 			extent_slot = path->slots[0];
3226 		}
3227 	} else if (WARN_ON(ret == -ENOENT)) {
3228 		abort_and_dump(trans, path,
3229 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3230 			       bytenr, node->parent, node->ref_root, owner_objectid,
3231 			       owner_offset, path->slots[0]);
3232 		goto out;
3233 	} else {
3234 		btrfs_abort_transaction(trans, ret);
3235 		goto out;
3236 	}
3237 
3238 	leaf = path->nodes[0];
3239 	item_size = btrfs_item_size(leaf, extent_slot);
3240 	if (unlikely(item_size < sizeof(*ei))) {
3241 		ret = -EUCLEAN;
3242 		btrfs_err(trans->fs_info,
3243 			  "unexpected extent item size, has %u expect >= %zu",
3244 			  item_size, sizeof(*ei));
3245 		btrfs_abort_transaction(trans, ret);
3246 		goto out;
3247 	}
3248 	ei = btrfs_item_ptr(leaf, extent_slot,
3249 			    struct btrfs_extent_item);
3250 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3251 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3252 		struct btrfs_tree_block_info *bi;
3253 
3254 		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3255 			abort_and_dump(trans, path,
3256 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3257 				       key.objectid, key.type, key.offset,
3258 				       path->slots[0], owner_objectid, item_size,
3259 				       sizeof(*ei) + sizeof(*bi));
3260 			ret = -EUCLEAN;
3261 			goto out;
3262 		}
3263 		bi = (struct btrfs_tree_block_info *)(ei + 1);
3264 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3265 	}
3266 
3267 	refs = btrfs_extent_refs(leaf, ei);
3268 	if (refs < refs_to_drop) {
3269 		abort_and_dump(trans, path,
3270 		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3271 			       refs_to_drop, refs, bytenr, path->slots[0]);
3272 		ret = -EUCLEAN;
3273 		goto out;
3274 	}
3275 	refs -= refs_to_drop;
3276 
3277 	if (refs > 0) {
3278 		if (extent_op)
3279 			__run_delayed_extent_op(extent_op, leaf, ei);
3280 		/*
3281 		 * In the case of inline back ref, reference count will
3282 		 * be updated by remove_extent_backref
3283 		 */
3284 		if (iref) {
3285 			if (!found_extent) {
3286 				abort_and_dump(trans, path,
3287 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3288 					       path->slots[0]);
3289 				ret = -EUCLEAN;
3290 				goto out;
3291 			}
3292 		} else {
3293 			btrfs_set_extent_refs(leaf, ei, refs);
3294 			btrfs_mark_buffer_dirty(trans, leaf);
3295 		}
3296 		if (found_extent) {
3297 			ret = remove_extent_backref(trans, extent_root, path,
3298 						    iref, refs_to_drop, is_data);
3299 			if (ret) {
3300 				btrfs_abort_transaction(trans, ret);
3301 				goto out;
3302 			}
3303 		}
3304 	} else {
3305 		struct btrfs_squota_delta delta = {
3306 			.root = delayed_ref_root,
3307 			.num_bytes = num_bytes,
3308 			.is_data = is_data,
3309 			.is_inc = false,
3310 			.generation = btrfs_extent_generation(leaf, ei),
3311 		};
3312 
3313 		/* In this branch refs == 1 */
3314 		if (found_extent) {
3315 			if (is_data && refs_to_drop !=
3316 			    extent_data_ref_count(path, iref)) {
3317 				abort_and_dump(trans, path,
3318 		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3319 					       extent_data_ref_count(path, iref),
3320 					       refs_to_drop, path->slots[0]);
3321 				ret = -EUCLEAN;
3322 				goto out;
3323 			}
3324 			if (iref) {
3325 				if (path->slots[0] != extent_slot) {
3326 					abort_and_dump(trans, path,
3327 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3328 						       key.objectid, key.type,
3329 						       key.offset, path->slots[0]);
3330 					ret = -EUCLEAN;
3331 					goto out;
3332 				}
3333 			} else {
3334 				/*
3335 				 * No inline ref, we must be at SHARED_* item,
3336 				 * And it's single ref, it must be:
3337 				 * |	extent_slot	  ||extent_slot + 1|
3338 				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3339 				 */
3340 				if (path->slots[0] != extent_slot + 1) {
3341 					abort_and_dump(trans, path,
3342 	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3343 						       path->slots[0]);
3344 					ret = -EUCLEAN;
3345 					goto out;
3346 				}
3347 				path->slots[0] = extent_slot;
3348 				num_to_del = 2;
3349 			}
3350 		}
3351 		/*
3352 		 * We can't infer the data owner from the delayed ref, so we need
3353 		 * to try to get it from the owning ref item.
3354 		 *
3355 		 * If it is not present, then that extent was not written under
3356 		 * simple quotas mode, so we don't need to account for its deletion.
3357 		 */
3358 		if (is_data)
3359 			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3360 								 leaf, extent_slot);
3361 
3362 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3363 				      num_to_del);
3364 		if (ret) {
3365 			btrfs_abort_transaction(trans, ret);
3366 			goto out;
3367 		}
3368 		btrfs_release_path(path);
3369 
3370 		ret = do_free_extent_accounting(trans, bytenr, &delta);
3371 	}
3372 	btrfs_release_path(path);
3373 
3374 out:
3375 	btrfs_free_path(path);
3376 	return ret;
3377 }
3378 
3379 /*
3380  * when we free an block, it is possible (and likely) that we free the last
3381  * delayed ref for that extent as well.  This searches the delayed ref tree for
3382  * a given extent, and if there are no other delayed refs to be processed, it
3383  * removes it from the tree.
3384  */
check_ref_cleanup(struct btrfs_trans_handle * trans,u64 bytenr)3385 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3386 				      u64 bytenr)
3387 {
3388 	struct btrfs_delayed_ref_head *head;
3389 	struct btrfs_delayed_ref_root *delayed_refs;
3390 	int ret = 0;
3391 
3392 	delayed_refs = &trans->transaction->delayed_refs;
3393 	spin_lock(&delayed_refs->lock);
3394 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3395 	if (!head)
3396 		goto out_delayed_unlock;
3397 
3398 	spin_lock(&head->lock);
3399 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3400 		goto out;
3401 
3402 	if (cleanup_extent_op(head) != NULL)
3403 		goto out;
3404 
3405 	/*
3406 	 * waiting for the lock here would deadlock.  If someone else has it
3407 	 * locked they are already in the process of dropping it anyway
3408 	 */
3409 	if (!mutex_trylock(&head->mutex))
3410 		goto out;
3411 
3412 	btrfs_delete_ref_head(delayed_refs, head);
3413 	head->processing = false;
3414 
3415 	spin_unlock(&head->lock);
3416 	spin_unlock(&delayed_refs->lock);
3417 
3418 	BUG_ON(head->extent_op);
3419 	if (head->must_insert_reserved)
3420 		ret = 1;
3421 
3422 	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3423 	mutex_unlock(&head->mutex);
3424 	btrfs_put_delayed_ref_head(head);
3425 	return ret;
3426 out:
3427 	spin_unlock(&head->lock);
3428 
3429 out_delayed_unlock:
3430 	spin_unlock(&delayed_refs->lock);
3431 	return 0;
3432 }
3433 
btrfs_free_tree_block(struct btrfs_trans_handle * trans,u64 root_id,struct extent_buffer * buf,u64 parent,int last_ref)3434 int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3435 			  u64 root_id,
3436 			  struct extent_buffer *buf,
3437 			  u64 parent, int last_ref)
3438 {
3439 	struct btrfs_fs_info *fs_info = trans->fs_info;
3440 	struct btrfs_block_group *bg;
3441 	int ret;
3442 
3443 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3444 		struct btrfs_ref generic_ref = {
3445 			.action = BTRFS_DROP_DELAYED_REF,
3446 			.bytenr = buf->start,
3447 			.num_bytes = buf->len,
3448 			.parent = parent,
3449 			.owning_root = btrfs_header_owner(buf),
3450 			.ref_root = root_id,
3451 		};
3452 
3453 		/*
3454 		 * Assert that the extent buffer is not cleared due to
3455 		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
3456 		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3457 		 * detail.
3458 		 */
3459 		ASSERT(btrfs_header_bytenr(buf) != 0);
3460 
3461 		btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
3462 		btrfs_ref_tree_mod(fs_info, &generic_ref);
3463 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3464 		if (ret < 0)
3465 			return ret;
3466 	}
3467 
3468 	if (!last_ref)
3469 		return 0;
3470 
3471 	if (btrfs_header_generation(buf) != trans->transid)
3472 		goto out;
3473 
3474 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3475 		ret = check_ref_cleanup(trans, buf->start);
3476 		if (!ret)
3477 			goto out;
3478 	}
3479 
3480 	bg = btrfs_lookup_block_group(fs_info, buf->start);
3481 
3482 	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3483 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3484 		btrfs_put_block_group(bg);
3485 		goto out;
3486 	}
3487 
3488 	/*
3489 	 * If there are tree mod log users we may have recorded mod log
3490 	 * operations for this node.  If we re-allocate this node we
3491 	 * could replay operations on this node that happened when it
3492 	 * existed in a completely different root.  For example if it
3493 	 * was part of root A, then was reallocated to root B, and we
3494 	 * are doing a btrfs_old_search_slot(root b), we could replay
3495 	 * operations that happened when the block was part of root A,
3496 	 * giving us an inconsistent view of the btree.
3497 	 *
3498 	 * We are safe from races here because at this point no other
3499 	 * node or root points to this extent buffer, so if after this
3500 	 * check a new tree mod log user joins we will not have an
3501 	 * existing log of operations on this node that we have to
3502 	 * contend with.
3503 	 */
3504 
3505 	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3506 		     || btrfs_is_zoned(fs_info)) {
3507 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3508 		btrfs_put_block_group(bg);
3509 		goto out;
3510 	}
3511 
3512 	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3513 
3514 	btrfs_add_free_space(bg, buf->start, buf->len);
3515 	btrfs_free_reserved_bytes(bg, buf->len, 0);
3516 	btrfs_put_block_group(bg);
3517 	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3518 
3519 out:
3520 
3521 	/*
3522 	 * Deleting the buffer, clear the corrupt flag since it doesn't
3523 	 * matter anymore.
3524 	 */
3525 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3526 	return 0;
3527 }
3528 
3529 /* Can return -ENOMEM */
btrfs_free_extent(struct btrfs_trans_handle * trans,struct btrfs_ref * ref)3530 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3531 {
3532 	struct btrfs_fs_info *fs_info = trans->fs_info;
3533 	int ret;
3534 
3535 	if (btrfs_is_testing(fs_info))
3536 		return 0;
3537 
3538 	/*
3539 	 * tree log blocks never actually go into the extent allocation
3540 	 * tree, just update pinning info and exit early.
3541 	 */
3542 	if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
3543 		btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
3544 		ret = 0;
3545 	} else if (ref->type == BTRFS_REF_METADATA) {
3546 		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3547 	} else {
3548 		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3549 	}
3550 
3551 	if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
3552 		btrfs_ref_tree_mod(fs_info, ref);
3553 
3554 	return ret;
3555 }
3556 
3557 enum btrfs_loop_type {
3558 	/*
3559 	 * Start caching block groups but do not wait for progress or for them
3560 	 * to be done.
3561 	 */
3562 	LOOP_CACHING_NOWAIT,
3563 
3564 	/*
3565 	 * Wait for the block group free_space >= the space we're waiting for if
3566 	 * the block group isn't cached.
3567 	 */
3568 	LOOP_CACHING_WAIT,
3569 
3570 	/*
3571 	 * Allow allocations to happen from block groups that do not yet have a
3572 	 * size classification.
3573 	 */
3574 	LOOP_UNSET_SIZE_CLASS,
3575 
3576 	/*
3577 	 * Allocate a chunk and then retry the allocation.
3578 	 */
3579 	LOOP_ALLOC_CHUNK,
3580 
3581 	/*
3582 	 * Ignore the size class restrictions for this allocation.
3583 	 */
3584 	LOOP_WRONG_SIZE_CLASS,
3585 
3586 	/*
3587 	 * Ignore the empty size, only try to allocate the number of bytes
3588 	 * needed for this allocation.
3589 	 */
3590 	LOOP_NO_EMPTY_SIZE,
3591 };
3592 
3593 static inline void
btrfs_lock_block_group(struct btrfs_block_group * cache,int delalloc)3594 btrfs_lock_block_group(struct btrfs_block_group *cache,
3595 		       int delalloc)
3596 {
3597 	if (delalloc)
3598 		down_read(&cache->data_rwsem);
3599 }
3600 
btrfs_grab_block_group(struct btrfs_block_group * cache,int delalloc)3601 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3602 		       int delalloc)
3603 {
3604 	btrfs_get_block_group(cache);
3605 	if (delalloc)
3606 		down_read(&cache->data_rwsem);
3607 }
3608 
btrfs_lock_cluster(struct btrfs_block_group * block_group,struct btrfs_free_cluster * cluster,int delalloc)3609 static struct btrfs_block_group *btrfs_lock_cluster(
3610 		   struct btrfs_block_group *block_group,
3611 		   struct btrfs_free_cluster *cluster,
3612 		   int delalloc)
3613 	__acquires(&cluster->refill_lock)
3614 {
3615 	struct btrfs_block_group *used_bg = NULL;
3616 
3617 	spin_lock(&cluster->refill_lock);
3618 	while (1) {
3619 		used_bg = cluster->block_group;
3620 		if (!used_bg)
3621 			return NULL;
3622 
3623 		if (used_bg == block_group)
3624 			return used_bg;
3625 
3626 		btrfs_get_block_group(used_bg);
3627 
3628 		if (!delalloc)
3629 			return used_bg;
3630 
3631 		if (down_read_trylock(&used_bg->data_rwsem))
3632 			return used_bg;
3633 
3634 		spin_unlock(&cluster->refill_lock);
3635 
3636 		/* We should only have one-level nested. */
3637 		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3638 
3639 		spin_lock(&cluster->refill_lock);
3640 		if (used_bg == cluster->block_group)
3641 			return used_bg;
3642 
3643 		up_read(&used_bg->data_rwsem);
3644 		btrfs_put_block_group(used_bg);
3645 	}
3646 }
3647 
3648 static inline void
btrfs_release_block_group(struct btrfs_block_group * cache,int delalloc)3649 btrfs_release_block_group(struct btrfs_block_group *cache,
3650 			 int delalloc)
3651 {
3652 	if (delalloc)
3653 		up_read(&cache->data_rwsem);
3654 	btrfs_put_block_group(cache);
3655 }
3656 
find_free_extent_check_size_class(const struct find_free_extent_ctl * ffe_ctl,const struct btrfs_block_group * bg)3657 static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
3658 					      const struct btrfs_block_group *bg)
3659 {
3660 	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
3661 		return true;
3662 	if (!btrfs_block_group_should_use_size_class(bg))
3663 		return true;
3664 	if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
3665 		return true;
3666 	if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
3667 	    bg->size_class == BTRFS_BG_SZ_NONE)
3668 		return true;
3669 	return ffe_ctl->size_class == bg->size_class;
3670 }
3671 
3672 /*
3673  * Helper function for find_free_extent().
3674  *
3675  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3676  * Return >0 to inform caller that we find nothing
3677  * Return 0 means we have found a location and set ffe_ctl->found_offset.
3678  */
find_free_extent_clustered(struct btrfs_block_group * bg,struct find_free_extent_ctl * ffe_ctl,struct btrfs_block_group ** cluster_bg_ret)3679 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3680 				      struct find_free_extent_ctl *ffe_ctl,
3681 				      struct btrfs_block_group **cluster_bg_ret)
3682 {
3683 	struct btrfs_block_group *cluster_bg;
3684 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3685 	u64 aligned_cluster;
3686 	u64 offset;
3687 	int ret;
3688 
3689 	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3690 	if (!cluster_bg)
3691 		goto refill_cluster;
3692 	if (cluster_bg != bg && (cluster_bg->ro ||
3693 	    !block_group_bits(cluster_bg, ffe_ctl->flags) ||
3694 	    !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
3695 		goto release_cluster;
3696 
3697 	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3698 			ffe_ctl->num_bytes, cluster_bg->start,
3699 			&ffe_ctl->max_extent_size);
3700 	if (offset) {
3701 		/* We have a block, we're done */
3702 		spin_unlock(&last_ptr->refill_lock);
3703 		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3704 		*cluster_bg_ret = cluster_bg;
3705 		ffe_ctl->found_offset = offset;
3706 		return 0;
3707 	}
3708 	WARN_ON(last_ptr->block_group != cluster_bg);
3709 
3710 release_cluster:
3711 	/*
3712 	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3713 	 * lets just skip it and let the allocator find whatever block it can
3714 	 * find. If we reach this point, we will have tried the cluster
3715 	 * allocator plenty of times and not have found anything, so we are
3716 	 * likely way too fragmented for the clustering stuff to find anything.
3717 	 *
3718 	 * However, if the cluster is taken from the current block group,
3719 	 * release the cluster first, so that we stand a better chance of
3720 	 * succeeding in the unclustered allocation.
3721 	 */
3722 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3723 		spin_unlock(&last_ptr->refill_lock);
3724 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3725 		return -ENOENT;
3726 	}
3727 
3728 	/* This cluster didn't work out, free it and start over */
3729 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3730 
3731 	if (cluster_bg != bg)
3732 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3733 
3734 refill_cluster:
3735 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3736 		spin_unlock(&last_ptr->refill_lock);
3737 		return -ENOENT;
3738 	}
3739 
3740 	aligned_cluster = max_t(u64,
3741 			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3742 			bg->full_stripe_len);
3743 	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3744 			ffe_ctl->num_bytes, aligned_cluster);
3745 	if (ret == 0) {
3746 		/* Now pull our allocation out of this cluster */
3747 		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3748 				ffe_ctl->num_bytes, ffe_ctl->search_start,
3749 				&ffe_ctl->max_extent_size);
3750 		if (offset) {
3751 			/* We found one, proceed */
3752 			spin_unlock(&last_ptr->refill_lock);
3753 			ffe_ctl->found_offset = offset;
3754 			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3755 			return 0;
3756 		}
3757 	}
3758 	/*
3759 	 * At this point we either didn't find a cluster or we weren't able to
3760 	 * allocate a block from our cluster.  Free the cluster we've been
3761 	 * trying to use, and go to the next block group.
3762 	 */
3763 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3764 	spin_unlock(&last_ptr->refill_lock);
3765 	return 1;
3766 }
3767 
3768 /*
3769  * Return >0 to inform caller that we find nothing
3770  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3771  */
find_free_extent_unclustered(struct btrfs_block_group * bg,struct find_free_extent_ctl * ffe_ctl)3772 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3773 					struct find_free_extent_ctl *ffe_ctl)
3774 {
3775 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3776 	u64 offset;
3777 
3778 	/*
3779 	 * We are doing an unclustered allocation, set the fragmented flag so
3780 	 * we don't bother trying to setup a cluster again until we get more
3781 	 * space.
3782 	 */
3783 	if (unlikely(last_ptr)) {
3784 		spin_lock(&last_ptr->lock);
3785 		last_ptr->fragmented = 1;
3786 		spin_unlock(&last_ptr->lock);
3787 	}
3788 	if (ffe_ctl->cached) {
3789 		struct btrfs_free_space_ctl *free_space_ctl;
3790 
3791 		free_space_ctl = bg->free_space_ctl;
3792 		spin_lock(&free_space_ctl->tree_lock);
3793 		if (free_space_ctl->free_space <
3794 		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3795 		    ffe_ctl->empty_size) {
3796 			ffe_ctl->total_free_space = max_t(u64,
3797 					ffe_ctl->total_free_space,
3798 					free_space_ctl->free_space);
3799 			spin_unlock(&free_space_ctl->tree_lock);
3800 			return 1;
3801 		}
3802 		spin_unlock(&free_space_ctl->tree_lock);
3803 	}
3804 
3805 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3806 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3807 			&ffe_ctl->max_extent_size);
3808 	if (!offset)
3809 		return 1;
3810 	ffe_ctl->found_offset = offset;
3811 	return 0;
3812 }
3813 
do_allocation_clustered(struct btrfs_block_group * block_group,struct find_free_extent_ctl * ffe_ctl,struct btrfs_block_group ** bg_ret)3814 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3815 				   struct find_free_extent_ctl *ffe_ctl,
3816 				   struct btrfs_block_group **bg_ret)
3817 {
3818 	int ret;
3819 
3820 	/* We want to try and use the cluster allocator, so lets look there */
3821 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3822 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3823 		if (ret >= 0)
3824 			return ret;
3825 		/* ret == -ENOENT case falls through */
3826 	}
3827 
3828 	return find_free_extent_unclustered(block_group, ffe_ctl);
3829 }
3830 
3831 /*
3832  * Tree-log block group locking
3833  * ============================
3834  *
3835  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3836  * indicates the starting address of a block group, which is reserved only
3837  * for tree-log metadata.
3838  *
3839  * Lock nesting
3840  * ============
3841  *
3842  * space_info::lock
3843  *   block_group::lock
3844  *     fs_info::treelog_bg_lock
3845  */
3846 
3847 /*
3848  * Simple allocator for sequential-only block group. It only allows sequential
3849  * allocation. No need to play with trees. This function also reserves the
3850  * bytes as in btrfs_add_reserved_bytes.
3851  */
do_allocation_zoned(struct btrfs_block_group * block_group,struct find_free_extent_ctl * ffe_ctl,struct btrfs_block_group ** bg_ret)3852 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3853 			       struct find_free_extent_ctl *ffe_ctl,
3854 			       struct btrfs_block_group **bg_ret)
3855 {
3856 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3857 	struct btrfs_space_info *space_info = block_group->space_info;
3858 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3859 	u64 start = block_group->start;
3860 	u64 num_bytes = ffe_ctl->num_bytes;
3861 	u64 avail;
3862 	u64 bytenr = block_group->start;
3863 	u64 log_bytenr;
3864 	u64 data_reloc_bytenr;
3865 	int ret = 0;
3866 	bool skip = false;
3867 
3868 	ASSERT(btrfs_is_zoned(block_group->fs_info));
3869 
3870 	/*
3871 	 * Do not allow non-tree-log blocks in the dedicated tree-log block
3872 	 * group, and vice versa.
3873 	 */
3874 	spin_lock(&fs_info->treelog_bg_lock);
3875 	log_bytenr = fs_info->treelog_bg;
3876 	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3877 			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3878 		skip = true;
3879 	spin_unlock(&fs_info->treelog_bg_lock);
3880 	if (skip)
3881 		return 1;
3882 
3883 	/*
3884 	 * Do not allow non-relocation blocks in the dedicated relocation block
3885 	 * group, and vice versa.
3886 	 */
3887 	spin_lock(&fs_info->relocation_bg_lock);
3888 	data_reloc_bytenr = fs_info->data_reloc_bg;
3889 	if (data_reloc_bytenr &&
3890 	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3891 	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3892 		skip = true;
3893 	spin_unlock(&fs_info->relocation_bg_lock);
3894 	if (skip)
3895 		return 1;
3896 
3897 	/* Check RO and no space case before trying to activate it */
3898 	spin_lock(&block_group->lock);
3899 	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3900 		ret = 1;
3901 		/*
3902 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3903 		 * Return the error after taking the locks.
3904 		 */
3905 	}
3906 	spin_unlock(&block_group->lock);
3907 
3908 	/* Metadata block group is activated at write time. */
3909 	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3910 	    !btrfs_zone_activate(block_group)) {
3911 		ret = 1;
3912 		/*
3913 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3914 		 * Return the error after taking the locks.
3915 		 */
3916 	}
3917 
3918 	spin_lock(&space_info->lock);
3919 	spin_lock(&block_group->lock);
3920 	spin_lock(&fs_info->treelog_bg_lock);
3921 	spin_lock(&fs_info->relocation_bg_lock);
3922 
3923 	if (ret)
3924 		goto out;
3925 
3926 	ASSERT(!ffe_ctl->for_treelog ||
3927 	       block_group->start == fs_info->treelog_bg ||
3928 	       fs_info->treelog_bg == 0);
3929 	ASSERT(!ffe_ctl->for_data_reloc ||
3930 	       block_group->start == fs_info->data_reloc_bg ||
3931 	       fs_info->data_reloc_bg == 0);
3932 
3933 	if (block_group->ro ||
3934 	    (!ffe_ctl->for_data_reloc &&
3935 	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3936 		ret = 1;
3937 		goto out;
3938 	}
3939 
3940 	/*
3941 	 * Do not allow currently using block group to be tree-log dedicated
3942 	 * block group.
3943 	 */
3944 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3945 	    (block_group->used || block_group->reserved)) {
3946 		ret = 1;
3947 		goto out;
3948 	}
3949 
3950 	/*
3951 	 * Do not allow currently used block group to be the data relocation
3952 	 * dedicated block group.
3953 	 */
3954 	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3955 	    (block_group->used || block_group->reserved)) {
3956 		ret = 1;
3957 		goto out;
3958 	}
3959 
3960 	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3961 	avail = block_group->zone_capacity - block_group->alloc_offset;
3962 	if (avail < num_bytes) {
3963 		if (ffe_ctl->max_extent_size < avail) {
3964 			/*
3965 			 * With sequential allocator, free space is always
3966 			 * contiguous
3967 			 */
3968 			ffe_ctl->max_extent_size = avail;
3969 			ffe_ctl->total_free_space = avail;
3970 		}
3971 		ret = 1;
3972 		goto out;
3973 	}
3974 
3975 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3976 		fs_info->treelog_bg = block_group->start;
3977 
3978 	if (ffe_ctl->for_data_reloc) {
3979 		if (!fs_info->data_reloc_bg)
3980 			fs_info->data_reloc_bg = block_group->start;
3981 		/*
3982 		 * Do not allow allocations from this block group, unless it is
3983 		 * for data relocation. Compared to increasing the ->ro, setting
3984 		 * the ->zoned_data_reloc_ongoing flag still allows nocow
3985 		 * writers to come in. See btrfs_inc_nocow_writers().
3986 		 *
3987 		 * We need to disable an allocation to avoid an allocation of
3988 		 * regular (non-relocation data) extent. With mix of relocation
3989 		 * extents and regular extents, we can dispatch WRITE commands
3990 		 * (for relocation extents) and ZONE APPEND commands (for
3991 		 * regular extents) at the same time to the same zone, which
3992 		 * easily break the write pointer.
3993 		 *
3994 		 * Also, this flag avoids this block group to be zone finished.
3995 		 */
3996 		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3997 	}
3998 
3999 	ffe_ctl->found_offset = start + block_group->alloc_offset;
4000 	block_group->alloc_offset += num_bytes;
4001 	spin_lock(&ctl->tree_lock);
4002 	ctl->free_space -= num_bytes;
4003 	spin_unlock(&ctl->tree_lock);
4004 
4005 	/*
4006 	 * We do not check if found_offset is aligned to stripesize. The
4007 	 * address is anyway rewritten when using zone append writing.
4008 	 */
4009 
4010 	ffe_ctl->search_start = ffe_ctl->found_offset;
4011 
4012 out:
4013 	if (ret && ffe_ctl->for_treelog)
4014 		fs_info->treelog_bg = 0;
4015 	if (ret && ffe_ctl->for_data_reloc)
4016 		fs_info->data_reloc_bg = 0;
4017 	spin_unlock(&fs_info->relocation_bg_lock);
4018 	spin_unlock(&fs_info->treelog_bg_lock);
4019 	spin_unlock(&block_group->lock);
4020 	spin_unlock(&space_info->lock);
4021 	return ret;
4022 }
4023 
do_allocation(struct btrfs_block_group * block_group,struct find_free_extent_ctl * ffe_ctl,struct btrfs_block_group ** bg_ret)4024 static int do_allocation(struct btrfs_block_group *block_group,
4025 			 struct find_free_extent_ctl *ffe_ctl,
4026 			 struct btrfs_block_group **bg_ret)
4027 {
4028 	switch (ffe_ctl->policy) {
4029 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4030 		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
4031 	case BTRFS_EXTENT_ALLOC_ZONED:
4032 		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
4033 	default:
4034 		BUG();
4035 	}
4036 }
4037 
release_block_group(struct btrfs_block_group * block_group,struct find_free_extent_ctl * ffe_ctl,int delalloc)4038 static void release_block_group(struct btrfs_block_group *block_group,
4039 				struct find_free_extent_ctl *ffe_ctl,
4040 				int delalloc)
4041 {
4042 	switch (ffe_ctl->policy) {
4043 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4044 		ffe_ctl->retry_uncached = false;
4045 		break;
4046 	case BTRFS_EXTENT_ALLOC_ZONED:
4047 		/* Nothing to do */
4048 		break;
4049 	default:
4050 		BUG();
4051 	}
4052 
4053 	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4054 	       ffe_ctl->index);
4055 	btrfs_release_block_group(block_group, delalloc);
4056 }
4057 
found_extent_clustered(struct find_free_extent_ctl * ffe_ctl,struct btrfs_key * ins)4058 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4059 				   struct btrfs_key *ins)
4060 {
4061 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4062 
4063 	if (!ffe_ctl->use_cluster && last_ptr) {
4064 		spin_lock(&last_ptr->lock);
4065 		last_ptr->window_start = ins->objectid;
4066 		spin_unlock(&last_ptr->lock);
4067 	}
4068 }
4069 
found_extent(struct find_free_extent_ctl * ffe_ctl,struct btrfs_key * ins)4070 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4071 			 struct btrfs_key *ins)
4072 {
4073 	switch (ffe_ctl->policy) {
4074 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4075 		found_extent_clustered(ffe_ctl, ins);
4076 		break;
4077 	case BTRFS_EXTENT_ALLOC_ZONED:
4078 		/* Nothing to do */
4079 		break;
4080 	default:
4081 		BUG();
4082 	}
4083 }
4084 
can_allocate_chunk_zoned(struct btrfs_fs_info * fs_info,struct find_free_extent_ctl * ffe_ctl)4085 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4086 				    struct find_free_extent_ctl *ffe_ctl)
4087 {
4088 	/* Block group's activeness is not a requirement for METADATA block groups. */
4089 	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4090 		return 0;
4091 
4092 	/* If we can activate new zone, just allocate a chunk and use it */
4093 	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4094 		return 0;
4095 
4096 	/*
4097 	 * We already reached the max active zones. Try to finish one block
4098 	 * group to make a room for a new block group. This is only possible
4099 	 * for a data block group because btrfs_zone_finish() may need to wait
4100 	 * for a running transaction which can cause a deadlock for metadata
4101 	 * allocation.
4102 	 */
4103 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4104 		int ret = btrfs_zone_finish_one_bg(fs_info);
4105 
4106 		if (ret == 1)
4107 			return 0;
4108 		else if (ret < 0)
4109 			return ret;
4110 	}
4111 
4112 	/*
4113 	 * If we have enough free space left in an already active block group
4114 	 * and we can't activate any other zone now, do not allow allocating a
4115 	 * new chunk and let find_free_extent() retry with a smaller size.
4116 	 */
4117 	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4118 		return -ENOSPC;
4119 
4120 	/*
4121 	 * Even min_alloc_size is not left in any block groups. Since we cannot
4122 	 * activate a new block group, allocating it may not help. Let's tell a
4123 	 * caller to try again and hope it progress something by writing some
4124 	 * parts of the region. That is only possible for data block groups,
4125 	 * where a part of the region can be written.
4126 	 */
4127 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4128 		return -EAGAIN;
4129 
4130 	/*
4131 	 * We cannot activate a new block group and no enough space left in any
4132 	 * block groups. So, allocating a new block group may not help. But,
4133 	 * there is nothing to do anyway, so let's go with it.
4134 	 */
4135 	return 0;
4136 }
4137 
can_allocate_chunk(struct btrfs_fs_info * fs_info,struct find_free_extent_ctl * ffe_ctl)4138 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4139 			      struct find_free_extent_ctl *ffe_ctl)
4140 {
4141 	switch (ffe_ctl->policy) {
4142 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4143 		return 0;
4144 	case BTRFS_EXTENT_ALLOC_ZONED:
4145 		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4146 	default:
4147 		BUG();
4148 	}
4149 }
4150 
4151 /*
4152  * Return >0 means caller needs to re-search for free extent
4153  * Return 0 means we have the needed free extent.
4154  * Return <0 means we failed to locate any free extent.
4155  */
find_free_extent_update_loop(struct btrfs_fs_info * fs_info,struct btrfs_key * ins,struct find_free_extent_ctl * ffe_ctl,bool full_search)4156 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4157 					struct btrfs_key *ins,
4158 					struct find_free_extent_ctl *ffe_ctl,
4159 					bool full_search)
4160 {
4161 	struct btrfs_root *root = fs_info->chunk_root;
4162 	int ret;
4163 
4164 	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4165 	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4166 		ffe_ctl->orig_have_caching_bg = true;
4167 
4168 	if (ins->objectid) {
4169 		found_extent(ffe_ctl, ins);
4170 		return 0;
4171 	}
4172 
4173 	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4174 		return 1;
4175 
4176 	ffe_ctl->index++;
4177 	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4178 		return 1;
4179 
4180 	/* See the comments for btrfs_loop_type for an explanation of the phases. */
4181 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4182 		ffe_ctl->index = 0;
4183 		/*
4184 		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4185 		 * any uncached bgs and we've already done a full search
4186 		 * through.
4187 		 */
4188 		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4189 		    (!ffe_ctl->orig_have_caching_bg && full_search))
4190 			ffe_ctl->loop++;
4191 		ffe_ctl->loop++;
4192 
4193 		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4194 			struct btrfs_trans_handle *trans;
4195 			int exist = 0;
4196 
4197 			/* Check if allocation policy allows to create a new chunk */
4198 			ret = can_allocate_chunk(fs_info, ffe_ctl);
4199 			if (ret)
4200 				return ret;
4201 
4202 			trans = current->journal_info;
4203 			if (trans)
4204 				exist = 1;
4205 			else
4206 				trans = btrfs_join_transaction(root);
4207 
4208 			if (IS_ERR(trans)) {
4209 				ret = PTR_ERR(trans);
4210 				return ret;
4211 			}
4212 
4213 			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4214 						CHUNK_ALLOC_FORCE_FOR_EXTENT);
4215 
4216 			/* Do not bail out on ENOSPC since we can do more. */
4217 			if (ret == -ENOSPC) {
4218 				ret = 0;
4219 				ffe_ctl->loop++;
4220 			}
4221 			else if (ret < 0)
4222 				btrfs_abort_transaction(trans, ret);
4223 			else
4224 				ret = 0;
4225 			if (!exist)
4226 				btrfs_end_transaction(trans);
4227 			if (ret)
4228 				return ret;
4229 		}
4230 
4231 		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4232 			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4233 				return -ENOSPC;
4234 
4235 			/*
4236 			 * Don't loop again if we already have no empty_size and
4237 			 * no empty_cluster.
4238 			 */
4239 			if (ffe_ctl->empty_size == 0 &&
4240 			    ffe_ctl->empty_cluster == 0)
4241 				return -ENOSPC;
4242 			ffe_ctl->empty_size = 0;
4243 			ffe_ctl->empty_cluster = 0;
4244 		}
4245 		return 1;
4246 	}
4247 	return -ENOSPC;
4248 }
4249 
prepare_allocation_clustered(struct btrfs_fs_info * fs_info,struct find_free_extent_ctl * ffe_ctl,struct btrfs_space_info * space_info,struct btrfs_key * ins)4250 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4251 					struct find_free_extent_ctl *ffe_ctl,
4252 					struct btrfs_space_info *space_info,
4253 					struct btrfs_key *ins)
4254 {
4255 	/*
4256 	 * If our free space is heavily fragmented we may not be able to make
4257 	 * big contiguous allocations, so instead of doing the expensive search
4258 	 * for free space, simply return ENOSPC with our max_extent_size so we
4259 	 * can go ahead and search for a more manageable chunk.
4260 	 *
4261 	 * If our max_extent_size is large enough for our allocation simply
4262 	 * disable clustering since we will likely not be able to find enough
4263 	 * space to create a cluster and induce latency trying.
4264 	 */
4265 	if (space_info->max_extent_size) {
4266 		spin_lock(&space_info->lock);
4267 		if (space_info->max_extent_size &&
4268 		    ffe_ctl->num_bytes > space_info->max_extent_size) {
4269 			ins->offset = space_info->max_extent_size;
4270 			spin_unlock(&space_info->lock);
4271 			return -ENOSPC;
4272 		} else if (space_info->max_extent_size) {
4273 			ffe_ctl->use_cluster = false;
4274 		}
4275 		spin_unlock(&space_info->lock);
4276 	}
4277 
4278 	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4279 					       &ffe_ctl->empty_cluster);
4280 	if (ffe_ctl->last_ptr) {
4281 		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4282 
4283 		spin_lock(&last_ptr->lock);
4284 		if (last_ptr->block_group)
4285 			ffe_ctl->hint_byte = last_ptr->window_start;
4286 		if (last_ptr->fragmented) {
4287 			/*
4288 			 * We still set window_start so we can keep track of the
4289 			 * last place we found an allocation to try and save
4290 			 * some time.
4291 			 */
4292 			ffe_ctl->hint_byte = last_ptr->window_start;
4293 			ffe_ctl->use_cluster = false;
4294 		}
4295 		spin_unlock(&last_ptr->lock);
4296 	}
4297 
4298 	return 0;
4299 }
4300 
prepare_allocation_zoned(struct btrfs_fs_info * fs_info,struct find_free_extent_ctl * ffe_ctl)4301 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4302 				    struct find_free_extent_ctl *ffe_ctl)
4303 {
4304 	if (ffe_ctl->for_treelog) {
4305 		spin_lock(&fs_info->treelog_bg_lock);
4306 		if (fs_info->treelog_bg)
4307 			ffe_ctl->hint_byte = fs_info->treelog_bg;
4308 		spin_unlock(&fs_info->treelog_bg_lock);
4309 	} else if (ffe_ctl->for_data_reloc) {
4310 		spin_lock(&fs_info->relocation_bg_lock);
4311 		if (fs_info->data_reloc_bg)
4312 			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4313 		spin_unlock(&fs_info->relocation_bg_lock);
4314 	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4315 		struct btrfs_block_group *block_group;
4316 
4317 		spin_lock(&fs_info->zone_active_bgs_lock);
4318 		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4319 			/*
4320 			 * No lock is OK here because avail is monotinically
4321 			 * decreasing, and this is just a hint.
4322 			 */
4323 			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4324 
4325 			if (block_group_bits(block_group, ffe_ctl->flags) &&
4326 			    avail >= ffe_ctl->num_bytes) {
4327 				ffe_ctl->hint_byte = block_group->start;
4328 				break;
4329 			}
4330 		}
4331 		spin_unlock(&fs_info->zone_active_bgs_lock);
4332 	}
4333 
4334 	return 0;
4335 }
4336 
prepare_allocation(struct btrfs_fs_info * fs_info,struct find_free_extent_ctl * ffe_ctl,struct btrfs_space_info * space_info,struct btrfs_key * ins)4337 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4338 			      struct find_free_extent_ctl *ffe_ctl,
4339 			      struct btrfs_space_info *space_info,
4340 			      struct btrfs_key *ins)
4341 {
4342 	switch (ffe_ctl->policy) {
4343 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4344 		return prepare_allocation_clustered(fs_info, ffe_ctl,
4345 						    space_info, ins);
4346 	case BTRFS_EXTENT_ALLOC_ZONED:
4347 		return prepare_allocation_zoned(fs_info, ffe_ctl);
4348 	default:
4349 		BUG();
4350 	}
4351 }
4352 
4353 /*
4354  * walks the btree of allocated extents and find a hole of a given size.
4355  * The key ins is changed to record the hole:
4356  * ins->objectid == start position
4357  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4358  * ins->offset == the size of the hole.
4359  * Any available blocks before search_start are skipped.
4360  *
4361  * If there is no suitable free space, we will record the max size of
4362  * the free space extent currently.
4363  *
4364  * The overall logic and call chain:
4365  *
4366  * find_free_extent()
4367  * |- Iterate through all block groups
4368  * |  |- Get a valid block group
4369  * |  |- Try to do clustered allocation in that block group
4370  * |  |- Try to do unclustered allocation in that block group
4371  * |  |- Check if the result is valid
4372  * |  |  |- If valid, then exit
4373  * |  |- Jump to next block group
4374  * |
4375  * |- Push harder to find free extents
4376  *    |- If not found, re-iterate all block groups
4377  */
find_free_extent(struct btrfs_root * root,struct btrfs_key * ins,struct find_free_extent_ctl * ffe_ctl)4378 static noinline int find_free_extent(struct btrfs_root *root,
4379 				     struct btrfs_key *ins,
4380 				     struct find_free_extent_ctl *ffe_ctl)
4381 {
4382 	struct btrfs_fs_info *fs_info = root->fs_info;
4383 	int ret = 0;
4384 	int cache_block_group_error = 0;
4385 	struct btrfs_block_group *block_group = NULL;
4386 	struct btrfs_space_info *space_info;
4387 	bool full_search = false;
4388 
4389 	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4390 
4391 	ffe_ctl->search_start = 0;
4392 	/* For clustered allocation */
4393 	ffe_ctl->empty_cluster = 0;
4394 	ffe_ctl->last_ptr = NULL;
4395 	ffe_ctl->use_cluster = true;
4396 	ffe_ctl->have_caching_bg = false;
4397 	ffe_ctl->orig_have_caching_bg = false;
4398 	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4399 	ffe_ctl->loop = 0;
4400 	ffe_ctl->retry_uncached = false;
4401 	ffe_ctl->cached = 0;
4402 	ffe_ctl->max_extent_size = 0;
4403 	ffe_ctl->total_free_space = 0;
4404 	ffe_ctl->found_offset = 0;
4405 	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4406 	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4407 
4408 	if (btrfs_is_zoned(fs_info))
4409 		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4410 
4411 	ins->type = BTRFS_EXTENT_ITEM_KEY;
4412 	ins->objectid = 0;
4413 	ins->offset = 0;
4414 
4415 	trace_find_free_extent(root, ffe_ctl);
4416 
4417 	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4418 	if (!space_info) {
4419 		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4420 		return -ENOSPC;
4421 	}
4422 
4423 	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4424 	if (ret < 0)
4425 		return ret;
4426 
4427 	ffe_ctl->search_start = max(ffe_ctl->search_start,
4428 				    first_logical_byte(fs_info));
4429 	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4430 	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4431 		block_group = btrfs_lookup_block_group(fs_info,
4432 						       ffe_ctl->search_start);
4433 		/*
4434 		 * we don't want to use the block group if it doesn't match our
4435 		 * allocation bits, or if its not cached.
4436 		 *
4437 		 * However if we are re-searching with an ideal block group
4438 		 * picked out then we don't care that the block group is cached.
4439 		 */
4440 		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4441 		    block_group->cached != BTRFS_CACHE_NO) {
4442 			down_read(&space_info->groups_sem);
4443 			if (list_empty(&block_group->list) ||
4444 			    block_group->ro) {
4445 				/*
4446 				 * someone is removing this block group,
4447 				 * we can't jump into the have_block_group
4448 				 * target because our list pointers are not
4449 				 * valid
4450 				 */
4451 				btrfs_put_block_group(block_group);
4452 				up_read(&space_info->groups_sem);
4453 			} else {
4454 				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4455 							block_group->flags);
4456 				btrfs_lock_block_group(block_group,
4457 						       ffe_ctl->delalloc);
4458 				ffe_ctl->hinted = true;
4459 				goto have_block_group;
4460 			}
4461 		} else if (block_group) {
4462 			btrfs_put_block_group(block_group);
4463 		}
4464 	}
4465 search:
4466 	trace_find_free_extent_search_loop(root, ffe_ctl);
4467 	ffe_ctl->have_caching_bg = false;
4468 	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4469 	    ffe_ctl->index == 0)
4470 		full_search = true;
4471 	down_read(&space_info->groups_sem);
4472 	list_for_each_entry(block_group,
4473 			    &space_info->block_groups[ffe_ctl->index], list) {
4474 		struct btrfs_block_group *bg_ret;
4475 
4476 		ffe_ctl->hinted = false;
4477 		/* If the block group is read-only, we can skip it entirely. */
4478 		if (unlikely(block_group->ro)) {
4479 			if (ffe_ctl->for_treelog)
4480 				btrfs_clear_treelog_bg(block_group);
4481 			if (ffe_ctl->for_data_reloc)
4482 				btrfs_clear_data_reloc_bg(block_group);
4483 			continue;
4484 		}
4485 
4486 		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4487 		ffe_ctl->search_start = block_group->start;
4488 
4489 		/*
4490 		 * this can happen if we end up cycling through all the
4491 		 * raid types, but we want to make sure we only allocate
4492 		 * for the proper type.
4493 		 */
4494 		if (!block_group_bits(block_group, ffe_ctl->flags)) {
4495 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4496 				BTRFS_BLOCK_GROUP_RAID1_MASK |
4497 				BTRFS_BLOCK_GROUP_RAID56_MASK |
4498 				BTRFS_BLOCK_GROUP_RAID10;
4499 
4500 			/*
4501 			 * if they asked for extra copies and this block group
4502 			 * doesn't provide them, bail.  This does allow us to
4503 			 * fill raid0 from raid1.
4504 			 */
4505 			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4506 				goto loop;
4507 
4508 			/*
4509 			 * This block group has different flags than we want.
4510 			 * It's possible that we have MIXED_GROUP flag but no
4511 			 * block group is mixed.  Just skip such block group.
4512 			 */
4513 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4514 			continue;
4515 		}
4516 
4517 have_block_group:
4518 		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4519 		ffe_ctl->cached = btrfs_block_group_done(block_group);
4520 		if (unlikely(!ffe_ctl->cached)) {
4521 			ffe_ctl->have_caching_bg = true;
4522 			ret = btrfs_cache_block_group(block_group, false);
4523 
4524 			/*
4525 			 * If we get ENOMEM here or something else we want to
4526 			 * try other block groups, because it may not be fatal.
4527 			 * However if we can't find anything else we need to
4528 			 * save our return here so that we return the actual
4529 			 * error that caused problems, not ENOSPC.
4530 			 */
4531 			if (ret < 0) {
4532 				if (!cache_block_group_error)
4533 					cache_block_group_error = ret;
4534 				ret = 0;
4535 				goto loop;
4536 			}
4537 			ret = 0;
4538 		}
4539 
4540 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4541 			if (!cache_block_group_error)
4542 				cache_block_group_error = -EIO;
4543 			goto loop;
4544 		}
4545 
4546 		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4547 			goto loop;
4548 
4549 		bg_ret = NULL;
4550 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4551 		if (ret > 0)
4552 			goto loop;
4553 
4554 		if (bg_ret && bg_ret != block_group) {
4555 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4556 			block_group = bg_ret;
4557 		}
4558 
4559 		/* Checks */
4560 		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4561 						 fs_info->stripesize);
4562 
4563 		/* move on to the next group */
4564 		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4565 		    block_group->start + block_group->length) {
4566 			btrfs_add_free_space_unused(block_group,
4567 					    ffe_ctl->found_offset,
4568 					    ffe_ctl->num_bytes);
4569 			goto loop;
4570 		}
4571 
4572 		if (ffe_ctl->found_offset < ffe_ctl->search_start)
4573 			btrfs_add_free_space_unused(block_group,
4574 					ffe_ctl->found_offset,
4575 					ffe_ctl->search_start - ffe_ctl->found_offset);
4576 
4577 		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4578 					       ffe_ctl->num_bytes,
4579 					       ffe_ctl->delalloc,
4580 					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4581 		if (ret == -EAGAIN) {
4582 			btrfs_add_free_space_unused(block_group,
4583 					ffe_ctl->found_offset,
4584 					ffe_ctl->num_bytes);
4585 			goto loop;
4586 		}
4587 		btrfs_inc_block_group_reservations(block_group);
4588 
4589 		/* we are all good, lets return */
4590 		ins->objectid = ffe_ctl->search_start;
4591 		ins->offset = ffe_ctl->num_bytes;
4592 
4593 		trace_btrfs_reserve_extent(block_group, ffe_ctl);
4594 		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4595 		break;
4596 loop:
4597 		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4598 		    !ffe_ctl->retry_uncached) {
4599 			ffe_ctl->retry_uncached = true;
4600 			btrfs_wait_block_group_cache_progress(block_group,
4601 						ffe_ctl->num_bytes +
4602 						ffe_ctl->empty_cluster +
4603 						ffe_ctl->empty_size);
4604 			goto have_block_group;
4605 		}
4606 		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4607 		cond_resched();
4608 	}
4609 	up_read(&space_info->groups_sem);
4610 
4611 	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4612 	if (ret > 0)
4613 		goto search;
4614 
4615 	if (ret == -ENOSPC && !cache_block_group_error) {
4616 		/*
4617 		 * Use ffe_ctl->total_free_space as fallback if we can't find
4618 		 * any contiguous hole.
4619 		 */
4620 		if (!ffe_ctl->max_extent_size)
4621 			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4622 		spin_lock(&space_info->lock);
4623 		space_info->max_extent_size = ffe_ctl->max_extent_size;
4624 		spin_unlock(&space_info->lock);
4625 		ins->offset = ffe_ctl->max_extent_size;
4626 	} else if (ret == -ENOSPC) {
4627 		ret = cache_block_group_error;
4628 	}
4629 	return ret;
4630 }
4631 
4632 /*
4633  * Entry point to the extent allocator. Tries to find a hole that is at least
4634  * as big as @num_bytes.
4635  *
4636  * @root           -	The root that will contain this extent
4637  *
4638  * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
4639  *			is used for accounting purposes. This value differs
4640  *			from @num_bytes only in the case of compressed extents.
4641  *
4642  * @num_bytes      -	Number of bytes to allocate on-disk.
4643  *
4644  * @min_alloc_size -	Indicates the minimum amount of space that the
4645  *			allocator should try to satisfy. In some cases
4646  *			@num_bytes may be larger than what is required and if
4647  *			the filesystem is fragmented then allocation fails.
4648  *			However, the presence of @min_alloc_size gives a
4649  *			chance to try and satisfy the smaller allocation.
4650  *
4651  * @empty_size     -	A hint that you plan on doing more COW. This is the
4652  *			size in bytes the allocator should try to find free
4653  *			next to the block it returns.  This is just a hint and
4654  *			may be ignored by the allocator.
4655  *
4656  * @hint_byte      -	Hint to the allocator to start searching above the byte
4657  *			address passed. It might be ignored.
4658  *
4659  * @ins            -	This key is modified to record the found hole. It will
4660  *			have the following values:
4661  *			ins->objectid == start position
4662  *			ins->flags = BTRFS_EXTENT_ITEM_KEY
4663  *			ins->offset == the size of the hole.
4664  *
4665  * @is_data        -	Boolean flag indicating whether an extent is
4666  *			allocated for data (true) or metadata (false)
4667  *
4668  * @delalloc       -	Boolean flag indicating whether this allocation is for
4669  *			delalloc or not. If 'true' data_rwsem of block groups
4670  *			is going to be acquired.
4671  *
4672  *
4673  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4674  * case -ENOSPC is returned then @ins->offset will contain the size of the
4675  * largest available hole the allocator managed to find.
4676  */
btrfs_reserve_extent(struct btrfs_root * root,u64 ram_bytes,u64 num_bytes,u64 min_alloc_size,u64 empty_size,u64 hint_byte,struct btrfs_key * ins,int is_data,int delalloc)4677 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4678 			 u64 num_bytes, u64 min_alloc_size,
4679 			 u64 empty_size, u64 hint_byte,
4680 			 struct btrfs_key *ins, int is_data, int delalloc)
4681 {
4682 	struct btrfs_fs_info *fs_info = root->fs_info;
4683 	struct find_free_extent_ctl ffe_ctl = {};
4684 	bool final_tried = num_bytes == min_alloc_size;
4685 	u64 flags;
4686 	int ret;
4687 	bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
4688 	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4689 
4690 	flags = get_alloc_profile_by_root(root, is_data);
4691 again:
4692 	WARN_ON(num_bytes < fs_info->sectorsize);
4693 
4694 	ffe_ctl.ram_bytes = ram_bytes;
4695 	ffe_ctl.num_bytes = num_bytes;
4696 	ffe_ctl.min_alloc_size = min_alloc_size;
4697 	ffe_ctl.empty_size = empty_size;
4698 	ffe_ctl.flags = flags;
4699 	ffe_ctl.delalloc = delalloc;
4700 	ffe_ctl.hint_byte = hint_byte;
4701 	ffe_ctl.for_treelog = for_treelog;
4702 	ffe_ctl.for_data_reloc = for_data_reloc;
4703 
4704 	ret = find_free_extent(root, ins, &ffe_ctl);
4705 	if (!ret && !is_data) {
4706 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4707 	} else if (ret == -ENOSPC) {
4708 		if (!final_tried && ins->offset) {
4709 			num_bytes = min(num_bytes >> 1, ins->offset);
4710 			num_bytes = round_down(num_bytes,
4711 					       fs_info->sectorsize);
4712 			num_bytes = max(num_bytes, min_alloc_size);
4713 			ram_bytes = num_bytes;
4714 			if (num_bytes == min_alloc_size)
4715 				final_tried = true;
4716 			goto again;
4717 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4718 			struct btrfs_space_info *sinfo;
4719 
4720 			sinfo = btrfs_find_space_info(fs_info, flags);
4721 			btrfs_err(fs_info,
4722 	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4723 				  flags, num_bytes, for_treelog, for_data_reloc);
4724 			if (sinfo)
4725 				btrfs_dump_space_info(fs_info, sinfo,
4726 						      num_bytes, 1);
4727 		}
4728 	}
4729 
4730 	return ret;
4731 }
4732 
btrfs_free_reserved_extent(struct btrfs_fs_info * fs_info,u64 start,u64 len,int delalloc)4733 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4734 			       u64 start, u64 len, int delalloc)
4735 {
4736 	struct btrfs_block_group *cache;
4737 
4738 	cache = btrfs_lookup_block_group(fs_info, start);
4739 	if (!cache) {
4740 		btrfs_err(fs_info, "Unable to find block group for %llu",
4741 			  start);
4742 		return -ENOSPC;
4743 	}
4744 
4745 	btrfs_add_free_space(cache, start, len);
4746 	btrfs_free_reserved_bytes(cache, len, delalloc);
4747 	trace_btrfs_reserved_extent_free(fs_info, start, len);
4748 
4749 	btrfs_put_block_group(cache);
4750 	return 0;
4751 }
4752 
btrfs_pin_reserved_extent(struct btrfs_trans_handle * trans,const struct extent_buffer * eb)4753 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4754 			      const struct extent_buffer *eb)
4755 {
4756 	struct btrfs_block_group *cache;
4757 	int ret = 0;
4758 
4759 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4760 	if (!cache) {
4761 		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4762 			  eb->start);
4763 		return -ENOSPC;
4764 	}
4765 
4766 	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4767 	btrfs_put_block_group(cache);
4768 	return ret;
4769 }
4770 
alloc_reserved_extent(struct btrfs_trans_handle * trans,u64 bytenr,u64 num_bytes)4771 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4772 				 u64 num_bytes)
4773 {
4774 	struct btrfs_fs_info *fs_info = trans->fs_info;
4775 	int ret;
4776 
4777 	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4778 	if (ret)
4779 		return ret;
4780 
4781 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4782 	if (ret) {
4783 		ASSERT(!ret);
4784 		btrfs_err(fs_info, "update block group failed for %llu %llu",
4785 			  bytenr, num_bytes);
4786 		return ret;
4787 	}
4788 
4789 	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4790 	return 0;
4791 }
4792 
alloc_reserved_file_extent(struct btrfs_trans_handle * trans,u64 parent,u64 root_objectid,u64 flags,u64 owner,u64 offset,struct btrfs_key * ins,int ref_mod,u64 oref_root)4793 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4794 				      u64 parent, u64 root_objectid,
4795 				      u64 flags, u64 owner, u64 offset,
4796 				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
4797 {
4798 	struct btrfs_fs_info *fs_info = trans->fs_info;
4799 	struct btrfs_root *extent_root;
4800 	int ret;
4801 	struct btrfs_extent_item *extent_item;
4802 	struct btrfs_extent_owner_ref *oref;
4803 	struct btrfs_extent_inline_ref *iref;
4804 	struct btrfs_path *path;
4805 	struct extent_buffer *leaf;
4806 	int type;
4807 	u32 size;
4808 	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
4809 
4810 	if (parent > 0)
4811 		type = BTRFS_SHARED_DATA_REF_KEY;
4812 	else
4813 		type = BTRFS_EXTENT_DATA_REF_KEY;
4814 
4815 	size = sizeof(*extent_item);
4816 	if (simple_quota)
4817 		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4818 	size += btrfs_extent_inline_ref_size(type);
4819 
4820 	path = btrfs_alloc_path();
4821 	if (!path)
4822 		return -ENOMEM;
4823 
4824 	extent_root = btrfs_extent_root(fs_info, ins->objectid);
4825 	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4826 	if (ret) {
4827 		btrfs_free_path(path);
4828 		return ret;
4829 	}
4830 
4831 	leaf = path->nodes[0];
4832 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4833 				     struct btrfs_extent_item);
4834 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4835 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4836 	btrfs_set_extent_flags(leaf, extent_item,
4837 			       flags | BTRFS_EXTENT_FLAG_DATA);
4838 
4839 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4840 	if (simple_quota) {
4841 		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4842 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
4843 		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
4844 		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4845 	}
4846 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4847 
4848 	if (parent > 0) {
4849 		struct btrfs_shared_data_ref *ref;
4850 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4851 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4852 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4853 	} else {
4854 		struct btrfs_extent_data_ref *ref;
4855 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4856 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4857 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4858 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4859 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4860 	}
4861 
4862 	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4863 	btrfs_free_path(path);
4864 
4865 	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4866 }
4867 
alloc_reserved_tree_block(struct btrfs_trans_handle * trans,struct btrfs_delayed_ref_node * node,struct btrfs_delayed_extent_op * extent_op)4868 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4869 				     struct btrfs_delayed_ref_node *node,
4870 				     struct btrfs_delayed_extent_op *extent_op)
4871 {
4872 	struct btrfs_fs_info *fs_info = trans->fs_info;
4873 	struct btrfs_root *extent_root;
4874 	int ret;
4875 	struct btrfs_extent_item *extent_item;
4876 	struct btrfs_key extent_key;
4877 	struct btrfs_tree_block_info *block_info;
4878 	struct btrfs_extent_inline_ref *iref;
4879 	struct btrfs_path *path;
4880 	struct extent_buffer *leaf;
4881 	u32 size = sizeof(*extent_item) + sizeof(*iref);
4882 	const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
4883 	/* The owner of a tree block is the level. */
4884 	int level = btrfs_delayed_ref_owner(node);
4885 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4886 
4887 	extent_key.objectid = node->bytenr;
4888 	if (skinny_metadata) {
4889 		/* The owner of a tree block is the level. */
4890 		extent_key.offset = level;
4891 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4892 	} else {
4893 		extent_key.offset = node->num_bytes;
4894 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4895 		size += sizeof(*block_info);
4896 	}
4897 
4898 	path = btrfs_alloc_path();
4899 	if (!path)
4900 		return -ENOMEM;
4901 
4902 	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4903 	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4904 				      size);
4905 	if (ret) {
4906 		btrfs_free_path(path);
4907 		return ret;
4908 	}
4909 
4910 	leaf = path->nodes[0];
4911 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4912 				     struct btrfs_extent_item);
4913 	btrfs_set_extent_refs(leaf, extent_item, 1);
4914 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4915 	btrfs_set_extent_flags(leaf, extent_item,
4916 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4917 
4918 	if (skinny_metadata) {
4919 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4920 	} else {
4921 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4922 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4923 		btrfs_set_tree_block_level(leaf, block_info, level);
4924 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4925 	}
4926 
4927 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4928 		btrfs_set_extent_inline_ref_type(leaf, iref,
4929 						 BTRFS_SHARED_BLOCK_REF_KEY);
4930 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
4931 	} else {
4932 		btrfs_set_extent_inline_ref_type(leaf, iref,
4933 						 BTRFS_TREE_BLOCK_REF_KEY);
4934 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
4935 	}
4936 
4937 	btrfs_mark_buffer_dirty(trans, leaf);
4938 	btrfs_free_path(path);
4939 
4940 	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4941 }
4942 
btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 owner,u64 offset,u64 ram_bytes,struct btrfs_key * ins)4943 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4944 				     struct btrfs_root *root, u64 owner,
4945 				     u64 offset, u64 ram_bytes,
4946 				     struct btrfs_key *ins)
4947 {
4948 	struct btrfs_ref generic_ref = {
4949 		.action = BTRFS_ADD_DELAYED_EXTENT,
4950 		.bytenr = ins->objectid,
4951 		.num_bytes = ins->offset,
4952 		.owning_root = btrfs_root_id(root),
4953 		.ref_root = btrfs_root_id(root),
4954 	};
4955 
4956 	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
4957 
4958 	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4959 		generic_ref.owning_root = root->relocation_src_root;
4960 
4961 	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
4962 	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4963 
4964 	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4965 }
4966 
4967 /*
4968  * this is used by the tree logging recovery code.  It records that
4969  * an extent has been allocated and makes sure to clear the free
4970  * space cache bits as well
4971  */
btrfs_alloc_logged_file_extent(struct btrfs_trans_handle * trans,u64 root_objectid,u64 owner,u64 offset,struct btrfs_key * ins)4972 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4973 				   u64 root_objectid, u64 owner, u64 offset,
4974 				   struct btrfs_key *ins)
4975 {
4976 	struct btrfs_fs_info *fs_info = trans->fs_info;
4977 	int ret;
4978 	struct btrfs_block_group *block_group;
4979 	struct btrfs_space_info *space_info;
4980 	struct btrfs_squota_delta delta = {
4981 		.root = root_objectid,
4982 		.num_bytes = ins->offset,
4983 		.generation = trans->transid,
4984 		.is_data = true,
4985 		.is_inc = true,
4986 	};
4987 
4988 	/*
4989 	 * Mixed block groups will exclude before processing the log so we only
4990 	 * need to do the exclude dance if this fs isn't mixed.
4991 	 */
4992 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4993 		ret = __exclude_logged_extent(fs_info, ins->objectid,
4994 					      ins->offset);
4995 		if (ret)
4996 			return ret;
4997 	}
4998 
4999 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
5000 	if (!block_group)
5001 		return -EINVAL;
5002 
5003 	space_info = block_group->space_info;
5004 	spin_lock(&space_info->lock);
5005 	spin_lock(&block_group->lock);
5006 	space_info->bytes_reserved += ins->offset;
5007 	block_group->reserved += ins->offset;
5008 	spin_unlock(&block_group->lock);
5009 	spin_unlock(&space_info->lock);
5010 
5011 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
5012 					 offset, ins, 1, root_objectid);
5013 	if (ret)
5014 		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
5015 	ret = btrfs_record_squota_delta(fs_info, &delta);
5016 	btrfs_put_block_group(block_group);
5017 	return ret;
5018 }
5019 
5020 #ifdef CONFIG_BTRFS_DEBUG
5021 /*
5022  * Extra safety check in case the extent tree is corrupted and extent allocator
5023  * chooses to use a tree block which is already used and locked.
5024  */
check_eb_lock_owner(const struct extent_buffer * eb)5025 static bool check_eb_lock_owner(const struct extent_buffer *eb)
5026 {
5027 	if (eb->lock_owner == current->pid) {
5028 		btrfs_err_rl(eb->fs_info,
5029 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5030 			     eb->start, btrfs_header_owner(eb), current->pid);
5031 		return true;
5032 	}
5033 	return false;
5034 }
5035 #else
check_eb_lock_owner(struct extent_buffer * eb)5036 static bool check_eb_lock_owner(struct extent_buffer *eb)
5037 {
5038 	return false;
5039 }
5040 #endif
5041 
5042 static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 bytenr,int level,u64 owner,enum btrfs_lock_nesting nest)5043 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5044 		      u64 bytenr, int level, u64 owner,
5045 		      enum btrfs_lock_nesting nest)
5046 {
5047 	struct btrfs_fs_info *fs_info = root->fs_info;
5048 	struct extent_buffer *buf;
5049 	u64 lockdep_owner = owner;
5050 
5051 	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
5052 	if (IS_ERR(buf))
5053 		return buf;
5054 
5055 	if (check_eb_lock_owner(buf)) {
5056 		free_extent_buffer(buf);
5057 		return ERR_PTR(-EUCLEAN);
5058 	}
5059 
5060 	/*
5061 	 * The reloc trees are just snapshots, so we need them to appear to be
5062 	 * just like any other fs tree WRT lockdep.
5063 	 *
5064 	 * The exception however is in replace_path() in relocation, where we
5065 	 * hold the lock on the original fs root and then search for the reloc
5066 	 * root.  At that point we need to make sure any reloc root buffers are
5067 	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5068 	 * lockdep happy.
5069 	 */
5070 	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5071 	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5072 		lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5073 
5074 	/* btrfs_clear_buffer_dirty() accesses generation field. */
5075 	btrfs_set_header_generation(buf, trans->transid);
5076 
5077 	/*
5078 	 * This needs to stay, because we could allocate a freed block from an
5079 	 * old tree into a new tree, so we need to make sure this new block is
5080 	 * set to the appropriate level and owner.
5081 	 */
5082 	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5083 
5084 	btrfs_tree_lock_nested(buf, nest);
5085 	btrfs_clear_buffer_dirty(trans, buf);
5086 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
5087 	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
5088 
5089 	set_extent_buffer_uptodate(buf);
5090 
5091 	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5092 	btrfs_set_header_level(buf, level);
5093 	btrfs_set_header_bytenr(buf, buf->start);
5094 	btrfs_set_header_generation(buf, trans->transid);
5095 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5096 	btrfs_set_header_owner(buf, owner);
5097 	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
5098 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
5099 	if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
5100 		buf->log_index = root->log_transid % 2;
5101 		/*
5102 		 * we allow two log transactions at a time, use different
5103 		 * EXTENT bit to differentiate dirty pages.
5104 		 */
5105 		if (buf->log_index == 0)
5106 			set_extent_bit(&root->dirty_log_pages, buf->start,
5107 				       buf->start + buf->len - 1,
5108 				       EXTENT_DIRTY, NULL);
5109 		else
5110 			set_extent_bit(&root->dirty_log_pages, buf->start,
5111 				       buf->start + buf->len - 1,
5112 				       EXTENT_NEW, NULL);
5113 	} else {
5114 		buf->log_index = -1;
5115 		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
5116 			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
5117 	}
5118 	/* this returns a buffer locked for blocking */
5119 	return buf;
5120 }
5121 
5122 /*
5123  * finds a free extent and does all the dirty work required for allocation
5124  * returns the tree buffer or an ERR_PTR on error.
5125  */
btrfs_alloc_tree_block(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 parent,u64 root_objectid,const struct btrfs_disk_key * key,int level,u64 hint,u64 empty_size,u64 reloc_src_root,enum btrfs_lock_nesting nest)5126 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
5127 					     struct btrfs_root *root,
5128 					     u64 parent, u64 root_objectid,
5129 					     const struct btrfs_disk_key *key,
5130 					     int level, u64 hint,
5131 					     u64 empty_size,
5132 					     u64 reloc_src_root,
5133 					     enum btrfs_lock_nesting nest)
5134 {
5135 	struct btrfs_fs_info *fs_info = root->fs_info;
5136 	struct btrfs_key ins;
5137 	struct btrfs_block_rsv *block_rsv;
5138 	struct extent_buffer *buf;
5139 	u64 flags = 0;
5140 	int ret;
5141 	u32 blocksize = fs_info->nodesize;
5142 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5143 	u64 owning_root;
5144 
5145 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5146 	if (btrfs_is_testing(fs_info)) {
5147 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
5148 					    level, root_objectid, nest);
5149 		if (!IS_ERR(buf))
5150 			root->alloc_bytenr += blocksize;
5151 		return buf;
5152 	}
5153 #endif
5154 
5155 	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
5156 	if (IS_ERR(block_rsv))
5157 		return ERR_CAST(block_rsv);
5158 
5159 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
5160 				   empty_size, hint, &ins, 0, 0);
5161 	if (ret)
5162 		goto out_unuse;
5163 
5164 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
5165 				    root_objectid, nest);
5166 	if (IS_ERR(buf)) {
5167 		ret = PTR_ERR(buf);
5168 		goto out_free_reserved;
5169 	}
5170 	owning_root = btrfs_header_owner(buf);
5171 
5172 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5173 		if (parent == 0)
5174 			parent = ins.objectid;
5175 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5176 		owning_root = reloc_src_root;
5177 	} else
5178 		BUG_ON(parent > 0);
5179 
5180 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5181 		struct btrfs_delayed_extent_op *extent_op;
5182 		struct btrfs_ref generic_ref = {
5183 			.action = BTRFS_ADD_DELAYED_EXTENT,
5184 			.bytenr = ins.objectid,
5185 			.num_bytes = ins.offset,
5186 			.parent = parent,
5187 			.owning_root = owning_root,
5188 			.ref_root = root_objectid,
5189 		};
5190 
5191 		if (!skinny_metadata || flags != 0) {
5192 			extent_op = btrfs_alloc_delayed_extent_op();
5193 			if (!extent_op) {
5194 				ret = -ENOMEM;
5195 				goto out_free_buf;
5196 			}
5197 			if (key)
5198 				memcpy(&extent_op->key, key, sizeof(extent_op->key));
5199 			else
5200 				memset(&extent_op->key, 0, sizeof(extent_op->key));
5201 			extent_op->flags_to_set = flags;
5202 			extent_op->update_key = (skinny_metadata ? false : true);
5203 			extent_op->update_flags = (flags != 0);
5204 		} else {
5205 			extent_op = NULL;
5206 		}
5207 
5208 		btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
5209 		btrfs_ref_tree_mod(fs_info, &generic_ref);
5210 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5211 		if (ret) {
5212 			btrfs_free_delayed_extent_op(extent_op);
5213 			goto out_free_buf;
5214 		}
5215 	}
5216 	return buf;
5217 
5218 out_free_buf:
5219 	btrfs_tree_unlock(buf);
5220 	free_extent_buffer(buf);
5221 out_free_reserved:
5222 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5223 out_unuse:
5224 	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5225 	return ERR_PTR(ret);
5226 }
5227 
5228 struct walk_control {
5229 	u64 refs[BTRFS_MAX_LEVEL];
5230 	u64 flags[BTRFS_MAX_LEVEL];
5231 	struct btrfs_key update_progress;
5232 	struct btrfs_key drop_progress;
5233 	int drop_level;
5234 	int stage;
5235 	int level;
5236 	int shared_level;
5237 	int update_ref;
5238 	int keep_locks;
5239 	int reada_slot;
5240 	int reada_count;
5241 	int restarted;
5242 	/* Indicate that extent info needs to be looked up when walking the tree. */
5243 	int lookup_info;
5244 };
5245 
5246 /*
5247  * This is our normal stage.  We are traversing blocks the current snapshot owns
5248  * and we are dropping any of our references to any children we are able to, and
5249  * then freeing the block once we've processed all of the children.
5250  */
5251 #define DROP_REFERENCE	1
5252 
5253 /*
5254  * We enter this stage when we have to walk into a child block (meaning we can't
5255  * simply drop our reference to it from our current parent node) and there are
5256  * more than one reference on it.  If we are the owner of any of the children
5257  * blocks from the current parent node then we have to do the FULL_BACKREF dance
5258  * on them in order to drop our normal ref and add the shared ref.
5259  */
5260 #define UPDATE_BACKREF	2
5261 
5262 /*
5263  * Decide if we need to walk down into this node to adjust the references.
5264  *
5265  * @root:	the root we are currently deleting
5266  * @wc:		the walk control for this deletion
5267  * @eb:		the parent eb that we're currently visiting
5268  * @refs:	the number of refs for wc->level - 1
5269  * @flags:	the flags for wc->level - 1
5270  * @slot:	the slot in the eb that we're currently checking
5271  *
5272  * This is meant to be called when we're evaluating if a node we point to at
5273  * wc->level should be read and walked into, or if we can simply delete our
5274  * reference to it.  We return true if we should walk into the node, false if we
5275  * can skip it.
5276  *
5277  * We have assertions in here to make sure this is called correctly.  We assume
5278  * that sanity checking on the blocks read to this point has been done, so any
5279  * corrupted file systems must have been caught before calling this function.
5280  */
visit_node_for_delete(struct btrfs_root * root,struct walk_control * wc,struct extent_buffer * eb,u64 refs,u64 flags,int slot)5281 static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
5282 				  struct extent_buffer *eb, u64 refs, u64 flags, int slot)
5283 {
5284 	struct btrfs_key key;
5285 	u64 generation;
5286 	int level = wc->level;
5287 
5288 	ASSERT(level > 0);
5289 	ASSERT(wc->refs[level - 1] > 0);
5290 
5291 	/*
5292 	 * The update backref stage we only want to skip if we already have
5293 	 * FULL_BACKREF set, otherwise we need to read.
5294 	 */
5295 	if (wc->stage == UPDATE_BACKREF) {
5296 		if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5297 			return false;
5298 		return true;
5299 	}
5300 
5301 	/*
5302 	 * We're the last ref on this block, we must walk into it and process
5303 	 * any refs it's pointing at.
5304 	 */
5305 	if (wc->refs[level - 1] == 1)
5306 		return true;
5307 
5308 	/*
5309 	 * If we're already FULL_BACKREF then we know we can just drop our
5310 	 * current reference.
5311 	 */
5312 	if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5313 		return false;
5314 
5315 	/*
5316 	 * This block is older than our creation generation, we can drop our
5317 	 * reference to it.
5318 	 */
5319 	generation = btrfs_node_ptr_generation(eb, slot);
5320 	if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
5321 		return false;
5322 
5323 	/*
5324 	 * This block was processed from a previous snapshot deletion run, we
5325 	 * can skip it.
5326 	 */
5327 	btrfs_node_key_to_cpu(eb, &key, slot);
5328 	if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
5329 		return false;
5330 
5331 	/* All other cases we need to wander into the node. */
5332 	return true;
5333 }
5334 
reada_walk_down(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct walk_control * wc,struct btrfs_path * path)5335 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5336 				     struct btrfs_root *root,
5337 				     struct walk_control *wc,
5338 				     struct btrfs_path *path)
5339 {
5340 	struct btrfs_fs_info *fs_info = root->fs_info;
5341 	u64 bytenr;
5342 	u64 generation;
5343 	u64 refs;
5344 	u64 flags;
5345 	u32 nritems;
5346 	struct extent_buffer *eb;
5347 	int ret;
5348 	int slot;
5349 	int nread = 0;
5350 
5351 	if (path->slots[wc->level] < wc->reada_slot) {
5352 		wc->reada_count = wc->reada_count * 2 / 3;
5353 		wc->reada_count = max(wc->reada_count, 2);
5354 	} else {
5355 		wc->reada_count = wc->reada_count * 3 / 2;
5356 		wc->reada_count = min_t(int, wc->reada_count,
5357 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5358 	}
5359 
5360 	eb = path->nodes[wc->level];
5361 	nritems = btrfs_header_nritems(eb);
5362 
5363 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5364 		if (nread >= wc->reada_count)
5365 			break;
5366 
5367 		cond_resched();
5368 		bytenr = btrfs_node_blockptr(eb, slot);
5369 		generation = btrfs_node_ptr_generation(eb, slot);
5370 
5371 		if (slot == path->slots[wc->level])
5372 			goto reada;
5373 
5374 		if (wc->stage == UPDATE_BACKREF &&
5375 		    generation <= btrfs_root_origin_generation(root))
5376 			continue;
5377 
5378 		/* We don't lock the tree block, it's OK to be racy here */
5379 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5380 					       wc->level - 1, 1, &refs,
5381 					       &flags, NULL);
5382 		/* We don't care about errors in readahead. */
5383 		if (ret < 0)
5384 			continue;
5385 
5386 		/*
5387 		 * This could be racey, it's conceivable that we raced and end
5388 		 * up with a bogus refs count, if that's the case just skip, if
5389 		 * we are actually corrupt we will notice when we look up
5390 		 * everything again with our locks.
5391 		 */
5392 		if (refs == 0)
5393 			continue;
5394 
5395 		/* If we don't need to visit this node don't reada. */
5396 		if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
5397 			continue;
5398 reada:
5399 		btrfs_readahead_node_child(eb, slot);
5400 		nread++;
5401 	}
5402 	wc->reada_slot = slot;
5403 }
5404 
5405 /*
5406  * helper to process tree block while walking down the tree.
5407  *
5408  * when wc->stage == UPDATE_BACKREF, this function updates
5409  * back refs for pointers in the block.
5410  *
5411  * NOTE: return value 1 means we should stop walking down.
5412  */
walk_down_proc(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc)5413 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5414 				   struct btrfs_root *root,
5415 				   struct btrfs_path *path,
5416 				   struct walk_control *wc)
5417 {
5418 	struct btrfs_fs_info *fs_info = root->fs_info;
5419 	int level = wc->level;
5420 	struct extent_buffer *eb = path->nodes[level];
5421 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5422 	int ret;
5423 
5424 	if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
5425 		return 1;
5426 
5427 	/*
5428 	 * when reference count of tree block is 1, it won't increase
5429 	 * again. once full backref flag is set, we never clear it.
5430 	 */
5431 	if (wc->lookup_info &&
5432 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5433 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5434 		ASSERT(path->locks[level]);
5435 		ret = btrfs_lookup_extent_info(trans, fs_info,
5436 					       eb->start, level, 1,
5437 					       &wc->refs[level],
5438 					       &wc->flags[level],
5439 					       NULL);
5440 		if (ret)
5441 			return ret;
5442 		if (unlikely(wc->refs[level] == 0)) {
5443 			btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5444 				  eb->start);
5445 			return -EUCLEAN;
5446 		}
5447 	}
5448 
5449 	if (wc->stage == DROP_REFERENCE) {
5450 		if (wc->refs[level] > 1)
5451 			return 1;
5452 
5453 		if (path->locks[level] && !wc->keep_locks) {
5454 			btrfs_tree_unlock_rw(eb, path->locks[level]);
5455 			path->locks[level] = 0;
5456 		}
5457 		return 0;
5458 	}
5459 
5460 	/* wc->stage == UPDATE_BACKREF */
5461 	if (!(wc->flags[level] & flag)) {
5462 		ASSERT(path->locks[level]);
5463 		ret = btrfs_inc_ref(trans, root, eb, 1);
5464 		if (ret) {
5465 			btrfs_abort_transaction(trans, ret);
5466 			return ret;
5467 		}
5468 		ret = btrfs_dec_ref(trans, root, eb, 0);
5469 		if (ret) {
5470 			btrfs_abort_transaction(trans, ret);
5471 			return ret;
5472 		}
5473 		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5474 		if (ret) {
5475 			btrfs_abort_transaction(trans, ret);
5476 			return ret;
5477 		}
5478 		wc->flags[level] |= flag;
5479 	}
5480 
5481 	/*
5482 	 * the block is shared by multiple trees, so it's not good to
5483 	 * keep the tree lock
5484 	 */
5485 	if (path->locks[level] && level > 0) {
5486 		btrfs_tree_unlock_rw(eb, path->locks[level]);
5487 		path->locks[level] = 0;
5488 	}
5489 	return 0;
5490 }
5491 
5492 /*
5493  * This is used to verify a ref exists for this root to deal with a bug where we
5494  * would have a drop_progress key that hadn't been updated properly.
5495  */
check_ref_exists(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 bytenr,u64 parent,int level)5496 static int check_ref_exists(struct btrfs_trans_handle *trans,
5497 			    struct btrfs_root *root, u64 bytenr, u64 parent,
5498 			    int level)
5499 {
5500 	struct btrfs_delayed_ref_root *delayed_refs;
5501 	struct btrfs_delayed_ref_head *head;
5502 	struct btrfs_path *path;
5503 	struct btrfs_extent_inline_ref *iref;
5504 	int ret;
5505 	bool exists = false;
5506 
5507 	path = btrfs_alloc_path();
5508 	if (!path)
5509 		return -ENOMEM;
5510 again:
5511 	ret = lookup_extent_backref(trans, path, &iref, bytenr,
5512 				    root->fs_info->nodesize, parent,
5513 				    btrfs_root_id(root), level, 0);
5514 	if (ret != -ENOENT) {
5515 		/*
5516 		 * If we get 0 then we found our reference, return 1, else
5517 		 * return the error if it's not -ENOENT;
5518 		 */
5519 		btrfs_free_path(path);
5520 		return (ret < 0 ) ? ret : 1;
5521 	}
5522 
5523 	/*
5524 	 * We could have a delayed ref with this reference, so look it up while
5525 	 * we're holding the path open to make sure we don't race with the
5526 	 * delayed ref running.
5527 	 */
5528 	delayed_refs = &trans->transaction->delayed_refs;
5529 	spin_lock(&delayed_refs->lock);
5530 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
5531 	if (!head)
5532 		goto out;
5533 	if (!mutex_trylock(&head->mutex)) {
5534 		/*
5535 		 * We're contended, means that the delayed ref is running, get a
5536 		 * reference and wait for the ref head to be complete and then
5537 		 * try again.
5538 		 */
5539 		refcount_inc(&head->refs);
5540 		spin_unlock(&delayed_refs->lock);
5541 
5542 		btrfs_release_path(path);
5543 
5544 		mutex_lock(&head->mutex);
5545 		mutex_unlock(&head->mutex);
5546 		btrfs_put_delayed_ref_head(head);
5547 		goto again;
5548 	}
5549 
5550 	exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
5551 	mutex_unlock(&head->mutex);
5552 out:
5553 	spin_unlock(&delayed_refs->lock);
5554 	btrfs_free_path(path);
5555 	return exists ? 1 : 0;
5556 }
5557 
5558 /*
5559  * We may not have an uptodate block, so if we are going to walk down into this
5560  * block we need to drop the lock, read it off of the disk, re-lock it and
5561  * return to continue dropping the snapshot.
5562  */
check_next_block_uptodate(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc,struct extent_buffer * next)5563 static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
5564 				     struct btrfs_root *root,
5565 				     struct btrfs_path *path,
5566 				     struct walk_control *wc,
5567 				     struct extent_buffer *next)
5568 {
5569 	struct btrfs_tree_parent_check check = { 0 };
5570 	u64 generation;
5571 	int level = wc->level;
5572 	int ret;
5573 
5574 	btrfs_assert_tree_write_locked(next);
5575 
5576 	generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
5577 
5578 	if (btrfs_buffer_uptodate(next, generation, 0))
5579 		return 0;
5580 
5581 	check.level = level - 1;
5582 	check.transid = generation;
5583 	check.owner_root = btrfs_root_id(root);
5584 	check.has_first_key = true;
5585 	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
5586 
5587 	btrfs_tree_unlock(next);
5588 	if (level == 1)
5589 		reada_walk_down(trans, root, wc, path);
5590 	ret = btrfs_read_extent_buffer(next, &check);
5591 	if (ret) {
5592 		free_extent_buffer(next);
5593 		return ret;
5594 	}
5595 	btrfs_tree_lock(next);
5596 	wc->lookup_info = 1;
5597 	return 0;
5598 }
5599 
5600 /*
5601  * If we determine that we don't have to visit wc->level - 1 then we need to
5602  * determine if we can drop our reference.
5603  *
5604  * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
5605  *
5606  * If we are DROP_REFERENCE this will figure out if we need to drop our current
5607  * reference, skipping it if we dropped it from a previous incompleted drop, or
5608  * dropping it if we still have a reference to it.
5609  */
maybe_drop_reference(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc,struct extent_buffer * next,u64 owner_root)5610 static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5611 				struct btrfs_path *path, struct walk_control *wc,
5612 				struct extent_buffer *next, u64 owner_root)
5613 {
5614 	struct btrfs_ref ref = {
5615 		.action = BTRFS_DROP_DELAYED_REF,
5616 		.bytenr = next->start,
5617 		.num_bytes = root->fs_info->nodesize,
5618 		.owning_root = owner_root,
5619 		.ref_root = btrfs_root_id(root),
5620 	};
5621 	int level = wc->level;
5622 	int ret;
5623 
5624 	/* We are UPDATE_BACKREF, we're not dropping anything. */
5625 	if (wc->stage == UPDATE_BACKREF)
5626 		return 0;
5627 
5628 	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5629 		ref.parent = path->nodes[level]->start;
5630 	} else {
5631 		ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
5632 		if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
5633 			btrfs_err(root->fs_info, "mismatched block owner");
5634 			return -EIO;
5635 		}
5636 	}
5637 
5638 	/*
5639 	 * If we had a drop_progress we need to verify the refs are set as
5640 	 * expected.  If we find our ref then we know that from here on out
5641 	 * everything should be correct, and we can clear the
5642 	 * ->restarted flag.
5643 	 */
5644 	if (wc->restarted) {
5645 		ret = check_ref_exists(trans, root, next->start, ref.parent,
5646 				       level - 1);
5647 		if (ret <= 0)
5648 			return ret;
5649 		ret = 0;
5650 		wc->restarted = 0;
5651 	}
5652 
5653 	/*
5654 	 * Reloc tree doesn't contribute to qgroup numbers, and we have already
5655 	 * accounted them at merge time (replace_path), thus we could skip
5656 	 * expensive subtree trace here.
5657 	 */
5658 	if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
5659 	    wc->refs[level - 1] > 1) {
5660 		u64 generation = btrfs_node_ptr_generation(path->nodes[level],
5661 							   path->slots[level]);
5662 
5663 		ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
5664 		if (ret) {
5665 			btrfs_err_rl(root->fs_info,
5666 "error %d accounting shared subtree, quota is out of sync, rescan required",
5667 				     ret);
5668 		}
5669 	}
5670 
5671 	/*
5672 	 * We need to update the next key in our walk control so we can update
5673 	 * the drop_progress key accordingly.  We don't care if find_next_key
5674 	 * doesn't find a key because that means we're at the end and are going
5675 	 * to clean up now.
5676 	 */
5677 	wc->drop_level = level;
5678 	find_next_key(path, level, &wc->drop_progress);
5679 
5680 	btrfs_init_tree_ref(&ref, level - 1, 0, false);
5681 	return btrfs_free_extent(trans, &ref);
5682 }
5683 
5684 /*
5685  * helper to process tree block pointer.
5686  *
5687  * when wc->stage == DROP_REFERENCE, this function checks
5688  * reference count of the block pointed to. if the block
5689  * is shared and we need update back refs for the subtree
5690  * rooted at the block, this function changes wc->stage to
5691  * UPDATE_BACKREF. if the block is shared and there is no
5692  * need to update back, this function drops the reference
5693  * to the block.
5694  *
5695  * NOTE: return value 1 means we should stop walking down.
5696  */
do_walk_down(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc)5697 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5698 				 struct btrfs_root *root,
5699 				 struct btrfs_path *path,
5700 				 struct walk_control *wc)
5701 {
5702 	struct btrfs_fs_info *fs_info = root->fs_info;
5703 	u64 bytenr;
5704 	u64 generation;
5705 	u64 owner_root = 0;
5706 	struct extent_buffer *next;
5707 	int level = wc->level;
5708 	int ret = 0;
5709 
5710 	generation = btrfs_node_ptr_generation(path->nodes[level],
5711 					       path->slots[level]);
5712 	/*
5713 	 * if the lower level block was created before the snapshot
5714 	 * was created, we know there is no need to update back refs
5715 	 * for the subtree
5716 	 */
5717 	if (wc->stage == UPDATE_BACKREF &&
5718 	    generation <= btrfs_root_origin_generation(root)) {
5719 		wc->lookup_info = 1;
5720 		return 1;
5721 	}
5722 
5723 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5724 
5725 	next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
5726 					    level - 1);
5727 	if (IS_ERR(next))
5728 		return PTR_ERR(next);
5729 
5730 	btrfs_tree_lock(next);
5731 
5732 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5733 				       &wc->refs[level - 1],
5734 				       &wc->flags[level - 1],
5735 				       &owner_root);
5736 	if (ret < 0)
5737 		goto out_unlock;
5738 
5739 	if (unlikely(wc->refs[level - 1] == 0)) {
5740 		btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5741 			  bytenr);
5742 		ret = -EUCLEAN;
5743 		goto out_unlock;
5744 	}
5745 	wc->lookup_info = 0;
5746 
5747 	/* If we don't have to walk into this node skip it. */
5748 	if (!visit_node_for_delete(root, wc, path->nodes[level],
5749 				   wc->refs[level - 1], wc->flags[level - 1],
5750 				   path->slots[level]))
5751 		goto skip;
5752 
5753 	/*
5754 	 * We have to walk down into this node, and if we're currently at the
5755 	 * DROP_REFERNCE stage and this block is shared then we need to switch
5756 	 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
5757 	 */
5758 	if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
5759 		wc->stage = UPDATE_BACKREF;
5760 		wc->shared_level = level - 1;
5761 	}
5762 
5763 	ret = check_next_block_uptodate(trans, root, path, wc, next);
5764 	if (ret)
5765 		return ret;
5766 
5767 	level--;
5768 	ASSERT(level == btrfs_header_level(next));
5769 	if (level != btrfs_header_level(next)) {
5770 		btrfs_err(root->fs_info, "mismatched level");
5771 		ret = -EIO;
5772 		goto out_unlock;
5773 	}
5774 	path->nodes[level] = next;
5775 	path->slots[level] = 0;
5776 	path->locks[level] = BTRFS_WRITE_LOCK;
5777 	wc->level = level;
5778 	if (wc->level == 1)
5779 		wc->reada_slot = 0;
5780 	return 0;
5781 skip:
5782 	ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
5783 	if (ret)
5784 		goto out_unlock;
5785 	wc->refs[level - 1] = 0;
5786 	wc->flags[level - 1] = 0;
5787 	wc->lookup_info = 1;
5788 	ret = 1;
5789 
5790 out_unlock:
5791 	btrfs_tree_unlock(next);
5792 	free_extent_buffer(next);
5793 
5794 	return ret;
5795 }
5796 
5797 /*
5798  * helper to process tree block while walking up the tree.
5799  *
5800  * when wc->stage == DROP_REFERENCE, this function drops
5801  * reference count on the block.
5802  *
5803  * when wc->stage == UPDATE_BACKREF, this function changes
5804  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5805  * to UPDATE_BACKREF previously while processing the block.
5806  *
5807  * NOTE: return value 1 means we should stop walking up.
5808  */
walk_up_proc(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc)5809 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5810 				 struct btrfs_root *root,
5811 				 struct btrfs_path *path,
5812 				 struct walk_control *wc)
5813 {
5814 	struct btrfs_fs_info *fs_info = root->fs_info;
5815 	int ret = 0;
5816 	int level = wc->level;
5817 	struct extent_buffer *eb = path->nodes[level];
5818 	u64 parent = 0;
5819 
5820 	if (wc->stage == UPDATE_BACKREF) {
5821 		ASSERT(wc->shared_level >= level);
5822 		if (level < wc->shared_level)
5823 			goto out;
5824 
5825 		ret = find_next_key(path, level + 1, &wc->update_progress);
5826 		if (ret > 0)
5827 			wc->update_ref = 0;
5828 
5829 		wc->stage = DROP_REFERENCE;
5830 		wc->shared_level = -1;
5831 		path->slots[level] = 0;
5832 
5833 		/*
5834 		 * check reference count again if the block isn't locked.
5835 		 * we should start walking down the tree again if reference
5836 		 * count is one.
5837 		 */
5838 		if (!path->locks[level]) {
5839 			ASSERT(level > 0);
5840 			btrfs_tree_lock(eb);
5841 			path->locks[level] = BTRFS_WRITE_LOCK;
5842 
5843 			ret = btrfs_lookup_extent_info(trans, fs_info,
5844 						       eb->start, level, 1,
5845 						       &wc->refs[level],
5846 						       &wc->flags[level],
5847 						       NULL);
5848 			if (ret < 0) {
5849 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5850 				path->locks[level] = 0;
5851 				return ret;
5852 			}
5853 			if (unlikely(wc->refs[level] == 0)) {
5854 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5855 				btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5856 					  eb->start);
5857 				return -EUCLEAN;
5858 			}
5859 			if (wc->refs[level] == 1) {
5860 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5861 				path->locks[level] = 0;
5862 				return 1;
5863 			}
5864 		}
5865 	}
5866 
5867 	/* wc->stage == DROP_REFERENCE */
5868 	ASSERT(path->locks[level] || wc->refs[level] == 1);
5869 
5870 	if (wc->refs[level] == 1) {
5871 		if (level == 0) {
5872 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5873 				ret = btrfs_dec_ref(trans, root, eb, 1);
5874 			else
5875 				ret = btrfs_dec_ref(trans, root, eb, 0);
5876 			if (ret) {
5877 				btrfs_abort_transaction(trans, ret);
5878 				return ret;
5879 			}
5880 			if (is_fstree(btrfs_root_id(root))) {
5881 				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5882 				if (ret) {
5883 					btrfs_err_rl(fs_info,
5884 	"error %d accounting leaf items, quota is out of sync, rescan required",
5885 					     ret);
5886 				}
5887 			}
5888 		}
5889 		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5890 		if (!path->locks[level]) {
5891 			btrfs_tree_lock(eb);
5892 			path->locks[level] = BTRFS_WRITE_LOCK;
5893 		}
5894 		btrfs_clear_buffer_dirty(trans, eb);
5895 	}
5896 
5897 	if (eb == root->node) {
5898 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5899 			parent = eb->start;
5900 		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
5901 			goto owner_mismatch;
5902 	} else {
5903 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5904 			parent = path->nodes[level + 1]->start;
5905 		else if (btrfs_root_id(root) !=
5906 			 btrfs_header_owner(path->nodes[level + 1]))
5907 			goto owner_mismatch;
5908 	}
5909 
5910 	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5911 				    wc->refs[level] == 1);
5912 	if (ret < 0)
5913 		btrfs_abort_transaction(trans, ret);
5914 out:
5915 	wc->refs[level] = 0;
5916 	wc->flags[level] = 0;
5917 	return ret;
5918 
5919 owner_mismatch:
5920 	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5921 		     btrfs_header_owner(eb), btrfs_root_id(root));
5922 	return -EUCLEAN;
5923 }
5924 
5925 /*
5926  * walk_down_tree consists of two steps.
5927  *
5928  * walk_down_proc().  Look up the reference count and reference of our current
5929  * wc->level.  At this point path->nodes[wc->level] should be populated and
5930  * uptodate, and in most cases should already be locked.  If we are in
5931  * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
5932  * we can walk back up the tree.  If we are UPDATE_BACKREF we have to set
5933  * FULL_BACKREF on this node if it's not already set, and then do the
5934  * FULL_BACKREF conversion dance, which is to drop the root reference and add
5935  * the shared reference to all of this nodes children.
5936  *
5937  * do_walk_down().  This is where we actually start iterating on the children of
5938  * our current path->nodes[wc->level].  For DROP_REFERENCE that means dropping
5939  * our reference to the children that return false from visit_node_for_delete(),
5940  * which has various conditions where we know we can just drop our reference
5941  * without visiting the node.  For UPDATE_BACKREF we will skip any children that
5942  * visit_node_for_delete() returns false for, only walking down when necessary.
5943  * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
5944  * snapshot deletion.
5945  */
walk_down_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc)5946 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5947 				   struct btrfs_root *root,
5948 				   struct btrfs_path *path,
5949 				   struct walk_control *wc)
5950 {
5951 	int level = wc->level;
5952 	int ret = 0;
5953 
5954 	wc->lookup_info = 1;
5955 	while (level >= 0) {
5956 		ret = walk_down_proc(trans, root, path, wc);
5957 		if (ret)
5958 			break;
5959 
5960 		if (level == 0)
5961 			break;
5962 
5963 		if (path->slots[level] >=
5964 		    btrfs_header_nritems(path->nodes[level]))
5965 			break;
5966 
5967 		ret = do_walk_down(trans, root, path, wc);
5968 		if (ret > 0) {
5969 			path->slots[level]++;
5970 			continue;
5971 		} else if (ret < 0)
5972 			break;
5973 		level = wc->level;
5974 	}
5975 	return (ret == 1) ? 0 : ret;
5976 }
5977 
5978 /*
5979  * walk_up_tree() is responsible for making sure we visit every slot on our
5980  * current node, and if we're at the end of that node then we call
5981  * walk_up_proc() on our current node which will do one of a few things based on
5982  * our stage.
5983  *
5984  * UPDATE_BACKREF.  If we wc->level is currently less than our wc->shared_level
5985  * then we need to walk back up the tree, and then going back down into the
5986  * other slots via walk_down_tree to update any other children from our original
5987  * wc->shared_level.  Once we're at or above our wc->shared_level we can switch
5988  * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
5989  *
5990  * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
5991  * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
5992  * in our current leaf.  After that we call btrfs_free_tree_block() on the
5993  * current node and walk up to the next node to walk down the next slot.
5994  */
walk_up_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct walk_control * wc,int max_level)5995 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5996 				 struct btrfs_root *root,
5997 				 struct btrfs_path *path,
5998 				 struct walk_control *wc, int max_level)
5999 {
6000 	int level = wc->level;
6001 	int ret;
6002 
6003 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6004 	while (level < max_level && path->nodes[level]) {
6005 		wc->level = level;
6006 		if (path->slots[level] + 1 <
6007 		    btrfs_header_nritems(path->nodes[level])) {
6008 			path->slots[level]++;
6009 			return 0;
6010 		} else {
6011 			ret = walk_up_proc(trans, root, path, wc);
6012 			if (ret > 0)
6013 				return 0;
6014 			if (ret < 0)
6015 				return ret;
6016 
6017 			if (path->locks[level]) {
6018 				btrfs_tree_unlock_rw(path->nodes[level],
6019 						     path->locks[level]);
6020 				path->locks[level] = 0;
6021 			}
6022 			free_extent_buffer(path->nodes[level]);
6023 			path->nodes[level] = NULL;
6024 			level++;
6025 		}
6026 	}
6027 	return 1;
6028 }
6029 
6030 /*
6031  * drop a subvolume tree.
6032  *
6033  * this function traverses the tree freeing any blocks that only
6034  * referenced by the tree.
6035  *
6036  * when a shared tree block is found. this function decreases its
6037  * reference count by one. if update_ref is true, this function
6038  * also make sure backrefs for the shared block and all lower level
6039  * blocks are properly updated.
6040  *
6041  * If called with for_reloc == 0, may exit early with -EAGAIN
6042  */
btrfs_drop_snapshot(struct btrfs_root * root,int update_ref,int for_reloc)6043 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
6044 {
6045 	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
6046 	struct btrfs_fs_info *fs_info = root->fs_info;
6047 	struct btrfs_path *path;
6048 	struct btrfs_trans_handle *trans;
6049 	struct btrfs_root *tree_root = fs_info->tree_root;
6050 	struct btrfs_root_item *root_item = &root->root_item;
6051 	struct walk_control *wc;
6052 	struct btrfs_key key;
6053 	const u64 rootid = btrfs_root_id(root);
6054 	int ret = 0;
6055 	int level;
6056 	bool root_dropped = false;
6057 	bool unfinished_drop = false;
6058 
6059 	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
6060 
6061 	path = btrfs_alloc_path();
6062 	if (!path) {
6063 		ret = -ENOMEM;
6064 		goto out;
6065 	}
6066 
6067 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6068 	if (!wc) {
6069 		btrfs_free_path(path);
6070 		ret = -ENOMEM;
6071 		goto out;
6072 	}
6073 
6074 	/*
6075 	 * Use join to avoid potential EINTR from transaction start. See
6076 	 * wait_reserve_ticket and the whole reservation callchain.
6077 	 */
6078 	if (for_reloc)
6079 		trans = btrfs_join_transaction(tree_root);
6080 	else
6081 		trans = btrfs_start_transaction(tree_root, 0);
6082 	if (IS_ERR(trans)) {
6083 		ret = PTR_ERR(trans);
6084 		goto out_free;
6085 	}
6086 
6087 	ret = btrfs_run_delayed_items(trans);
6088 	if (ret)
6089 		goto out_end_trans;
6090 
6091 	/*
6092 	 * This will help us catch people modifying the fs tree while we're
6093 	 * dropping it.  It is unsafe to mess with the fs tree while it's being
6094 	 * dropped as we unlock the root node and parent nodes as we walk down
6095 	 * the tree, assuming nothing will change.  If something does change
6096 	 * then we'll have stale information and drop references to blocks we've
6097 	 * already dropped.
6098 	 */
6099 	set_bit(BTRFS_ROOT_DELETING, &root->state);
6100 	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
6101 
6102 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6103 		level = btrfs_header_level(root->node);
6104 		path->nodes[level] = btrfs_lock_root_node(root);
6105 		path->slots[level] = 0;
6106 		path->locks[level] = BTRFS_WRITE_LOCK;
6107 		memset(&wc->update_progress, 0,
6108 		       sizeof(wc->update_progress));
6109 	} else {
6110 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6111 		memcpy(&wc->update_progress, &key,
6112 		       sizeof(wc->update_progress));
6113 
6114 		level = btrfs_root_drop_level(root_item);
6115 		BUG_ON(level == 0);
6116 		path->lowest_level = level;
6117 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6118 		path->lowest_level = 0;
6119 		if (ret < 0)
6120 			goto out_end_trans;
6121 
6122 		WARN_ON(ret > 0);
6123 		ret = 0;
6124 
6125 		/*
6126 		 * unlock our path, this is safe because only this
6127 		 * function is allowed to delete this snapshot
6128 		 */
6129 		btrfs_unlock_up_safe(path, 0);
6130 
6131 		level = btrfs_header_level(root->node);
6132 		while (1) {
6133 			btrfs_tree_lock(path->nodes[level]);
6134 			path->locks[level] = BTRFS_WRITE_LOCK;
6135 
6136 			/*
6137 			 * btrfs_lookup_extent_info() returns 0 for success,
6138 			 * or < 0 for error.
6139 			 */
6140 			ret = btrfs_lookup_extent_info(trans, fs_info,
6141 						path->nodes[level]->start,
6142 						level, 1, &wc->refs[level],
6143 						&wc->flags[level], NULL);
6144 			if (ret < 0)
6145 				goto out_end_trans;
6146 
6147 			BUG_ON(wc->refs[level] == 0);
6148 
6149 			if (level == btrfs_root_drop_level(root_item))
6150 				break;
6151 
6152 			btrfs_tree_unlock(path->nodes[level]);
6153 			path->locks[level] = 0;
6154 			WARN_ON(wc->refs[level] != 1);
6155 			level--;
6156 		}
6157 	}
6158 
6159 	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
6160 	wc->level = level;
6161 	wc->shared_level = -1;
6162 	wc->stage = DROP_REFERENCE;
6163 	wc->update_ref = update_ref;
6164 	wc->keep_locks = 0;
6165 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6166 
6167 	while (1) {
6168 
6169 		ret = walk_down_tree(trans, root, path, wc);
6170 		if (ret < 0) {
6171 			btrfs_abort_transaction(trans, ret);
6172 			break;
6173 		}
6174 
6175 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6176 		if (ret < 0) {
6177 			btrfs_abort_transaction(trans, ret);
6178 			break;
6179 		}
6180 
6181 		if (ret > 0) {
6182 			BUG_ON(wc->stage != DROP_REFERENCE);
6183 			ret = 0;
6184 			break;
6185 		}
6186 
6187 		if (wc->stage == DROP_REFERENCE) {
6188 			wc->drop_level = wc->level;
6189 			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
6190 					      &wc->drop_progress,
6191 					      path->slots[wc->drop_level]);
6192 		}
6193 		btrfs_cpu_key_to_disk(&root_item->drop_progress,
6194 				      &wc->drop_progress);
6195 		btrfs_set_root_drop_level(root_item, wc->drop_level);
6196 
6197 		BUG_ON(wc->level == 0);
6198 		if (btrfs_should_end_transaction(trans) ||
6199 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
6200 			ret = btrfs_update_root(trans, tree_root,
6201 						&root->root_key,
6202 						root_item);
6203 			if (ret) {
6204 				btrfs_abort_transaction(trans, ret);
6205 				goto out_end_trans;
6206 			}
6207 
6208 			if (!is_reloc_root)
6209 				btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6210 
6211 			btrfs_end_transaction_throttle(trans);
6212 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
6213 				btrfs_debug(fs_info,
6214 					    "drop snapshot early exit");
6215 				ret = -EAGAIN;
6216 				goto out_free;
6217 			}
6218 
6219 		       /*
6220 			* Use join to avoid potential EINTR from transaction
6221 			* start. See wait_reserve_ticket and the whole
6222 			* reservation callchain.
6223 			*/
6224 			if (for_reloc)
6225 				trans = btrfs_join_transaction(tree_root);
6226 			else
6227 				trans = btrfs_start_transaction(tree_root, 0);
6228 			if (IS_ERR(trans)) {
6229 				ret = PTR_ERR(trans);
6230 				goto out_free;
6231 			}
6232 		}
6233 	}
6234 	btrfs_release_path(path);
6235 	if (ret)
6236 		goto out_end_trans;
6237 
6238 	ret = btrfs_del_root(trans, &root->root_key);
6239 	if (ret) {
6240 		btrfs_abort_transaction(trans, ret);
6241 		goto out_end_trans;
6242 	}
6243 
6244 	if (!is_reloc_root) {
6245 		ret = btrfs_find_root(tree_root, &root->root_key, path,
6246 				      NULL, NULL);
6247 		if (ret < 0) {
6248 			btrfs_abort_transaction(trans, ret);
6249 			goto out_end_trans;
6250 		} else if (ret > 0) {
6251 			ret = 0;
6252 			/*
6253 			 * If we fail to delete the orphan item this time
6254 			 * around, it'll get picked up the next time.
6255 			 *
6256 			 * The most common failure here is just -ENOENT.
6257 			 */
6258 			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
6259 		}
6260 	}
6261 
6262 	/*
6263 	 * This subvolume is going to be completely dropped, and won't be
6264 	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
6265 	 * commit transaction time.  So free it here manually.
6266 	 */
6267 	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6268 	btrfs_qgroup_free_meta_all_pertrans(root);
6269 
6270 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
6271 		btrfs_add_dropped_root(trans, root);
6272 	else
6273 		btrfs_put_root(root);
6274 	root_dropped = true;
6275 out_end_trans:
6276 	if (!is_reloc_root)
6277 		btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6278 
6279 	btrfs_end_transaction_throttle(trans);
6280 out_free:
6281 	kfree(wc);
6282 	btrfs_free_path(path);
6283 out:
6284 	if (!ret && root_dropped) {
6285 		ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
6286 		if (ret < 0)
6287 			btrfs_warn_rl(fs_info,
6288 				      "failed to cleanup qgroup 0/%llu: %d",
6289 				      rootid, ret);
6290 		ret = 0;
6291 	}
6292 	/*
6293 	 * We were an unfinished drop root, check to see if there are any
6294 	 * pending, and if not clear and wake up any waiters.
6295 	 */
6296 	if (!ret && unfinished_drop)
6297 		btrfs_maybe_wake_unfinished_drop(fs_info);
6298 
6299 	/*
6300 	 * So if we need to stop dropping the snapshot for whatever reason we
6301 	 * need to make sure to add it back to the dead root list so that we
6302 	 * keep trying to do the work later.  This also cleans up roots if we
6303 	 * don't have it in the radix (like when we recover after a power fail
6304 	 * or unmount) so we don't leak memory.
6305 	 */
6306 	if (!for_reloc && !root_dropped)
6307 		btrfs_add_dead_root(root);
6308 	return ret;
6309 }
6310 
6311 /*
6312  * drop subtree rooted at tree block 'node'.
6313  *
6314  * NOTE: this function will unlock and release tree block 'node'
6315  * only used by relocation code
6316  */
btrfs_drop_subtree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct extent_buffer * node,struct extent_buffer * parent)6317 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6318 			struct btrfs_root *root,
6319 			struct extent_buffer *node,
6320 			struct extent_buffer *parent)
6321 {
6322 	struct btrfs_fs_info *fs_info = root->fs_info;
6323 	struct btrfs_path *path;
6324 	struct walk_control *wc;
6325 	int level;
6326 	int parent_level;
6327 	int ret = 0;
6328 
6329 	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
6330 
6331 	path = btrfs_alloc_path();
6332 	if (!path)
6333 		return -ENOMEM;
6334 
6335 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6336 	if (!wc) {
6337 		btrfs_free_path(path);
6338 		return -ENOMEM;
6339 	}
6340 
6341 	btrfs_assert_tree_write_locked(parent);
6342 	parent_level = btrfs_header_level(parent);
6343 	atomic_inc(&parent->refs);
6344 	path->nodes[parent_level] = parent;
6345 	path->slots[parent_level] = btrfs_header_nritems(parent);
6346 
6347 	btrfs_assert_tree_write_locked(node);
6348 	level = btrfs_header_level(node);
6349 	path->nodes[level] = node;
6350 	path->slots[level] = 0;
6351 	path->locks[level] = BTRFS_WRITE_LOCK;
6352 
6353 	wc->refs[parent_level] = 1;
6354 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6355 	wc->level = level;
6356 	wc->shared_level = -1;
6357 	wc->stage = DROP_REFERENCE;
6358 	wc->update_ref = 0;
6359 	wc->keep_locks = 1;
6360 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6361 
6362 	while (1) {
6363 		ret = walk_down_tree(trans, root, path, wc);
6364 		if (ret < 0)
6365 			break;
6366 
6367 		ret = walk_up_tree(trans, root, path, wc, parent_level);
6368 		if (ret) {
6369 			if (ret > 0)
6370 				ret = 0;
6371 			break;
6372 		}
6373 	}
6374 
6375 	kfree(wc);
6376 	btrfs_free_path(path);
6377 	return ret;
6378 }
6379 
6380 /*
6381  * Unpin the extent range in an error context and don't add the space back.
6382  * Errors are not propagated further.
6383  */
btrfs_error_unpin_extent_range(struct btrfs_fs_info * fs_info,u64 start,u64 end)6384 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
6385 {
6386 	unpin_extent_range(fs_info, start, end, false);
6387 }
6388 
6389 /*
6390  * It used to be that old block groups would be left around forever.
6391  * Iterating over them would be enough to trim unused space.  Since we
6392  * now automatically remove them, we also need to iterate over unallocated
6393  * space.
6394  *
6395  * We don't want a transaction for this since the discard may take a
6396  * substantial amount of time.  We don't require that a transaction be
6397  * running, but we do need to take a running transaction into account
6398  * to ensure that we're not discarding chunks that were released or
6399  * allocated in the current transaction.
6400  *
6401  * Holding the chunks lock will prevent other threads from allocating
6402  * or releasing chunks, but it won't prevent a running transaction
6403  * from committing and releasing the memory that the pending chunks
6404  * list head uses.  For that, we need to take a reference to the
6405  * transaction and hold the commit root sem.  We only need to hold
6406  * it while performing the free space search since we have already
6407  * held back allocations.
6408  */
btrfs_trim_free_extents(struct btrfs_device * device,u64 * trimmed)6409 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6410 {
6411 	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6412 	int ret;
6413 
6414 	*trimmed = 0;
6415 
6416 	/* Discard not supported = nothing to do. */
6417 	if (!bdev_max_discard_sectors(device->bdev))
6418 		return 0;
6419 
6420 	/* Not writable = nothing to do. */
6421 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6422 		return 0;
6423 
6424 	/* No free space = nothing to do. */
6425 	if (device->total_bytes <= device->bytes_used)
6426 		return 0;
6427 
6428 	ret = 0;
6429 
6430 	while (1) {
6431 		struct btrfs_fs_info *fs_info = device->fs_info;
6432 		u64 bytes;
6433 
6434 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6435 		if (ret)
6436 			break;
6437 
6438 		find_first_clear_extent_bit(&device->alloc_state, start,
6439 					    &start, &end,
6440 					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
6441 
6442 		/* Check if there are any CHUNK_* bits left */
6443 		if (start > device->total_bytes) {
6444 			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6445 			btrfs_warn_in_rcu(fs_info,
6446 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6447 					  start, end - start + 1,
6448 					  btrfs_dev_name(device),
6449 					  device->total_bytes);
6450 			mutex_unlock(&fs_info->chunk_mutex);
6451 			ret = 0;
6452 			break;
6453 		}
6454 
6455 		/* Ensure we skip the reserved space on each device. */
6456 		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6457 
6458 		/*
6459 		 * If find_first_clear_extent_bit find a range that spans the
6460 		 * end of the device it will set end to -1, in this case it's up
6461 		 * to the caller to trim the value to the size of the device.
6462 		 */
6463 		end = min(end, device->total_bytes - 1);
6464 
6465 		len = end - start + 1;
6466 
6467 		/* We didn't find any extents */
6468 		if (!len) {
6469 			mutex_unlock(&fs_info->chunk_mutex);
6470 			ret = 0;
6471 			break;
6472 		}
6473 
6474 		ret = btrfs_issue_discard(device->bdev, start, len,
6475 					  &bytes);
6476 		if (!ret)
6477 			set_extent_bit(&device->alloc_state, start,
6478 				       start + bytes - 1, CHUNK_TRIMMED, NULL);
6479 		mutex_unlock(&fs_info->chunk_mutex);
6480 
6481 		if (ret)
6482 			break;
6483 
6484 		start += len;
6485 		*trimmed += bytes;
6486 
6487 		if (btrfs_trim_interrupted()) {
6488 			ret = -ERESTARTSYS;
6489 			break;
6490 		}
6491 
6492 		cond_resched();
6493 	}
6494 
6495 	return ret;
6496 }
6497 
6498 /*
6499  * Trim the whole filesystem by:
6500  * 1) trimming the free space in each block group
6501  * 2) trimming the unallocated space on each device
6502  *
6503  * This will also continue trimming even if a block group or device encounters
6504  * an error.  The return value will be the last error, or 0 if nothing bad
6505  * happens.
6506  */
btrfs_trim_fs(struct btrfs_fs_info * fs_info,struct fstrim_range * range)6507 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6508 {
6509 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6510 	struct btrfs_block_group *cache = NULL;
6511 	struct btrfs_device *device;
6512 	u64 group_trimmed;
6513 	u64 range_end = U64_MAX;
6514 	u64 start;
6515 	u64 end;
6516 	u64 trimmed = 0;
6517 	u64 bg_failed = 0;
6518 	u64 dev_failed = 0;
6519 	int bg_ret = 0;
6520 	int dev_ret = 0;
6521 	int ret = 0;
6522 
6523 	if (range->start == U64_MAX)
6524 		return -EINVAL;
6525 
6526 	/*
6527 	 * Check range overflow if range->len is set.
6528 	 * The default range->len is U64_MAX.
6529 	 */
6530 	if (range->len != U64_MAX &&
6531 	    check_add_overflow(range->start, range->len, &range_end))
6532 		return -EINVAL;
6533 
6534 	cache = btrfs_lookup_first_block_group(fs_info, range->start);
6535 	for (; cache; cache = btrfs_next_block_group(cache)) {
6536 		if (cache->start >= range_end) {
6537 			btrfs_put_block_group(cache);
6538 			break;
6539 		}
6540 
6541 		start = max(range->start, cache->start);
6542 		end = min(range_end, cache->start + cache->length);
6543 
6544 		if (end - start >= range->minlen) {
6545 			if (!btrfs_block_group_done(cache)) {
6546 				ret = btrfs_cache_block_group(cache, true);
6547 				if (ret) {
6548 					bg_failed++;
6549 					bg_ret = ret;
6550 					continue;
6551 				}
6552 			}
6553 			ret = btrfs_trim_block_group(cache,
6554 						     &group_trimmed,
6555 						     start,
6556 						     end,
6557 						     range->minlen);
6558 
6559 			trimmed += group_trimmed;
6560 			if (ret) {
6561 				bg_failed++;
6562 				bg_ret = ret;
6563 				continue;
6564 			}
6565 		}
6566 	}
6567 
6568 	if (bg_failed)
6569 		btrfs_warn(fs_info,
6570 			"failed to trim %llu block group(s), last error %d",
6571 			bg_failed, bg_ret);
6572 
6573 	mutex_lock(&fs_devices->device_list_mutex);
6574 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6575 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6576 			continue;
6577 
6578 		ret = btrfs_trim_free_extents(device, &group_trimmed);
6579 
6580 		trimmed += group_trimmed;
6581 		if (ret) {
6582 			dev_failed++;
6583 			dev_ret = ret;
6584 			break;
6585 		}
6586 	}
6587 	mutex_unlock(&fs_devices->device_list_mutex);
6588 
6589 	if (dev_failed)
6590 		btrfs_warn(fs_info,
6591 			"failed to trim %llu device(s), last error %d",
6592 			dev_failed, dev_ret);
6593 	range->len = trimmed;
6594 	if (bg_ret)
6595 		return bg_ret;
6596 	return dev_ret;
6597 }
6598