• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34 
35 /*
36  * backref_node, mapping_node and tree_block start with this
37  */
struct tree_entry {
	struct rb_node rb_node;	/* link in a bytenr-keyed rb tree */
	u64 bytenr;		/* logical address of the tree block */
};
42 
/*
 * present a tree block in the backref cache
 */
struct backref_node {
	struct rb_node rb_node;	/* link in backref_cache::rb_root, keyed by bytenr */
	u64 bytenr;		/* logical address of the tree block */

	/* new location of the block after it has been cowed */
	u64 new_bytenr;
	/* objectid of tree block owner, can be not uptodate */
	u64 owner;
	/* link to pending, changed or detached list */
	struct list_head list;
	/* list of upper level blocks that reference this block */
	struct list_head upper;
	/* list of child blocks in the cache */
	struct list_head lower;
	/* NULL if this node is not tree root */
	struct btrfs_root *root;
	/* extent buffer got by COW the block */
	struct extent_buffer *eb;
	/* level of tree block */
	unsigned int level:8;
	/* is the block in non-reference counted tree */
	unsigned int cowonly:1;
	/* 1 if no child node in the cache */
	unsigned int lowest:1;
	/* is the extent buffer locked */
	unsigned int locked:1;
	/* has the block been processed */
	unsigned int processed:1;
	/* have backrefs of this block been checked */
	unsigned int checked:1;
	/*
	 * 1 if corresponding block has been cowed but some upper
	 * level block pointers may not point to the new location
	 */
	unsigned int pending:1;
	/*
	 * 1 if the backref node isn't connected to any other
	 * backref node.
	 */
	unsigned int detached:1;
};
86 
/*
 * present a block pointer in the backref cache
 *
 * Both arrays are indexed by LOWER/UPPER: node[LOWER] is the child
 * block, node[UPPER] is the parent block that references it.
 * list[LOWER] links the edge into the child's 'upper' list;
 * list[UPPER] links it into the parent's 'lower' list.
 */
struct backref_edge {
	struct list_head list[2];
	struct backref_node *node[2];
};
94 
#define LOWER	0	/* child end of a backref_edge */
#define UPPER	1	/* parent end of a backref_edge */
#define RELOCATION_RESERVED_NODES	256
98 
/*
 * cache of backref nodes built up while relocating tree blocks
 */
struct backref_cache {
	/* red black tree of all backref nodes in the cache */
	struct rb_root rb_root;
	/* for passing backref nodes to btrfs_reloc_cow_block */
	struct backref_node *path[BTRFS_MAX_LEVEL];
	/*
	 * list of blocks that have been cowed but some block
	 * pointers in upper level blocks may not reflect the
	 * new location
	 */
	struct list_head pending[BTRFS_MAX_LEVEL];
	/* list of backref nodes with no child node */
	struct list_head leaves;
	/* list of blocks that have been cowed in current transaction */
	struct list_head changed;
	/* list of detached backref node. */
	struct list_head detached;

	/* transid the cache was last updated for; 0 means "not yet set" */
	u64 last_trans;

	/* counts of live nodes/edges, used to detect leaks at cleanup */
	int nr_nodes;
	int nr_edges;
};
122 
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct rb_node rb_node;	/* link in mapping_tree::rb_root, keyed by bytenr */
	u64 bytenr;		/* address of the tree root block */
	void *data;		/* the mapped tree (stored as struct btrfs_root *) */
};
131 
struct mapping_tree {
	struct rb_root rb_root;	/* tree of mapping_node, keyed by bytenr */
	spinlock_t lock;	/* protects rb_root */
};
136 
/*
 * present a tree block to process
 */
struct tree_block {
	struct rb_node rb_node;	/* link in a bytenr-keyed rb tree */
	u64 bytenr;		/* logical address of the block */
	/* block key; presumably only valid once key_ready is set — confirm */
	struct btrfs_key key;
	unsigned int level:8;	/* level of the tree block */
	unsigned int key_ready:1;	/* 1 once 'key' has been filled in */
};
147 
/* max number of extents a file extent cluster can track */
#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;	/* start offset of the cluster */
	u64 end;	/* end offset of the cluster */
	/* NOTE(review): presumably start offsets of member extents — confirm */
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;	/* number of extents in the cluster */
};
156 
/*
 * state for one relocation run
 */
struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group_cache *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	/* NOTE(review): presumably the current scan position — confirm */
	u64 search_start;
	u64 extents_found;

	/* current stage: MOVE_DATA_EXTENTS or UPDATE_DATA_PTRS */
	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1
195 
196 static void remove_backref_node(struct backref_cache *cache,
197 				struct backref_node *node);
198 static void __mark_block_processed(struct reloc_control *rc,
199 				   struct backref_node *node);
200 
mapping_tree_init(struct mapping_tree * tree)201 static void mapping_tree_init(struct mapping_tree *tree)
202 {
203 	tree->rb_root = RB_ROOT;
204 	spin_lock_init(&tree->lock);
205 }
206 
backref_cache_init(struct backref_cache * cache)207 static void backref_cache_init(struct backref_cache *cache)
208 {
209 	int i;
210 	cache->rb_root = RB_ROOT;
211 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
212 		INIT_LIST_HEAD(&cache->pending[i]);
213 	INIT_LIST_HEAD(&cache->changed);
214 	INIT_LIST_HEAD(&cache->detached);
215 	INIT_LIST_HEAD(&cache->leaves);
216 }
217 
/*
 * Release every node still held by the cache and verify the cache is
 * completely empty afterwards (all lists drained, rb tree empty,
 * node/edge counters back to zero).
 */
static void backref_cache_cleanup(struct backref_cache *cache)
{
	struct backref_node *node;
	int i;

	/* detached nodes are linked into the cache via node->list */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	/* leaf nodes are linked into the cache via node->lower */
	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct backref_node, lower);
		remove_backref_node(cache, node);
	}

	cache->last_trans = 0;

	/* anything left at this point is a node or edge leak */
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		BUG_ON(!list_empty(&cache->pending[i]));
	BUG_ON(!list_empty(&cache->changed));
	BUG_ON(!list_empty(&cache->detached));
	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
	BUG_ON(cache->nr_nodes);
	BUG_ON(cache->nr_edges);
}
245 
alloc_backref_node(struct backref_cache * cache)246 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
247 {
248 	struct backref_node *node;
249 
250 	node = kzalloc(sizeof(*node), GFP_NOFS);
251 	if (node) {
252 		INIT_LIST_HEAD(&node->list);
253 		INIT_LIST_HEAD(&node->upper);
254 		INIT_LIST_HEAD(&node->lower);
255 		RB_CLEAR_NODE(&node->rb_node);
256 		cache->nr_nodes++;
257 	}
258 	return node;
259 }
260 
free_backref_node(struct backref_cache * cache,struct backref_node * node)261 static void free_backref_node(struct backref_cache *cache,
262 			      struct backref_node *node)
263 {
264 	if (node) {
265 		cache->nr_nodes--;
266 		kfree(node);
267 	}
268 }
269 
alloc_backref_edge(struct backref_cache * cache)270 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
271 {
272 	struct backref_edge *edge;
273 
274 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
275 	if (edge)
276 		cache->nr_edges++;
277 	return edge;
278 }
279 
free_backref_edge(struct backref_cache * cache,struct backref_edge * edge)280 static void free_backref_edge(struct backref_cache *cache,
281 			      struct backref_edge *edge)
282 {
283 	if (edge) {
284 		cache->nr_edges--;
285 		kfree(edge);
286 	}
287 }
288 
/*
 * Insert 'node' into a bytenr-keyed rb tree of tree_entry structs.
 * Returns NULL on success, or the existing conflicting rb_node when
 * an entry with the same bytenr is already present.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
				   struct rb_node *node)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct tree_entry *entry;

		parent = *link;
		entry = rb_entry(parent, struct tree_entry, rb_node);
		if (bytenr < entry->bytenr)
			link = &parent->rb_left;
		else if (bytenr > entry->bytenr)
			link = &parent->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, link);
	rb_insert_color(node, root);
	return NULL;
}
312 
/*
 * Look up the tree_entry with the given bytenr.  Returns its rb_node,
 * or NULL when no such entry exists.
 */
static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct tree_entry *entry;

		entry = rb_entry(node, struct tree_entry, rb_node);
		if (bytenr < entry->bytenr)
			node = node->rb_left;
		else if (bytenr > entry->bytenr)
			node = node->rb_right;
		else
			return node;
	}
	return NULL;
}
330 
/*
 * An insert into the backref cache collided with an existing node at
 * 'bytenr'.  This is an unrecoverable inconsistency, so panic.
 */
static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{

	struct btrfs_fs_info *fs_info = NULL;
	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
					      rb_node);
	/* fs_info stays NULL if the conflicting node has no root attached */
	if (bnode->root)
		fs_info = bnode->root->fs_info;
	btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
		    "found at offset %llu", bytenr);
}
342 
343 /*
344  * walk up backref nodes until reach node presents tree root
345  */
walk_up_backref(struct backref_node * node,struct backref_edge * edges[],int * index)346 static struct backref_node *walk_up_backref(struct backref_node *node,
347 					    struct backref_edge *edges[],
348 					    int *index)
349 {
350 	struct backref_edge *edge;
351 	int idx = *index;
352 
353 	while (!list_empty(&node->upper)) {
354 		edge = list_entry(node->upper.next,
355 				  struct backref_edge, list[LOWER]);
356 		edges[idx++] = edge;
357 		node = edge->node[UPPER];
358 	}
359 	BUG_ON(node->detached);
360 	*index = idx;
361 	return node;
362 }
363 
/*
 * walk down backref nodes to find start of next reference path
 *
 * edges[0 .. *index-1] holds the path recorded by walk_up_backref().
 * Backtrack to the deepest edge that still has an unvisited sibling
 * (another edge referencing the same lower node), switch to that
 * sibling and return its upper node.  Returns NULL and resets *index
 * when every path has been exhausted.
 */
static struct backref_node *walk_down_backref(struct backref_edge *edges[],
					      int *index)
{
	struct backref_edge *edge;
	struct backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		/* no more siblings at this level, pop one level */
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		/* advance to the next edge referencing 'lower' */
		edge = list_entry(edge->list[LOWER].next,
				  struct backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
390 
unlock_node_buffer(struct backref_node * node)391 static void unlock_node_buffer(struct backref_node *node)
392 {
393 	if (node->locked) {
394 		btrfs_tree_unlock(node->eb);
395 		node->locked = 0;
396 	}
397 }
398 
drop_node_buffer(struct backref_node * node)399 static void drop_node_buffer(struct backref_node *node)
400 {
401 	if (node->eb) {
402 		unlock_node_buffer(node);
403 		free_extent_buffer(node->eb);
404 		node->eb = NULL;
405 	}
406 }
407 
/*
 * Unlink a node from all cache lists and the rb tree and free it.
 * The caller must already have detached every edge above the node.
 */
static void drop_backref_node(struct backref_cache *tree,
			      struct backref_node *node)
{
	BUG_ON(!list_empty(&node->upper));

	drop_node_buffer(node);
	list_del(&node->list);
	list_del(&node->lower);
	/* only erase from the rb tree if the node was actually inserted */
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	free_backref_node(tree, node);
}
420 
/*
 * remove a backref node from the backref cache
 *
 * Frees every edge above the node.  When the removed edge was the
 * only child of an upper node that was never inserted into the rb
 * tree, removal continues with that upper node in its place;
 * otherwise an upper node left without cached children goes back on
 * the leaves list.
 */
static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node)
{
	struct backref_node *upper;
	struct backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		free_backref_edge(cache, edge);

		if (RB_EMPTY_NODE(&upper->rb_node)) {
			/* upper isn't cached; drop 'node' and keep removing
			 * with 'upper' as the new victim */
			BUG_ON(!list_empty(&node->upper));
			drop_backref_node(cache, node);
			node = upper;
			node->lowest = 1;
			continue;
		}
		/*
		 * add the node to leaf node list if no other
		 * child block cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	drop_backref_node(cache, node);
}
461 
/*
 * re-key a cached backref node to a new block address
 */
static void update_backref_node(struct backref_cache *cache,
				struct backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	/* a duplicate bytenr means the cache is corrupted */
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, bytenr);
}
472 
/*
 * update backref cache after a transaction commit
 *
 * Returns 1 when the cache was refreshed (stale detached nodes
 * dropped, cowed nodes re-keyed), 0 when the cache is still valid for
 * the current transaction.
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct backref_cache *cache)
{
	struct backref_node *node;
	int level = 0;

	/* first use in this run: just record the transid */
	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	/* no commit since the last update, nothing to do */
	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookup. transaction commit changes the extent tree.
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	/* re-key cowed nodes to their new block addresses */
	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors during processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}
525 
526 
should_ignore_root(struct btrfs_root * root)527 static int should_ignore_root(struct btrfs_root *root)
528 {
529 	struct btrfs_root *reloc_root;
530 
531 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
532 		return 0;
533 
534 	reloc_root = root->reloc_root;
535 	if (!reloc_root)
536 		return 0;
537 
538 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
539 	    root->fs_info->running_transaction->transid - 1)
540 		return 0;
541 	/*
542 	 * if there is reloc tree and it was created in previous
543 	 * transaction backref lookup can find the reloc tree,
544 	 * so backref node for the fs tree root is useless for
545 	 * relocation.
546 	 */
547 	return 1;
548 }
549 /*
550  * find reloc tree by address of tree root
551  */
find_reloc_root(struct reloc_control * rc,u64 bytenr)552 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
553 					  u64 bytenr)
554 {
555 	struct rb_node *rb_node;
556 	struct mapping_node *node;
557 	struct btrfs_root *root = NULL;
558 
559 	spin_lock(&rc->reloc_root_tree.lock);
560 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
561 	if (rb_node) {
562 		node = rb_entry(rb_node, struct mapping_node, rb_node);
563 		root = (struct btrfs_root *)node->data;
564 	}
565 	spin_unlock(&rc->reloc_root_tree.lock);
566 	return root;
567 }
568 
/*
 * Return 1 if the objectid belongs to a tree that is not reference
 * counted (cow-only), 0 otherwise.
 */
static int is_cowonly_root(u64 root_objectid)
{
	switch (root_objectid) {
	case BTRFS_ROOT_TREE_OBJECTID:
	case BTRFS_EXTENT_TREE_OBJECTID:
	case BTRFS_CHUNK_TREE_OBJECTID:
	case BTRFS_DEV_TREE_OBJECTID:
	case BTRFS_TREE_LOG_OBJECTID:
	case BTRFS_CSUM_TREE_OBJECTID:
	case BTRFS_UUID_TREE_OBJECTID:
	case BTRFS_QUOTA_TREE_OBJECTID:
		return 1;
	default:
		return 0;
	}
}
582 
/*
 * Look up a root by objectid.  Cow-only roots are keyed with offset 0,
 * reference counted roots with offset (u64)-1.
 */
static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
					u64 root_objectid)
{
	struct btrfs_key key = {
		.objectid = root_objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = is_cowonly_root(root_objectid) ? 0 : (u64)-1,
	};

	return btrfs_get_fs_root(fs_info, &key, false);
}
597 
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Resolve the root a v0 extent backref belongs to.  Returns NULL when
 * the ref is stale: the root is reference counted and has moved on
 * past the ref's generation.
 */
static noinline_for_stack
struct btrfs_root *find_tree_root(struct reloc_control *rc,
				  struct extent_buffer *leaf,
				  struct btrfs_extent_ref_v0 *ref0)
{
	struct btrfs_root *root;
	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
	u64 generation = btrfs_ref_generation_v0(leaf, ref0);

	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);

	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
	BUG_ON(IS_ERR(root));

	/* stale ref: the root's generation no longer matches */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    generation != btrfs_root_generation(&root->root_item))
		return NULL;

	return root;
}
#endif
620 
/*
 * Locate the inline backref area of a tree block's extent item.
 *
 * On success returns 0 and sets *ptr/*end to the byte range within
 * the leaf that holds the inline references.  Returns 1 when the item
 * has no inline refs (or is an old-format item); the caller then
 * falls back to keyed backref items.
 */
static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
			unsigned long *ptr, unsigned long *end)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	u32 item_size;

	btrfs_item_key_to_cpu(leaf, &key, slot);

	item_size = btrfs_item_size_nr(leaf, slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* v0 extent items have no inline refs at all */
	if (item_size < sizeof(*ei)) {
		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
		return 1;
	}
#endif
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
		  BTRFS_EXTENT_FLAG_TREE_BLOCK));

	/* EXTENT_ITEM: inline refs would follow the tree_block_info */
	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
	    item_size <= sizeof(*ei) + sizeof(*bi)) {
		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
		return 1;
	}
	/* METADATA_ITEM: inline refs would follow the extent item */
	if (key.type == BTRFS_METADATA_ITEM_KEY &&
	    item_size <= sizeof(*ei)) {
		WARN_ON(item_size < sizeof(*ei));
		return 1;
	}

	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		*ptr = (unsigned long)(bi + 1);
	} else {
		*ptr = (unsigned long)(ei + 1);
	}
	*end = (unsigned long)ei + item_size;
	return 0;
}
663 
664 /*
665  * build backref tree for a given tree block. root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond
667  * roots of b-trees that reference the tree block.
668  *
 * the basic idea of this function is to check backrefs of a given block
 * to find upper level blocks that reference the block, and then check
 * backrefs of these upper level blocks recursively. the recursion stops
 * when tree root is reached or backrefs for the block are cached.
673  *
674  * NOTE: if we find backrefs for a block are cached, we know backrefs
675  * for all upper level blocks that directly/indirectly reference the
676  * block are also cached.
677  */
678 static noinline_for_stack
build_backref_tree(struct reloc_control * rc,struct btrfs_key * node_key,int level,u64 bytenr)679 struct backref_node *build_backref_tree(struct reloc_control *rc,
680 					struct btrfs_key *node_key,
681 					int level, u64 bytenr)
682 {
683 	struct backref_cache *cache = &rc->backref_cache;
684 	struct btrfs_path *path1;
685 	struct btrfs_path *path2;
686 	struct extent_buffer *eb;
687 	struct btrfs_root *root;
688 	struct backref_node *cur;
689 	struct backref_node *upper;
690 	struct backref_node *lower;
691 	struct backref_node *node = NULL;
692 	struct backref_node *exist = NULL;
693 	struct backref_edge *edge;
694 	struct rb_node *rb_node;
695 	struct btrfs_key key;
696 	unsigned long end;
697 	unsigned long ptr;
698 	LIST_HEAD(list);
699 	LIST_HEAD(useless);
700 	int cowonly;
701 	int ret;
702 	int err = 0;
703 	bool need_check = true;
704 
705 	path1 = btrfs_alloc_path();
706 	path2 = btrfs_alloc_path();
707 	if (!path1 || !path2) {
708 		err = -ENOMEM;
709 		goto out;
710 	}
711 	path1->reada = 1;
712 	path2->reada = 2;
713 
714 	node = alloc_backref_node(cache);
715 	if (!node) {
716 		err = -ENOMEM;
717 		goto out;
718 	}
719 
720 	node->bytenr = bytenr;
721 	node->level = level;
722 	node->lowest = 1;
723 	cur = node;
724 again:
725 	end = 0;
726 	ptr = 0;
727 	key.objectid = cur->bytenr;
728 	key.type = BTRFS_METADATA_ITEM_KEY;
729 	key.offset = (u64)-1;
730 
731 	path1->search_commit_root = 1;
732 	path1->skip_locking = 1;
733 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
734 				0, 0);
735 	if (ret < 0) {
736 		err = ret;
737 		goto out;
738 	}
739 	ASSERT(ret);
740 	ASSERT(path1->slots[0]);
741 
742 	path1->slots[0]--;
743 
744 	WARN_ON(cur->checked);
745 	if (!list_empty(&cur->upper)) {
746 		/*
747 		 * the backref was added previously when processing
748 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
749 		 */
750 		ASSERT(list_is_singular(&cur->upper));
751 		edge = list_entry(cur->upper.next, struct backref_edge,
752 				  list[LOWER]);
753 		ASSERT(list_empty(&edge->list[UPPER]));
754 		exist = edge->node[UPPER];
755 		/*
756 		 * add the upper level block to pending list if we need
757 		 * check its backrefs
758 		 */
759 		if (!exist->checked)
760 			list_add_tail(&edge->list[UPPER], &list);
761 	} else {
762 		exist = NULL;
763 	}
764 
765 	while (1) {
766 		cond_resched();
767 		eb = path1->nodes[0];
768 
769 		if (ptr >= end) {
770 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
771 				ret = btrfs_next_leaf(rc->extent_root, path1);
772 				if (ret < 0) {
773 					err = ret;
774 					goto out;
775 				}
776 				if (ret > 0)
777 					break;
778 				eb = path1->nodes[0];
779 			}
780 
781 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
782 			if (key.objectid != cur->bytenr) {
783 				WARN_ON(exist);
784 				break;
785 			}
786 
787 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
788 			    key.type == BTRFS_METADATA_ITEM_KEY) {
789 				ret = find_inline_backref(eb, path1->slots[0],
790 							  &ptr, &end);
791 				if (ret)
792 					goto next;
793 			}
794 		}
795 
796 		if (ptr < end) {
797 			/* update key for inline back ref */
798 			struct btrfs_extent_inline_ref *iref;
799 			iref = (struct btrfs_extent_inline_ref *)ptr;
800 			key.type = btrfs_extent_inline_ref_type(eb, iref);
801 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
802 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
803 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
804 		}
805 
806 		if (exist &&
807 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
808 		      exist->owner == key.offset) ||
809 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
810 		      exist->bytenr == key.offset))) {
811 			exist = NULL;
812 			goto next;
813 		}
814 
815 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
816 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
817 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
818 			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
819 				struct btrfs_extent_ref_v0 *ref0;
820 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
821 						struct btrfs_extent_ref_v0);
822 				if (key.objectid == key.offset) {
823 					root = find_tree_root(rc, eb, ref0);
824 					if (root && !should_ignore_root(root))
825 						cur->root = root;
826 					else
827 						list_add(&cur->list, &useless);
828 					break;
829 				}
830 				if (is_cowonly_root(btrfs_ref_root_v0(eb,
831 								      ref0)))
832 					cur->cowonly = 1;
833 			}
834 #else
835 		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
836 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
837 #endif
838 			if (key.objectid == key.offset) {
839 				/*
840 				 * only root blocks of reloc trees use
841 				 * backref of this type.
842 				 */
843 				root = find_reloc_root(rc, cur->bytenr);
844 				ASSERT(root);
845 				cur->root = root;
846 				break;
847 			}
848 
849 			edge = alloc_backref_edge(cache);
850 			if (!edge) {
851 				err = -ENOMEM;
852 				goto out;
853 			}
854 			rb_node = tree_search(&cache->rb_root, key.offset);
855 			if (!rb_node) {
856 				upper = alloc_backref_node(cache);
857 				if (!upper) {
858 					free_backref_edge(cache, edge);
859 					err = -ENOMEM;
860 					goto out;
861 				}
862 				upper->bytenr = key.offset;
863 				upper->level = cur->level + 1;
864 				/*
865 				 *  backrefs for the upper level block isn't
866 				 *  cached, add the block to pending list
867 				 */
868 				list_add_tail(&edge->list[UPPER], &list);
869 			} else {
870 				upper = rb_entry(rb_node, struct backref_node,
871 						 rb_node);
872 				ASSERT(upper->checked);
873 				INIT_LIST_HEAD(&edge->list[UPPER]);
874 			}
875 			list_add_tail(&edge->list[LOWER], &cur->upper);
876 			edge->node[LOWER] = cur;
877 			edge->node[UPPER] = upper;
878 
879 			goto next;
880 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
881 			goto next;
882 		}
883 
884 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
885 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
886 		if (IS_ERR(root)) {
887 			err = PTR_ERR(root);
888 			goto out;
889 		}
890 
891 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
892 			cur->cowonly = 1;
893 
894 		if (btrfs_root_level(&root->root_item) == cur->level) {
895 			/* tree root */
896 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
897 			       cur->bytenr);
898 			if (should_ignore_root(root))
899 				list_add(&cur->list, &useless);
900 			else
901 				cur->root = root;
902 			break;
903 		}
904 
905 		level = cur->level + 1;
906 
907 		/*
908 		 * searching the tree to find upper level blocks
909 		 * reference the block.
910 		 */
911 		path2->search_commit_root = 1;
912 		path2->skip_locking = 1;
913 		path2->lowest_level = level;
914 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
915 		path2->lowest_level = 0;
916 		if (ret < 0) {
917 			err = ret;
918 			goto out;
919 		}
920 		if (ret > 0 && path2->slots[level] > 0)
921 			path2->slots[level]--;
922 
923 		eb = path2->nodes[level];
924 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
925 		    cur->bytenr) {
926 			btrfs_err(root->fs_info,
927 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
928 				  cur->bytenr, level - 1, root->objectid,
929 				  node_key->objectid, node_key->type,
930 				  node_key->offset);
931 			err = -ENOENT;
932 			goto out;
933 		}
934 		lower = cur;
935 		need_check = true;
936 		for (; level < BTRFS_MAX_LEVEL; level++) {
937 			if (!path2->nodes[level]) {
938 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
939 				       lower->bytenr);
940 				if (should_ignore_root(root))
941 					list_add(&lower->list, &useless);
942 				else
943 					lower->root = root;
944 				break;
945 			}
946 
947 			edge = alloc_backref_edge(cache);
948 			if (!edge) {
949 				err = -ENOMEM;
950 				goto out;
951 			}
952 
953 			eb = path2->nodes[level];
954 			rb_node = tree_search(&cache->rb_root, eb->start);
955 			if (!rb_node) {
956 				upper = alloc_backref_node(cache);
957 				if (!upper) {
958 					free_backref_edge(cache, edge);
959 					err = -ENOMEM;
960 					goto out;
961 				}
962 				upper->bytenr = eb->start;
963 				upper->owner = btrfs_header_owner(eb);
964 				upper->level = lower->level + 1;
965 				if (!test_bit(BTRFS_ROOT_REF_COWS,
966 					      &root->state))
967 					upper->cowonly = 1;
968 
969 				/*
970 				 * if we know the block isn't shared
971 				 * we can void checking its backrefs.
972 				 */
973 				if (btrfs_block_can_be_shared(root, eb))
974 					upper->checked = 0;
975 				else
976 					upper->checked = 1;
977 
978 				/*
979 				 * add the block to pending list if we
980 				 * need check its backrefs, we only do this once
981 				 * while walking up a tree as we will catch
982 				 * anything else later on.
983 				 */
984 				if (!upper->checked && need_check) {
985 					need_check = false;
986 					list_add_tail(&edge->list[UPPER],
987 						      &list);
988 				} else {
989 					if (upper->checked)
990 						need_check = true;
991 					INIT_LIST_HEAD(&edge->list[UPPER]);
992 				}
993 			} else {
994 				upper = rb_entry(rb_node, struct backref_node,
995 						 rb_node);
996 				ASSERT(upper->checked);
997 				INIT_LIST_HEAD(&edge->list[UPPER]);
998 				if (!upper->owner)
999 					upper->owner = btrfs_header_owner(eb);
1000 			}
1001 			list_add_tail(&edge->list[LOWER], &lower->upper);
1002 			edge->node[LOWER] = lower;
1003 			edge->node[UPPER] = upper;
1004 
1005 			if (rb_node)
1006 				break;
1007 			lower = upper;
1008 			upper = NULL;
1009 		}
1010 		btrfs_release_path(path2);
1011 next:
1012 		if (ptr < end) {
1013 			ptr += btrfs_extent_inline_ref_size(key.type);
1014 			if (ptr >= end) {
1015 				WARN_ON(ptr > end);
1016 				ptr = 0;
1017 				end = 0;
1018 			}
1019 		}
1020 		if (ptr >= end)
1021 			path1->slots[0]++;
1022 	}
1023 	btrfs_release_path(path1);
1024 
1025 	cur->checked = 1;
1026 	WARN_ON(exist);
1027 
1028 	/* the pending list isn't empty, take the first block to process */
1029 	if (!list_empty(&list)) {
1030 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1031 		list_del_init(&edge->list[UPPER]);
1032 		cur = edge->node[UPPER];
1033 		goto again;
1034 	}
1035 
1036 	/*
1037 	 * everything goes well, connect backref nodes and insert backref nodes
1038 	 * into the cache.
1039 	 */
1040 	ASSERT(node->checked);
1041 	cowonly = node->cowonly;
1042 	if (!cowonly) {
1043 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1044 				      &node->rb_node);
1045 		if (rb_node)
1046 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1047 		list_add_tail(&node->lower, &cache->leaves);
1048 	}
1049 
1050 	list_for_each_entry(edge, &node->upper, list[LOWER])
1051 		list_add_tail(&edge->list[UPPER], &list);
1052 
1053 	while (!list_empty(&list)) {
1054 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1055 		list_del_init(&edge->list[UPPER]);
1056 		upper = edge->node[UPPER];
1057 		if (upper->detached) {
1058 			list_del(&edge->list[LOWER]);
1059 			lower = edge->node[LOWER];
1060 			free_backref_edge(cache, edge);
1061 			if (list_empty(&lower->upper))
1062 				list_add(&lower->list, &useless);
1063 			continue;
1064 		}
1065 
1066 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1067 			if (upper->lowest) {
1068 				list_del_init(&upper->lower);
1069 				upper->lowest = 0;
1070 			}
1071 
1072 			list_add_tail(&edge->list[UPPER], &upper->lower);
1073 			continue;
1074 		}
1075 
1076 		if (!upper->checked) {
1077 			/*
1078 			 * Still want to blow up for developers since this is a
1079 			 * logic bug.
1080 			 */
1081 			ASSERT(0);
1082 			err = -EINVAL;
1083 			goto out;
1084 		}
1085 		if (cowonly != upper->cowonly) {
1086 			ASSERT(0);
1087 			err = -EINVAL;
1088 			goto out;
1089 		}
1090 
1091 		if (!cowonly) {
1092 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1093 					      &upper->rb_node);
1094 			if (rb_node)
1095 				backref_tree_panic(rb_node, -EEXIST,
1096 						   upper->bytenr);
1097 		}
1098 
1099 		list_add_tail(&edge->list[UPPER], &upper->lower);
1100 
1101 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1102 			list_add_tail(&edge->list[UPPER], &list);
1103 	}
1104 	/*
1105 	 * process useless backref nodes. backref nodes for tree leaves
1106 	 * are deleted from the cache. backref nodes for upper level
1107 	 * tree blocks are left in the cache to avoid unnecessary backref
1108 	 * lookup.
1109 	 */
1110 	while (!list_empty(&useless)) {
1111 		upper = list_entry(useless.next, struct backref_node, list);
1112 		list_del_init(&upper->list);
1113 		ASSERT(list_empty(&upper->upper));
1114 		if (upper == node)
1115 			node = NULL;
1116 		if (upper->lowest) {
1117 			list_del_init(&upper->lower);
1118 			upper->lowest = 0;
1119 		}
1120 		while (!list_empty(&upper->lower)) {
1121 			edge = list_entry(upper->lower.next,
1122 					  struct backref_edge, list[UPPER]);
1123 			list_del(&edge->list[UPPER]);
1124 			list_del(&edge->list[LOWER]);
1125 			lower = edge->node[LOWER];
1126 			free_backref_edge(cache, edge);
1127 
1128 			if (list_empty(&lower->upper))
1129 				list_add(&lower->list, &useless);
1130 		}
1131 		__mark_block_processed(rc, upper);
1132 		if (upper->level > 0) {
1133 			list_add(&upper->list, &cache->detached);
1134 			upper->detached = 1;
1135 		} else {
1136 			rb_erase(&upper->rb_node, &cache->rb_root);
1137 			free_backref_node(cache, upper);
1138 		}
1139 	}
1140 out:
1141 	btrfs_free_path(path1);
1142 	btrfs_free_path(path2);
1143 	if (err) {
1144 		while (!list_empty(&useless)) {
1145 			lower = list_entry(useless.next,
1146 					   struct backref_node, list);
1147 			list_del_init(&lower->list);
1148 		}
1149 		while (!list_empty(&list)) {
1150 			edge = list_first_entry(&list, struct backref_edge,
1151 						list[UPPER]);
1152 			list_del(&edge->list[UPPER]);
1153 			list_del(&edge->list[LOWER]);
1154 			lower = edge->node[LOWER];
1155 			upper = edge->node[UPPER];
1156 			free_backref_edge(cache, edge);
1157 
1158 			/*
1159 			 * Lower is no longer linked to any upper backref nodes
1160 			 * and isn't in the cache, we can free it ourselves.
1161 			 */
1162 			if (list_empty(&lower->upper) &&
1163 			    RB_EMPTY_NODE(&lower->rb_node))
1164 				list_add(&lower->list, &useless);
1165 
1166 			if (!RB_EMPTY_NODE(&upper->rb_node))
1167 				continue;
1168 
1169 			/* Add this guy's upper edges to the list to proces */
1170 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1171 				list_add_tail(&edge->list[UPPER], &list);
1172 			if (list_empty(&upper->upper))
1173 				list_add(&upper->list, &useless);
1174 		}
1175 
1176 		while (!list_empty(&useless)) {
1177 			lower = list_entry(useless.next,
1178 					   struct backref_node, list);
1179 			list_del_init(&lower->list);
1180 			free_backref_node(cache, lower);
1181 		}
1182 		return ERR_PTR(err);
1183 	}
1184 	ASSERT(!node || !node->detached);
1185 	return node;
1186 }
1187 
1188 /*
1189  * helper to add backref node for the newly created snapshot.
1190  * the backref node is created by cloning backref node that
1191  * corresponds to root of source tree
1192  */
1193 static int clone_backref_node(struct btrfs_trans_handle *trans,
1194 			      struct reloc_control *rc,
1195 			      struct btrfs_root *src,
1196 			      struct btrfs_root *dest)
1197 {
1198 	struct btrfs_root *reloc_root = src->reloc_root;
1199 	struct backref_cache *cache = &rc->backref_cache;
1200 	struct backref_node *node = NULL;
1201 	struct backref_node *new_node;
1202 	struct backref_edge *edge;
1203 	struct backref_edge *new_edge;
1204 	struct rb_node *rb_node;
1205 
1206 	if (cache->last_trans > 0)
1207 		update_backref_cache(trans, cache);
1208 
1209 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1210 	if (rb_node) {
1211 		node = rb_entry(rb_node, struct backref_node, rb_node);
1212 		if (node->detached)
1213 			node = NULL;
1214 		else
1215 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1216 	}
1217 
1218 	if (!node) {
1219 		rb_node = tree_search(&cache->rb_root,
1220 				      reloc_root->commit_root->start);
1221 		if (rb_node) {
1222 			node = rb_entry(rb_node, struct backref_node,
1223 					rb_node);
1224 			BUG_ON(node->detached);
1225 		}
1226 	}
1227 
1228 	if (!node)
1229 		return 0;
1230 
1231 	new_node = alloc_backref_node(cache);
1232 	if (!new_node)
1233 		return -ENOMEM;
1234 
1235 	new_node->bytenr = dest->node->start;
1236 	new_node->level = node->level;
1237 	new_node->lowest = node->lowest;
1238 	new_node->checked = 1;
1239 	new_node->root = dest;
1240 
1241 	if (!node->lowest) {
1242 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1243 			new_edge = alloc_backref_edge(cache);
1244 			if (!new_edge)
1245 				goto fail;
1246 
1247 			new_edge->node[UPPER] = new_node;
1248 			new_edge->node[LOWER] = edge->node[LOWER];
1249 			list_add_tail(&new_edge->list[UPPER],
1250 				      &new_node->lower);
1251 		}
1252 	} else {
1253 		list_add_tail(&new_node->lower, &cache->leaves);
1254 	}
1255 
1256 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1257 			      &new_node->rb_node);
1258 	if (rb_node)
1259 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1260 
1261 	if (!new_node->lowest) {
1262 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1263 			list_add_tail(&new_edge->list[LOWER],
1264 				      &new_edge->node[LOWER]->upper);
1265 		}
1266 	}
1267 	return 0;
1268 fail:
1269 	while (!list_empty(&new_node->lower)) {
1270 		new_edge = list_entry(new_node->lower.next,
1271 				      struct backref_edge, list[UPPER]);
1272 		list_del(&new_edge->list[UPPER]);
1273 		free_backref_edge(cache, new_edge);
1274 	}
1275 	free_backref_node(cache, new_node);
1276 	return -ENOMEM;
1277 }
1278 
1279 /*
1280  * helper to add 'address of tree root -> reloc tree' mapping
1281  */
1282 static int __must_check __add_reloc_root(struct btrfs_root *root)
1283 {
1284 	struct rb_node *rb_node;
1285 	struct mapping_node *node;
1286 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1287 
1288 	node = kmalloc(sizeof(*node), GFP_NOFS);
1289 	if (!node)
1290 		return -ENOMEM;
1291 
1292 	node->bytenr = root->commit_root->start;
1293 	node->data = root;
1294 
1295 	spin_lock(&rc->reloc_root_tree.lock);
1296 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1297 			      node->bytenr, &node->rb_node);
1298 	spin_unlock(&rc->reloc_root_tree.lock);
1299 	if (rb_node) {
1300 		btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1301 			    "for start=%llu while inserting into relocation "
1302 			    "tree", node->bytenr);
1303 		kfree(node);
1304 		return -EEXIST;
1305 	}
1306 
1307 	list_add_tail(&root->root_list, &rc->reloc_roots);
1308 	return 0;
1309 }
1310 
1311 /*
1312  * helper to delete the 'address of tree root -> reloc tree'
1313  * mapping
1314  */
1315 static void __del_reloc_root(struct btrfs_root *root)
1316 {
1317 	struct rb_node *rb_node;
1318 	struct mapping_node *node = NULL;
1319 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1320 
1321 	if (rc && root->node) {
1322 		spin_lock(&rc->reloc_root_tree.lock);
1323 		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1324 				      root->commit_root->start);
1325 		if (rb_node) {
1326 			node = rb_entry(rb_node, struct mapping_node, rb_node);
1327 			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1328 			RB_CLEAR_NODE(&node->rb_node);
1329 		}
1330 		spin_unlock(&rc->reloc_root_tree.lock);
1331 		ASSERT(!node || (struct btrfs_root *)node->data == root);
1332 	}
1333 
1334 	spin_lock(&root->fs_info->trans_lock);
1335 	list_del_init(&root->root_list);
1336 	spin_unlock(&root->fs_info->trans_lock);
1337 	kfree(node);
1338 }
1339 
1340 /*
1341  * helper to update the 'address of tree root -> reloc tree'
1342  * mapping
1343  */
1344 static int __update_reloc_root(struct btrfs_root *root)
1345 {
1346 	struct rb_node *rb_node;
1347 	struct mapping_node *node = NULL;
1348 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1349 
1350 	spin_lock(&rc->reloc_root_tree.lock);
1351 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1352 			      root->commit_root->start);
1353 	if (rb_node) {
1354 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1355 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1356 	}
1357 	spin_unlock(&rc->reloc_root_tree.lock);
1358 
1359 	if (!node)
1360 		return 0;
1361 	BUG_ON((struct btrfs_root *)node->data != root);
1362 
1363 	spin_lock(&rc->reloc_root_tree.lock);
1364 	node->bytenr = root->node->start;
1365 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1366 			      node->bytenr, &node->rb_node);
1367 	spin_unlock(&rc->reloc_root_tree.lock);
1368 	if (rb_node)
1369 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1370 	return 0;
1371 }
1372 
/*
 * create the btree of a reloc tree and read the new root back.
 *
 * when @objectid equals the objectid of @root, the reloc tree is
 * created from @root's commit root (the btrfs_init_reloc_root path);
 * otherwise @root is itself a reloc tree and its current root node
 * is copied (the btrfs_reloc_post_snapshot_hook path).
 *
 * failures during the copy/insert steps are treated as fatal
 * (BUG_ON) since they happen inside a running transaction.
 */
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	u64 last_snap = 0;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);

		/* remember the old value; it is stashed in rtransid below */
		last_snap = btrfs_root_last_snapshot(&root->root_item);
		btrfs_set_root_last_snapshot(&root->root_item,
					     trans->transid - 1);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	/* the reloc tree starts out as a copy of the source root item */
	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
		/*
		 * abuse rtransid, it is safe because it is impossible to
		 * receive data into a relocation tree.
		 */
		btrfs_set_root_rtransid(root_item, last_snap);
		btrfs_set_root_otransid(root_item, trans->transid);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	/* persist the new root item in the tree of tree roots */
	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}
1443 
1444 /*
1445  * create reloc tree for a given fs tree. reloc tree is just a
1446  * snapshot of the fs tree with special root objectid.
1447  */
1448 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1449 			  struct btrfs_root *root)
1450 {
1451 	struct btrfs_root *reloc_root;
1452 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1453 	struct btrfs_block_rsv *rsv;
1454 	int clear_rsv = 0;
1455 	int ret;
1456 
1457 	if (root->reloc_root) {
1458 		reloc_root = root->reloc_root;
1459 		reloc_root->last_trans = trans->transid;
1460 		return 0;
1461 	}
1462 
1463 	if (!rc || !rc->create_reloc_tree ||
1464 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1465 		return 0;
1466 
1467 	if (!trans->reloc_reserved) {
1468 		rsv = trans->block_rsv;
1469 		trans->block_rsv = rc->block_rsv;
1470 		clear_rsv = 1;
1471 	}
1472 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1473 	if (clear_rsv)
1474 		trans->block_rsv = rsv;
1475 
1476 	ret = __add_reloc_root(reloc_root);
1477 	BUG_ON(ret < 0);
1478 	root->reloc_root = reloc_root;
1479 	return 0;
1480 }
1481 
1482 /*
1483  * update root item of reloc tree
1484  */
1485 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1486 			    struct btrfs_root *root)
1487 {
1488 	struct btrfs_root *reloc_root;
1489 	struct btrfs_root_item *root_item;
1490 	int ret;
1491 
1492 	if (!root->reloc_root)
1493 		goto out;
1494 
1495 	reloc_root = root->reloc_root;
1496 	root_item = &reloc_root->root_item;
1497 
1498 	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1499 	    btrfs_root_refs(root_item) == 0) {
1500 		root->reloc_root = NULL;
1501 		__del_reloc_root(reloc_root);
1502 	}
1503 
1504 	if (reloc_root->commit_root != reloc_root->node) {
1505 		__update_reloc_root(reloc_root);
1506 		btrfs_set_root_node(root_item, reloc_root->node);
1507 		free_extent_buffer(reloc_root->commit_root);
1508 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1509 	}
1510 
1511 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
1512 				&reloc_root->root_key, root_item);
1513 	BUG_ON(ret);
1514 
1515 out:
1516 	return 0;
1517 }
1518 
1519 /*
1520  * helper to find first cached inode with inode number >= objectid
1521  * in a subvolume
1522  */
1523 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1524 {
1525 	struct rb_node *node;
1526 	struct rb_node *prev;
1527 	struct btrfs_inode *entry;
1528 	struct inode *inode;
1529 
1530 	spin_lock(&root->inode_lock);
1531 again:
1532 	node = root->inode_tree.rb_node;
1533 	prev = NULL;
1534 	while (node) {
1535 		prev = node;
1536 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1537 
1538 		if (objectid < btrfs_ino(&entry->vfs_inode))
1539 			node = node->rb_left;
1540 		else if (objectid > btrfs_ino(&entry->vfs_inode))
1541 			node = node->rb_right;
1542 		else
1543 			break;
1544 	}
1545 	if (!node) {
1546 		while (prev) {
1547 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1548 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1549 				node = prev;
1550 				break;
1551 			}
1552 			prev = rb_next(prev);
1553 		}
1554 	}
1555 	while (node) {
1556 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1557 		inode = igrab(&entry->vfs_inode);
1558 		if (inode) {
1559 			spin_unlock(&root->inode_lock);
1560 			return inode;
1561 		}
1562 
1563 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
1564 		if (cond_resched_lock(&root->inode_lock))
1565 			goto again;
1566 
1567 		node = rb_next(node);
1568 	}
1569 	spin_unlock(&root->inode_lock);
1570 	return NULL;
1571 }
1572 
1573 static int in_block_group(u64 bytenr,
1574 			  struct btrfs_block_group_cache *block_group)
1575 {
1576 	if (bytenr >= block_group->key.objectid &&
1577 	    bytenr < block_group->key.objectid + block_group->key.offset)
1578 		return 1;
1579 	return 0;
1580 }
1581 
1582 /*
1583  * get new location of data
1584  */
1585 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1586 			    u64 bytenr, u64 num_bytes)
1587 {
1588 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1589 	struct btrfs_path *path;
1590 	struct btrfs_file_extent_item *fi;
1591 	struct extent_buffer *leaf;
1592 	int ret;
1593 
1594 	path = btrfs_alloc_path();
1595 	if (!path)
1596 		return -ENOMEM;
1597 
1598 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1599 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1600 				       bytenr, 0);
1601 	if (ret < 0)
1602 		goto out;
1603 	if (ret > 0) {
1604 		ret = -ENOENT;
1605 		goto out;
1606 	}
1607 
1608 	leaf = path->nodes[0];
1609 	fi = btrfs_item_ptr(leaf, path->slots[0],
1610 			    struct btrfs_file_extent_item);
1611 
1612 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1613 	       btrfs_file_extent_compression(leaf, fi) ||
1614 	       btrfs_file_extent_encryption(leaf, fi) ||
1615 	       btrfs_file_extent_other_encoding(leaf, fi));
1616 
1617 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1618 		ret = -EINVAL;
1619 		goto out;
1620 	}
1621 
1622 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1623 	ret = 0;
1624 out:
1625 	btrfs_free_path(path);
1626 	return ret;
1627 }
1628 
1629 /*
1630  * update file extent items in the tree leaf to point to
1631  * the new locations.
1632  */
1633 static noinline_for_stack
1634 int replace_file_extents(struct btrfs_trans_handle *trans,
1635 			 struct reloc_control *rc,
1636 			 struct btrfs_root *root,
1637 			 struct extent_buffer *leaf)
1638 {
1639 	struct btrfs_key key;
1640 	struct btrfs_file_extent_item *fi;
1641 	struct inode *inode = NULL;
1642 	u64 parent;
1643 	u64 bytenr;
1644 	u64 new_bytenr = 0;
1645 	u64 num_bytes;
1646 	u64 end;
1647 	u32 nritems;
1648 	u32 i;
1649 	int ret = 0;
1650 	int first = 1;
1651 	int dirty = 0;
1652 
1653 	if (rc->stage != UPDATE_DATA_PTRS)
1654 		return 0;
1655 
1656 	/* reloc trees always use full backref */
1657 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1658 		parent = leaf->start;
1659 	else
1660 		parent = 0;
1661 
1662 	nritems = btrfs_header_nritems(leaf);
1663 	for (i = 0; i < nritems; i++) {
1664 		cond_resched();
1665 		btrfs_item_key_to_cpu(leaf, &key, i);
1666 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1667 			continue;
1668 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1669 		if (btrfs_file_extent_type(leaf, fi) ==
1670 		    BTRFS_FILE_EXTENT_INLINE)
1671 			continue;
1672 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1673 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1674 		if (bytenr == 0)
1675 			continue;
1676 		if (!in_block_group(bytenr, rc->block_group))
1677 			continue;
1678 
1679 		/*
1680 		 * if we are modifying block in fs tree, wait for readpage
1681 		 * to complete and drop the extent cache
1682 		 */
1683 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1684 			if (first) {
1685 				inode = find_next_inode(root, key.objectid);
1686 				first = 0;
1687 			} else if (inode && btrfs_ino(inode) < key.objectid) {
1688 				btrfs_add_delayed_iput(inode);
1689 				inode = find_next_inode(root, key.objectid);
1690 			}
1691 			if (inode && btrfs_ino(inode) == key.objectid) {
1692 				end = key.offset +
1693 				      btrfs_file_extent_num_bytes(leaf, fi);
1694 				WARN_ON(!IS_ALIGNED(key.offset,
1695 						    root->sectorsize));
1696 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1697 				end--;
1698 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1699 						      key.offset, end);
1700 				if (!ret)
1701 					continue;
1702 
1703 				btrfs_drop_extent_cache(inode, key.offset, end,
1704 							1);
1705 				unlock_extent(&BTRFS_I(inode)->io_tree,
1706 					      key.offset, end);
1707 			}
1708 		}
1709 
1710 		ret = get_new_location(rc->data_inode, &new_bytenr,
1711 				       bytenr, num_bytes);
1712 		if (ret) {
1713 			/*
1714 			 * Don't have to abort since we've not changed anything
1715 			 * in the file extent yet.
1716 			 */
1717 			break;
1718 		}
1719 
1720 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1721 		dirty = 1;
1722 
1723 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1724 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1725 					   num_bytes, parent,
1726 					   btrfs_header_owner(leaf),
1727 					   key.objectid, key.offset);
1728 		if (ret) {
1729 			btrfs_abort_transaction(trans, root, ret);
1730 			break;
1731 		}
1732 
1733 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1734 					parent, btrfs_header_owner(leaf),
1735 					key.objectid, key.offset);
1736 		if (ret) {
1737 			btrfs_abort_transaction(trans, root, ret);
1738 			break;
1739 		}
1740 	}
1741 	if (dirty)
1742 		btrfs_mark_buffer_dirty(leaf);
1743 	if (inode)
1744 		btrfs_add_delayed_iput(inode);
1745 	return ret;
1746 }
1747 
1748 static noinline_for_stack
1749 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1750 		     struct btrfs_path *path, int level)
1751 {
1752 	struct btrfs_disk_key key1;
1753 	struct btrfs_disk_key key2;
1754 	btrfs_node_key(eb, &key1, slot);
1755 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1756 	return memcmp(&key1, &key2, sizeof(key1));
1757 }
1758 
1759 /*
1760  * try to replace tree blocks in fs tree with the new blocks
1761  * in reloc tree. tree blocks haven't been modified since the
1762  * reloc tree was create can be replaced.
1763  *
1764  * if a block was replaced, level of the block + 1 is returned.
1765  * if no block got replaced, 0 is returned. if there are other
1766  * errors, a negative error number is returned.
1767  */
1768 static noinline_for_stack
1769 int replace_path(struct btrfs_trans_handle *trans,
1770 		 struct btrfs_root *dest, struct btrfs_root *src,
1771 		 struct btrfs_path *path, struct btrfs_key *next_key,
1772 		 int lowest_level, int max_level)
1773 {
1774 	struct extent_buffer *eb;
1775 	struct extent_buffer *parent;
1776 	struct btrfs_key key;
1777 	u64 old_bytenr;
1778 	u64 new_bytenr;
1779 	u64 old_ptr_gen;
1780 	u64 new_ptr_gen;
1781 	u64 last_snapshot;
1782 	u32 blocksize;
1783 	int cow = 0;
1784 	int level;
1785 	int ret;
1786 	int slot;
1787 
1788 	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1789 	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1790 
1791 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1792 again:
1793 	slot = path->slots[lowest_level];
1794 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1795 
1796 	eb = btrfs_lock_root_node(dest);
1797 	btrfs_set_lock_blocking(eb);
1798 	level = btrfs_header_level(eb);
1799 
1800 	if (level < lowest_level) {
1801 		btrfs_tree_unlock(eb);
1802 		free_extent_buffer(eb);
1803 		return 0;
1804 	}
1805 
1806 	if (cow) {
1807 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1808 		BUG_ON(ret);
1809 	}
1810 	btrfs_set_lock_blocking(eb);
1811 
1812 	if (next_key) {
1813 		next_key->objectid = (u64)-1;
1814 		next_key->type = (u8)-1;
1815 		next_key->offset = (u64)-1;
1816 	}
1817 
1818 	parent = eb;
1819 	while (1) {
1820 		level = btrfs_header_level(parent);
1821 		ASSERT(level >= lowest_level);
1822 
1823 		ret = btrfs_bin_search(parent, &key, level, &slot);
1824 		if (ret && slot > 0)
1825 			slot--;
1826 
1827 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1828 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1829 
1830 		old_bytenr = btrfs_node_blockptr(parent, slot);
1831 		blocksize = dest->nodesize;
1832 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1833 
1834 		if (level <= max_level) {
1835 			eb = path->nodes[level];
1836 			new_bytenr = btrfs_node_blockptr(eb,
1837 							path->slots[level]);
1838 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1839 							path->slots[level]);
1840 		} else {
1841 			new_bytenr = 0;
1842 			new_ptr_gen = 0;
1843 		}
1844 
1845 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1846 			ret = level;
1847 			break;
1848 		}
1849 
1850 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1851 		    memcmp_node_keys(parent, slot, path, level)) {
1852 			if (level <= lowest_level) {
1853 				ret = 0;
1854 				break;
1855 			}
1856 
1857 			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1858 			if (IS_ERR(eb)) {
1859 				ret = PTR_ERR(eb);
1860 			} else if (!extent_buffer_uptodate(eb)) {
1861 				ret = -EIO;
1862 				free_extent_buffer(eb);
1863 				break;
1864 			}
1865 			btrfs_tree_lock(eb);
1866 			if (cow) {
1867 				ret = btrfs_cow_block(trans, dest, eb, parent,
1868 						      slot, &eb);
1869 				BUG_ON(ret);
1870 			}
1871 			btrfs_set_lock_blocking(eb);
1872 
1873 			btrfs_tree_unlock(parent);
1874 			free_extent_buffer(parent);
1875 
1876 			parent = eb;
1877 			continue;
1878 		}
1879 
1880 		if (!cow) {
1881 			btrfs_tree_unlock(parent);
1882 			free_extent_buffer(parent);
1883 			cow = 1;
1884 			goto again;
1885 		}
1886 
1887 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1888 				      path->slots[level]);
1889 		btrfs_release_path(path);
1890 
1891 		path->lowest_level = level;
1892 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1893 		path->lowest_level = 0;
1894 		BUG_ON(ret);
1895 
1896 		/*
1897 		 * swap blocks in fs tree and reloc tree.
1898 		 */
1899 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1900 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1901 		btrfs_mark_buffer_dirty(parent);
1902 
1903 		btrfs_set_node_blockptr(path->nodes[level],
1904 					path->slots[level], old_bytenr);
1905 		btrfs_set_node_ptr_generation(path->nodes[level],
1906 					      path->slots[level], old_ptr_gen);
1907 		btrfs_mark_buffer_dirty(path->nodes[level]);
1908 
1909 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1910 					path->nodes[level]->start,
1911 					src->root_key.objectid, level - 1, 0);
1912 		BUG_ON(ret);
1913 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1914 					0, dest->root_key.objectid, level - 1,
1915 					0);
1916 		BUG_ON(ret);
1917 
1918 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1919 					path->nodes[level]->start,
1920 					src->root_key.objectid, level - 1, 0);
1921 		BUG_ON(ret);
1922 
1923 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1924 					0, dest->root_key.objectid, level - 1,
1925 					0);
1926 		BUG_ON(ret);
1927 
1928 		btrfs_unlock_up_safe(path, 0);
1929 
1930 		ret = level;
1931 		break;
1932 	}
1933 	btrfs_tree_unlock(parent);
1934 	free_extent_buffer(parent);
1935 	return ret;
1936 }
1937 
1938 /*
1939  * helper to find next relocated block in reloc tree
1940  */
1941 static noinline_for_stack
1942 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1943 		       int *level)
1944 {
1945 	struct extent_buffer *eb;
1946 	int i;
1947 	u64 last_snapshot;
1948 	u32 nritems;
1949 
1950 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1951 
1952 	for (i = 0; i < *level; i++) {
1953 		free_extent_buffer(path->nodes[i]);
1954 		path->nodes[i] = NULL;
1955 	}
1956 
1957 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1958 		eb = path->nodes[i];
1959 		nritems = btrfs_header_nritems(eb);
1960 		while (path->slots[i] + 1 < nritems) {
1961 			path->slots[i]++;
1962 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1963 			    last_snapshot)
1964 				continue;
1965 
1966 			*level = i;
1967 			return 0;
1968 		}
1969 		free_extent_buffer(path->nodes[i]);
1970 		path->nodes[i] = NULL;
1971 	}
1972 	return 1;
1973 }
1974 
1975 /*
1976  * walk down reloc tree to find relocated block of lowest level
1977  */
1978 static noinline_for_stack
1979 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1980 			 int *level)
1981 {
1982 	struct extent_buffer *eb = NULL;
1983 	int i;
1984 	u64 bytenr;
1985 	u64 ptr_gen = 0;
1986 	u64 last_snapshot;
1987 	u32 nritems;
1988 
1989 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1990 
1991 	for (i = *level; i > 0; i--) {
1992 		eb = path->nodes[i];
1993 		nritems = btrfs_header_nritems(eb);
1994 		while (path->slots[i] < nritems) {
1995 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1996 			if (ptr_gen > last_snapshot)
1997 				break;
1998 			path->slots[i]++;
1999 		}
2000 		if (path->slots[i] >= nritems) {
2001 			if (i == *level)
2002 				break;
2003 			*level = i + 1;
2004 			return 0;
2005 		}
2006 		if (i == 1) {
2007 			*level = i;
2008 			return 0;
2009 		}
2010 
2011 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2012 		eb = read_tree_block(root, bytenr, ptr_gen);
2013 		if (IS_ERR(eb)) {
2014 			return PTR_ERR(eb);
2015 		} else if (!extent_buffer_uptodate(eb)) {
2016 			free_extent_buffer(eb);
2017 			return -EIO;
2018 		}
2019 		BUG_ON(btrfs_header_level(eb) != i - 1);
2020 		path->nodes[i - 1] = eb;
2021 		path->slots[i - 1] = 0;
2022 	}
2023 	return 1;
2024 }
2025 
2026 /*
2027  * invalidate extent cache for file extents whose key in range of
2028  * [min_key, max_key)
2029  */
2030 static int invalidate_extent_cache(struct btrfs_root *root,
2031 				   struct btrfs_key *min_key,
2032 				   struct btrfs_key *max_key)
2033 {
2034 	struct inode *inode = NULL;
2035 	u64 objectid;
2036 	u64 start, end;
2037 	u64 ino;
2038 
2039 	objectid = min_key->objectid;
2040 	while (1) {
2041 		cond_resched();
2042 		iput(inode);
2043 
2044 		if (objectid > max_key->objectid)
2045 			break;
2046 
2047 		inode = find_next_inode(root, objectid);
2048 		if (!inode)
2049 			break;
2050 		ino = btrfs_ino(inode);
2051 
2052 		if (ino > max_key->objectid) {
2053 			iput(inode);
2054 			break;
2055 		}
2056 
2057 		objectid = ino + 1;
2058 		if (!S_ISREG(inode->i_mode))
2059 			continue;
2060 
2061 		if (unlikely(min_key->objectid == ino)) {
2062 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2063 				continue;
2064 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2065 				start = 0;
2066 			else {
2067 				start = min_key->offset;
2068 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
2069 			}
2070 		} else {
2071 			start = 0;
2072 		}
2073 
2074 		if (unlikely(max_key->objectid == ino)) {
2075 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2076 				continue;
2077 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2078 				end = (u64)-1;
2079 			} else {
2080 				if (max_key->offset == 0)
2081 					continue;
2082 				end = max_key->offset;
2083 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2084 				end--;
2085 			}
2086 		} else {
2087 			end = (u64)-1;
2088 		}
2089 
2090 		/* the lock_extent waits for readpage to complete */
2091 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2092 		btrfs_drop_extent_cache(inode, start, end, 1);
2093 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2094 	}
2095 	return 0;
2096 }
2097 
2098 static int find_next_key(struct btrfs_path *path, int level,
2099 			 struct btrfs_key *key)
2100 
2101 {
2102 	while (level < BTRFS_MAX_LEVEL) {
2103 		if (!path->nodes[level])
2104 			break;
2105 		if (path->slots[level] + 1 <
2106 		    btrfs_header_nritems(path->nodes[level])) {
2107 			btrfs_node_key_to_cpu(path->nodes[level], key,
2108 					      path->slots[level] + 1);
2109 			return 0;
2110 		}
2111 		level++;
2112 	}
2113 	return 1;
2114 }
2115 
/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 *
 * Walks the reloc tree and grafts relocated subtrees back into the fs
 * tree via replace_path().  Progress is checkpointed in the reloc root's
 * drop_progress/drop_level so an interrupted merge can be resumed.
 * Returns 0 on success, negative errno on failure.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	LIST_HEAD(inode_list);
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int level;
	int max_level;
	int replaced = 0;
	int ret;
	int err = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 1;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* fresh merge: start from the reloc tree root */
		level = btrfs_root_level(root_item);
		extent_buffer_get(reloc_root->node);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		/* resume an interrupted merge from the saved progress key */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/* enough space to COW one full path in both trees */
	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
					     BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			goto out;
		}
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0)
			break;	/* reloc tree fully walked */

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			/* current subtree already covered by last replace */
			ret = 0;
		} else {
			ret = replace_path(trans, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0) {
			err = ret;
			goto out;
		}

		if (ret > 0) {
			/* replace_path stopped at @ret; continue from there */
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		root_item->drop_level = level;

		btrfs_end_transaction_throttle(trans, root);
		trans = NULL;

		btrfs_btree_balance_dirty(root);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case only one block in the fs tree need to be
	 * relocated and the block is tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
	if (ret < 0)
		err = ret;
out:
	btrfs_free_path(path);

	if (err == 0) {
		/* merge done: clear progress and drop our reference */
		memset(&root_item->drop_progress, 0,
		       sizeof(root_item->drop_progress));
		root_item->drop_level = 0;
		btrfs_set_root_refs(root_item, 0);
		btrfs_update_reloc_root(trans, root);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans, root);

	btrfs_btree_balance_dirty(root);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return err;
}
2272 
/*
 * Prepare all reloc trees for merging: reserve metadata space for the
 * merge, then set refs == 1 on each reloc root so that
 * btrfs_recover_relocation() knows merging must be resumed after a
 * crash.  @err carries an error from the caller; if nonzero the space
 * reservation is skipped and only the bookkeeping is done.
 * Returns @err or a new negative errno.
 */
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&root->fs_info->reloc_mutex);
	/* one full path COW in both trees, plus the relocated nodes */
	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&root->fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(rc->extent_root,
						rc->block_rsv, num_bytes);
		return PTR_ERR(trans);
	}

	if (!err) {
		/* rsv size grew while we reserved: release and retry */
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans, rc->extent_root);
			btrfs_block_rsv_release(rc->extent_root,
						rc->block_rsv, num_bytes);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = read_fs_root(reloc_root->fs_info,
				    reloc_root->root_key.offset);
		BUG_ON(IS_ERR(root));
		BUG_ON(root->reloc_root != reloc_root);

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resumes merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		btrfs_update_reloc_root(trans, root);

		list_add(&reloc_root->root_list, &reloc_roots);
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	/* commit so the refs==1 state is durable before merging starts */
	if (!err)
		btrfs_commit_transaction(trans, rc->extent_root);
	else
		btrfs_end_transaction(trans, rc->extent_root);
	return err;
}
2345 
2346 static noinline_for_stack
2347 void free_reloc_roots(struct list_head *list)
2348 {
2349 	struct btrfs_root *reloc_root;
2350 
2351 	while (!list_empty(list)) {
2352 		reloc_root = list_entry(list->next, struct btrfs_root,
2353 					root_list);
2354 		__del_reloc_root(reloc_root);
2355 		free_extent_buffer(reloc_root->node);
2356 		free_extent_buffer(reloc_root->commit_root);
2357 		reloc_root->node = NULL;
2358 		reloc_root->commit_root = NULL;
2359 	}
2360 }
2361 
2362 static noinline_for_stack
2363 void merge_reloc_roots(struct reloc_control *rc)
2364 {
2365 	struct btrfs_root *root;
2366 	struct btrfs_root *reloc_root;
2367 	u64 last_snap;
2368 	u64 otransid;
2369 	u64 objectid;
2370 	LIST_HEAD(reloc_roots);
2371 	int found = 0;
2372 	int ret = 0;
2373 again:
2374 	root = rc->extent_root;
2375 
2376 	/*
2377 	 * this serializes us with btrfs_record_root_in_transaction,
2378 	 * we have to make sure nobody is in the middle of
2379 	 * adding their roots to the list while we are
2380 	 * doing this splice
2381 	 */
2382 	mutex_lock(&root->fs_info->reloc_mutex);
2383 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2384 	mutex_unlock(&root->fs_info->reloc_mutex);
2385 
2386 	while (!list_empty(&reloc_roots)) {
2387 		found = 1;
2388 		reloc_root = list_entry(reloc_roots.next,
2389 					struct btrfs_root, root_list);
2390 
2391 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2392 			root = read_fs_root(reloc_root->fs_info,
2393 					    reloc_root->root_key.offset);
2394 			BUG_ON(IS_ERR(root));
2395 			BUG_ON(root->reloc_root != reloc_root);
2396 
2397 			ret = merge_reloc_root(rc, root);
2398 			if (ret) {
2399 				if (list_empty(&reloc_root->root_list))
2400 					list_add_tail(&reloc_root->root_list,
2401 						      &reloc_roots);
2402 				goto out;
2403 			}
2404 		} else {
2405 			list_del_init(&reloc_root->root_list);
2406 		}
2407 
2408 		/*
2409 		 * we keep the old last snapshod transid in rtranid when we
2410 		 * created the relocation tree.
2411 		 */
2412 		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2413 		otransid = btrfs_root_otransid(&reloc_root->root_item);
2414 		objectid = reloc_root->root_key.offset;
2415 
2416 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2417 		if (ret < 0) {
2418 			if (list_empty(&reloc_root->root_list))
2419 				list_add_tail(&reloc_root->root_list,
2420 					      &reloc_roots);
2421 			goto out;
2422 		}
2423 	}
2424 
2425 	if (found) {
2426 		found = 0;
2427 		goto again;
2428 	}
2429 out:
2430 	if (ret) {
2431 		btrfs_std_error(root->fs_info, ret, NULL);
2432 		if (!list_empty(&reloc_roots))
2433 			free_reloc_roots(&reloc_roots);
2434 
2435 		/* new reloc root may be added */
2436 		mutex_lock(&root->fs_info->reloc_mutex);
2437 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2438 		mutex_unlock(&root->fs_info->reloc_mutex);
2439 		if (!list_empty(&reloc_roots))
2440 			free_reloc_roots(&reloc_roots);
2441 	}
2442 
2443 	/*
2444 	 * We used to have
2445 	 *
2446 	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2447 	 *
2448 	 * here, but it's wrong.  If we fail to start the transaction in
2449 	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2450 	 * have actually been removed from the reloc_root_tree rb tree.  This is
2451 	 * fine because we're bailing here, and we hold a reference on the root
2452 	 * for the list that holds it, so these roots will be cleaned up when we
2453 	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
2454 	 * will be cleaned up on unmount.
2455 	 *
2456 	 * The remaining nodes will be cleaned up by free_reloc_control.
2457 	 */
2458 }
2459 
2460 static void free_block_list(struct rb_root *blocks)
2461 {
2462 	struct tree_block *block;
2463 	struct rb_node *rb_node;
2464 	while ((rb_node = rb_first(blocks))) {
2465 		block = rb_entry(rb_node, struct tree_block, rb_node);
2466 		rb_erase(rb_node, blocks);
2467 		kfree(block);
2468 	}
2469 }
2470 
2471 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2472 				      struct btrfs_root *reloc_root)
2473 {
2474 	struct btrfs_root *root;
2475 
2476 	if (reloc_root->last_trans == trans->transid)
2477 		return 0;
2478 
2479 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2480 	BUG_ON(IS_ERR(root));
2481 	BUG_ON(root->reloc_root != reloc_root);
2482 
2483 	return btrfs_record_root_in_trans(trans, root);
2484 }
2485 
/*
 * Walk up the backref tree from @node and pick the reloc root to COW
 * through.  Also records the chosen path in rc->backref_cache.path so
 * btrfs_reloc_cow_block() can find the backref nodes later.
 * Returns the reloc root, or NULL if no usable root was found.
 */
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct backref_node *node,
				     struct backref_edge *edges[])
{
	struct backref_node *next;
	struct btrfs_root *root;
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;
		BUG_ON(!root);
		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));

		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
			/* already pointing at a reloc tree; just record it */
			record_reloc_root_in_trans(trans, root);
			break;
		}

		btrfs_record_root_in_trans(trans, root);
		root = root->reloc_root;

		if (next->new_bytenr != root->node->start) {
			/*
			 * first time we reach this reloc root: remember its
			 * new location and mark the block processed.
			 */
			BUG_ON(next->new_bytenr);
			BUG_ON(!list_empty(&next->list));
			next->new_bytenr = root->node->start;
			next->root = root;
			list_add_tail(&next->list,
				      &rc->backref_cache.changed);
			__mark_block_processed(rc, next);
			break;
		}

		WARN_ON(1);
		root = NULL;
		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}
	if (!root)
		return NULL;

	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}
2542 
/*
 * select a tree root for relocation. return NULL if the block
 * is reference counted. we should use do_relocation() in this
 * case. return a tree root pointer if the block isn't reference
 * counted. return -ENOENT if the block is root of reloc tree.
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct backref_node *node)
{
	struct backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;
		BUG_ON(!root);

		/* no other choice for non-references counted tree */
		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
			return root;

		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		/* @node is referenced by a ref-counted tree above it */
		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	/* only reloc trees reference @node */
	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}
2584 
/*
 * Calculate how many bytes of metadata are needed to relocate @node and
 * all unprocessed blocks above it in the backref cache.  When @reserve
 * is set, processed blocks terminate the walk (they are already paid
 * for); otherwise @node itself is always counted.
 */
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
			struct backref_node *node, int reserve)
{
	struct backref_node *next = node;
	struct backref_edge *edge;
	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(reserve && node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed && (reserve || next != node))
				break;

			/* one tree block per backref node on the path */
			num_bytes += rc->extent_root->nodesize;

			if (list_empty(&next->upper))
				break;

			/* follow the first upper edge; siblings are handled
			 * by walk_down_backref below */
			edge = list_entry(next->upper.next,
					  struct backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}
2617 
/*
 * Reserve metadata space for relocating @node (and the blocks above it).
 * On -EAGAIN, grow rc->block_rsv->size so the caller's retry (after a
 * transaction commit) has enough headroom.
 * Returns 0 on success, negative errno on failure.
 */
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct backref_node *node)
{
	struct btrfs_root *root = rc->extent_root;
	u64 num_bytes;
	int ret;
	u64 tmp;

	/* x2: blocks are COWed in both the reloc tree and the fs tree */
	num_bytes = calcu_metadata_size(rc, node, 1) * 2;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;
	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
				BTRFS_RESERVE_FLUSH_ALL);
	if (ret) {
		if (ret == -EAGAIN) {
			tmp = rc->extent_root->nodesize *
				RELOCATION_RESERVED_NODES;
			/* at least double until it covers reserved_bytes */
			while (tmp <= rc->reserved_bytes)
				tmp <<= 1;
			/*
			 * only one thread can access block_rsv at this point,
			 * so we don't need hold lock to protect block_rsv.
			 * we expand more reservation size here to allow enough
			 * space for relocation and we will return earlier in
			 * enospc case.
			 */
			rc->block_rsv->size = tmp + rc->extent_root->nodesize *
					      RELOCATION_RESERVED_NODES;
		}
		return ret;
	}

	return 0;
}
2654 
/*
 * relocate a block tree, and then update pointers in upper level
 * blocks that reference the block to point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
 *
 * @lowest: nonzero when @node is the lowest block being relocated
 *          (it has no cached eb yet); zero for pointer-update-only.
 * Returns 0 on success, negative errno on failure (never -ENOSPC).
 */
static int do_relocation(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct backref_node *node,
			 struct btrfs_key *key,
			 struct btrfs_path *path, int lowest)
{
	struct backref_node *upper;
	struct backref_edge *edge;
	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct btrfs_root *root;
	struct extent_buffer *eb;
	u32 blocksize;
	u64 bytenr;
	u64 generation;
	int slot;
	int ret;
	int err = 0;

	BUG_ON(lowest && node->eb);

	path->lowest_level = node->level + 1;
	rc->backref_cache.path[node->level] = node;
	/* visit every upper block that references @node */
	list_for_each_entry(edge, &node->upper, list[LOWER]) {
		cond_resched();

		upper = edge->node[UPPER];
		root = select_reloc_root(trans, rc, upper, edges);
		BUG_ON(!root);

		if (upper->eb && !upper->locked) {
			if (!lowest) {
				ret = btrfs_bin_search(upper->eb, key,
						       upper->level, &slot);
				BUG_ON(ret);
				bytenr = btrfs_node_blockptr(upper->eb, slot);
				/* pointer already updated by an earlier COW */
				if (node->eb->start == bytenr)
					goto next;
			}
			/* stale unlocked buffer; re-search below */
			drop_node_buffer(upper);
		}

		if (!upper->eb) {
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			if (ret) {
				if (ret < 0)
					err = ret;
				else
					err = -ENOENT;

				btrfs_release_path(path);
				break;
			}

			if (!upper->eb) {
				/* steal the locked buffer from the path */
				upper->eb = path->nodes[upper->level];
				path->nodes[upper->level] = NULL;
			} else {
				BUG_ON(upper->eb != path->nodes[upper->level]);
			}

			/* keep the lock; ownership moved to upper->eb */
			upper->locked = 1;
			path->locks[upper->level] = 0;

			slot = path->slots[upper->level];
			btrfs_release_path(path);
		} else {
			ret = btrfs_bin_search(upper->eb, key, upper->level,
					       &slot);
			BUG_ON(ret);
		}

		bytenr = btrfs_node_blockptr(upper->eb, slot);
		if (lowest) {
			BUG_ON(bytenr != node->bytenr);
		} else {
			if (node->eb->start == bytenr)
				goto next;
		}

		blocksize = root->nodesize;
		generation = btrfs_node_ptr_generation(upper->eb, slot);
		eb = read_tree_block(root, bytenr, generation);
		if (IS_ERR(eb)) {
			err = PTR_ERR(eb);
			goto next;
		} else if (!extent_buffer_uptodate(eb)) {
			free_extent_buffer(eb);
			err = -EIO;
			goto next;
		}
		btrfs_tree_lock(eb);
		btrfs_set_lock_blocking(eb);

		if (!node->eb) {
			/* relocate the block itself by COWing it */
			ret = btrfs_cow_block(trans, root, eb, upper->eb,
					      slot, &eb);
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			if (ret < 0) {
				err = ret;
				goto next;
			}
			/* btrfs_reloc_cow_block() filled in node->eb */
			BUG_ON(node->eb != eb);
		} else {
			/* block already relocated: just repoint upper */
			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(upper->eb);

			ret = btrfs_inc_extent_ref(trans, root,
						node->eb->start, blocksize,
						upper->eb->start,
						btrfs_header_owner(upper->eb),
						node->level, 0);
			BUG_ON(ret);

			/* drop the refs held via the old pointer */
			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
			BUG_ON(ret);
		}
next:
		if (!upper->pending)
			drop_node_buffer(upper);
		else
			unlock_node_buffer(upper);
		if (err)
			break;
	}

	if (!err && node->pending) {
		drop_node_buffer(node);
		list_move_tail(&node->list, &rc->backref_cache.changed);
		node->pending = 0;
	}

	path->lowest_level = 0;
	/* -ENOSPC must have been handled by reservation, never here */
	BUG_ON(err == -ENOSPC);
	return err;
}
2801 
2802 static int link_to_upper(struct btrfs_trans_handle *trans,
2803 			 struct reloc_control *rc,
2804 			 struct backref_node *node,
2805 			 struct btrfs_path *path)
2806 {
2807 	struct btrfs_key key;
2808 
2809 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2810 	return do_relocation(trans, rc, node, &key, path, 0);
2811 }
2812 
2813 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2814 				struct reloc_control *rc,
2815 				struct btrfs_path *path, int err)
2816 {
2817 	LIST_HEAD(list);
2818 	struct backref_cache *cache = &rc->backref_cache;
2819 	struct backref_node *node;
2820 	int level;
2821 	int ret;
2822 
2823 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2824 		while (!list_empty(&cache->pending[level])) {
2825 			node = list_entry(cache->pending[level].next,
2826 					  struct backref_node, list);
2827 			list_move_tail(&node->list, &list);
2828 			BUG_ON(!node->pending);
2829 
2830 			if (!err) {
2831 				ret = link_to_upper(trans, rc, node, path);
2832 				if (ret < 0)
2833 					err = ret;
2834 			}
2835 		}
2836 		list_splice_init(&list, &cache->pending[level]);
2837 	}
2838 	return err;
2839 }
2840 
2841 static void mark_block_processed(struct reloc_control *rc,
2842 				 u64 bytenr, u32 blocksize)
2843 {
2844 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2845 			EXTENT_DIRTY, GFP_NOFS);
2846 }
2847 
2848 static void __mark_block_processed(struct reloc_control *rc,
2849 				   struct backref_node *node)
2850 {
2851 	u32 blocksize;
2852 	if (node->level == 0 ||
2853 	    in_block_group(node->bytenr, rc->block_group)) {
2854 		blocksize = rc->extent_root->nodesize;
2855 		mark_block_processed(rc, node->bytenr, blocksize);
2856 	}
2857 	node->processed = 1;
2858 }
2859 
/*
 * mark a block and all blocks directly/indirectly reference the block
 * as processed.
 */
static void update_processed_blocks(struct reloc_control *rc,
				    struct backref_node *node)
{
	struct backref_node *next = node;
	struct backref_edge *edge;
	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	while (next) {
		cond_resched();
		while (1) {
			/* already visited: its uppers are done too */
			if (next->processed)
				break;

			__mark_block_processed(rc, next);

			if (list_empty(&next->upper))
				break;

			/* climb the first upper edge; siblings are reached
			 * via walk_down_backref below */
			edge = list_entry(next->upper.next,
					  struct backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
}
2891 
2892 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2893 {
2894 	u32 blocksize = rc->extent_root->nodesize;
2895 
2896 	if (test_range_bit(&rc->processed_blocks, bytenr,
2897 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2898 		return 1;
2899 	return 0;
2900 }
2901 
/*
 * Read the tree block described by @block and fill in block->key with
 * its first item/node key, marking the key ready.
 * Note: block->key.offset carries the block's generation at this point.
 * Returns 0 on success, negative errno on read failure.
 */
static int get_tree_block_key(struct reloc_control *rc,
			      struct tree_block *block)
{
	struct extent_buffer *eb;

	BUG_ON(block->key_ready);
	eb = read_tree_block(rc->extent_root, block->bytenr,
			     block->key.offset);
	if (IS_ERR(eb)) {
		return PTR_ERR(eb);
	} else if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return -EIO;
	}
	WARN_ON(btrfs_header_level(eb) != block->level);
	if (block->level == 0)
		btrfs_item_key_to_cpu(eb, &block->key, 0);
	else
		btrfs_node_key_to_cpu(eb, &block->key, 0);
	free_extent_buffer(eb);
	block->key_ready = 1;
	return 0;
}
2925 
/*
 * helper function to relocate a tree block
 */
static int relocate_tree_block(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct backref_node *node,
				struct btrfs_key *key,
				struct btrfs_path *path)
{
	struct btrfs_root *root;
	int ret = 0;

	if (!node)
		return 0;

	BUG_ON(node->processed);
	root = select_one_root(node);
	if (root == ERR_PTR(-ENOENT)) {
		/* block belongs to a reloc tree root: nothing to do */
		update_processed_blocks(rc, node);
		goto out;
	}

	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = reserve_metadata_space(trans, rc, node);
		if (ret)
			goto out;
	}

	if (root) {
		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
			/*
			 * ref-counted tree root block: point the backref
			 * node at the reloc tree root instead of COWing.
			 */
			BUG_ON(node->new_bytenr);
			BUG_ON(!list_empty(&node->list));
			btrfs_record_root_in_trans(trans, root);
			root = root->reloc_root;
			node->new_bytenr = root->node->start;
			node->root = root;
			list_add_tail(&node->list, &rc->backref_cache.changed);
		} else {
			/*
			 * non-ref-counted tree: searching with cow == 1
			 * relocates the block as a side effect.
			 */
			path->lowest_level = node->level;
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			btrfs_release_path(path);
			if (ret > 0)
				ret = 0;
		}
		if (!ret)
			update_processed_blocks(rc, node);
	} else {
		/* referenced by multiple ref-counted trees */
		ret = do_relocation(trans, rc, node, key, path, 1);
	}
out:
	if (ret || node->level == 0 || node->cowonly)
		remove_backref_node(&rc->backref_cache, node);
	return ret;
}
2980 
/*
 * relocate a list of blocks
 *
 * Three passes over @blocks: readahead, key lookup for blocks whose
 * key is not ready yet, then backref-tree build + relocation.  Always
 * frees @blocks before returning.
 */
static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc, struct rb_root *blocks)
{
	struct backref_node *node;
	struct btrfs_path *path;
	struct tree_block *block;
	struct rb_node *rb_node;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_free_blocks;
	}

	/* pass 1: kick off readahead for blocks we still need to read */
	rb_node = rb_first(blocks);
	while (rb_node) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		if (!block->key_ready)
			readahead_tree_block(rc->extent_root, block->bytenr);
		rb_node = rb_next(rb_node);
	}

	/* pass 2: read blocks and resolve their keys */
	rb_node = rb_first(blocks);
	while (rb_node) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		if (!block->key_ready) {
			err = get_tree_block_key(rc, block);
			if (err)
				goto out_free_path;
		}
		rb_node = rb_next(rb_node);
	}

	/* pass 3: build backref trees and relocate each block */
	rb_node = rb_first(blocks);
	while (rb_node) {
		block = rb_entry(rb_node, struct tree_block, rb_node);

		node = build_backref_tree(rc, &block->key,
					  block->level, block->bytenr);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			goto out;
		}

		ret = relocate_tree_block(trans, rc, node, &block->key,
					  path);
		if (ret < 0) {
			/* -EAGAIN on the first block means nothing was done */
			if (ret != -EAGAIN || rb_node == rb_first(blocks))
				err = ret;
			goto out;
		}
		rb_node = rb_next(rb_node);
	}
out:
	err = finish_pending_nodes(trans, rc, path, err);

out_free_path:
	btrfs_free_path(path);
out_free_blocks:
	free_block_list(blocks);
	return err;
}
3049 
/*
 * Preallocate file ranges in the relocation inode for every extent in
 * @cluster, so the data can be written to its new location.  The inode
 * offset is the cluster start minus index_cnt (the inode's mapping
 * base).  Returns 0 on success, negative errno on failure.
 */
static noinline_for_stack
int prealloc_file_extent_cluster(struct inode *inode,
				 struct file_extent_cluster *cluster)
{
	u64 alloc_hint = 0;
	u64 start;
	u64 end;
	u64 offset = BTRFS_I(inode)->index_cnt;
	u64 num_bytes;
	int nr = 0;
	int ret = 0;

	BUG_ON(cluster->start != cluster->boundary[0]);
	mutex_lock(&inode->i_mutex);

	ret = btrfs_check_data_free_space(inode, cluster->start,
					  cluster->end + 1 - cluster->start);
	if (ret)
		goto out;

	/* preallocate one range per extent boundary in the cluster */
	while (nr < cluster->nr) {
		start = cluster->boundary[nr] - offset;
		if (nr + 1 < cluster->nr)
			end = cluster->boundary[nr + 1] - 1 - offset;
		else
			end = cluster->end - offset;

		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
		num_bytes = end + 1 - start;
		ret = btrfs_prealloc_file_range(inode, 0, start,
						num_bytes, num_bytes,
						end + 1, &alloc_hint);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
		if (ret)
			break;
		nr++;
	}
	/* release the data space reservation taken above; the prealloc
	 * ranges hold their own reservation now */
	btrfs_free_reserved_data_space(inode, cluster->start,
				       cluster->end + 1 - cluster->start);
out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
3093 
/*
 * Insert a pinned extent map covering [start, end] that points at
 * @block_start, dropping any cached extent maps that collide with it.
 * Returns 0 on success or negative errno.
 */
static noinline_for_stack
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
			 u64 block_start)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	int ret = 0;

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;

	em->start = start;
	em->len = end + 1 - start;
	em->block_len = em->len;
	em->block_start = block_start;
	em->bdev = root->fs_info->fs_devices->latest_bdev;
	/* pinned: must not be merged or dropped until writeback is done */
	set_bit(EXTENT_FLAG_PINNED, &em->flags);

	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
	while (1) {
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 0);
		write_unlock(&em_tree->lock);
		if (ret != -EEXIST) {
			free_extent_map(em);
			break;
		}
		/* collision: evict the stale mapping and retry */
		btrfs_drop_extent_cache(inode, start, end, 0);
	}
	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
	return ret;
}
3128 
/*
 * Copy the data of one extent cluster into the relocation inode:
 * preallocate the target ranges, map them to the old data location,
 * then read each page and dirty it as delalloc so writeback lands the
 * data in the preallocated (new) location.  Pages at cluster extent
 * boundaries get EXTENT_BOUNDARY so writeback does not merge extents.
 * Returns 0 on success or negative errno.
 */
static int relocate_file_extent_cluster(struct inode *inode,
					struct file_extent_cluster *cluster)
{
	u64 page_start;
	u64 page_end;
	u64 offset = BTRFS_I(inode)->index_cnt;
	unsigned long index;
	unsigned long last_index;
	struct page *page;
	struct file_ra_state *ra;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int nr = 0;
	int ret = 0;

	if (!cluster->nr)
		return 0;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

	ret = prealloc_file_extent_cluster(inode, cluster);
	if (ret)
		goto out;

	file_ra_state_init(ra, inode->i_mapping);

	/* map the target range to the cluster's old data location */
	ret = setup_extent_mapping(inode, cluster->start - offset,
				   cluster->end - offset, cluster->start);
	if (ret)
		goto out;

	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
	while (index <= last_index) {
		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
		if (ret)
			goto out;

		page = find_lock_page(inode->i_mapping, index);
		if (!page) {
			page_cache_sync_readahead(inode->i_mapping,
						  ra, NULL, index,
						  last_index + 1 - index);
			page = find_or_create_page(inode->i_mapping, index,
						   mask);
			if (!page) {
				btrfs_delalloc_release_metadata(inode,
							PAGE_CACHE_SIZE);
				ret = -ENOMEM;
				goto out;
			}
		}

		if (PageReadahead(page)) {
			page_cache_async_readahead(inode->i_mapping,
						   ra, NULL, page, index,
						   last_index + 1 - index);
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				page_cache_release(page);
				btrfs_delalloc_release_metadata(inode,
							PAGE_CACHE_SIZE);
				ret = -EIO;
				goto out;
			}
		}

		page_start = page_offset(page);
		page_end = page_start + PAGE_CACHE_SIZE - 1;

		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);

		set_page_extent_mapped(page);

		/* first page of each extent in the cluster is a boundary */
		if (nr < cluster->nr &&
		    page_start + offset == cluster->boundary[nr]) {
			set_extent_bits(&BTRFS_I(inode)->io_tree,
					page_start, page_end,
					EXTENT_BOUNDARY, GFP_NOFS);
			nr++;
		}

		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
		set_page_dirty(page);

		unlock_extent(&BTRFS_I(inode)->io_tree,
			      page_start, page_end);
		unlock_page(page);
		page_cache_release(page);

		index++;
		balance_dirty_pages_ratelimited(inode->i_mapping);
		btrfs_throttle(BTRFS_I(inode)->root);
	}
	WARN_ON(nr != cluster->nr);
out:
	kfree(ra);
	return ret;
}
3234 
/*
 * Accumulate @extent_key into the current cluster of contiguous data
 * extents, flushing the cluster (via relocate_file_extent_cluster)
 * when the new extent is not adjacent or the cluster is full.
 * Returns 0 on success or negative errno from a flush.
 */
static noinline_for_stack
int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
			 struct file_extent_cluster *cluster)
{
	int ret;

	/* new extent is not contiguous with the cluster: flush first */
	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
		ret = relocate_file_extent_cluster(inode, cluster);
		if (ret)
			return ret;
		cluster->nr = 0;
	}

	if (!cluster->nr)
		cluster->start = extent_key->objectid;
	else
		BUG_ON(cluster->nr >= MAX_EXTENTS);
	cluster->end = extent_key->objectid + extent_key->offset - 1;
	cluster->boundary[cluster->nr] = extent_key->objectid;
	cluster->nr++;

	/* cluster full: relocate it now */
	if (cluster->nr >= MAX_EXTENTS) {
		ret = relocate_file_extent_cluster(inode, cluster);
		if (ret)
			return ret;
		cluster->nr = 0;
	}
	return 0;
}
3264 
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Find the owner objectid of the first v0 (pre-mixed-backref) extent ref
 * belonging to @extent_key, scanning forward from the current @path
 * position in the extent tree.
 *
 * On success the objectid is stored in *@ref_objectid.  If the scan had
 * to advance to the next leaf, *@path_change (when non-NULL) is set to 1
 * so the caller knows @path no longer points at the original item.
 *
 * Returns 0 on success, -ENOENT if no v0 ref is found for the extent, or
 * a negative error from btrfs_next_leaf().
 */
static int get_ref_objectid_v0(struct reloc_control *rc,
			       struct btrfs_path *path,
			       struct btrfs_key *extent_key,
			       u64 *ref_objectid, int *path_change)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_ref_v0 *ref0;
	int ret;
	int slot;

	leaf = path->nodes[0];
	slot = path->slots[0];
	while (1) {
		/* ran off the end of the current leaf, step to the next */
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(rc->extent_root, path);
			if (ret < 0)
				return ret;
			BUG_ON(ret > 0);
			leaf = path->nodes[0];
			slot = path->slots[0];
			if (path_change)
				*path_change = 1;
		}
		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* left the item range of this extent without finding a v0 ref */
		if (key.objectid != extent_key->objectid)
			return -ENOENT;

		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
			slot++;
			continue;
		}
		ref0 = btrfs_item_ptr(leaf, slot,
				struct btrfs_extent_ref_v0);
		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
		break;
	}
	return 0;
}
#endif
3306 
3307 /*
3308  * helper to add a tree block to the list.
3309  * the major work is getting the generation and level of the block
3310  */
3311 static int add_tree_block(struct reloc_control *rc,
3312 			  struct btrfs_key *extent_key,
3313 			  struct btrfs_path *path,
3314 			  struct rb_root *blocks)
3315 {
3316 	struct extent_buffer *eb;
3317 	struct btrfs_extent_item *ei;
3318 	struct btrfs_tree_block_info *bi;
3319 	struct tree_block *block;
3320 	struct rb_node *rb_node;
3321 	u32 item_size;
3322 	int level = -1;
3323 	u64 generation;
3324 
3325 	eb =  path->nodes[0];
3326 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3327 
3328 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3329 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3330 		ei = btrfs_item_ptr(eb, path->slots[0],
3331 				struct btrfs_extent_item);
3332 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3333 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3334 			level = btrfs_tree_block_level(eb, bi);
3335 		} else {
3336 			level = (int)extent_key->offset;
3337 		}
3338 		generation = btrfs_extent_generation(eb, ei);
3339 	} else {
3340 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3341 		u64 ref_owner;
3342 		int ret;
3343 
3344 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3345 		ret = get_ref_objectid_v0(rc, path, extent_key,
3346 					  &ref_owner, NULL);
3347 		if (ret < 0)
3348 			return ret;
3349 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3350 		level = (int)ref_owner;
3351 		/* FIXME: get real generation */
3352 		generation = 0;
3353 #else
3354 		BUG();
3355 #endif
3356 	}
3357 
3358 	btrfs_release_path(path);
3359 
3360 	BUG_ON(level == -1);
3361 
3362 	block = kmalloc(sizeof(*block), GFP_NOFS);
3363 	if (!block)
3364 		return -ENOMEM;
3365 
3366 	block->bytenr = extent_key->objectid;
3367 	block->key.objectid = rc->extent_root->nodesize;
3368 	block->key.offset = generation;
3369 	block->level = level;
3370 	block->key_ready = 0;
3371 
3372 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3373 	if (rb_node)
3374 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3375 
3376 	return 0;
3377 }
3378 
3379 /*
3380  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3381  */
3382 static int __add_tree_block(struct reloc_control *rc,
3383 			    u64 bytenr, u32 blocksize,
3384 			    struct rb_root *blocks)
3385 {
3386 	struct btrfs_path *path;
3387 	struct btrfs_key key;
3388 	int ret;
3389 	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3390 					SKINNY_METADATA);
3391 
3392 	if (tree_block_processed(bytenr, rc))
3393 		return 0;
3394 
3395 	if (tree_search(blocks, bytenr))
3396 		return 0;
3397 
3398 	path = btrfs_alloc_path();
3399 	if (!path)
3400 		return -ENOMEM;
3401 again:
3402 	key.objectid = bytenr;
3403 	if (skinny) {
3404 		key.type = BTRFS_METADATA_ITEM_KEY;
3405 		key.offset = (u64)-1;
3406 	} else {
3407 		key.type = BTRFS_EXTENT_ITEM_KEY;
3408 		key.offset = blocksize;
3409 	}
3410 
3411 	path->search_commit_root = 1;
3412 	path->skip_locking = 1;
3413 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3414 	if (ret < 0)
3415 		goto out;
3416 
3417 	if (ret > 0 && skinny) {
3418 		if (path->slots[0]) {
3419 			path->slots[0]--;
3420 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3421 					      path->slots[0]);
3422 			if (key.objectid == bytenr &&
3423 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3424 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3425 			      key.offset == blocksize)))
3426 				ret = 0;
3427 		}
3428 
3429 		if (ret) {
3430 			skinny = false;
3431 			btrfs_release_path(path);
3432 			goto again;
3433 		}
3434 	}
3435 	BUG_ON(ret);
3436 
3437 	ret = add_tree_block(rc, &key, path, blocks);
3438 out:
3439 	btrfs_free_path(path);
3440 	return ret;
3441 }
3442 
3443 /*
3444  * helper to check if the block use full backrefs for pointers in it
3445  */
3446 static int block_use_full_backref(struct reloc_control *rc,
3447 				  struct extent_buffer *eb)
3448 {
3449 	u64 flags;
3450 	int ret;
3451 
3452 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3453 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3454 		return 1;
3455 
3456 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3457 				       eb->start, btrfs_header_level(eb), 1,
3458 				       NULL, &flags);
3459 	BUG_ON(ret);
3460 
3461 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3462 		ret = 1;
3463 	else
3464 		ret = 0;
3465 	return ret;
3466 }
3467 
/*
 * Truncate (and effectively invalidate) the free space cache inode for
 * @block_group so its extents no longer pin the area being relocated.
 *
 * @inode may be passed in directly by the caller; otherwise it is looked
 * up by @ino in the tree root.  The inode reference is dropped before
 * returning in either case.
 *
 * Returns 0 on success, -ENOENT if the cache inode does not exist or is
 * bad, or another negative errno.
 */
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_group_cache *block_group,
				    struct inode *inode,
				    u64 ino)
{
	struct btrfs_key key;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (inode)
		goto truncate;

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode) || is_bad_inode(inode)) {
		if (!IS_ERR(inode))
			iput(inode);
		return -ENOENT;
	}

truncate:
	/* make sure the global rsv can cover the truncation */
	ret = btrfs_check_trunc_cache_free_space(root,
						 &fs_info->global_block_rsv);
	if (ret)
		goto out;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);

	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);
out:
	iput(inode);
	return ret;
}
3512 
3513 /*
3514  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3515  * this function scans fs tree to find blocks reference the data extent
3516  */
3517 static int find_data_references(struct reloc_control *rc,
3518 				struct btrfs_key *extent_key,
3519 				struct extent_buffer *leaf,
3520 				struct btrfs_extent_data_ref *ref,
3521 				struct rb_root *blocks)
3522 {
3523 	struct btrfs_path *path;
3524 	struct tree_block *block;
3525 	struct btrfs_root *root;
3526 	struct btrfs_file_extent_item *fi;
3527 	struct rb_node *rb_node;
3528 	struct btrfs_key key;
3529 	u64 ref_root;
3530 	u64 ref_objectid;
3531 	u64 ref_offset;
3532 	u32 ref_count;
3533 	u32 nritems;
3534 	int err = 0;
3535 	int added = 0;
3536 	int counted;
3537 	int ret;
3538 
3539 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3540 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3541 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3542 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3543 
3544 	/*
3545 	 * This is an extent belonging to the free space cache, lets just delete
3546 	 * it and redo the search.
3547 	 */
3548 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3549 		ret = delete_block_group_cache(rc->extent_root->fs_info,
3550 					       rc->block_group,
3551 					       NULL, ref_objectid);
3552 		if (ret != -ENOENT)
3553 			return ret;
3554 		ret = 0;
3555 	}
3556 
3557 	path = btrfs_alloc_path();
3558 	if (!path)
3559 		return -ENOMEM;
3560 	path->reada = 1;
3561 
3562 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
3563 	if (IS_ERR(root)) {
3564 		err = PTR_ERR(root);
3565 		goto out;
3566 	}
3567 
3568 	key.objectid = ref_objectid;
3569 	key.type = BTRFS_EXTENT_DATA_KEY;
3570 	if (ref_offset > ((u64)-1 << 32))
3571 		key.offset = 0;
3572 	else
3573 		key.offset = ref_offset;
3574 
3575 	path->search_commit_root = 1;
3576 	path->skip_locking = 1;
3577 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3578 	if (ret < 0) {
3579 		err = ret;
3580 		goto out;
3581 	}
3582 
3583 	leaf = path->nodes[0];
3584 	nritems = btrfs_header_nritems(leaf);
3585 	/*
3586 	 * the references in tree blocks that use full backrefs
3587 	 * are not counted in
3588 	 */
3589 	if (block_use_full_backref(rc, leaf))
3590 		counted = 0;
3591 	else
3592 		counted = 1;
3593 	rb_node = tree_search(blocks, leaf->start);
3594 	if (rb_node) {
3595 		if (counted)
3596 			added = 1;
3597 		else
3598 			path->slots[0] = nritems;
3599 	}
3600 
3601 	while (ref_count > 0) {
3602 		while (path->slots[0] >= nritems) {
3603 			ret = btrfs_next_leaf(root, path);
3604 			if (ret < 0) {
3605 				err = ret;
3606 				goto out;
3607 			}
3608 			if (WARN_ON(ret > 0))
3609 				goto out;
3610 
3611 			leaf = path->nodes[0];
3612 			nritems = btrfs_header_nritems(leaf);
3613 			added = 0;
3614 
3615 			if (block_use_full_backref(rc, leaf))
3616 				counted = 0;
3617 			else
3618 				counted = 1;
3619 			rb_node = tree_search(blocks, leaf->start);
3620 			if (rb_node) {
3621 				if (counted)
3622 					added = 1;
3623 				else
3624 					path->slots[0] = nritems;
3625 			}
3626 		}
3627 
3628 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3629 		if (WARN_ON(key.objectid != ref_objectid ||
3630 		    key.type != BTRFS_EXTENT_DATA_KEY))
3631 			break;
3632 
3633 		fi = btrfs_item_ptr(leaf, path->slots[0],
3634 				    struct btrfs_file_extent_item);
3635 
3636 		if (btrfs_file_extent_type(leaf, fi) ==
3637 		    BTRFS_FILE_EXTENT_INLINE)
3638 			goto next;
3639 
3640 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3641 		    extent_key->objectid)
3642 			goto next;
3643 
3644 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3645 		if (key.offset != ref_offset)
3646 			goto next;
3647 
3648 		if (counted)
3649 			ref_count--;
3650 		if (added)
3651 			goto next;
3652 
3653 		if (!tree_block_processed(leaf->start, rc)) {
3654 			block = kmalloc(sizeof(*block), GFP_NOFS);
3655 			if (!block) {
3656 				err = -ENOMEM;
3657 				break;
3658 			}
3659 			block->bytenr = leaf->start;
3660 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3661 			block->level = 0;
3662 			block->key_ready = 1;
3663 			rb_node = tree_insert(blocks, block->bytenr,
3664 					      &block->rb_node);
3665 			if (rb_node)
3666 				backref_tree_panic(rb_node, -EEXIST,
3667 						   block->bytenr);
3668 		}
3669 		if (counted)
3670 			added = 1;
3671 		else
3672 			path->slots[0] = nritems;
3673 next:
3674 		path->slots[0]++;
3675 
3676 	}
3677 out:
3678 	btrfs_free_path(path);
3679 	return err;
3680 }
3681 
3682 /*
3683  * helper to find all tree blocks that reference a given data extent
3684  */
3685 static noinline_for_stack
3686 int add_data_references(struct reloc_control *rc,
3687 			struct btrfs_key *extent_key,
3688 			struct btrfs_path *path,
3689 			struct rb_root *blocks)
3690 {
3691 	struct btrfs_key key;
3692 	struct extent_buffer *eb;
3693 	struct btrfs_extent_data_ref *dref;
3694 	struct btrfs_extent_inline_ref *iref;
3695 	unsigned long ptr;
3696 	unsigned long end;
3697 	u32 blocksize = rc->extent_root->nodesize;
3698 	int ret = 0;
3699 	int err = 0;
3700 
3701 	eb = path->nodes[0];
3702 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3703 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3704 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3705 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3706 		ptr = end;
3707 	else
3708 #endif
3709 		ptr += sizeof(struct btrfs_extent_item);
3710 
3711 	while (ptr < end) {
3712 		iref = (struct btrfs_extent_inline_ref *)ptr;
3713 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3714 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3715 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3716 			ret = __add_tree_block(rc, key.offset, blocksize,
3717 					       blocks);
3718 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3719 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3720 			ret = find_data_references(rc, extent_key,
3721 						   eb, dref, blocks);
3722 		} else {
3723 			BUG();
3724 		}
3725 		if (ret) {
3726 			err = ret;
3727 			goto out;
3728 		}
3729 		ptr += btrfs_extent_inline_ref_size(key.type);
3730 	}
3731 	WARN_ON(ptr > end);
3732 
3733 	while (1) {
3734 		cond_resched();
3735 		eb = path->nodes[0];
3736 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3737 			ret = btrfs_next_leaf(rc->extent_root, path);
3738 			if (ret < 0) {
3739 				err = ret;
3740 				break;
3741 			}
3742 			if (ret > 0)
3743 				break;
3744 			eb = path->nodes[0];
3745 		}
3746 
3747 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3748 		if (key.objectid != extent_key->objectid)
3749 			break;
3750 
3751 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3752 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3753 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3754 #else
3755 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3756 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3757 #endif
3758 			ret = __add_tree_block(rc, key.offset, blocksize,
3759 					       blocks);
3760 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3761 			dref = btrfs_item_ptr(eb, path->slots[0],
3762 					      struct btrfs_extent_data_ref);
3763 			ret = find_data_references(rc, extent_key,
3764 						   eb, dref, blocks);
3765 		} else {
3766 			ret = 0;
3767 		}
3768 		if (ret) {
3769 			err = ret;
3770 			break;
3771 		}
3772 		path->slots[0]++;
3773 	}
3774 out:
3775 	btrfs_release_path(path);
3776 	if (err)
3777 		free_block_list(blocks);
3778 	return err;
3779 }
3780 
3781 /*
3782  * helper to find next unprocessed extent
3783  */
3784 static noinline_for_stack
3785 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3786 		     struct btrfs_key *extent_key)
3787 {
3788 	struct btrfs_key key;
3789 	struct extent_buffer *leaf;
3790 	u64 start, end, last;
3791 	int ret;
3792 
3793 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3794 	while (1) {
3795 		cond_resched();
3796 		if (rc->search_start >= last) {
3797 			ret = 1;
3798 			break;
3799 		}
3800 
3801 		key.objectid = rc->search_start;
3802 		key.type = BTRFS_EXTENT_ITEM_KEY;
3803 		key.offset = 0;
3804 
3805 		path->search_commit_root = 1;
3806 		path->skip_locking = 1;
3807 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3808 					0, 0);
3809 		if (ret < 0)
3810 			break;
3811 next:
3812 		leaf = path->nodes[0];
3813 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3814 			ret = btrfs_next_leaf(rc->extent_root, path);
3815 			if (ret != 0)
3816 				break;
3817 			leaf = path->nodes[0];
3818 		}
3819 
3820 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3821 		if (key.objectid >= last) {
3822 			ret = 1;
3823 			break;
3824 		}
3825 
3826 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3827 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3828 			path->slots[0]++;
3829 			goto next;
3830 		}
3831 
3832 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3833 		    key.objectid + key.offset <= rc->search_start) {
3834 			path->slots[0]++;
3835 			goto next;
3836 		}
3837 
3838 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3839 		    key.objectid + rc->extent_root->nodesize <=
3840 		    rc->search_start) {
3841 			path->slots[0]++;
3842 			goto next;
3843 		}
3844 
3845 		ret = find_first_extent_bit(&rc->processed_blocks,
3846 					    key.objectid, &start, &end,
3847 					    EXTENT_DIRTY, NULL);
3848 
3849 		if (ret == 0 && start <= key.objectid) {
3850 			btrfs_release_path(path);
3851 			rc->search_start = end + 1;
3852 		} else {
3853 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3854 				rc->search_start = key.objectid + key.offset;
3855 			else
3856 				rc->search_start = key.objectid +
3857 					rc->extent_root->nodesize;
3858 			memcpy(extent_key, &key, sizeof(key));
3859 			return 0;
3860 		}
3861 	}
3862 	btrfs_release_path(path);
3863 	return ret;
3864 }
3865 
/* publish @rc as the active relocation control under reloc_mutex */
static void set_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = rc;
	mutex_unlock(&fs_info->reloc_mutex);
}
3874 
/* clear the active relocation control under reloc_mutex */
static void unset_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = NULL;
	mutex_unlock(&fs_info->reloc_mutex);
}
3883 
3884 static int check_extent_flags(u64 flags)
3885 {
3886 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3887 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3888 		return 1;
3889 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3890 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3891 		return 1;
3892 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3893 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3894 		return 1;
3895 	return 0;
3896 }
3897 
/*
 * Set up the reloc_control for a relocation pass: allocate the temporary
 * block reservation, reset per-pass counters, publish the control and
 * commit a transaction so relocation starts from a clean commit root.
 *
 * Returns 0 on success or a negative errno; on transaction failure the
 * control is unpublished and the caller is responsible for freeing the
 * block rsv.
 */
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
	struct btrfs_trans_handle *trans;

	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
					      BTRFS_BLOCK_RSV_TEMP);
	if (!rc->block_rsv)
		return -ENOMEM;

	memset(&rc->cluster, 0, sizeof(rc->cluster));
	rc->search_start = rc->block_group->key.objectid;
	rc->extents_found = 0;
	rc->nodes_relocated = 0;
	rc->merging_rsv_size = 0;
	rc->reserved_bytes = 0;
	/* reserve enough metadata space for the relocation's COW work */
	rc->block_rsv->size = rc->extent_root->nodesize *
			      RELOCATION_RESERVED_NODES;

	rc->create_reloc_tree = 1;
	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		unset_reloc_control(rc);
		/*
		 * extent tree is not a ref_cow tree and has no reloc_root to
		 * cleanup.  And callers are responsible to free the above
		 * block rsv.
		 */
		return PTR_ERR(trans);
	}
	btrfs_commit_transaction(trans, rc->extent_root);
	return 0;
}
3933 
/*
 * Core relocation loop for a block group.
 *
 * Iterates over all extents in rc->block_group (via find_next_extent),
 * queues referencing tree blocks and relocates them, and builds data
 * extent clusters which are copied through rc->data_inode.  Afterwards
 * the reloc trees are merged back into their fs trees and pinned extents
 * are released by a final commit.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
	struct rb_root blocks = RB_ROOT;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	u64 flags;
	u32 item_size;
	int ret;
	int err = 0;
	int progress = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 1;

	ret = prepare_to_relocate(rc);
	if (ret) {
		err = ret;
		goto out_free;
	}

	while (1) {
		rc->reserved_bytes = 0;
		/* refill the metadata reservation for this iteration */
		ret = btrfs_block_rsv_refill(rc->extent_root,
					rc->block_rsv, rc->block_rsv->size,
					BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			break;
		}
		progress++;
		trans = btrfs_start_transaction(rc->extent_root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			break;
		}
restart:
		/* backref cache went stale across a commit: retry the loop */
		if (update_backref_cache(trans, &rc->backref_cache)) {
			btrfs_end_transaction(trans, rc->extent_root);
			continue;
		}

		ret = find_next_extent(rc, path, &key);
		if (ret < 0)
			err = ret;
		if (ret != 0)
			break;

		rc->extents_found++;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_extent_item);
		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			flags = btrfs_extent_flags(path->nodes[0], ei);
			ret = check_extent_flags(flags);
			BUG_ON(ret);

		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			/* v0 extent item: derive flags from the ref owner */
			u64 ref_owner;
			int path_change = 0;

			BUG_ON(item_size !=
			       sizeof(struct btrfs_extent_item_v0));
			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
						  &path_change);
			if (ret < 0) {
				err = ret;
				break;
			}
			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
			else
				flags = BTRFS_EXTENT_FLAG_DATA;

			/* the v0 lookup moved the path; reposition it */
			if (path_change) {
				btrfs_release_path(path);

				path->search_commit_root = 1;
				path->skip_locking = 1;
				ret = btrfs_search_slot(NULL, rc->extent_root,
							&key, path, 0, 0);
				if (ret < 0) {
					err = ret;
					break;
				}
				BUG_ON(ret > 0);
			}
#else
			BUG();
#endif
		}

		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = add_tree_block(rc, &key, path, &blocks);
		} else if (rc->stage == UPDATE_DATA_PTRS &&
			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
			ret = add_data_references(rc, &key, path, &blocks);
		} else {
			btrfs_release_path(path);
			ret = 0;
		}
		if (ret < 0) {
			err = ret;
			break;
		}

		if (!RB_EMPTY_ROOT(&blocks)) {
			ret = relocate_tree_blocks(trans, rc, &blocks);
			if (ret < 0) {
				/*
				 * if we fail to relocate tree blocks, force to update
				 * backref cache when committing transaction.
				 */
				rc->backref_cache.last_trans = trans->transid - 1;

				if (ret != -EAGAIN) {
					err = ret;
					break;
				}
				/* -EAGAIN: redo this extent on the next pass */
				rc->extents_found--;
				rc->search_start = key.objectid;
			}
		}

		btrfs_end_transaction_throttle(trans, rc->extent_root);
		btrfs_btree_balance_dirty(rc->extent_root);
		trans = NULL;

		if (rc->stage == MOVE_DATA_EXTENTS &&
		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
			rc->found_file_extent = 1;
			ret = relocate_data_extent(rc->data_inode,
						   &key, &rc->cluster);
			if (ret < 0) {
				err = ret;
				break;
			}
		}
	}
	/*
	 * out of space but some progress was made: try allocating a new
	 * chunk and restarting instead of giving up
	 */
	if (trans && progress && err == -ENOSPC) {
		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
					      rc->block_group->flags);
		if (ret == 1) {
			err = 0;
			progress = 0;
			goto restart;
		}
	}

	btrfs_release_path(path);
	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
			  GFP_NOFS);

	if (trans) {
		btrfs_end_transaction_throttle(trans, rc->extent_root);
		btrfs_btree_balance_dirty(rc->extent_root);
	}

	/* flush the last (possibly partial) data extent cluster */
	if (!err) {
		ret = relocate_file_extent_cluster(rc->data_inode,
						   &rc->cluster);
		if (ret < 0)
			err = ret;
	}

	rc->create_reloc_tree = 0;
	set_reloc_control(rc);

	backref_cache_cleanup(&rc->backref_cache);
	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);

	err = prepare_to_merge(rc, err);

	merge_reloc_roots(rc);

	rc->merge_reloc_tree = 0;
	unset_reloc_control(rc);
	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);

	/* get rid of pinned extents */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans))
		err = PTR_ERR(trans);
	else
		btrfs_commit_transaction(trans, rc->extent_root);
out_free:
	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
	btrfs_free_path(path);
	return err;
}
4130 
/*
 * Insert an empty inode item with the given @objectid into @root.
 *
 * The inode is created as a regular file (mode 0600) with size 0,
 * link count left at 0, and NOCOMPRESS|PREALLOC flags set — suitable
 * as the backing inode for data relocation.
 *
 * Returns 0 on success or a negative errno.
 */
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_inode_item *item;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
	btrfs_set_inode_generation(leaf, item, 1);
	btrfs_set_inode_size(leaf, item, 0);
	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
					  BTRFS_INODE_PREALLOC);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}
4160 
4161 /*
4162  * helper to create inode for data relocation.
4163  * the inode is in data relocation tree and its link count is 0
4164  */
4165 static noinline_for_stack
4166 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4167 				 struct btrfs_block_group_cache *group)
4168 {
4169 	struct inode *inode = NULL;
4170 	struct btrfs_trans_handle *trans;
4171 	struct btrfs_root *root;
4172 	struct btrfs_key key;
4173 	u64 objectid;
4174 	int err = 0;
4175 
4176 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4177 	if (IS_ERR(root))
4178 		return ERR_CAST(root);
4179 
4180 	trans = btrfs_start_transaction(root, 6);
4181 	if (IS_ERR(trans))
4182 		return ERR_CAST(trans);
4183 
4184 	err = btrfs_find_free_objectid(root, &objectid);
4185 	if (err)
4186 		goto out;
4187 
4188 	err = __insert_orphan_inode(trans, root, objectid);
4189 	BUG_ON(err);
4190 
4191 	key.objectid = objectid;
4192 	key.type = BTRFS_INODE_ITEM_KEY;
4193 	key.offset = 0;
4194 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4195 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4196 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4197 
4198 	err = btrfs_orphan_add(trans, inode);
4199 out:
4200 	btrfs_end_transaction(trans, root);
4201 	btrfs_btree_balance_dirty(root);
4202 	if (err) {
4203 		if (inode)
4204 			iput(inode);
4205 		inode = ERR_PTR(err);
4206 	}
4207 	return inode;
4208 }
4209 
4210 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4211 {
4212 	struct reloc_control *rc;
4213 
4214 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4215 	if (!rc)
4216 		return NULL;
4217 
4218 	INIT_LIST_HEAD(&rc->reloc_roots);
4219 	backref_cache_init(&rc->backref_cache);
4220 	mapping_tree_init(&rc->reloc_root_tree);
4221 	extent_io_tree_init(&rc->processed_blocks,
4222 			    fs_info->btree_inode->i_mapping);
4223 	return rc;
4224 }
4225 
4226 /*
4227  * function to relocate all extents in a block group.
4228  */
4229 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4230 {
4231 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
4232 	struct reloc_control *rc;
4233 	struct inode *inode;
4234 	struct btrfs_path *path;
4235 	int ret;
4236 	int rw = 0;
4237 	int err = 0;
4238 
4239 	rc = alloc_reloc_control(fs_info);
4240 	if (!rc)
4241 		return -ENOMEM;
4242 
4243 	rc->extent_root = extent_root;
4244 
4245 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4246 	BUG_ON(!rc->block_group);
4247 
4248 	ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
4249 	if (ret) {
4250 		err = ret;
4251 		goto out;
4252 	}
4253 	rw = 1;
4254 
4255 	path = btrfs_alloc_path();
4256 	if (!path) {
4257 		err = -ENOMEM;
4258 		goto out;
4259 	}
4260 
4261 	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4262 					path);
4263 	btrfs_free_path(path);
4264 
4265 	if (!IS_ERR(inode))
4266 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4267 	else
4268 		ret = PTR_ERR(inode);
4269 
4270 	if (ret && ret != -ENOENT) {
4271 		err = ret;
4272 		goto out;
4273 	}
4274 
4275 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4276 	if (IS_ERR(rc->data_inode)) {
4277 		err = PTR_ERR(rc->data_inode);
4278 		rc->data_inode = NULL;
4279 		goto out;
4280 	}
4281 
4282 	btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4283 	       rc->block_group->key.objectid, rc->block_group->flags);
4284 
4285 	ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4286 	if (ret < 0) {
4287 		err = ret;
4288 		goto out;
4289 	}
4290 	btrfs_wait_ordered_roots(fs_info, -1);
4291 
4292 	while (1) {
4293 		mutex_lock(&fs_info->cleaner_mutex);
4294 		ret = relocate_block_group(rc);
4295 		mutex_unlock(&fs_info->cleaner_mutex);
4296 		if (ret < 0) {
4297 			err = ret;
4298 			goto out;
4299 		}
4300 
4301 		if (rc->extents_found == 0)
4302 			break;
4303 
4304 		btrfs_info(extent_root->fs_info, "found %llu extents",
4305 			rc->extents_found);
4306 
4307 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4308 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4309 						       (u64)-1);
4310 			if (ret) {
4311 				err = ret;
4312 				goto out;
4313 			}
4314 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4315 						 0, -1);
4316 			rc->stage = UPDATE_DATA_PTRS;
4317 		}
4318 	}
4319 
4320 	WARN_ON(rc->block_group->pinned > 0);
4321 	WARN_ON(rc->block_group->reserved > 0);
4322 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4323 out:
4324 	if (err && rw)
4325 		btrfs_dec_block_group_ro(extent_root, rc->block_group);
4326 	iput(rc->data_inode);
4327 	btrfs_put_block_group(rc->block_group);
4328 	kfree(rc);
4329 	return err;
4330 }
4331 
/*
 * Mark an orphaned reloc root as garbage so the cleaner will delete it:
 * reset its drop progress and set its ref count to 0 in the root tree.
 *
 * Returns 0 on success or a negative errno (a transaction-end error
 * takes precedence over the update result).
 */
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
		sizeof(root->root_item.drop_progress));
	root->root_item.drop_level = 0;
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, root->fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans, root->fs_info->tree_root);
	if (err)
		return err;
	return ret;
}
4353 
4354 /*
4355  * recover relocation interrupted by system crash.
4356  *
4357  * this function resumes merging reloc trees with corresponding fs trees.
4358  * this is important for keeping the sharing of tree blocks
4359  */
4360 int btrfs_recover_relocation(struct btrfs_root *root)
4361 {
4362 	LIST_HEAD(reloc_roots);
4363 	struct btrfs_key key;
4364 	struct btrfs_root *fs_root;
4365 	struct btrfs_root *reloc_root;
4366 	struct btrfs_path *path;
4367 	struct extent_buffer *leaf;
4368 	struct reloc_control *rc = NULL;
4369 	struct btrfs_trans_handle *trans;
4370 	int ret;
4371 	int err = 0;
4372 
4373 	path = btrfs_alloc_path();
4374 	if (!path)
4375 		return -ENOMEM;
4376 	path->reada = -1;
4377 
4378 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4379 	key.type = BTRFS_ROOT_ITEM_KEY;
4380 	key.offset = (u64)-1;
4381 
4382 	while (1) {
4383 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4384 					path, 0, 0);
4385 		if (ret < 0) {
4386 			err = ret;
4387 			goto out;
4388 		}
4389 		if (ret > 0) {
4390 			if (path->slots[0] == 0)
4391 				break;
4392 			path->slots[0]--;
4393 		}
4394 		leaf = path->nodes[0];
4395 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4396 		btrfs_release_path(path);
4397 
4398 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4399 		    key.type != BTRFS_ROOT_ITEM_KEY)
4400 			break;
4401 
4402 		reloc_root = btrfs_read_fs_root(root, &key);
4403 		if (IS_ERR(reloc_root)) {
4404 			err = PTR_ERR(reloc_root);
4405 			goto out;
4406 		}
4407 
4408 		list_add(&reloc_root->root_list, &reloc_roots);
4409 
4410 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4411 			fs_root = read_fs_root(root->fs_info,
4412 					       reloc_root->root_key.offset);
4413 			if (IS_ERR(fs_root)) {
4414 				ret = PTR_ERR(fs_root);
4415 				if (ret != -ENOENT) {
4416 					err = ret;
4417 					goto out;
4418 				}
4419 				ret = mark_garbage_root(reloc_root);
4420 				if (ret < 0) {
4421 					err = ret;
4422 					goto out;
4423 				}
4424 			}
4425 		}
4426 
4427 		if (key.offset == 0)
4428 			break;
4429 
4430 		key.offset--;
4431 	}
4432 	btrfs_release_path(path);
4433 
4434 	if (list_empty(&reloc_roots))
4435 		goto out;
4436 
4437 	rc = alloc_reloc_control(root->fs_info);
4438 	if (!rc) {
4439 		err = -ENOMEM;
4440 		goto out;
4441 	}
4442 
4443 	rc->extent_root = root->fs_info->extent_root;
4444 
4445 	set_reloc_control(rc);
4446 
4447 	trans = btrfs_join_transaction(rc->extent_root);
4448 	if (IS_ERR(trans)) {
4449 		unset_reloc_control(rc);
4450 		err = PTR_ERR(trans);
4451 		goto out_free;
4452 	}
4453 
4454 	rc->merge_reloc_tree = 1;
4455 
4456 	while (!list_empty(&reloc_roots)) {
4457 		reloc_root = list_entry(reloc_roots.next,
4458 					struct btrfs_root, root_list);
4459 		list_del(&reloc_root->root_list);
4460 
4461 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4462 			list_add_tail(&reloc_root->root_list,
4463 				      &rc->reloc_roots);
4464 			continue;
4465 		}
4466 
4467 		fs_root = read_fs_root(root->fs_info,
4468 				       reloc_root->root_key.offset);
4469 		if (IS_ERR(fs_root)) {
4470 			err = PTR_ERR(fs_root);
4471 			list_add_tail(&reloc_root->root_list, &reloc_roots);
4472 			goto out_free;
4473 		}
4474 
4475 		err = __add_reloc_root(reloc_root);
4476 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4477 		fs_root->reloc_root = reloc_root;
4478 	}
4479 
4480 	err = btrfs_commit_transaction(trans, rc->extent_root);
4481 	if (err)
4482 		goto out_free;
4483 
4484 	merge_reloc_roots(rc);
4485 
4486 	unset_reloc_control(rc);
4487 
4488 	trans = btrfs_join_transaction(rc->extent_root);
4489 	if (IS_ERR(trans))
4490 		err = PTR_ERR(trans);
4491 	else
4492 		err = btrfs_commit_transaction(trans, rc->extent_root);
4493 out_free:
4494 	kfree(rc);
4495 out:
4496 	if (!list_empty(&reloc_roots))
4497 		free_reloc_roots(&reloc_roots);
4498 
4499 	btrfs_free_path(path);
4500 
4501 	if (err == 0) {
4502 		/* cleanup orphan inode in data relocation tree */
4503 		fs_root = read_fs_root(root->fs_info,
4504 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
4505 		if (IS_ERR(fs_root))
4506 			err = PTR_ERR(fs_root);
4507 		else
4508 			err = btrfs_orphan_cleanup(fs_root);
4509 	}
4510 	return err;
4511 }
4512 
4513 /*
4514  * helper to add ordered checksum for data relocation.
4515  *
4516  * cloning checksum properly handles the nodatasum extents.
4517  * it also saves CPU time to re-calculate the checksum.
4518  */
4519 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4520 {
4521 	struct btrfs_ordered_sum *sums;
4522 	struct btrfs_ordered_extent *ordered;
4523 	struct btrfs_root *root = BTRFS_I(inode)->root;
4524 	int ret;
4525 	u64 disk_bytenr;
4526 	u64 new_bytenr;
4527 	LIST_HEAD(list);
4528 
4529 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4530 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4531 
4532 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4533 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4534 				       disk_bytenr + len - 1, &list, 0);
4535 	if (ret)
4536 		goto out;
4537 
4538 	while (!list_empty(&list)) {
4539 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4540 		list_del_init(&sums->list);
4541 
4542 		/*
4543 		 * We need to offset the new_bytenr based on where the csum is.
4544 		 * We need to do this because we will read in entire prealloc
4545 		 * extents but we may have written to say the middle of the
4546 		 * prealloc extent, so we need to make sure the csum goes with
4547 		 * the right disk offset.
4548 		 *
4549 		 * We can do this because the data reloc inode refers strictly
4550 		 * to the on disk bytes, so we don't have to worry about
4551 		 * disk_len vs real len like with real inodes since it's all
4552 		 * disk length.
4553 		 */
4554 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4555 		sums->bytenr = new_bytenr;
4556 
4557 		btrfs_add_ordered_sum(inode, ordered, sums);
4558 	}
4559 out:
4560 	btrfs_put_ordered_extent(ordered);
4561 	return ret;
4562 }
4563 
/*
 * keep the relocation backref cache in sync when a tree block is COWed.
 *
 * if the COWed block is tracked in the backref cache, swap the cached
 * extent buffer for the new copy and record its new bytenr.  for leaves
 * COWed during the UPDATE_DATA_PTRS stage, rewrite their file extent
 * pointers to the relocated data.
 *
 * returns 0 on success or the error from replace_file_extents().
 */
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct reloc_control *rc;
	struct backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	/* no relocation in progress on this fs, nothing to track */
	rc = root->fs_info->reloc_ctl;
	if (!rc)
		return 0;

	/* the data reloc tree must not be COWed while data pointers
	 * are being updated */
	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);

	level = btrfs_header_level(buf);
	/* generation <= last_snapshot means this is the first COW of the
	 * block since the last snapshot point */
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		/* path[level] caches the backref node for the block
		 * currently being relocated at this level */
		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		/* swap the cached buffer: release the old eb and take a
		 * reference on the COW copy */
		drop_node_buffer(node);
		extent_buffer_get(cow);
		node->eb = cow;
		node->new_bytenr = cow->start;

		/* move the node onto the per-level pending list so the
		 * relocation code revisits it later */
		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			__mark_block_processed(rc, node);

		/* account relocated metadata; used for snapshot metadata
		 * reservation (see btrfs_reloc_pre_snapshot) */
		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	/* leaves COWed during data pointer update get their file extent
	 * items rewritten to point at the new data location */
	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}
4616 
4617 /*
4618  * called before creating snapshot. it calculates metadata reservation
 * required for relocating tree blocks in the snapshot
4620  */
4621 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4622 			      u64 *bytes_to_reserve)
4623 {
4624 	struct btrfs_root *root;
4625 	struct reloc_control *rc;
4626 
4627 	root = pending->root;
4628 	if (!root->reloc_root)
4629 		return;
4630 
4631 	rc = root->fs_info->reloc_ctl;
4632 	if (!rc->merge_reloc_tree)
4633 		return;
4634 
4635 	root = root->reloc_root;
4636 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4637 	/*
4638 	 * relocation is in the stage of merging trees. the space
4639 	 * used by merging a reloc tree is twice the size of
4640 	 * relocated tree nodes in the worst case. half for cowing
4641 	 * the reloc tree, half for cowing the fs tree. the space
4642 	 * used by cowing the reloc tree will be freed after the
4643 	 * tree is dropped. if we create snapshot, cowing the fs
4644 	 * tree may use more space than it frees. so we need
4645 	 * reserve extra space.
4646 	 */
4647 	*bytes_to_reserve += rc->nodes_relocated;
4648 }
4649 
4650 /*
4651  * called after snapshot is created. migrate block reservation
4652  * and create reloc root for the newly created snapshot
4653  */
4654 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4655 			       struct btrfs_pending_snapshot *pending)
4656 {
4657 	struct btrfs_root *root = pending->root;
4658 	struct btrfs_root *reloc_root;
4659 	struct btrfs_root *new_root;
4660 	struct reloc_control *rc;
4661 	int ret;
4662 
4663 	if (!root->reloc_root)
4664 		return 0;
4665 
4666 	rc = root->fs_info->reloc_ctl;
4667 	rc->merging_rsv_size += rc->nodes_relocated;
4668 
4669 	if (rc->merge_reloc_tree) {
4670 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4671 					      rc->block_rsv,
4672 					      rc->nodes_relocated);
4673 		if (ret)
4674 			return ret;
4675 	}
4676 
4677 	new_root = pending->snap;
4678 	reloc_root = create_reloc_root(trans, root->reloc_root,
4679 				       new_root->root_key.objectid);
4680 	if (IS_ERR(reloc_root))
4681 		return PTR_ERR(reloc_root);
4682 
4683 	ret = __add_reloc_root(reloc_root);
4684 	BUG_ON(ret < 0);
4685 	new_root->reloc_root = reloc_root;
4686 
4687 	if (rc->create_reloc_tree)
4688 		ret = clone_backref_node(trans, rc, root, reloc_root);
4689 	return ret;
4690 }
4691