• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 
16 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
17 		      *root, struct btrfs_path *path, int level);
18 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
19 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
20 		      int data_size, int extend);
21 static int push_node_left(struct btrfs_trans_handle *trans,
22 			  struct btrfs_fs_info *fs_info,
23 			  struct extent_buffer *dst,
24 			  struct extent_buffer *src, int empty);
25 static int balance_node_right(struct btrfs_trans_handle *trans,
26 			      struct btrfs_fs_info *fs_info,
27 			      struct extent_buffer *dst_buf,
28 			      struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30 		    int level, int slot);
31 
/*
 * Allocate a zeroed struct btrfs_path from the dedicated slab cache.
 * Returns NULL on allocation failure.  Free with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
36 
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* skip levels where we hold no node or no lock */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		/* remember the new lock type so it can be undone later */
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
54 
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/* make 'held' blocking first; track the resulting lock type */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks from the highest level downwards */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	/* finally convert 'held' back to a spinning lock as well */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
90 
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	/* NULL is accepted, like kfree() */
	if (!p)
		return;
	/* drop locks and extent buffer references before freeing */
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
99 
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent
 * buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		/* unlock before dropping our reference on the buffer */
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
122 
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
156 
157 /* loop around taking references on and locking the root node of the
158  * tree until you end up with a lock on the root.  A locked buffer
159  * is returned, with a reference held.
160  */
btrfs_lock_root_node(struct btrfs_root * root)161 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
162 {
163 	struct extent_buffer *eb;
164 
165 	while (1) {
166 		eb = btrfs_root_node(root);
167 		btrfs_tree_lock(eb);
168 		if (eb == root->node)
169 			break;
170 		btrfs_tree_unlock(eb);
171 		free_extent_buffer(eb);
172 	}
173 	return eb;
174 }
175 
176 /* loop around taking references on and locking the root node of the
177  * tree until you end up with a lock on the root.  A locked buffer
178  * is returned, with a reference held.
179  */
btrfs_read_lock_root_node(struct btrfs_root * root)180 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
181 {
182 	struct extent_buffer *eb;
183 
184 	while (1) {
185 		eb = btrfs_root_node(root);
186 		btrfs_tree_read_lock(eb);
187 		if (eb == root->node)
188 			break;
189 		btrfs_tree_read_unlock(eb);
190 		free_extent_buffer(eb);
191 	}
192 	return eb;
193 }
194 
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* fast path: already dirty, or this root is not tracked at all */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* re-check under the lock so only one caller moves the list entry */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
219 
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* ref-counted roots may only be copied inside their own transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* first key of the buffer is used as hint for the new block */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the buffer, then rewrite the header for the new location */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* reloc trees take a full backref (last arg 1), others a normal one */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
278 
/*
 * Operation types recorded in the tree mod log.  The *_WHILE_MOVING and
 * *_WHILE_FREEING removal variants are logged as part of a key move
 * (tree_mod_log_insert_move) and a whole-buffer free / root replace
 * (tree_mod_log_free_eb, tree_mod_log_insert_root) respectively.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
288 
/* Describes the old root for a MOD_LOG_ROOT_REPLACE record. */
struct tree_mod_root {
	u64 logical;	/* logical address of the old root node */
	u8 level;	/* level of the old root node */
};
293 
/* One record in the tree mod log, keyed by (logical, seq) in an rb tree. */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;	/* block this record refers to (new root for replaces) */
	u64 seq;	/* global tree mod sequence when the record was made */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
319 
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	/* atomic64 increment; returns the new, unique sequence number */
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
327 
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* the write lock serializes against put_tree_mod_seq and log inserts */
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
348 
/*
 * Drop the blocker @elem from the tree mod seq list and prune every log
 * record older than the oldest remaining blocker.  A no-op if @elem never
 * got a sequence number.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the minimum seq among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				write_unlock(&fs_info->tree_mod_log_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* save the successor before we erase the current node */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
396 
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	/* the sequence is assigned here, under the log lock */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	/* descend comparing (logical, seq); duplicates are rejected */
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
438 
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	/* no blockers registered: nobody cares about the log */
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves (level 0) are never logged */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	/* re-check under the lock; drop it again if logging is not needed */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}
461 
462 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
tree_mod_need_log(const struct btrfs_fs_info * fs_info,struct extent_buffer * eb)463 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
464 				    struct extent_buffer *eb)
465 {
466 	smp_mb();
467 	if (list_empty(&(fs_info)->tree_mod_seq_list))
468 		return 0;
469 	if (eb && btrfs_header_level(eb) == 0)
470 		return 0;
471 
472 	return 1;
473 }
474 
475 static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer * eb,int slot,enum mod_log_op op,gfp_t flags)476 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
477 		    enum mod_log_op op, gfp_t flags)
478 {
479 	struct tree_mod_elem *tm;
480 
481 	tm = kzalloc(sizeof(*tm), flags);
482 	if (!tm)
483 		return NULL;
484 
485 	tm->logical = eb->start;
486 	if (op != MOD_LOG_KEY_ADD) {
487 		btrfs_node_key(eb, &tm->key, slot);
488 		tm->blockptr = btrfs_node_blockptr(eb, slot);
489 	}
490 	tm->op = op;
491 	tm->slot = slot;
492 	tm->generation = btrfs_node_ptr_generation(eb, slot);
493 	RB_CLEAR_NODE(&tm->node);
494 
495 	return tm;
496 }
497 
/*
 * Log a single-key operation (@op at @slot in @eb) in the tree mod log.
 * Returns 0 on success (including when logging is not needed), -ENOMEM
 * on allocation failure, or the error from the rb-tree insert.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* cheap lockless check before doing any allocation */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* locked re-check; logging may have become unnecessary meanwhile */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned with the log lock held */
	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
523 
/*
 * Log a move of @nr_items keys from @src_slot to @dst_slot inside @eb.
 * Keys overwritten by the move (only possible when dst_slot < src_slot)
 * are logged as individual removals before the move record itself.
 * Returns 0 on success or a negative error code.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* tracks whether we hold tree_mod_log_lock */

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* pre-allocate removal records for the slots the move overwrites */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* undo any records already linked into the rb tree, then free all */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
596 
/*
 * Insert @nritems pre-allocated removal records into the log, highest
 * slot first.  On failure the records inserted so far are unlinked again
 * (but not freed; the caller owns tm_list).  Caller holds the write lock
 * on fs_info::tree_mod_log_lock.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* unlink everything inserted before the failure */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
617 
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is
 * set and the old root is not a leaf, every key in the old root is also
 * logged as removed (MOD_LOG_KEY_REMOVE_WHILE_FREEING).  Returns 0 on
 * success or a negative error code.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* pre-allocate per-key removal records for the old root's items */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* the record is keyed by the *new* root's logical address */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	/* insert removals first, then the root-replace record itself */
	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
686 
/*
 * Search the tree mod log for a record with logical address @start and
 * seq >= @min_seq.  With @smallest set, the matching record with the
 * smallest seq is returned, otherwise the one with the highest seq.
 * Returns NULL if no matching record exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on (start, min_seq) */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
728 
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
740 
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
751 
/*
 * Log a copy of @nr_items keys from @src (at @src_offset) into @dst (at
 * @dst_offset): each key is logged as a removal from src paired with an
 * addition to dst.  Returns 0 on success or a negative error code.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* tracks whether we hold tree_mod_log_lock */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are not logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array: first nr_items are adds, second nr_items are removes */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	/* insert remove/add pairs under the log lock */
	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* unlink any records already in the rb tree, then free everything */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
822 
/*
 * Log the freeing of node @eb by recording a removal for every key it
 * holds.  Leaves are not logged.  Returns 0 on success or a negative
 * error code.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	/* pre-allocate one removal record per key before taking the lock */
	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
868 
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
890 
/*
 * Adjust extent backrefs when COWing @buf into @cow.  Sets *last_ref to 1
 * when @buf drops to its last reference and is cleaned, so the caller can
 * free it.  Returns 0 on success or a negative error code.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* shared block: look up its real refcount and flags */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a shared block with zero refs indicates corruption */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* the block stays shared after COW */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert the old block to full backrefs */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* persist the FULL_BACKREF flag on the old extent */
			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* last reference: move refs to the new copy, drop the old */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
1001 
/*
 * Allocate a tree block for COW while suppressing the flush of pending
 * block groups for the trees where that could deadlock (see below).
 * Returns the new extent buffer or an ERR_PTR from
 * btrfs_alloc_tree_block().
 */
static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	/* re-enable flushing unconditionally; it was only paused for this call */
	trans->can_flush_pending_bgs = true;

	return ret;
}
1043 
1044 /*
1045  * does the dirty work in cow of a single block.  The parent block (if
1046  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1047  * dirty and returned locked.  If you modify the block it needs to be marked
1048  * dirty again.
1049  *
1050  * search_start -- an allocation hint for the new block
1051  *
1052  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1053  * bytes the allocator should try to find free next to the block it returns.
1054  * This is just a hint and may be ignored by the allocator.
1055  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller passed the source buffer as the result slot, it
	 * expects the original to be unlocked once the copy replaces it.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/*
	 * Sanity: reference-counted roots must only be COWed from within
	 * their currently running transaction.
	 */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, used as an allocation hint for the copy */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Clone the contents, then rewrite the header for the new location */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	/*
	 * Update extent backrefs for the copy; may set last_ref when the
	 * original block's last reference is going away.
	 */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: swap the root pointer to the new copy */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Point the parent slot at the copy instead of the original */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* The copy is returned locked and dirty */
	*cow_ret = cow;
	return 0;
}
1172 
1173 /*
1174  * returns the logical address of the oldest predecessor of the given root.
1175  * entries older than time_seq are ignored.
1176  */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	/* time_seq == 0 means no rewinding is requested */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* no log entry at all for the current root: nothing to do */
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* keep walking backwards through older root replacements */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1226 
1227 /*
1228  * tm is a pointer to the first operation to rewind within eb. then, all
1229  * previous operations will be rewound (until we reach something older than
1230  * time_seq).
1231  */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* n tracks the item count as each logged operation is undone */
	n = btrfs_header_nritems(eb);
	/* hold the log lock while walking the mod log rbtree */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* undo a removal by restoring the saved key/ptr */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* undo a replace by writing back the old key/ptr */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo a move by shifting the key ptrs back */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* stop once the log entries refer to a different block */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1302 
1303 /*
1304  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1305  * is returned. If rewind operations happen, a fresh buffer is returned. The
1306  * returned buffer is always read-locked. If the returned buffer is not the
1307  * input buffer, the lock on the input buffer is released and the input buffer
1308  * is freed (its refcount is decremented).
1309  */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means no rewinding is requested */
	if (!time_seq)
		return eb;

	/* leaves are not logged in the tree mod log, nothing to rewind */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * the block was freed: rebuild it from scratch on a dummy
		 * buffer, copying only the header fields from the original.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* otherwise rewind on a private clone of the live buffer */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* hand the lock and reference over from the input buffer */
	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1365 
1366 /*
1367  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1368  * value. If there are no changes, the current root->root_node is returned. If
1369  * anything changed in between, there's a fresh buffer allocated on which the
1370  * rewind operations are done. In any case, the returned buffer is read locked.
1371  * Returns NULL on error (with no locks held).
1372  */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* no logged changes: the current root is already correct */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* the root itself was replaced; rewind from the old root */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* old root block still exists on disk: read and clone it */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* old root was freed: rebuild it on a dummy buffer */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* same root block, just rewind changes on a clone */
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		/* restore the header of the replaced root on the fresh copy */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1451 
/*
 * Return the level the root of @root had at @time_seq: the level recorded
 * in the oldest logged root replacement, or the current root's level when
 * no replacement is logged.
 */
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct extent_buffer *eb_root = btrfs_root_node(root);
	struct tree_mod_elem *tm;
	int level;

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	level = (tm && tm->op == MOD_LOG_ROOT_REPLACE) ?
		tm->old_root.level : btrfs_header_level(eb_root);
	free_extent_buffer(eb_root);

	return level;
}
1468 
should_cow_block(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct extent_buffer * buf)1469 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1470 				   struct btrfs_root *root,
1471 				   struct extent_buffer *buf)
1472 {
1473 	if (btrfs_is_testing(root->fs_info))
1474 		return 0;
1475 
1476 	/* Ensure we can see the FORCE_COW bit */
1477 	smp_mb__before_atomic();
1478 
1479 	/*
1480 	 * We do not need to cow a block if
1481 	 * 1) this block is not created or changed in this transaction;
1482 	 * 2) this block does not belong to TREE_RELOC tree;
1483 	 * 3) the root is not forced COW.
1484 	 *
1485 	 * What is forced COW:
1486 	 *    when we create snapshot during committing the transaction,
1487 	 *    after we've finished coping src root, we must COW the shared
1488 	 *    block to ensure the metadata consistency.
1489 	 */
1490 	if (btrfs_header_generation(buf) == trans->transid &&
1491 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1492 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1493 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1494 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1495 		return 0;
1496 	return 1;
1497 }
1498 
1499 /*
1500  * cows a single block, see __btrfs_cow_block for the real work.
1501  * This version of it has extra checks so that a block isn't COWed more than
1502  * once per transaction, as long as it hasn't been written yet
1503  */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* Sanity: COW must happen inside the currently running transaction */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Fast path: block was already COWed in this transaction */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the 1GiB region the block lives in */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1541 
1542 /*
1543  * helper function for defrag to decide if two blocks pointed to by a
1544  * node are actually close by
1545  */
/*
 * Two blocks are "close" when the gap between the end of the lower one
 * and the start of the higher one is under 32KiB.
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	u64 lo, hi;

	if (blocknr == other)
		return 0;
	if (blocknr < other) {
		lo = blocknr;
		hi = other;
	} else {
		lo = other;
		hi = blocknr;
	}
	/* unsigned subtraction wraps for overlapping blocks, giving a huge gap */
	return (hi - (lo + blocksize) < 32768) ? 1 : 0;
}
1554 
1555 /*
1556  * compare two keys in a memcmp fashion
1557  */
comp_keys(const struct btrfs_disk_key * disk,const struct btrfs_key * k2)1558 static int comp_keys(const struct btrfs_disk_key *disk,
1559 		     const struct btrfs_key *k2)
1560 {
1561 	struct btrfs_key k1;
1562 
1563 	btrfs_disk_key_to_cpu(&k1, disk);
1564 
1565 	return btrfs_comp_cpu_keys(&k1, k2);
1566 }
1567 
1568 /*
1569  * same as comp_keys only with two btrfs_key's
1570  */
btrfs_comp_cpu_keys(const struct btrfs_key * k1,const struct btrfs_key * k2)1571 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1572 {
1573 	if (k1->objectid > k2->objectid)
1574 		return 1;
1575 	if (k1->objectid < k2->objectid)
1576 		return -1;
1577 	if (k1->type > k2->type)
1578 		return 1;
1579 	if (k1->type < k2->type)
1580 		return -1;
1581 	if (k1->offset > k2->offset)
1582 		return 1;
1583 	if (k1->offset < k2->offset)
1584 		return -1;
1585 	return 0;
1586 }
1587 
1588 /*
1589  * this is used by the defrag code to go through all the
1590  * leaves pointed to by a node and reallocate them so that
1591  * disk order is close to key order
1592  */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* nothing to reorder with fewer than two children */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		/* skip slots whose keys precede the defrag progress point */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/* leave blocks alone if already close to either neighbor */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* get the child buffer, reading it from disk if needed */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * COW the child near search_start so reallocated blocks end
		 * up adjacent in disk order.
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		/* remember where the new copy landed for the next iteration */
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1704 
1705 /*
1706  * search for key in the extent_buffer.  The items start at offset p,
1707  * and they are item_size apart.  There are 'max' items in p.
1708  *
1709  * the slot in the array is returned via slot, and it points to
1710  * the place where you would insert key if it is not found in
1711  * the array.
1712  *
1713  * slot may point to max if the key is bigger than all of the keys
1714  */
generic_bin_search(struct extent_buffer * eb,unsigned long p,int item_size,const struct btrfs_key * key,int max,int * slot)1715 static noinline int generic_bin_search(struct extent_buffer *eb,
1716 				       unsigned long p, int item_size,
1717 				       const struct btrfs_key *key,
1718 				       int max, int *slot)
1719 {
1720 	int low = 0;
1721 	int high = max;
1722 	int mid;
1723 	int ret;
1724 	struct btrfs_disk_key *tmp = NULL;
1725 	struct btrfs_disk_key unaligned;
1726 	unsigned long offset;
1727 	char *kaddr = NULL;
1728 	unsigned long map_start = 0;
1729 	unsigned long map_len = 0;
1730 	int err;
1731 
1732 	if (low > high) {
1733 		btrfs_err(eb->fs_info,
1734 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1735 			  __func__, low, high, eb->start,
1736 			  btrfs_header_owner(eb), btrfs_header_level(eb));
1737 		return -EINVAL;
1738 	}
1739 
1740 	while (low < high) {
1741 		mid = (low + high) / 2;
1742 		offset = p + mid * item_size;
1743 
1744 		if (!kaddr || offset < map_start ||
1745 		    (offset + sizeof(struct btrfs_disk_key)) >
1746 		    map_start + map_len) {
1747 
1748 			err = map_private_extent_buffer(eb, offset,
1749 						sizeof(struct btrfs_disk_key),
1750 						&kaddr, &map_start, &map_len);
1751 
1752 			if (!err) {
1753 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1754 							map_start);
1755 			} else if (err == 1) {
1756 				read_extent_buffer(eb, &unaligned,
1757 						   offset, sizeof(unaligned));
1758 				tmp = &unaligned;
1759 			} else {
1760 				return err;
1761 			}
1762 
1763 		} else {
1764 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1765 							map_start);
1766 		}
1767 		ret = comp_keys(tmp, key);
1768 
1769 		if (ret < 0)
1770 			low = mid + 1;
1771 		else if (ret > 0)
1772 			high = mid;
1773 		else {
1774 			*slot = mid;
1775 			return 0;
1776 		}
1777 	}
1778 	*slot = low;
1779 	return 1;
1780 }
1781 
1782 /*
1783  * simple bin_search frontend that does the right thing for
1784  * leaves vs nodes
1785  */
btrfs_bin_search(struct extent_buffer * eb,const struct btrfs_key * key,int level,int * slot)1786 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1787 		     int level, int *slot)
1788 {
1789 	if (level == 0)
1790 		return generic_bin_search(eb,
1791 					  offsetof(struct btrfs_leaf, items),
1792 					  sizeof(struct btrfs_item),
1793 					  key, btrfs_header_nritems(eb),
1794 					  slot);
1795 	else
1796 		return generic_bin_search(eb,
1797 					  offsetof(struct btrfs_node, ptrs),
1798 					  sizeof(struct btrfs_key_ptr),
1799 					  key, btrfs_header_nritems(eb),
1800 					  slot);
1801 }
1802 
/* Bump the root item's used-bytes accounting by @size. */
static void root_add_used(struct btrfs_root *root, u32 size)
{
	struct btrfs_root_item *item = &root->root_item;

	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(item, btrfs_root_used(item) + size);
	spin_unlock(&root->accounting_lock);
}
1810 
/* Drop the root item's used-bytes accounting by @size. */
static void root_sub_used(struct btrfs_root *root, u32 size)
{
	struct btrfs_root_item *item = &root->root_item;

	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(item, btrfs_root_used(item) - size);
	spin_unlock(&root->accounting_lock);
}
1818 
1819 /* given a node and slot number, this reads the blocks it points to.  The
1820  * extent buffer is returned with a reference taken (but unlocked).
1821  */
1822 static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info * fs_info,struct extent_buffer * parent,int slot)1823 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1824 	       int slot)
1825 {
1826 	int level = btrfs_header_level(parent);
1827 	struct extent_buffer *eb;
1828 	struct btrfs_key first_key;
1829 
1830 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1831 		return ERR_PTR(-ENOENT);
1832 
1833 	BUG_ON(level == 0);
1834 
1835 	btrfs_node_key_to_cpu(parent, &first_key, slot);
1836 	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1837 			     btrfs_node_ptr_generation(parent, slot),
1838 			     level - 1, &first_key);
1839 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1840 		free_extent_buffer(eb);
1841 		eb = ERR_PTR(-EIO);
1842 	}
1843 
1844 	return eb;
1845 }
1846 
1847 /*
1848  * node level balancing, used to make sure nodes are in proper order for
1849  * item deletion.  We balance from the top down, so we have to make sure
1850  * that a deletion won't leave an node completely empty later on.
1851  */
balance_level(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int level)1852 static noinline int balance_level(struct btrfs_trans_handle *trans,
1853 			 struct btrfs_root *root,
1854 			 struct btrfs_path *path, int level)
1855 {
1856 	struct btrfs_fs_info *fs_info = root->fs_info;
1857 	struct extent_buffer *right = NULL;
1858 	struct extent_buffer *mid;
1859 	struct extent_buffer *left = NULL;
1860 	struct extent_buffer *parent = NULL;
1861 	int ret = 0;
1862 	int wret;
1863 	int pslot;
1864 	int orig_slot = path->slots[level];
1865 	u64 orig_ptr;
1866 
1867 	if (level == 0)
1868 		return 0;
1869 
1870 	mid = path->nodes[level];
1871 
1872 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1873 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1874 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1875 
1876 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1877 
1878 	if (level < BTRFS_MAX_LEVEL - 1) {
1879 		parent = path->nodes[level + 1];
1880 		pslot = path->slots[level + 1];
1881 	}
1882 
1883 	/*
1884 	 * deal with the case where there is only one pointer in the root
1885 	 * by promoting the node below to a root
1886 	 */
1887 	if (!parent) {
1888 		struct extent_buffer *child;
1889 
1890 		if (btrfs_header_nritems(mid) != 1)
1891 			return 0;
1892 
1893 		/* promote the child to a root */
1894 		child = read_node_slot(fs_info, mid, 0);
1895 		if (IS_ERR(child)) {
1896 			ret = PTR_ERR(child);
1897 			btrfs_handle_fs_error(fs_info, ret, NULL);
1898 			goto enospc;
1899 		}
1900 
1901 		btrfs_tree_lock(child);
1902 		btrfs_set_lock_blocking(child);
1903 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1904 		if (ret) {
1905 			btrfs_tree_unlock(child);
1906 			free_extent_buffer(child);
1907 			goto enospc;
1908 		}
1909 
1910 		ret = tree_mod_log_insert_root(root->node, child, 1);
1911 		BUG_ON(ret < 0);
1912 		rcu_assign_pointer(root->node, child);
1913 
1914 		add_root_to_dirty_list(root);
1915 		btrfs_tree_unlock(child);
1916 
1917 		path->locks[level] = 0;
1918 		path->nodes[level] = NULL;
1919 		clean_tree_block(fs_info, mid);
1920 		btrfs_tree_unlock(mid);
1921 		/* once for the path */
1922 		free_extent_buffer(mid);
1923 
1924 		root_sub_used(root, mid->len);
1925 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1926 		/* once for the root ptr */
1927 		free_extent_buffer_stale(mid);
1928 		return 0;
1929 	}
1930 	if (btrfs_header_nritems(mid) >
1931 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1932 		return 0;
1933 
1934 	left = read_node_slot(fs_info, parent, pslot - 1);
1935 	if (IS_ERR(left))
1936 		left = NULL;
1937 
1938 	if (left) {
1939 		btrfs_tree_lock(left);
1940 		btrfs_set_lock_blocking(left);
1941 		wret = btrfs_cow_block(trans, root, left,
1942 				       parent, pslot - 1, &left);
1943 		if (wret) {
1944 			ret = wret;
1945 			goto enospc;
1946 		}
1947 	}
1948 
1949 	right = read_node_slot(fs_info, parent, pslot + 1);
1950 	if (IS_ERR(right))
1951 		right = NULL;
1952 
1953 	if (right) {
1954 		btrfs_tree_lock(right);
1955 		btrfs_set_lock_blocking(right);
1956 		wret = btrfs_cow_block(trans, root, right,
1957 				       parent, pslot + 1, &right);
1958 		if (wret) {
1959 			ret = wret;
1960 			goto enospc;
1961 		}
1962 	}
1963 
1964 	/* first, try to make some room in the middle buffer */
1965 	if (left) {
1966 		orig_slot += btrfs_header_nritems(left);
1967 		wret = push_node_left(trans, fs_info, left, mid, 1);
1968 		if (wret < 0)
1969 			ret = wret;
1970 	}
1971 
1972 	/*
1973 	 * then try to empty the right most buffer into the middle
1974 	 */
1975 	if (right) {
1976 		wret = push_node_left(trans, fs_info, mid, right, 1);
1977 		if (wret < 0 && wret != -ENOSPC)
1978 			ret = wret;
1979 		if (btrfs_header_nritems(right) == 0) {
1980 			clean_tree_block(fs_info, right);
1981 			btrfs_tree_unlock(right);
1982 			del_ptr(root, path, level + 1, pslot + 1);
1983 			root_sub_used(root, right->len);
1984 			btrfs_free_tree_block(trans, root, right, 0, 1);
1985 			free_extent_buffer_stale(right);
1986 			right = NULL;
1987 		} else {
1988 			struct btrfs_disk_key right_key;
1989 			btrfs_node_key(right, &right_key, 0);
1990 			ret = tree_mod_log_insert_key(parent, pslot + 1,
1991 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1992 			BUG_ON(ret < 0);
1993 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1994 			btrfs_mark_buffer_dirty(parent);
1995 		}
1996 	}
1997 	if (btrfs_header_nritems(mid) == 1) {
1998 		/*
1999 		 * we're not allowed to leave a node with one item in the
2000 		 * tree during a delete.  A deletion from lower in the tree
2001 		 * could try to delete the only pointer in this node.
2002 		 * So, pull some keys from the left.
2003 		 * There has to be a left pointer at this point because
2004 		 * otherwise we would have pulled some pointers from the
2005 		 * right
2006 		 */
2007 		if (!left) {
2008 			ret = -EROFS;
2009 			btrfs_handle_fs_error(fs_info, ret, NULL);
2010 			goto enospc;
2011 		}
2012 		wret = balance_node_right(trans, fs_info, mid, left);
2013 		if (wret < 0) {
2014 			ret = wret;
2015 			goto enospc;
2016 		}
2017 		if (wret == 1) {
2018 			wret = push_node_left(trans, fs_info, left, mid, 1);
2019 			if (wret < 0)
2020 				ret = wret;
2021 		}
2022 		BUG_ON(wret == 1);
2023 	}
2024 	if (btrfs_header_nritems(mid) == 0) {
2025 		clean_tree_block(fs_info, mid);
2026 		btrfs_tree_unlock(mid);
2027 		del_ptr(root, path, level + 1, pslot);
2028 		root_sub_used(root, mid->len);
2029 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2030 		free_extent_buffer_stale(mid);
2031 		mid = NULL;
2032 	} else {
2033 		/* update the parent key to reflect our changes */
2034 		struct btrfs_disk_key mid_key;
2035 		btrfs_node_key(mid, &mid_key, 0);
2036 		ret = tree_mod_log_insert_key(parent, pslot,
2037 				MOD_LOG_KEY_REPLACE, GFP_NOFS);
2038 		BUG_ON(ret < 0);
2039 		btrfs_set_node_key(parent, &mid_key, pslot);
2040 		btrfs_mark_buffer_dirty(parent);
2041 	}
2042 
2043 	/* update the path */
2044 	if (left) {
2045 		if (btrfs_header_nritems(left) > orig_slot) {
2046 			extent_buffer_get(left);
2047 			/* left was locked after cow */
2048 			path->nodes[level] = left;
2049 			path->slots[level + 1] -= 1;
2050 			path->slots[level] = orig_slot;
2051 			if (mid) {
2052 				btrfs_tree_unlock(mid);
2053 				free_extent_buffer(mid);
2054 			}
2055 		} else {
2056 			orig_slot -= btrfs_header_nritems(left);
2057 			path->slots[level] = orig_slot;
2058 		}
2059 	}
2060 	/* double check we haven't messed things up */
2061 	if (orig_ptr !=
2062 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2063 		BUG();
2064 enospc:
2065 	if (right) {
2066 		btrfs_tree_unlock(right);
2067 		free_extent_buffer(right);
2068 	}
2069 	if (left) {
2070 		if (path->nodes[level] != left)
2071 			btrfs_tree_unlock(left);
2072 		free_extent_buffer(left);
2073 	}
2074 	return ret;
2075 }
2076 
2077 /* Node balancing for insertion.  Here we only split or push nodes around
2078  * when they are completely full.  This is also done top down, so we
2079  * have to be pessimistic.
2080  */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* leaves are split by split_leaf(), not balanced here */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	/* the node must already belong to this transaction (COWed) */
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* mid is the root: there are no siblings to push into */
	if (!parent)
		return 1;

	/* errors reading the left sibling are treated as "no left sibling" */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		/* wret == 1 here means "could not push anything left" */
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/*
			 * Items moved left; mid's first key changed, so the
			 * parent pointer key for mid must be updated (and the
			 * change logged for tree-mod-log readers).
			 */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			/*
			 * Re-point the path at whichever node now holds the
			 * original slot; unlock and drop the node we abandon.
			 */
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	/* errors reading the right sibling are treated as "no right sibling" */
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		/* wret == 1 here means "could not push anything right" */
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/*
			 * Items moved into right; right's first key changed,
			 * so fix the parent pointer key for the right child.
			 */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			/* follow the original slot into right if it moved */
			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* nothing could be pushed either way; caller must split the node */
	return 1;
}
2219 
2220 /*
2221  * readahead one full node of leaves, finding things that are close
2222  * to the block in 'slot', and triggering ra on them.
2223  */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct extent_buffer *cached;
	struct btrfs_disk_key key;
	u64 center;
	u64 blkptr;
	u64 bytes_queued = 0;
	u32 item_count;
	u32 idx;
	u32 node_size;
	u32 scanned = 0;

	/* we only readahead leaves, walking them from their level-1 parent */
	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	center = btrfs_node_blockptr(node, slot);
	node_size = fs_info->nodesize;

	/* already in the cache: nothing worth doing */
	cached = find_extent_buffer(fs_info, center);
	if (cached) {
		free_extent_buffer(cached);
		return;
	}

	item_count = btrfs_header_nritems(node);
	idx = slot;

	for (;;) {
		u64 dist;

		/* step to the next candidate slot in the requested direction */
		if (path->reada == READA_BACK) {
			if (idx == 0)
				break;
			idx--;
			/* going backwards, stop once we leave the objectid */
			if (objectid) {
				btrfs_node_key(node, &key, idx);
				if (btrfs_disk_key_objectid(&key) != objectid)
					break;
			}
		} else if (path->reada == READA_FORWARD) {
			idx++;
			if (idx >= item_count)
				break;
		}

		/* only readahead blocks within 64k of the target block */
		blkptr = btrfs_node_blockptr(node, idx);
		dist = blkptr > center ? blkptr - center : center - blkptr;
		if (dist <= 65536) {
			readahead_tree_block(fs_info, blkptr);
			bytes_queued += node_size;
		}

		scanned++;
		if (bytes_queued > 65536 || scanned > 32)
			break;
	}
}
2286 
reada_for_balance(struct btrfs_fs_info * fs_info,struct btrfs_path * path,int level)2287 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2288 				       struct btrfs_path *path, int level)
2289 {
2290 	int slot;
2291 	int nritems;
2292 	struct extent_buffer *parent;
2293 	struct extent_buffer *eb;
2294 	u64 gen;
2295 	u64 block1 = 0;
2296 	u64 block2 = 0;
2297 
2298 	parent = path->nodes[level + 1];
2299 	if (!parent)
2300 		return;
2301 
2302 	nritems = btrfs_header_nritems(parent);
2303 	slot = path->slots[level + 1];
2304 
2305 	if (slot > 0) {
2306 		block1 = btrfs_node_blockptr(parent, slot - 1);
2307 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2308 		eb = find_extent_buffer(fs_info, block1);
2309 		/*
2310 		 * if we get -eagain from btrfs_buffer_uptodate, we
2311 		 * don't want to return eagain here.  That will loop
2312 		 * forever
2313 		 */
2314 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2315 			block1 = 0;
2316 		free_extent_buffer(eb);
2317 	}
2318 	if (slot + 1 < nritems) {
2319 		block2 = btrfs_node_blockptr(parent, slot + 1);
2320 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2321 		eb = find_extent_buffer(fs_info, block2);
2322 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2323 			block2 = 0;
2324 		free_extent_buffer(eb);
2325 	}
2326 
2327 	if (block1)
2328 		readahead_tree_block(fs_info, block1);
2329 	if (block2)
2330 		readahead_tree_block(fs_info, block2);
2331 }
2332 
2333 
2334 /*
2335  * when we walk down the tree, it is usually safe to unlock the higher layers
2336  * in the tree.  The exceptions are when our path goes through slot 0, because
2337  * operations on the tree might require changing key pointers higher up in the
2338  * tree.
2339  *
2340  * callers might also have set path->keep_locks, which tells this code to keep
2341  * the lock if the path points to the last slot in the block.  This is part of
2342  * walking through the tree, and selecting the next slot in the higher block.
2343  *
2344  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2345  * if lowest_unlock is 1, level 0 won't be unlocked
2346  */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels <= skip_level must keep their locks (slot-0/keep_locks rules) */
	int skip_level = level;
	/* once set, stop raising skip_level for higher levels */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/*
		 * slot 0: changing the first key would require updating the
		 * parent pointer, so the parent's lock must be kept too.
		 */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/*
		 * keep_locks: the caller will step to the next slot; if we're
		 * on the last slot of this node, the parent must stay locked.
		 */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower write_lock_level as write locks are released */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2389 
2390 /*
2391  * This releases any locks held in the path starting at level and
2392  * going all the way up to the root.
2393  *
2394  * btrfs_search_slot will keep the lock held on higher nodes in a few
2395  * corner cases, such as COW of the block at slot zero in the node.  This
2396  * ignores those rules, and it should only be called when there are no
2397  * more updates to be done higher up in the tree.
2398  */
btrfs_unlock_up_safe(struct btrfs_path * path,int level)2399 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2400 {
2401 	int i;
2402 
2403 	if (path->keep_locks)
2404 		return;
2405 
2406 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2407 		if (!path->nodes[i])
2408 			continue;
2409 		if (!path->locks[i])
2410 			continue;
2411 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2412 		path->locks[i] = 0;
2413 	}
2414 }
2415 
2416 /*
2417  * helper function for btrfs_search_slot.  The goal is to find a block
2418  * in cache without setting the path to blocking.  If we find the block
2419  * we return zero and the path is unchanged.
2420  *
2421  * If we can't find the block, we set the path blocking and do some
2422  * reada.  -EAGAIN is returned and the search must be repeated.
2423  */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/* identity of the child block we want, taken from the parent node */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(fs_info, tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			/* cached copy is good; hand it back without blocking */
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	/* default: tell the caller to drop the path and search again */
	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}
2513 
2514 /*
2515  * helper function for btrfs_search_slot.  This does all of the checks
2516  * for node-level blocks and does any balancing required based on
2517  * the ins_len.
2518  *
2519  * If no extra work was required, zero is returned.  If we had to
2520  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2521  * start over
2522  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/* nearly full node on an insert path: split it before descending */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		/*
		 * Splitting modifies the parent too; if we don't hold a write
		 * lock that high, restart the search with a deeper write lock.
		 */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		/* under-full node on a delete path: rebalance it */
		int sret;

		/* same write-lock requirement as the split case above */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have collapsed this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2586 
/*
 * Debug-only sanity check (CONFIG_BTRFS_ASSERT): when a previous level
 * matched @key exactly, the first key stored in @b must equal @key.
 * Compiles to a no-op when assertions are disabled.
 */
static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* leaves and internal nodes store their first key at different offsets */
	if (level == 0)
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_leaf, items[0].key),
		    sizeof(disk_key)));
	else
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_node, ptrs[0].key),
		    sizeof(disk_key)));
#endif
}
2606 
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	/*
	 * An exact match at a higher level means the key can only live at
	 * slot 0 of this block; skip the binary search entirely.
	 */
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2620 
/*
 * Look up the item (iobjectid, key_type, ioff) in @fs_root and return the
 * key actually found at the resulting slot in *found_key.  Returns 0 when
 * the found key matches the requested objectid and type, 1 when it does
 * not, and a negative errno on search failure.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	struct btrfs_key search_key;
	struct extent_buffer *leaf;
	int ret;

	ASSERT(path);
	ASSERT(found_key);

	search_key.objectid = iobjectid;
	search_key.type = key_type;
	search_key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &search_key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* a miss may leave us one past the last item; step to the next leaf */
	leaf = path->nodes[0];
	if (ret != 0 && path->slots[0] >= btrfs_header_nritems(leaf)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		leaf = path->nodes[0];
	}

	btrfs_item_key_to_cpu(leaf, found_key, path->slots[0]);

	/* only objectid and type must match; the offset may differ */
	if (found_key->objectid != search_key.objectid ||
	    found_key->type != search_key.type)
		return 1;
	return 0;
}
2655 
/*
 * Grab the root node of @root for a search, locked appropriately for the
 * requested @write_lock_level, and record it (and its lock) in @p at the
 * root's level.  Returns the root extent buffer or an ERR_PTR; the caller
 * is responsible for dropping the reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			extent_buffer_get(b);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2739 
2740 
2741 /*
2742  * btrfs_search_slot - look for a key in a tree and perform necessary
2743  * modifications to preserve tree invariants.
2744  *
2745  * @trans:	Handle of transaction, used when modifying the tree
2746  * @p:		Holds all btree nodes along the search path
2747  * @root:	The root node of the tree
2748  * @key:	The key we are looking for
2749  * @ins_len:	Indicates purpose of search, for inserts it is 1, for
2750  *		deletions it's -1. 0 for plain searches
2751  * @cow:	boolean should CoW operations be performed. Must always be 1
2752  *		when modifying the tree.
2753  *
2754  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2755  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2756  *
2757  * If @key is found, 0 is returned and you can find the item in the leaf level
2758  * of the path (level 0)
2759  *
2760  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2761  * points to the slot where it should be inserted
2762  *
2763  * If an error is encountered while searching the tree a negative error number
2764  * is returned
2765  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	/* restart point: the path has been released and must be rebuilt */
again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * the key is smaller than the key at @slot; descend
			 * into the previous child, which covers our key range
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			/*
			 * try the non-blocking lock first; only fall back to
			 * a blocking lock (setting the whole path blocking)
			 * when the fast attempt fails
			 */
			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: make room for the insert if needed */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
3005 
3006 /*
3007  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
3008  * current state of the tree together with the operations recorded in the tree
3009  * modification log to search for the key in a previous version of this tree, as
3010  * denoted by the time_seq parameter.
3011  *
3012  * Naturally, there is no support for insert, delete or cow operations.
3013  *
3014  * The resulting path and return value will be set up as if we called
3015  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
3016  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* searching the commit root is just a plain read-only search */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		/* could not obtain the root as of time_seq */
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * key_search returned non-zero: key sorts before
			 * slot's key, so descend through the previous slot
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				/* undo the decrement for the caller's view */
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			/* lock the child before rewinding it */
			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				/* fast lock failed; block and take it slow */
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/*
			 * replay tree mod log entries so b reflects its state
			 * at time_seq; NULL here means an allocation failed
			 */
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			/* leaf level: record the slot and we're finished */
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3121 
3122 /*
3123  * helper to use instead of search slot if no exact match is needed but
3124  * instead the next or previous item should be returned.
3125  * When find_higher is true, the next higher item is returned, the next lower
3126  * otherwise.
3127  * When return_any and find_higher are both true, and no higher item is found,
3128  * return the next lower instead.
3129  * When return_any is true and find_higher is false, and no lower item is found,
3130  * return the next higher instead.
3131  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3132  * < 0 on error
3133  */
btrfs_search_slot_for_read(struct btrfs_root * root,const struct btrfs_key * key,struct btrfs_path * p,int find_higher,int return_any)3134 int btrfs_search_slot_for_read(struct btrfs_root *root,
3135 			       const struct btrfs_key *key,
3136 			       struct btrfs_path *p, int find_higher,
3137 			       int return_any)
3138 {
3139 	int ret;
3140 	struct extent_buffer *leaf;
3141 
3142 again:
3143 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3144 	if (ret <= 0)
3145 		return ret;
3146 	/*
3147 	 * a return value of 1 means the path is at the position where the
3148 	 * item should be inserted. Normally this is the next bigger item,
3149 	 * but in case the previous item is the last in a leaf, path points
3150 	 * to the first free slot in the previous leaf, i.e. at an invalid
3151 	 * item.
3152 	 */
3153 	leaf = p->nodes[0];
3154 
3155 	if (find_higher) {
3156 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3157 			ret = btrfs_next_leaf(root, p);
3158 			if (ret <= 0)
3159 				return ret;
3160 			if (!return_any)
3161 				return 1;
3162 			/*
3163 			 * no higher item found, return the next
3164 			 * lower instead
3165 			 */
3166 			return_any = 0;
3167 			find_higher = 0;
3168 			btrfs_release_path(p);
3169 			goto again;
3170 		}
3171 	} else {
3172 		if (p->slots[0] == 0) {
3173 			ret = btrfs_prev_leaf(root, p);
3174 			if (ret < 0)
3175 				return ret;
3176 			if (!ret) {
3177 				leaf = p->nodes[0];
3178 				if (p->slots[0] == btrfs_header_nritems(leaf))
3179 					p->slots[0]--;
3180 				return 0;
3181 			}
3182 			if (!return_any)
3183 				return 1;
3184 			/*
3185 			 * no lower item found, return the next
3186 			 * higher instead
3187 			 */
3188 			return_any = 0;
3189 			find_higher = 1;
3190 			btrfs_release_path(p);
3191 			goto again;
3192 		} else {
3193 			--p->slots[0];
3194 		}
3195 	}
3196 	return 0;
3197 }
3198 
3199 /*
3200  * adjust the pointers going up the tree, starting at level
3201  * making sure the right key of each node points to 'key'.
3202  * This is used after shifting pointers to the left, so it stops
3203  * fixing up pointers when a given leaf/node is not in slot 0 of the
3204  * higher levels
3205  *
3206  */
fixup_low_keys(struct btrfs_path * path,struct btrfs_disk_key * key,int level)3207 static void fixup_low_keys(struct btrfs_path *path,
3208 			   struct btrfs_disk_key *key, int level)
3209 {
3210 	int i;
3211 	struct extent_buffer *t;
3212 	int ret;
3213 
3214 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3215 		int tslot = path->slots[i];
3216 
3217 		if (!path->nodes[i])
3218 			break;
3219 		t = path->nodes[i];
3220 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3221 				GFP_ATOMIC);
3222 		BUG_ON(ret < 0);
3223 		btrfs_set_node_key(t, key, tslot);
3224 		btrfs_mark_buffer_dirty(path->nodes[i]);
3225 		if (tslot != 0)
3226 			break;
3227 	}
3228 }
3229 
3230 /*
3231  * update item key.
3232  *
3233  * This function isn't completely safe. It's the caller's responsibility
3234  * that the new key won't break the order
3235  */
btrfs_set_item_key_safe(struct btrfs_fs_info * fs_info,struct btrfs_path * path,const struct btrfs_key * new_key)3236 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3237 			     struct btrfs_path *path,
3238 			     const struct btrfs_key *new_key)
3239 {
3240 	struct btrfs_disk_key disk_key;
3241 	struct extent_buffer *eb;
3242 	int slot;
3243 
3244 	eb = path->nodes[0];
3245 	slot = path->slots[0];
3246 	if (slot > 0) {
3247 		btrfs_item_key(eb, &disk_key, slot - 1);
3248 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3249 	}
3250 	if (slot < btrfs_header_nritems(eb) - 1) {
3251 		btrfs_item_key(eb, &disk_key, slot + 1);
3252 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3253 	}
3254 
3255 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3256 	btrfs_set_item_key(eb, &disk_key, slot);
3257 	btrfs_mark_buffer_dirty(eb);
3258 	if (slot == 0)
3259 		fixup_low_keys(path, &disk_key, 1);
3260 }
3261 
3262 /*
3263  * try to push data from one node into the next node left in the
3264  * tree.
3265  *
3266  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3267  * error, and > 0 if there was no room in the left hand block.
3268  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* both nodes must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* unless emptying, don't bother pushing from a nearly-empty node */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* log the copy in the tree mod log before touching dst */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* append the first push_items pointers of src to the end of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3334 
3335 /*
3336  * try to push data from one node into the next node right in the
3337  * tree.
3338  *
3339  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3340  * error, and > 0 if there was no room in the right hand block.
3341  *
3342  * this will only push up to 1/2 the contents of the left node over
3343  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both nodes must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth balancing from a nearly-empty node */
	if (src_nritems < 4)
		return 1;

	/* cap at roughly half of src's pointers */
	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* log the shift of dst's existing pointers before moving them */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* open a hole of push_items slots at the front of dst */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* copy the tail of src into the hole at the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3401 
3402 /*
3403  * helper function to insert a new root level in the tree.
3404  * A new node is allocated, and a single item is inserted to
3405  * point to the existing root
3406  *
3407  * returns zero on success or < 0 on failure.
3408  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* only valid when the current root sits directly below @level */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	/* the first key of the old root becomes the new root's only key */
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* the new root has exactly one pointer: the old root */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	/* log the root switch before publishing it */
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	/* publish the new root pointer for lockless readers */
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* take a reference for the path slot assigned below */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3462 
3463 /*
3464  * worker function to insert a single pointer in a node.
3465  * the node should have enough room for the pointer already
3466  *
3467  * slot and level indicate where you want the key to go, and
3468  * blocknr is the block the key points to.
3469  */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *node = path->nodes[level];
	int nr;
	int ret;

	BUG_ON(!node);
	btrfs_assert_tree_locked(node);
	nr = btrfs_header_nritems(node);
	BUG_ON(slot > nr);
	BUG_ON(nr == BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	if (slot != nr) {
		/* open a hole at @slot by shifting later pointers right */
		if (level) {
			ret = tree_mod_log_insert_move(node, slot + 1, slot,
						       nr - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(node,
				      btrfs_node_key_ptr_offset(slot + 1),
				      btrfs_node_key_ptr_offset(slot),
				      (nr - slot) *
				      sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(node, slot, MOD_LOG_KEY_ADD,
					      GFP_NOFS);
		BUG_ON(ret < 0);
	}
	/* fill the freed slot with the new key/blockptr pair */
	btrfs_set_node_key(node, key, slot);
	btrfs_set_node_blockptr(node, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(node, slot, trans->transid);
	btrfs_set_header_nritems(node, nr + 1);
	btrfs_mark_buffer_dirty(node);
}
3508 
3509 /*
3510  * split the node at the specified level in path in two.
3511  * The path is corrected to point to the appropriate node after the split
3512  *
3513  * Before splitting this tries to make some room in the node by pushing
3514  * left and right, if either one works, it returns right away.
3515  *
3516  * returns 0 on success and < 0 on failure
3517  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		/* pushing may have made enough room; no split needed then */
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split at the midpoint; the upper half moves to the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* log the copy of the upper half before performing it */
	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* point the path at whichever half now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3599 
3600 /*
3601  * how many bytes are required to store the items in a leaf.  start
3602  * and nr indicate which items in the leaf to check.  This totals up the
3603  * space used both by the item structs and the item data
3604  */
leaf_space_used(struct extent_buffer * l,int start,int nr)3605 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3606 {
3607 	struct btrfs_item *start_item;
3608 	struct btrfs_item *end_item;
3609 	struct btrfs_map_token token;
3610 	int data_len;
3611 	int nritems = btrfs_header_nritems(l);
3612 	int end = min(nritems, start + nr) - 1;
3613 
3614 	if (!nr)
3615 		return 0;
3616 	btrfs_init_map_token(&token);
3617 	start_item = btrfs_item_nr(start);
3618 	end_item = btrfs_item_nr(end);
3619 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3620 		btrfs_token_item_size(l, start_item, &token);
3621 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3622 	data_len += sizeof(struct btrfs_item) * nr;
3623 	WARN_ON(data_len < 0);
3624 	return data_len;
3625 }
3626 
3627 /*
3628  * The space between the end of the leaf items and
3629  * the start of the leaf data.  IOW, how much room
3630  * the leaf has left for both items and data
3631  */
btrfs_leaf_free_space(struct btrfs_fs_info * fs_info,struct extent_buffer * leaf)3632 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3633 				   struct extent_buffer *leaf)
3634 {
3635 	int nritems = btrfs_header_nritems(leaf);
3636 	int ret;
3637 
3638 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3639 	if (ret < 0) {
3640 		btrfs_crit(fs_info,
3641 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3642 			   ret,
3643 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3644 			   leaf_space_used(leaf, 0, nritems), nritems);
3645 	}
3646 	return ret;
3647 }
3648 
3649 /*
3650  * min slot controls the lowest index we're willing to push to the
3651  * right.  We'll push up to and including min_slot, but no lower
3652  */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* nr is the lowest slot we may push; slots below min_slot stay put */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if our slot sits past the end, reserve room for the pending insert */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* walk from the last item downward, counting how many fit in right */
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* don't push our own slot away unless space is tight */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes being moved out of left's tail */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	/* shift right's item headers to open push_items slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every item offset in right from the back of the leaf */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* our slot moved into right; hand the path over to it */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3790 
3791 /*
3792  * push some data in the path leaf to the right, trying to free up at
3793  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3794  *
3795  * returns 1 if the push failed because the other node didn't have enough
3796  * room, 0 if everything worked out and < 0 if there were major errors.
3797  *
3798  * this will push starting from min_slot to the end of the leaf.  It won't
3799  * push any slot lower than min_slot
3800  */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent node means this leaf is the root: nowhere to push */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* no right sibling under the same parent */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the free space; re-verify */
	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right consumes our lock and reference on right */
	return __push_leaf_right(fs_info, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3874 
3875 /*
3876  * push some data in the path leaf to the left, trying to free up at
3877  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3878  *
3879  * max_slot can put a limit on how far into the leaf we'll push items.  The
3880  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3881  * items
3882  */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* unless we are emptying the leaf, always leave one item behind */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * Scan items from the start of the right leaf, counting how many
	 * items (push_items) and bytes (push_space) can move into 'left'.
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				/*
				 * Heuristic: stop pushing early if the right
				 * leaf would be left too tight for the
				 * pending insertion at our slot.
				 */
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* also reserve room for the item we are making space for */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/*
	 * Item data grows from the end of the leaf towards the item headers,
	 * so the bytes that move live at the tail of the data area.
	 */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rewrite the data offsets of the items that just moved into 'left' */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* compact the remaining item data back to the leaf's end */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* recompute the data offsets of the items that stayed in 'right' */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	/* the right leaf's first key changed, propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot was among the pushed items; follow it left */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4026 
4027 /*
4028  * push some data in the path leaf to the left, trying to free up at
4029  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4030  *
4031  * max_slot can put a limit on how far into the leaf we'll push items.  The
4032  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4033  * items
4034  */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* in parent slot 0, or with no parent, there is no left sibling */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the buffer, re-check the free space */
	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* on success __push_leaf_left drops our reference and lock on left */
	return __push_leaf_left(fs_info, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4100 
4101 /*
4102  * split the path's leaf in two, making sure there is at least data_size
4103  * available for the resulting leaf level of the path.
4104  */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* items [mid, nritems) of 'l' move into the new 'right' leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, which lives at the tail of the data area */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/* data offsets grow by the space the items left behind in 'l' */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* link the new leaf into the parent right after 'l' */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now contains our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4166 
4167 /*
4168  * double splits happen when we need to insert a big item in the middle
4169  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4170  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4171  *          A                 B                 C
4172  *
4173  * We avoid this by trying to push the items on either side of our target
4174  * into the adjacent leaves.  If all goes well we can avoid the double split
4175  * completely.
4176  */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	/* free space already in this leaf counts against what we need */
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the left leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	/* nothing moved in either direction; caller must double split */
	return 1;
}
4231 
4232 /*
4233  * split the path's leaf in two, making sure there is at least data_size
4234  * available for the resulting leaf level of the path.
4235  *
4236  * returns 0 if all went well and < 0 on failure.
4237  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed the leaf data area */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(fs_info, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			/* the right push did not help, try the left sibling */
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(fs_info,
								      l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
			return 0;
	}

	/* the leaf is the tree root; grow the tree before splitting it */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: normal split at 'mid'
	 * split == 0: move nothing; the caller's new item goes alone into
	 *             the fresh leaf
	 * split == 2: one split is not enough, a second pass (double split)
	 *             will be needed
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* the new leaf's first key, used for the pointer in the parent */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* no items move: wire up the empty leaf and repoint the path */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* best effort; if pushing fails we simply fall back to splitting */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4398 
/*
 * Prepare the leaf for an in-place item split/duplication: make sure the
 * leaf holding the item at path->slots[0] has at least 'ins_len' bytes of
 * free space, re-searching and splitting the leaf if necessary.
 *
 * Returns 0 on success, -EAGAIN if the item or the leaf changed while the
 * path was dropped (the caller should retry), or another negative errno on
 * failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only file extent and csum items are split this way */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/* remember enough about the item to detect changes after re-search */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4466 
/*
 * Split the item at path->slots[0] in two at 'split_offset' bytes from its
 * start.  The tail [split_offset, item_size) becomes a new item with key
 * 'new_key' placed immediately after the original.  The leaf must already
 * have room for one extra struct btrfs_item header.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data before we rewrite it in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* open a header slot right after the original item */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* the new item reuses the tail of the original item's data area */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4536 
4537 /*
4538  * This function splits a single item into two items,
4539  * giving 'new_key' to the new item and splitting the
4540  * old one at split_offset (from the start of the item).
4541  *
4542  * The path may be released by this operation.  After
4543  * the split, the path is pointing to the old item.  The
4544  * new item is going to be in the same node as the old one.
4545  *
 * Note, the item being split must be small enough to live alone on
4547  * a tree block with room for one extra struct btrfs_item
4548  *
4549  * This allows us to split the item in place, keeping a lock on the
4550  * leaf the entire time.
4551  */
btrfs_split_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,const struct btrfs_key * new_key,unsigned long split_offset)4552 int btrfs_split_item(struct btrfs_trans_handle *trans,
4553 		     struct btrfs_root *root,
4554 		     struct btrfs_path *path,
4555 		     const struct btrfs_key *new_key,
4556 		     unsigned long split_offset)
4557 {
4558 	int ret;
4559 	ret = setup_leaf_for_split(trans, root, path,
4560 				   sizeof(struct btrfs_item));
4561 	if (ret)
4562 		return ret;
4563 
4564 	ret = split_item(root->fs_info, path, new_key, split_offset);
4565 	return ret;
4566 }
4567 
4568 /*
 * This function duplicates an item, giving 'new_key' to the new item.
4570  * It guarantees both items live in the same tree leaf and the new item
4571  * is contiguous with the original item.
4572  *
4573  * This allows us to split file extent in place, keeping a lock on the
4574  * leaf the entire time.
4575  */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	/* make sure the leaf can hold a full copy plus its item header */
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	/* insert an empty item with 'new_key' right after the original */
	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	leaf = path->nodes[0];
	/* copy the original item's data into the new item */
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
4603 
4604 /*
4605  * make the item pointed to by the path smaller.  new_size indicates
4606  * how small to make it, and from_end tells us if we just chop bytes
4607  * off the end of the item or if we shift the item to chop bytes off
4608  * the front.
4609  */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chopping the tail: slide the data towards the leaf's end */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * For inline extents the file extent header must stay
			 * in front of the inline data, so move it to where the
			 * item will start after truncation.
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping the front advances the key offset accordingly */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4705 
4706 /*
4707  * make the item pointed to by the path bigger, data_size is the added size.
4708  */
void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* the caller must have made sure the leaf has enough free space */
	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data towards the item headers to open up the gap */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4773 
4774 /*
4775  * this is a helper for btrfs_insert_empty_items, the main goal here is
4776  * to save stack depth by doing the bulk of the work in a function
4777  * that doesn't call btrfs_search_slot
4778  */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/*
	 * total_data is the sum of data_size[0..nr); total_size additionally
	 * includes one struct btrfs_item header per new item.
	 */

	/* inserting at slot 0 changes the leaf's lowest key */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(fs_info, leaf));
		BUG();
	}

	/* inserting in the middle: move existing items and data out of the way */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4866 
4867 /*
4868  * Given a key and some data, insert items into the tree.
4869  * This does all the path init required, making room in the tree if needed.
4870  */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	u32 data_bytes = 0;
	u32 needed;
	int idx;
	int ret;

	/*
	 * Space needed in the leaf: the item bodies plus one item header
	 * per new item.
	 */
	for (idx = 0; idx < nr; idx++)
		data_bytes += data_size[idx];
	needed = data_bytes + nr * sizeof(struct btrfs_item);

	/* COWing search that also makes room for @needed bytes. */
	ret = btrfs_search_slot(trans, root, cpu_key, path, needed, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	BUG_ON(path->slots[0] < 0);

	/* Shift the leaf and write the new item headers and keys. */
	setup_items_for_insert(root, path, cpu_key, data_size,
			       data_bytes, needed, nr);
	return 0;
}
4900 
4901 /*
4902  * Given a key and some data, insert an item into the tree.
4903  * This does all the path init required, making room in the tree if needed.
4904  */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Reserve an empty item at @cpu_key, then copy the payload in. */
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (ret == 0) {
		struct extent_buffer *leaf = path->nodes[0];
		unsigned long ptr = btrfs_item_ptr_offset(leaf,
							  path->slots[0]);

		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}
4927 
4928 /*
4929  * delete the pointer from a given node.
4930  *
4931  * the tree should have been previously balanced so the deletion does not
4932  * empty a node.
4933  */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Deleting from the middle: log the move (for nodes only)
		 * and close the gap in the key pointer array.
		 */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Last slot of a node: nothing to move, just log the removal. */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		/*
		 * The block's first key changed; propagate the new lowest
		 * key up through the ancestors.
		 */
		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4973 
4974 /*
4975  * a helper function to delete the leaf pointed to by path->slots[1] and
4976  * path->nodes[1].
4977  *
4978  * This deletes the pointer in path->nodes[1] and frees the leaf
4979  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4980  *
4981  * The path must have already been setup for deleting the leaf, including
4982  * all the proper balancing.  path->nodes[1] must be locked.
4983  */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* The leaf must already belong to this transaction (COWed). */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* Hold an extra ref so the buffer survives until the stale-free. */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
5004 /*
5005  * delete the item at the leaf level in path.  If that empties
5006  * the leaf, remove it from the tree
5007  */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;	/* data offset of the last item being deleted */
	u32 dsize = 0;	/* total data bytes occupied by the deleted items */
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		/*
		 * Items remain after the deleted range: compact the data
		 * area, rewrite the surviving items' offsets, then close
		 * the gap in the item header array.
		 */
		int data_end = leaf_data_end(fs_info, leaf);

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* an empty root leaf just stays as the (empty) root */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(fs_info, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* first key changed, fix the ancestors' low keys */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* the pushes drained the leaf: delete it */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5119 
5120 /*
5121  * search the tree again to find a leaf with lesser keys
5122  * returns 0 if it found something or 1 if there are no lesser leaves.
5123  * returns < 0 on io errors.
5124  *
5125  * This may release the path, and so you may lose any locks held at the
5126  * time you call it.
5127  */
btrfs_prev_leaf(struct btrfs_root * root,struct btrfs_path * path)5128 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5129 {
5130 	struct btrfs_key key;
5131 	struct btrfs_disk_key found_key;
5132 	int ret;
5133 
5134 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5135 
5136 	if (key.offset > 0) {
5137 		key.offset--;
5138 	} else if (key.type > 0) {
5139 		key.type--;
5140 		key.offset = (u64)-1;
5141 	} else if (key.objectid > 0) {
5142 		key.objectid--;
5143 		key.type = (u8)-1;
5144 		key.offset = (u64)-1;
5145 	} else {
5146 		return 1;
5147 	}
5148 
5149 	btrfs_release_path(path);
5150 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5151 	if (ret < 0)
5152 		return ret;
5153 	btrfs_item_key(path->nodes[0], &found_key, 0);
5154 	ret = comp_keys(&found_key, &key);
5155 	/*
5156 	 * We might have had an item with the previous key in the tree right
5157 	 * before we released our path. And after we released our path, that
5158 	 * item might have been pushed to the first slot (0) of the leaf we
5159 	 * were holding due to a tree balance. Alternatively, an item with the
5160 	 * previous key can exist as the only element of a leaf (big fat item).
5161 	 * Therefore account for these 2 cases, so that our callers (like
5162 	 * btrfs_previous_item) don't miss an existing item with a key matching
5163 	 * the previous key we computed above.
5164 	 */
5165 	if (ret <= 0)
5166 		return 0;
5167 	return 1;
5168 }
5169 
5170 /*
5171  * A helper function to walk down the tree starting at min_key, and looking
5172  * for nodes or leaves that are have a minimum transaction id.
5173  * This is used by the btree defrag code, and tree logging
5174  *
5175  * This does not cow, but it does stuff the starting key it finds back
5176  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5177  * key and get a writable path.
5178  *
5179  * This honors path->lowest_level to prevent descent past a given level
5180  * of the tree.
5181  *
5182  * min_trans indicates the oldest transaction that you are interested
5183  * in walking through.  Any nodes or leaves older than min_trans are
5184  * skipped over (without reading them).
5185  *
5186  * returns zero if something useful was found, < 0 on error and 1 if there
5187  * was nothing in the tree that matched the search criteria.
5188  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* keep_locks is required by btrfs_find_next_key() below */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* whole tree older than min_trans: nothing to find */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* found a next key; restart from the root */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = read_node_slot(fs_info, cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		/* hand the new child to the path, drop unneeded parent locks */
		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the found key back to the caller */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5292 
tree_move_down(struct btrfs_fs_info * fs_info,struct btrfs_path * path,int * level)5293 static int tree_move_down(struct btrfs_fs_info *fs_info,
5294 			   struct btrfs_path *path,
5295 			   int *level)
5296 {
5297 	struct extent_buffer *eb;
5298 
5299 	BUG_ON(*level == 0);
5300 	eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5301 	if (IS_ERR(eb))
5302 		return PTR_ERR(eb);
5303 
5304 	path->nodes[*level - 1] = eb;
5305 	path->slots[*level - 1] = 0;
5306 	(*level)--;
5307 	return 0;
5308 }
5309 
tree_move_next_or_upnext(struct btrfs_path * path,int * level,int root_level)5310 static int tree_move_next_or_upnext(struct btrfs_path *path,
5311 				    int *level, int root_level)
5312 {
5313 	int ret = 0;
5314 	int nritems;
5315 	nritems = btrfs_header_nritems(path->nodes[*level]);
5316 
5317 	path->slots[*level]++;
5318 
5319 	while (path->slots[*level] >= nritems) {
5320 		if (*level == root_level)
5321 			return -1;
5322 
5323 		/* move upnext */
5324 		path->slots[*level] = 0;
5325 		free_extent_buffer(path->nodes[*level]);
5326 		path->nodes[*level] = NULL;
5327 		(*level)++;
5328 		path->slots[*level]++;
5329 
5330 		nritems = btrfs_header_nritems(path->nodes[*level]);
5331 		ret = 1;
5332 	}
5333 	return ret;
5334 }
5335 
5336 /*
5337  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5338  * or down.
5339  */
tree_advance(struct btrfs_fs_info * fs_info,struct btrfs_path * path,int * level,int root_level,int allow_down,struct btrfs_key * key)5340 static int tree_advance(struct btrfs_fs_info *fs_info,
5341 			struct btrfs_path *path,
5342 			int *level, int root_level,
5343 			int allow_down,
5344 			struct btrfs_key *key)
5345 {
5346 	int ret;
5347 
5348 	if (*level == 0 || !allow_down) {
5349 		ret = tree_move_next_or_upnext(path, level, root_level);
5350 	} else {
5351 		ret = tree_move_down(fs_info, path, level);
5352 	}
5353 	if (ret >= 0) {
5354 		if (*level == 0)
5355 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5356 					path->slots[*level]);
5357 		else
5358 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5359 					path->slots[*level]);
5360 	}
5361 	return ret;
5362 }
5363 
tree_compare_item(struct btrfs_path * left_path,struct btrfs_path * right_path,char * tmp_buf)5364 static int tree_compare_item(struct btrfs_path *left_path,
5365 			     struct btrfs_path *right_path,
5366 			     char *tmp_buf)
5367 {
5368 	int cmp;
5369 	int len1, len2;
5370 	unsigned long off1, off2;
5371 
5372 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5373 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5374 	if (len1 != len2)
5375 		return 1;
5376 
5377 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5378 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5379 				right_path->slots[0]);
5380 
5381 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5382 
5383 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5384 	if (cmp)
5385 		return 1;
5386 	return 0;
5387 }
5388 
5389 #define ADVANCE 1
5390 #define ADVANCE_ONLY_NEXT -1
5391 
5392 /*
5393  * This function compares two trees and calls the provided callback for
5394  * every changed/new/deleted item it finds.
5395  * If shared tree blocks are encountered, whole subtrees are skipped, making
5396  * the compare pretty fast on snapshotted subvolumes.
5397  *
5398  * This currently works on commit roots only. As commit roots are read only,
5399  * we don't do any locking. The commit roots are protected with transactions.
5400  * Transactions are ended and rejoined when a commit is tried in between.
5401  *
5402  * This function checks for modifications done to the trees while comparing.
5403  * If it detects a change, it aborts immediately.
5404  */
int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root,
			btrfs_changed_cb_t changed_cb, void *ctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached;
	int right_end_reached;
	int advance_left;
	int advance_right;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scratch space for tree_compare_item(); nodesize fits any item. */
	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	/* Commit roots are read only, so no locking/COW is needed or wanted. */
	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	/*
	 * NOTE(review): btrfs_clone_extent_buffer() already hands back a
	 * referenced buffer; the extra get here looks redundant — verify
	 * this reference is not leaked.
	 */
	extent_buffer_get(left_path->nodes[left_level]);

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	/* NOTE(review): same extra-reference question as the left side. */
	extent_buffer_get(right_path->nodes[right_level]);
	up_read(&fs_info->commit_root_sem);

	/* Seed the comparison keys from the first slot of each root. */
	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	left_end_reached = right_end_reached = 0;
	advance_left = advance_right = 0;

	while (1) {
		cond_resched();
		if (advance_left && !left_end_reached) {
			ret = tree_advance(fs_info, left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(fs_info, right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			/* both trees fully consumed: we're done */
			ret = 0;
			goto out;
		} else if (left_end_reached) {
			/* everything left in the right tree was deleted */
			if (right_level == 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			/* everything left in the left tree is new */
			if (left_level == 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
				advance_right = ADVANCE;
			} else {
				/* same key on both sides: deep-compare bytes */
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
5660 
5661 /*
5662  * this is similar to btrfs_next_leaf, but does not try to preserve
5663  * and fixup the path.  It looks for and returns the next key in the
5664  * tree based on the current path and the min_trans parameters.
5665  *
5666  * 0 is returned if another key is found, < 0 if there are any errors
5667  * and 1 is returned if there are no higher keys in the tree
5668  *
5669  * path->keep_locks should be set to 1 on the search made before
5670  * calling this function.
5671  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			/* this block is exhausted, look at the level above */
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				/* parent is still locked, safe to climb */
				level++;
				continue;
			}

			/*
			 * Parent lock was dropped: re-search down to this
			 * level using the last key of the current block.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip child pointers older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5736 
5737 /*
5738  * search the tree again to find a leaf with greater keys
5739  * returns 0 if it found something or 1 if there are no greater leaves.
5740  * returns < 0 on io errors.
5741  */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0: regular search, no btrfs_search_old_slot() replay */
	return btrfs_next_old_leaf(root, path, 0);
}
5746 
btrfs_next_old_leaf(struct btrfs_root * root,struct btrfs_path * path,u64 time_seq)5747 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5748 			u64 time_seq)
5749 {
5750 	int slot;
5751 	int level;
5752 	struct extent_buffer *c;
5753 	struct extent_buffer *next;
5754 	struct btrfs_key key;
5755 	u32 nritems;
5756 	int ret;
5757 	int old_spinning = path->leave_spinning;
5758 	int next_rw_lock = 0;
5759 
5760 	nritems = btrfs_header_nritems(path->nodes[0]);
5761 	if (nritems == 0)
5762 		return 1;
5763 
5764 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5765 again:
5766 	level = 1;
5767 	next = NULL;
5768 	next_rw_lock = 0;
5769 	btrfs_release_path(path);
5770 
5771 	path->keep_locks = 1;
5772 	path->leave_spinning = 1;
5773 
5774 	if (time_seq)
5775 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5776 	else
5777 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5778 	path->keep_locks = 0;
5779 
5780 	if (ret < 0)
5781 		return ret;
5782 
5783 	nritems = btrfs_header_nritems(path->nodes[0]);
5784 	/*
5785 	 * by releasing the path above we dropped all our locks.  A balance
5786 	 * could have added more items next to the key that used to be
5787 	 * at the very end of the block.  So, check again here and
5788 	 * advance the path if there are now more items available.
5789 	 */
5790 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5791 		if (ret == 0)
5792 			path->slots[0]++;
5793 		ret = 0;
5794 		goto done;
5795 	}
5796 	/*
5797 	 * So the above check misses one case:
5798 	 * - after releasing the path above, someone has removed the item that
5799 	 *   used to be at the very end of the block, and balance between leafs
5800 	 *   gets another one with bigger key.offset to replace it.
5801 	 *
5802 	 * This one should be returned as well, or we can get leaf corruption
5803 	 * later(esp. in __btrfs_drop_extents()).
5804 	 *
5805 	 * And a bit more explanation about this check,
5806 	 * with ret > 0, the key isn't found, the path points to the slot
5807 	 * where it should be inserted, so the path->slots[0] item must be the
5808 	 * bigger one.
5809 	 */
5810 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5811 		ret = 0;
5812 		goto done;
5813 	}
5814 
5815 	while (level < BTRFS_MAX_LEVEL) {
5816 		if (!path->nodes[level]) {
5817 			ret = 1;
5818 			goto done;
5819 		}
5820 
5821 		slot = path->slots[level] + 1;
5822 		c = path->nodes[level];
5823 		if (slot >= btrfs_header_nritems(c)) {
5824 			level++;
5825 			if (level == BTRFS_MAX_LEVEL) {
5826 				ret = 1;
5827 				goto done;
5828 			}
5829 			continue;
5830 		}
5831 
5832 		if (next) {
5833 			btrfs_tree_unlock_rw(next, next_rw_lock);
5834 			free_extent_buffer(next);
5835 		}
5836 
5837 		next = c;
5838 		next_rw_lock = path->locks[level];
5839 		ret = read_block_for_search(root, path, &next, level,
5840 					    slot, &key);
5841 		if (ret == -EAGAIN)
5842 			goto again;
5843 
5844 		if (ret < 0) {
5845 			btrfs_release_path(path);
5846 			goto done;
5847 		}
5848 
5849 		if (!path->skip_locking) {
5850 			ret = btrfs_try_tree_read_lock(next);
5851 			if (!ret && time_seq) {
5852 				/*
5853 				 * If we don't get the lock, we may be racing
5854 				 * with push_leaf_left, holding that lock while
5855 				 * itself waiting for the leaf we've currently
5856 				 * locked. To solve this situation, we give up
5857 				 * on our lock and cycle.
5858 				 */
5859 				free_extent_buffer(next);
5860 				btrfs_release_path(path);
5861 				cond_resched();
5862 				goto again;
5863 			}
5864 			if (!ret) {
5865 				btrfs_set_path_blocking(path);
5866 				btrfs_tree_read_lock(next);
5867 				btrfs_clear_path_blocking(path, next,
5868 							  BTRFS_READ_LOCK);
5869 			}
5870 			next_rw_lock = BTRFS_READ_LOCK;
5871 		}
5872 		break;
5873 	}
5874 	path->slots[level] = slot;
5875 	while (1) {
5876 		level--;
5877 		c = path->nodes[level];
5878 		if (path->locks[level])
5879 			btrfs_tree_unlock_rw(c, path->locks[level]);
5880 
5881 		free_extent_buffer(c);
5882 		path->nodes[level] = next;
5883 		path->slots[level] = 0;
5884 		if (!path->skip_locking)
5885 			path->locks[level] = next_rw_lock;
5886 		if (!level)
5887 			break;
5888 
5889 		ret = read_block_for_search(root, path, &next, level,
5890 					    0, &key);
5891 		if (ret == -EAGAIN)
5892 			goto again;
5893 
5894 		if (ret < 0) {
5895 			btrfs_release_path(path);
5896 			goto done;
5897 		}
5898 
5899 		if (!path->skip_locking) {
5900 			ret = btrfs_try_tree_read_lock(next);
5901 			if (!ret) {
5902 				btrfs_set_path_blocking(path);
5903 				btrfs_tree_read_lock(next);
5904 				btrfs_clear_path_blocking(path, next,
5905 							  BTRFS_READ_LOCK);
5906 			}
5907 			next_rw_lock = BTRFS_READ_LOCK;
5908 		}
5909 	}
5910 	ret = 0;
5911 done:
5912 	unlock_up(path, 0, 1, 0, NULL);
5913 	path->leave_spinning = old_spinning;
5914 	if (!old_spinning)
5915 		btrfs_set_path_blocking(path);
5916 
5917 	return ret;
5918 }
5919 
5920 /*
5921  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5922  * searching until it gets past min_objectid or finds an item of 'type'
5923  *
5924  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5925  */
/*
 * Walk backwards through the tree (using btrfs_prev_leaf to cross leaf
 * boundaries) until an item of the requested @type is found or the walk
 * moves past @min_objectid.
 *
 * Returns 0 when a matching item is found (path points at it), 1 when
 * nothing was found and < 0 on error.
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 count;
	int ret;

	for (;;) {
		if (path->slots[0] > 0) {
			path->slots[0]--;
		} else {
			/* Crossing into the previous leaf may block. */
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret)
				return ret;
		}
		leaf = path->nodes[0];
		count = btrfs_header_nritems(leaf);
		if (count == 0)
			return 1;
		/* Clamp the slot to the last valid item of this leaf. */
		if (path->slots[0] == count)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid < min_objectid)
			return 1;
		if (key.type == type)
			return 0;
		if (key.objectid == min_objectid && key.type < type)
			return 1;
	}
}
5962 
5963 /*
5964  * search in extent tree to find a previous Metadata/Data extent item with
5965  * min objectid.
5966  *
5967  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5968  */
/*
 * Walk backwards in the extent tree to find the previous Metadata/Data
 * extent item whose objectid is at least @min_objectid.
 *
 * Returns 0 when such an item is found (path points at it), 1 when
 * nothing was found and < 0 on error.
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 count;
	int ret;

	for (;;) {
		if (path->slots[0] > 0) {
			path->slots[0]--;
		} else {
			/* Crossing into the previous leaf may block. */
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret)
				return ret;
		}
		leaf = path->nodes[0];
		count = btrfs_header_nritems(leaf);
		if (count == 0)
			return 1;
		/* Clamp the slot to the last valid item of this leaf. */
		if (path->slots[0] == count)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid < min_objectid)
			return 1;
		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (key.objectid == min_objectid &&
		    key.type < BTRFS_EXTENT_ITEM_KEY)
			return 1;
	}
}
6005