// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
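
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * the usual lifecycle of a path.  Allocate it, search with it, then free
 * it (freeing also releases any locks and extent buffer references the
 * path still holds):
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0) {
 *		btrfs_free_path(path);
 *		return ret;
 *	}
 *	... use path->nodes[0] and path->slots[0] ...
 *	btrfs_free_path(path);
 */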

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths for which no locks or extent buffers
 * are held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
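
/*
 * A sketch (paraphrasing the retry loop btrfs_lock_root_node() uses, not
 * taken verbatim from this file) of how callers pair btrfs_root_node()
 * with locking: take a reference, lock the buffer, and retry if it
 * stopped being the root in the meantime:
 *
 *	while (1) {
 *		eb = btrfs_root_node(root);
 *		btrfs_tree_lock(eb);
 *		if (eb == root->node)
 *			break;
 *		btrfs_tree_unlock(eb);
 *		free_extent_buffer(eb);
 *	}
 */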

/*
 * COW-only roots (non-shareable trees, i.e. everything that is not a
 * subvolume or reloc root) just get put onto a simple dirty list.  The
 * transaction code walks this list to make sure they get properly
 * updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;
	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed on blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work of COWing a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is
 * marked dirty and returned locked.  If you modify the block it needs to
 * be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

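	/*
	 * Round buf->start down to the containing 1GiB boundary, hinting the
	 * allocator toward the region where the original block lives.
	 */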
	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
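
/*
 * Worked example (illustrative, assuming a 16KiB nodesize): blocknr 0 and
 * other 40960 are close, since 40960 - (0 + 16384) = 24576 < 32768; with
 * other 65536 they are not, since 65536 - 16384 = 49152 >= 32768.
 */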

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is the same as the CPU
 * order and we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys, only with two struct btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
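
/*
 * Keys order lexicographically by (objectid, type, offset).  For example,
 * with illustrative values:
 *
 *	(256, BTRFS_INODE_ITEM_KEY, 0)  <  (256, BTRFS_INODE_REF_KEY, 256)
 *	(256, BTRFS_INODE_REF_KEY, 256) <  (257, BTRFS_INODE_ITEM_KEY, 0)
 *
 * because the objectid is compared first, then the type (here
 * BTRFS_INODE_ITEM_KEY < BTRFS_INODE_REF_KEY), then the offset.
 */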

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * Slot may point to the total number of items if the key is bigger than
 * all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key, int *slot)
{
	int low = 0;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	if (btrfs_header_level(eb) == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item), key, slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr), key, slot);
}
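
/*
 * A sketch of how the return convention is typically consumed (not part
 * of the original file): 0 means the key was found at *slot, 1 means it
 * was not found and *slot is where it would be inserted (possibly equal
 * to the number of items), negative values are errors.
 *
 *	ret = btrfs_bin_search(eb, &key, &slot);
 *	if (ret < 0)
 *		return ret;
 *	if (ret == 0)
 *		... the item lives at 'slot' ...
 *	else
 *		... an insert would go at 'slot' ...
 */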

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto enospc;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto enospc;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto enospc;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
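
/*
 * Illustrative call (a sketch; 'tree_root' and 'subvol_id' are made-up
 * names for the example): look up the root item for a subvolume, keyed
 * by its objectid and type while ignoring the offset:
 *
 *	struct btrfs_key found;
 *	int ret;
 *
 *	ret = btrfs_find_item(tree_root, path, subvol_id, 0,
 *			      BTRFS_ROOT_ITEM_KEY, &found);
 *	if (ret < 0)
 *		return ret;
 *	if (ret > 0)
 *		... no item with that objectid and type exists ...
 */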

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean indicating whether CoW operations should be performed;
 *		must always be 1 when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well.
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		down_read(&fs_info->commit_root_sem);
	}

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			     level + 1 < BTRFS_MAX_LEVEL &&
			     p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
1856 /*
1857 		 * Leave the path with blocking locks to avoid a massive
1858 		 * lock context switch; this is done on purpose.
1859 */
1860
1861 /*
1862 * we have a lock on b and as long as we aren't changing
1863 		 * the tree, there is no way for the items in b to change.
1864 * It is safe to drop the lock on our parent before we
1865 * go through the expensive btree search on b.
1866 *
1867 * If we're inserting or deleting (ins_len != 0), then we might
1868 * be changing slot zero, which may require changing the parent.
1869 * So, we can't drop the lock until after we know which slot
1870 * we're operating on.
1871 */
1872 if (!ins_len && !p->keep_locks) {
1873 int u = level + 1;
1874
1875 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1876 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1877 p->locks[u] = 0;
1878 }
1879 }
1880
1881 /*
1882 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1883 * we can safely assume the target key will always be in slot 0
1884 * on lower levels due to the invariants BTRFS' btree provides,
1885 * namely that a btrfs_key_ptr entry always points to the
1886 * lowest key in the child node, thus we can skip searching
1887 * lower levels
1888 */
1889 if (prev_cmp == 0) {
1890 slot = 0;
1891 ret = 0;
1892 } else {
1893 ret = btrfs_bin_search(b, key, &slot);
1894 prev_cmp = ret;
1895 if (ret < 0)
1896 goto done;
1897 }
1898
1899 if (level == 0) {
1900 p->slots[level] = slot;
1901 /*
1902 * Item key already exists. In this case, if we are
1903 * allowed to insert the item (for example, in dir_item
1904 * case, item key collision is allowed), it will be
1905 * merged with the original item. Only the item size
1906 * grows, no new btrfs item will be added. If
1907 * search_for_extension is not set, ins_len already
1908 			 * accounts for the size of struct btrfs_item, deduct it
1909 			 * here so the leaf space check will be correct.
1910 */
1911 if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1912 ASSERT(ins_len >= sizeof(struct btrfs_item));
1913 ins_len -= sizeof(struct btrfs_item);
1914 }
1915 if (ins_len > 0 &&
1916 btrfs_leaf_free_space(b) < ins_len) {
1917 if (write_lock_level < 1) {
1918 write_lock_level = 1;
1919 btrfs_release_path(p);
1920 goto again;
1921 }
1922
1923 err = split_leaf(trans, root, key,
1924 p, ins_len, ret == 0);
1925
1926 BUG_ON(err > 0);
1927 if (err) {
1928 ret = err;
1929 goto done;
1930 }
1931 }
1932 if (!p->search_for_split)
1933 unlock_up(p, level, lowest_unlock,
1934 min_write_lock_level, NULL);
1935 goto done;
1936 }
1937 if (ret && slot > 0) {
1938 dec = 1;
1939 slot--;
1940 }
1941 p->slots[level] = slot;
1942 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1943 &write_lock_level);
1944 if (err == -EAGAIN)
1945 goto again;
1946 if (err) {
1947 ret = err;
1948 goto done;
1949 }
1950 b = p->nodes[level];
1951 slot = p->slots[level];
1952
1953 /*
1954 * Slot 0 is special, if we change the key we have to update
1955 * the parent pointer which means we must have a write lock on
1956 * the parent
1957 */
1958 if (slot == 0 && ins_len && write_lock_level < level + 1) {
1959 write_lock_level = level + 1;
1960 btrfs_release_path(p);
1961 goto again;
1962 }
1963
1964 unlock_up(p, level, lowest_unlock, min_write_lock_level,
1965 &write_lock_level);
1966
1967 if (level == lowest_level) {
1968 if (dec)
1969 p->slots[level]++;
1970 goto done;
1971 }
1972
1973 err = read_block_for_search(root, p, &b, level, slot, key);
1974 if (err == -EAGAIN)
1975 goto again;
1976 if (err) {
1977 ret = err;
1978 goto done;
1979 }
1980
1981 if (!p->skip_locking) {
1982 level = btrfs_header_level(b);
1983
1984 btrfs_maybe_reset_lockdep_class(root, b);
1985
1986 if (level <= write_lock_level) {
1987 btrfs_tree_lock(b);
1988 p->locks[level] = BTRFS_WRITE_LOCK;
1989 } else {
1990 btrfs_tree_read_lock(b);
1991 p->locks[level] = BTRFS_READ_LOCK;
1992 }
1993 p->nodes[level] = b;
1994 }
1995 }
1996 ret = 1;
1997 done:
1998 if (ret < 0 && !p->skip_release_on_error)
1999 btrfs_release_path(p);
2000
2001 if (p->need_commit_sem) {
2002 int ret2;
2003
2004 ret2 = finish_need_commit_sem_search(p);
2005 up_read(&fs_info->commit_root_sem);
2006 if (ret2)
2007 ret = ret2;
2008 }
2009
2010 return ret;
2011 }
2012 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
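
/*
 * Example (editorial sketch, not part of the original file): the typical
 * read-only caller pattern for btrfs_search_slot().  The root and key are
 * assumed to come from the caller; with a NULL transaction, ins_len == 0
 * and cow == 0 the tree is never modified.
 */
static int example_lookup_item(struct btrfs_root *root,
			       const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* exact match: item is at path->nodes[0], path->slots[0] */
	} else if (ret > 0) {
		/* not found: the slot is where the key would be inserted */
	}

	/* freeing the path also drops all locks and extent buffer refs */
	btrfs_free_path(path);
	return ret;
}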
2013
2014 /*
2015 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2016 * current state of the tree together with the operations recorded in the tree
2017 * modification log to search for the key in a previous version of this tree, as
2018 * denoted by the time_seq parameter.
2019 *
2020 * Naturally, there is no support for insert, delete or cow operations.
2021 *
2022 * The resulting path and return value will be set up as if we called
2023 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2024 */
2025 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2026 struct btrfs_path *p, u64 time_seq)
2027 {
2028 struct btrfs_fs_info *fs_info = root->fs_info;
2029 struct extent_buffer *b;
2030 int slot;
2031 int ret;
2032 int err;
2033 int level;
2034 int lowest_unlock = 1;
2035 u8 lowest_level = 0;
2036
2037 lowest_level = p->lowest_level;
2038 WARN_ON(p->nodes[0] != NULL);
2039
2040 if (p->search_commit_root) {
2041 BUG_ON(time_seq);
2042 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2043 }
2044
2045 again:
2046 b = btrfs_get_old_root(root, time_seq);
2047 if (!b) {
2048 ret = -EIO;
2049 goto done;
2050 }
2051 level = btrfs_header_level(b);
2052 p->locks[level] = BTRFS_READ_LOCK;
2053
2054 while (b) {
2055 int dec = 0;
2056
2057 level = btrfs_header_level(b);
2058 p->nodes[level] = b;
2059
2060 /*
2061 * we have a lock on b and as long as we aren't changing
2062 	 * the tree, there is no way for the items in b to change.
2063 * It is safe to drop the lock on our parent before we
2064 * go through the expensive btree search on b.
2065 */
2066 btrfs_unlock_up_safe(p, level + 1);
2067
2068 ret = btrfs_bin_search(b, key, &slot);
2069 if (ret < 0)
2070 goto done;
2071
2072 if (level == 0) {
2073 p->slots[level] = slot;
2074 unlock_up(p, level, lowest_unlock, 0, NULL);
2075 goto done;
2076 }
2077
2078 if (ret && slot > 0) {
2079 dec = 1;
2080 slot--;
2081 }
2082 p->slots[level] = slot;
2083 unlock_up(p, level, lowest_unlock, 0, NULL);
2084
2085 if (level == lowest_level) {
2086 if (dec)
2087 p->slots[level]++;
2088 goto done;
2089 }
2090
2091 err = read_block_for_search(root, p, &b, level, slot, key);
2092 if (err == -EAGAIN)
2093 goto again;
2094 if (err) {
2095 ret = err;
2096 goto done;
2097 }
2098
2099 level = btrfs_header_level(b);
2100 btrfs_tree_read_lock(b);
2101 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2102 if (!b) {
2103 ret = -ENOMEM;
2104 goto done;
2105 }
2106 p->locks[level] = BTRFS_READ_LOCK;
2107 p->nodes[level] = b;
2108 }
2109 ret = 1;
2110 done:
2111 if (ret < 0)
2112 btrfs_release_path(p);
2113
2114 return ret;
2115 }
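
/*
 * Example (editorial sketch, not part of the original file): searching a
 * past version of a tree.  It assumes the caller obtained @time_seq from
 * the tree mod log (e.g. via btrfs_get_tree_mod_seq()) before the
 * modifications it wants to rewind were made.
 */
static int example_old_lookup(struct btrfs_root *root,
			      const struct btrfs_key *key, u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* same return convention as btrfs_search_slot() with ins_len/cow == 0 */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	btrfs_free_path(path);
	return ret;
}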
2116
2117 /*
2118 * Helper to use instead of btrfs_search_slot() when no exact match is needed;
2119 * the next or previous item is returned instead.
2120 * When find_higher is true, the next higher item is returned, the next lower
2121 * otherwise.
2122 * When return_any and find_higher are both true, and no higher item is found,
2123 * return the next lower instead.
2124 * When return_any is true and find_higher is false, and no lower item is found,
2125 * return the next higher instead.
2126 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2127 * < 0 on error
2128 */
2129 int btrfs_search_slot_for_read(struct btrfs_root *root,
2130 const struct btrfs_key *key,
2131 struct btrfs_path *p, int find_higher,
2132 int return_any)
2133 {
2134 int ret;
2135 struct extent_buffer *leaf;
2136
2137 again:
2138 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2139 if (ret <= 0)
2140 return ret;
2141 /*
2142 * a return value of 1 means the path is at the position where the
2143 * item should be inserted. Normally this is the next bigger item,
2144 * but in case the previous item is the last in a leaf, path points
2145 * to the first free slot in the previous leaf, i.e. at an invalid
2146 * item.
2147 */
2148 leaf = p->nodes[0];
2149
2150 if (find_higher) {
2151 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2152 ret = btrfs_next_leaf(root, p);
2153 if (ret <= 0)
2154 return ret;
2155 if (!return_any)
2156 return 1;
2157 /*
2158 * no higher item found, return the next
2159 * lower instead
2160 */
2161 return_any = 0;
2162 find_higher = 0;
2163 btrfs_release_path(p);
2164 goto again;
2165 }
2166 } else {
2167 if (p->slots[0] == 0) {
2168 ret = btrfs_prev_leaf(root, p);
2169 if (ret < 0)
2170 return ret;
2171 if (!ret) {
2172 leaf = p->nodes[0];
2173 if (p->slots[0] == btrfs_header_nritems(leaf))
2174 p->slots[0]--;
2175 return 0;
2176 }
2177 if (!return_any)
2178 return 1;
2179 /*
2180 * no lower item found, return the next
2181 * higher instead
2182 */
2183 return_any = 0;
2184 find_higher = 1;
2185 btrfs_release_path(p);
2186 goto again;
2187 } else {
2188 --p->slots[0];
2189 }
2190 }
2191 return 0;
2192 }
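
/*
 * Example (editorial sketch, not part of the original file): finding the
 * first item at or after @key, falling back to the closest lower item when
 * nothing higher exists.  On success the found key is copied back into @key.
 */
static int example_find_nearest(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	btrfs_free_path(path);
	return ret;	/* 0 found, 1 tree empty, < 0 error */
}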
2193
2194 /*
2195 * Execute search and call btrfs_previous_item to traverse backwards if the item
2196 * was not found.
2197 *
2198 * Return 0 if found, 1 if not found and < 0 if error.
2199 */
2200 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2201 struct btrfs_path *path)
2202 {
2203 int ret;
2204
2205 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2206 if (ret > 0)
2207 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2208
2209 if (ret == 0)
2210 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2211
2212 return ret;
2213 }
2214
2215 /*
2216 * adjust the pointers going up the tree, starting at level,
2217 * making sure the right key of each node points to 'key'.
2218 * This is used after shifting pointers to the left, so it stops
2219 * fixing up pointers when a given leaf/node is not in slot 0 of the
2220 * higher levels
2221 *
2222 */
2223 static void fixup_low_keys(struct btrfs_path *path,
2224 struct btrfs_disk_key *key, int level)
2225 {
2226 int i;
2227 struct extent_buffer *t;
2228 int ret;
2229
2230 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2231 int tslot = path->slots[i];
2232
2233 if (!path->nodes[i])
2234 break;
2235 t = path->nodes[i];
2236 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2237 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2238 BUG_ON(ret < 0);
2239 btrfs_set_node_key(t, key, tslot);
2240 btrfs_mark_buffer_dirty(path->nodes[i]);
2241 if (tslot != 0)
2242 break;
2243 }
2244 }
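
/*
 * Illustration (editorial, not part of the original file): replacing the key
 * in slot 0 of a leaf must be mirrored in every ancestor that reaches the
 * leaf through its own slot 0:
 *
 *	[ K ... ]	level 2  (updated, child sits in slot 0)
 *	  |
 *	[ K ... ]	level 1  (updated, child sits in slot 0)
 *	  |
 *	[ K ... ]	leaf     (new lowest key K)
 *
 * The loop above stops at the first level whose slot is not 0, because keys
 * higher up are unaffected by the change.
 */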
2245
2246 /*
2247 * update item key.
2248 *
2249 * This function isn't completely safe. It is the caller's responsibility
2250 * to ensure that the new key won't break the key ordering.
2251 */
2252 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2253 struct btrfs_path *path,
2254 const struct btrfs_key *new_key)
2255 {
2256 struct btrfs_disk_key disk_key;
2257 struct extent_buffer *eb;
2258 int slot;
2259
2260 eb = path->nodes[0];
2261 slot = path->slots[0];
2262 if (slot > 0) {
2263 btrfs_item_key(eb, &disk_key, slot - 1);
2264 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2265 btrfs_crit(fs_info,
2266 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2267 slot, btrfs_disk_key_objectid(&disk_key),
2268 btrfs_disk_key_type(&disk_key),
2269 btrfs_disk_key_offset(&disk_key),
2270 new_key->objectid, new_key->type,
2271 new_key->offset);
2272 btrfs_print_leaf(eb);
2273 BUG();
2274 }
2275 }
2276 if (slot < btrfs_header_nritems(eb) - 1) {
2277 btrfs_item_key(eb, &disk_key, slot + 1);
2278 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2279 btrfs_crit(fs_info,
2280 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2281 slot, btrfs_disk_key_objectid(&disk_key),
2282 btrfs_disk_key_type(&disk_key),
2283 btrfs_disk_key_offset(&disk_key),
2284 new_key->objectid, new_key->type,
2285 new_key->offset);
2286 btrfs_print_leaf(eb);
2287 BUG();
2288 }
2289 }
2290
2291 btrfs_cpu_key_to_disk(&disk_key, new_key);
2292 btrfs_set_item_key(eb, &disk_key, slot);
2293 btrfs_mark_buffer_dirty(eb);
2294 if (slot == 0)
2295 fixup_low_keys(path, &disk_key, 1);
2296 }
2297
2298 /*
2299 * Check key order of two sibling extent buffers.
2300 *
2301 * Return true if something is wrong.
2302 * Return false if everything is fine.
2303 *
2304 * Tree-checker only works inside one tree block, thus the following
2305 * corruption cannot be detected by the tree-checker:
2306 *
2307 * Leaf @left | Leaf @right
2308 * --------------------------------------------------------------
2309 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2310 *
2311 * Key f6 in leaf @left itself is valid, but not valid when the next
2312 * key in leaf @right is 7.
2313 * This can only be checked at tree block merge time.
2314 * And since the tree checker has ensured all key order in each tree block
2315 * is correct, we only need to check the last key of @left and the first
2316 * key of @right.
2317 */
2318 static bool check_sibling_keys(struct extent_buffer *left,
2319 struct extent_buffer *right)
2320 {
2321 struct btrfs_key left_last;
2322 struct btrfs_key right_first;
2323 int level = btrfs_header_level(left);
2324 int nr_left = btrfs_header_nritems(left);
2325 int nr_right = btrfs_header_nritems(right);
2326
2327 /* No key to check in one of the tree blocks */
2328 if (!nr_left || !nr_right)
2329 return false;
2330
2331 if (level) {
2332 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2333 btrfs_node_key_to_cpu(right, &right_first, 0);
2334 } else {
2335 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2336 btrfs_item_key_to_cpu(right, &right_first, 0);
2337 }
2338
2339 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2340 btrfs_crit(left->fs_info,
2341 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2342 left_last.objectid, left_last.type,
2343 left_last.offset, right_first.objectid,
2344 right_first.type, right_first.offset);
2345 return true;
2346 }
2347 return false;
2348 }
2349
2350 /*
2351 * try to push data from one node into the next node left in the
2352 * tree.
2353 *
2354 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2355 * error, and > 0 if there was no room in the left hand block.
2356 */
2357 static int push_node_left(struct btrfs_trans_handle *trans,
2358 struct extent_buffer *dst,
2359 struct extent_buffer *src, int empty)
2360 {
2361 struct btrfs_fs_info *fs_info = trans->fs_info;
2362 int push_items = 0;
2363 int src_nritems;
2364 int dst_nritems;
2365 int ret = 0;
2366
2367 src_nritems = btrfs_header_nritems(src);
2368 dst_nritems = btrfs_header_nritems(dst);
2369 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2370 WARN_ON(btrfs_header_generation(src) != trans->transid);
2371 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2372
2373 if (!empty && src_nritems <= 8)
2374 return 1;
2375
2376 if (push_items <= 0)
2377 return 1;
2378
2379 if (empty) {
2380 push_items = min(src_nritems, push_items);
2381 if (push_items < src_nritems) {
2382 /* leave at least 8 pointers in the node if
2383 * we aren't going to empty it
2384 */
2385 if (src_nritems - push_items < 8) {
2386 if (push_items <= 8)
2387 return 1;
2388 push_items -= 8;
2389 }
2390 }
2391 } else
2392 push_items = min(src_nritems - 8, push_items);
2393
2394 /* dst is the left eb, src is the middle eb */
2395 if (check_sibling_keys(dst, src)) {
2396 ret = -EUCLEAN;
2397 btrfs_abort_transaction(trans, ret);
2398 return ret;
2399 }
2400 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2401 if (ret) {
2402 btrfs_abort_transaction(trans, ret);
2403 return ret;
2404 }
2405 copy_extent_buffer(dst, src,
2406 btrfs_node_key_ptr_offset(dst_nritems),
2407 btrfs_node_key_ptr_offset(0),
2408 push_items * sizeof(struct btrfs_key_ptr));
2409
2410 if (push_items < src_nritems) {
2411 /*
2412 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2413 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2414 */
2415 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2416 btrfs_node_key_ptr_offset(push_items),
2417 (src_nritems - push_items) *
2418 sizeof(struct btrfs_key_ptr));
2419 }
2420 btrfs_set_header_nritems(src, src_nritems - push_items);
2421 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2422 btrfs_mark_buffer_dirty(src);
2423 btrfs_mark_buffer_dirty(dst);
2424
2425 return ret;
2426 }
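
/*
 * Worked example (editorial, not part of the original file): with @empty == 0
 * and a source node holding 20 pointers, push_node_left() moves at most
 * 20 - 8 = 12 pointers, further capped by the free slots in @dst, so the
 * source never drops below 8 pointers; sources with 8 or fewer pointers are
 * left alone entirely.  With @empty == 1 the source may be drained completely
 * when @dst has room for everything.
 */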
2427
2428 /*
2429 * try to push data from one node into the next node right in the
2430 * tree.
2431 *
2432 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2433 * error, and > 0 if there was no room in the right hand block.
2434 *
2435 * this will only push up to 1/2 the contents of the left node over
2436 */
2437 static int balance_node_right(struct btrfs_trans_handle *trans,
2438 struct extent_buffer *dst,
2439 struct extent_buffer *src)
2440 {
2441 struct btrfs_fs_info *fs_info = trans->fs_info;
2442 int push_items = 0;
2443 int max_push;
2444 int src_nritems;
2445 int dst_nritems;
2446 int ret = 0;
2447
2448 WARN_ON(btrfs_header_generation(src) != trans->transid);
2449 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2450
2451 src_nritems = btrfs_header_nritems(src);
2452 dst_nritems = btrfs_header_nritems(dst);
2453 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2454 if (push_items <= 0)
2455 return 1;
2456
2457 if (src_nritems < 4)
2458 return 1;
2459
2460 max_push = src_nritems / 2 + 1;
2461 /* don't try to empty the node */
2462 if (max_push >= src_nritems)
2463 return 1;
2464
2465 if (max_push < push_items)
2466 push_items = max_push;
2467
2468 /* dst is the right eb, src is the middle eb */
2469 if (check_sibling_keys(src, dst)) {
2470 ret = -EUCLEAN;
2471 btrfs_abort_transaction(trans, ret);
2472 return ret;
2473 }
2474 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2475 BUG_ON(ret < 0);
2476 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2477 btrfs_node_key_ptr_offset(0),
2478 (dst_nritems) *
2479 sizeof(struct btrfs_key_ptr));
2480
2481 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2482 push_items);
2483 if (ret) {
2484 btrfs_abort_transaction(trans, ret);
2485 return ret;
2486 }
2487 copy_extent_buffer(dst, src,
2488 btrfs_node_key_ptr_offset(0),
2489 btrfs_node_key_ptr_offset(src_nritems - push_items),
2490 push_items * sizeof(struct btrfs_key_ptr));
2491
2492 btrfs_set_header_nritems(src, src_nritems - push_items);
2493 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2494
2495 btrfs_mark_buffer_dirty(src);
2496 btrfs_mark_buffer_dirty(dst);
2497
2498 return ret;
2499 }
2500
2501 /*
2502 * helper function to insert a new root level in the tree.
2503 * A new node is allocated, and a single item is inserted to
2504 * point to the existing root
2505 *
2506 * returns zero on success or < 0 on failure.
2507 */
2508 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2509 struct btrfs_root *root,
2510 struct btrfs_path *path, int level)
2511 {
2512 struct btrfs_fs_info *fs_info = root->fs_info;
2513 u64 lower_gen;
2514 struct extent_buffer *lower;
2515 struct extent_buffer *c;
2516 struct extent_buffer *old;
2517 struct btrfs_disk_key lower_key;
2518 int ret;
2519
2520 BUG_ON(path->nodes[level]);
2521 BUG_ON(path->nodes[level-1] != root->node);
2522
2523 lower = path->nodes[level-1];
2524 if (level == 1)
2525 btrfs_item_key(lower, &lower_key, 0);
2526 else
2527 btrfs_node_key(lower, &lower_key, 0);
2528
2529 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2530 &lower_key, level, root->node->start, 0,
2531 BTRFS_NESTING_NEW_ROOT);
2532 if (IS_ERR(c))
2533 return PTR_ERR(c);
2534
2535 root_add_used(root, fs_info->nodesize);
2536
2537 btrfs_set_header_nritems(c, 1);
2538 btrfs_set_node_key(c, &lower_key, 0);
2539 btrfs_set_node_blockptr(c, 0, lower->start);
2540 lower_gen = btrfs_header_generation(lower);
2541 WARN_ON(lower_gen != trans->transid);
2542
2543 btrfs_set_node_ptr_generation(c, 0, lower_gen);
2544
2545 btrfs_mark_buffer_dirty(c);
2546
2547 old = root->node;
2548 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2549 BUG_ON(ret < 0);
2550 rcu_assign_pointer(root->node, c);
2551
2552 /* the super has an extra ref to root->node */
2553 free_extent_buffer(old);
2554
2555 add_root_to_dirty_list(root);
2556 atomic_inc(&c->refs);
2557 path->nodes[level] = c;
2558 path->locks[level] = BTRFS_WRITE_LOCK;
2559 path->slots[level] = 0;
2560 return 0;
2561 }
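
/*
 * Illustration (editorial, not part of the original file): growing the tree
 * by one level.  The old root becomes the single child of a freshly
 * allocated node:
 *
 *	before:  root -> [ old ]	level N
 *
 *	after:   root -> [ c ]		level N + 1, nritems == 1
 *	                   |
 *	                 [ old ]	level N
 */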
2562
2563 /*
2564 * worker function to insert a single pointer in a node.
2565 * the node should have enough room for the pointer already
2566 *
2567 * slot and level indicate where you want the key to go, and
2568 * blocknr is the block the key points to.
2569 */
2570 static void insert_ptr(struct btrfs_trans_handle *trans,
2571 struct btrfs_path *path,
2572 struct btrfs_disk_key *key, u64 bytenr,
2573 int slot, int level)
2574 {
2575 struct extent_buffer *lower;
2576 int nritems;
2577 int ret;
2578
2579 BUG_ON(!path->nodes[level]);
2580 btrfs_assert_tree_locked(path->nodes[level]);
2581 lower = path->nodes[level];
2582 nritems = btrfs_header_nritems(lower);
2583 BUG_ON(slot > nritems);
2584 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2585 if (slot != nritems) {
2586 if (level) {
2587 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2588 slot, nritems - slot);
2589 BUG_ON(ret < 0);
2590 }
2591 memmove_extent_buffer(lower,
2592 btrfs_node_key_ptr_offset(slot + 1),
2593 btrfs_node_key_ptr_offset(slot),
2594 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2595 }
2596 if (level) {
2597 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2598 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2599 BUG_ON(ret < 0);
2600 }
2601 btrfs_set_node_key(lower, key, slot);
2602 btrfs_set_node_blockptr(lower, slot, bytenr);
2603 WARN_ON(trans->transid == 0);
2604 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2605 btrfs_set_header_nritems(lower, nritems + 1);
2606 btrfs_mark_buffer_dirty(lower);
2607 }
2608
2609 /*
2610 * split the node at the specified level in path in two.
2611 * The path is corrected to point to the appropriate node after the split
2612 *
2613 * Before splitting this tries to make some room in the node by pushing
2614 * left and right, if either one works, it returns right away.
2615 *
2616 * returns 0 on success and < 0 on failure
2617 */
2618 static noinline int split_node(struct btrfs_trans_handle *trans,
2619 struct btrfs_root *root,
2620 struct btrfs_path *path, int level)
2621 {
2622 struct btrfs_fs_info *fs_info = root->fs_info;
2623 struct extent_buffer *c;
2624 struct extent_buffer *split;
2625 struct btrfs_disk_key disk_key;
2626 int mid;
2627 int ret;
2628 u32 c_nritems;
2629
2630 c = path->nodes[level];
2631 WARN_ON(btrfs_header_generation(c) != trans->transid);
2632 if (c == root->node) {
2633 /*
2634 		 * trying to split the root, let's make a new one
2635 		 *
2636 		 * tree mod log: We don't log removal of the old root in
2637 		 * insert_new_root, because that root buffer will be kept as a
2638 * normal node. We are going to log removal of half of the
2639 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2640 * holding a tree lock on the buffer, which is why we cannot
2641 * race with other tree_mod_log users.
2642 */
2643 ret = insert_new_root(trans, root, path, level + 1);
2644 if (ret)
2645 return ret;
2646 } else {
2647 ret = push_nodes_for_insert(trans, root, path, level);
2648 c = path->nodes[level];
2649 if (!ret && btrfs_header_nritems(c) <
2650 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2651 return 0;
2652 if (ret < 0)
2653 return ret;
2654 }
2655
2656 c_nritems = btrfs_header_nritems(c);
2657 mid = (c_nritems + 1) / 2;
2658 btrfs_node_key(c, &disk_key, mid);
2659
2660 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2661 &disk_key, level, c->start, 0,
2662 BTRFS_NESTING_SPLIT);
2663 if (IS_ERR(split))
2664 return PTR_ERR(split);
2665
2666 root_add_used(root, fs_info->nodesize);
2667 ASSERT(btrfs_header_level(c) == level);
2668
2669 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2670 if (ret) {
2671 btrfs_tree_unlock(split);
2672 free_extent_buffer(split);
2673 btrfs_abort_transaction(trans, ret);
2674 return ret;
2675 }
2676 copy_extent_buffer(split, c,
2677 btrfs_node_key_ptr_offset(0),
2678 btrfs_node_key_ptr_offset(mid),
2679 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2680 btrfs_set_header_nritems(split, c_nritems - mid);
2681 btrfs_set_header_nritems(c, mid);
2682
2683 btrfs_mark_buffer_dirty(c);
2684 btrfs_mark_buffer_dirty(split);
2685
2686 insert_ptr(trans, path, &disk_key, split->start,
2687 path->slots[level + 1] + 1, level + 1);
2688
2689 if (path->slots[level] >= mid) {
2690 path->slots[level] -= mid;
2691 btrfs_tree_unlock(c);
2692 free_extent_buffer(c);
2693 path->nodes[level] = split;
2694 path->slots[level + 1] += 1;
2695 } else {
2696 btrfs_tree_unlock(split);
2697 free_extent_buffer(split);
2698 }
2699 return 0;
2700 }
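
/*
 * Illustration (editorial, not part of the original file): splitting a node
 * with c_nritems == 5 gives mid == (5 + 1) / 2 == 3, so the pointers in
 * slots 3 and 4 move to the new right sibling and the parent gains one key:
 *
 *	before:  c = [ p0 p1 p2 p3 p4 ]
 *	after:   c = [ p0 p1 p2 ]   split = [ p3 p4 ]
 *
 * The path is then fixed up so it points into whichever half contains the
 * original slot.
 */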
2701
2702 /*
2703 * how many bytes are required to store the items in a leaf. start
2704 * and nr indicate which items in the leaf to check. This totals up the
2705 * space used both by the item structs and the item data
2706 */
2707 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2708 {
2709 struct btrfs_item *start_item;
2710 struct btrfs_item *end_item;
2711 int data_len;
2712 int nritems = btrfs_header_nritems(l);
2713 int end = min(nritems, start + nr) - 1;
2714
2715 if (!nr)
2716 return 0;
2717 start_item = btrfs_item_nr(start);
2718 end_item = btrfs_item_nr(end);
2719 data_len = btrfs_item_offset(l, start_item) +
2720 btrfs_item_size(l, start_item);
2721 data_len = data_len - btrfs_item_offset(l, end_item);
2722 data_len += sizeof(struct btrfs_item) * nr;
2723 WARN_ON(data_len < 0);
2724 return data_len;
2725 }
2726
2727 /*
2728 * The space between the end of the leaf items and
2729 * the start of the leaf data. IOW, how much room
2730 * the leaf has left for both items and data
2731 */
2732 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2733 {
2734 struct btrfs_fs_info *fs_info = leaf->fs_info;
2735 int nritems = btrfs_header_nritems(leaf);
2736 int ret;
2737
2738 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2739 if (ret < 0) {
2740 btrfs_crit(fs_info,
2741 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2742 ret,
2743 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2744 leaf_space_used(leaf, 0, nritems), nritems);
2745 }
2746 return ret;
2747 }
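
/*
 * Worked example (editorial, not part of the original file; the sizes are
 * illustrative assumptions): with a 16KiB nodesize and a ~100-byte header,
 * BTRFS_LEAF_DATA_SIZE() is a bit over 16000 bytes.  A leaf holding 100
 * items of 50 data bytes each consumes
 *
 *	100 * sizeof(struct btrfs_item) + 100 * 50
 *
 * bytes, with the item array growing from the front of the leaf and the
 * data area growing back from the end; btrfs_leaf_free_space() returns the
 * gap left between the two.  A negative result can only mean corruption,
 * hence the btrfs_crit() above.
 */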
2748
2749 /*
2750 * min slot controls the lowest index we're willing to push to the
2751 * right. We'll push up to and including min_slot, but no lower
2752 */
2753 static noinline int __push_leaf_right(struct btrfs_path *path,
2754 int data_size, int empty,
2755 struct extent_buffer *right,
2756 int free_space, u32 left_nritems,
2757 u32 min_slot)
2758 {
2759 struct btrfs_fs_info *fs_info = right->fs_info;
2760 struct extent_buffer *left = path->nodes[0];
2761 struct extent_buffer *upper = path->nodes[1];
2762 struct btrfs_map_token token;
2763 struct btrfs_disk_key disk_key;
2764 int slot;
2765 u32 i;
2766 int push_space = 0;
2767 int push_items = 0;
2768 struct btrfs_item *item;
2769 u32 nr;
2770 u32 right_nritems;
2771 u32 data_end;
2772 u32 this_item_size;
2773
2774 if (empty)
2775 nr = 0;
2776 else
2777 nr = max_t(u32, 1, min_slot);
2778
2779 if (path->slots[0] >= left_nritems)
2780 push_space += data_size;
2781
2782 slot = path->slots[1];
2783 i = left_nritems - 1;
2784 while (i >= nr) {
2785 item = btrfs_item_nr(i);
2786
2787 if (!empty && push_items > 0) {
2788 if (path->slots[0] > i)
2789 break;
2790 if (path->slots[0] == i) {
2791 int space = btrfs_leaf_free_space(left);
2792
2793 if (space + push_space * 2 > free_space)
2794 break;
2795 }
2796 }
2797
2798 if (path->slots[0] == i)
2799 push_space += data_size;
2800
2801 this_item_size = btrfs_item_size(left, item);
2802 if (this_item_size + sizeof(*item) + push_space > free_space)
2803 break;
2804
2805 push_items++;
2806 push_space += this_item_size + sizeof(*item);
2807 if (i == 0)
2808 break;
2809 i--;
2810 }
2811
2812 if (push_items == 0)
2813 goto out_unlock;
2814
2815 WARN_ON(!empty && push_items == left_nritems);
2816
2817 /* push left to right */
2818 right_nritems = btrfs_header_nritems(right);
2819
2820 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2821 push_space -= leaf_data_end(left);
2822
2823 /* make room in the right data area */
2824 data_end = leaf_data_end(right);
2825 memmove_extent_buffer(right,
2826 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2827 BTRFS_LEAF_DATA_OFFSET + data_end,
2828 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
2829
2830 /* copy from the left data area */
2831 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
2832 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2833 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
2834 push_space);
2835
2836 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2837 btrfs_item_nr_offset(0),
2838 right_nritems * sizeof(struct btrfs_item));
2839
2840 /* copy the items from left to right */
2841 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2842 btrfs_item_nr_offset(left_nritems - push_items),
2843 push_items * sizeof(struct btrfs_item));
2844
2845 /* update the item pointers */
2846 btrfs_init_map_token(&token, right);
2847 right_nritems += push_items;
2848 btrfs_set_header_nritems(right, right_nritems);
2849 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2850 for (i = 0; i < right_nritems; i++) {
2851 item = btrfs_item_nr(i);
2852 push_space -= btrfs_token_item_size(&token, item);
2853 btrfs_set_token_item_offset(&token, item, push_space);
2854 }
2855
2856 left_nritems -= push_items;
2857 btrfs_set_header_nritems(left, left_nritems);
2858
2859 if (left_nritems)
2860 btrfs_mark_buffer_dirty(left);
2861 else
2862 btrfs_clean_tree_block(left);
2863
2864 btrfs_mark_buffer_dirty(right);
2865
2866 btrfs_item_key(right, &disk_key, 0);
2867 btrfs_set_node_key(upper, &disk_key, slot + 1);
2868 btrfs_mark_buffer_dirty(upper);
2869
2870 /* then fixup the leaf pointer in the path */
2871 if (path->slots[0] >= left_nritems) {
2872 path->slots[0] -= left_nritems;
2873 if (btrfs_header_nritems(path->nodes[0]) == 0)
2874 btrfs_clean_tree_block(path->nodes[0]);
2875 btrfs_tree_unlock(path->nodes[0]);
2876 free_extent_buffer(path->nodes[0]);
2877 path->nodes[0] = right;
2878 path->slots[1] += 1;
2879 } else {
2880 btrfs_tree_unlock(right);
2881 free_extent_buffer(right);
2882 }
2883 return 0;
2884
2885 out_unlock:
2886 btrfs_tree_unlock(right);
2887 free_extent_buffer(right);
2888 return 1;
2889 }
2890
2891 /*
2892 * push some data in the path leaf to the right, trying to free up at
2893 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2894 *
2895 * returns 1 if the push failed because the other node didn't have enough
2896 * room, 0 if everything worked out and < 0 if there were major errors.
2897 *
2898 * this will push starting from min_slot to the end of the leaf. It won't
2899 * push any slot lower than min_slot
2900 */
2901 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2902 *root, struct btrfs_path *path,
2903 int min_data_size, int data_size,
2904 int empty, u32 min_slot)
2905 {
2906 struct extent_buffer *left = path->nodes[0];
2907 struct extent_buffer *right;
2908 struct extent_buffer *upper;
2909 int slot;
2910 int free_space;
2911 u32 left_nritems;
2912 int ret;
2913
2914 if (!path->nodes[1])
2915 return 1;
2916
2917 slot = path->slots[1];
2918 upper = path->nodes[1];
2919 if (slot >= btrfs_header_nritems(upper) - 1)
2920 return 1;
2921
2922 btrfs_assert_tree_locked(path->nodes[1]);
2923
2924 right = btrfs_read_node_slot(upper, slot + 1);
2925 /*
2926 	 * If slot + 1 is not valid or we fail to read the right node,
2927 	 * it's no big deal, just return.
2928 */
2929 if (IS_ERR(right))
2930 return 1;
2931
2932 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2933
2934 free_space = btrfs_leaf_free_space(right);
2935 if (free_space < data_size)
2936 goto out_unlock;
2937
2938 /* cow and double check */
2939 ret = btrfs_cow_block(trans, root, right, upper,
2940 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
2941 if (ret)
2942 goto out_unlock;
2943
2944 free_space = btrfs_leaf_free_space(right);
2945 if (free_space < data_size)
2946 goto out_unlock;
2947
2948 left_nritems = btrfs_header_nritems(left);
2949 if (left_nritems == 0)
2950 goto out_unlock;
2951
2952 if (check_sibling_keys(left, right)) {
2953 ret = -EUCLEAN;
2954 btrfs_abort_transaction(trans, ret);
2955 btrfs_tree_unlock(right);
2956 free_extent_buffer(right);
2957 return ret;
2958 }
2959 if (path->slots[0] == left_nritems && !empty) {
2960 /* Key greater than all keys in the leaf, right neighbor has
2961 * enough room for it and we're not emptying our leaf to delete
2962 * it, therefore use right neighbor to insert the new item and
2963 * no need to touch/dirty our left leaf. */
2964 btrfs_tree_unlock(left);
2965 free_extent_buffer(left);
2966 path->nodes[0] = right;
2967 path->slots[0] = 0;
2968 path->slots[1]++;
2969 return 0;
2970 }
2971
2972 return __push_leaf_right(path, min_data_size, empty,
2973 right, free_space, left_nritems, min_slot);
2974 out_unlock:
2975 btrfs_tree_unlock(right);
2976 free_extent_buffer(right);
2977 return 1;
2978 }
2979
2980 /*
2981 * push some data in the path leaf to the left, trying to free up at
2982 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2983 *
2984 * max_slot can put a limit on how far into the leaf we'll push items. The
2985 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2986 * items
2987 */
2988 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2989 int empty, struct extent_buffer *left,
2990 int free_space, u32 right_nritems,
2991 u32 max_slot)
2992 {
2993 struct btrfs_fs_info *fs_info = left->fs_info;
2994 struct btrfs_disk_key disk_key;
2995 struct extent_buffer *right = path->nodes[0];
2996 int i;
2997 int push_space = 0;
2998 int push_items = 0;
2999 struct btrfs_item *item;
3000 u32 old_left_nritems;
3001 u32 nr;
3002 int ret = 0;
3003 u32 this_item_size;
3004 u32 old_left_item_size;
3005 struct btrfs_map_token token;
3006
3007 if (empty)
3008 nr = min(right_nritems, max_slot);
3009 else
3010 nr = min(right_nritems - 1, max_slot);
3011
3012 for (i = 0; i < nr; i++) {
3013 item = btrfs_item_nr(i);
3014
3015 if (!empty && push_items > 0) {
3016 if (path->slots[0] < i)
3017 break;
3018 if (path->slots[0] == i) {
3019 int space = btrfs_leaf_free_space(right);
3020
3021 if (space + push_space * 2 > free_space)
3022 break;
3023 }
3024 }
3025
3026 if (path->slots[0] == i)
3027 push_space += data_size;
3028
3029 this_item_size = btrfs_item_size(right, item);
3030 if (this_item_size + sizeof(*item) + push_space > free_space)
3031 break;
3032
3033 push_items++;
3034 push_space += this_item_size + sizeof(*item);
3035 }
3036
3037 if (push_items == 0) {
3038 ret = 1;
3039 goto out;
3040 }
3041 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3042
3043 /* push data from right to left */
3044 copy_extent_buffer(left, right,
3045 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3046 btrfs_item_nr_offset(0),
3047 push_items * sizeof(struct btrfs_item));
3048
3049 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3050 btrfs_item_offset_nr(right, push_items - 1);
3051
3052 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3053 leaf_data_end(left) - push_space,
3054 BTRFS_LEAF_DATA_OFFSET +
3055 btrfs_item_offset_nr(right, push_items - 1),
3056 push_space);
3057 old_left_nritems = btrfs_header_nritems(left);
3058 BUG_ON(old_left_nritems <= 0);
3059
3060 btrfs_init_map_token(&token, left);
3061 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3062 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3063 u32 ioff;
3064
3065 item = btrfs_item_nr(i);
3066
3067 ioff = btrfs_token_item_offset(&token, item);
3068 btrfs_set_token_item_offset(&token, item,
3069 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3070 }
3071 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3072
3073 /* fixup right node */
3074 if (push_items > right_nritems)
3075 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3076 right_nritems);
3077
3078 if (push_items < right_nritems) {
3079 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3080 leaf_data_end(right);
3081 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3082 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3083 BTRFS_LEAF_DATA_OFFSET +
3084 leaf_data_end(right), push_space);
3085
3086 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3087 btrfs_item_nr_offset(push_items),
3088 (btrfs_header_nritems(right) - push_items) *
3089 sizeof(struct btrfs_item));
3090 }
3091
3092 btrfs_init_map_token(&token, right);
3093 right_nritems -= push_items;
3094 btrfs_set_header_nritems(right, right_nritems);
3095 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3096 for (i = 0; i < right_nritems; i++) {
3097 item = btrfs_item_nr(i);
3098
3099 push_space = push_space - btrfs_token_item_size(&token, item);
3100 btrfs_set_token_item_offset(&token, item, push_space);
3101 }
3102
3103 btrfs_mark_buffer_dirty(left);
3104 if (right_nritems)
3105 btrfs_mark_buffer_dirty(right);
3106 else
3107 btrfs_clean_tree_block(right);
3108
3109 btrfs_item_key(right, &disk_key, 0);
3110 fixup_low_keys(path, &disk_key, 1);
3111
3112 /* then fixup the leaf pointer in the path */
3113 if (path->slots[0] < push_items) {
3114 path->slots[0] += old_left_nritems;
3115 btrfs_tree_unlock(path->nodes[0]);
3116 free_extent_buffer(path->nodes[0]);
3117 path->nodes[0] = left;
3118 path->slots[1] -= 1;
3119 } else {
3120 btrfs_tree_unlock(left);
3121 free_extent_buffer(left);
3122 path->slots[0] -= push_items;
3123 }
3124 BUG_ON(path->slots[0] < 0);
3125 return ret;
3126 out:
3127 btrfs_tree_unlock(left);
3128 free_extent_buffer(left);
3129 return ret;
3130 }
3131
3132 /*
3133 * push some data in the path leaf to the left, trying to free up at
3134 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3135 *
3136 * max_slot can put a limit on how far into the leaf we'll push items. The
3137 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3138 * items
3139 */
3140 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3141 *root, struct btrfs_path *path, int min_data_size,
3142 int data_size, int empty, u32 max_slot)
3143 {
3144 struct extent_buffer *right = path->nodes[0];
3145 struct extent_buffer *left;
3146 int slot;
3147 int free_space;
3148 u32 right_nritems;
3149 int ret = 0;
3150
3151 slot = path->slots[1];
3152 if (slot == 0)
3153 return 1;
3154 if (!path->nodes[1])
3155 return 1;
3156
3157 right_nritems = btrfs_header_nritems(right);
3158 if (right_nritems == 0)
3159 return 1;
3160
3161 btrfs_assert_tree_locked(path->nodes[1]);
3162
3163 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3164 /*
3165 	 * If slot - 1 is not valid or we fail to read the left node,
3166 	 * it's no big deal, just return.
3167 */
3168 if (IS_ERR(left))
3169 return 1;
3170
3171 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3172
3173 free_space = btrfs_leaf_free_space(left);
3174 if (free_space < data_size) {
3175 ret = 1;
3176 goto out;
3177 }
3178
3179 /* cow and double check */
3180 ret = btrfs_cow_block(trans, root, left,
3181 path->nodes[1], slot - 1, &left,
3182 BTRFS_NESTING_LEFT_COW);
3183 if (ret) {
3184 /* we hit -ENOSPC, but it isn't fatal here */
3185 if (ret == -ENOSPC)
3186 ret = 1;
3187 goto out;
3188 }
3189
3190 free_space = btrfs_leaf_free_space(left);
3191 if (free_space < data_size) {
3192 ret = 1;
3193 goto out;
3194 }
3195
3196 if (check_sibling_keys(left, right)) {
3197 ret = -EUCLEAN;
3198 btrfs_abort_transaction(trans, ret);
3199 goto out;
3200 }
3201 return __push_leaf_left(path, min_data_size,
3202 empty, left, free_space, right_nritems,
3203 max_slot);
3204 out:
3205 btrfs_tree_unlock(left);
3206 free_extent_buffer(left);
3207 return ret;
3208 }
3209
3210 /*
3211 * split the path's leaf in two, making sure there is at least data_size
3212 * available for the resulting leaf level of the path.
3213 */
3214 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3215 struct btrfs_path *path,
3216 struct extent_buffer *l,
3217 struct extent_buffer *right,
3218 int slot, int mid, int nritems)
3219 {
3220 struct btrfs_fs_info *fs_info = trans->fs_info;
3221 int data_copy_size;
3222 int rt_data_off;
3223 int i;
3224 struct btrfs_disk_key disk_key;
3225 struct btrfs_map_token token;
3226
3227 nritems = nritems - mid;
3228 btrfs_set_header_nritems(right, nritems);
3229 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3230
3231 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3232 btrfs_item_nr_offset(mid),
3233 nritems * sizeof(struct btrfs_item));
3234
3235 copy_extent_buffer(right, l,
3236 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3237 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3238 leaf_data_end(l), data_copy_size);
3239
3240 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3241
3242 btrfs_init_map_token(&token, right);
3243 for (i = 0; i < nritems; i++) {
3244 struct btrfs_item *item = btrfs_item_nr(i);
3245 u32 ioff;
3246
3247 ioff = btrfs_token_item_offset(&token, item);
3248 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3249 }
3250
3251 btrfs_set_header_nritems(l, mid);
3252 btrfs_item_key(right, &disk_key, 0);
3253 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3254
3255 btrfs_mark_buffer_dirty(right);
3256 btrfs_mark_buffer_dirty(l);
3257 BUG_ON(path->slots[0] != slot);
3258
3259 if (mid <= slot) {
3260 btrfs_tree_unlock(path->nodes[0]);
3261 free_extent_buffer(path->nodes[0]);
3262 path->nodes[0] = right;
3263 path->slots[0] -= mid;
3264 path->slots[1] += 1;
3265 } else {
3266 btrfs_tree_unlock(right);
3267 free_extent_buffer(right);
3268 }
3269
3270 BUG_ON(path->slots[0] < 0);
3271 }
3272
3273 /*
3274 * double splits happen when we need to insert a big item in the middle
3275 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3276 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3277 * A B C
3278 *
3279 * We avoid this by trying to push the items on either side of our target
3280 * into the adjacent leaves. If all goes well we can avoid the double split
3281 * completely.
3282 */
3283 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3284 struct btrfs_root *root,
3285 struct btrfs_path *path,
3286 int data_size)
3287 {
3288 int ret;
3289 int progress = 0;
3290 int slot;
3291 u32 nritems;
3292 int space_needed = data_size;
3293
3294 slot = path->slots[0];
3295 if (slot < btrfs_header_nritems(path->nodes[0]))
3296 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3297
3298 /*
3299 * try to push all the items after our slot into the
3300 * right leaf
3301 */
3302 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3303 if (ret < 0)
3304 return ret;
3305
3306 if (ret == 0)
3307 progress++;
3308
3309 nritems = btrfs_header_nritems(path->nodes[0]);
3310 /*
3311 * our goal is to get our slot at the start or end of a leaf. If
3312 * we've done so we're done
3313 */
3314 if (path->slots[0] == 0 || path->slots[0] == nritems)
3315 return 0;
3316
3317 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3318 return 0;
3319
3320 	/* try to push all the items before our slot into the left leaf */
3321 slot = path->slots[0];
3322 space_needed = data_size;
3323 if (slot > 0)
3324 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3325 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3326 if (ret < 0)
3327 return ret;
3328
3329 if (ret == 0)
3330 progress++;
3331
3332 if (progress)
3333 return 0;
3334 return 1;
3335 }
3336
3337 /*
3338 * split the path's leaf in two, making sure there is at least data_size
3339 * available for the resulting leaf level of the path.
3340 *
3341 * returns 0 if all went well and < 0 on failure.
3342 */
3343 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3344 struct btrfs_root *root,
3345 const struct btrfs_key *ins_key,
3346 struct btrfs_path *path, int data_size,
3347 int extend)
3348 {
3349 struct btrfs_disk_key disk_key;
3350 struct extent_buffer *l;
3351 u32 nritems;
3352 int mid;
3353 int slot;
3354 struct extent_buffer *right;
3355 struct btrfs_fs_info *fs_info = root->fs_info;
3356 int ret = 0;
3357 int wret;
3358 int split;
3359 int num_doubles = 0;
3360 int tried_avoid_double = 0;
3361
3362 l = path->nodes[0];
3363 slot = path->slots[0];
3364 if (extend && data_size + btrfs_item_size_nr(l, slot) +
3365 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3366 return -EOVERFLOW;
3367
3368 /* first try to make some room by pushing left and right */
3369 if (data_size && path->nodes[1]) {
3370 int space_needed = data_size;
3371
3372 if (slot < btrfs_header_nritems(l))
3373 space_needed -= btrfs_leaf_free_space(l);
3374
3375 wret = push_leaf_right(trans, root, path, space_needed,
3376 space_needed, 0, 0);
3377 if (wret < 0)
3378 return wret;
3379 if (wret) {
3380 space_needed = data_size;
3381 if (slot > 0)
3382 space_needed -= btrfs_leaf_free_space(l);
3383 wret = push_leaf_left(trans, root, path, space_needed,
3384 space_needed, 0, (u32)-1);
3385 if (wret < 0)
3386 return wret;
3387 }
3388 l = path->nodes[0];
3389
3390 /* did the pushes work? */
3391 if (btrfs_leaf_free_space(l) >= data_size)
3392 return 0;
3393 }
3394
3395 if (!path->nodes[1]) {
3396 ret = insert_new_root(trans, root, path, 1);
3397 if (ret)
3398 return ret;
3399 }
3400 again:
3401 split = 1;
3402 l = path->nodes[0];
3403 slot = path->slots[0];
3404 nritems = btrfs_header_nritems(l);
3405 mid = (nritems + 1) / 2;
3406
3407 if (mid <= slot) {
3408 if (nritems == 1 ||
3409 leaf_space_used(l, mid, nritems - mid) + data_size >
3410 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3411 if (slot >= nritems) {
3412 split = 0;
3413 } else {
3414 mid = slot;
3415 if (mid != nritems &&
3416 leaf_space_used(l, mid, nritems - mid) +
3417 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3418 if (data_size && !tried_avoid_double)
3419 goto push_for_double;
3420 split = 2;
3421 }
3422 }
3423 }
3424 } else {
3425 if (leaf_space_used(l, 0, mid) + data_size >
3426 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3427 if (!extend && data_size && slot == 0) {
3428 split = 0;
3429 } else if ((extend || !data_size) && slot == 0) {
3430 mid = 1;
3431 } else {
3432 mid = slot;
3433 if (mid != nritems &&
3434 leaf_space_used(l, mid, nritems - mid) +
3435 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3436 if (data_size && !tried_avoid_double)
3437 goto push_for_double;
3438 split = 2;
3439 }
3440 }
3441 }
3442 }
3443
3444 if (split == 0)
3445 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3446 else
3447 btrfs_item_key(l, &disk_key, mid);
3448
3449 /*
3450 	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3451 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3452 * subclasses, which is 8 at the time of this patch, and we've maxed it
3453 * out. In the future we could add a
3454 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3455 * use BTRFS_NESTING_NEW_ROOT.
3456 */
3457 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3458 &disk_key, 0, l->start, 0,
3459 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3460 BTRFS_NESTING_SPLIT);
3461 if (IS_ERR(right))
3462 return PTR_ERR(right);
3463
3464 root_add_used(root, fs_info->nodesize);
3465
3466 if (split == 0) {
3467 if (mid <= slot) {
3468 btrfs_set_header_nritems(right, 0);
3469 insert_ptr(trans, path, &disk_key,
3470 right->start, path->slots[1] + 1, 1);
3471 btrfs_tree_unlock(path->nodes[0]);
3472 free_extent_buffer(path->nodes[0]);
3473 path->nodes[0] = right;
3474 path->slots[0] = 0;
3475 path->slots[1] += 1;
3476 } else {
3477 btrfs_set_header_nritems(right, 0);
3478 insert_ptr(trans, path, &disk_key,
3479 right->start, path->slots[1], 1);
3480 btrfs_tree_unlock(path->nodes[0]);
3481 free_extent_buffer(path->nodes[0]);
3482 path->nodes[0] = right;
3483 path->slots[0] = 0;
3484 if (path->slots[1] == 0)
3485 fixup_low_keys(path, &disk_key, 1);
3486 }
3487 /*
3488 * We create a new leaf 'right' for the required ins_len and
3489 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3490 		 * the new item's content into it.
3491 */
3492 return ret;
3493 }
3494
3495 copy_for_split(trans, path, l, right, slot, mid, nritems);
3496
3497 if (split == 2) {
3498 BUG_ON(num_doubles != 0);
3499 num_doubles++;
3500 goto again;
3501 }
3502
3503 return 0;
3504
3505 push_for_double:
3506 push_for_double_split(trans, root, path, data_size);
3507 tried_avoid_double = 1;
3508 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3509 return 0;
3510 goto again;
3511 }
3512
3513 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3514 struct btrfs_root *root,
3515 struct btrfs_path *path, int ins_len)
3516 {
3517 struct btrfs_key key;
3518 struct extent_buffer *leaf;
3519 struct btrfs_file_extent_item *fi;
3520 u64 extent_len = 0;
3521 u32 item_size;
3522 int ret;
3523
3524 leaf = path->nodes[0];
3525 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3526
3527 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3528 key.type != BTRFS_EXTENT_CSUM_KEY);
3529
3530 if (btrfs_leaf_free_space(leaf) >= ins_len)
3531 return 0;
3532
3533 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3534 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3535 fi = btrfs_item_ptr(leaf, path->slots[0],
3536 struct btrfs_file_extent_item);
3537 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3538 }
3539 btrfs_release_path(path);
3540
3541 path->keep_locks = 1;
3542 path->search_for_split = 1;
3543 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3544 path->search_for_split = 0;
3545 if (ret > 0)
3546 ret = -EAGAIN;
3547 if (ret < 0)
3548 goto err;
3549
3550 ret = -EAGAIN;
3551 leaf = path->nodes[0];
3552 /* if our item isn't there, return now */
3553 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3554 goto err;
3555
3556 /* the leaf has changed, it now has room. return now */
3557 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3558 goto err;
3559
3560 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3561 fi = btrfs_item_ptr(leaf, path->slots[0],
3562 struct btrfs_file_extent_item);
3563 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3564 goto err;
3565 }
3566
3567 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3568 if (ret)
3569 goto err;
3570
3571 path->keep_locks = 0;
3572 btrfs_unlock_up_safe(path, 1);
3573 return 0;
3574 err:
3575 path->keep_locks = 0;
3576 return ret;
3577 }
3578
3579 static noinline int split_item(struct btrfs_path *path,
3580 const struct btrfs_key *new_key,
3581 unsigned long split_offset)
3582 {
3583 struct extent_buffer *leaf;
3584 struct btrfs_item *item;
3585 struct btrfs_item *new_item;
3586 int slot;
3587 char *buf;
3588 u32 nritems;
3589 u32 item_size;
3590 u32 orig_offset;
3591 struct btrfs_disk_key disk_key;
3592
3593 leaf = path->nodes[0];
3594 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3595
3596 item = btrfs_item_nr(path->slots[0]);
3597 orig_offset = btrfs_item_offset(leaf, item);
3598 item_size = btrfs_item_size(leaf, item);
3599
3600 buf = kmalloc(item_size, GFP_NOFS);
3601 if (!buf)
3602 return -ENOMEM;
3603
3604 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3605 path->slots[0]), item_size);
3606
3607 slot = path->slots[0] + 1;
3608 nritems = btrfs_header_nritems(leaf);
3609 if (slot != nritems) {
3610 /* shift the items */
3611 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3612 btrfs_item_nr_offset(slot),
3613 (nritems - slot) * sizeof(struct btrfs_item));
3614 }
3615
3616 btrfs_cpu_key_to_disk(&disk_key, new_key);
3617 btrfs_set_item_key(leaf, &disk_key, slot);
3618
3619 new_item = btrfs_item_nr(slot);
3620
3621 btrfs_set_item_offset(leaf, new_item, orig_offset);
3622 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3623
3624 btrfs_set_item_offset(leaf, item,
3625 orig_offset + item_size - split_offset);
3626 btrfs_set_item_size(leaf, item, split_offset);
3627
3628 btrfs_set_header_nritems(leaf, nritems + 1);
3629
3630 /* write the data for the start of the original item */
3631 write_extent_buffer(leaf, buf,
3632 btrfs_item_ptr_offset(leaf, path->slots[0]),
3633 split_offset);
3634
3635 /* write the data for the new item */
3636 write_extent_buffer(leaf, buf + split_offset,
3637 btrfs_item_ptr_offset(leaf, slot),
3638 item_size - split_offset);
3639 btrfs_mark_buffer_dirty(leaf);
3640
3641 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3642 kfree(buf);
3643 return 0;
3644 }
3645
3646 /*
3647 * This function splits a single item into two items,
3648 * giving 'new_key' to the new item and splitting the
3649 * old one at split_offset (from the start of the item).
3650 *
3651 * The path may be released by this operation. After
3652 * the split, the path is pointing to the old item. The
3653 * new item is going to be in the same node as the old one.
3654 *
3655 * Note, the item being split must be small enough to live alone on
3656 * a tree block with room for one extra struct btrfs_item
3657 *
3658 * This allows us to split the item in place, keeping a lock on the
3659 * leaf the entire time.
3660 */
3661 int btrfs_split_item(struct btrfs_trans_handle *trans,
3662 struct btrfs_root *root,
3663 struct btrfs_path *path,
3664 const struct btrfs_key *new_key,
3665 unsigned long split_offset)
3666 {
3667 int ret;
3668 ret = setup_leaf_for_split(trans, root, path,
3669 sizeof(struct btrfs_item));
3670 if (ret)
3671 return ret;
3672
3673 ret = split_item(path, new_key, split_offset);
3674 return ret;
3675 }
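
/*
 * Illustrative sketch, not part of the original file: a typical caller
 * has a path positioned on an item and wants the bytes from
 * 'split_pos' onwards to live under their own key. 'trans', 'root',
 * 'path' and 'split_pos' are assumptions for the example.
 *
 *     struct btrfs_key new_key;
 *
 *     btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *     new_key.offset += split_pos;
 *     ret = btrfs_split_item(trans, root, path, &new_key, split_pos);
 *     if (ret)
 *         goto out;
 *     (path->slots[0] still points to the shortened original item)
 */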
3676
3677 /*
3678 * This function duplicates an item, giving 'new_key' to the new item.
3679 * It guarantees both items live in the same tree leaf and the new item
3680 * is contiguous with the original item.
3681 *
3682 * This allows us to split a file extent in place, keeping a lock on the
3683 * leaf the entire time.
3684 */
3685 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 const struct btrfs_key *new_key)
3689 {
3690 struct extent_buffer *leaf;
3691 int ret;
3692 u32 item_size;
3693
3694 leaf = path->nodes[0];
3695 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3696 ret = setup_leaf_for_split(trans, root, path,
3697 item_size + sizeof(struct btrfs_item));
3698 if (ret)
3699 return ret;
3700
3701 path->slots[0]++;
3702 setup_items_for_insert(root, path, new_key, &item_size, 1);
3703 leaf = path->nodes[0];
3704 memcpy_extent_buffer(leaf,
3705 btrfs_item_ptr_offset(leaf, path->slots[0]),
3706 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3707 item_size);
3708 return 0;
3709 }
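
/*
 * Illustrative sketch, not part of the original file: duplicating a
 * file extent item so the copy can then be adjusted to describe the
 * second half of a split extent. 'trans', 'root', 'path' and
 * 'split_offset' are assumptions for the example.
 *
 *     struct btrfs_key new_key;
 *
 *     btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *     new_key.offset = split_offset;
 *     ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *     if (ret)
 *         return ret;
 *     (path->slots[0] now points at the new copy)
 */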
3710
3711 /*
3712 * make the item pointed to by the path smaller. new_size indicates
3713 * how small to make it, and from_end tells us if we just chop bytes
3714 * off the end of the item or if we shift the item to chop bytes off
3715 * the front.
3716 */
3717 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3718 {
3719 int slot;
3720 struct extent_buffer *leaf;
3721 struct btrfs_item *item;
3722 u32 nritems;
3723 unsigned int data_end;
3724 unsigned int old_data_start;
3725 unsigned int old_size;
3726 unsigned int size_diff;
3727 int i;
3728 struct btrfs_map_token token;
3729
3730 leaf = path->nodes[0];
3731 slot = path->slots[0];
3732
3733 old_size = btrfs_item_size_nr(leaf, slot);
3734 if (old_size == new_size)
3735 return;
3736
3737 nritems = btrfs_header_nritems(leaf);
3738 data_end = leaf_data_end(leaf);
3739
3740 old_data_start = btrfs_item_offset_nr(leaf, slot);
3741
3742 size_diff = old_size - new_size;
3743
3744 BUG_ON(slot < 0);
3745 BUG_ON(slot >= nritems);
3746
3747 /*
3748 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3749 */
3750 /* first correct the data pointers */
3751 btrfs_init_map_token(&token, leaf);
3752 for (i = slot; i < nritems; i++) {
3753 u32 ioff;
3754 item = btrfs_item_nr(i);
3755
3756 ioff = btrfs_token_item_offset(&token, item);
3757 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3758 }
3759
3760 /* shift the data */
3761 if (from_end) {
3762 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3763 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3764 data_end, old_data_start + new_size - data_end);
3765 } else {
3766 struct btrfs_disk_key disk_key;
3767 u64 offset;
3768
3769 btrfs_item_key(leaf, &disk_key, slot);
3770
3771 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3772 unsigned long ptr;
3773 struct btrfs_file_extent_item *fi;
3774
3775 fi = btrfs_item_ptr(leaf, slot,
3776 struct btrfs_file_extent_item);
3777 fi = (struct btrfs_file_extent_item *)(
3778 (unsigned long)fi - size_diff);
3779
3780 if (btrfs_file_extent_type(leaf, fi) ==
3781 BTRFS_FILE_EXTENT_INLINE) {
3782 ptr = btrfs_item_ptr_offset(leaf, slot);
3783 memmove_extent_buffer(leaf, ptr,
3784 (unsigned long)fi,
3785 BTRFS_FILE_EXTENT_INLINE_DATA_START);
3786 }
3787 }
3788
3789 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3790 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3791 data_end, old_data_start - data_end);
3792
3793 offset = btrfs_disk_key_offset(&disk_key);
3794 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3795 btrfs_set_item_key(leaf, &disk_key, slot);
3796 if (slot == 0)
3797 fixup_low_keys(path, &disk_key, 1);
3798 }
3799
3800 item = btrfs_item_nr(slot);
3801 btrfs_set_item_size(leaf, item, new_size);
3802 btrfs_mark_buffer_dirty(leaf);
3803
3804 if (btrfs_leaf_free_space(leaf) < 0) {
3805 btrfs_print_leaf(leaf);
3806 BUG();
3807 }
3808 }
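
/*
 * Illustrative sketch, not part of the original file: shrinking the
 * item under the path by 'drop' bytes. With from_end == 1 the tail is
 * chopped off; with from_end == 0 the front is dropped and the item
 * key offset is advanced by the same amount. 'drop' is an assumption
 * for the example.
 *
 *     struct extent_buffer *leaf = path->nodes[0];
 *     u32 new_size = btrfs_item_size_nr(leaf, path->slots[0]) - drop;
 *
 *     btrfs_truncate_item(path, new_size, 1);
 */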
3809
3810 /*
3811 * make the item pointed to by the path bigger, data_size is the added size.
3812 */
3813 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3814 {
3815 int slot;
3816 struct extent_buffer *leaf;
3817 struct btrfs_item *item;
3818 u32 nritems;
3819 unsigned int data_end;
3820 unsigned int old_data;
3821 unsigned int old_size;
3822 int i;
3823 struct btrfs_map_token token;
3824
3825 leaf = path->nodes[0];
3826
3827 nritems = btrfs_header_nritems(leaf);
3828 data_end = leaf_data_end(leaf);
3829
3830 if (btrfs_leaf_free_space(leaf) < data_size) {
3831 btrfs_print_leaf(leaf);
3832 BUG();
3833 }
3834 slot = path->slots[0];
3835 old_data = btrfs_item_end_nr(leaf, slot);
3836
3837 BUG_ON(slot < 0);
3838 if (slot >= nritems) {
3839 btrfs_print_leaf(leaf);
3840 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3841 slot, nritems);
3842 BUG();
3843 }
3844
3845 /*
3846 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3847 */
3848 /* first correct the data pointers */
3849 btrfs_init_map_token(&token, leaf);
3850 for (i = slot; i < nritems; i++) {
3851 u32 ioff;
3852 item = btrfs_item_nr(i);
3853
3854 ioff = btrfs_token_item_offset(&token, item);
3855 btrfs_set_token_item_offset(&token, item, ioff - data_size);
3856 }
3857
3858 /* shift the data */
3859 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3860 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3861 data_end, old_data - data_end);
3862
3863 data_end = old_data;
3864 old_size = btrfs_item_size_nr(leaf, slot);
3865 item = btrfs_item_nr(slot);
3866 btrfs_set_item_size(leaf, item, old_size + data_size);
3867 btrfs_mark_buffer_dirty(leaf);
3868
3869 if (btrfs_leaf_free_space(leaf) < 0) {
3870 btrfs_print_leaf(leaf);
3871 BUG();
3872 }
3873 }
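
/*
 * Illustrative sketch, not part of the original file: growing an
 * existing item. The caller must first make sure the leaf has room,
 * typically by passing the extra size as ins_len to
 * btrfs_search_slot(), which splits the leaf if needed. 'grow' and
 * 'key' are assumptions for the example.
 *
 *     ret = btrfs_search_slot(trans, root, &key, path, grow, 1);
 *     if (ret < 0)
 *         return ret;
 *     (ret == 0 means the key was found at path->slots[0])
 *     btrfs_extend_item(path, grow);
 */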
3874
3875 /**
3876 * setup_items_for_insert - Helper called before inserting one or more items
3877 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3878 * in a function that doesn't call btrfs_search_slot
3879 *
3880 * @root: root we are inserting items to
3881 * @path: points to the leaf/slot where we are going to insert new items
3882 * @cpu_key: array of keys for items to be inserted
3883 * @data_size: size of the body of each item we are going to insert
3884 * @nr: size of @cpu_key/@data_size arrays
3885 */
3886 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3887 const struct btrfs_key *cpu_key, u32 *data_size,
3888 int nr)
3889 {
3890 struct btrfs_fs_info *fs_info = root->fs_info;
3891 struct btrfs_item *item;
3892 int i;
3893 u32 nritems;
3894 unsigned int data_end;
3895 struct btrfs_disk_key disk_key;
3896 struct extent_buffer *leaf;
3897 int slot;
3898 struct btrfs_map_token token;
3899 u32 total_size;
3900 u32 total_data = 0;
3901
3902 for (i = 0; i < nr; i++)
3903 total_data += data_size[i];
3904 total_size = total_data + (nr * sizeof(struct btrfs_item));
3905
3906 if (path->slots[0] == 0) {
3907 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3908 fixup_low_keys(path, &disk_key, 1);
3909 }
3910 btrfs_unlock_up_safe(path, 1);
3911
3912 leaf = path->nodes[0];
3913 slot = path->slots[0];
3914
3915 nritems = btrfs_header_nritems(leaf);
3916 data_end = leaf_data_end(leaf);
3917
3918 if (btrfs_leaf_free_space(leaf) < total_size) {
3919 btrfs_print_leaf(leaf);
3920 btrfs_crit(fs_info, "not enough freespace need %u have %d",
3921 total_size, btrfs_leaf_free_space(leaf));
3922 BUG();
3923 }
3924
3925 btrfs_init_map_token(&token, leaf);
3926 if (slot != nritems) {
3927 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3928
3929 if (old_data < data_end) {
3930 btrfs_print_leaf(leaf);
3931 btrfs_crit(fs_info,
3932 "item at slot %d with data offset %u beyond data end of leaf %u",
3933 slot, old_data, data_end);
3934 BUG();
3935 }
3936 /*
3937 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3938 */
3939 /* first correct the data pointers */
3940 for (i = slot; i < nritems; i++) {
3941 u32 ioff;
3942
3943 item = btrfs_item_nr(i);
3944 ioff = btrfs_token_item_offset(&token, item);
3945 btrfs_set_token_item_offset(&token, item,
3946 ioff - total_data);
3947 }
3948 /* shift the items */
3949 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3950 btrfs_item_nr_offset(slot),
3951 (nritems - slot) * sizeof(struct btrfs_item));
3952
3953 /* shift the data */
3954 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3955 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3956 data_end, old_data - data_end);
3957 data_end = old_data;
3958 }
3959
3960 /* setup the item for the new data */
3961 for (i = 0; i < nr; i++) {
3962 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3963 btrfs_set_item_key(leaf, &disk_key, slot + i);
3964 item = btrfs_item_nr(slot + i);
3965 data_end -= data_size[i];
3966 btrfs_set_token_item_offset(&token, item, data_end);
3967 btrfs_set_token_item_size(&token, item, data_size[i]);
3968 }
3969
3970 btrfs_set_header_nritems(leaf, nritems + nr);
3971 btrfs_mark_buffer_dirty(leaf);
3972
3973 if (btrfs_leaf_free_space(leaf) < 0) {
3974 btrfs_print_leaf(leaf);
3975 BUG();
3976 }
3977 }
3978
3979 /*
3980 * Given a key and some data, insert items into the tree.
3981 * This does all the path init required, making room in the tree if needed.
3982 */
3983 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3984 struct btrfs_root *root,
3985 struct btrfs_path *path,
3986 const struct btrfs_key *cpu_key, u32 *data_size,
3987 int nr)
3988 {
3989 int ret = 0;
3990 int slot;
3991 int i;
3992 u32 total_size = 0;
3993 u32 total_data = 0;
3994
3995 for (i = 0; i < nr; i++)
3996 total_data += data_size[i];
3997
3998 total_size = total_data + (nr * sizeof(struct btrfs_item));
3999 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4000 if (ret == 0)
4001 return -EEXIST;
4002 if (ret < 0)
4003 return ret;
4004
4005 slot = path->slots[0];
4006 BUG_ON(slot < 0);
4007
4008 setup_items_for_insert(root, path, cpu_key, data_size, nr);
4009 return 0;
4010 }
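
/*
 * Illustrative sketch, not part of the original file: batch-inserting
 * two empty items with a single tree search, then filling in the item
 * bodies. The keys and sizes are assumptions for the example.
 *
 *     struct btrfs_key keys[2];
 *     u32 sizes[2] = { size_a, size_b };
 *
 *     (fill in keys[0] and keys[1] in sorted order)
 *     ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
 *     if (ret)
 *         return ret;
 *     (write the bodies at path->slots[0] and path->slots[0] + 1 with
 *      write_extent_buffer(), then btrfs_mark_buffer_dirty())
 */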
4011
4012 /*
4013 * Given a key and some data, insert an item into the tree.
4014 * This does all the path init required, making room in the tree if needed.
4015 */
4016 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4017 const struct btrfs_key *cpu_key, void *data,
4018 u32 data_size)
4019 {
4020 int ret = 0;
4021 struct btrfs_path *path;
4022 struct extent_buffer *leaf;
4023 unsigned long ptr;
4024
4025 path = btrfs_alloc_path();
4026 if (!path)
4027 return -ENOMEM;
4028 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4029 if (!ret) {
4030 leaf = path->nodes[0];
4031 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4032 write_extent_buffer(leaf, data, ptr, data_size);
4033 btrfs_mark_buffer_dirty(leaf);
4034 }
4035 btrfs_free_path(path);
4036 return ret;
4037 }
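
/*
 * Illustrative sketch, not part of the original file: one-shot
 * insertion of a fixed-size item body; the helper allocates and frees
 * the path internally. 'key' and 'item' are assumptions for the
 * example.
 *
 *     ret = btrfs_insert_item(trans, root, &key, &item, sizeof(item));
 *     if (ret)
 *         btrfs_abort_transaction(trans, ret);
 */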
4038
4039 /*
4040 * delete the pointer from a given node.
4041 *
4042 * the tree should have been previously balanced so the deletion does not
4043 * empty a node.
4044 */
4045 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4046 int level, int slot)
4047 {
4048 struct extent_buffer *parent = path->nodes[level];
4049 u32 nritems;
4050 int ret;
4051
4052 nritems = btrfs_header_nritems(parent);
4053 if (slot != nritems - 1) {
4054 if (level) {
4055 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4056 slot + 1, nritems - slot - 1);
4057 BUG_ON(ret < 0);
4058 }
4059 memmove_extent_buffer(parent,
4060 btrfs_node_key_ptr_offset(slot),
4061 btrfs_node_key_ptr_offset(slot + 1),
4062 sizeof(struct btrfs_key_ptr) *
4063 (nritems - slot - 1));
4064 } else if (level) {
4065 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4066 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
4067 BUG_ON(ret < 0);
4068 }
4069
4070 nritems--;
4071 btrfs_set_header_nritems(parent, nritems);
4072 if (nritems == 0 && parent == root->node) {
4073 BUG_ON(btrfs_header_level(root->node) != 1);
4074 /* just turn the root into a leaf and break */
4075 btrfs_set_header_level(root->node, 0);
4076 } else if (slot == 0) {
4077 struct btrfs_disk_key disk_key;
4078
4079 btrfs_node_key(parent, &disk_key, 0);
4080 fixup_low_keys(path, &disk_key, level + 1);
4081 }
4082 btrfs_mark_buffer_dirty(parent);
4083 }
4084
4085 /*
4086 * a helper function to delete the leaf pointed to by path->slots[1] and
4087 * path->nodes[1].
4088 *
4089 * This deletes the pointer in path->nodes[1] and frees the leaf
4090 * block extent.
4091 *
4092 * The path must have already been setup for deleting the leaf, including
4093 * all the proper balancing. path->nodes[1] must be locked.
4094 */
4095 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4096 struct btrfs_root *root,
4097 struct btrfs_path *path,
4098 struct extent_buffer *leaf)
4099 {
4100 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4101 del_ptr(root, path, 1, path->slots[1]);
4102
4103 /*
4104 * btrfs_free_extent is expensive, we want to make sure we
4105 * aren't holding any locks when we call it
4106 */
4107 btrfs_unlock_up_safe(path, 0);
4108
4109 root_sub_used(root, leaf->len);
4110
4111 atomic_inc(&leaf->refs);
4112 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4113 free_extent_buffer_stale(leaf);
4114 }
4115 /*
4116 * delete the item at the leaf level in path. If that empties
4117 * the leaf, remove it from the tree
4118 */
4119 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4120 struct btrfs_path *path, int slot, int nr)
4121 {
4122 struct btrfs_fs_info *fs_info = root->fs_info;
4123 struct extent_buffer *leaf;
4124 struct btrfs_item *item;
4125 u32 last_off;
4126 u32 dsize = 0;
4127 int ret = 0;
4128 int wret;
4129 int i;
4130 u32 nritems;
4131
4132 leaf = path->nodes[0];
4133 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4134
4135 for (i = 0; i < nr; i++)
4136 dsize += btrfs_item_size_nr(leaf, slot + i);
4137
4138 nritems = btrfs_header_nritems(leaf);
4139
4140 if (slot + nr != nritems) {
4141 int data_end = leaf_data_end(leaf);
4142 struct btrfs_map_token token;
4143
4144 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4145 data_end + dsize,
4146 BTRFS_LEAF_DATA_OFFSET + data_end,
4147 last_off - data_end);
4148
4149 btrfs_init_map_token(&token, leaf);
4150 for (i = slot + nr; i < nritems; i++) {
4151 u32 ioff;
4152
4153 item = btrfs_item_nr(i);
4154 ioff = btrfs_token_item_offset(&token, item);
4155 btrfs_set_token_item_offset(&token, item, ioff + dsize);
4156 }
4157
4158 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4159 btrfs_item_nr_offset(slot + nr),
4160 sizeof(struct btrfs_item) *
4161 (nritems - slot - nr));
4162 }
4163 btrfs_set_header_nritems(leaf, nritems - nr);
4164 nritems -= nr;
4165
4166 /* delete the leaf if we've emptied it */
4167 if (nritems == 0) {
4168 if (leaf == root->node) {
4169 btrfs_set_header_level(leaf, 0);
4170 } else {
4171 btrfs_clean_tree_block(leaf);
4172 btrfs_del_leaf(trans, root, path, leaf);
4173 }
4174 } else {
4175 int used = leaf_space_used(leaf, 0, nritems);
4176 if (slot == 0) {
4177 struct btrfs_disk_key disk_key;
4178
4179 btrfs_item_key(leaf, &disk_key, 0);
4180 fixup_low_keys(path, &disk_key, 1);
4181 }
4182
4183 /* delete the leaf if it is mostly empty */
4184 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4185 /* push_leaf_left fixes the path.
4186 * make sure the path still points to our leaf
4187 * for possible call to del_ptr below
4188 */
4189 slot = path->slots[1];
4190 atomic_inc(&leaf->refs);
4191
4192 wret = push_leaf_left(trans, root, path, 1, 1,
4193 1, (u32)-1);
4194 if (wret < 0 && wret != -ENOSPC)
4195 ret = wret;
4196
4197 if (path->nodes[0] == leaf &&
4198 btrfs_header_nritems(leaf)) {
4199 wret = push_leaf_right(trans, root, path, 1,
4200 1, 1, 0);
4201 if (wret < 0 && wret != -ENOSPC)
4202 ret = wret;
4203 }
4204
4205 if (btrfs_header_nritems(leaf) == 0) {
4206 path->slots[1] = slot;
4207 btrfs_del_leaf(trans, root, path, leaf);
4208 free_extent_buffer(leaf);
4209 ret = 0;
4210 } else {
4211 /* if we're still in the path, make sure
4212 * we're dirty. Otherwise, one of the
4213 * push_leaf functions must have already
4214 * dirtied this buffer
4215 */
4216 if (path->nodes[0] == leaf)
4217 btrfs_mark_buffer_dirty(leaf);
4218 free_extent_buffer(leaf);
4219 }
4220 } else {
4221 btrfs_mark_buffer_dirty(leaf);
4222 }
4223 }
4224 return ret;
4225 }
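
/*
 * Illustrative sketch, not part of the original file: the common
 * single-item case goes through the btrfs_del_item() inline from
 * ctree.h, which is btrfs_del_items(trans, root, path, path->slots[0], 1).
 * Passing ins_len == -1 to btrfs_search_slot() pre-balances the leaf
 * for the deletion. 'key' is an assumption for the example.
 *
 *     ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *     if (ret < 0)
 *         return ret;
 *     if (ret > 0)
 *         return -ENOENT;
 *     ret = btrfs_del_item(trans, root, path);
 */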
4226
4227 /*
4228 * search the tree again to find a leaf with lesser keys
4229 * returns 0 if it found something or 1 if there are no lesser leaves.
4230 * returns < 0 on io errors.
4231 *
4232 * This may release the path, and so you may lose any locks held at the
4233 * time you call it.
4234 */
4235 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4236 {
4237 struct btrfs_key key;
4238 struct btrfs_key orig_key;
4239 struct btrfs_disk_key found_key;
4240 int ret;
4241
4242 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4243 orig_key = key;
4244
4245 if (key.offset > 0) {
4246 key.offset--;
4247 } else if (key.type > 0) {
4248 key.type--;
4249 key.offset = (u64)-1;
4250 } else if (key.objectid > 0) {
4251 key.objectid--;
4252 key.type = (u8)-1;
4253 key.offset = (u64)-1;
4254 } else {
4255 return 1;
4256 }
4257
4258 btrfs_release_path(path);
4259 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4260 if (ret <= 0)
4261 return ret;
4262
4263 /*
4264 * Previous key not found. Even if we were at slot 0 of the leaf we had
4265 * before releasing the path and calling btrfs_search_slot(), we now may
4266 * be in a slot pointing to the same original key - this can happen if
4267 * after we released the path, one or more items were moved from a
4268 * sibling leaf into the front of the leaf we had due to an insertion
4269 * (see push_leaf_right()).
4270 * If we hit this case and our slot is > 0, just decrement the slot
4271 * so that the caller does not process the same key again; processing
4272 * it again may or may not break the caller, depending on its logic.
4273 */
4274 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
4275 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
4276 ret = comp_keys(&found_key, &orig_key);
4277 if (ret == 0) {
4278 if (path->slots[0] > 0) {
4279 path->slots[0]--;
4280 return 0;
4281 }
4282 /*
4283 * At slot 0, same key as before, it means orig_key is
4284 * the lowest, leftmost, key in the tree. We're done.
4285 */
4286 return 1;
4287 }
4288 }
4289
4290 btrfs_item_key(path->nodes[0], &found_key, 0);
4291 ret = comp_keys(&found_key, &key);
4292 /*
4293 * We might have had an item with the previous key in the tree right
4294 * before we released our path. And after we released our path, that
4295 * item might have been pushed to the first slot (0) of the leaf we
4296 * were holding due to a tree balance. Alternatively, an item with the
4297 * previous key can exist as the only element of a leaf (big fat item).
4298 * Therefore account for these 2 cases, so that our callers (like
4299 * btrfs_previous_item) don't miss an existing item with a key matching
4300 * the previous key we computed above.
4301 */
4302 if (ret <= 0)
4303 return 0;
4304 return 1;
4305 }
4306
4307 /*
4308 * A helper function to walk down the tree starting at min_key, looking
4309 * for nodes or leaves that have a minimum transaction id.
4310 * This is used by the btree defrag code and by tree logging.
4311 *
4312 * This does not cow, but it does stuff the starting key it finds back
4313 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4314 * key and get a writable path.
4315 *
4316 * This honors path->lowest_level to prevent descent past a given level
4317 * of the tree.
4318 *
4319 * min_trans indicates the oldest transaction that you are interested
4320 * in walking through. Any nodes or leaves older than min_trans are
4321 * skipped over (without reading them).
4322 *
4323 * returns zero if something useful was found, < 0 on error and 1 if there
4324 * was nothing in the tree that matched the search criteria.
4325 */
4326 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4327 struct btrfs_path *path,
4328 u64 min_trans)
4329 {
4330 struct extent_buffer *cur;
4331 struct btrfs_key found_key;
4332 int slot;
4333 int sret;
4334 u32 nritems;
4335 int level;
4336 int ret = 1;
4337 int keep_locks = path->keep_locks;
4338
4339 path->keep_locks = 1;
4340 again:
4341 cur = btrfs_read_lock_root_node(root);
4342 level = btrfs_header_level(cur);
4343 WARN_ON(path->nodes[level]);
4344 path->nodes[level] = cur;
4345 path->locks[level] = BTRFS_READ_LOCK;
4346
4347 if (btrfs_header_generation(cur) < min_trans) {
4348 ret = 1;
4349 goto out;
4350 }
4351 while (1) {
4352 nritems = btrfs_header_nritems(cur);
4353 level = btrfs_header_level(cur);
4354 sret = btrfs_bin_search(cur, min_key, &slot);
4355 if (sret < 0) {
4356 ret = sret;
4357 goto out;
4358 }
4359
4360 /* at the lowest level, we're done, setup the path and exit */
4361 if (level == path->lowest_level) {
4362 if (slot >= nritems)
4363 goto find_next_key;
4364 ret = 0;
4365 path->slots[level] = slot;
4366 btrfs_item_key_to_cpu(cur, &found_key, slot);
4367 goto out;
4368 }
4369 if (sret && slot > 0)
4370 slot--;
4371 /*
4372 * check this node pointer against the min_trans parameters.
4373 * If it is too old, skip to the next one.
4374 */
4375 while (slot < nritems) {
4376 u64 gen;
4377
4378 gen = btrfs_node_ptr_generation(cur, slot);
4379 if (gen < min_trans) {
4380 slot++;
4381 continue;
4382 }
4383 break;
4384 }
4385 find_next_key:
4386 /*
4387 * we didn't find a candidate key in this node, walk forward
4388 * and find another one
4389 */
4390 if (slot >= nritems) {
4391 path->slots[level] = slot;
4392 sret = btrfs_find_next_key(root, path, min_key, level,
4393 min_trans);
4394 if (sret == 0) {
4395 btrfs_release_path(path);
4396 goto again;
4397 } else {
4398 goto out;
4399 }
4400 }
4401 /* save our key for returning back */
4402 btrfs_node_key_to_cpu(cur, &found_key, slot);
4403 path->slots[level] = slot;
4404 if (level == path->lowest_level) {
4405 ret = 0;
4406 goto out;
4407 }
4408 cur = btrfs_read_node_slot(cur, slot);
4409 if (IS_ERR(cur)) {
4410 ret = PTR_ERR(cur);
4411 goto out;
4412 }
4413
4414 btrfs_tree_read_lock(cur);
4415
4416 path->locks[level - 1] = BTRFS_READ_LOCK;
4417 path->nodes[level - 1] = cur;
4418 unlock_up(path, level, 1, 0, NULL);
4419 }
4420 out:
4421 path->keep_locks = keep_locks;
4422 if (ret == 0) {
4423 btrfs_unlock_up_safe(path, path->lowest_level + 1);
4424 memcpy(min_key, &found_key, sizeof(found_key));
4425 }
4426 return ret;
4427 }
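
/*
 * Illustrative sketch, not part of the original file: scanning every
 * leaf item in blocks newer than 'min_trans', roughly how the defrag
 * and tree-log walkers drive this helper. Key advancement is
 * simplified here.
 *
 *     struct btrfs_key min_key = { 0 };
 *
 *     while (1) {
 *         ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *         if (ret)
 *             break;    (1 means exhausted, < 0 means error)
 *         (process the item at path->slots[0])
 *         btrfs_release_path(path);
 *         if (min_key.offset == (u64)-1)
 *             break;
 *         min_key.offset++;
 *     }
 */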
4428
4429 /*
4430 * this is similar to btrfs_next_leaf, but does not try to preserve
4431 * and fix up the path. It looks for and returns the next key in the
4432 * tree based on the current path and the min_trans parameters.
4433 *
4434 * 0 is returned if another key is found, < 0 if there are any errors
4435 * and 1 is returned if there are no higher keys in the tree
4436 *
4437 * path->keep_locks should be set to 1 on the search made before
4438 * calling this function.
4439 */
4440 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4441 struct btrfs_key *key, int level, u64 min_trans)
4442 {
4443 int slot;
4444 struct extent_buffer *c;
4445
4446 WARN_ON(!path->keep_locks && !path->skip_locking);
4447 while (level < BTRFS_MAX_LEVEL) {
4448 if (!path->nodes[level])
4449 return 1;
4450
4451 slot = path->slots[level] + 1;
4452 c = path->nodes[level];
4453 next:
4454 if (slot >= btrfs_header_nritems(c)) {
4455 int ret;
4456 int orig_lowest;
4457 struct btrfs_key cur_key;
4458 if (level + 1 >= BTRFS_MAX_LEVEL ||
4459 !path->nodes[level + 1])
4460 return 1;
4461
4462 if (path->locks[level + 1] || path->skip_locking) {
4463 level++;
4464 continue;
4465 }
4466
4467 slot = btrfs_header_nritems(c) - 1;
4468 if (level == 0)
4469 btrfs_item_key_to_cpu(c, &cur_key, slot);
4470 else
4471 btrfs_node_key_to_cpu(c, &cur_key, slot);
4472
4473 orig_lowest = path->lowest_level;
4474 btrfs_release_path(path);
4475 path->lowest_level = level;
4476 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4477 0, 0);
4478 path->lowest_level = orig_lowest;
4479 if (ret < 0)
4480 return ret;
4481
4482 c = path->nodes[level];
4483 slot = path->slots[level];
4484 if (ret == 0)
4485 slot++;
4486 goto next;
4487 }
4488
4489 if (level == 0)
4490 btrfs_item_key_to_cpu(c, key, slot);
4491 else {
4492 u64 gen = btrfs_node_ptr_generation(c, slot);
4493
4494 if (gen < min_trans) {
4495 slot++;
4496 goto next;
4497 }
4498 btrfs_node_key_to_cpu(c, key, slot);
4499 }
4500 return 0;
4501 }
4502 return 1;
4503 }
4504
4505 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4506 u64 time_seq)
4507 {
4508 int slot;
4509 int level;
4510 struct extent_buffer *c;
4511 struct extent_buffer *next;
4512 struct btrfs_fs_info *fs_info = root->fs_info;
4513 struct btrfs_key key;
4514 bool need_commit_sem = false;
4515 u32 nritems;
4516 int ret;
4517 int i;
4518
4519 nritems = btrfs_header_nritems(path->nodes[0]);
4520 if (nritems == 0)
4521 return 1;
4522
4523 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4524 again:
4525 level = 1;
4526 next = NULL;
4527 btrfs_release_path(path);
4528
4529 path->keep_locks = 1;
4530
4531 if (time_seq) {
4532 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4533 } else {
4534 if (path->need_commit_sem) {
4535 path->need_commit_sem = 0;
4536 need_commit_sem = true;
4537 down_read(&fs_info->commit_root_sem);
4538 }
4539 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4540 }
4541 path->keep_locks = 0;
4542
4543 if (ret < 0)
4544 goto done;
4545
4546 nritems = btrfs_header_nritems(path->nodes[0]);
4547 /*
4548 * by releasing the path above we dropped all our locks. A balance
4549 * could have added more items next to the key that used to be
4550 * at the very end of the block. So, check again here and
4551 * advance the path if there are now more items available.
4552 */
4553 if (nritems > 0 && path->slots[0] < nritems - 1) {
4554 if (ret == 0)
4555 path->slots[0]++;
4556 ret = 0;
4557 goto done;
4558 }
4559 /*
4560 * So the above check misses one case:
4561 * - after releasing the path above, someone has removed the item that
4562 * used to be at the very end of the block, and balance between leaves
4563 * gets another one with bigger key.offset to replace it.
4564 *
4565 * This one should be returned as well, or we can get leaf corruption
4566 * later (especially in __btrfs_drop_extents()).
4567 *
4568 * And a bit more explanation about this check,
4569 * with ret > 0, the key isn't found, the path points to the slot
4570 * where it should be inserted, so the path->slots[0] item must be the
4571 * bigger one.
4572 */
4573 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4574 ret = 0;
4575 goto done;
4576 }
4577
4578 while (level < BTRFS_MAX_LEVEL) {
4579 if (!path->nodes[level]) {
4580 ret = 1;
4581 goto done;
4582 }
4583
4584 slot = path->slots[level] + 1;
4585 c = path->nodes[level];
4586 if (slot >= btrfs_header_nritems(c)) {
4587 level++;
4588 if (level == BTRFS_MAX_LEVEL) {
4589 ret = 1;
4590 goto done;
4591 }
4592 continue;
4593 }
4594
4595
4596 /*
4597 * Our current level is where we're going to start from, and to
4598 * make sure lockdep doesn't complain we need to drop our locks
4599 * and nodes from 0 to our current level.
4600 */
4601 for (i = 0; i < level; i++) {
4602 if (path->locks[i]) {
4603 btrfs_tree_read_unlock(path->nodes[i]);
4604 path->locks[i] = 0;
4605 }
4606 free_extent_buffer(path->nodes[i]);
4607 path->nodes[i] = NULL;
4608 }
4609
4610 next = c;
4611 ret = read_block_for_search(root, path, &next, level,
4612 slot, &key);
4613 if (ret == -EAGAIN)
4614 goto again;
4615
4616 if (ret < 0) {
4617 btrfs_release_path(path);
4618 goto done;
4619 }
4620
4621 if (!path->skip_locking) {
4622 ret = btrfs_try_tree_read_lock(next);
4623 if (!ret && time_seq) {
4624 /*
4625 * If we don't get the lock, we may be racing
4626 * with push_leaf_left, holding that lock while
4627 * itself waiting for the leaf we've currently
4628 * locked. To solve this situation, we give up
4629 * on our lock and cycle.
4630 */
4631 free_extent_buffer(next);
4632 btrfs_release_path(path);
4633 cond_resched();
4634 goto again;
4635 }
4636 if (!ret)
4637 btrfs_tree_read_lock(next);
4638 }
4639 break;
4640 }
4641 path->slots[level] = slot;
4642 while (1) {
4643 level--;
4644 path->nodes[level] = next;
4645 path->slots[level] = 0;
4646 if (!path->skip_locking)
4647 path->locks[level] = BTRFS_READ_LOCK;
4648 if (!level)
4649 break;
4650
4651 ret = read_block_for_search(root, path, &next, level,
4652 0, &key);
4653 if (ret == -EAGAIN)
4654 goto again;
4655
4656 if (ret < 0) {
4657 btrfs_release_path(path);
4658 goto done;
4659 }
4660
4661 if (!path->skip_locking)
4662 btrfs_tree_read_lock(next);
4663 }
4664 ret = 0;
4665 done:
4666 unlock_up(path, 0, 1, 0, NULL);
4667 if (need_commit_sem) {
4668 int ret2;
4669
4670 path->need_commit_sem = 1;
4671 ret2 = finish_need_commit_sem_search(path);
4672 up_read(&fs_info->commit_root_sem);
4673 if (ret2)
4674 ret = ret2;
4675 }
4676
4677 return ret;
4678 }
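
/*
 * Illustrative sketch, not part of the original file: the classic
 * full-tree item walk. btrfs_next_leaf() in ctree.h is this function
 * with time_seq == 0; 'process_item' is an assumption for the example.
 *
 *     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *     if (ret < 0)
 *         return ret;
 *     while (1) {
 *         struct extent_buffer *leaf = path->nodes[0];
 *
 *         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *             ret = btrfs_next_leaf(root, path);
 *             if (ret)
 *                 break;    (1 means no more leaves, < 0 means error)
 *             continue;
 *         }
 *         process_item(leaf, path->slots[0]);
 *         path->slots[0]++;
 *     }
 */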
4679
4680 /*
4681 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4682 * searching until it gets past min_objectid or finds an item of 'type'
4683 *
4684 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4685 */
4686 int btrfs_previous_item(struct btrfs_root *root,
4687 struct btrfs_path *path, u64 min_objectid,
4688 int type)
4689 {
4690 struct btrfs_key found_key;
4691 struct extent_buffer *leaf;
4692 u32 nritems;
4693 int ret;
4694
4695 while (1) {
4696 if (path->slots[0] == 0) {
4697 ret = btrfs_prev_leaf(root, path);
4698 if (ret != 0)
4699 return ret;
4700 } else {
4701 path->slots[0]--;
4702 }
4703 leaf = path->nodes[0];
4704 nritems = btrfs_header_nritems(leaf);
4705 if (nritems == 0)
4706 return 1;
4707 if (path->slots[0] == nritems)
4708 path->slots[0]--;
4709
4710 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4711 if (found_key.objectid < min_objectid)
4712 break;
4713 if (found_key.type == type)
4714 return 0;
4715 if (found_key.objectid == min_objectid &&
4716 found_key.type < type)
4717 break;
4718 }
4719 return 1;
4720 }
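
/*
 * Illustrative sketch, not part of the original file: positioning on
 * the last item of a given type for an inode by searching past it and
 * stepping back. 'ino' is an assumption for the example.
 *
 *     key.objectid = ino;
 *     key.type = BTRFS_EXTENT_DATA_KEY;
 *     key.offset = (u64)-1;
 *
 *     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *     if (ret < 0)
 *         return ret;
 *     ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
 *     (0 means found, 1 means nothing before, < 0 means error)
 */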
4721
4722 /*
4723 * search in extent tree to find a previous Metadata/Data extent item with
4724 * min objectid.
4725 *
4726 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4727 */
4728 int btrfs_previous_extent_item(struct btrfs_root *root,
4729 struct btrfs_path *path, u64 min_objectid)
4730 {
4731 struct btrfs_key found_key;
4732 struct extent_buffer *leaf;
4733 u32 nritems;
4734 int ret;
4735
4736 while (1) {
4737 if (path->slots[0] == 0) {
4738 ret = btrfs_prev_leaf(root, path);
4739 if (ret != 0)
4740 return ret;
4741 } else {
4742 path->slots[0]--;
4743 }
4744 leaf = path->nodes[0];
4745 nritems = btrfs_header_nritems(leaf);
4746 if (nritems == 0)
4747 return 1;
4748 if (path->slots[0] == nritems)
4749 path->slots[0]--;
4750
4751 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4752 if (found_key.objectid < min_objectid)
4753 break;
4754 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4755 found_key.type == BTRFS_METADATA_ITEM_KEY)
4756 return 0;
4757 if (found_key.objectid == min_objectid &&
4758 found_key.type < BTRFS_EXTENT_ITEM_KEY)
4759 break;
4760 }
4761 return 1;
4762 }
4763