• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_defer.h"
27 #include "xfs_inode.h"
28 #include "xfs_trans.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_buf_item.h"
31 #include "xfs_btree.h"
32 #include "xfs_error.h"
33 #include "xfs_trace.h"
34 #include "xfs_cksum.h"
35 #include "xfs_alloc.h"
36 #include "xfs_log.h"
37 
/*
 * Cursor allocation zone.
 */
kmem_zone_t	*xfs_btree_cur_zone;	/* slab cache for btree cursors */
42 
43 /*
44  * Btree magic numbers.
45  */
46 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
47 	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
48 	  XFS_FIBT_MAGIC, 0 },
49 	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
50 	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
51 	  XFS_REFC_CRC_MAGIC }
52 };
53 #define xfs_btree_magic(cur) \
54 	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
55 
STATIC int				/* error (0 or EFSCORRUPTED) */
xfs_btree_check_lblock(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* btree long form block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer for block, if any */
{
	int			lblock_ok = 1; /* block passes checks */
	struct xfs_mount	*mp;	/* file system mount point */

	mp = cur->bc_mp;

	/*
	 * CRC-enabled (v5) filesystems also stamp the filesystem UUID and
	 * the block's own disk address into the header; verify both so
	 * misdirected or stale writes are caught.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		lblock_ok = lblock_ok &&
			uuid_equal(&block->bb_u.l.bb_uuid,
				   &mp->m_sb.sb_meta_uuid) &&
			block->bb_u.l.bb_blkno == cpu_to_be64(
				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
	}

	/*
	 * Common checks: the magic number matches the cursor's btree type,
	 * the recorded level matches the caller's expectation, the record
	 * count fits the block, and each sibling pointer is either the
	 * NULLFSBLOCK sentinel or a sane fsblock number.  The bare
	 * "bb_leftsib/bb_rightsib" tests reject a raw zero pointer, which
	 * is never valid (an all-zero field compares equal in any
	 * endianness, so the unconverted test is safe).
	 */
	lblock_ok = lblock_ok &&
		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
		be16_to_cpu(block->bb_level) == level &&
		be16_to_cpu(block->bb_numrecs) <=
			cur->bc_ops->get_maxrecs(cur, level) &&
		block->bb_u.l.bb_leftsib &&
		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
		 XFS_FSB_SANITY_CHECK(mp,
			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
		block->bb_u.l.bb_rightsib &&
		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
		 XFS_FSB_SANITY_CHECK(mp,
			be64_to_cpu(block->bb_u.l.bb_rightsib)));

	/*
	 * XFS_TEST_ERROR additionally allows error injection to force the
	 * corruption path for testing.
	 */
	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
		return -EFSCORRUPTED;
	}
	return 0;
}
100 
STATIC int				/* error (0 or EFSCORRUPTED) */
xfs_btree_check_sblock(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* btree short form block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block */
{
	struct xfs_mount	*mp;	/* file system mount point */
	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
	struct xfs_agf		*agf;	/* ag. freespace structure */
	xfs_agblock_t		agflen;	/* native ag. freespace length */
	int			sblock_ok = 1; /* block passes checks */

	mp = cur->bc_mp;
	agbp = cur->bc_private.a.agbp;
	agf = XFS_BUF_TO_AGF(agbp);
	/* AG length bounds any valid short-form sibling pointer. */
	agflen = be32_to_cpu(agf->agf_length);

	/*
	 * CRC-enabled (v5) filesystems also stamp the filesystem UUID and
	 * the block's own disk address into the header; verify both.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		sblock_ok = sblock_ok &&
			uuid_equal(&block->bb_u.s.bb_uuid,
				   &mp->m_sb.sb_meta_uuid) &&
			block->bb_u.s.bb_blkno == cpu_to_be64(
				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
	}

	/*
	 * Common checks: magic, level, record count, and sibling pointers
	 * that are either the NULLAGBLOCK sentinel or inside this AG.
	 * The bare "bb_leftsib/bb_rightsib" tests reject a raw zero,
	 * which is never a valid AG block for a btree (endian-safe).
	 */
	sblock_ok = sblock_ok &&
		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
		be16_to_cpu(block->bb_level) == level &&
		be16_to_cpu(block->bb_numrecs) <=
			cur->bc_ops->get_maxrecs(cur, level) &&
		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
		block->bb_u.s.bb_leftsib &&
		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
		block->bb_u.s.bb_rightsib;

	/* XFS_TEST_ERROR lets error injection force this path. */
	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
		return -EFSCORRUPTED;
	}
	return 0;
}
149 
150 /*
151  * Debug routine: check that block header is ok.
152  */
153 int
xfs_btree_check_block(struct xfs_btree_cur * cur,struct xfs_btree_block * block,int level,struct xfs_buf * bp)154 xfs_btree_check_block(
155 	struct xfs_btree_cur	*cur,	/* btree cursor */
156 	struct xfs_btree_block	*block,	/* generic btree block pointer */
157 	int			level,	/* level of the btree block */
158 	struct xfs_buf		*bp)	/* buffer containing block, if any */
159 {
160 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
161 		return xfs_btree_check_lblock(cur, block, level, bp);
162 	else
163 		return xfs_btree_check_sblock(cur, block, level, bp);
164 }
165 
/*
 * Check that (long) pointer is ok.
 */
int					/* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_fsblock_t		bno,	/* btree block disk address */
	int			level)	/* btree block level */
{
	/*
	 * A long-form child pointer only exists in node blocks (level > 0),
	 * must not be the NULLFSBLOCK sentinel, and must lie inside the
	 * filesystem.  XFS_WANT_CORRUPTED_RETURN reports corruption and
	 * returns -EFSCORRUPTED when the condition is false.
	 */
	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
		level > 0 &&
		bno != NULLFSBLOCK &&
		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
	return 0;
}
181 
182 #ifdef DEBUG
/*
 * Check that (short) pointer is ok.
 */
STATIC int				/* error (0 or EFSCORRUPTED) */
xfs_btree_check_sptr(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* btree block disk address */
	int			level)	/* btree block level */
{
	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;

	/*
	 * A short-form child pointer only exists in node blocks
	 * (level > 0), must not be the NULLAGBLOCK sentinel, must not be
	 * zero (AG block 0 holds AG headers, never btree blocks), and
	 * must lie within the allocation group.
	 */
	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
		level > 0 &&
		bno != NULLAGBLOCK &&
		bno != 0 &&
		bno < agblocks);
	return 0;
}
201 
/*
 * Check that block ptr is ok.
 */
STATIC int				/* error (0 or EFSCORRUPTED) */
xfs_btree_check_ptr(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	union xfs_btree_ptr	*ptr,	/* btree block disk address */
	int			index,	/* offset from ptr to check */
	int			level)	/* btree block level */
{
	/*
	 * On-disk pointers are packed arrays of the raw 64- or 32-bit
	 * big-endian type, not arrays of the union, so index off the
	 * member address ((&ptr->l)[index] / (&ptr->s)[index]) rather
	 * than the union pointer; indexing the union would use the wrong
	 * stride for short-form pointers.
	 */
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		return xfs_btree_check_lptr(cur,
				be64_to_cpu((&ptr->l)[index]), level);
	} else {
		return xfs_btree_check_sptr(cur,
				be32_to_cpu((&ptr->s)[index]), level);
	}
}
220 #endif
221 
222 /*
223  * Calculate CRC on the whole btree block and stuff it into the
224  * long-form btree header.
225  *
226  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
227  * it into the buffer so recovery knows what the last modification was that made
228  * it to disk.
229  */
230 void
xfs_btree_lblock_calc_crc(struct xfs_buf * bp)231 xfs_btree_lblock_calc_crc(
232 	struct xfs_buf		*bp)
233 {
234 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
235 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
236 
237 	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
238 		return;
239 	if (bip)
240 		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
241 	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
242 }
243 
244 bool
xfs_btree_lblock_verify_crc(struct xfs_buf * bp)245 xfs_btree_lblock_verify_crc(
246 	struct xfs_buf		*bp)
247 {
248 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
249 	struct xfs_mount	*mp = bp->b_target->bt_mount;
250 
251 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
252 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
253 			return false;
254 		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
255 	}
256 
257 	return true;
258 }
259 
260 /*
261  * Calculate CRC on the whole btree block and stuff it into the
262  * short-form btree header.
263  *
264  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
265  * it into the buffer so recovery knows what the last modification was that made
266  * it to disk.
267  */
268 void
xfs_btree_sblock_calc_crc(struct xfs_buf * bp)269 xfs_btree_sblock_calc_crc(
270 	struct xfs_buf		*bp)
271 {
272 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
273 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
274 
275 	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
276 		return;
277 	if (bip)
278 		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
279 	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 }
281 
282 bool
xfs_btree_sblock_verify_crc(struct xfs_buf * bp)283 xfs_btree_sblock_verify_crc(
284 	struct xfs_buf		*bp)
285 {
286 	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
287 	struct xfs_mount	*mp = bp->b_target->bt_mount;
288 
289 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
290 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
291 			return false;
292 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
293 	}
294 
295 	return true;
296 }
297 
298 static int
xfs_btree_free_block(struct xfs_btree_cur * cur,struct xfs_buf * bp)299 xfs_btree_free_block(
300 	struct xfs_btree_cur	*cur,
301 	struct xfs_buf		*bp)
302 {
303 	int			error;
304 
305 	error = cur->bc_ops->free_block(cur, bp);
306 	if (!error) {
307 		xfs_trans_binval(cur->bc_tp, bp);
308 		XFS_BTREE_STATS_INC(cur, free);
309 	}
310 	return error;
311 }
312 
/*
 * Delete the btree cursor.
 *
 * Releases any buffers still held per level, then frees the cursor
 * itself.  @error is nonzero when tearing down after a failure, in
 * which case bc_bufs may be only partially populated.
 */
void
xfs_btree_del_cursor(
	xfs_btree_cur_t	*cur,		/* btree cursor */
	int		error)		/* del because of error */
{
	int		i;		/* btree level */

	/*
	 * Clear the buffer pointers, and release the buffers.
	 * If we're doing this in the face of an error, we
	 * need to make sure to inspect all of the entries
	 * in the bc_bufs array for buffers to be unlocked.
	 * This is because some of the btree code works from
	 * level n down to 0, and if we get an error along
	 * the way we won't have initialized all the entries
	 * down to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_bufs[i])
			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
		else if (!error)
			break;
	}
	/*
	 * Can't free a bmap cursor without having dealt with the
	 * allocated indirect blocks' accounting.
	 */
	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
	       cur->bc_private.b.allocated == 0);
	/*
	 * Free the cursor.
	 */
	kmem_zone_free(xfs_btree_cur_zone, cur);
}
350 
/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 *
 * On success *ncur points at the duplicate; on a buffer read error the
 * partially built duplicate is destroyed, *ncur is set to NULL and the
 * error is returned.
 */
int					/* error */
xfs_btree_dup_cursor(
	xfs_btree_cur_t	*cur,		/* input cursor */
	xfs_btree_cur_t	**ncur)		/* output cursor */
{
	xfs_buf_t	*bp;		/* btree block's buffer pointer */
	int		error;		/* error return value */
	int		i;		/* level number of btree block */
	xfs_mount_t	*mp;		/* mount structure for filesystem */
	xfs_btree_cur_t	*new;		/* new cursor value */
	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */

	tp = cur->bc_tp;
	mp = cur->bc_mp;

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 * Re-reading (rather than sharing) takes a separate hold on each
	 * buffer within the same transaction for the new cursor.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_ptrs[i] = cur->bc_ptrs[i];
		new->bc_ra[i] = cur->bc_ra[i];
		bp = cur->bc_bufs[i];
		if (bp) {
			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
						   XFS_BUF_ADDR(bp), mp->m_bsize,
						   0, &bp,
						   cur->bc_ops->buf_ops);
			if (error) {
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_bufs[i] = bp;
	}
	*ncur = new;
	return 0;
}
403 
404 /*
405  * XFS btree block layout and addressing:
406  *
407  * There are two types of blocks in the btree: leaf and non-leaf blocks.
408  *
409  * A leaf block starts with a header, followed by records containing
410  * the values.  A non-leaf block also starts with the same header, and
411  * then first contains lookup keys followed by an equal number of pointers
412  * to the btree blocks at the previous level.
413  *
414  *		+--------+-------+-------+-------+-------+-------+-------+
415  * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
416  *		+--------+-------+-------+-------+-------+-------+-------+
417  *
418  *		+--------+-------+-------+-------+-------+-------+-------+
419  * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
420  *		+--------+-------+-------+-------+-------+-------+-------+
421  *
422  * The header is called struct xfs_btree_block for reasons better left unknown
423  * and comes in different versions for short (32bit) and long (64bit) block
424  * pointers.  The record and key structures are defined by the btree instances
425  * and opaque to the btree core.  The block pointers are simple disk endian
426  * integers, available in a short (32bit) and long (64bit) variant.
427  *
428  * The helpers below calculate the offset of a given record, key or pointer
429  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
430  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
431  * inside the btree block is done using indices starting at one, not zero!
432  *
433  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
434  * overlapping intervals.  In such a tree, records are still sorted lowest to
435  * highest and indexed by the smallest key value that refers to the record.
436  * However, nodes are different: each pointer has two associated keys -- one
437  * indexing the lowest key available in the block(s) below (the same behavior
438  * as the key in a regular btree) and another indexing the highest key
439  * available in the block(s) below.  Because records are /not/ sorted by the
440  * highest key, all leaf block updates require us to compute the highest key
441  * that matches any record in the leaf and to recursively update the high keys
442  * in the nodes going further up in the tree, if necessary.  Nodes look like
443  * this:
444  *
445  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
446  * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
447  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
448  *
449  * To perform an interval query on an overlapped tree, perform the usual
450  * depth-first search and use the low and high keys to decide if we can skip
451  * that particular node.  If a leaf node is reached, return the records that
452  * intersect the interval.  Note that an interval query may return numerous
453  * entries.  For a non-overlapped tree, simply search for the record associated
454  * with the lowest key and iterate forward until a non-matching record is
455  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
456  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
457  * more detail.
458  *
459  * Why do we care about overlapping intervals?  Let's say you have a bunch of
460  * reverse mapping records on a reflink filesystem:
461  *
462  * 1: +- file A startblock B offset C length D -----------+
463  * 2:      +- file E startblock F offset G length H --------------+
464  * 3:      +- file I startblock F offset J length K --+
465  * 4:                                                        +- file L... --+
466  *
467  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
468  * we'd simply increment the length of record 1.  But how do we find the record
469  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
470  * record 3 because the keys are ordered first by startblock.  An interval
471  * query would return records 1 and 2 because they both overlap (B+D-1), and
472  * from that we can pick out record 1 as the appropriate left neighbor.
473  *
474  * In the non-overlapped case you can do a LE lookup and decrement the cursor
475  * because a record's interval must end before the next record.
476  */
477 
478 /*
479  * Return size of the btree block header for this btree instance.
480  */
xfs_btree_block_len(struct xfs_btree_cur * cur)481 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
482 {
483 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
484 		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
485 			return XFS_BTREE_LBLOCK_CRC_LEN;
486 		return XFS_BTREE_LBLOCK_LEN;
487 	}
488 	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
489 		return XFS_BTREE_SBLOCK_CRC_LEN;
490 	return XFS_BTREE_SBLOCK_LEN;
491 }
492 
493 /*
494  * Return size of btree block pointers for this btree instance.
495  */
xfs_btree_ptr_len(struct xfs_btree_cur * cur)496 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
497 {
498 	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
499 		sizeof(__be64) : sizeof(__be32);
500 }
501 
502 /*
503  * Calculate offset of the n-th record in a btree block.
504  */
505 STATIC size_t
xfs_btree_rec_offset(struct xfs_btree_cur * cur,int n)506 xfs_btree_rec_offset(
507 	struct xfs_btree_cur	*cur,
508 	int			n)
509 {
510 	return xfs_btree_block_len(cur) +
511 		(n - 1) * cur->bc_ops->rec_len;
512 }
513 
514 /*
515  * Calculate offset of the n-th key in a btree block.
516  */
517 STATIC size_t
xfs_btree_key_offset(struct xfs_btree_cur * cur,int n)518 xfs_btree_key_offset(
519 	struct xfs_btree_cur	*cur,
520 	int			n)
521 {
522 	return xfs_btree_block_len(cur) +
523 		(n - 1) * cur->bc_ops->key_len;
524 }
525 
526 /*
527  * Calculate offset of the n-th high key in a btree block.
528  */
529 STATIC size_t
xfs_btree_high_key_offset(struct xfs_btree_cur * cur,int n)530 xfs_btree_high_key_offset(
531 	struct xfs_btree_cur	*cur,
532 	int			n)
533 {
534 	return xfs_btree_block_len(cur) +
535 		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
536 }
537 
538 /*
539  * Calculate offset of the n-th block pointer in a btree block.
540  */
541 STATIC size_t
xfs_btree_ptr_offset(struct xfs_btree_cur * cur,int n,int level)542 xfs_btree_ptr_offset(
543 	struct xfs_btree_cur	*cur,
544 	int			n,
545 	int			level)
546 {
547 	return xfs_btree_block_len(cur) +
548 		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
549 		(n - 1) * xfs_btree_ptr_len(cur);
550 }
551 
552 /*
553  * Return a pointer to the n-th record in the btree block.
554  */
555 STATIC union xfs_btree_rec *
xfs_btree_rec_addr(struct xfs_btree_cur * cur,int n,struct xfs_btree_block * block)556 xfs_btree_rec_addr(
557 	struct xfs_btree_cur	*cur,
558 	int			n,
559 	struct xfs_btree_block	*block)
560 {
561 	return (union xfs_btree_rec *)
562 		((char *)block + xfs_btree_rec_offset(cur, n));
563 }
564 
565 /*
566  * Return a pointer to the n-th key in the btree block.
567  */
568 STATIC union xfs_btree_key *
xfs_btree_key_addr(struct xfs_btree_cur * cur,int n,struct xfs_btree_block * block)569 xfs_btree_key_addr(
570 	struct xfs_btree_cur	*cur,
571 	int			n,
572 	struct xfs_btree_block	*block)
573 {
574 	return (union xfs_btree_key *)
575 		((char *)block + xfs_btree_key_offset(cur, n));
576 }
577 
578 /*
579  * Return a pointer to the n-th high key in the btree block.
580  */
581 STATIC union xfs_btree_key *
xfs_btree_high_key_addr(struct xfs_btree_cur * cur,int n,struct xfs_btree_block * block)582 xfs_btree_high_key_addr(
583 	struct xfs_btree_cur	*cur,
584 	int			n,
585 	struct xfs_btree_block	*block)
586 {
587 	return (union xfs_btree_key *)
588 		((char *)block + xfs_btree_high_key_offset(cur, n));
589 }
590 
591 /*
592  * Return a pointer to the n-th block pointer in the btree block.
593  */
594 STATIC union xfs_btree_ptr *
xfs_btree_ptr_addr(struct xfs_btree_cur * cur,int n,struct xfs_btree_block * block)595 xfs_btree_ptr_addr(
596 	struct xfs_btree_cur	*cur,
597 	int			n,
598 	struct xfs_btree_block	*block)
599 {
600 	int			level = xfs_btree_get_level(block);
601 
602 	ASSERT(block->bb_level != 0);
603 
604 	return (union xfs_btree_ptr *)
605 		((char *)block + xfs_btree_ptr_offset(cur, n, level));
606 }
607 
608 /*
609  * Get the root block which is stored in the inode.
610  *
611  * For now this btree implementation assumes the btree root is always
612  * stored in the if_broot field of an inode fork.
613  */
614 STATIC struct xfs_btree_block *
xfs_btree_get_iroot(struct xfs_btree_cur * cur)615 xfs_btree_get_iroot(
616 	struct xfs_btree_cur	*cur)
617 {
618 	struct xfs_ifork	*ifp;
619 
620 	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
621 	return (struct xfs_btree_block *)ifp->if_broot;
622 }
623 
624 /*
625  * Retrieve the block pointer from the cursor at the given level.
626  * This may be an inode btree root or from a buffer.
627  */
628 STATIC struct xfs_btree_block *		/* generic btree block pointer */
xfs_btree_get_block(struct xfs_btree_cur * cur,int level,struct xfs_buf ** bpp)629 xfs_btree_get_block(
630 	struct xfs_btree_cur	*cur,	/* btree cursor */
631 	int			level,	/* level in btree */
632 	struct xfs_buf		**bpp)	/* buffer containing the block */
633 {
634 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
635 	    (level == cur->bc_nlevels - 1)) {
636 		*bpp = NULL;
637 		return xfs_btree_get_iroot(cur);
638 	}
639 
640 	*bpp = cur->bc_bufs[level];
641 	return XFS_BUF_TO_BLOCK(*bpp);
642 }
643 
644 /*
645  * Get a buffer for the block, return it with no data read.
646  * Long-form addressing.
647  */
648 xfs_buf_t *				/* buffer for fsbno */
xfs_btree_get_bufl(xfs_mount_t * mp,xfs_trans_t * tp,xfs_fsblock_t fsbno,uint lock)649 xfs_btree_get_bufl(
650 	xfs_mount_t	*mp,		/* file system mount point */
651 	xfs_trans_t	*tp,		/* transaction pointer */
652 	xfs_fsblock_t	fsbno,		/* file system block number */
653 	uint		lock)		/* lock flags for get_buf */
654 {
655 	xfs_daddr_t		d;		/* real disk block address */
656 
657 	ASSERT(fsbno != NULLFSBLOCK);
658 	d = XFS_FSB_TO_DADDR(mp, fsbno);
659 	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
660 }
661 
662 /*
663  * Get a buffer for the block, return it with no data read.
664  * Short-form addressing.
665  */
666 xfs_buf_t *				/* buffer for agno/agbno */
xfs_btree_get_bufs(xfs_mount_t * mp,xfs_trans_t * tp,xfs_agnumber_t agno,xfs_agblock_t agbno,uint lock)667 xfs_btree_get_bufs(
668 	xfs_mount_t	*mp,		/* file system mount point */
669 	xfs_trans_t	*tp,		/* transaction pointer */
670 	xfs_agnumber_t	agno,		/* allocation group number */
671 	xfs_agblock_t	agbno,		/* allocation group block number */
672 	uint		lock)		/* lock flags for get_buf */
673 {
674 	xfs_daddr_t		d;		/* real disk block address */
675 
676 	ASSERT(agno != NULLAGNUMBER);
677 	ASSERT(agbno != NULLAGBLOCK);
678 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
680 }
681 
682 /*
683  * Check for the cursor referring to the last block at the given level.
684  */
685 int					/* 1=is last block, 0=not last block */
xfs_btree_islastblock(xfs_btree_cur_t * cur,int level)686 xfs_btree_islastblock(
687 	xfs_btree_cur_t		*cur,	/* btree cursor */
688 	int			level)	/* level to check */
689 {
690 	struct xfs_btree_block	*block;	/* generic btree block pointer */
691 	xfs_buf_t		*bp;	/* buffer containing block */
692 
693 	block = xfs_btree_get_block(cur, level, &bp);
694 	xfs_btree_check_block(cur, block, level, bp);
695 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
696 		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
697 	else
698 		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
699 }
700 
701 /*
702  * Change the cursor to point to the first record at the given level.
703  * Other levels are unaffected.
704  */
705 STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(xfs_btree_cur_t * cur,int level)706 xfs_btree_firstrec(
707 	xfs_btree_cur_t		*cur,	/* btree cursor */
708 	int			level)	/* level to change */
709 {
710 	struct xfs_btree_block	*block;	/* generic btree block pointer */
711 	xfs_buf_t		*bp;	/* buffer containing block */
712 
713 	/*
714 	 * Get the block pointer for this level.
715 	 */
716 	block = xfs_btree_get_block(cur, level, &bp);
717 	if (xfs_btree_check_block(cur, block, level, bp))
718 		return 0;
719 	/*
720 	 * It's empty, there is no such record.
721 	 */
722 	if (!block->bb_numrecs)
723 		return 0;
724 	/*
725 	 * Set the ptr value to 1, that's the first record/key.
726 	 */
727 	cur->bc_ptrs[level] = 1;
728 	return 1;
729 }
730 
731 /*
732  * Change the cursor to point to the last record in the current block
733  * at the given level.  Other levels are unaffected.
734  */
735 STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(xfs_btree_cur_t * cur,int level)736 xfs_btree_lastrec(
737 	xfs_btree_cur_t		*cur,	/* btree cursor */
738 	int			level)	/* level to change */
739 {
740 	struct xfs_btree_block	*block;	/* generic btree block pointer */
741 	xfs_buf_t		*bp;	/* buffer containing block */
742 
743 	/*
744 	 * Get the block pointer for this level.
745 	 */
746 	block = xfs_btree_get_block(cur, level, &bp);
747 	if (xfs_btree_check_block(cur, block, level, bp))
748 		return 0;
749 	/*
750 	 * It's empty, there is no such record.
751 	 */
752 	if (!block->bb_numrecs)
753 		return 0;
754 	/*
755 	 * Set the ptr value to numrecs, that's the last record/key.
756 	 */
757 	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
758 	return 1;
759 }
760 
761 /*
762  * Compute first and last byte offsets for the fields given.
763  * Interprets the offsets table, which contains struct field offsets.
764  */
765 void
xfs_btree_offsets(__int64_t fields,const short * offsets,int nbits,int * first,int * last)766 xfs_btree_offsets(
767 	__int64_t	fields,		/* bitmask of fields */
768 	const short	*offsets,	/* table of field offsets */
769 	int		nbits,		/* number of bits to inspect */
770 	int		*first,		/* output: first byte offset */
771 	int		*last)		/* output: last byte offset */
772 {
773 	int		i;		/* current bit number */
774 	__int64_t	imask;		/* mask for current bit number */
775 
776 	ASSERT(fields != 0);
777 	/*
778 	 * Find the lowest bit, so the first byte offset.
779 	 */
780 	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
781 		if (imask & fields) {
782 			*first = offsets[i];
783 			break;
784 		}
785 	}
786 	/*
787 	 * Find the highest bit, so the last byte offset.
788 	 */
789 	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
790 		if (imask & fields) {
791 			*last = offsets[i + 1] - 1;
792 			break;
793 		}
794 	}
795 }
796 
/*
 * Get a buffer for the block, return it read in.
 * Long-form addressing.
 */
int
xfs_btree_read_bufl(
	struct xfs_mount	*mp,		/* file system mount point */
	struct xfs_trans	*tp,		/* transaction pointer */
	xfs_fsblock_t		fsbno,		/* file system block number */
	uint			lock,		/* lock flags for read_buf */
	struct xfs_buf		**bpp,		/* buffer for fsbno */
	int			refval,		/* ref count value for buffer */
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;		/* return value */
	xfs_daddr_t		d;		/* real disk block address */
	int			error;

	/* Reject out-of-range block numbers before converting to a daddr. */
	if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
		return -EFSCORRUPTED;
	d = XFS_FSB_TO_DADDR(mp, fsbno);
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
				   mp->m_bsize, lock, &bp, ops);
	if (error)
		return error;
	/* Set the buffer's LRU reference hint to keep hot metadata cached. */
	if (bp)
		xfs_buf_set_ref(bp, refval);
	*bpp = bp;
	return 0;
}
827 
828 /*
829  * Read-ahead the block, don't wait for it, don't return a buffer.
830  * Long-form addressing.
831  */
832 /* ARGSUSED */
833 void
xfs_btree_reada_bufl(struct xfs_mount * mp,xfs_fsblock_t fsbno,xfs_extlen_t count,const struct xfs_buf_ops * ops)834 xfs_btree_reada_bufl(
835 	struct xfs_mount	*mp,		/* file system mount point */
836 	xfs_fsblock_t		fsbno,		/* file system block number */
837 	xfs_extlen_t		count,		/* count of filesystem blocks */
838 	const struct xfs_buf_ops *ops)
839 {
840 	xfs_daddr_t		d;
841 
842 	ASSERT(fsbno != NULLFSBLOCK);
843 	d = XFS_FSB_TO_DADDR(mp, fsbno);
844 	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
845 }
846 
847 /*
848  * Read-ahead the block, don't wait for it, don't return a buffer.
849  * Short-form addressing.
850  */
851 /* ARGSUSED */
852 void
xfs_btree_reada_bufs(struct xfs_mount * mp,xfs_agnumber_t agno,xfs_agblock_t agbno,xfs_extlen_t count,const struct xfs_buf_ops * ops)853 xfs_btree_reada_bufs(
854 	struct xfs_mount	*mp,		/* file system mount point */
855 	xfs_agnumber_t		agno,		/* allocation group number */
856 	xfs_agblock_t		agbno,		/* allocation group block number */
857 	xfs_extlen_t		count,		/* count of filesystem blocks */
858 	const struct xfs_buf_ops *ops)
859 {
860 	xfs_daddr_t		d;
861 
862 	ASSERT(agno != NULLAGNUMBER);
863 	ASSERT(agbno != NULLAGBLOCK);
864 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
865 	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
866 }
867 
868 STATIC int
xfs_btree_readahead_lblock(struct xfs_btree_cur * cur,int lr,struct xfs_btree_block * block)869 xfs_btree_readahead_lblock(
870 	struct xfs_btree_cur	*cur,
871 	int			lr,
872 	struct xfs_btree_block	*block)
873 {
874 	int			rval = 0;
875 	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
876 	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
877 
878 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
879 		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
880 				     cur->bc_ops->buf_ops);
881 		rval++;
882 	}
883 
884 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
885 		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
886 				     cur->bc_ops->buf_ops);
887 		rval++;
888 	}
889 
890 	return rval;
891 }
892 
893 STATIC int
xfs_btree_readahead_sblock(struct xfs_btree_cur * cur,int lr,struct xfs_btree_block * block)894 xfs_btree_readahead_sblock(
895 	struct xfs_btree_cur	*cur,
896 	int			lr,
897 	struct xfs_btree_block *block)
898 {
899 	int			rval = 0;
900 	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
901 	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
902 
903 
904 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
905 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
906 				     left, 1, cur->bc_ops->buf_ops);
907 		rval++;
908 	}
909 
910 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
911 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
912 				     right, 1, cur->bc_ops->buf_ops);
913 		rval++;
914 	}
915 
916 	return rval;
917 }
918 
919 /*
920  * Read-ahead btree blocks, at the given level.
921  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
922  */
923 STATIC int
xfs_btree_readahead(struct xfs_btree_cur * cur,int lev,int lr)924 xfs_btree_readahead(
925 	struct xfs_btree_cur	*cur,		/* btree cursor */
926 	int			lev,		/* level in btree */
927 	int			lr)		/* left/right bits */
928 {
929 	struct xfs_btree_block	*block;
930 
931 	/*
932 	 * No readahead needed if we are at the root level and the
933 	 * btree root is stored in the inode.
934 	 */
935 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
936 	    (lev == cur->bc_nlevels - 1))
937 		return 0;
938 
939 	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
940 		return 0;
941 
942 	cur->bc_ra[lev] |= lr;
943 	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
944 
945 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
946 		return xfs_btree_readahead_lblock(cur, lr, block);
947 	return xfs_btree_readahead_sblock(cur, lr, block);
948 }
949 
950 STATIC xfs_daddr_t
xfs_btree_ptr_to_daddr(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr)951 xfs_btree_ptr_to_daddr(
952 	struct xfs_btree_cur	*cur,
953 	union xfs_btree_ptr	*ptr)
954 {
955 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
956 		ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
957 
958 		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
959 	} else {
960 		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
961 		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
962 
963 		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
964 					be32_to_cpu(ptr->s));
965 	}
966 }
967 
968 /*
969  * Readahead @count btree blocks at the given @ptr location.
970  *
971  * We don't need to care about long or short form btrees here as we have a
972  * method of converting the ptr directly to a daddr available to us.
973  */
974 STATIC void
xfs_btree_readahead_ptr(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr,xfs_extlen_t count)975 xfs_btree_readahead_ptr(
976 	struct xfs_btree_cur	*cur,
977 	union xfs_btree_ptr	*ptr,
978 	xfs_extlen_t		count)
979 {
980 	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
981 			  xfs_btree_ptr_to_daddr(cur, ptr),
982 			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
983 }
984 
985 /*
986  * Set the buffer for level "lev" in the cursor to bp, releasing
987  * any previous buffer.
988  */
STATIC void
xfs_btree_setbuf(
	xfs_btree_cur_t		*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	xfs_buf_t		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	/* Drop the transaction's hold on the buffer being replaced, if any. */
	if (cur->bc_bufs[lev])
		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
	cur->bc_bufs[lev] = bp;
	cur->bc_ra[lev] = 0;

	/*
	 * A null sibling pointer means there is no block to read ahead in
	 * that direction; pre-set the corresponding readahead-done bit so
	 * xfs_btree_readahead() skips it for this level.
	 */
	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
	}
}
1015 
1016 STATIC int
xfs_btree_ptr_is_null(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr)1017 xfs_btree_ptr_is_null(
1018 	struct xfs_btree_cur	*cur,
1019 	union xfs_btree_ptr	*ptr)
1020 {
1021 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1022 		return ptr->l == cpu_to_be64(NULLFSBLOCK);
1023 	else
1024 		return ptr->s == cpu_to_be32(NULLAGBLOCK);
1025 }
1026 
1027 STATIC void
xfs_btree_set_ptr_null(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr)1028 xfs_btree_set_ptr_null(
1029 	struct xfs_btree_cur	*cur,
1030 	union xfs_btree_ptr	*ptr)
1031 {
1032 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1033 		ptr->l = cpu_to_be64(NULLFSBLOCK);
1034 	else
1035 		ptr->s = cpu_to_be32(NULLAGBLOCK);
1036 }
1037 
1038 /*
1039  * Get/set/init sibling pointers
1040  */
1041 STATIC void
xfs_btree_get_sibling(struct xfs_btree_cur * cur,struct xfs_btree_block * block,union xfs_btree_ptr * ptr,int lr)1042 xfs_btree_get_sibling(
1043 	struct xfs_btree_cur	*cur,
1044 	struct xfs_btree_block	*block,
1045 	union xfs_btree_ptr	*ptr,
1046 	int			lr)
1047 {
1048 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1049 
1050 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1051 		if (lr == XFS_BB_RIGHTSIB)
1052 			ptr->l = block->bb_u.l.bb_rightsib;
1053 		else
1054 			ptr->l = block->bb_u.l.bb_leftsib;
1055 	} else {
1056 		if (lr == XFS_BB_RIGHTSIB)
1057 			ptr->s = block->bb_u.s.bb_rightsib;
1058 		else
1059 			ptr->s = block->bb_u.s.bb_leftsib;
1060 	}
1061 }
1062 
1063 STATIC void
xfs_btree_set_sibling(struct xfs_btree_cur * cur,struct xfs_btree_block * block,union xfs_btree_ptr * ptr,int lr)1064 xfs_btree_set_sibling(
1065 	struct xfs_btree_cur	*cur,
1066 	struct xfs_btree_block	*block,
1067 	union xfs_btree_ptr	*ptr,
1068 	int			lr)
1069 {
1070 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1071 
1072 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1073 		if (lr == XFS_BB_RIGHTSIB)
1074 			block->bb_u.l.bb_rightsib = ptr->l;
1075 		else
1076 			block->bb_u.l.bb_leftsib = ptr->l;
1077 	} else {
1078 		if (lr == XFS_BB_RIGHTSIB)
1079 			block->bb_u.s.bb_rightsib = ptr->s;
1080 		else
1081 			block->bb_u.s.bb_leftsib = ptr->s;
1082 	}
1083 }
1084 
/*
 * Initialise the header of an in-memory btree block image.  Sibling
 * pointers start out null; the CRC-specific header fields are only
 * filled in when XFS_BTREE_CRC_BLOCKS is set in @flags.
 */
void
xfs_btree_init_block_int(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*buf,
	xfs_daddr_t		blkno,
	__u32			magic,
	__u16			level,
	__u16			numrecs,
	__u64			owner,
	unsigned int		flags)
{
	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (flags & XFS_BTREE_LONG_PTRS) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (flags & XFS_BTREE_CRC_BLOCKS) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		/* owner is a 32 bit value on short blocks */
		__u32 __owner = (__u32)owner;

		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (flags & XFS_BTREE_CRC_BLOCKS) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}
1124 
/*
 * Initialise a btree block header in @bp, using the buffer's own disk
 * address as the block number.  Thin wrapper around
 * xfs_btree_init_block_int().
 */
void
xfs_btree_init_block(
	struct xfs_mount *mp,
	struct xfs_buf	*bp,
	__u32		magic,
	__u16		level,
	__u16		numrecs,
	__u64		owner,
	unsigned int	flags)
{
	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
				 magic, level, numrecs, owner, flags);
}
1138 
1139 STATIC void
xfs_btree_init_block_cur(struct xfs_btree_cur * cur,struct xfs_buf * bp,int level,int numrecs)1140 xfs_btree_init_block_cur(
1141 	struct xfs_btree_cur	*cur,
1142 	struct xfs_buf		*bp,
1143 	int			level,
1144 	int			numrecs)
1145 {
1146 	__u64 owner;
1147 
1148 	/*
1149 	 * we can pull the owner from the cursor right now as the different
1150 	 * owners align directly with the pointer size of the btree. This may
1151 	 * change in future, but is safe for current users of the generic btree
1152 	 * code.
1153 	 */
1154 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1155 		owner = cur->bc_private.b.ip->i_ino;
1156 	else
1157 		owner = cur->bc_private.a.agno;
1158 
1159 	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1160 				 xfs_btree_magic(cur), level, numrecs,
1161 				 owner, cur->bc_flags);
1162 }
1163 
1164 /*
1165  * Return true if ptr is the last record in the btree and
1166  * we need to track updates to this record.  The decision
1167  * will be further refined in the update_lastrec method.
1168  */
1169 STATIC int
xfs_btree_is_lastrec(struct xfs_btree_cur * cur,struct xfs_btree_block * block,int level)1170 xfs_btree_is_lastrec(
1171 	struct xfs_btree_cur	*cur,
1172 	struct xfs_btree_block	*block,
1173 	int			level)
1174 {
1175 	union xfs_btree_ptr	ptr;
1176 
1177 	if (level > 0)
1178 		return 0;
1179 	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1180 		return 0;
1181 
1182 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1183 	if (!xfs_btree_ptr_is_null(cur, &ptr))
1184 		return 0;
1185 	return 1;
1186 }
1187 
1188 STATIC void
xfs_btree_buf_to_ptr(struct xfs_btree_cur * cur,struct xfs_buf * bp,union xfs_btree_ptr * ptr)1189 xfs_btree_buf_to_ptr(
1190 	struct xfs_btree_cur	*cur,
1191 	struct xfs_buf		*bp,
1192 	union xfs_btree_ptr	*ptr)
1193 {
1194 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1195 		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1196 					XFS_BUF_ADDR(bp)));
1197 	else {
1198 		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1199 					XFS_BUF_ADDR(bp)));
1200 	}
1201 }
1202 
/*
 * Set the buffer's LRU reference count according to the btree type, so
 * more frequently used btree metadata survives longer in the buffer cache.
 */
STATIC void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	switch (cur->bc_btnum) {
	case XFS_BTNUM_BNO:
	case XFS_BTNUM_CNT:
		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
		break;
	case XFS_BTNUM_INO:
	case XFS_BTNUM_FINO:
		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
		break;
	case XFS_BTNUM_BMAP:
		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
		break;
	case XFS_BTNUM_RMAP:
		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
		break;
	case XFS_BTNUM_REFC:
		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
		break;
	default:
		/* Unknown btree type: debug builds trap, production ignores. */
		ASSERT(0);
	}
}
1230 
/*
 * Get an uninitialised buffer for the block at @ptr within the current
 * transaction, without reading it from disk.  On success returns the
 * buffer in @bpp and the block image in @block.
 */
STATIC int
xfs_btree_get_buf_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			flags,
	struct xfs_btree_block	**block,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	d = xfs_btree_ptr_to_daddr(cur, ptr);
	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
				 mp->m_bsize, flags);

	if (!*bpp)
		return -ENOMEM;

	/* Attach the verifier ops so later writes get checksummed. */
	(*bpp)->b_ops = cur->bc_ops->buf_ops;
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
1256 
1257 /*
1258  * Read in the buffer at the given ptr and return the buffer and
1259  * the block pointer within the buffer.
1260  */
STATIC int
xfs_btree_read_buf_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			flags,
	struct xfs_btree_block	**block,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	/* Read the block through the transaction, verified by buf_ops. */
	d = xfs_btree_ptr_to_daddr(cur, ptr);
	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
				   mp->m_bsize, flags, bpp,
				   cur->bc_ops->buf_ops);
	if (error)
		return error;

	/* Bump the buffer's cache priority for this btree type. */
	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
1287 
1288 /*
1289  * Copy keys from one btree block to another.
1290  */
1291 STATIC void
xfs_btree_copy_keys(struct xfs_btree_cur * cur,union xfs_btree_key * dst_key,union xfs_btree_key * src_key,int numkeys)1292 xfs_btree_copy_keys(
1293 	struct xfs_btree_cur	*cur,
1294 	union xfs_btree_key	*dst_key,
1295 	union xfs_btree_key	*src_key,
1296 	int			numkeys)
1297 {
1298 	ASSERT(numkeys >= 0);
1299 	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1300 }
1301 
1302 /*
1303  * Copy records from one btree block to another.
1304  */
1305 STATIC void
xfs_btree_copy_recs(struct xfs_btree_cur * cur,union xfs_btree_rec * dst_rec,union xfs_btree_rec * src_rec,int numrecs)1306 xfs_btree_copy_recs(
1307 	struct xfs_btree_cur	*cur,
1308 	union xfs_btree_rec	*dst_rec,
1309 	union xfs_btree_rec	*src_rec,
1310 	int			numrecs)
1311 {
1312 	ASSERT(numrecs >= 0);
1313 	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1314 }
1315 
1316 /*
1317  * Copy block pointers from one btree block to another.
1318  */
1319 STATIC void
xfs_btree_copy_ptrs(struct xfs_btree_cur * cur,union xfs_btree_ptr * dst_ptr,union xfs_btree_ptr * src_ptr,int numptrs)1320 xfs_btree_copy_ptrs(
1321 	struct xfs_btree_cur	*cur,
1322 	union xfs_btree_ptr	*dst_ptr,
1323 	union xfs_btree_ptr	*src_ptr,
1324 	int			numptrs)
1325 {
1326 	ASSERT(numptrs >= 0);
1327 	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1328 }
1329 
1330 /*
1331  * Shift keys one index left/right inside a single btree block.
1332  */
1333 STATIC void
xfs_btree_shift_keys(struct xfs_btree_cur * cur,union xfs_btree_key * key,int dir,int numkeys)1334 xfs_btree_shift_keys(
1335 	struct xfs_btree_cur	*cur,
1336 	union xfs_btree_key	*key,
1337 	int			dir,
1338 	int			numkeys)
1339 {
1340 	char			*dst_key;
1341 
1342 	ASSERT(numkeys >= 0);
1343 	ASSERT(dir == 1 || dir == -1);
1344 
1345 	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1346 	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1347 }
1348 
1349 /*
1350  * Shift records one index left/right inside a single btree block.
1351  */
1352 STATIC void
xfs_btree_shift_recs(struct xfs_btree_cur * cur,union xfs_btree_rec * rec,int dir,int numrecs)1353 xfs_btree_shift_recs(
1354 	struct xfs_btree_cur	*cur,
1355 	union xfs_btree_rec	*rec,
1356 	int			dir,
1357 	int			numrecs)
1358 {
1359 	char			*dst_rec;
1360 
1361 	ASSERT(numrecs >= 0);
1362 	ASSERT(dir == 1 || dir == -1);
1363 
1364 	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1365 	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1366 }
1367 
1368 /*
1369  * Shift block pointers one index left/right inside a single btree block.
1370  */
1371 STATIC void
xfs_btree_shift_ptrs(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr,int dir,int numptrs)1372 xfs_btree_shift_ptrs(
1373 	struct xfs_btree_cur	*cur,
1374 	union xfs_btree_ptr	*ptr,
1375 	int			dir,
1376 	int			numptrs)
1377 {
1378 	char			*dst_ptr;
1379 
1380 	ASSERT(numptrs >= 0);
1381 	ASSERT(dir == 1 || dir == -1);
1382 
1383 	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1384 	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1385 }
1386 
1387 /*
1388  * Log key values from the btree block.
1389  */
STATIC void
xfs_btree_log_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,	/* index of first key to log */
	int			last)	/* index of last key to log */
{
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);

	if (bp) {
		/* Log the byte range covering keys [first, last]. */
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				  xfs_btree_key_offset(cur, first),
				  xfs_btree_key_offset(cur, last + 1) - 1);
	} else {
		/* No buffer: root lives in the inode fork, log that instead. */
		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1412 
1413 /*
1414  * Log record values from the btree block.
1415  */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,	/* index of first record to log */
	int			last)	/* index of last record to log */
{
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);

	/*
	 * Records only exist in leaf blocks with real buffers, so unlike
	 * the key/ptr variants there is no inode-root fallback here.
	 */
	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(cur->bc_tp, bp,
			  xfs_btree_rec_offset(cur, first),
			  xfs_btree_rec_offset(cur, last + 1) - 1);

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1433 
1434 /*
1435  * Log block pointer fields from a btree block (nonleaf).
1436  */
STATIC void
xfs_btree_log_ptrs(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			first,	/* index of first pointer to log */
	int			last)	/* index of last pointer to log */
{
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);

	if (bp) {
		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
		int			level = xfs_btree_get_level(block);

		/* Ptr offsets depend on the level's maxrecs, hence @level. */
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_ptr_offset(cur, first, level),
				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
	} else {
		/* No buffer: root lives in the inode fork, log that instead. */
		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1462 
1463 /*
1464  * Log fields from a btree block header.
1465  */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	/*
	 * These offset tables must stay in the same order as the XFS_BB_*
	 * field bits; each entry is the start of one header field, with a
	 * trailing sentinel giving the total header length.
	 */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);

	if (bp) {
		int nbits;

		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a crc
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		/* Translate the field bitmask into a byte range to log. */
		xfs_btree_offsets(fields,
				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
					loffsets : soffsets,
				  nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		/* No buffer: root lives in the inode fork, log that instead. */
		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1535 
1536 /*
1537  * Increment cursor by one record at the level.
1538  * For nonzero levels the leaf-ward information is untouched.
1539  */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			goto out0;
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		/* Descend via the leftmost entry of each lower block. */
		cur->bc_ptrs[lev] = 1;
	}
out1:
	/* Cursor now points to the next record; report success. */
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	/* Off the right edge of the tree: no next record, not an error. */
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1643 
1644 /*
1645  * Decrement cursor by one record at the level.
1646  * For nonzero levels the leaf-ward information is untouched.
1647  */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	xfs_buf_t		*bp;
	int			error;		/* error return value */
	int			lev;
	union xfs_btree_ptr	ptr;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_ptrs[level] > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_ptrs[lev] > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are seriously confused.
	 * or the root of the tree is in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			goto out0;
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		/* Descend via the rightmost entry of each lower block. */
		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
	}
out1:
	/* Cursor now points to the previous record; report success. */
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	/* Off the left edge of the tree: no previous record, not an error. */
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1741 
/*
 * Fetch the btree block at @pp for a lookup at @level, reusing the
 * cursor's cached buffer when it already holds the right disk address.
 * Basic sanity checks (owner, level, numrecs) are applied here because
 * the buffer verifiers cannot make them.
 */
STATIC int
xfs_btree_lookup_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in the btree */
	union xfs_btree_ptr	*pp,	/* ptr to btree block */
	struct xfs_btree_block	**blkp) /* return btree block */
{
	struct xfs_buf		*bp;	/* buffer pointer for btree block */
	int			error = 0;

	/* special case the root block if in an inode */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level == cur->bc_nlevels - 1)) {
		*blkp = xfs_btree_get_iroot(cur);
		return 0;
	}

	/*
	 * If the old buffer at this level for the disk address we are
	 * looking for re-use it.
	 *
	 * Otherwise throw it away and get a new one.
	 */
	bp = cur->bc_bufs[level];
	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
		*blkp = XFS_BUF_TO_BLOCK(bp);
		return 0;
	}

	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
	if (error)
		return error;

	/* Check the inode owner since the verifiers don't. */
	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
	    !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
			cur->bc_private.b.ip->i_ino)
		goto out_bad;

	/* Did we get the level we were looking for? */
	if (be16_to_cpu((*blkp)->bb_level) != level)
		goto out_bad;

	/* Check that internal nodes have at least one record. */
	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
		goto out_bad;

	/* Cache the new buffer in the cursor for this level. */
	xfs_btree_setbuf(cur, level, bp);
	return 0;

out_bad:
	/* Sanity checks failed: release the buffer and flag corruption. */
	*blkp = NULL;
	xfs_trans_brelse(cur->bc_tp, bp);
	return -EFSCORRUPTED;
}
1799 
1800 /*
1801  * Get current search key.  For level 0 we don't actually have a key
1802  * structure so we make one up from the record.  For all other levels
1803  * we just return the right key.
1804  */
1805 STATIC union xfs_btree_key *
xfs_lookup_get_search_key(struct xfs_btree_cur * cur,int level,int keyno,struct xfs_btree_block * block,union xfs_btree_key * kp)1806 xfs_lookup_get_search_key(
1807 	struct xfs_btree_cur	*cur,
1808 	int			level,
1809 	int			keyno,
1810 	struct xfs_btree_block	*block,
1811 	union xfs_btree_key	*kp)
1812 {
1813 	if (level == 0) {
1814 		cur->bc_ops->init_key_from_rec(kp,
1815 				xfs_btree_rec_addr(cur, keyno, block));
1816 		return kp;
1817 	}
1818 
1819 	return xfs_btree_key_addr(cur, keyno, block);
1820 }
1821 
1822 /*
1823  * Lookup the record.  The cursor is made to point to it, based on dir.
1824  * stat is set to 0 if can't find any such record, 1 for success.
1825  */
1826 int					/* error */
xfs_btree_lookup(struct xfs_btree_cur * cur,xfs_lookup_t dir,int * stat)1827 xfs_btree_lookup(
1828 	struct xfs_btree_cur	*cur,	/* btree cursor */
1829 	xfs_lookup_t		dir,	/* <=, ==, or >= */
1830 	int			*stat)	/* success/failure */
1831 {
1832 	struct xfs_btree_block	*block;	/* current btree block */
1833 	__int64_t		diff;	/* difference for the current key */
1834 	int			error;	/* error return value */
1835 	int			keyno;	/* current key number */
1836 	int			level;	/* level in the btree */
1837 	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1838 	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1839 
1840 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1841 	XFS_BTREE_TRACE_ARGI(cur, dir);
1842 
1843 	XFS_BTREE_STATS_INC(cur, lookup);
1844 
1845 	/* No such thing as a zero-level tree. */
1846 	if (cur->bc_nlevels == 0)
1847 		return -EFSCORRUPTED;
1848 
1849 	block = NULL;
1850 	keyno = 0;
1851 
1852 	/* initialise start pointer from cursor */
1853 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1854 	pp = &ptr;
1855 
1856 	/*
1857 	 * Iterate over each level in the btree, starting at the root.
1858 	 * For each level above the leaves, find the key we need, based
1859 	 * on the lookup record, then follow the corresponding block
1860 	 * pointer down to the next level.
1861 	 */
1862 	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1863 		/* Get the block we need to do the lookup on. */
1864 		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1865 		if (error)
1866 			goto error0;
1867 
1868 		if (diff == 0) {
1869 			/*
1870 			 * If we already had a key match at a higher level, we
1871 			 * know we need to use the first entry in this block.
1872 			 */
1873 			keyno = 1;
1874 		} else {
1875 			/* Otherwise search this block. Do a binary search. */
1876 
1877 			int	high;	/* high entry number */
1878 			int	low;	/* low entry number */
1879 
1880 			/* Set low and high entry numbers, 1-based. */
1881 			low = 1;
1882 			high = xfs_btree_get_numrecs(block);
1883 			if (!high) {
1884 				/* Block is empty, must be an empty leaf. */
1885 				ASSERT(level == 0 && cur->bc_nlevels == 1);
1886 
1887 				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1888 				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1889 				*stat = 0;
1890 				return 0;
1891 			}
1892 
1893 			/* Binary search the block. */
1894 			while (low <= high) {
1895 				union xfs_btree_key	key;
1896 				union xfs_btree_key	*kp;
1897 
1898 				XFS_BTREE_STATS_INC(cur, compare);
1899 
1900 				/* keyno is average of low and high. */
1901 				keyno = (low + high) >> 1;
1902 
1903 				/* Get current search key */
1904 				kp = xfs_lookup_get_search_key(cur, level,
1905 						keyno, block, &key);
1906 
1907 				/*
1908 				 * Compute difference to get next direction:
1909 				 *  - less than, move right
1910 				 *  - greater than, move left
1911 				 *  - equal, we're done
1912 				 */
1913 				diff = cur->bc_ops->key_diff(cur, kp);
1914 				if (diff < 0)
1915 					low = keyno + 1;
1916 				else if (diff > 0)
1917 					high = keyno - 1;
1918 				else
1919 					break;
1920 			}
1921 		}
1922 
1923 		/*
1924 		 * If there are more levels, set up for the next level
1925 		 * by getting the block number and filling in the cursor.
1926 		 */
1927 		if (level > 0) {
1928 			/*
1929 			 * If we moved left, need the previous key number,
1930 			 * unless there isn't one.
1931 			 */
1932 			if (diff > 0 && --keyno < 1)
1933 				keyno = 1;
1934 			pp = xfs_btree_ptr_addr(cur, keyno, block);
1935 
1936 #ifdef DEBUG
1937 			error = xfs_btree_check_ptr(cur, pp, 0, level);
1938 			if (error)
1939 				goto error0;
1940 #endif
1941 			cur->bc_ptrs[level] = keyno;
1942 		}
1943 	}
1944 
1945 	/* Done with the search. See if we need to adjust the results. */
1946 	if (dir != XFS_LOOKUP_LE && diff < 0) {
1947 		keyno++;
1948 		/*
1949 		 * If ge search and we went off the end of the block, but it's
1950 		 * not the last block, we're in the wrong block.
1951 		 */
1952 		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1953 		if (dir == XFS_LOOKUP_GE &&
1954 		    keyno > xfs_btree_get_numrecs(block) &&
1955 		    !xfs_btree_ptr_is_null(cur, &ptr)) {
1956 			int	i;
1957 
1958 			cur->bc_ptrs[0] = keyno;
1959 			error = xfs_btree_increment(cur, 0, &i);
1960 			if (error)
1961 				goto error0;
1962 			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1963 			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1964 			*stat = 1;
1965 			return 0;
1966 		}
1967 	} else if (dir == XFS_LOOKUP_LE && diff > 0)
1968 		keyno--;
1969 	cur->bc_ptrs[0] = keyno;
1970 
1971 	/* Return if we succeeded or not. */
1972 	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1973 		*stat = 0;
1974 	else if (dir != XFS_LOOKUP_EQ || diff == 0)
1975 		*stat = 1;
1976 	else
1977 		*stat = 0;
1978 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1979 	return 0;
1980 
1981 error0:
1982 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1983 	return error;
1984 }
1985 
1986 /* Find the high key storage area from a regular key. */
1987 STATIC union xfs_btree_key *
xfs_btree_high_key_from_key(struct xfs_btree_cur * cur,union xfs_btree_key * key)1988 xfs_btree_high_key_from_key(
1989 	struct xfs_btree_cur	*cur,
1990 	union xfs_btree_key	*key)
1991 {
1992 	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1993 	return (union xfs_btree_key *)((char *)key +
1994 			(cur->bc_ops->key_len / 2));
1995 }
1996 
1997 /* Determine the low (and high if overlapped) keys of a leaf block */
1998 STATIC void
xfs_btree_get_leaf_keys(struct xfs_btree_cur * cur,struct xfs_btree_block * block,union xfs_btree_key * key)1999 xfs_btree_get_leaf_keys(
2000 	struct xfs_btree_cur	*cur,
2001 	struct xfs_btree_block	*block,
2002 	union xfs_btree_key	*key)
2003 {
2004 	union xfs_btree_key	max_hkey;
2005 	union xfs_btree_key	hkey;
2006 	union xfs_btree_rec	*rec;
2007 	union xfs_btree_key	*high;
2008 	int			n;
2009 
2010 	rec = xfs_btree_rec_addr(cur, 1, block);
2011 	cur->bc_ops->init_key_from_rec(key, rec);
2012 
2013 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2014 
2015 		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2016 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2017 			rec = xfs_btree_rec_addr(cur, n, block);
2018 			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2019 			if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2020 					> 0)
2021 				max_hkey = hkey;
2022 		}
2023 
2024 		high = xfs_btree_high_key_from_key(cur, key);
2025 		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2026 	}
2027 }
2028 
2029 /* Determine the low (and high if overlapped) keys of a node block */
2030 STATIC void
xfs_btree_get_node_keys(struct xfs_btree_cur * cur,struct xfs_btree_block * block,union xfs_btree_key * key)2031 xfs_btree_get_node_keys(
2032 	struct xfs_btree_cur	*cur,
2033 	struct xfs_btree_block	*block,
2034 	union xfs_btree_key	*key)
2035 {
2036 	union xfs_btree_key	*hkey;
2037 	union xfs_btree_key	*max_hkey;
2038 	union xfs_btree_key	*high;
2039 	int			n;
2040 
2041 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2042 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2043 				cur->bc_ops->key_len / 2);
2044 
2045 		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2046 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2047 			hkey = xfs_btree_high_key_addr(cur, n, block);
2048 			if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2049 				max_hkey = hkey;
2050 		}
2051 
2052 		high = xfs_btree_high_key_from_key(cur, key);
2053 		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2054 	} else {
2055 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2056 				cur->bc_ops->key_len);
2057 	}
2058 }
2059 
2060 /* Derive the keys for any btree block. */
2061 STATIC void
xfs_btree_get_keys(struct xfs_btree_cur * cur,struct xfs_btree_block * block,union xfs_btree_key * key)2062 xfs_btree_get_keys(
2063 	struct xfs_btree_cur	*cur,
2064 	struct xfs_btree_block	*block,
2065 	union xfs_btree_key	*key)
2066 {
2067 	if (be16_to_cpu(block->bb_level) == 0)
2068 		xfs_btree_get_leaf_keys(cur, block, key);
2069 	else
2070 		xfs_btree_get_node_keys(cur, block, key);
2071 }
2072 
2073 /*
2074  * Decide if we need to update the parent keys of a btree block.  For
2075  * a standard btree this is only necessary if we're updating the first
2076  * record/key.  For an overlapping btree, we must always update the
2077  * keys because the highest key can be in any of the records or keys
2078  * in the block.
2079  */
2080 static inline bool
xfs_btree_needs_key_update(struct xfs_btree_cur * cur,int ptr)2081 xfs_btree_needs_key_update(
2082 	struct xfs_btree_cur	*cur,
2083 	int			ptr)
2084 {
2085 	return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2086 }
2087 
2088 /*
2089  * Update the low and high parent keys of the given level, progressing
2090  * towards the root.  If force_all is false, stop if the keys for a given
2091  * level do not need updating.
2092  */
2093 STATIC int
__xfs_btree_updkeys(struct xfs_btree_cur * cur,int level,struct xfs_btree_block * block,struct xfs_buf * bp0,bool force_all)2094 __xfs_btree_updkeys(
2095 	struct xfs_btree_cur	*cur,
2096 	int			level,
2097 	struct xfs_btree_block	*block,
2098 	struct xfs_buf		*bp0,
2099 	bool			force_all)
2100 {
2101 	union xfs_btree_key	key;	/* keys from current level */
2102 	union xfs_btree_key	*lkey;	/* keys from the next level up */
2103 	union xfs_btree_key	*hkey;
2104 	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2105 	union xfs_btree_key	*nhkey;
2106 	struct xfs_buf		*bp;
2107 	int			ptr;
2108 
2109 	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2110 
2111 	/* Exit if there aren't any parent levels to update. */
2112 	if (level + 1 >= cur->bc_nlevels)
2113 		return 0;
2114 
2115 	trace_xfs_btree_updkeys(cur, level, bp0);
2116 
2117 	lkey = &key;
2118 	hkey = xfs_btree_high_key_from_key(cur, lkey);
2119 	xfs_btree_get_keys(cur, block, lkey);
2120 	for (level++; level < cur->bc_nlevels; level++) {
2121 #ifdef DEBUG
2122 		int		error;
2123 #endif
2124 		block = xfs_btree_get_block(cur, level, &bp);
2125 		trace_xfs_btree_updkeys(cur, level, bp);
2126 #ifdef DEBUG
2127 		error = xfs_btree_check_block(cur, block, level, bp);
2128 		if (error) {
2129 			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2130 			return error;
2131 		}
2132 #endif
2133 		ptr = cur->bc_ptrs[level];
2134 		nlkey = xfs_btree_key_addr(cur, ptr, block);
2135 		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2136 		if (!force_all &&
2137 		    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2138 		      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2139 			break;
2140 		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2141 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2142 		if (level + 1 >= cur->bc_nlevels)
2143 			break;
2144 		xfs_btree_get_node_keys(cur, block, lkey);
2145 	}
2146 
2147 	return 0;
2148 }
2149 
2150 /* Update all the keys from some level in cursor back to the root. */
2151 STATIC int
xfs_btree_updkeys_force(struct xfs_btree_cur * cur,int level)2152 xfs_btree_updkeys_force(
2153 	struct xfs_btree_cur	*cur,
2154 	int			level)
2155 {
2156 	struct xfs_buf		*bp;
2157 	struct xfs_btree_block	*block;
2158 
2159 	block = xfs_btree_get_block(cur, level, &bp);
2160 	return __xfs_btree_updkeys(cur, level, block, bp, true);
2161 }
2162 
2163 /*
2164  * Update the parent keys of the given level, progressing towards the root.
2165  */
2166 STATIC int
xfs_btree_update_keys(struct xfs_btree_cur * cur,int level)2167 xfs_btree_update_keys(
2168 	struct xfs_btree_cur	*cur,
2169 	int			level)
2170 {
2171 	struct xfs_btree_block	*block;
2172 	struct xfs_buf		*bp;
2173 	union xfs_btree_key	*kp;
2174 	union xfs_btree_key	key;
2175 	int			ptr;
2176 
2177 	ASSERT(level >= 0);
2178 
2179 	block = xfs_btree_get_block(cur, level, &bp);
2180 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2181 		return __xfs_btree_updkeys(cur, level, block, bp, false);
2182 
2183 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2184 	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
2185 
2186 	/*
2187 	 * Go up the tree from this level toward the root.
2188 	 * At each level, update the key value to the value input.
2189 	 * Stop when we reach a level where the cursor isn't pointing
2190 	 * at the first entry in the block.
2191 	 */
2192 	xfs_btree_get_keys(cur, block, &key);
2193 	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2194 #ifdef DEBUG
2195 		int		error;
2196 #endif
2197 		block = xfs_btree_get_block(cur, level, &bp);
2198 #ifdef DEBUG
2199 		error = xfs_btree_check_block(cur, block, level, bp);
2200 		if (error) {
2201 			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2202 			return error;
2203 		}
2204 #endif
2205 		ptr = cur->bc_ptrs[level];
2206 		kp = xfs_btree_key_addr(cur, ptr, block);
2207 		xfs_btree_copy_keys(cur, kp, &key, 1);
2208 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2209 	}
2210 
2211 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2212 	return 0;
2213 }
2214 
2215 /*
2216  * Update the record referred to by cur to the value in the
2217  * given record. This either works (return 0) or gets an
2218  * EFSCORRUPTED error.
2219  */
2220 int
xfs_btree_update(struct xfs_btree_cur * cur,union xfs_btree_rec * rec)2221 xfs_btree_update(
2222 	struct xfs_btree_cur	*cur,
2223 	union xfs_btree_rec	*rec)
2224 {
2225 	struct xfs_btree_block	*block;
2226 	struct xfs_buf		*bp;
2227 	int			error;
2228 	int			ptr;
2229 	union xfs_btree_rec	*rp;
2230 
2231 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2232 	XFS_BTREE_TRACE_ARGR(cur, rec);
2233 
2234 	/* Pick up the current block. */
2235 	block = xfs_btree_get_block(cur, 0, &bp);
2236 
2237 #ifdef DEBUG
2238 	error = xfs_btree_check_block(cur, block, 0, bp);
2239 	if (error)
2240 		goto error0;
2241 #endif
2242 	/* Get the address of the rec to be updated. */
2243 	ptr = cur->bc_ptrs[0];
2244 	rp = xfs_btree_rec_addr(cur, ptr, block);
2245 
2246 	/* Fill in the new contents and log them. */
2247 	xfs_btree_copy_recs(cur, rp, rec, 1);
2248 	xfs_btree_log_recs(cur, bp, ptr, ptr);
2249 
2250 	/*
2251 	 * If we are tracking the last record in the tree and
2252 	 * we are at the far right edge of the tree, update it.
2253 	 */
2254 	if (xfs_btree_is_lastrec(cur, block, 0)) {
2255 		cur->bc_ops->update_lastrec(cur, block, rec,
2256 					    ptr, LASTREC_UPDATE);
2257 	}
2258 
2259 	/* Pass new key value up to our parent. */
2260 	if (xfs_btree_needs_key_update(cur, ptr)) {
2261 		error = xfs_btree_update_keys(cur, 0);
2262 		if (error)
2263 			goto error0;
2264 	}
2265 
2266 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2267 	return 0;
2268 
2269 error0:
2270 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2271 	return error;
2272 }
2273 
2274 /*
2275  * Move 1 record left from cur/level if possible.
2276  * Update cur to reflect the new path.
2277  */
2278 STATIC int					/* error */
xfs_btree_lshift(struct xfs_btree_cur * cur,int level,int * stat)2279 xfs_btree_lshift(
2280 	struct xfs_btree_cur	*cur,
2281 	int			level,
2282 	int			*stat)		/* success/failure */
2283 {
2284 	struct xfs_buf		*lbp;		/* left buffer pointer */
2285 	struct xfs_btree_block	*left;		/* left btree block */
2286 	int			lrecs;		/* left record count */
2287 	struct xfs_buf		*rbp;		/* right buffer pointer */
2288 	struct xfs_btree_block	*right;		/* right btree block */
2289 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2290 	int			rrecs;		/* right record count */
2291 	union xfs_btree_ptr	lptr;		/* left btree pointer */
2292 	union xfs_btree_key	*rkp = NULL;	/* right btree key */
2293 	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
2294 	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
2295 	int			error;		/* error return value */
2296 	int			i;
2297 
2298 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2299 	XFS_BTREE_TRACE_ARGI(cur, level);
2300 
2301 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2302 	    level == cur->bc_nlevels - 1)
2303 		goto out0;
2304 
2305 	/* Set up variables for this block as "right". */
2306 	right = xfs_btree_get_block(cur, level, &rbp);
2307 
2308 #ifdef DEBUG
2309 	error = xfs_btree_check_block(cur, right, level, rbp);
2310 	if (error)
2311 		goto error0;
2312 #endif
2313 
2314 	/* If we've got no left sibling then we can't shift an entry left. */
2315 	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2316 	if (xfs_btree_ptr_is_null(cur, &lptr))
2317 		goto out0;
2318 
2319 	/*
2320 	 * If the cursor entry is the one that would be moved, don't
2321 	 * do it... it's too complicated.
2322 	 */
2323 	if (cur->bc_ptrs[level] <= 1)
2324 		goto out0;
2325 
2326 	/* Set up the left neighbor as "left". */
2327 	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2328 	if (error)
2329 		goto error0;
2330 
2331 	/* If it's full, it can't take another entry. */
2332 	lrecs = xfs_btree_get_numrecs(left);
2333 	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2334 		goto out0;
2335 
2336 	rrecs = xfs_btree_get_numrecs(right);
2337 
2338 	/*
2339 	 * We add one entry to the left side and remove one for the right side.
2340 	 * Account for it here, the changes will be updated on disk and logged
2341 	 * later.
2342 	 */
2343 	lrecs++;
2344 	rrecs--;
2345 
2346 	XFS_BTREE_STATS_INC(cur, lshift);
2347 	XFS_BTREE_STATS_ADD(cur, moves, 1);
2348 
2349 	/*
2350 	 * If non-leaf, copy a key and a ptr to the left block.
2351 	 * Log the changes to the left block.
2352 	 */
2353 	if (level > 0) {
2354 		/* It's a non-leaf.  Move keys and pointers. */
2355 		union xfs_btree_key	*lkp;	/* left btree key */
2356 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2357 
2358 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2359 		rkp = xfs_btree_key_addr(cur, 1, right);
2360 
2361 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2362 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2363 #ifdef DEBUG
2364 		error = xfs_btree_check_ptr(cur, rpp, 0, level);
2365 		if (error)
2366 			goto error0;
2367 #endif
2368 		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2369 		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2370 
2371 		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2372 		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2373 
2374 		ASSERT(cur->bc_ops->keys_inorder(cur,
2375 			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2376 	} else {
2377 		/* It's a leaf.  Move records.  */
2378 		union xfs_btree_rec	*lrp;	/* left record pointer */
2379 
2380 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2381 		rrp = xfs_btree_rec_addr(cur, 1, right);
2382 
2383 		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2384 		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2385 
2386 		ASSERT(cur->bc_ops->recs_inorder(cur,
2387 			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2388 	}
2389 
2390 	xfs_btree_set_numrecs(left, lrecs);
2391 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2392 
2393 	xfs_btree_set_numrecs(right, rrecs);
2394 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2395 
2396 	/*
2397 	 * Slide the contents of right down one entry.
2398 	 */
2399 	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2400 	if (level > 0) {
2401 		/* It's a nonleaf. operate on keys and ptrs */
2402 #ifdef DEBUG
2403 		int			i;		/* loop index */
2404 
2405 		for (i = 0; i < rrecs; i++) {
2406 			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2407 			if (error)
2408 				goto error0;
2409 		}
2410 #endif
2411 		xfs_btree_shift_keys(cur,
2412 				xfs_btree_key_addr(cur, 2, right),
2413 				-1, rrecs);
2414 		xfs_btree_shift_ptrs(cur,
2415 				xfs_btree_ptr_addr(cur, 2, right),
2416 				-1, rrecs);
2417 
2418 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2419 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2420 	} else {
2421 		/* It's a leaf. operate on records */
2422 		xfs_btree_shift_recs(cur,
2423 			xfs_btree_rec_addr(cur, 2, right),
2424 			-1, rrecs);
2425 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2426 	}
2427 
2428 	/*
2429 	 * Using a temporary cursor, update the parent key values of the
2430 	 * block on the left.
2431 	 */
2432 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2433 		error = xfs_btree_dup_cursor(cur, &tcur);
2434 		if (error)
2435 			goto error0;
2436 		i = xfs_btree_firstrec(tcur, level);
2437 		XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2438 
2439 		error = xfs_btree_decrement(tcur, level, &i);
2440 		if (error)
2441 			goto error1;
2442 
2443 		/* Update the parent high keys of the left block, if needed. */
2444 		error = xfs_btree_update_keys(tcur, level);
2445 		if (error)
2446 			goto error1;
2447 
2448 		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2449 	}
2450 
2451 	/* Update the parent keys of the right block. */
2452 	error = xfs_btree_update_keys(cur, level);
2453 	if (error)
2454 		goto error0;
2455 
2456 	/* Slide the cursor value left one. */
2457 	cur->bc_ptrs[level]--;
2458 
2459 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 	*stat = 1;
2461 	return 0;
2462 
2463 out0:
2464 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2465 	*stat = 0;
2466 	return 0;
2467 
2468 error0:
2469 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2470 	return error;
2471 
2472 error1:
2473 	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2474 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2475 	return error;
2476 }
2477 
2478 /*
2479  * Move 1 record right from cur/level if possible.
2480  * Update cur to reflect the new path.
2481  */
2482 STATIC int					/* error */
xfs_btree_rshift(struct xfs_btree_cur * cur,int level,int * stat)2483 xfs_btree_rshift(
2484 	struct xfs_btree_cur	*cur,
2485 	int			level,
2486 	int			*stat)		/* success/failure */
2487 {
2488 	struct xfs_buf		*lbp;		/* left buffer pointer */
2489 	struct xfs_btree_block	*left;		/* left btree block */
2490 	struct xfs_buf		*rbp;		/* right buffer pointer */
2491 	struct xfs_btree_block	*right;		/* right btree block */
2492 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2493 	union xfs_btree_ptr	rptr;		/* right block pointer */
2494 	union xfs_btree_key	*rkp;		/* right btree key */
2495 	int			rrecs;		/* right record count */
2496 	int			lrecs;		/* left record count */
2497 	int			error;		/* error return value */
2498 	int			i;		/* loop counter */
2499 
2500 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2501 	XFS_BTREE_TRACE_ARGI(cur, level);
2502 
2503 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2504 	    (level == cur->bc_nlevels - 1))
2505 		goto out0;
2506 
2507 	/* Set up variables for this block as "left". */
2508 	left = xfs_btree_get_block(cur, level, &lbp);
2509 
2510 #ifdef DEBUG
2511 	error = xfs_btree_check_block(cur, left, level, lbp);
2512 	if (error)
2513 		goto error0;
2514 #endif
2515 
2516 	/* If we've got no right sibling then we can't shift an entry right. */
2517 	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2518 	if (xfs_btree_ptr_is_null(cur, &rptr))
2519 		goto out0;
2520 
2521 	/*
2522 	 * If the cursor entry is the one that would be moved, don't
2523 	 * do it... it's too complicated.
2524 	 */
2525 	lrecs = xfs_btree_get_numrecs(left);
2526 	if (cur->bc_ptrs[level] >= lrecs)
2527 		goto out0;
2528 
2529 	/* Set up the right neighbor as "right". */
2530 	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2531 	if (error)
2532 		goto error0;
2533 
2534 	/* If it's full, it can't take another entry. */
2535 	rrecs = xfs_btree_get_numrecs(right);
2536 	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2537 		goto out0;
2538 
2539 	XFS_BTREE_STATS_INC(cur, rshift);
2540 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2541 
2542 	/*
2543 	 * Make a hole at the start of the right neighbor block, then
2544 	 * copy the last left block entry to the hole.
2545 	 */
2546 	if (level > 0) {
2547 		/* It's a nonleaf. make a hole in the keys and ptrs */
2548 		union xfs_btree_key	*lkp;
2549 		union xfs_btree_ptr	*lpp;
2550 		union xfs_btree_ptr	*rpp;
2551 
2552 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2553 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2554 		rkp = xfs_btree_key_addr(cur, 1, right);
2555 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2556 
2557 #ifdef DEBUG
2558 		for (i = rrecs - 1; i >= 0; i--) {
2559 			error = xfs_btree_check_ptr(cur, rpp, i, level);
2560 			if (error)
2561 				goto error0;
2562 		}
2563 #endif
2564 
2565 		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2566 		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2567 
2568 #ifdef DEBUG
2569 		error = xfs_btree_check_ptr(cur, lpp, 0, level);
2570 		if (error)
2571 			goto error0;
2572 #endif
2573 
2574 		/* Now put the new data in, and log it. */
2575 		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2576 		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2577 
2578 		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2579 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2580 
2581 		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2582 			xfs_btree_key_addr(cur, 2, right)));
2583 	} else {
2584 		/* It's a leaf. make a hole in the records */
2585 		union xfs_btree_rec	*lrp;
2586 		union xfs_btree_rec	*rrp;
2587 
2588 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2589 		rrp = xfs_btree_rec_addr(cur, 1, right);
2590 
2591 		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2592 
2593 		/* Now put the new data in, and log it. */
2594 		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2595 		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2596 	}
2597 
2598 	/*
2599 	 * Decrement and log left's numrecs, bump and log right's numrecs.
2600 	 */
2601 	xfs_btree_set_numrecs(left, --lrecs);
2602 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2603 
2604 	xfs_btree_set_numrecs(right, ++rrecs);
2605 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2606 
2607 	/*
2608 	 * Using a temporary cursor, update the parent key values of the
2609 	 * block on the right.
2610 	 */
2611 	error = xfs_btree_dup_cursor(cur, &tcur);
2612 	if (error)
2613 		goto error0;
2614 	i = xfs_btree_lastrec(tcur, level);
2615 	XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2616 
2617 	error = xfs_btree_increment(tcur, level, &i);
2618 	if (error)
2619 		goto error1;
2620 
2621 	/* Update the parent high keys of the left block, if needed. */
2622 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2623 		error = xfs_btree_update_keys(cur, level);
2624 		if (error)
2625 			goto error1;
2626 	}
2627 
2628 	/* Update the parent keys of the right block. */
2629 	error = xfs_btree_update_keys(tcur, level);
2630 	if (error)
2631 		goto error1;
2632 
2633 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2634 
2635 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2636 	*stat = 1;
2637 	return 0;
2638 
2639 out0:
2640 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2641 	*stat = 0;
2642 	return 0;
2643 
2644 error0:
2645 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2646 	return error;
2647 
2648 error1:
2649 	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2650 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2651 	return error;
2652 }
2653 
2654 /*
2655  * Split cur/level block in half.
2656  * Return new block number and the key to its first
2657  * record (to be inserted into parent).
2658  */
2659 STATIC int					/* error */
__xfs_btree_split(struct xfs_btree_cur * cur,int level,union xfs_btree_ptr * ptrp,union xfs_btree_key * key,struct xfs_btree_cur ** curp,int * stat)2660 __xfs_btree_split(
2661 	struct xfs_btree_cur	*cur,
2662 	int			level,
2663 	union xfs_btree_ptr	*ptrp,
2664 	union xfs_btree_key	*key,
2665 	struct xfs_btree_cur	**curp,
2666 	int			*stat)		/* success/failure */
2667 {
2668 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2669 	struct xfs_buf		*lbp;		/* left buffer pointer */
2670 	struct xfs_btree_block	*left;		/* left btree block */
2671 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2672 	struct xfs_buf		*rbp;		/* right buffer pointer */
2673 	struct xfs_btree_block	*right;		/* right btree block */
2674 	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2675 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2676 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2677 	int			lrecs;
2678 	int			rrecs;
2679 	int			src_index;
2680 	int			error;		/* error return value */
2681 #ifdef DEBUG
2682 	int			i;
2683 #endif
2684 
2685 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2686 	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2687 
2688 	XFS_BTREE_STATS_INC(cur, split);
2689 
2690 	/* Set up left block (current one). */
2691 	left = xfs_btree_get_block(cur, level, &lbp);
2692 
2693 #ifdef DEBUG
2694 	error = xfs_btree_check_block(cur, left, level, lbp);
2695 	if (error)
2696 		goto error0;
2697 #endif
2698 
2699 	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2700 
2701 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2702 	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2703 	if (error)
2704 		goto error0;
2705 	if (*stat == 0)
2706 		goto out0;
2707 	XFS_BTREE_STATS_INC(cur, alloc);
2708 
2709 	/* Set up the new block as "right". */
2710 	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2711 	if (error)
2712 		goto error0;
2713 
2714 	/* Fill in the btree header for the new right block. */
2715 	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2716 
2717 	/*
2718 	 * Split the entries between the old and the new block evenly.
2719 	 * Make sure that if there's an odd number of entries now, that
2720 	 * each new block will have the same number of entries.
2721 	 */
2722 	lrecs = xfs_btree_get_numrecs(left);
2723 	rrecs = lrecs / 2;
2724 	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2725 		rrecs++;
2726 	src_index = (lrecs - rrecs + 1);
2727 
2728 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2729 
2730 	/* Adjust numrecs for the later get_*_keys() calls. */
2731 	lrecs -= rrecs;
2732 	xfs_btree_set_numrecs(left, lrecs);
2733 	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2734 
2735 	/*
2736 	 * Copy btree block entries from the left block over to the
2737 	 * new block, the right. Update the right block and log the
2738 	 * changes.
2739 	 */
2740 	if (level > 0) {
2741 		/* It's a non-leaf.  Move keys and pointers. */
2742 		union xfs_btree_key	*lkp;	/* left btree key */
2743 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2744 		union xfs_btree_key	*rkp;	/* right btree key */
2745 		union xfs_btree_ptr	*rpp;	/* right address pointer */
2746 
2747 		lkp = xfs_btree_key_addr(cur, src_index, left);
2748 		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2749 		rkp = xfs_btree_key_addr(cur, 1, right);
2750 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2751 
2752 #ifdef DEBUG
2753 		for (i = src_index; i < rrecs; i++) {
2754 			error = xfs_btree_check_ptr(cur, lpp, i, level);
2755 			if (error)
2756 				goto error0;
2757 		}
2758 #endif
2759 
2760 		/* Copy the keys & pointers to the new block. */
2761 		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2762 		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2763 
2764 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2765 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2766 
2767 		/* Stash the keys of the new block for later insertion. */
2768 		xfs_btree_get_node_keys(cur, right, key);
2769 	} else {
2770 		/* It's a leaf.  Move records.  */
2771 		union xfs_btree_rec	*lrp;	/* left record pointer */
2772 		union xfs_btree_rec	*rrp;	/* right record pointer */
2773 
2774 		lrp = xfs_btree_rec_addr(cur, src_index, left);
2775 		rrp = xfs_btree_rec_addr(cur, 1, right);
2776 
2777 		/* Copy records to the new block. */
2778 		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2779 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2780 
2781 		/* Stash the keys of the new block for later insertion. */
2782 		xfs_btree_get_leaf_keys(cur, right, key);
2783 	}
2784 
2785 	/*
2786 	 * Find the left block number by looking in the buffer.
2787 	 * Adjust sibling pointers.
2788 	 */
2789 	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2790 	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2791 	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2792 	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2793 
2794 	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2795 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2796 
2797 	/*
2798 	 * If there's a block to the new block's right, make that block
2799 	 * point back to right instead of to left.
2800 	 */
2801 	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2802 		error = xfs_btree_read_buf_block(cur, &rrptr,
2803 							0, &rrblock, &rrbp);
2804 		if (error)
2805 			goto error0;
2806 		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2807 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2808 	}
2809 
2810 	/* Update the parent high keys of the left block, if needed. */
2811 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2812 		error = xfs_btree_update_keys(cur, level);
2813 		if (error)
2814 			goto error0;
2815 	}
2816 
2817 	/*
2818 	 * If the cursor is really in the right block, move it there.
2819 	 * If it's just pointing past the last entry in left, then we'll
2820 	 * insert there, so don't change anything in that case.
2821 	 */
2822 	if (cur->bc_ptrs[level] > lrecs + 1) {
2823 		xfs_btree_setbuf(cur, level, rbp);
2824 		cur->bc_ptrs[level] -= lrecs;
2825 	}
2826 	/*
2827 	 * If there are more levels, we'll need another cursor which refers
2828 	 * the right block, no matter where this cursor was.
2829 	 */
2830 	if (level + 1 < cur->bc_nlevels) {
2831 		error = xfs_btree_dup_cursor(cur, curp);
2832 		if (error)
2833 			goto error0;
2834 		(*curp)->bc_ptrs[level + 1]++;
2835 	}
2836 	*ptrp = rptr;
2837 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2838 	*stat = 1;
2839 	return 0;
2840 out0:
2841 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2842 	*stat = 0;
2843 	return 0;
2844 
2845 error0:
2846 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2847 	return error;
2848 }
2849 
/* Arguments marshalled for handing __xfs_btree_split() off to a worker. */
struct xfs_btree_split_args {
	struct xfs_btree_cur	*cur;	/* cursor of the block to split */
	int			level;	/* btree level being split */
	union xfs_btree_ptr	*ptrp;	/* out: ptr to the new right block */
	union xfs_btree_key	*key;	/* out: keys of the new block */
	struct xfs_btree_cur	**curp;	/* out: duplicated cursor, if any */
	int			*stat;		/* success/failure */
	int			result;	/* return value of __xfs_btree_split */
	bool			kswapd;	/* allocation in kswapd context */
	struct completion	*done;	/* completed when the split finishes */
	struct work_struct	work;	/* workqueue item running the split */
};
2862 
2863 /*
2864  * Stack switching interfaces for allocation
2865  */
2866 static void
xfs_btree_split_worker(struct work_struct * work)2867 xfs_btree_split_worker(
2868 	struct work_struct	*work)
2869 {
2870 	struct xfs_btree_split_args	*args = container_of(work,
2871 						struct xfs_btree_split_args, work);
2872 	unsigned long		pflags;
2873 	unsigned long		new_pflags = PF_FSTRANS;
2874 
2875 	/*
2876 	 * we are in a transaction context here, but may also be doing work
2877 	 * in kswapd context, and hence we may need to inherit that state
2878 	 * temporarily to ensure that we don't block waiting for memory reclaim
2879 	 * in any way.
2880 	 */
2881 	if (args->kswapd)
2882 		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2883 
2884 	current_set_flags_nested(&pflags, new_pflags);
2885 
2886 	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2887 					 args->key, args->curp, args->stat);
2888 	complete(args->done);
2889 
2890 	current_restore_flags_nested(&pflags, new_pflags);
2891 }
2892 
2893 /*
2894  * BMBT split requests often come in with little stack to work on. Push
2895  * them off to a worker thread so there is lots of stack to use. For the other
2896  * btree types, just call directly to avoid the context switch overhead here.
2897  */
2898 STATIC int					/* error */
xfs_btree_split(struct xfs_btree_cur * cur,int level,union xfs_btree_ptr * ptrp,union xfs_btree_key * key,struct xfs_btree_cur ** curp,int * stat)2899 xfs_btree_split(
2900 	struct xfs_btree_cur	*cur,
2901 	int			level,
2902 	union xfs_btree_ptr	*ptrp,
2903 	union xfs_btree_key	*key,
2904 	struct xfs_btree_cur	**curp,
2905 	int			*stat)		/* success/failure */
2906 {
2907 	struct xfs_btree_split_args	args;
2908 	DECLARE_COMPLETION_ONSTACK(done);
2909 
2910 	if (cur->bc_btnum != XFS_BTNUM_BMAP)
2911 		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2912 
2913 	args.cur = cur;
2914 	args.level = level;
2915 	args.ptrp = ptrp;
2916 	args.key = key;
2917 	args.curp = curp;
2918 	args.stat = stat;
2919 	args.done = &done;
2920 	args.kswapd = current_is_kswapd();
2921 	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2922 	queue_work(xfs_alloc_wq, &args.work);
2923 	wait_for_completion(&done);
2924 	destroy_work_on_stack(&args.work);
2925 	return args.result;
2926 }
2927 
2928 
2929 /*
2930  * Copy the old inode root contents into a real block and make the
2931  * broot point to it.
2932  */
int						/* error */
xfs_btree_new_iroot(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	struct xfs_buf		*cbp;		/* buffer for cblock */
	struct xfs_btree_block	*block;		/* btree block */
	struct xfs_btree_block	*cblock;	/* child btree block */
	union xfs_btree_key	*ckp;		/* child key pointer */
	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
	union xfs_btree_key	*kp;		/* pointer to btree key */
	union xfs_btree_ptr	*pp;		/* pointer to block addr */
	union xfs_btree_ptr	nptr;		/* new block addr */
	int			level;		/* btree level */
	int			error;		/* error return code */
#ifdef DEBUG
	int			i;		/* loop counter */
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_STATS_INC(cur, newroot);

	/* Only btrees rooted in the inode fork can take this path. */
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);

	level = cur->bc_nlevels - 1;

	block = xfs_btree_get_iroot(cur);
	pp = xfs_btree_ptr_addr(cur, 1, block);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
	if (error)
		goto error0;
	if (*stat == 0) {
		/* Allocation failed without error; caller sees *stat == 0. */
		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		return 0;
	}
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Copy the root into a real block. */
	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
	if (error)
		goto error0;

	/*
	 * we can't just memcpy() the root in for CRC enabled btree blocks.
	 * In that case have to also ensure the blkno remains correct
	 */
	memcpy(cblock, block, xfs_btree_block_len(cur));
	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
		else
			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
	}

	/*
	 * The in-inode root gains a level and now holds a single entry
	 * pointing at the new child block; move the cursor up with it.
	 */
	be16_add_cpu(&block->bb_level, 1);
	xfs_btree_set_numrecs(block, 1);
	cur->bc_nlevels++;
	cur->bc_ptrs[level + 1] = 1;

	/* Copy the old root's keys into the new child block. */
	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));

	/* Copy the old root's ptrs into the child, checking them in DEBUG. */
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		error = xfs_btree_check_ptr(cur, pp, i, level);
		if (error)
			goto error0;
	}
#endif
	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));

#ifdef DEBUG
	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
	if (error)
		goto error0;
#endif
	/* Make the inode root's single pointer reference the new child. */
	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);

	/* Shrink the inode root now that it holds only one entry. */
	xfs_iroot_realloc(cur->bc_private.b.ip,
			  1 - xfs_btree_get_numrecs(cblock),
			  cur->bc_private.b.whichfork);

	xfs_btree_setbuf(cur, level, cbp);

	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));

	/* Tell the caller to log the inode core and the broot. */
	*logflags |=
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
	*stat = 1;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
3039 
3040 /*
3041  * Allocate a new root block, fill it in.
3042  */
STATIC int				/* error */
xfs_btree_new_root(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* one half of the old root block */
	struct xfs_buf		*bp;	/* buffer containing block */
	int			error;	/* error return value */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	struct xfs_buf		*nbp;	/* new (root) buffer */
	struct xfs_btree_block	*new;	/* new (root) btree block */
	int			nptr;	/* new value for key index, 1 or 2 */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rptr;	/* right sibling block ptr */
	union xfs_btree_ptr	lptr;	/* left sibling block ptr */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_STATS_INC(cur, newroot);

	/* initialise our start point from the cursor */
	cur->bc_ops->init_ptr_from_cur(cur, &rptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block. */
	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
	if (error)
		goto error0;

	/* Set the root in the holding structure, increasing the level by 1. */
	cur->bc_ops->set_root(cur, &lptr, 1);

	/*
	 * At the previous root level there are now two blocks: the old root,
	 * and the new block generated when it was split.  We don't know which
	 * one the cursor is pointing at, so we set up variables "left" and
	 * "right" for each case.
	 */
	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
	if (error)
		goto error0;
#endif

	/* Use the right-sibling pointer to work out which side we're on. */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/* Our block is left, pick up the right block. */
		lbp = bp;
		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
		left = block;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;
		bp = rbp;
		nptr = 1;
	} else {
		/* Our block is right, pick up the left block. */
		rbp = bp;
		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
		right = block;
		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;
		bp = lbp;
		nptr = 2;
	}

	/* Fill in the new block's btree header and log it. */
	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
			!xfs_btree_ptr_is_null(cur, &rptr));

	/* Fill in the key data in the new root. */
	if (xfs_btree_get_level(left) > 0) {
		/*
		 * Get the keys for the left block's keys and put them directly
		 * in the parent block.  Do the same for the right block.
		 */
		xfs_btree_get_node_keys(cur, left,
				xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_node_keys(cur, right,
				xfs_btree_key_addr(cur, 2, new));
	} else {
		/*
		 * Get the keys for the left block's records and put them
		 * directly in the parent block.  Do the same for the right
		 * block.
		 */
		xfs_btree_get_leaf_keys(cur, left,
			xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_leaf_keys(cur, right,
			xfs_btree_key_addr(cur, 2, new));
	}
	xfs_btree_log_keys(cur, nbp, 1, 2);

	/* Fill in the pointer data in the new root. */
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
	xfs_btree_log_ptrs(cur, nbp, 1, 2);

	/* Fix up the cursor: point it at the new root, one level up. */
	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
	cur->bc_ptrs[cur->bc_nlevels] = nptr;
	cur->bc_nlevels++;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;
}
3172 
3173 STATIC int
xfs_btree_make_block_unfull(struct xfs_btree_cur * cur,int level,int numrecs,int * oindex,int * index,union xfs_btree_ptr * nptr,struct xfs_btree_cur ** ncur,union xfs_btree_key * key,int * stat)3174 xfs_btree_make_block_unfull(
3175 	struct xfs_btree_cur	*cur,	/* btree cursor */
3176 	int			level,	/* btree level */
3177 	int			numrecs,/* # of recs in block */
3178 	int			*oindex,/* old tree index */
3179 	int			*index,	/* new tree index */
3180 	union xfs_btree_ptr	*nptr,	/* new btree ptr */
3181 	struct xfs_btree_cur	**ncur,	/* new btree cursor */
3182 	union xfs_btree_key	*key,	/* key of new block */
3183 	int			*stat)
3184 {
3185 	int			error = 0;
3186 
3187 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3188 	    level == cur->bc_nlevels - 1) {
3189 	    	struct xfs_inode *ip = cur->bc_private.b.ip;
3190 
3191 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3192 			/* A root block that can be made bigger. */
3193 			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3194 			*stat = 1;
3195 		} else {
3196 			/* A root block that needs replacing */
3197 			int	logflags = 0;
3198 
3199 			error = xfs_btree_new_iroot(cur, &logflags, stat);
3200 			if (error || *stat == 0)
3201 				return error;
3202 
3203 			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3204 		}
3205 
3206 		return 0;
3207 	}
3208 
3209 	/* First, try shifting an entry to the right neighbor. */
3210 	error = xfs_btree_rshift(cur, level, stat);
3211 	if (error || *stat)
3212 		return error;
3213 
3214 	/* Next, try shifting an entry to the left neighbor. */
3215 	error = xfs_btree_lshift(cur, level, stat);
3216 	if (error)
3217 		return error;
3218 
3219 	if (*stat) {
3220 		*oindex = *index = cur->bc_ptrs[level];
3221 		return 0;
3222 	}
3223 
3224 	/*
3225 	 * Next, try splitting the current block in half.
3226 	 *
3227 	 * If this works we have to re-set our variables because we
3228 	 * could be in a different block now.
3229 	 */
3230 	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3231 	if (error || *stat == 0)
3232 		return error;
3233 
3234 
3235 	*index = cur->bc_ptrs[level];
3236 	return 0;
3237 }
3238 
3239 /*
3240  * Insert one record/level.  Return information to the caller
3241  * allowing the next level up to proceed if necessary.
3242  */
STATIC int
xfs_btree_insrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level to insert record at */
	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
	union xfs_btree_rec	*rec,	/* record to insert */
	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer for block */
	union xfs_btree_ptr	nptr;	/* new block ptr */
	struct xfs_btree_cur	*ncur;	/* new btree cursor */
	union xfs_btree_key	nkey;	/* new block key */
	union xfs_btree_key	*lkey;	/* key scratch space, points at nkey */
	int			optr;	/* old key/record index */
	int			ptr;	/* key/record index */
	int			numrecs;/* number of records */
	int			error;	/* error return value */
#ifdef DEBUG
	int			i;	/* DEBUG-only loop counter */
#endif
	xfs_daddr_t		old_bn;	/* daddr of block before unfull */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);

	ncur = NULL;
	lkey = &nkey;

	/*
	 * If we have an external root pointer, and we've made it to the
	 * root level, allocate a new root block and we're done.
	 */
	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level >= cur->bc_nlevels)) {
		error = xfs_btree_new_root(cur, stat);
		xfs_btree_set_ptr_null(cur, ptrp);

		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		return error;
	}

	/* If we're off the left edge, return failure. */
	ptr = cur->bc_ptrs[level];
	if (ptr == 0) {
		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		*stat = 0;
		return 0;
	}

	optr = ptr;

	XFS_BTREE_STATS_INC(cur, insrec);

	/* Get pointers to the btree buffer and block. */
	block = xfs_btree_get_block(cur, level, &bp);
	/* Remember the daddr so we can tell later if we moved to a new block. */
	old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;

	/* Check that the new entry is being inserted in the right place. */
	if (ptr <= numrecs) {
		if (level == 0) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
				xfs_btree_rec_addr(cur, ptr, block)));
		} else {
			ASSERT(cur->bc_ops->keys_inorder(cur, key,
				xfs_btree_key_addr(cur, ptr, block)));
		}
	}
#endif

	/*
	 * If the block is full, we can't insert the new entry until we
	 * make the block un-full.
	 *
	 * NOTE(review): if make_block_unfull sets *stat == 0 without an
	 * error, we jump to error0 and return 0 -- the caller must check
	 * *stat as well as the return value; confirm this is intended.
	 */
	xfs_btree_set_ptr_null(cur, &nptr);
	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
		error = xfs_btree_make_block_unfull(cur, level, numrecs,
					&optr, &ptr, &nptr, &ncur, lkey, stat);
		if (error || *stat == 0)
			goto error0;
	}

	/*
	 * The current block may have changed if the block was
	 * previously full and we have just made space in it.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		return error;
#endif

	/*
	 * At this point we know there's room for our new entry in the block
	 * we're pointing at.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);

	if (level > 0) {
		/* It's a nonleaf. make a hole in the keys and ptrs */
		union xfs_btree_key	*kp;
		union xfs_btree_ptr	*pp;

		kp = xfs_btree_key_addr(cur, ptr, block);
		pp = xfs_btree_ptr_addr(cur, ptr, block);

#ifdef DEBUG
		for (i = numrecs - ptr; i >= 0; i--) {
			error = xfs_btree_check_ptr(cur, pp, i, level);
			if (error)
				return error;
		}
#endif

		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);

#ifdef DEBUG
		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
		if (error)
			goto error0;
#endif

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_keys(cur, kp, key, 1);
		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
		numrecs++;
		xfs_btree_set_numrecs(block, numrecs);
		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
		xfs_btree_log_keys(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
				xfs_btree_key_addr(cur, ptr + 1, block)));
		}
#endif
	} else {
		/* It's a leaf. make a hole in the records */
		union xfs_btree_rec             *rp;

		rp = xfs_btree_rec_addr(cur, ptr, block);

		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_recs(cur, rp, rec, 1);
		xfs_btree_set_numrecs(block, ++numrecs);
		xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
				xfs_btree_rec_addr(cur, ptr + 1, block)));
		}
#endif
	}

	/* Log the new number of records in the btree header. */
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/*
	 * If we just inserted into a new tree block, we have to
	 * recalculate nkey here because nkey is out of date.
	 *
	 * Otherwise we're just updating an existing block (having shoved
	 * some records into the new tree block), so use the regular key
	 * update mechanism.
	 */
	if (bp && bp->b_bn != old_bn) {
		xfs_btree_get_keys(cur, block, lkey);
	} else if (xfs_btree_needs_key_update(cur, optr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If we are tracking the last record in the tree and
	 * we are at the far right edge of the tree, update it.
	 */
	if (xfs_btree_is_lastrec(cur, block, level)) {
		cur->bc_ops->update_lastrec(cur, block, rec,
					    ptr, LASTREC_INSREC);
	}

	/*
	 * Return the new block number, if any.
	 * If there is one, give back a record value and a cursor too.
	 */
	*ptrp = nptr;
	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
		xfs_btree_copy_keys(cur, key, lkey, 1);
		*curp = ncur;
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
3456 
3457 /*
3458  * Insert the record at the point referenced by cur.
3459  *
3460  * A multi-level split of the tree on insert will invalidate the original
3461  * cursor.  All callers of this function should assume that the cursor is
3462  * no longer valid and revalidate it.
3463  */
int
xfs_btree_insert(
	struct xfs_btree_cur	*cur,
	int			*stat)
{
	int			error;	/* error return value */
	int			i;	/* result value, 0 for failure */
	int			level;	/* current level number in btree */
	union xfs_btree_ptr	nptr;	/* new block number (split result) */
	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
	union xfs_btree_key	bkey;	/* key of block to insert */
	union xfs_btree_key	*key;	/* points at bkey */
	union xfs_btree_rec	rec;	/* record to insert */

	level = 0;
	ncur = NULL;
	pcur = cur;
	key = &bkey;

	xfs_btree_set_ptr_null(cur, &nptr);

	/* Make a key out of the record data to be inserted, and save it. */
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);

	/*
	 * Loop going up the tree, starting at the leaf level.
	 * Stop when we don't get a split block, that must mean that
	 * the insert is finished with this level.
	 */
	do {
		/*
		 * Insert nrec/nptr into this level of the tree.
		 * Note if we fail, nptr will be null.
		 */
		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
				&ncur, &i);
		if (error) {
			/* Don't leak any cursor we duplicated on the way up. */
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			goto error0;
		}

		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
		level++;

		/*
		 * See if the cursor we just used is trash.
		 * Can't trash the caller's cursor, but otherwise we should
		 * if ncur is a new cursor or we're about to be done.
		 */
		if (pcur != cur &&
		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
			/* Save the state from the cursor before we trash it */
			if (cur->bc_ops->update_cursor)
				cur->bc_ops->update_cursor(pcur, cur);
			cur->bc_nlevels = pcur->bc_nlevels;
			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
		}
		/* If we got a new cursor, switch to it. */
		if (ncur) {
			pcur = ncur;
			ncur = NULL;
		}
	} while (!xfs_btree_ptr_is_null(cur, &nptr));

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = i;
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
3538 
3539 /*
3540  * Try to merge a non-leaf block back into the inode root.
3541  *
 * Note: the killroot name comes from the fact that we're effectively
3543  * killing the old root block.  But because we can't just delete the
3544  * inode we have to copy the single block it was pointing to into the
3545  * inode.
3546  */
STATIC int
xfs_btree_kill_iroot(
	struct xfs_btree_cur	*cur)
{
	int			whichfork = cur->bc_private.b.whichfork;
	struct xfs_inode	*ip = cur->bc_private.b.ip;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_btree_block	*block;		/* the in-inode root block */
	struct xfs_btree_block	*cblock;	/* the root's only child */
	union xfs_btree_key	*kp;
	union xfs_btree_key	*ckp;
	union xfs_btree_ptr	*pp;
	union xfs_btree_ptr	*cpp;
	struct xfs_buf		*cbp;
	int			level;
	int			index;		/* change in iroot record count */
	int			numrecs;	/* record count of the child */
	int			error;
#ifdef DEBUG
	union xfs_btree_ptr	ptr;
	int			i;
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);

	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_nlevels > 1);

	/*
	 * Don't deal with the root block needs to be a leaf case.
	 * We're just going to turn the thing back into extents anyway.
	 */
	level = cur->bc_nlevels - 1;
	if (level == 1)
		goto out0;

	/*
	 * Give up if the root has multiple children.
	 */
	block = xfs_btree_get_iroot(cur);
	if (xfs_btree_get_numrecs(block) != 1)
		goto out0;

	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
	numrecs = xfs_btree_get_numrecs(cblock);

	/*
	 * Only do this if the next level will fit.
	 * Then the data must be copied up to the inode,
	 * instead of freeing the root you free the next level.
	 */
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	/* A single-child root must have no siblings at its own level. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	/*
	 * Resize the iroot to hold the child's entries.  NOTE(review):
	 * this assumes get_maxrecs() at the root level reports the iroot's
	 * current capacity, so "index" is the required size delta — confirm
	 * against the per-btree get_maxrecs implementation.
	 */
	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
	if (index) {
		xfs_iroot_realloc(cur->bc_private.b.ip, index,
				  cur->bc_private.b.whichfork);
		block = ifp->if_broot;	/* realloc may have moved the broot */
	}

	be16_add_cpu(&block->bb_numrecs, index);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);

	/* Copy the child's keys and ptrs up into the inode root. */
	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, kp, ckp, numrecs);

	pp = xfs_btree_ptr_addr(cur, 1, block);
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
	for (i = 0; i < numrecs; i++) {
		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
		if (error) {
			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
			return error;
		}
	}
#endif
	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);

	/* Free the now-absorbed child block. */
	error = xfs_btree_free_block(cur, cbp);
	if (error) {
		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
		return error;
	}

	/* The tree lost a level; log the inode core and broot. */
	cur->bc_bufs[level - 1] = NULL;
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
	cur->bc_nlevels--;
out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;
}
3652 
3653 /*
 * Kill the current root node, and replace it with its only child node.
3655  */
3656 STATIC int
xfs_btree_kill_root(struct xfs_btree_cur * cur,struct xfs_buf * bp,int level,union xfs_btree_ptr * newroot)3657 xfs_btree_kill_root(
3658 	struct xfs_btree_cur	*cur,
3659 	struct xfs_buf		*bp,
3660 	int			level,
3661 	union xfs_btree_ptr	*newroot)
3662 {
3663 	int			error;
3664 
3665 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3666 	XFS_BTREE_STATS_INC(cur, killroot);
3667 
3668 	/*
3669 	 * Update the root pointer, decreasing the level by 1 and then
3670 	 * free the old root.
3671 	 */
3672 	cur->bc_ops->set_root(cur, newroot, -1);
3673 
3674 	error = xfs_btree_free_block(cur, bp);
3675 	if (error) {
3676 		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3677 		return error;
3678 	}
3679 
3680 	cur->bc_bufs[level] = NULL;
3681 	cur->bc_ra[level] = 0;
3682 	cur->bc_nlevels--;
3683 
3684 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3685 	return 0;
3686 }
3687 
3688 STATIC int
xfs_btree_dec_cursor(struct xfs_btree_cur * cur,int level,int * stat)3689 xfs_btree_dec_cursor(
3690 	struct xfs_btree_cur	*cur,
3691 	int			level,
3692 	int			*stat)
3693 {
3694 	int			error;
3695 	int			i;
3696 
3697 	if (level > 0) {
3698 		error = xfs_btree_decrement(cur, level, &i);
3699 		if (error)
3700 			return error;
3701 	}
3702 
3703 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3704 	*stat = 1;
3705 	return 0;
3706 }
3707 
3708 /*
3709  * Single level of the btree record deletion routine.
3710  * Delete record pointed to by cur/level.
3711  * Remove the record from its block then rebalance the tree.
3712  * Return 0 for error, 1 for done, 2 to go on to the next level.
3713  */
3714 STATIC int					/* error */
xfs_btree_delrec(struct xfs_btree_cur * cur,int level,int * stat)3715 xfs_btree_delrec(
3716 	struct xfs_btree_cur	*cur,		/* btree cursor */
3717 	int			level,		/* level removing record from */
3718 	int			*stat)		/* fail/done/go-on */
3719 {
3720 	struct xfs_btree_block	*block;		/* btree block */
3721 	union xfs_btree_ptr	cptr;		/* current block ptr */
3722 	struct xfs_buf		*bp;		/* buffer for block */
3723 	int			error;		/* error return value */
3724 	int			i;		/* loop counter */
3725 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
3726 	struct xfs_buf		*lbp;		/* left buffer pointer */
3727 	struct xfs_btree_block	*left;		/* left btree block */
3728 	int			lrecs = 0;	/* left record count */
3729 	int			ptr;		/* key/record index */
3730 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
3731 	struct xfs_buf		*rbp;		/* right buffer pointer */
3732 	struct xfs_btree_block	*right;		/* right btree block */
3733 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
3734 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
3735 	int			rrecs = 0;	/* right record count */
3736 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
3737 	int			numrecs;	/* temporary numrec count */
3738 
3739 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3740 	XFS_BTREE_TRACE_ARGI(cur, level);
3741 
3742 	tcur = NULL;
3743 
3744 	/* Get the index of the entry being deleted, check for nothing there. */
3745 	ptr = cur->bc_ptrs[level];
3746 	if (ptr == 0) {
3747 		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3748 		*stat = 0;
3749 		return 0;
3750 	}
3751 
3752 	/* Get the buffer & block containing the record or key/ptr. */
3753 	block = xfs_btree_get_block(cur, level, &bp);
3754 	numrecs = xfs_btree_get_numrecs(block);
3755 
3756 #ifdef DEBUG
3757 	error = xfs_btree_check_block(cur, block, level, bp);
3758 	if (error)
3759 		goto error0;
3760 #endif
3761 
3762 	/* Fail if we're off the end of the block. */
3763 	if (ptr > numrecs) {
3764 		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3765 		*stat = 0;
3766 		return 0;
3767 	}
3768 
3769 	XFS_BTREE_STATS_INC(cur, delrec);
3770 	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3771 
3772 	/* Excise the entries being deleted. */
3773 	if (level > 0) {
3774 		/* It's a nonleaf. operate on keys and ptrs */
3775 		union xfs_btree_key	*lkp;
3776 		union xfs_btree_ptr	*lpp;
3777 
3778 		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3779 		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3780 
3781 #ifdef DEBUG
3782 		for (i = 0; i < numrecs - ptr; i++) {
3783 			error = xfs_btree_check_ptr(cur, lpp, i, level);
3784 			if (error)
3785 				goto error0;
3786 		}
3787 #endif
3788 
3789 		if (ptr < numrecs) {
3790 			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3791 			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3792 			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3793 			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3794 		}
3795 	} else {
3796 		/* It's a leaf. operate on records */
3797 		if (ptr < numrecs) {
3798 			xfs_btree_shift_recs(cur,
3799 				xfs_btree_rec_addr(cur, ptr + 1, block),
3800 				-1, numrecs - ptr);
3801 			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3802 		}
3803 	}
3804 
3805 	/*
3806 	 * Decrement and log the number of entries in the block.
3807 	 */
3808 	xfs_btree_set_numrecs(block, --numrecs);
3809 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3810 
3811 	/*
3812 	 * If we are tracking the last record in the tree and
3813 	 * we are at the far right edge of the tree, update it.
3814 	 */
3815 	if (xfs_btree_is_lastrec(cur, block, level)) {
3816 		cur->bc_ops->update_lastrec(cur, block, NULL,
3817 					    ptr, LASTREC_DELREC);
3818 	}
3819 
3820 	/*
3821 	 * We're at the root level.  First, shrink the root block in-memory.
3822 	 * Try to get rid of the next level down.  If we can't then there's
3823 	 * nothing left to do.
3824 	 */
3825 	if (level == cur->bc_nlevels - 1) {
3826 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3827 			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3828 					  cur->bc_private.b.whichfork);
3829 
3830 			error = xfs_btree_kill_iroot(cur);
3831 			if (error)
3832 				goto error0;
3833 
3834 			error = xfs_btree_dec_cursor(cur, level, stat);
3835 			if (error)
3836 				goto error0;
3837 			*stat = 1;
3838 			return 0;
3839 		}
3840 
3841 		/*
3842 		 * If this is the root level, and there's only one entry left,
3843 		 * and it's NOT the leaf level, then we can get rid of this
3844 		 * level.
3845 		 */
3846 		if (numrecs == 1 && level > 0) {
3847 			union xfs_btree_ptr	*pp;
3848 			/*
3849 			 * pp is still set to the first pointer in the block.
3850 			 * Make it the new root of the btree.
3851 			 */
3852 			pp = xfs_btree_ptr_addr(cur, 1, block);
3853 			error = xfs_btree_kill_root(cur, bp, level, pp);
3854 			if (error)
3855 				goto error0;
3856 		} else if (level > 0) {
3857 			error = xfs_btree_dec_cursor(cur, level, stat);
3858 			if (error)
3859 				goto error0;
3860 		}
3861 		*stat = 1;
3862 		return 0;
3863 	}
3864 
3865 	/*
3866 	 * If we deleted the leftmost entry in the block, update the
3867 	 * key values above us in the tree.
3868 	 */
3869 	if (xfs_btree_needs_key_update(cur, ptr)) {
3870 		error = xfs_btree_update_keys(cur, level);
3871 		if (error)
3872 			goto error0;
3873 	}
3874 
3875 	/*
3876 	 * If the number of records remaining in the block is at least
3877 	 * the minimum, we're done.
3878 	 */
3879 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3880 		error = xfs_btree_dec_cursor(cur, level, stat);
3881 		if (error)
3882 			goto error0;
3883 		return 0;
3884 	}
3885 
3886 	/*
3887 	 * Otherwise, we have to move some records around to keep the
3888 	 * tree balanced.  Look at the left and right sibling blocks to
3889 	 * see if we can re-balance by moving only one record.
3890 	 */
3891 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3892 	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3893 
3894 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3895 		/*
3896 		 * One child of root, need to get a chance to copy its contents
3897 		 * into the root and delete it. Can't go up to next level,
3898 		 * there's nothing to delete there.
3899 		 */
3900 		if (xfs_btree_ptr_is_null(cur, &rptr) &&
3901 		    xfs_btree_ptr_is_null(cur, &lptr) &&
3902 		    level == cur->bc_nlevels - 2) {
3903 			error = xfs_btree_kill_iroot(cur);
3904 			if (!error)
3905 				error = xfs_btree_dec_cursor(cur, level, stat);
3906 			if (error)
3907 				goto error0;
3908 			return 0;
3909 		}
3910 	}
3911 
3912 	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3913 	       !xfs_btree_ptr_is_null(cur, &lptr));
3914 
3915 	/*
3916 	 * Duplicate the cursor so our btree manipulations here won't
3917 	 * disrupt the next level up.
3918 	 */
3919 	error = xfs_btree_dup_cursor(cur, &tcur);
3920 	if (error)
3921 		goto error0;
3922 
3923 	/*
3924 	 * If there's a right sibling, see if it's ok to shift an entry
3925 	 * out of it.
3926 	 */
3927 	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3928 		/*
3929 		 * Move the temp cursor to the last entry in the next block.
3930 		 * Actually any entry but the first would suffice.
3931 		 */
3932 		i = xfs_btree_lastrec(tcur, level);
3933 		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3934 
3935 		error = xfs_btree_increment(tcur, level, &i);
3936 		if (error)
3937 			goto error0;
3938 		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3939 
3940 		i = xfs_btree_lastrec(tcur, level);
3941 		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3942 
3943 		/* Grab a pointer to the block. */
3944 		right = xfs_btree_get_block(tcur, level, &rbp);
3945 #ifdef DEBUG
3946 		error = xfs_btree_check_block(tcur, right, level, rbp);
3947 		if (error)
3948 			goto error0;
3949 #endif
3950 		/* Grab the current block number, for future use. */
3951 		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3952 
3953 		/*
3954 		 * If right block is full enough so that removing one entry
3955 		 * won't make it too empty, and left-shifting an entry out
3956 		 * of right to us works, we're done.
3957 		 */
3958 		if (xfs_btree_get_numrecs(right) - 1 >=
3959 		    cur->bc_ops->get_minrecs(tcur, level)) {
3960 			error = xfs_btree_lshift(tcur, level, &i);
3961 			if (error)
3962 				goto error0;
3963 			if (i) {
3964 				ASSERT(xfs_btree_get_numrecs(block) >=
3965 				       cur->bc_ops->get_minrecs(tcur, level));
3966 
3967 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3968 				tcur = NULL;
3969 
3970 				error = xfs_btree_dec_cursor(cur, level, stat);
3971 				if (error)
3972 					goto error0;
3973 				return 0;
3974 			}
3975 		}
3976 
3977 		/*
3978 		 * Otherwise, grab the number of records in right for
3979 		 * future reference, and fix up the temp cursor to point
3980 		 * to our block again (last record).
3981 		 */
3982 		rrecs = xfs_btree_get_numrecs(right);
3983 		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3984 			i = xfs_btree_firstrec(tcur, level);
3985 			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3986 
3987 			error = xfs_btree_decrement(tcur, level, &i);
3988 			if (error)
3989 				goto error0;
3990 			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3991 		}
3992 	}
3993 
3994 	/*
3995 	 * If there's a left sibling, see if it's ok to shift an entry
3996 	 * out of it.
3997 	 */
3998 	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3999 		/*
4000 		 * Move the temp cursor to the first entry in the
4001 		 * previous block.
4002 		 */
4003 		i = xfs_btree_firstrec(tcur, level);
4004 		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4005 
4006 		error = xfs_btree_decrement(tcur, level, &i);
4007 		if (error)
4008 			goto error0;
4009 		i = xfs_btree_firstrec(tcur, level);
4010 		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4011 
4012 		/* Grab a pointer to the block. */
4013 		left = xfs_btree_get_block(tcur, level, &lbp);
4014 #ifdef DEBUG
4015 		error = xfs_btree_check_block(cur, left, level, lbp);
4016 		if (error)
4017 			goto error0;
4018 #endif
4019 		/* Grab the current block number, for future use. */
4020 		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
4021 
4022 		/*
4023 		 * If left block is full enough so that removing one entry
4024 		 * won't make it too empty, and right-shifting an entry out
4025 		 * of left to us works, we're done.
4026 		 */
4027 		if (xfs_btree_get_numrecs(left) - 1 >=
4028 		    cur->bc_ops->get_minrecs(tcur, level)) {
4029 			error = xfs_btree_rshift(tcur, level, &i);
4030 			if (error)
4031 				goto error0;
4032 			if (i) {
4033 				ASSERT(xfs_btree_get_numrecs(block) >=
4034 				       cur->bc_ops->get_minrecs(tcur, level));
4035 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4036 				tcur = NULL;
4037 				if (level == 0)
4038 					cur->bc_ptrs[0]++;
4039 				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4040 				*stat = 1;
4041 				return 0;
4042 			}
4043 		}
4044 
4045 		/*
4046 		 * Otherwise, grab the number of records in right for
4047 		 * future reference.
4048 		 */
4049 		lrecs = xfs_btree_get_numrecs(left);
4050 	}
4051 
4052 	/* Delete the temp cursor, we're done with it. */
4053 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4054 	tcur = NULL;
4055 
4056 	/* If here, we need to do a join to keep the tree balanced. */
4057 	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4058 
4059 	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4060 	    lrecs + xfs_btree_get_numrecs(block) <=
4061 			cur->bc_ops->get_maxrecs(cur, level)) {
4062 		/*
4063 		 * Set "right" to be the starting block,
4064 		 * "left" to be the left neighbor.
4065 		 */
4066 		rptr = cptr;
4067 		right = block;
4068 		rbp = bp;
4069 		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4070 		if (error)
4071 			goto error0;
4072 
4073 	/*
4074 	 * If that won't work, see if we can join with the right neighbor block.
4075 	 */
4076 	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4077 		   rrecs + xfs_btree_get_numrecs(block) <=
4078 			cur->bc_ops->get_maxrecs(cur, level)) {
4079 		/*
4080 		 * Set "left" to be the starting block,
4081 		 * "right" to be the right neighbor.
4082 		 */
4083 		lptr = cptr;
4084 		left = block;
4085 		lbp = bp;
4086 		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4087 		if (error)
4088 			goto error0;
4089 
4090 	/*
4091 	 * Otherwise, we can't fix the imbalance.
4092 	 * Just return.  This is probably a logic error, but it's not fatal.
4093 	 */
4094 	} else {
4095 		error = xfs_btree_dec_cursor(cur, level, stat);
4096 		if (error)
4097 			goto error0;
4098 		return 0;
4099 	}
4100 
4101 	rrecs = xfs_btree_get_numrecs(right);
4102 	lrecs = xfs_btree_get_numrecs(left);
4103 
4104 	/*
4105 	 * We're now going to join "left" and "right" by moving all the stuff
4106 	 * in "right" to "left" and deleting "right".
4107 	 */
4108 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4109 	if (level > 0) {
4110 		/* It's a non-leaf.  Move keys and pointers. */
4111 		union xfs_btree_key	*lkp;	/* left btree key */
4112 		union xfs_btree_ptr	*lpp;	/* left address pointer */
4113 		union xfs_btree_key	*rkp;	/* right btree key */
4114 		union xfs_btree_ptr	*rpp;	/* right address pointer */
4115 
4116 		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4117 		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4118 		rkp = xfs_btree_key_addr(cur, 1, right);
4119 		rpp = xfs_btree_ptr_addr(cur, 1, right);
4120 #ifdef DEBUG
4121 		for (i = 1; i < rrecs; i++) {
4122 			error = xfs_btree_check_ptr(cur, rpp, i, level);
4123 			if (error)
4124 				goto error0;
4125 		}
4126 #endif
4127 		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4128 		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4129 
4130 		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4131 		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4132 	} else {
4133 		/* It's a leaf.  Move records.  */
4134 		union xfs_btree_rec	*lrp;	/* left record pointer */
4135 		union xfs_btree_rec	*rrp;	/* right record pointer */
4136 
4137 		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4138 		rrp = xfs_btree_rec_addr(cur, 1, right);
4139 
4140 		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4141 		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4142 	}
4143 
4144 	XFS_BTREE_STATS_INC(cur, join);
4145 
4146 	/*
4147 	 * Fix up the number of records and right block pointer in the
4148 	 * surviving block, and log it.
4149 	 */
4150 	xfs_btree_set_numrecs(left, lrecs + rrecs);
4151 	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4152 	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4153 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4154 
4155 	/* If there is a right sibling, point it to the remaining block. */
4156 	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4157 	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4158 		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4159 		if (error)
4160 			goto error0;
4161 		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4162 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4163 	}
4164 
4165 	/* Free the deleted block. */
4166 	error = xfs_btree_free_block(cur, rbp);
4167 	if (error)
4168 		goto error0;
4169 
4170 	/*
4171 	 * If we joined with the left neighbor, set the buffer in the
4172 	 * cursor to the left block, and fix up the index.
4173 	 */
4174 	if (bp != lbp) {
4175 		cur->bc_bufs[level] = lbp;
4176 		cur->bc_ptrs[level] += lrecs;
4177 		cur->bc_ra[level] = 0;
4178 	}
4179 	/*
4180 	 * If we joined with the right neighbor and there's a level above
4181 	 * us, increment the cursor at that level.
4182 	 */
4183 	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4184 		   (level + 1 < cur->bc_nlevels)) {
4185 		error = xfs_btree_increment(cur, level + 1, &i);
4186 		if (error)
4187 			goto error0;
4188 	}
4189 
4190 	/*
4191 	 * Readjust the ptr at this level if it's not a leaf, since it's
4192 	 * still pointing at the deletion point, which makes the cursor
4193 	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4194 	 * We can't use decrement because it would change the next level up.
4195 	 */
4196 	if (level > 0)
4197 		cur->bc_ptrs[level]--;
4198 
4199 	/*
4200 	 * We combined blocks, so we have to update the parent keys if the
4201 	 * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4202 	 * points to the old block so that the caller knows which record to
4203 	 * delete.  Therefore, the caller must be savvy enough to call updkeys
4204 	 * for us if we return stat == 2.  The other exit points from this
4205 	 * function don't require deletions further up the tree, so they can
4206 	 * call updkeys directly.
4207 	 */
4208 
4209 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4210 	/* Return value means the next level up has something to do. */
4211 	*stat = 2;
4212 	return 0;
4213 
4214 error0:
4215 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4216 	if (tcur)
4217 		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4218 	return error;
4219 }
4220 
4221 /*
4222  * Delete the record pointed to by cur.
4223  * The cursor refers to the place where the record was (could be inserted)
4224  * when the operation returns.
4225  */
4226 int					/* error */
xfs_btree_delete(struct xfs_btree_cur * cur,int * stat)4227 xfs_btree_delete(
4228 	struct xfs_btree_cur	*cur,
4229 	int			*stat)	/* success/failure */
4230 {
4231 	int			error;	/* error return value */
4232 	int			level;
4233 	int			i;
4234 	bool			joined = false;
4235 
4236 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
4237 
4238 	/*
4239 	 * Go up the tree, starting at leaf level.
4240 	 *
4241 	 * If 2 is returned then a join was done; go to the next level.
4242 	 * Otherwise we are done.
4243 	 */
4244 	for (level = 0, i = 2; i == 2; level++) {
4245 		error = xfs_btree_delrec(cur, level, &i);
4246 		if (error)
4247 			goto error0;
4248 		if (i == 2)
4249 			joined = true;
4250 	}
4251 
4252 	/*
4253 	 * If we combined blocks as part of deleting the record, delrec won't
4254 	 * have updated the parent high keys so we have to do that here.
4255 	 */
4256 	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4257 		error = xfs_btree_updkeys_force(cur, 0);
4258 		if (error)
4259 			goto error0;
4260 	}
4261 
4262 	if (i == 0) {
4263 		for (level = 1; level < cur->bc_nlevels; level++) {
4264 			if (cur->bc_ptrs[level] == 0) {
4265 				error = xfs_btree_decrement(cur, level, &i);
4266 				if (error)
4267 					goto error0;
4268 				break;
4269 			}
4270 		}
4271 	}
4272 
4273 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4274 	*stat = i;
4275 	return 0;
4276 error0:
4277 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4278 	return error;
4279 }
4280 
4281 /*
4282  * Get the data from the pointed-to record.
4283  */
4284 int					/* error */
xfs_btree_get_rec(struct xfs_btree_cur * cur,union xfs_btree_rec ** recp,int * stat)4285 xfs_btree_get_rec(
4286 	struct xfs_btree_cur	*cur,	/* btree cursor */
4287 	union xfs_btree_rec	**recp,	/* output: btree record */
4288 	int			*stat)	/* output: success/failure */
4289 {
4290 	struct xfs_btree_block	*block;	/* btree block */
4291 	struct xfs_buf		*bp;	/* buffer pointer */
4292 	int			ptr;	/* record number */
4293 #ifdef DEBUG
4294 	int			error;	/* error return value */
4295 #endif
4296 
4297 	ptr = cur->bc_ptrs[0];
4298 	block = xfs_btree_get_block(cur, 0, &bp);
4299 
4300 #ifdef DEBUG
4301 	error = xfs_btree_check_block(cur, block, 0, bp);
4302 	if (error)
4303 		return error;
4304 #endif
4305 
4306 	/*
4307 	 * Off the right end or left end, return failure.
4308 	 */
4309 	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4310 		*stat = 0;
4311 		return 0;
4312 	}
4313 
4314 	/*
4315 	 * Point to the record and extract its data.
4316 	 */
4317 	*recp = xfs_btree_rec_addr(cur, ptr, block);
4318 	*stat = 1;
4319 	return 0;
4320 }
4321 
4322 /* Visit a block in a btree. */
4323 STATIC int
xfs_btree_visit_block(struct xfs_btree_cur * cur,int level,xfs_btree_visit_blocks_fn fn,void * data)4324 xfs_btree_visit_block(
4325 	struct xfs_btree_cur		*cur,
4326 	int				level,
4327 	xfs_btree_visit_blocks_fn	fn,
4328 	void				*data)
4329 {
4330 	struct xfs_btree_block		*block;
4331 	struct xfs_buf			*bp;
4332 	union xfs_btree_ptr		rptr;
4333 	int				error;
4334 
4335 	/* do right sibling readahead */
4336 	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4337 	block = xfs_btree_get_block(cur, level, &bp);
4338 
4339 	/* process the block */
4340 	error = fn(cur, level, data);
4341 	if (error)
4342 		return error;
4343 
4344 	/* now read rh sibling block for next iteration */
4345 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4346 	if (xfs_btree_ptr_is_null(cur, &rptr))
4347 		return -ENOENT;
4348 
4349 	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4350 }
4351 
4352 
4353 /* Visit every block in a btree. */
4354 int
xfs_btree_visit_blocks(struct xfs_btree_cur * cur,xfs_btree_visit_blocks_fn fn,void * data)4355 xfs_btree_visit_blocks(
4356 	struct xfs_btree_cur		*cur,
4357 	xfs_btree_visit_blocks_fn	fn,
4358 	void				*data)
4359 {
4360 	union xfs_btree_ptr		lptr;
4361 	int				level;
4362 	struct xfs_btree_block		*block = NULL;
4363 	int				error = 0;
4364 
4365 	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4366 
4367 	/* for each level */
4368 	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4369 		/* grab the left hand block */
4370 		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4371 		if (error)
4372 			return error;
4373 
4374 		/* readahead the left most block for the next level down */
4375 		if (level > 0) {
4376 			union xfs_btree_ptr     *ptr;
4377 
4378 			ptr = xfs_btree_ptr_addr(cur, 1, block);
4379 			xfs_btree_readahead_ptr(cur, ptr, 1);
4380 
4381 			/* save for the next iteration of the loop */
4382 			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4383 		}
4384 
4385 		/* for each buffer in the level */
4386 		do {
4387 			error = xfs_btree_visit_block(cur, level, fn, data);
4388 		} while (!error);
4389 
4390 		if (error != -ENOENT)
4391 			return error;
4392 	}
4393 
4394 	return 0;
4395 }
4396 
4397 /*
4398  * Change the owner of a btree.
4399  *
 * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in a
 * full size btree as that may be more space than we can fit in the log....
4404  *
4405  * We do the btree walk in the most optimal manner possible - we have sibling
4406  * pointers so we can just walk all the blocks on each level from left to right
4407  * in a single pass, and then move to the next level and do the same. We can
4408  * also do readahead on the sibling pointers to get IO moving more quickly,
4409  * though for slow disks this is unlikely to make much difference to performance
4410  * as the amount of CPU work we have to do before moving to the next block is
4411  * relatively small.
4412  *
4413  * For each btree block that we load, modify the owner appropriately, set the
4414  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4415  * we mark the region we change dirty so that if the buffer is relogged in
4416  * a subsequent transaction the changes we make here as an ordered buffer are
4417  * correctly relogged in that transaction.  If we are in recovery context, then
4418  * just queue the modified buffer as delayed write buffer so the transaction
4419  * recovery completion writes the changes to disk.
4420  */
/* Context passed to xfs_btree_block_change_owner() via the block-visit walk. */
struct xfs_btree_block_change_owner_info {
	__uint64_t		new_owner;	/* owner value to stamp into each block */
	struct list_head	*buffer_list;	/* delwri queue used when there is no
						 * transaction (recovery context) */
};
4425 
4426 static int
xfs_btree_block_change_owner(struct xfs_btree_cur * cur,int level,void * data)4427 xfs_btree_block_change_owner(
4428 	struct xfs_btree_cur	*cur,
4429 	int			level,
4430 	void			*data)
4431 {
4432 	struct xfs_btree_block_change_owner_info	*bbcoi = data;
4433 	struct xfs_btree_block	*block;
4434 	struct xfs_buf		*bp;
4435 
4436 	/* modify the owner */
4437 	block = xfs_btree_get_block(cur, level, &bp);
4438 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4439 		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4440 			return 0;
4441 		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4442 	} else {
4443 		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4444 			return 0;
4445 		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4446 	}
4447 
4448 	/*
4449 	 * If the block is a root block hosted in an inode, we might not have a
4450 	 * buffer pointer here and we shouldn't attempt to log the change as the
4451 	 * information is already held in the inode and discarded when the root
4452 	 * block is formatted into the on-disk inode fork. We still change it,
4453 	 * though, so everything is consistent in memory.
4454 	 */
4455 	if (!bp) {
4456 		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4457 		ASSERT(level == cur->bc_nlevels - 1);
4458 		return 0;
4459 	}
4460 
4461 	if (cur->bc_tp) {
4462 		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4463 			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4464 			return -EAGAIN;
4465 		}
4466 	} else {
4467 		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4468 	}
4469 
4470 	return 0;
4471 }
4472 
4473 int
xfs_btree_change_owner(struct xfs_btree_cur * cur,__uint64_t new_owner,struct list_head * buffer_list)4474 xfs_btree_change_owner(
4475 	struct xfs_btree_cur	*cur,
4476 	__uint64_t		new_owner,
4477 	struct list_head	*buffer_list)
4478 {
4479 	struct xfs_btree_block_change_owner_info	bbcoi;
4480 
4481 	bbcoi.new_owner = new_owner;
4482 	bbcoi.buffer_list = buffer_list;
4483 
4484 	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4485 			&bbcoi);
4486 }
4487 
/**
 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
 *				      btree block
 *
 * @bp: buffer containing the btree block
 */
bool
xfs_btree_sblock_v5hdr_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;

	/* v5 header fields only exist on CRC-enabled filesystems. */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return false;
	/* The block must carry this filesystem's metadata UUID... */
	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return false;
	/* ...and its own disk address... */
	if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
		return false;
	/* ...and, when a perag is attached to the buffer, the right AG owner. */
	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
		return false;
	return true;
}
4514 
4515 /**
4516  * xfs_btree_sblock_verify() -- verify a short-format btree block
4517  *
4518  * @bp: buffer containing the btree block
4519  * @max_recs: maximum records allowed in this btree node
4520  */
4521 bool
xfs_btree_sblock_verify(struct xfs_buf * bp,unsigned int max_recs)4522 xfs_btree_sblock_verify(
4523 	struct xfs_buf		*bp,
4524 	unsigned int		max_recs)
4525 {
4526 	struct xfs_mount	*mp = bp->b_target->bt_mount;
4527 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4528 
4529 	/* numrecs verification */
4530 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4531 		return false;
4532 
4533 	/* sibling pointer verification */
4534 	if (!block->bb_u.s.bb_leftsib ||
4535 	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
4536 	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
4537 		return false;
4538 	if (!block->bb_u.s.bb_rightsib ||
4539 	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
4540 	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
4541 		return false;
4542 
4543 	return true;
4544 }
4545 
4546 /*
4547  * Calculate the number of btree levels needed to store a given number of
4548  * records in a short-format btree.
4549  */
4550 uint
xfs_btree_compute_maxlevels(struct xfs_mount * mp,uint * limits,unsigned long len)4551 xfs_btree_compute_maxlevels(
4552 	struct xfs_mount	*mp,
4553 	uint			*limits,
4554 	unsigned long		len)
4555 {
4556 	uint			level;
4557 	unsigned long		maxblocks;
4558 
4559 	maxblocks = (len + limits[0] - 1) / limits[0];
4560 	for (level = 1; maxblocks > 1; level++)
4561 		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4562 	return level;
4563 }
4564 
4565 /*
4566  * Query a regular btree for all records overlapping a given interval.
4567  * Start with a LE lookup of the key of low_rec and return all records
4568  * until we find a record with a key greater than the key of high_rec.
4569  */
STATIC int
xfs_btree_simple_query_range(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*low_key,
	union xfs_btree_key		*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_rec		*recp;	/* current record */
	union xfs_btree_key		rec_key;	/* key scratch space */
	__int64_t			diff;	/* key comparison result */
	int				stat;	/* btree op success flag */
	bool				firstrec = true;	/* first loop pass? */
	int				error;

	ASSERT(cur->bc_ops->init_high_key_from_rec);
	ASSERT(cur->bc_ops->diff_two_keys);

	/*
	 * Find the leftmost record.  The btree cursor must be set
	 * to the low record used to generate low_key.
	 */
	stat = 0;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		goto out;

	/* Nothing?  See if there's anything to the right. */
	if (!stat) {
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			goto out;
	}

	while (stat) {
		/* Find the record. */
		error = xfs_btree_get_rec(cur, &recp, &stat);
		if (error || !stat)
			break;

		/*
		 * Skip if high_key(rec) < low_key.  Only the first record
		 * needs this test: the LE lookup above positioned us on the
		 * last record with key <= low_key, so later records start
		 * past low_key and cannot end before it (simple btrees
		 * hold non-overlapping, sorted records).
		 */
		if (firstrec) {
			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
			firstrec = false;
			diff = cur->bc_ops->diff_two_keys(cur, low_key,
					&rec_key);
			if (diff > 0)
				goto advloop;
		}

		/* Stop if high_key < low_key(rec). */
		cur->bc_ops->init_key_from_rec(&rec_key, recp);
		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
		if (diff > 0)
			break;

		/* Callback; a negative error or an explicit abort ends the walk. */
		error = fn(cur, recp, priv);
		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
			break;

advloop:
		/* Move on to the next record. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}

out:
	return error;
}
4641 
4642 /*
4643  * Query an overlapped interval btree for all records overlapping a given
4644  * interval.  This function roughly follows the algorithm given in
4645  * "Interval Trees" of _Introduction to Algorithms_, which is section
4646  * 14.3 in the 2nd and 3rd editions.
4647  *
4648  * First, generate keys for the low and high records passed in.
4649  *
4650  * For any leaf node, generate the high and low keys for the record.
4651  * If the record keys overlap with the query low/high keys, pass the
4652  * record to the function iterator.
4653  *
4654  * For any internal node, compare the low and high keys of each
4655  * pointer against the query low/high keys.  If there's an overlap,
4656  * follow the pointer.
4657  *
4658  * As an optimization, we stop scanning a block when we find a low key
4659  * that is greater than the query's high key.
4660  */
STATIC int
xfs_btree_overlapped_query_range(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*low_key,
	union xfs_btree_key		*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_ptr		ptr;	/* root pointer */
	union xfs_btree_ptr		*pp;	/* child pointer in a node */
	union xfs_btree_key		rec_key;	/* low key scratch */
	union xfs_btree_key		rec_hkey;	/* high key scratch */
	union xfs_btree_key		*lkp;	/* node entry's low key */
	union xfs_btree_key		*hkp;	/* node entry's high key */
	union xfs_btree_rec		*recp;	/* leaf record */
	struct xfs_btree_block		*block;
	__int64_t			ldiff;	/* high key vs query low key */
	__int64_t			hdiff;	/* query high key vs low key */
	int				level;
	struct xfs_buf			*bp;
	int				i;
	int				error;

	/* Load the root of the btree. */
	level = cur->bc_nlevels - 1;
	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
	if (error)
		return error;
	xfs_btree_get_block(cur, level, &bp);
	trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto out;
#endif
	cur->bc_ptrs[level] = 1;

	/*
	 * Iterative depth-first walk; cur->bc_ptrs[level] holds the 1-based
	 * position within each level's current block.
	 */
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		/* End of node, pop back towards the root. */
		if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
pop_up:
			if (level < cur->bc_nlevels - 1)
				cur->bc_ptrs[level + 1]++;
			level++;
			continue;
		}

		if (level == 0) {
			/* Handle a leaf node. */
			recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);

			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
					low_key);

			cur->bc_ops->init_key_from_rec(&rec_key, recp);
			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
					&rec_key);

			/*
			 * If (record's high key >= query's low key) and
			 *    (query's high key >= record's low key), then
			 * this record overlaps the query range; callback.
			 */
			if (ldiff >= 0 && hdiff >= 0) {
				error = fn(cur, recp, priv);
				if (error < 0 ||
				    error == XFS_BTREE_QUERY_RANGE_ABORT)
					break;
			} else if (hdiff < 0) {
				/* Record is larger than high key; pop. */
				goto pop_up;
			}
			cur->bc_ptrs[level]++;
			continue;
		}

		/* Handle an internal node. */
		lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
		hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
		pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);

		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);

		/*
		 * If (pointer's high key >= query's low key) and
		 *    (query's high key >= pointer's low key), then
		 * this record overlaps the query range; follow pointer.
		 */
		if (ldiff >= 0 && hdiff >= 0) {
			/* Descend into the matching child block. */
			level--;
			error = xfs_btree_lookup_get_block(cur, level, pp,
					&block);
			if (error)
				goto out;
			xfs_btree_get_block(cur, level, &bp);
			trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
			error = xfs_btree_check_block(cur, block, level, bp);
			if (error)
				goto out;
#endif
			cur->bc_ptrs[level] = 1;
			continue;
		} else if (hdiff < 0) {
			/* The low key is larger than the upper range; pop. */
			goto pop_up;
		}
		cur->bc_ptrs[level]++;
	}

out:
	/*
	 * If we don't end this function with the cursor pointing at a record
	 * block, a subsequent non-error cursor deletion will not release
	 * node-level buffers, causing a buffer leak.  This is quite possible
	 * with a zero-results range query, so release the buffers if we
	 * failed to return any results.
	 */
	if (cur->bc_bufs[0] == NULL) {
		for (i = 0; i < cur->bc_nlevels; i++) {
			if (cur->bc_bufs[i]) {
				xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
				cur->bc_bufs[i] = NULL;
				cur->bc_ptrs[i] = 0;
				cur->bc_ra[i] = 0;
			}
		}
	}

	return error;
}
4797 
4798 /*
4799  * Query a btree for all records overlapping a given interval of keys.  The
4800  * supplied function will be called with each record found; return one of the
4801  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4802  * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4803  * negative error code.
4804  */
4805 int
xfs_btree_query_range(struct xfs_btree_cur * cur,union xfs_btree_irec * low_rec,union xfs_btree_irec * high_rec,xfs_btree_query_range_fn fn,void * priv)4806 xfs_btree_query_range(
4807 	struct xfs_btree_cur		*cur,
4808 	union xfs_btree_irec		*low_rec,
4809 	union xfs_btree_irec		*high_rec,
4810 	xfs_btree_query_range_fn	fn,
4811 	void				*priv)
4812 {
4813 	union xfs_btree_rec		rec;
4814 	union xfs_btree_key		low_key;
4815 	union xfs_btree_key		high_key;
4816 
4817 	/* Find the keys of both ends of the interval. */
4818 	cur->bc_rec = *high_rec;
4819 	cur->bc_ops->init_rec_from_cur(cur, &rec);
4820 	cur->bc_ops->init_key_from_rec(&high_key, &rec);
4821 
4822 	cur->bc_rec = *low_rec;
4823 	cur->bc_ops->init_rec_from_cur(cur, &rec);
4824 	cur->bc_ops->init_key_from_rec(&low_key, &rec);
4825 
4826 	/* Enforce low key < high key. */
4827 	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4828 		return -EINVAL;
4829 
4830 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4831 		return xfs_btree_simple_query_range(cur, &low_key,
4832 				&high_key, fn, priv);
4833 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4834 			fn, priv);
4835 }
4836 
4837 /*
4838  * Calculate the number of blocks needed to store a given number of records
4839  * in a short-format (per-AG metadata) btree.
4840  */
4841 xfs_extlen_t
xfs_btree_calc_size(struct xfs_mount * mp,uint * limits,unsigned long long len)4842 xfs_btree_calc_size(
4843 	struct xfs_mount	*mp,
4844 	uint			*limits,
4845 	unsigned long long	len)
4846 {
4847 	int			level;
4848 	int			maxrecs;
4849 	xfs_extlen_t		rval;
4850 
4851 	maxrecs = limits[0];
4852 	for (level = 0, rval = 0; len > 1; level++) {
4853 		len += maxrecs - 1;
4854 		do_div(len, maxrecs);
4855 		maxrecs = limits[1];
4856 		rval += len;
4857 	}
4858 	return rval;
4859 }
4860 
4861 static int
xfs_btree_count_blocks_helper(struct xfs_btree_cur * cur,int level,void * data)4862 xfs_btree_count_blocks_helper(
4863 	struct xfs_btree_cur	*cur,
4864 	int			level,
4865 	void			*data)
4866 {
4867 	xfs_extlen_t		*blocks = data;
4868 	(*blocks)++;
4869 
4870 	return 0;
4871 }
4872 
4873 /* Count the blocks in a btree and return the result in *blocks. */
4874 int
xfs_btree_count_blocks(struct xfs_btree_cur * cur,xfs_extlen_t * blocks)4875 xfs_btree_count_blocks(
4876 	struct xfs_btree_cur	*cur,
4877 	xfs_extlen_t		*blocks)
4878 {
4879 	*blocks = 0;
4880 	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4881 			blocks);
4882 }
4883