• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 /*
40  * These routines are used by the resource group routines (rgrp.c)
41  * to keep track of block allocation.  Each block is represented by two
42  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
43  *
44  * 0 = Free
45  * 1 = Used (not metadata)
46  * 2 = Unlinked (still in use) inode
47  * 3 = Used (metadata)
48  */
49 
50 struct gfs2_extent {
51 	struct gfs2_rbm rbm;
52 	u32 len;
53 };
54 
55 static const char valid_change[16] = {
56 	        /* current */
57 	/* n */ 0, 1, 1, 1,
58 	/* e */ 1, 0, 0, 0,
59 	/* w */ 0, 0, 0, 1,
60 	        1, 0, 0, 0
61 };
62 
63 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
64 			 const struct gfs2_inode *ip, bool nowrap);
65 
66 
67 /**
68  * gfs2_setbit - Set a bit in the bitmaps
69  * @rbm: The position of the bit to set
70  * @do_clone: Also set the clone bitmap, if it exists
71  * @new_state: the new state of the block
72  *
73  */
74 
gfs2_setbit(const struct gfs2_rbm * rbm,bool do_clone,unsigned char new_state)75 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
76 			       unsigned char new_state)
77 {
78 	unsigned char *byte1, *byte2, *end, cur_state;
79 	struct gfs2_bitmap *bi = rbm_bi(rbm);
80 	unsigned int buflen = bi->bi_bytes;
81 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
82 
83 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
84 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
85 
86 	BUG_ON(byte1 >= end);
87 
88 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
89 
90 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
91 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
92 
93 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
94 			rbm->offset, cur_state, new_state);
95 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
96 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
97 			(unsigned long long)bi->bi_bh->b_blocknr);
98 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
99 			bi->bi_offset, bi->bi_bytes,
100 			(unsigned long long)gfs2_rbm_to_block(rbm));
101 		dump_stack();
102 		gfs2_consist_rgrpd(rbm->rgd);
103 		return;
104 	}
105 	*byte1 ^= (cur_state ^ new_state) << bit;
106 
107 	if (do_clone && bi->bi_clone) {
108 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
109 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
110 		*byte2 ^= (cur_state ^ new_state) << bit;
111 	}
112 }
113 
114 /**
115  * gfs2_testbit - test a bit in the bitmaps
116  * @rbm: The bit to test
117  * @use_clone: If true, test the clone bitmap, not the official bitmap.
118  *
119  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
120  * not the "real" bitmaps, to avoid allocating recently freed blocks.
121  *
122  * Returns: The two bit block state of the requested bit
123  */
124 
gfs2_testbit(const struct gfs2_rbm * rbm,bool use_clone)125 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
126 {
127 	struct gfs2_bitmap *bi = rbm_bi(rbm);
128 	const u8 *buffer;
129 	const u8 *byte;
130 	unsigned int bit;
131 
132 	if (use_clone && bi->bi_clone)
133 		buffer = bi->bi_clone;
134 	else
135 		buffer = bi->bi_bh->b_data;
136 	buffer += bi->bi_offset;
137 	byte = buffer + (rbm->offset / GFS2_NBBY);
138 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
139 
140 	return (*byte >> bit) & GFS2_BIT_MASK;
141 }
142 
143 /**
144  * gfs2_bit_search
145  * @ptr: Pointer to bitmap data
146  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
147  * @state: The state we are searching for
148  *
149  * We xor the bitmap data with a patter which is the bitwise opposite
150  * of what we are looking for, this gives rise to a pattern of ones
151  * wherever there is a match. Since we have two bits per entry, we
152  * take this pattern, shift it down by one place and then and it with
153  * the original. All the even bit positions (0,2,4, etc) then represent
154  * successful matches, so we mask with 0x55555..... to remove the unwanted
155  * odd bit positions.
156  *
157  * This allows searching of a whole u64 at once (32 blocks) with a
158  * single test (on 64 bit arches).
159  */
160 
gfs2_bit_search(const __le64 * ptr,u64 mask,u8 state)161 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
162 {
163 	u64 tmp;
164 	static const u64 search[] = {
165 		[0] = 0xffffffffffffffffULL,
166 		[1] = 0xaaaaaaaaaaaaaaaaULL,
167 		[2] = 0x5555555555555555ULL,
168 		[3] = 0x0000000000000000ULL,
169 	};
170 	tmp = le64_to_cpu(*ptr) ^ search[state];
171 	tmp &= (tmp >> 1);
172 	tmp &= mask;
173 	return tmp;
174 }
175 
176 /**
177  * rs_cmp - multi-block reservation range compare
178  * @blk: absolute file system block number of the new reservation
179  * @len: number of blocks in the new reservation
180  * @rs: existing reservation to compare against
181  *
182  * returns: 1 if the block range is beyond the reach of the reservation
183  *         -1 if the block range is before the start of the reservation
184  *          0 if the block range overlaps with the reservation
185  */
rs_cmp(u64 blk,u32 len,struct gfs2_blkreserv * rs)186 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
187 {
188 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
189 
190 	if (blk >= startblk + rs->rs_free)
191 		return 1;
192 	if (blk + len - 1 < startblk)
193 		return -1;
194 	return 0;
195 }
196 
197 /**
198  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
199  *       a block in a given allocation state.
200  * @buf: the buffer that holds the bitmaps
201  * @len: the length (in bytes) of the buffer
202  * @goal: start search at this block's bit-pair (within @buffer)
203  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
204  *
205  * Scope of @goal and returned block number is only within this bitmap buffer,
206  * not entire rgrp or filesystem.  @buffer will be offset from the actual
207  * beginning of a bitmap block buffer, skipping any header structures, but
208  * headers are always a multiple of 64 bits long so that the buffer is
209  * always aligned to a 64 bit boundary.
210  *
211  * The size of the buffer is in bytes, but is it assumed that it is
212  * always ok to read a complete multiple of 64 bits at the end
213  * of the block in case the end is no aligned to a natural boundary.
214  *
215  * Return: the block number (bitmap buffer scope) that was found
216  */
217 
gfs2_bitfit(const u8 * buf,const unsigned int len,u32 goal,u8 state)218 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
219 		       u32 goal, u8 state)
220 {
221 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
222 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
223 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
224 	u64 tmp;
225 	u64 mask = 0x5555555555555555ULL;
226 	u32 bit;
227 
228 	/* Mask off bits we don't care about at the start of the search */
229 	mask <<= spoint;
230 	tmp = gfs2_bit_search(ptr, mask, state);
231 	ptr++;
232 	while(tmp == 0 && ptr < end) {
233 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
234 		ptr++;
235 	}
236 	/* Mask off any bits which are more than len bytes from the start */
237 	if (ptr == end && (len & (sizeof(u64) - 1)))
238 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
239 	/* Didn't find anything, so return */
240 	if (tmp == 0)
241 		return BFITNOENT;
242 	ptr--;
243 	bit = __ffs64(tmp);
244 	bit /= 2;	/* two bits per entry in the bitmap */
245 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
246 }
247 
248 /**
249  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
250  * @rbm: The rbm with rgd already set correctly
251  * @block: The block number (filesystem relative)
252  *
253  * This sets the bi and offset members of an rbm based on a
254  * resource group and a filesystem relative block number. The
255  * resource group must be set in the rbm on entry, the bi and
256  * offset members will be set by this function.
257  *
258  * Returns: 0 on success, or an error code
259  */
260 
gfs2_rbm_from_block(struct gfs2_rbm * rbm,u64 block)261 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
262 {
263 	if (!rgrp_contains_block(rbm->rgd, block))
264 		return -E2BIG;
265 	rbm->bii = 0;
266 	rbm->offset = block - rbm->rgd->rd_data0;
267 	/* Check if the block is within the first block */
268 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
269 		return 0;
270 
271 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
272 	rbm->offset += (sizeof(struct gfs2_rgrp) -
273 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
274 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
275 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
276 	return 0;
277 }
278 
279 /**
280  * gfs2_rbm_incr - increment an rbm structure
281  * @rbm: The rbm with rgd already set correctly
282  *
283  * This function takes an existing rbm structure and increments it to the next
284  * viable block offset.
285  *
286  * Returns: If incrementing the offset would cause the rbm to go past the
287  *          end of the rgrp, true is returned, otherwise false.
288  *
289  */
290 
gfs2_rbm_incr(struct gfs2_rbm * rbm)291 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
292 {
293 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
294 		rbm->offset++;
295 		return false;
296 	}
297 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
298 		return true;
299 
300 	rbm->offset = 0;
301 	rbm->bii++;
302 	return false;
303 }
304 
305 /**
306  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
307  * @rbm: Position to search (value/result)
308  * @n_unaligned: Number of unaligned blocks to check
309  * @len: Decremented for each block found (terminate on zero)
310  *
311  * Returns: true if a non-free block is encountered
312  */
313 
gfs2_unaligned_extlen(struct gfs2_rbm * rbm,u32 n_unaligned,u32 * len)314 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
315 {
316 	u32 n;
317 	u8 res;
318 
319 	for (n = 0; n < n_unaligned; n++) {
320 		res = gfs2_testbit(rbm, true);
321 		if (res != GFS2_BLKST_FREE)
322 			return true;
323 		(*len)--;
324 		if (*len == 0)
325 			return true;
326 		if (gfs2_rbm_incr(rbm))
327 			return true;
328 	}
329 
330 	return false;
331 }
332 
333 /**
334  * gfs2_free_extlen - Return extent length of free blocks
335  * @rrbm: Starting position
336  * @len: Max length to check
337  *
338  * Starting at the block specified by the rbm, see how many free blocks
339  * there are, not reading more than len blocks ahead. This can be done
340  * using memchr_inv when the blocks are byte aligned, but has to be done
341  * on a block by block basis in case of unaligned blocks. Also this
342  * function can cope with bitmap boundaries (although it must stop on
343  * a resource group boundary)
344  *
345  * Returns: Number of free blocks in the extent
346  */
347 
gfs2_free_extlen(const struct gfs2_rbm * rrbm,u32 len)348 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
349 {
350 	struct gfs2_rbm rbm = *rrbm;
351 	u32 n_unaligned = rbm.offset & 3;
352 	u32 size = len;
353 	u32 bytes;
354 	u32 chunk_size;
355 	u8 *ptr, *start, *end;
356 	u64 block;
357 	struct gfs2_bitmap *bi;
358 
359 	if (n_unaligned &&
360 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
361 		goto out;
362 
363 	n_unaligned = len & 3;
364 	/* Start is now byte aligned */
365 	while (len > 3) {
366 		bi = rbm_bi(&rbm);
367 		start = bi->bi_bh->b_data;
368 		if (bi->bi_clone)
369 			start = bi->bi_clone;
370 		start += bi->bi_offset;
371 		end = start + bi->bi_bytes;
372 		BUG_ON(rbm.offset & 3);
373 		start += (rbm.offset / GFS2_NBBY);
374 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
375 		ptr = memchr_inv(start, 0, bytes);
376 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
377 		chunk_size *= GFS2_NBBY;
378 		BUG_ON(len < chunk_size);
379 		len -= chunk_size;
380 		block = gfs2_rbm_to_block(&rbm);
381 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
382 			n_unaligned = 0;
383 			break;
384 		}
385 		if (ptr) {
386 			n_unaligned = 3;
387 			break;
388 		}
389 		n_unaligned = len & 3;
390 	}
391 
392 	/* Deal with any bits left over at the end */
393 	if (n_unaligned)
394 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
395 out:
396 	return size - len;
397 }
398 
399 /**
400  * gfs2_bitcount - count the number of bits in a certain state
401  * @rgd: the resource group descriptor
402  * @buffer: the buffer that holds the bitmaps
403  * @buflen: the length (in bytes) of the buffer
404  * @state: the state of the block we're looking for
405  *
406  * Returns: The number of bits
407  */
408 
gfs2_bitcount(struct gfs2_rgrpd * rgd,const u8 * buffer,unsigned int buflen,u8 state)409 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
410 			 unsigned int buflen, u8 state)
411 {
412 	const u8 *byte = buffer;
413 	const u8 *end = buffer + buflen;
414 	const u8 state1 = state << 2;
415 	const u8 state2 = state << 4;
416 	const u8 state3 = state << 6;
417 	u32 count = 0;
418 
419 	for (; byte < end; byte++) {
420 		if (((*byte) & 0x03) == state)
421 			count++;
422 		if (((*byte) & 0x0C) == state1)
423 			count++;
424 		if (((*byte) & 0x30) == state2)
425 			count++;
426 		if (((*byte) & 0xC0) == state3)
427 			count++;
428 	}
429 
430 	return count;
431 }
432 
433 /**
434  * gfs2_rgrp_verify - Verify that a resource group is consistent
435  * @rgd: the rgrp
436  *
437  */
438 
gfs2_rgrp_verify(struct gfs2_rgrpd * rgd)439 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
440 {
441 	struct gfs2_sbd *sdp = rgd->rd_sbd;
442 	struct gfs2_bitmap *bi = NULL;
443 	u32 length = rgd->rd_length;
444 	u32 count[4], tmp;
445 	int buf, x;
446 
447 	memset(count, 0, 4 * sizeof(u32));
448 
449 	/* Count # blocks in each of 4 possible allocation states */
450 	for (buf = 0; buf < length; buf++) {
451 		bi = rgd->rd_bits + buf;
452 		for (x = 0; x < 4; x++)
453 			count[x] += gfs2_bitcount(rgd,
454 						  bi->bi_bh->b_data +
455 						  bi->bi_offset,
456 						  bi->bi_bytes, x);
457 	}
458 
459 	if (count[0] != rgd->rd_free) {
460 		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
461 			count[0], rgd->rd_free);
462 		gfs2_consist_rgrpd(rgd);
463 		return;
464 	}
465 
466 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
467 	if (count[1] != tmp) {
468 		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
469 			count[1], tmp);
470 		gfs2_consist_rgrpd(rgd);
471 		return;
472 	}
473 
474 	if (count[2] + count[3] != rgd->rd_dinodes) {
475 		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
476 			count[2] + count[3], rgd->rd_dinodes);
477 		gfs2_consist_rgrpd(rgd);
478 		return;
479 	}
480 }
481 
482 /**
483  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
484  * @sdp: The GFS2 superblock
485  * @blk: The data block number
486  * @exact: True if this needs to be an exact match
487  *
488  * The @exact argument should be set to true by most callers. The exception
489  * is when we need to match blocks which are not represented by the rgrp
490  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
491  * there for alignment purposes. Another way of looking at it is that @exact
492  * matches only valid data/metadata blocks, but with @exact false, it will
493  * match any block within the extent of the rgrp.
494  *
495  * Returns: The resource group, or NULL if not found
496  */
497 
gfs2_blk2rgrpd(struct gfs2_sbd * sdp,u64 blk,bool exact)498 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
499 {
500 	struct rb_node *n, *next;
501 	struct gfs2_rgrpd *cur;
502 
503 	spin_lock(&sdp->sd_rindex_spin);
504 	n = sdp->sd_rindex_tree.rb_node;
505 	while (n) {
506 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
507 		next = NULL;
508 		if (blk < cur->rd_addr)
509 			next = n->rb_left;
510 		else if (blk >= cur->rd_data0 + cur->rd_data)
511 			next = n->rb_right;
512 		if (next == NULL) {
513 			spin_unlock(&sdp->sd_rindex_spin);
514 			if (exact) {
515 				if (blk < cur->rd_addr)
516 					return NULL;
517 				if (blk >= cur->rd_data0 + cur->rd_data)
518 					return NULL;
519 			}
520 			return cur;
521 		}
522 		n = next;
523 	}
524 	spin_unlock(&sdp->sd_rindex_spin);
525 
526 	return NULL;
527 }
528 
529 /**
530  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
531  * @sdp: The GFS2 superblock
532  *
533  * Returns: The first rgrp in the filesystem
534  */
535 
gfs2_rgrpd_get_first(struct gfs2_sbd * sdp)536 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
537 {
538 	const struct rb_node *n;
539 	struct gfs2_rgrpd *rgd;
540 
541 	spin_lock(&sdp->sd_rindex_spin);
542 	n = rb_first(&sdp->sd_rindex_tree);
543 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
544 	spin_unlock(&sdp->sd_rindex_spin);
545 
546 	return rgd;
547 }
548 
549 /**
550  * gfs2_rgrpd_get_next - get the next RG
551  * @rgd: the resource group descriptor
552  *
553  * Returns: The next rgrp
554  */
555 
gfs2_rgrpd_get_next(struct gfs2_rgrpd * rgd)556 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
557 {
558 	struct gfs2_sbd *sdp = rgd->rd_sbd;
559 	const struct rb_node *n;
560 
561 	spin_lock(&sdp->sd_rindex_spin);
562 	n = rb_next(&rgd->rd_node);
563 	if (n == NULL)
564 		n = rb_first(&sdp->sd_rindex_tree);
565 
566 	if (unlikely(&rgd->rd_node == n)) {
567 		spin_unlock(&sdp->sd_rindex_spin);
568 		return NULL;
569 	}
570 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
571 	spin_unlock(&sdp->sd_rindex_spin);
572 	return rgd;
573 }
574 
check_and_update_goal(struct gfs2_inode * ip)575 void check_and_update_goal(struct gfs2_inode *ip)
576 {
577 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
578 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
579 		ip->i_goal = ip->i_no_addr;
580 }
581 
gfs2_free_clones(struct gfs2_rgrpd * rgd)582 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
583 {
584 	int x;
585 
586 	for (x = 0; x < rgd->rd_length; x++) {
587 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
588 		kfree(bi->bi_clone);
589 		bi->bi_clone = NULL;
590 	}
591 }
592 
dump_rs(struct seq_file * seq,const struct gfs2_blkreserv * rs,const char * fs_id_buf)593 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
594 		    const char *fs_id_buf)
595 {
596 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
597 
598 	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
599 		       (unsigned long long)ip->i_no_addr,
600 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
601 		       rs->rs_rbm.offset, rs->rs_free);
602 }
603 
604 /**
605  * __rs_deltree - remove a multi-block reservation from the rgd tree
606  * @rs: The reservation to remove
607  *
608  */
__rs_deltree(struct gfs2_blkreserv * rs)609 static void __rs_deltree(struct gfs2_blkreserv *rs)
610 {
611 	struct gfs2_rgrpd *rgd;
612 
613 	if (!gfs2_rs_active(rs))
614 		return;
615 
616 	rgd = rs->rs_rbm.rgd;
617 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
618 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
619 	RB_CLEAR_NODE(&rs->rs_node);
620 
621 	if (rs->rs_free) {
622 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
623 				 rs->rs_free - 1;
624 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
625 		struct gfs2_bitmap *start, *last;
626 
627 		/* return reserved blocks to the rgrp */
628 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
629 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
630 		/* The rgrp extent failure point is likely not to increase;
631 		   it will only do so if the freed blocks are somehow
632 		   contiguous with a span of free blocks that follows. Still,
633 		   it will force the number to be recalculated later. */
634 		rgd->rd_extfail_pt += rs->rs_free;
635 		rs->rs_free = 0;
636 		if (gfs2_rbm_from_block(&last_rbm, last_block))
637 			return;
638 		start = rbm_bi(&rs->rs_rbm);
639 		last = rbm_bi(&last_rbm);
640 		do
641 			clear_bit(GBF_FULL, &start->bi_flags);
642 		while (start++ != last);
643 	}
644 }
645 
646 /**
647  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
648  * @rs: The reservation to remove
649  *
650  */
gfs2_rs_deltree(struct gfs2_blkreserv * rs)651 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
652 {
653 	struct gfs2_rgrpd *rgd;
654 
655 	rgd = rs->rs_rbm.rgd;
656 	if (rgd) {
657 		spin_lock(&rgd->rd_rsspin);
658 		__rs_deltree(rs);
659 		BUG_ON(rs->rs_free);
660 		spin_unlock(&rgd->rd_rsspin);
661 	}
662 }
663 
664 /**
665  * gfs2_rs_delete - delete a multi-block reservation
666  * @ip: The inode for this reservation
667  *
668  */
gfs2_rs_delete(struct gfs2_inode * ip)669 void gfs2_rs_delete(struct gfs2_inode *ip)
670 {
671 	struct inode *inode = &ip->i_inode;
672 
673 	down_write(&ip->i_rw_mutex);
674 	if (atomic_read(&inode->i_writecount) <= 1)
675 		gfs2_rs_deltree(&ip->i_res);
676 	up_write(&ip->i_rw_mutex);
677 }
678 
679 /**
680  * return_all_reservations - return all reserved blocks back to the rgrp.
681  * @rgd: the rgrp that needs its space back
682  *
683  * We previously reserved a bunch of blocks for allocation. Now we need to
684  * give them back. This leave the reservation structures in tact, but removes
685  * all of their corresponding "no-fly zones".
686  */
return_all_reservations(struct gfs2_rgrpd * rgd)687 static void return_all_reservations(struct gfs2_rgrpd *rgd)
688 {
689 	struct rb_node *n;
690 	struct gfs2_blkreserv *rs;
691 
692 	spin_lock(&rgd->rd_rsspin);
693 	while ((n = rb_first(&rgd->rd_rstree))) {
694 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
695 		__rs_deltree(rs);
696 	}
697 	spin_unlock(&rgd->rd_rsspin);
698 }
699 
gfs2_clear_rgrpd(struct gfs2_sbd * sdp)700 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
701 {
702 	struct rb_node *n;
703 	struct gfs2_rgrpd *rgd;
704 	struct gfs2_glock *gl;
705 
706 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
707 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
708 		gl = rgd->rd_gl;
709 
710 		rb_erase(n, &sdp->sd_rindex_tree);
711 
712 		if (gl) {
713 			if (gl->gl_state != LM_ST_UNLOCKED) {
714 				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
715 				flush_delayed_work(&gl->gl_work);
716 			}
717 			gfs2_rgrp_brelse(rgd);
718 			glock_clear_object(gl, rgd);
719 			gfs2_glock_put(gl);
720 		}
721 
722 		gfs2_free_clones(rgd);
723 		return_all_reservations(rgd);
724 		kfree(rgd->rd_bits);
725 		rgd->rd_bits = NULL;
726 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
727 	}
728 }
729 
730 /**
731  * gfs2_compute_bitstructs - Compute the bitmap sizes
732  * @rgd: The resource group descriptor
733  *
734  * Calculates bitmap descriptors, one for each block that contains bitmap data
735  *
736  * Returns: errno
737  */
738 
compute_bitstructs(struct gfs2_rgrpd * rgd)739 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
740 {
741 	struct gfs2_sbd *sdp = rgd->rd_sbd;
742 	struct gfs2_bitmap *bi;
743 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
744 	u32 bytes_left, bytes;
745 	int x;
746 
747 	if (!length)
748 		return -EINVAL;
749 
750 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
751 	if (!rgd->rd_bits)
752 		return -ENOMEM;
753 
754 	bytes_left = rgd->rd_bitbytes;
755 
756 	for (x = 0; x < length; x++) {
757 		bi = rgd->rd_bits + x;
758 
759 		bi->bi_flags = 0;
760 		/* small rgrp; bitmap stored completely in header block */
761 		if (length == 1) {
762 			bytes = bytes_left;
763 			bi->bi_offset = sizeof(struct gfs2_rgrp);
764 			bi->bi_start = 0;
765 			bi->bi_bytes = bytes;
766 			bi->bi_blocks = bytes * GFS2_NBBY;
767 		/* header block */
768 		} else if (x == 0) {
769 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
770 			bi->bi_offset = sizeof(struct gfs2_rgrp);
771 			bi->bi_start = 0;
772 			bi->bi_bytes = bytes;
773 			bi->bi_blocks = bytes * GFS2_NBBY;
774 		/* last block */
775 		} else if (x + 1 == length) {
776 			bytes = bytes_left;
777 			bi->bi_offset = sizeof(struct gfs2_meta_header);
778 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
779 			bi->bi_bytes = bytes;
780 			bi->bi_blocks = bytes * GFS2_NBBY;
781 		/* other blocks */
782 		} else {
783 			bytes = sdp->sd_sb.sb_bsize -
784 				sizeof(struct gfs2_meta_header);
785 			bi->bi_offset = sizeof(struct gfs2_meta_header);
786 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
787 			bi->bi_bytes = bytes;
788 			bi->bi_blocks = bytes * GFS2_NBBY;
789 		}
790 
791 		bytes_left -= bytes;
792 	}
793 
794 	if (bytes_left) {
795 		gfs2_consist_rgrpd(rgd);
796 		return -EIO;
797 	}
798 	bi = rgd->rd_bits + (length - 1);
799 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
800 		gfs2_lm(sdp,
801 			"ri_addr = %llu\n"
802 			"ri_length = %u\n"
803 			"ri_data0 = %llu\n"
804 			"ri_data = %u\n"
805 			"ri_bitbytes = %u\n"
806 			"start=%u len=%u offset=%u\n",
807 			(unsigned long long)rgd->rd_addr,
808 			rgd->rd_length,
809 			(unsigned long long)rgd->rd_data0,
810 			rgd->rd_data,
811 			rgd->rd_bitbytes,
812 			bi->bi_start, bi->bi_bytes, bi->bi_offset);
813 		gfs2_consist_rgrpd(rgd);
814 		return -EIO;
815 	}
816 
817 	return 0;
818 }
819 
820 /**
821  * gfs2_ri_total - Total up the file system space, according to the rindex.
822  * @sdp: the filesystem
823  *
824  */
gfs2_ri_total(struct gfs2_sbd * sdp)825 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
826 {
827 	u64 total_data = 0;
828 	struct inode *inode = sdp->sd_rindex;
829 	struct gfs2_inode *ip = GFS2_I(inode);
830 	char buf[sizeof(struct gfs2_rindex)];
831 	int error, rgrps;
832 
833 	for (rgrps = 0;; rgrps++) {
834 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
835 
836 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
837 			break;
838 		error = gfs2_internal_read(ip, buf, &pos,
839 					   sizeof(struct gfs2_rindex));
840 		if (error != sizeof(struct gfs2_rindex))
841 			break;
842 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
843 	}
844 	return total_data;
845 }
846 
rgd_insert(struct gfs2_rgrpd * rgd)847 static int rgd_insert(struct gfs2_rgrpd *rgd)
848 {
849 	struct gfs2_sbd *sdp = rgd->rd_sbd;
850 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
851 
852 	/* Figure out where to put new node */
853 	while (*newn) {
854 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
855 						  rd_node);
856 
857 		parent = *newn;
858 		if (rgd->rd_addr < cur->rd_addr)
859 			newn = &((*newn)->rb_left);
860 		else if (rgd->rd_addr > cur->rd_addr)
861 			newn = &((*newn)->rb_right);
862 		else
863 			return -EEXIST;
864 	}
865 
866 	rb_link_node(&rgd->rd_node, parent, newn);
867 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
868 	sdp->sd_rgrps++;
869 	return 0;
870 }
871 
872 /**
873  * read_rindex_entry - Pull in a new resource index entry from the disk
874  * @ip: Pointer to the rindex inode
875  *
876  * Returns: 0 on success, > 0 on EOF, error code otherwise
877  */
878 
read_rindex_entry(struct gfs2_inode * ip)879 static int read_rindex_entry(struct gfs2_inode *ip)
880 {
881 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
882 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
883 	struct gfs2_rindex buf;
884 	int error;
885 	struct gfs2_rgrpd *rgd;
886 
887 	if (pos >= i_size_read(&ip->i_inode))
888 		return 1;
889 
890 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
891 				   sizeof(struct gfs2_rindex));
892 
893 	if (error != sizeof(struct gfs2_rindex))
894 		return (error == 0) ? 1 : error;
895 
896 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
897 	error = -ENOMEM;
898 	if (!rgd)
899 		return error;
900 
901 	rgd->rd_sbd = sdp;
902 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
903 	rgd->rd_length = be32_to_cpu(buf.ri_length);
904 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
905 	rgd->rd_data = be32_to_cpu(buf.ri_data);
906 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
907 	spin_lock_init(&rgd->rd_rsspin);
908 
909 	error = gfs2_glock_get(sdp, rgd->rd_addr,
910 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
911 	if (error)
912 		goto fail;
913 
914 	error = compute_bitstructs(rgd);
915 	if (error)
916 		goto fail_glock;
917 
918 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
919 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
920 	if (rgd->rd_data > sdp->sd_max_rg_data)
921 		sdp->sd_max_rg_data = rgd->rd_data;
922 	spin_lock(&sdp->sd_rindex_spin);
923 	error = rgd_insert(rgd);
924 	spin_unlock(&sdp->sd_rindex_spin);
925 	if (!error) {
926 		glock_set_object(rgd->rd_gl, rgd);
927 		return 0;
928 	}
929 
930 	error = 0; /* someone else read in the rgrp; free it and ignore it */
931 fail_glock:
932 	gfs2_glock_put(rgd->rd_gl);
933 
934 fail:
935 	kfree(rgd->rd_bits);
936 	rgd->rd_bits = NULL;
937 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
938 	return error;
939 }
940 
941 /**
942  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
943  * @sdp: the GFS2 superblock
944  *
945  * The purpose of this function is to select a subset of the resource groups
946  * and mark them as PREFERRED. We do it in such a way that each node prefers
947  * to use a unique set of rgrps to minimize glock contention.
948  */
set_rgrp_preferences(struct gfs2_sbd * sdp)949 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
950 {
951 	struct gfs2_rgrpd *rgd, *first;
952 	int i;
953 
954 	/* Skip an initial number of rgrps, based on this node's journal ID.
955 	   That should start each node out on its own set. */
956 	rgd = gfs2_rgrpd_get_first(sdp);
957 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
958 		rgd = gfs2_rgrpd_get_next(rgd);
959 	first = rgd;
960 
961 	do {
962 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
963 		for (i = 0; i < sdp->sd_journals; i++) {
964 			rgd = gfs2_rgrpd_get_next(rgd);
965 			if (!rgd || rgd == first)
966 				break;
967 		}
968 	} while (rgd && rgd != first);
969 }
970 
971 /**
972  * gfs2_ri_update - Pull in a new resource index from the disk
973  * @ip: pointer to the rindex inode
974  *
975  * Returns: 0 on successful update, error code otherwise
976  */
977 
gfs2_ri_update(struct gfs2_inode * ip)978 static int gfs2_ri_update(struct gfs2_inode *ip)
979 {
980 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
981 	int error;
982 
983 	do {
984 		error = read_rindex_entry(ip);
985 	} while (error == 0);
986 
987 	if (error < 0)
988 		return error;
989 
990 	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
991 		fs_err(sdp, "no resource groups found in the file system.\n");
992 		return -ENOENT;
993 	}
994 	set_rgrp_preferences(sdp);
995 
996 	sdp->sd_rindex_uptodate = 1;
997 	return 0;
998 }
999 
1000 /**
1001  * gfs2_rindex_update - Update the rindex if required
1002  * @sdp: The GFS2 superblock
1003  *
1004  * We grab a lock on the rindex inode to make sure that it doesn't
1005  * change whilst we are performing an operation. We keep this lock
1006  * for quite long periods of time compared to other locks. This
1007  * doesn't matter, since it is shared and it is very, very rarely
1008  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1009  *
1010  * This makes sure that we're using the latest copy of the resource index
1011  * special file, which might have been updated if someone expanded the
1012  * filesystem (via gfs2_grow utility), which adds new resource groups.
1013  *
1014  * Returns: 0 on succeess, error code otherwise
1015  */
1016 
gfs2_rindex_update(struct gfs2_sbd * sdp)1017 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1018 {
1019 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1020 	struct gfs2_glock *gl = ip->i_gl;
1021 	struct gfs2_holder ri_gh;
1022 	int error = 0;
1023 	int unlock_required = 0;
1024 
1025 	/* Read new copy from disk if we don't have the latest */
1026 	if (!sdp->sd_rindex_uptodate) {
1027 		if (!gfs2_glock_is_locked_by_me(gl)) {
1028 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1029 			if (error)
1030 				return error;
1031 			unlock_required = 1;
1032 		}
1033 		if (!sdp->sd_rindex_uptodate)
1034 			error = gfs2_ri_update(ip);
1035 		if (unlock_required)
1036 			gfs2_glock_dq_uninit(&ri_gh);
1037 	}
1038 
1039 	return error;
1040 }
1041 
gfs2_rgrp_in(struct gfs2_rgrpd * rgd,const void * buf)1042 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1043 {
1044 	const struct gfs2_rgrp *str = buf;
1045 	u32 rg_flags;
1046 
1047 	rg_flags = be32_to_cpu(str->rg_flags);
1048 	rg_flags &= ~GFS2_RDF_MASK;
1049 	rgd->rd_flags &= GFS2_RDF_MASK;
1050 	rgd->rd_flags |= rg_flags;
1051 	rgd->rd_free = be32_to_cpu(str->rg_free);
1052 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1053 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1054 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1055 }
1056 
gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb * rgl,const void * buf)1057 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1058 {
1059 	const struct gfs2_rgrp *str = buf;
1060 
1061 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1062 	rgl->rl_flags = str->rg_flags;
1063 	rgl->rl_free = str->rg_free;
1064 	rgl->rl_dinodes = str->rg_dinodes;
1065 	rgl->rl_igeneration = str->rg_igeneration;
1066 	rgl->__pad = 0UL;
1067 }
1068 
gfs2_rgrp_out(struct gfs2_rgrpd * rgd,void * buf)1069 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1070 {
1071 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1072 	struct gfs2_rgrp *str = buf;
1073 	u32 crc;
1074 
1075 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1076 	str->rg_free = cpu_to_be32(rgd->rd_free);
1077 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1078 	if (next == NULL)
1079 		str->rg_skip = 0;
1080 	else if (next->rd_addr > rgd->rd_addr)
1081 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1082 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1083 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1084 	str->rg_data = cpu_to_be32(rgd->rd_data);
1085 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1086 	str->rg_crc = 0;
1087 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1088 	str->rg_crc = cpu_to_be32(crc);
1089 
1090 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1091 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1092 }
1093 
gfs2_rgrp_lvb_valid(struct gfs2_rgrpd * rgd)1094 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1095 {
1096 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1097 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1098 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1099 	int valid = 1;
1100 
1101 	if (rgl->rl_flags != str->rg_flags) {
1102 		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1103 			(unsigned long long)rgd->rd_addr,
1104 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1105 		valid = 0;
1106 	}
1107 	if (rgl->rl_free != str->rg_free) {
1108 		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1109 			(unsigned long long)rgd->rd_addr,
1110 			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1111 		valid = 0;
1112 	}
1113 	if (rgl->rl_dinodes != str->rg_dinodes) {
1114 		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1115 			(unsigned long long)rgd->rd_addr,
1116 			be32_to_cpu(rgl->rl_dinodes),
1117 			be32_to_cpu(str->rg_dinodes));
1118 		valid = 0;
1119 	}
1120 	if (rgl->rl_igeneration != str->rg_igeneration) {
1121 		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1122 			(unsigned long long)rgd->rd_addr,
1123 			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1124 			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1125 		valid = 0;
1126 	}
1127 	return valid;
1128 }
1129 
count_unlinked(struct gfs2_rgrpd * rgd)1130 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1131 {
1132 	struct gfs2_bitmap *bi;
1133 	const u32 length = rgd->rd_length;
1134 	const u8 *buffer = NULL;
1135 	u32 i, goal, count = 0;
1136 
1137 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1138 		goal = 0;
1139 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1140 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1141 		while (goal < bi->bi_blocks) {
1142 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1143 					   GFS2_BLKST_UNLINKED);
1144 			if (goal == BFITNOENT)
1145 				break;
1146 			count++;
1147 			goal++;
1148 		}
1149 	}
1150 
1151 	return count;
1152 }
1153 
1154 
1155 /**
1156  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1157  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1158  *
1159  * Read in all of a Resource Group's header and bitmap blocks.
1160  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1161  *
1162  * Returns: errno
1163  */
1164 
gfs2_rgrp_bh_get(struct gfs2_rgrpd * rgd)1165 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1166 {
1167 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1168 	struct gfs2_glock *gl = rgd->rd_gl;
1169 	unsigned int length = rgd->rd_length;
1170 	struct gfs2_bitmap *bi;
1171 	unsigned int x, y;
1172 	int error;
1173 
1174 	if (rgd->rd_bits[0].bi_bh != NULL)
1175 		return 0;
1176 
1177 	for (x = 0; x < length; x++) {
1178 		bi = rgd->rd_bits + x;
1179 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1180 		if (error)
1181 			goto fail;
1182 	}
1183 
1184 	for (y = length; y--;) {
1185 		bi = rgd->rd_bits + y;
1186 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1187 		if (error)
1188 			goto fail;
1189 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1190 					      GFS2_METATYPE_RG)) {
1191 			error = -EIO;
1192 			goto fail;
1193 		}
1194 	}
1195 
1196 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1197 		for (x = 0; x < length; x++)
1198 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1199 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1200 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1201 		rgd->rd_free_clone = rgd->rd_free;
1202 		/* max out the rgrp allocation failure point */
1203 		rgd->rd_extfail_pt = rgd->rd_free;
1204 	}
1205 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1206 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1207 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1208 				     rgd->rd_bits[0].bi_bh->b_data);
1209 	}
1210 	else if (sdp->sd_args.ar_rgrplvb) {
1211 		if (!gfs2_rgrp_lvb_valid(rgd)){
1212 			gfs2_consist_rgrpd(rgd);
1213 			error = -EIO;
1214 			goto fail;
1215 		}
1216 		if (rgd->rd_rgl->rl_unlinked == 0)
1217 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1218 	}
1219 	return 0;
1220 
1221 fail:
1222 	while (x--) {
1223 		bi = rgd->rd_bits + x;
1224 		brelse(bi->bi_bh);
1225 		bi->bi_bh = NULL;
1226 		gfs2_assert_warn(sdp, !bi->bi_clone);
1227 	}
1228 
1229 	return error;
1230 }
1231 
update_rgrp_lvb(struct gfs2_rgrpd * rgd)1232 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1233 {
1234 	u32 rl_flags;
1235 
1236 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1237 		return 0;
1238 
1239 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1240 		return gfs2_rgrp_bh_get(rgd);
1241 
1242 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1243 	rl_flags &= ~GFS2_RDF_MASK;
1244 	rgd->rd_flags &= GFS2_RDF_MASK;
1245 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1246 	if (rgd->rd_rgl->rl_unlinked == 0)
1247 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1248 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1249 	rgd->rd_free_clone = rgd->rd_free;
1250 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1251 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1252 	return 0;
1253 }
1254 
gfs2_rgrp_go_lock(struct gfs2_holder * gh)1255 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1256 {
1257 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1258 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1259 
1260 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1261 		return 0;
1262 	return gfs2_rgrp_bh_get(rgd);
1263 }
1264 
1265 /**
1266  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1267  * @rgd: The resource group
1268  *
1269  */
1270 
gfs2_rgrp_brelse(struct gfs2_rgrpd * rgd)1271 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1272 {
1273 	int x, length = rgd->rd_length;
1274 
1275 	for (x = 0; x < length; x++) {
1276 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1277 		if (bi->bi_bh) {
1278 			brelse(bi->bi_bh);
1279 			bi->bi_bh = NULL;
1280 		}
1281 	}
1282 }
1283 
gfs2_rgrp_send_discards(struct gfs2_sbd * sdp,u64 offset,struct buffer_head * bh,const struct gfs2_bitmap * bi,unsigned minlen,u64 * ptrimmed)1284 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1285 			     struct buffer_head *bh,
1286 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1287 {
1288 	struct super_block *sb = sdp->sd_vfs;
1289 	u64 blk;
1290 	sector_t start = 0;
1291 	sector_t nr_blks = 0;
1292 	int rv;
1293 	unsigned int x;
1294 	u32 trimmed = 0;
1295 	u8 diff;
1296 
1297 	for (x = 0; x < bi->bi_bytes; x++) {
1298 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1299 		clone += bi->bi_offset;
1300 		clone += x;
1301 		if (bh) {
1302 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1303 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1304 		} else {
1305 			diff = ~(*clone | (*clone >> 1));
1306 		}
1307 		diff &= 0x55;
1308 		if (diff == 0)
1309 			continue;
1310 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1311 		while(diff) {
1312 			if (diff & 1) {
1313 				if (nr_blks == 0)
1314 					goto start_new_extent;
1315 				if ((start + nr_blks) != blk) {
1316 					if (nr_blks >= minlen) {
1317 						rv = sb_issue_discard(sb,
1318 							start, nr_blks,
1319 							GFP_NOFS, 0);
1320 						if (rv)
1321 							goto fail;
1322 						trimmed += nr_blks;
1323 					}
1324 					nr_blks = 0;
1325 start_new_extent:
1326 					start = blk;
1327 				}
1328 				nr_blks++;
1329 			}
1330 			diff >>= 2;
1331 			blk++;
1332 		}
1333 	}
1334 	if (nr_blks >= minlen) {
1335 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1336 		if (rv)
1337 			goto fail;
1338 		trimmed += nr_blks;
1339 	}
1340 	if (ptrimmed)
1341 		*ptrimmed = trimmed;
1342 	return 0;
1343 
1344 fail:
1345 	if (sdp->sd_args.ar_discard)
1346 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1347 	sdp->sd_args.ar_discard = 0;
1348 	return -EIO;
1349 }
1350 
1351 /**
1352  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1353  * @filp: Any file on the filesystem
1354  * @argp: Pointer to the arguments (also used to pass result)
1355  *
1356  * Returns: 0 on success, otherwise error code
1357  */
1358 
gfs2_fitrim(struct file * filp,void __user * argp)1359 int gfs2_fitrim(struct file *filp, void __user *argp)
1360 {
1361 	struct inode *inode = file_inode(filp);
1362 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1363 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1364 	struct buffer_head *bh;
1365 	struct gfs2_rgrpd *rgd;
1366 	struct gfs2_rgrpd *rgd_end;
1367 	struct gfs2_holder gh;
1368 	struct fstrim_range r;
1369 	int ret = 0;
1370 	u64 amt;
1371 	u64 trimmed = 0;
1372 	u64 start, end, minlen;
1373 	unsigned int x;
1374 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1375 
1376 	if (!capable(CAP_SYS_ADMIN))
1377 		return -EPERM;
1378 
1379 	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
1380 		return -EROFS;
1381 
1382 	if (!blk_queue_discard(q))
1383 		return -EOPNOTSUPP;
1384 
1385 	if (copy_from_user(&r, argp, sizeof(r)))
1386 		return -EFAULT;
1387 
1388 	ret = gfs2_rindex_update(sdp);
1389 	if (ret)
1390 		return ret;
1391 
1392 	start = r.start >> bs_shift;
1393 	end = start + (r.len >> bs_shift);
1394 	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
1395 	minlen = max_t(u64, minlen,
1396 		       q->limits.discard_granularity) >> bs_shift;
1397 
1398 	if (end <= start || minlen > sdp->sd_max_rg_data)
1399 		return -EINVAL;
1400 
1401 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1402 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1403 
1404 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1405 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1406 		return -EINVAL; /* start is beyond the end of the fs */
1407 
1408 	while (1) {
1409 
1410 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1411 		if (ret)
1412 			goto out;
1413 
1414 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1415 			/* Trim each bitmap in the rgrp */
1416 			for (x = 0; x < rgd->rd_length; x++) {
1417 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1418 				ret = gfs2_rgrp_send_discards(sdp,
1419 						rgd->rd_data0, NULL, bi, minlen,
1420 						&amt);
1421 				if (ret) {
1422 					gfs2_glock_dq_uninit(&gh);
1423 					goto out;
1424 				}
1425 				trimmed += amt;
1426 			}
1427 
1428 			/* Mark rgrp as having been trimmed */
1429 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1430 			if (ret == 0) {
1431 				bh = rgd->rd_bits[0].bi_bh;
1432 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1433 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1434 				gfs2_rgrp_out(rgd, bh->b_data);
1435 				gfs2_trans_end(sdp);
1436 			}
1437 		}
1438 		gfs2_glock_dq_uninit(&gh);
1439 
1440 		if (rgd == rgd_end)
1441 			break;
1442 
1443 		rgd = gfs2_rgrpd_get_next(rgd);
1444 	}
1445 
1446 out:
1447 	r.len = trimmed << bs_shift;
1448 	if (copy_to_user(argp, &r, sizeof(r)))
1449 		return -EFAULT;
1450 
1451 	return ret;
1452 }
1453 
1454 /**
1455  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1456  * @ip: the inode structure
1457  *
1458  */
rs_insert(struct gfs2_inode * ip)1459 static void rs_insert(struct gfs2_inode *ip)
1460 {
1461 	struct rb_node **newn, *parent = NULL;
1462 	int rc;
1463 	struct gfs2_blkreserv *rs = &ip->i_res;
1464 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1465 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1466 
1467 	BUG_ON(gfs2_rs_active(rs));
1468 
1469 	spin_lock(&rgd->rd_rsspin);
1470 	newn = &rgd->rd_rstree.rb_node;
1471 	while (*newn) {
1472 		struct gfs2_blkreserv *cur =
1473 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1474 
1475 		parent = *newn;
1476 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1477 		if (rc > 0)
1478 			newn = &((*newn)->rb_right);
1479 		else if (rc < 0)
1480 			newn = &((*newn)->rb_left);
1481 		else {
1482 			spin_unlock(&rgd->rd_rsspin);
1483 			WARN_ON(1);
1484 			return;
1485 		}
1486 	}
1487 
1488 	rb_link_node(&rs->rs_node, parent, newn);
1489 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1490 
1491 	/* Do our rgrp accounting for the reservation */
1492 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1493 	spin_unlock(&rgd->rd_rsspin);
1494 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1495 }
1496 
1497 /**
1498  * rgd_free - return the number of free blocks we can allocate.
1499  * @rgd: the resource group
1500  *
1501  * This function returns the number of free blocks for an rgrp.
1502  * That's the clone-free blocks (blocks that are free, not including those
1503  * still being used for unlinked files that haven't been deleted.)
1504  *
1505  * It also subtracts any blocks reserved by someone else, but does not
1506  * include free blocks that are still part of our current reservation,
1507  * because obviously we can (and will) allocate them.
1508  */
rgd_free(struct gfs2_rgrpd * rgd,struct gfs2_blkreserv * rs)1509 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1510 {
1511 	u32 tot_reserved, tot_free;
1512 
1513 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1514 		return 0;
1515 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1516 
1517 	if (rgd->rd_free_clone < tot_reserved)
1518 		tot_reserved = 0;
1519 
1520 	tot_free = rgd->rd_free_clone - tot_reserved;
1521 
1522 	return tot_free;
1523 }
1524 
1525 /**
1526  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1527  * @rgd: the resource group descriptor
1528  * @ip: pointer to the inode for which we're reserving blocks
1529  * @ap: the allocation parameters
1530  *
1531  */
1532 
rg_mblk_search(struct gfs2_rgrpd * rgd,struct gfs2_inode * ip,const struct gfs2_alloc_parms * ap)1533 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1534 			   const struct gfs2_alloc_parms *ap)
1535 {
1536 	struct gfs2_rbm rbm = { .rgd = rgd, };
1537 	u64 goal;
1538 	struct gfs2_blkreserv *rs = &ip->i_res;
1539 	u32 extlen;
1540 	u32 free_blocks = rgd_free(rgd, rs);
1541 	int ret;
1542 	struct inode *inode = &ip->i_inode;
1543 
1544 	if (S_ISDIR(inode->i_mode))
1545 		extlen = 1;
1546 	else {
1547 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1548 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1549 	}
1550 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1551 		return;
1552 
1553 	/* Find bitmap block that contains bits for goal block */
1554 	if (rgrp_contains_block(rgd, ip->i_goal))
1555 		goal = ip->i_goal;
1556 	else
1557 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1558 
1559 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1560 		return;
1561 
1562 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1563 	if (ret == 0) {
1564 		rs->rs_rbm = rbm;
1565 		rs->rs_free = extlen;
1566 		rs_insert(ip);
1567 	} else {
1568 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1569 			rgd->rd_last_alloc = 0;
1570 	}
1571 }
1572 
1573 /**
1574  * gfs2_next_unreserved_block - Return next block that is not reserved
1575  * @rgd: The resource group
1576  * @block: The starting block
1577  * @length: The required length
1578  * @ip: Ignore any reservations for this inode
1579  *
1580  * If the block does not appear in any reservation, then return the
1581  * block number unchanged. If it does appear in the reservation, then
1582  * keep looking through the tree of reservations in order to find the
1583  * first block number which is not reserved.
1584  */
1585 
gfs2_next_unreserved_block(struct gfs2_rgrpd * rgd,u64 block,u32 length,const struct gfs2_inode * ip)1586 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1587 				      u32 length,
1588 				      const struct gfs2_inode *ip)
1589 {
1590 	struct gfs2_blkreserv *rs;
1591 	struct rb_node *n;
1592 	int rc;
1593 
1594 	spin_lock(&rgd->rd_rsspin);
1595 	n = rgd->rd_rstree.rb_node;
1596 	while (n) {
1597 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1598 		rc = rs_cmp(block, length, rs);
1599 		if (rc < 0)
1600 			n = n->rb_left;
1601 		else if (rc > 0)
1602 			n = n->rb_right;
1603 		else
1604 			break;
1605 	}
1606 
1607 	if (n) {
1608 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1609 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1610 			n = n->rb_right;
1611 			if (n == NULL)
1612 				break;
1613 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1614 		}
1615 	}
1616 
1617 	spin_unlock(&rgd->rd_rsspin);
1618 	return block;
1619 }
1620 
1621 /**
1622  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1623  * @rbm: The current position in the resource group
1624  * @ip: The inode for which we are searching for blocks
1625  * @minext: The minimum extent length
1626  * @maxext: A pointer to the maximum extent structure
1627  *
1628  * This checks the current position in the rgrp to see whether there is
1629  * a reservation covering this block. If not then this function is a
1630  * no-op. If there is, then the position is moved to the end of the
1631  * contiguous reservation(s) so that we are pointing at the first
1632  * non-reserved block.
1633  *
1634  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1635  */
1636 
gfs2_reservation_check_and_update(struct gfs2_rbm * rbm,const struct gfs2_inode * ip,u32 minext,struct gfs2_extent * maxext)1637 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1638 					     const struct gfs2_inode *ip,
1639 					     u32 minext,
1640 					     struct gfs2_extent *maxext)
1641 {
1642 	u64 block = gfs2_rbm_to_block(rbm);
1643 	u32 extlen = 1;
1644 	u64 nblock;
1645 	int ret;
1646 
1647 	/*
1648 	 * If we have a minimum extent length, then skip over any extent
1649 	 * which is less than the min extent length in size.
1650 	 */
1651 	if (minext) {
1652 		extlen = gfs2_free_extlen(rbm, minext);
1653 		if (extlen <= maxext->len)
1654 			goto fail;
1655 	}
1656 
1657 	/*
1658 	 * Check the extent which has been found against the reservations
1659 	 * and skip if parts of it are already reserved
1660 	 */
1661 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1662 	if (nblock == block) {
1663 		if (!minext || extlen >= minext)
1664 			return 0;
1665 
1666 		if (extlen > maxext->len) {
1667 			maxext->len = extlen;
1668 			maxext->rbm = *rbm;
1669 		}
1670 fail:
1671 		nblock = block + extlen;
1672 	}
1673 	ret = gfs2_rbm_from_block(rbm, nblock);
1674 	if (ret < 0)
1675 		return ret;
1676 	return 1;
1677 }
1678 
1679 /**
1680  * gfs2_rbm_find - Look for blocks of a particular state
1681  * @rbm: Value/result starting position and final position
1682  * @state: The state which we want to find
1683  * @minext: Pointer to the requested extent length (NULL for a single block)
1684  *          This is updated to be the actual reservation size.
1685  * @ip: If set, check for reservations
1686  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1687  *          around until we've reached the starting point.
1688  *
1689  * Side effects:
1690  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1691  *   has no free blocks in it.
1692  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1693  *   has come up short on a free block search.
1694  *
1695  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1696  */
1697 
gfs2_rbm_find(struct gfs2_rbm * rbm,u8 state,u32 * minext,const struct gfs2_inode * ip,bool nowrap)1698 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1699 			 const struct gfs2_inode *ip, bool nowrap)
1700 {
1701 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1702 	struct buffer_head *bh;
1703 	int last_bii;
1704 	u32 offset;
1705 	u8 *buffer;
1706 	bool wrapped = false;
1707 	int ret;
1708 	struct gfs2_bitmap *bi;
1709 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1710 
1711 	/*
1712 	 * Determine the last bitmap to search.  If we're not starting at the
1713 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1714 	 * the entire resource group.
1715 	 */
1716 	last_bii = rbm->bii - (rbm->offset == 0);
1717 
1718 	while(1) {
1719 		bi = rbm_bi(rbm);
1720 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1721 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1722 		    (state == GFS2_BLKST_FREE))
1723 			goto next_bitmap;
1724 
1725 		bh = bi->bi_bh;
1726 		buffer = bh->b_data + bi->bi_offset;
1727 		WARN_ON(!buffer_uptodate(bh));
1728 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1729 			buffer = bi->bi_clone + bi->bi_offset;
1730 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1731 		if (offset == BFITNOENT) {
1732 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1733 				set_bit(GBF_FULL, &bi->bi_flags);
1734 			goto next_bitmap;
1735 		}
1736 		rbm->offset = offset;
1737 		if (ip == NULL)
1738 			return 0;
1739 
1740 		ret = gfs2_reservation_check_and_update(rbm, ip,
1741 							minext ? *minext : 0,
1742 							&maxext);
1743 		if (ret == 0)
1744 			return 0;
1745 		if (ret > 0)
1746 			goto next_iter;
1747 		if (ret == -E2BIG) {
1748 			rbm->bii = 0;
1749 			rbm->offset = 0;
1750 			goto res_covered_end_of_rgrp;
1751 		}
1752 		return ret;
1753 
1754 next_bitmap:	/* Find next bitmap in the rgrp */
1755 		rbm->offset = 0;
1756 		rbm->bii++;
1757 		if (rbm->bii == rbm->rgd->rd_length)
1758 			rbm->bii = 0;
1759 res_covered_end_of_rgrp:
1760 		if (rbm->bii == 0) {
1761 			if (wrapped)
1762 				break;
1763 			wrapped = true;
1764 			if (nowrap)
1765 				break;
1766 		}
1767 next_iter:
1768 		/* Have we scanned the entire resource group? */
1769 		if (wrapped && rbm->bii > last_bii)
1770 			break;
1771 	}
1772 
1773 	if (minext == NULL || state != GFS2_BLKST_FREE)
1774 		return -ENOSPC;
1775 
1776 	/* If the extent was too small, and it's smaller than the smallest
1777 	   to have failed before, remember for future reference that it's
1778 	   useless to search this rgrp again for this amount or more. */
1779 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1780 	    *minext < rbm->rgd->rd_extfail_pt)
1781 		rbm->rgd->rd_extfail_pt = *minext;
1782 
1783 	/* If the maximum extent we found is big enough to fulfill the
1784 	   minimum requirements, use it anyway. */
1785 	if (maxext.len) {
1786 		*rbm = maxext.rbm;
1787 		*minext = maxext.len;
1788 		return 0;
1789 	}
1790 
1791 	return -ENOSPC;
1792 }
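/*
 * Example (editorial sketch, not part of the original file; rgd, ip and
 * first_block are assumed to come from the caller): searching a resource
 * group for a free extent of at least 16 blocks, with the rgrp glock and
 * bitmap buffers held:
 *
 *	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
 *	u32 minext = 16;
 *	int error;
 *
 *	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, ip, true);
 *	if (error == 0)
 *		first_block = gfs2_rbm_to_block(&rbm);
 *
 * On success, minext may have been reduced to the largest extent
 * actually found if that was shorter than requested.
 */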
1793 
1794 /**
1795  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1796  * @rgd: The rgrp
1797  * @last_unlinked: block address of the last dinode we unlinked
1798  * @skip: block address we should explicitly not unlink
1799  *
1800  * Returns: nothing. For each suitable unlinked inode found, delete
1801  *          work is queued on its iopen glock (up to a sensible limit).
1802  */
1803 
1804 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1805 {
1806 	u64 block;
1807 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1808 	struct gfs2_glock *gl;
1809 	struct gfs2_inode *ip;
1810 	int error;
1811 	int found = 0;
1812 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1813 
1814 	while (1) {
1815 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1816 				      true);
1817 		if (error == -ENOSPC)
1818 			break;
1819 		if (WARN_ON_ONCE(error))
1820 			break;
1821 
1822 		block = gfs2_rbm_to_block(&rbm);
1823 		if (gfs2_rbm_from_block(&rbm, block + 1))
1824 			break;
1825 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1826 			continue;
1827 		if (block == skip)
1828 			continue;
1829 		*last_unlinked = block;
1830 
1831 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1832 		if (error)
1833 			continue;
1834 
1835 		/* If the inode is already in cache, we can ignore it here
1836 		 * because the existing inode disposal code will deal with
1837 		 * it when all refs have gone away. Accessing gl_object like
1838 		 * this is not safe in general. Here it is ok because we do
1839 		 * not dereference the pointer, and we only need an approx
1840 		 * answer to whether it is NULL or not.
1841 		 */
1842 		ip = gl->gl_object;
1843 
1844 		if (ip || !gfs2_queue_delete_work(gl, 0))
1845 			gfs2_glock_put(gl);
1846 		else
1847 			found++;
1848 
1849 		/* Limit reclaim to a sensible number of tasks */
1850 		if (found > NR_CPUS)
1851 			return;
1852 	}
1853 
1854 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1855 	return;
1856 }
1857 
1858 /**
1859  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1860  * @rgd: The rgrp in question
1861  * @loops: An indication of how picky we can be (0=very, 1=less so)
1862  *
1863  * This function uses the recently added glock statistics to figure out
1864  * whether a particular resource group is suffering from contention
1865  * from multiple nodes. This is done purely on the basis of timings,
1866  * since that is the only data we have to work with. Our aim here is
1867  * to reject a resource group which is highly contended but, just as
1868  * importantly, not to do this too often, in order to ensure that we
1869  * do not end up introducing fragmentation by changing resource
1870  * groups when not actually required.
1871  *
1872  * The calculation is fairly simple: we want to know whether the SRTTB
1873  * (i.e. the smoothed round trip time for blocking operations) to
1874  * acquire the lock for this rgrp's glock is significantly greater
1875  * than the average time taken for resource groups. We introduce a
1876  * margin in the form of the variable @var, computed as the sum of
1877  * the two respective variances and multiplied by a factor depending
1878  * on @loops and on whether we have a lot of data to base the decision
1879  * on. This is then tested against the squared difference of the means
1880  * in order to decide whether the result is statistically significant.
1881  *
1882  * Returns: A boolean verdict on the congestion status
1883  */
1884 
1885 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1886 {
1887 	const struct gfs2_glock *gl = rgd->rd_gl;
1888 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1889 	struct gfs2_lkstats *st;
1890 	u64 r_dcount, l_dcount;
1891 	u64 l_srttb, a_srttb = 0;
1892 	s64 srttb_diff;
1893 	u64 sqr_diff;
1894 	u64 var;
1895 	int cpu, nonzero = 0;
1896 
1897 	preempt_disable();
1898 	for_each_present_cpu(cpu) {
1899 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1900 		if (st->stats[GFS2_LKS_SRTTB]) {
1901 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1902 			nonzero++;
1903 		}
1904 	}
1905 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1906 	if (nonzero)
1907 		do_div(a_srttb, nonzero);
1908 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1909 	var = st->stats[GFS2_LKS_SRTTVARB] +
1910 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1911 	preempt_enable();
1912 
1913 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1914 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1915 
1916 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1917 		return false;
1918 
1919 	srttb_diff = a_srttb - l_srttb;
1920 	sqr_diff = srttb_diff * srttb_diff;
1921 
1922 	var *= 2;
1923 	if (l_dcount < 8 || r_dcount < 8)
1924 		var *= 2;
1925 	if (loops == 1)
1926 		var *= 2;
1927 
1928 	return ((srttb_diff < 0) && (sqr_diff > var));
1929 }
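/*
 * Editorial note (a sketch of the test above, in symbols): the rgrp is
 * reported congested only when the local SRTTB exceeds the all-CPU
 * average (srttb_diff < 0) and
 *
 *	(a_srttb - l_srttb)^2 > 2^k * (cpu-wide SRTTVARB + this glock's SRTTVARB)
 *
 * where k starts at 1 and is incremented once when either demote count
 * is below 8 and once more when @loops == 1; the margin widens, and the
 * congestion verdict becomes harder to reach, when the data is sparse
 * or when we are prepared to be less picky.
 */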
1930 
1931 /**
1932  * gfs2_rgrp_used_recently - check if an rgrp's glock was used recently
1933  * @rs: The block reservation with the rgrp to test
1934  * @msecs: The time limit in milliseconds
1935  *
1936  * Returns: True if the rgrp glock has been used within the time limit
1937  */
1938 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1939 				    u64 msecs)
1940 {
1941 	u64 tdiff;
1942 
1943 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1944 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1945 
1946 	return tdiff > (msecs * 1000 * 1000);
1947 }
1948 
1949 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1950 {
1951 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1952 	u32 skip;
1953 
1954 	get_random_bytes(&skip, sizeof(skip));
1955 	return skip % sdp->sd_rgrps;
1956 }
1957 
1958 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1959 {
1960 	struct gfs2_rgrpd *rgd = *pos;
1961 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1962 
1963 	rgd = gfs2_rgrpd_get_next(rgd);
1964 	if (rgd == NULL)
1965 		rgd = gfs2_rgrpd_get_first(sdp);
1966 	*pos = rgd;
1967 	if (rgd != begin) /* If we didn't wrap */
1968 		return true;
1969 	return false;
1970 }
1971 
1972 /**
1973  * fast_to_acquire - determine if a resource group will be fast to acquire
1974  * @rgd: The rgrp in question
1975  * If this is one of our preferred rgrps, it should be quicker to acquire,
1976  * because we tried to set ourselves up as dlm lock master.
1977  */
1978 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1979 {
1980 	struct gfs2_glock *gl = rgd->rd_gl;
1981 
1982 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1983 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1984 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1985 		return 1;
1986 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1987 		return 1;
1988 	return 0;
1989 }
1990 
1991 /**
1992  * gfs2_inplace_reserve - Reserve space in the filesystem
1993  * @ip: the inode to reserve space for
1994  * @ap: the allocation parameters
1995  *
1996  * We try our best to find an rgrp that has at least ap->target blocks
1997  * available. After a couple of passes (loops == 2), the prospects of finding
1998  * such an rgrp diminish. At this stage, we return the first rgrp that has
1999  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2000  * the number of blocks available in the chosen rgrp.
2001  *
2002  * Returns: 0 on success,
2003  *          -ENOSPC if a suitable rgrp can't be found
2004  *          errno otherwise
2005  */
2006 
2007 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2008 {
2009 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2010 	struct gfs2_rgrpd *begin = NULL;
2011 	struct gfs2_blkreserv *rs = &ip->i_res;
2012 	int error = 0, rg_locked, flags = 0;
2013 	u64 last_unlinked = NO_BLOCK;
2014 	int loops = 0;
2015 	u32 free_blocks, skip = 0;
2016 
2017 	if (sdp->sd_args.ar_rgrplvb)
2018 		flags |= GL_SKIP;
2019 	if (gfs2_assert_warn(sdp, ap->target))
2020 		return -EINVAL;
2021 	if (gfs2_rs_active(rs)) {
2022 		begin = rs->rs_rbm.rgd;
2023 	} else if (rs->rs_rbm.rgd &&
2024 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2025 		begin = rs->rs_rbm.rgd;
2026 	} else {
2027 		check_and_update_goal(ip);
2028 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2029 	}
2030 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2031 		skip = gfs2_orlov_skip(ip);
2032 	if (rs->rs_rbm.rgd == NULL)
2033 		return -EBADSLT;
2034 
2035 	while (loops < 3) {
2036 		rg_locked = 1;
2037 
2038 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2039 			rg_locked = 0;
2040 			if (skip && skip--)
2041 				goto next_rgrp;
2042 			if (!gfs2_rs_active(rs)) {
2043 				if (loops == 0 &&
2044 				    !fast_to_acquire(rs->rs_rbm.rgd))
2045 					goto next_rgrp;
2046 				if ((loops < 2) &&
2047 				    gfs2_rgrp_used_recently(rs, 1000) &&
2048 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2049 					goto next_rgrp;
2050 			}
2051 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2052 						   LM_ST_EXCLUSIVE, flags,
2053 						   &ip->i_rgd_gh);
2054 			if (unlikely(error))
2055 				return error;
2056 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2057 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2058 				goto skip_rgrp;
2059 			if (sdp->sd_args.ar_rgrplvb) {
2060 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2061 				if (unlikely(error)) {
2062 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2063 					return error;
2064 				}
2065 			}
2066 		}
2067 
2068 		/* Skip unusable resource groups */
2069 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2070 						 GFS2_RDF_ERROR)) ||
2071 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2072 			goto skip_rgrp;
2073 
2074 		if (sdp->sd_args.ar_rgrplvb)
2075 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2076 
2077 		/* Get a reservation if we don't already have one */
2078 		if (!gfs2_rs_active(rs))
2079 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2080 
2081 		/* Skip rgrps when we can't get a reservation on first pass */
2082 		if (!gfs2_rs_active(rs) && (loops < 1))
2083 			goto check_rgrp;
2084 
2085 		/* If rgrp has enough free space, use it */
2086 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2087 		if (free_blocks >= ap->target ||
2088 		    (loops == 2 && ap->min_target &&
2089 		     free_blocks >= ap->min_target)) {
2090 			ap->allowed = free_blocks;
2091 			return 0;
2092 		}
2093 check_rgrp:
2094 		/* Check for unlinked inodes which can be reclaimed */
2095 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2096 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2097 					ip->i_no_addr);
2098 skip_rgrp:
2099 		/* Drop reservation, if we couldn't use reserved rgrp */
2100 		if (gfs2_rs_active(rs))
2101 			gfs2_rs_deltree(rs);
2102 
2103 		/* Unlock rgrp if required */
2104 		if (!rg_locked)
2105 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2106 next_rgrp:
2107 		/* Find the next rgrp, and continue looking */
2108 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2109 			continue;
2110 		if (skip)
2111 			continue;
2112 
2113 		/* If we've scanned all the rgrps but found no free blocks,
2114 		 * check for some less likely conditions before trying
2115 		 * again.
2116 		 */
2117 		loops++;
2118 		/* Check that fs hasn't grown if writing to rindex */
2119 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2120 			error = gfs2_ri_update(ip);
2121 			if (error)
2122 				return error;
2123 		}
2124 		/* Flushing the log may release space */
2125 		if (loops == 2)
2126 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2127 				       GFS2_LFC_INPLACE_RESERVE);
2128 	}
2129 
2130 	return -ENOSPC;
2131 }
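/*
 * Example (editorial sketch; blocks_wanted and the surrounding journal,
 * quota and transaction setup are assumed): the usual calling pattern is
 * reserve, allocate, release:
 *
 *	struct gfs2_alloc_parms ap = { .target = blocks_wanted, };
 *	int error;
 *
 *	error = gfs2_inplace_reserve(ip, &ap);
 *	if (error == 0) {
 *		... allocate up to ap.allowed blocks, e.g. via
 *		    gfs2_alloc_blocks() ...
 *		gfs2_inplace_release(ip);
 *	}
 */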
2132 
2133 /**
2134  * gfs2_inplace_release - release an inplace reservation
2135  * @ip: the inode the reservation was taken out on
2136  *
2137  * Release a reservation made by gfs2_inplace_reserve().
2138  */
2139 
2140 void gfs2_inplace_release(struct gfs2_inode *ip)
2141 {
2142 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2143 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2144 }
2145 
2146 /**
2147  * gfs2_alloc_extent - allocate an extent from a given bitmap
2148  * @rbm: The position of the first block of the extent to allocate
2149  * @dinode: true if the first block we allocate is for a dinode
2150  * @n: The extent length (value/result)
2151  *
2152  * Add the bitmap buffer to the transaction.
2153  * Set the found bits to their new allocation state (dinode or used).
2154  */
2155 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2156 			     unsigned int *n)
2157 {
2158 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2159 	const unsigned int elen = *n;
2160 	u64 block;
2161 	int ret;
2162 
2163 	*n = 1;
2164 	block = gfs2_rbm_to_block(rbm);
2165 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2166 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2167 	block++;
2168 	while (*n < elen) {
2169 		ret = gfs2_rbm_from_block(&pos, block);
2170 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2171 			break;
2172 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2173 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2174 		(*n)++;
2175 		block++;
2176 	}
2177 }
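/*
 * Editorial note: the loop above grows the allocation one block at a
 * time and stops at the first block that is not free or that cannot be
 * mapped within the rgrp, so on return *n is the extent length actually
 * allocated: at least 1 and at most the requested elen.
 */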
2178 
2179 /**
2180  * rgblk_free - Change alloc state of given block(s)
2181  * @sdp: the filesystem
2182  * @rgd: the resource group the blocks are in
2183  * @bstart: the start of a run of blocks to free
2184  * @blen: the length of the block run (all must lie within ONE RG!)
2185  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2186  */
2187 
2188 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2189 		       u64 bstart, u32 blen, unsigned char new_state)
2190 {
2191 	struct gfs2_rbm rbm;
2192 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2193 
2194 	rbm.rgd = rgd;
2195 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2196 		return;
2197 	while (blen--) {
2198 		bi = rbm_bi(&rbm);
2199 		if (bi != bi_prev) {
2200 			if (!bi->bi_clone) {
2201 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2202 						      GFP_NOFS | __GFP_NOFAIL);
2203 				memcpy(bi->bi_clone + bi->bi_offset,
2204 				       bi->bi_bh->b_data + bi->bi_offset,
2205 				       bi->bi_bytes);
2206 			}
2207 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2208 			bi_prev = bi;
2209 		}
2210 		gfs2_setbit(&rbm, false, new_state);
2211 		gfs2_rbm_incr(&rbm);
2212 	}
2213 }
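/*
 * Editorial note (hedged): bi_clone is a private copy of the bitmap in
 * which freed blocks remain marked in use. Free-block searches read the
 * clone when it exists (see gfs2_rbm_find() above), so blocks freed in
 * the current transaction are not handed out again before the free has
 * made it to the log.
 */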
2214 
2215 /**
2216  * gfs2_rgrp_dump - print out an rgrp
2217  * @seq: The iterator
2218  * @rgd: The rgrp in question
2219  * @fs_id_buf: pointer to file system id (if requested)
2220  *
2221  */
2222 
2223 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2224 		    const char *fs_id_buf)
2225 {
2226 	struct gfs2_blkreserv *trs;
2227 	const struct rb_node *n;
2228 
2229 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2230 		       fs_id_buf,
2231 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2232 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2233 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2234 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2235 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2236 
2237 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2238 			       be32_to_cpu(rgl->rl_flags),
2239 			       be32_to_cpu(rgl->rl_free),
2240 			       be32_to_cpu(rgl->rl_dinodes));
2241 	}
2242 	spin_lock(&rgd->rd_rsspin);
2243 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2244 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2245 		dump_rs(seq, trs, fs_id_buf);
2246 	}
2247 	spin_unlock(&rgd->rd_rsspin);
2248 }
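/*
 * Editorial example of the line produced above (all values hypothetical;
 * the prefix comes from @fs_id_buf):
 *
 *	fsid=myfs.0: R: n:17 f:00 b:65490/65490 i:10 r:0 e:0
 */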
2249 
2250 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2251 {
2252 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2253 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2254 
2255 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2256 		(unsigned long long)rgd->rd_addr);
2257 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2258 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2259 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2260 	rgd->rd_flags |= GFS2_RDF_ERROR;
2261 }
2262 
2263 /**
2264  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2265  * @ip: The inode we have just allocated blocks for
2266  * @rbm: The start of the allocated blocks
2267  * @len: The extent length
2268  *
2269  * Adjusts a reservation after an allocation has taken place. If the
2270  * reservation does not match the allocation, or if it is now empty
2271  * then it is removed.
2272  */
2273 
2274 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2275 				    const struct gfs2_rbm *rbm, unsigned len)
2276 {
2277 	struct gfs2_blkreserv *rs = &ip->i_res;
2278 	struct gfs2_rgrpd *rgd = rbm->rgd;
2279 	unsigned rlen;
2280 	u64 block;
2281 	int ret;
2282 
2283 	spin_lock(&rgd->rd_rsspin);
2284 	if (gfs2_rs_active(rs)) {
2285 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2286 			block = gfs2_rbm_to_block(rbm);
2287 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2288 			rlen = min(rs->rs_free, len);
2289 			rs->rs_free -= rlen;
2290 			rgd->rd_reserved -= rlen;
2291 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2292 			if (rs->rs_free && !ret)
2293 				goto out;
2294 			/* We used up our block reservation, so we should
2295 			   reserve more blocks next time. */
2296 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2297 		}
2298 		__rs_deltree(rs);
2299 	}
2300 out:
2301 	spin_unlock(&rgd->rd_rsspin);
2302 }
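/*
 * Editorial example: if the reservation starts at block B with 20 free
 * blocks and an allocation claims 8 blocks starting at B, the
 * reservation advances to B + 8 with 12 blocks left. If the allocation
 * did not start at the reservation, or exhausted it, the reservation is
 * removed from the tree instead.
 */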
2303 
2304 /**
2305  * gfs2_set_alloc_start - Set starting point for block allocation
2306  * @rbm: The rbm which will be set to the required location
2307  * @ip: The gfs2 inode
2308  * @dinode: Flag to say if allocation includes a new inode
2309  *
2310  * This sets the starting point from the reservation if one is active
2311  * otherwise it falls back to guessing a start point based on the
2312  * inode's goal block or the last allocation point in the rgrp.
2313  */
2314 
2315 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2316 				 const struct gfs2_inode *ip, bool dinode)
2317 {
2318 	u64 goal;
2319 
2320 	if (gfs2_rs_active(&ip->i_res)) {
2321 		*rbm = ip->i_res.rs_rbm;
2322 		return;
2323 	}
2324 
2325 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2326 		goal = ip->i_goal;
2327 	else
2328 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2329 
2330 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2331 		rbm->bii = 0;
2332 		rbm->offset = 0;
2333 	}
2334 }
2335 
2336 /**
2337  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2338  * @ip: the inode to allocate the block for
2339  * @bn: Used to return the starting block number
2340  * @nblocks: requested number of blocks/extent length (value/result)
2341  * @dinode: 1 if we're allocating a dinode block, else 0
2342  * @generation: the generation number of the inode
2343  *
2344  * Returns: 0 or error
2345  */
2346 
2347 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2348 		      bool dinode, u64 *generation)
2349 {
2350 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2351 	struct buffer_head *dibh;
2352 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2353 	unsigned int ndata;
2354 	u64 block; /* block, within the file system scope */
2355 	int error;
2356 
2357 	gfs2_set_alloc_start(&rbm, ip, dinode);
2358 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2359 
2360 	if (error == -ENOSPC) {
2361 		gfs2_set_alloc_start(&rbm, ip, dinode);
2362 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2363 	}
2364 
2365 	/* Since all blocks are reserved in advance, this shouldn't happen */
2366 	if (error) {
2367 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2368 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2369 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2370 			rbm.rgd->rd_extfail_pt);
2371 		goto rgrp_error;
2372 	}
2373 
2374 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2375 	block = gfs2_rbm_to_block(&rbm);
2376 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2377 	if (gfs2_rs_active(&ip->i_res))
2378 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2379 	ndata = *nblocks;
2380 	if (dinode)
2381 		ndata--;
2382 
2383 	if (!dinode) {
2384 		ip->i_goal = block + ndata - 1;
2385 		error = gfs2_meta_inode_buffer(ip, &dibh);
2386 		if (error == 0) {
2387 			struct gfs2_dinode *di =
2388 				(struct gfs2_dinode *)dibh->b_data;
2389 			gfs2_trans_add_meta(ip->i_gl, dibh);
2390 			di->di_goal_meta = di->di_goal_data =
2391 				cpu_to_be64(ip->i_goal);
2392 			brelse(dibh);
2393 		}
2394 	}
2395 	if (rbm.rgd->rd_free < *nblocks) {
2396 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2397 		goto rgrp_error;
2398 	}
2399 
2400 	rbm.rgd->rd_free -= *nblocks;
2401 	if (dinode) {
2402 		rbm.rgd->rd_dinodes++;
2403 		*generation = rbm.rgd->rd_igeneration++;
2404 		if (*generation == 0)
2405 			*generation = rbm.rgd->rd_igeneration++;
2406 	}
2407 
2408 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2409 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2410 
2411 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2412 	if (dinode)
2413 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2414 
2415 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2416 
2417 	rbm.rgd->rd_free_clone -= *nblocks;
2418 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2419 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2420 	*bn = block;
2421 	return 0;
2422 
2423 rgrp_error:
2424 	gfs2_rgrp_error(rbm.rgd);
2425 	return -EIO;
2426 }
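/*
 * Example (editorial, hypothetical caller; assumes an inplace reservation
 * and an open transaction): allocating a single data block:
 *
 *	u64 bn;
 *	unsigned int n = 1;
 *	int error;
 *
 *	error = gfs2_alloc_blocks(ip, &bn, &n, false, NULL);
 *
 * On success, bn holds the first allocated block and n the extent length.
 * Passing a NULL generation is fine for non-dinode allocations, since
 * *generation is only written when @dinode is set.
 */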
2427 
2428 /**
2429  * __gfs2_free_blocks - free a contiguous run of block(s)
2430  * @ip: the inode these blocks are being freed from
2431  * @rgd: the resource group the blocks are in
2432  * @bstart: first block of a run of contiguous blocks
2433  * @blen: the length of the block run
2434  * @meta: 1 if the blocks represent metadata
2435  *
2436  */
2437 
2438 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2439 			u64 bstart, u32 blen, int meta)
2440 {
2441 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2442 
2443 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2444 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2445 	rgd->rd_free += blen;
2446 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2447 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2448 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2449 
2450 	/* Directories keep their data in the metadata address space */
2451 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2452 		gfs2_journal_wipe(ip, bstart, blen);
2453 }
2454 
2455 /**
2456  * gfs2_free_meta - free a contiguous run of metadata block(s)
2457  * @ip: the inode these blocks are being freed from
2458  * @rgd: the resource group the blocks are in
2459  * @bstart: first block of a run of contiguous blocks
2460  * @blen: the length of the block run
2461  *
2462  */
2463 
2464 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2465 		    u64 bstart, u32 blen)
2466 {
2467 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2468 
2469 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2470 	gfs2_statfs_change(sdp, 0, +blen, 0);
2471 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2472 }
2473 
2474 void gfs2_unlink_di(struct inode *inode)
2475 {
2476 	struct gfs2_inode *ip = GFS2_I(inode);
2477 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2478 	struct gfs2_rgrpd *rgd;
2479 	u64 blkno = ip->i_no_addr;
2480 
2481 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2482 	if (!rgd)
2483 		return;
2484 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2485 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2486 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2487 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2488 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2489 }
2490 
2491 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2492 {
2493 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2494 
2495 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2496 	if (!rgd->rd_dinodes)
2497 		gfs2_consist_rgrpd(rgd);
2498 	rgd->rd_dinodes--;
2499 	rgd->rd_free++;
2500 
2501 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2502 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2503 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2504 
2505 	gfs2_statfs_change(sdp, 0, +1, -1);
2506 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2507 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2508 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2509 }
2510 
2511 /**
2512  * gfs2_check_blk_type - Check the type of a block
2513  * @sdp: The superblock
2514  * @no_addr: The block number to check
2515  * @type: The block type we are looking for
2516  *
2517  * Returns: 0 if the block type matches the expected type
2518  *          -ESTALE if it doesn't match
2519  *          or -ve errno if something went wrong while checking
2520  */
2521 
2522 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2523 {
2524 	struct gfs2_rgrpd *rgd;
2525 	struct gfs2_holder rgd_gh;
2526 	struct gfs2_rbm rbm;
2527 	int error = -EINVAL;
2528 
2529 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2530 	if (!rgd)
2531 		goto fail;
2532 
2533 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2534 	if (error)
2535 		goto fail;
2536 
2537 	rbm.rgd = rgd;
2538 	error = gfs2_rbm_from_block(&rbm, no_addr);
2539 	if (!WARN_ON_ONCE(error)) {
2540 		if (gfs2_testbit(&rbm, false) != type)
2541 			error = -ESTALE;
2542 	}
2543 
2544 	gfs2_glock_dq_uninit(&rgd_gh);
2545 
2546 fail:
2547 	return error;
2548 }
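/*
 * Example (editorial): a caller holding a candidate block address might
 * confirm that it is still an unlinked dinode before acting on it:
 *
 *	error = gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_UNLINKED);
 *	if (error == -ESTALE)
 *		... the block has changed state; skip it ...
 */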
2549 
2550 /**
2551  * gfs2_rlist_add - add a RG to a list of RGs
2552  * @ip: the inode
2553  * @rlist: the list of resource groups
2554  * @block: the block
2555  *
2556  * Figure out what RG a block belongs to and add that RG to the list
2557  *
2558  * FIXME: Don't use NOFAIL
2559  *
2560  */
2561 
2562 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2563 		    u64 block)
2564 {
2565 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2566 	struct gfs2_rgrpd *rgd;
2567 	struct gfs2_rgrpd **tmp;
2568 	unsigned int new_space;
2569 	unsigned int x;
2570 
2571 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2572 		return;
2573 
2574 	/*
2575 	 * The resource group last accessed is kept in the last position.
2576 	 */
2577 
2578 	if (rlist->rl_rgrps) {
2579 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2580 		if (rgrp_contains_block(rgd, block))
2581 			return;
2582 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2583 	} else {
2584 		rgd = ip->i_res.rs_rbm.rgd;
2585 		if (!rgd || !rgrp_contains_block(rgd, block))
2586 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2587 	}
2588 
2589 	if (!rgd) {
2590 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2591 		       (unsigned long long)block);
2592 		return;
2593 	}
2594 
2595 	for (x = 0; x < rlist->rl_rgrps; x++) {
2596 		if (rlist->rl_rgd[x] == rgd) {
2597 			swap(rlist->rl_rgd[x],
2598 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2599 			return;
2600 		}
2601 	}
2602 
2603 	if (rlist->rl_rgrps == rlist->rl_space) {
2604 		new_space = rlist->rl_space + 10;
2605 
2606 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2607 			      GFP_NOFS | __GFP_NOFAIL);
2608 
2609 		if (rlist->rl_rgd) {
2610 			memcpy(tmp, rlist->rl_rgd,
2611 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2612 			kfree(rlist->rl_rgd);
2613 		}
2614 
2615 		rlist->rl_space = new_space;
2616 		rlist->rl_rgd = tmp;
2617 	}
2618 
2619 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2620 }
2621 
2622 /**
2623  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2624  *      and initialize an array of glock holders for them
2625  * @rlist: the list of resource groups
2626  *
2627  * FIXME: Don't use NOFAIL
2628  *
2629  */
2630 
2631 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2632 {
2633 	unsigned int x;
2634 
2635 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2636 				      sizeof(struct gfs2_holder),
2637 				      GFP_NOFS | __GFP_NOFAIL);
2638 	for (x = 0; x < rlist->rl_rgrps; x++)
2639 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2640 				LM_ST_EXCLUSIVE, 0,
2641 				&rlist->rl_ghs[x]);
2642 }
2643 
2644 /**
2645  * gfs2_rlist_free - free a resource group list
2646  * @rlist: the list of resource groups
2647  *
2648  */
2649 
2650 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2651 {
2652 	unsigned int x;
2653 
2654 	kfree(rlist->rl_rgd);
2655 
2656 	if (rlist->rl_ghs) {
2657 		for (x = 0; x < rlist->rl_rgrps; x++)
2658 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2659 		kfree(rlist->rl_ghs);
2660 		rlist->rl_ghs = NULL;
2661 	}
2662 }
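/*
 * Editorial sketch of the rlist life cycle (hypothetical caller, error
 * handling omitted): gather the rgrps, allocate and acquire the holders,
 * do the work, then release everything:
 *
 *	struct gfs2_rgrp_list rlist = {};
 *	unsigned int x;
 *
 *	gfs2_rlist_add(ip, &rlist, block);	(repeated, once per block)
 *	gfs2_rlist_alloc(&rlist);
 *	for (x = 0; x < rlist.rl_rgrps; x++)
 *		gfs2_glock_nq(&rlist.rl_ghs[x]);
 *	... do the work ...
 *	for (x = 0; x < rlist.rl_rgrps; x++)
 *		gfs2_glock_dq(&rlist.rl_ghs[x]);
 *	gfs2_rlist_free(&rlist);
 */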
2663 
2664