// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/random.h>

#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
#include "util.h"
#include "log.h"
#include "inode.h"
#include "trace_gfs2.h"
#include "dir.h"

#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)

/*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
 *
 * 0 = Free
 * 1 = Used (not metadata)
 * 2 = Unlinked (still in use) inode
 * 3 = Used (metadata)
 */
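
/*
 * Illustrative example (added; not from the original source): bit-pairs
 * are stored little-endian within each byte, so the byte 0xD8
 * (binary 11 01 10 00) describes four blocks:
 *
 *	bits 0-1 = 00 -> block 0 is free
 *	bits 2-3 = 10 -> block 1 is an unlinked inode
 *	bits 4-5 = 01 -> block 2 is used (data)
 *	bits 6-7 = 11 -> block 3 is used (metadata)
 *
 * gfs2_setbit() and gfs2_testbit() below locate block n within a byte
 * by shifting by (n % GFS2_NBBY) * GFS2_BIT_SIZE and masking with
 * GFS2_BIT_MASK.
 */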

struct gfs2_extent {
	struct gfs2_rbm rbm;
	u32 len;
};

static const char valid_change[16] = {
	        /* current */
	/* n */ 0, 1, 1, 1,
	/* e */ 1, 0, 0, 0,
	/* w */ 0, 0, 0, 1,
	        1, 0, 0, 0
};
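
/*
 * Descriptive note (added): gfs2_setbit() indexes this table as
 * valid_change[new_state * 4 + cur_state], so each row above is a new
 * state and each column the current state.  Row 0 says a block may
 * become free from any state except free itself; row 2 says only a
 * used metadata block (a dinode, state 3) may become an unlinked
 * inode (state 2).
 */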

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap);


/**
 * gfs2_setbit - Set a bit in the bitmaps
 * @rbm: The position of the bit to set
 * @do_clone: Also set the clone bitmap, if it exists
 * @new_state: the new state of the block
 *
 */

static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
			       unsigned char new_state)
{
	unsigned char *byte1, *byte2, *end, cur_state;
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	unsigned int buflen = bi->bi_bytes;
	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
	end = bi->bi_bh->b_data + bi->bi_offset + buflen;

	BUG_ON(byte1 >= end);

	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;

	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;

		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
			rbm->offset, cur_state, new_state);
		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
			(unsigned long long)bi->bi_bh->b_blocknr);
		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
			bi->bi_offset, bi->bi_bytes,
			(unsigned long long)gfs2_rbm_to_block(rbm));
		dump_stack();
		gfs2_consist_rgrpd(rbm->rgd);
		return;
	}
	*byte1 ^= (cur_state ^ new_state) << bit;

	if (do_clone && bi->bi_clone) {
		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
		*byte2 ^= (cur_state ^ new_state) << bit;
	}
}

/**
 * gfs2_testbit - test a bit in the bitmaps
 * @rbm: The bit to test
 * @use_clone: If true, test the clone bitmap, not the official bitmap.
 *
 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
 * not the "real" bitmaps, to avoid allocating recently freed blocks.
 *
 * Returns: The two bit block state of the requested bit
 */

static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
{
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	const u8 *buffer;
	const u8 *byte;
	unsigned int bit;

	if (use_clone && bi->bi_clone)
		buffer = bi->bi_clone;
	else
		buffer = bi->bi_bh->b_data;
	buffer += bi->bi_offset;
	byte = buffer + (rbm->offset / GFS2_NBBY);
	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}

/**
 * gfs2_bit_search - search a 64-bit word of bitmap data for a given state
 * @ptr: Pointer to bitmap data
 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
 * @state: The state we are searching for
 *
 * We xor the bitmap data with a pattern which is the bitwise opposite
 * of what we are looking for; this gives rise to a pattern of ones
 * wherever there is a match. Since we have two bits per entry, we
 * take this pattern, shift it down by one place and then AND it with
 * the original. All the even bit positions (0, 2, 4, etc.) then represent
 * successful matches, so we mask with 0x55555..... to remove the unwanted
 * odd bit positions.
 *
 * This allows searching of a whole u64 at once (32 blocks) with a
 * single test (on 64 bit arches).
 */

static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
{
	u64 tmp;
	static const u64 search[] = {
		[0] = 0xffffffffffffffffULL,
		[1] = 0xaaaaaaaaaaaaaaaaULL,
		[2] = 0x5555555555555555ULL,
		[3] = 0x0000000000000000ULL,
	};
	tmp = le64_to_cpu(*ptr) ^ search[state];
	tmp &= (tmp >> 1);
	tmp &= mask;
	return tmp;
}
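
/*
 * Worked example (added; not from the original source): take one byte
 * of bitmap data, 0xE4 (blocks 0..3 in states 0, 1, 2, 3), and search
 * for state 1:
 *
 *	tmp = 0xE4 ^ 0xAA;	tmp == 0x4E	(search[1] is 0xAA repeated)
 *	tmp &= tmp >> 1;	tmp == 0x06
 *	tmp &= 0x55;		tmp == 0x04	(keep only even bit positions)
 *
 * Bit 2 is set and 2 / 2 == 1, so block 1 of the byte is in state 1;
 * gfs2_bitfit() below turns the result of __ffs64() into a block
 * offset in exactly this way.
 */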

/**
 * rs_cmp - multi-block reservation range compare
 * @blk: absolute file system block number of the new reservation
 * @len: number of blocks in the new reservation
 * @rs: existing reservation to compare against
 *
 * returns: 1 if the block range is beyond the reach of the reservation
 *         -1 if the block range is before the start of the reservation
 *          0 if the block range overlaps with the reservation
 */
static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
{
	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);

	if (blk >= startblk + rs->rs_free)
		return 1;
	if (blk + len - 1 < startblk)
		return -1;
	return 0;
}

/**
 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
 *       a block in a given allocation state.
 * @buf: the buffer that holds the bitmaps
 * @len: the length (in bytes) of the buffer
 * @goal: start search at this block's bit-pair (within @buf)
 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
 *
 * Scope of @goal and returned block number is only within this bitmap buffer,
 * not entire rgrp or filesystem.  @buf will be offset from the actual
 * beginning of a bitmap block buffer, skipping any header structures, but
 * headers are always a multiple of 64 bits long so that the buffer is
 * always aligned to a 64 bit boundary.
 *
 * The size of the buffer is in bytes, but it is assumed that it is
 * always ok to read a complete multiple of 64 bits at the end
 * of the block in case the end is not aligned to a natural boundary.
 *
 * Returns: the block number (bitmap buffer scope) that was found, or
 * BFITNOENT if no block in the requested state was found
 */

static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
		       u32 goal, u8 state)
{
	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
	u64 tmp;
	u64 mask = 0x5555555555555555ULL;
	u32 bit;

	/* Mask off bits we don't care about at the start of the search */
	mask <<= spoint;
	tmp = gfs2_bit_search(ptr, mask, state);
	ptr++;
	while(tmp == 0 && ptr < end) {
		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
		ptr++;
	}
	/* Mask off any bits which are more than len bytes from the start */
	if (ptr == end && (len & (sizeof(u64) - 1)))
		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
	/* Didn't find anything, so return */
	if (tmp == 0)
		return BFITNOENT;
	ptr--;
	bit = __ffs64(tmp);
	bit /= 2;	/* two bits per entry in the bitmap */
	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
}
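
/*
 * Usage sketch (added; a hypothetical caller): to scan one bitmap of an
 * rgrp for the first free block,
 *
 *	struct gfs2_bitmap *bi = rbm_bi(rbm);
 *	const u8 *buf = bi->bi_bh->b_data + bi->bi_offset;
 *	u32 blk = gfs2_bitfit(buf, bi->bi_bytes, 0, GFS2_BLKST_FREE);
 *
 * A result other than BFITNOENT is relative to this bitmap only, which
 * is why gfs2_rbm_find() below stores it in rbm->offset rather than
 * treating it as a filesystem block number.
 */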

/**
 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
 * @rbm: The rbm with rgd already set correctly
 * @block: The block number (filesystem relative)
 *
 * This sets the bi and offset members of an rbm based on a
 * resource group and a filesystem relative block number. The
 * resource group must be set in the rbm on entry, the bi and
 * offset members will be set by this function.
 *
 * Returns: 0 on success, or an error code
 */

static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
	if (!rgrp_contains_block(rbm->rgd, block))
		return -E2BIG;
	rbm->bii = 0;
	rbm->offset = block - rbm->rgd->rd_data0;
	/* Check if the block is within the first block */
	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
		return 0;

	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
	rbm->offset += (sizeof(struct gfs2_rgrp) -
			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	return 0;
}
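
/*
 * Note on the adjustment above (added): the first bitmap lives in the
 * rgrp header block behind a struct gfs2_rgrp, while later bitmap
 * blocks only carry a struct gfs2_meta_header, so the first bitmap
 * covers fewer blocks.  Adding the size difference (in blocks) pretends
 * every bitmap has the small header, making the bii/offset split a
 * uniform division by sd_blocks_per_bitmap.  With hypothetical numbers:
 * if the header bitmap covers 15000 blocks and full bitmaps cover
 * 15500, then offset 20000 becomes 20500, giving bii = 1 and
 * offset = 5000 within the second bitmap.
 */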

/**
 * gfs2_rbm_incr - increment an rbm structure
 * @rbm: The rbm with rgd already set correctly
 *
 * This function takes an existing rbm structure and increments it to the next
 * viable block offset.
 *
 * Returns: If incrementing the offset would cause the rbm to go past the
 *          end of the rgrp, true is returned, otherwise false.
 *
 */

static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
{
	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
		rbm->offset++;
		return false;
	}
	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
		return true;

	rbm->offset = 0;
	rbm->bii++;
	return false;
}

/**
 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
 * @rbm: Position to search (value/result)
 * @n_unaligned: Number of unaligned blocks to check
 * @len: Decremented for each block found (terminate on zero)
 *
 * Returns: true if a non-free block is encountered
 */

static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
{
	u32 n;
	u8 res;

	for (n = 0; n < n_unaligned; n++) {
		res = gfs2_testbit(rbm, true);
		if (res != GFS2_BLKST_FREE)
			return true;
		(*len)--;
		if (*len == 0)
			return true;
		if (gfs2_rbm_incr(rbm))
			return true;
	}

	return false;
}

/**
 * gfs2_free_extlen - Return extent length of free blocks
 * @rrbm: Starting position
 * @len: Max length to check
 *
 * Starting at the block specified by the rbm, see how many free blocks
 * there are, not reading more than len blocks ahead. This can be done
 * using memchr_inv when the blocks are byte aligned, but has to be done
 * on a block by block basis in case of unaligned blocks. Also this
 * function can cope with bitmap boundaries (although it must stop on
 * a resource group boundary)
 *
 * Returns: Number of free blocks in the extent
 */

static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
{
	struct gfs2_rbm rbm = *rrbm;
	u32 n_unaligned = rbm.offset & 3;
	u32 size = len;
	u32 bytes;
	u32 chunk_size;
	u8 *ptr, *start, *end;
	u64 block;
	struct gfs2_bitmap *bi;

	if (n_unaligned &&
	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
		goto out;

	n_unaligned = len & 3;
	/* Start is now byte aligned */
	while (len > 3) {
		bi = rbm_bi(&rbm);
		start = bi->bi_bh->b_data;
		if (bi->bi_clone)
			start = bi->bi_clone;
		start += bi->bi_offset;
		end = start + bi->bi_bytes;
		BUG_ON(rbm.offset & 3);
		start += (rbm.offset / GFS2_NBBY);
		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
		ptr = memchr_inv(start, 0, bytes);
		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
		chunk_size *= GFS2_NBBY;
		BUG_ON(len < chunk_size);
		len -= chunk_size;
		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
			n_unaligned = 0;
			break;
		}
		if (ptr) {
			n_unaligned = 3;
			break;
		}
		n_unaligned = len & 3;
	}

	/* Deal with any bits left over at the end */
	if (n_unaligned)
		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
out:
	return size - len;
}

/**
 * gfs2_bitcount - count the number of bits in a certain state
 * @rgd: the resource group descriptor
 * @buffer: the buffer that holds the bitmaps
 * @buflen: the length (in bytes) of the buffer
 * @state: the state of the block we're looking for
 *
 * Returns: The number of bits
 */

static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
			 unsigned int buflen, u8 state)
{
	const u8 *byte = buffer;
	const u8 *end = buffer + buflen;
	const u8 state1 = state << 2;
	const u8 state2 = state << 4;
	const u8 state3 = state << 6;
	u32 count = 0;

	for (; byte < end; byte++) {
		if (((*byte) & 0x03) == state)
			count++;
		if (((*byte) & 0x0C) == state1)
			count++;
		if (((*byte) & 0x30) == state2)
			count++;
		if (((*byte) & 0xC0) == state3)
			count++;
	}

	return count;
}

/**
 * gfs2_rgrp_verify - Verify that a resource group is consistent
 * @rgd: the rgrp
 *
 */

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi = NULL;
	u32 length = rgd->rd_length;
	u32 count[4], tmp;
	int buf, x;

	memset(count, 0, 4 * sizeof(u32));

	/* Count # blocks in each of 4 possible allocation states */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		for (x = 0; x < 4; x++)
			count[x] += gfs2_bitcount(rgd,
						  bi->bi_bh->b_data +
						  bi->bi_offset,
						  bi->bi_bytes, x);
	}

	if (count[0] != rgd->rd_free) {
		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
			count[0], rgd->rd_free);
		gfs2_consist_rgrpd(rgd);
		return;
	}

	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
	if (count[1] != tmp) {
		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
			count[1], tmp);
		gfs2_consist_rgrpd(rgd);
		return;
	}

	if (count[2] + count[3] != rgd->rd_dinodes) {
		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
			count[2] + count[3], rgd->rd_dinodes);
		gfs2_consist_rgrpd(rgd);
		return;
	}
}

/**
 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
 * @sdp: The GFS2 superblock
 * @blk: The data block number
 * @exact: True if this needs to be an exact match
 *
 * The @exact argument should be set to true by most callers. The exception
 * is when we need to match blocks which are not represented by the rgrp
 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
 * there for alignment purposes. Another way of looking at it is that @exact
 * matches only valid data/metadata blocks, but with @exact false, it will
 * match any block within the extent of the rgrp.
 *
 * Returns: The resource group, or NULL if not found
 */

struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
{
	struct rb_node *n, *next;
	struct gfs2_rgrpd *cur;

	spin_lock(&sdp->sd_rindex_spin);
	n = sdp->sd_rindex_tree.rb_node;
	while (n) {
		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
		next = NULL;
		if (blk < cur->rd_addr)
			next = n->rb_left;
		else if (blk >= cur->rd_data0 + cur->rd_data)
			next = n->rb_right;
		if (next == NULL) {
			spin_unlock(&sdp->sd_rindex_spin);
			if (exact) {
				if (blk < cur->rd_addr)
					return NULL;
				if (blk >= cur->rd_data0 + cur->rd_data)
					return NULL;
			}
			return cur;
		}
		n = next;
	}
	spin_unlock(&sdp->sd_rindex_spin);

	return NULL;
}

/**
 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
 * @sdp: The GFS2 superblock
 *
 * Returns: The first rgrp in the filesystem
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
	const struct rb_node *n;
	struct gfs2_rgrpd *rgd;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_first(&sdp->sd_rindex_tree);
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}

/**
 * gfs2_rgrpd_get_next - get the next RG
 * @rgd: the resource group descriptor
 *
 * Returns: The next rgrp
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	const struct rb_node *n;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_next(&rgd->rd_node);
	if (n == NULL)
		n = rb_first(&sdp->sd_rindex_tree);

	if (unlikely(&rgd->rd_node == n)) {
		spin_unlock(&sdp->sd_rindex_spin);
		return NULL;
	}
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);
	return rgd;
}

void check_and_update_goal(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
		ip->i_goal = ip->i_no_addr;
}

void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
	int x;

	for (x = 0; x < rgd->rd_length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		kfree(bi->bi_clone);
		bi->bi_clone = NULL;
	}
}

static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
		    const char *fs_id_buf)
{
	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);

	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
		       (unsigned long long)ip->i_no_addr,
		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
		       rs->rs_rbm.offset, rs->rs_free);
}

/**
 * __rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
static void __rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	if (!gfs2_rs_active(rs))
		return;

	rgd = rs->rs_rbm.rgd;
	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
	rb_erase(&rs->rs_node, &rgd->rd_rstree);
	RB_CLEAR_NODE(&rs->rs_node);

	if (rs->rs_free) {
		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
				 rs->rs_free - 1;
		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
		struct gfs2_bitmap *start, *last;

		/* return reserved blocks to the rgrp */
		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
		/* The rgrp extent failure point is likely not to increase;
		   it will only do so if the freed blocks are somehow
		   contiguous with a span of free blocks that follows. Still,
		   it will force the number to be recalculated later. */
		rgd->rd_extfail_pt += rs->rs_free;
		rs->rs_free = 0;
		if (gfs2_rbm_from_block(&last_rbm, last_block))
			return;
		start = rbm_bi(&rs->rs_rbm);
		last = rbm_bi(&last_rbm);
		do
			clear_bit(GBF_FULL, &start->bi_flags);
		while (start++ != last);
	}
}

/**
 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	rgd = rs->rs_rbm.rgd;
	if (rgd) {
		spin_lock(&rgd->rd_rsspin);
		__rs_deltree(rs);
		BUG_ON(rs->rs_free);
		spin_unlock(&rgd->rd_rsspin);
	}
}

/**
 * gfs2_rs_delete - delete a multi-block reservation
 * @ip: The inode for this reservation
 *
 */
void gfs2_rs_delete(struct gfs2_inode *ip)
{
	struct inode *inode = &ip->i_inode;

	down_write(&ip->i_rw_mutex);
	if (atomic_read(&inode->i_writecount) <= 1)
		gfs2_rs_deltree(&ip->i_res);
	up_write(&ip->i_rw_mutex);
}

/**
 * return_all_reservations - return all reserved blocks back to the rgrp.
 * @rgd: the rgrp that needs its space back
 *
 * We previously reserved a bunch of blocks for allocation. Now we need to
 * give them back. This leaves the reservation structures intact, but removes
 * all of their corresponding "no-fly zones".
 */
static void return_all_reservations(struct gfs2_rgrpd *rgd)
{
	struct rb_node *n;
	struct gfs2_blkreserv *rs;

	spin_lock(&rgd->rd_rsspin);
	while ((n = rb_first(&rgd->rd_rstree))) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		__rs_deltree(rs);
	}
	spin_unlock(&rgd->rd_rsspin);
}

void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
	struct rb_node *n;
	struct gfs2_rgrpd *rgd;
	struct gfs2_glock *gl;

	while ((n = rb_first(&sdp->sd_rindex_tree))) {
		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
		gl = rgd->rd_gl;

		rb_erase(n, &sdp->sd_rindex_tree);

		if (gl) {
			if (gl->gl_state != LM_ST_UNLOCKED) {
				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
				flush_delayed_work(&gl->gl_work);
			}
			gfs2_rgrp_brelse(rgd);
			glock_clear_object(gl, rgd);
			gfs2_glock_put(gl);
		}

		gfs2_free_clones(rgd);
		return_all_reservations(rgd);
		kfree(rgd->rd_bits);
		rgd->rd_bits = NULL;
		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	}
}

/**
 * compute_bitstructs - Compute the bitmap sizes
 * @rgd: The resource group descriptor
 *
 * Calculates bitmap descriptors, one for each block that contains bitmap data
 *
 * Returns: errno
 */

static int compute_bitstructs(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi;
	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
	u32 bytes_left, bytes;
	int x;

	if (!length)
		return -EINVAL;

	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
	if (!rgd->rd_bits)
		return -ENOMEM;

	bytes_left = rgd->rd_bitbytes;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;

		bi->bi_flags = 0;
		/* small rgrp; bitmap stored completely in header block */
		if (length == 1) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* header block */
		} else if (x == 0) {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* last block */
		} else if (x + 1 == length) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* other blocks */
		} else {
			bytes = sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header);
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		}

		bytes_left -= bytes;
	}

	if (bytes_left) {
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}
	bi = rgd->rd_bits + (length - 1);
	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
		gfs2_lm(sdp,
			"ri_addr = %llu\n"
			"ri_length = %u\n"
			"ri_data0 = %llu\n"
			"ri_data = %u\n"
			"ri_bitbytes = %u\n"
			"start=%u len=%u offset=%u\n",
			(unsigned long long)rgd->rd_addr,
			rgd->rd_length,
			(unsigned long long)rgd->rd_data0,
			rgd->rd_data,
			rgd->rd_bitbytes,
			bi->bi_start, bi->bi_bytes, bi->bi_offset);
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}

	return 0;
}
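
/*
 * Layout sketch (added; block size and count are hypothetical): for an
 * rgrp with rd_length == 3, the descriptors computed above are:
 *
 *	rd_bits[0]: header block; data follows struct gfs2_rgrp,
 *		    bi_start == 0
 *	rd_bits[1]: full block; data follows the smaller
 *		    struct gfs2_meta_header
 *	rd_bits[2]: last block; holds whatever bytes_left remain
 *
 * In every case bi_blocks == bi_bytes * GFS2_NBBY, and the final check
 * verifies that the bitmaps cover exactly rd_data blocks.
 */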

/**
 * gfs2_ri_total - Total up the file system space, according to the rindex.
 * @sdp: the filesystem
 *
 */
u64 gfs2_ri_total(struct gfs2_sbd *sdp)
{
	u64 total_data = 0;
	struct inode *inode = sdp->sd_rindex;
	struct gfs2_inode *ip = GFS2_I(inode);
	char buf[sizeof(struct gfs2_rindex)];
	int error, rgrps;

	for (rgrps = 0;; rgrps++) {
		loff_t pos = rgrps * sizeof(struct gfs2_rindex);

		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
			break;
		error = gfs2_internal_read(ip, buf, &pos,
					   sizeof(struct gfs2_rindex));
		if (error != sizeof(struct gfs2_rindex))
			break;
		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
	}
	return total_data;
}

static int rgd_insert(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;

	/* Figure out where to put new node */
	while (*newn) {
		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
						  rd_node);

		parent = *newn;
		if (rgd->rd_addr < cur->rd_addr)
			newn = &((*newn)->rb_left);
		else if (rgd->rd_addr > cur->rd_addr)
			newn = &((*newn)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&rgd->rd_node, parent, newn);
	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
	sdp->sd_rgrps++;
	return 0;
}

/**
 * read_rindex_entry - Pull in a new resource index entry from the disk
 * @ip: Pointer to the rindex inode
 *
 * Returns: 0 on success, > 0 on EOF, error code otherwise
 */

static int read_rindex_entry(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
	struct gfs2_rindex buf;
	int error;
	struct gfs2_rgrpd *rgd;

	if (pos >= i_size_read(&ip->i_inode))
		return 1;

	error = gfs2_internal_read(ip, (char *)&buf, &pos,
				   sizeof(struct gfs2_rindex));

	if (error != sizeof(struct gfs2_rindex))
		return (error == 0) ? 1 : error;

	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
	error = -ENOMEM;
	if (!rgd)
		return error;

	rgd->rd_sbd = sdp;
	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
	rgd->rd_length = be32_to_cpu(buf.ri_length);
	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
	rgd->rd_data = be32_to_cpu(buf.ri_data);
	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
	spin_lock_init(&rgd->rd_rsspin);

	error = gfs2_glock_get(sdp, rgd->rd_addr,
			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
	if (error)
		goto fail;

	error = compute_bitstructs(rgd);
	if (error)
		goto fail_glock;

	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
	error = rgd_insert(rgd);
	spin_unlock(&sdp->sd_rindex_spin);
	if (!error) {
		glock_set_object(rgd->rd_gl, rgd);
		return 0;
	}

	error = 0; /* someone else read in the rgrp; free it and ignore it */
fail_glock:
	gfs2_glock_put(rgd->rd_gl);

fail:
	kfree(rgd->rd_bits);
	rgd->rd_bits = NULL;
	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	return error;
}

/**
 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
 * @sdp: the GFS2 superblock
 *
 * The purpose of this function is to select a subset of the resource groups
 * and mark them as PREFERRED. We do it in such a way that each node prefers
 * to use a unique set of rgrps to minimize glock contention.
 */
static void set_rgrp_preferences(struct gfs2_sbd *sdp)
{
	struct gfs2_rgrpd *rgd, *first;
	int i;

	/* Skip an initial number of rgrps, based on this node's journal ID.
	   That should start each node out on its own set. */
	rgd = gfs2_rgrpd_get_first(sdp);
	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
		rgd = gfs2_rgrpd_get_next(rgd);
	first = rgd;

	do {
		rgd->rd_flags |= GFS2_RDF_PREFERRED;
		for (i = 0; i < sdp->sd_journals; i++) {
			rgd = gfs2_rgrpd_get_next(rgd);
			if (!rgd || rgd == first)
				break;
		}
	} while (rgd && rgd != first);
}
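
/*
 * Example (added for illustration): with sd_journals == 3, the node
 * with journal ID 0 marks rgrps 0, 3, 6, ... as PREFERRED, the node
 * with jid 1 marks rgrps 1, 4, 7, ..., and the node with jid 2 marks
 * rgrps 2, 5, 8, ...  Each node thus steers allocations toward its own
 * stripe of resource groups, keeping their glocks from bouncing
 * between nodes.
 */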

/**
 * gfs2_ri_update - Pull in a new resource index from the disk
 * @ip: pointer to the rindex inode
 *
 * Returns: 0 on successful update, error code otherwise
 */

static int gfs2_ri_update(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	do {
		error = read_rindex_entry(ip);
	} while (error == 0);

	if (error < 0)
		return error;

	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
		fs_err(sdp, "no resource groups found in the file system.\n");
		return -ENOENT;
	}
	set_rgrp_preferences(sdp);

	sdp->sd_rindex_uptodate = 1;
	return 0;
}

/**
 * gfs2_rindex_update - Update the rindex if required
 * @sdp: The GFS2 superblock
 *
 * We grab a lock on the rindex inode to make sure that it doesn't
 * change whilst we are performing an operation. We keep this lock
 * for quite long periods of time compared to other locks. This
 * doesn't matter, since it is shared and it is very, very rarely
 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
 *
 * This makes sure that we're using the latest copy of the resource index
 * special file, which might have been updated if someone expanded the
 * filesystem (via gfs2_grow utility), which adds new resource groups.
 *
 * Returns: 0 on success, error code otherwise
 */

int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
	struct gfs2_glock *gl = ip->i_gl;
	struct gfs2_holder ri_gh;
	int error = 0;
	int unlock_required = 0;

	/* Read new copy from disk if we don't have the latest */
	if (!sdp->sd_rindex_uptodate) {
		if (!gfs2_glock_is_locked_by_me(gl)) {
			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
			if (error)
				return error;
			unlock_required = 1;
		}
		if (!sdp->sd_rindex_uptodate)
			error = gfs2_ri_update(ip);
		if (unlock_required)
			gfs2_glock_dq_uninit(&ri_gh);
	}

	return error;
}

static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
	const struct gfs2_rgrp *str = buf;
	u32 rg_flags;

	rg_flags = be32_to_cpu(str->rg_flags);
	rg_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= rg_flags;
	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
}

static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
{
	const struct gfs2_rgrp *str = buf;

	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
	rgl->rl_flags = str->rg_flags;
	rgl->rl_free = str->rg_free;
	rgl->rl_dinodes = str->rg_dinodes;
	rgl->rl_igeneration = str->rg_igeneration;
	rgl->__pad = 0UL;
}

static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
	struct gfs2_rgrp *str = buf;
	u32 crc;

	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	if (next == NULL)
		str->rg_skip = 0;
	else if (next->rd_addr > rgd->rd_addr)
		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
	str->rg_data = cpu_to_be32(rgd->rd_data);
	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
	str->rg_crc = 0;
	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
	str->rg_crc = cpu_to_be32(crc);

	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
}

static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
{
	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	int valid = 1;

	if (rgl->rl_flags != str->rg_flags) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
		valid = 0;
	}
	if (rgl->rl_free != str->rg_free) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
		valid = 0;
	}
	if (rgl->rl_dinodes != str->rg_dinodes) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_dinodes),
			be32_to_cpu(str->rg_dinodes));
		valid = 0;
	}
	if (rgl->rl_igeneration != str->rg_igeneration) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
			(unsigned long long)rgd->rd_addr,
			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
			(unsigned long long)be64_to_cpu(str->rg_igeneration));
		valid = 0;
	}
	return valid;
}

static u32 count_unlinked(struct gfs2_rgrpd *rgd)
{
	struct gfs2_bitmap *bi;
	const u32 length = rgd->rd_length;
	const u8 *buffer = NULL;
	u32 i, goal, count = 0;

	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
		goal = 0;
		buffer = bi->bi_bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bi->bi_bh));
		while (goal < bi->bi_blocks) {
			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
					   GFS2_BLKST_UNLINKED);
			if (goal == BFITNOENT)
				break;
			count++;
			goal++;
		}
	}

	return count;
}


/**
 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
 * @rgd: the struct gfs2_rgrpd describing the RG to read in
 *
 * Read in all of a Resource Group's header and bitmap blocks.
 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
 *
 * Returns: errno
 */

static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl = rgd->rd_gl;
	unsigned int length = rgd->rd_length;
	struct gfs2_bitmap *bi;
	unsigned int x, y;
	int error;

	if (rgd->rd_bits[0].bi_bh != NULL)
		return 0;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;
		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
		if (error)
			goto fail;
	}

	for (y = length; y--;) {
		bi = rgd->rd_bits + y;
		error = gfs2_meta_wait(sdp, bi->bi_bh);
		if (error)
			goto fail;
		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
					      GFS2_METATYPE_RG)) {
			error = -EIO;
			goto fail;
		}
	}

	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
		for (x = 0; x < length; x++)
			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
		rgd->rd_free_clone = rgd->rd_free;
		/* max out the rgrp allocation failure point */
		rgd->rd_extfail_pt = rgd->rd_free;
	}
	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
				     rgd->rd_bits[0].bi_bh->b_data);
	} else if (sdp->sd_args.ar_rgrplvb) {
		if (!gfs2_rgrp_lvb_valid(rgd)) {
			gfs2_consist_rgrpd(rgd);
			error = -EIO;
			goto fail;
		}
		if (rgd->rd_rgl->rl_unlinked == 0)
			rgd->rd_flags &= ~GFS2_RDF_CHECK;
	}
	return 0;

fail:
	while (x--) {
		bi = rgd->rd_bits + x;
		brelse(bi->bi_bh);
		bi->bi_bh = NULL;
		gfs2_assert_warn(sdp, !bi->bi_clone);
	}

	return error;
}

static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
{
	u32 rl_flags;

	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
		return 0;

	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);

	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
	rl_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
	if (rgd->rd_rgl->rl_unlinked == 0)
		rgd->rd_flags &= ~GFS2_RDF_CHECK;
	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
	rgd->rd_free_clone = rgd->rd_free;
	/* max out the rgrp allocation failure point */
	rgd->rd_extfail_pt = rgd->rd_free;
	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
	return 0;
}

int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
		return 0;
	return gfs2_rgrp_bh_get(rgd);
}

/**
 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
 * @rgd: The resource group
 *
 */

void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
{
	int x, length = rgd->rd_length;

	for (x = 0; x < length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		if (bi->bi_bh) {
			brelse(bi->bi_bh);
			bi->bi_bh = NULL;
		}
	}
}

int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
			    struct buffer_head *bh,
			    const struct gfs2_bitmap *bi, unsigned minlen,
			    u64 *ptrimmed)
{
	struct super_block *sb = sdp->sd_vfs;
	u64 blk;
	sector_t start = 0;
	sector_t nr_blks = 0;
	int rv;
	unsigned int x;
	u32 trimmed = 0;
	u8 diff;

	for (x = 0; x < bi->bi_bytes; x++) {
		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
		clone += bi->bi_offset;
		clone += x;
		if (bh) {
			const u8 *orig = bh->b_data + bi->bi_offset + x;
			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
		} else {
			diff = ~(*clone | (*clone >> 1));
		}
		diff &= 0x55;
		if (diff == 0)
			continue;
		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
		while(diff) {
			if (diff & 1) {
				if (nr_blks == 0)
					goto start_new_extent;
				if ((start + nr_blks) != blk) {
					if (nr_blks >= minlen) {
						rv = sb_issue_discard(sb,
							start, nr_blks,
							GFP_NOFS, 0);
						if (rv)
							goto fail;
						trimmed += nr_blks;
					}
					nr_blks = 0;
start_new_extent:
					start = blk;
				}
				nr_blks++;
			}
			diff >>= 2;
			blk++;
		}
	}
	if (nr_blks >= minlen) {
		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
		if (rv)
			goto fail;
		trimmed += nr_blks;
	}
	if (ptrimmed)
		*ptrimmed = trimmed;
	return 0;

fail:
	if (sdp->sd_args.ar_discard)
		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
	sdp->sd_args.ar_discard = 0;
	return -EIO;
}
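
/*
 * Worked example of the diff logic above (added): for any bitmap byte x,
 * (x | (x >> 1)) & 0x55 has a 1 in an even bit position iff the
 * corresponding block is busy (any state other than free).  So
 *
 *	diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)) & 0x55
 *
 * marks blocks that are free in the bitmap being written back but still
 * busy in the clone, i.e. blocks freed in the current transaction.
 * E.g. *orig == 0x00 and *clone == 0x05 gives diff == 0x05: blocks 0
 * and 1 of that byte were just freed.  With bh == NULL (the fstrim
 * case), every free block is a discard candidate instead.
 */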

/**
 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
 * @filp: Any file on the filesystem
 * @argp: Pointer to the arguments (also used to pass result)
 *
 * Returns: 0 on success, otherwise error code
 */

int gfs2_fitrim(struct file *filp, void __user *argp)
{
	struct inode *inode = file_inode(filp);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
	struct buffer_head *bh;
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrpd *rgd_end;
	struct gfs2_holder gh;
	struct fstrim_range r;
	int ret = 0;
	u64 amt;
	u64 trimmed = 0;
	u64 start, end, minlen;
	unsigned int x;
	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
		return -EROFS;

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;

	if (copy_from_user(&r, argp, sizeof(r)))
		return -EFAULT;

	ret = gfs2_rindex_update(sdp);
	if (ret)
		return ret;

	start = r.start >> bs_shift;
	end = start + (r.len >> bs_shift);
	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
	minlen = max_t(u64, minlen,
		       q->limits.discard_granularity) >> bs_shift;

	if (end <= start || minlen > sdp->sd_max_rg_data)
		return -EINVAL;

	rgd = gfs2_blk2rgrpd(sdp, start, 0);
	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);

	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
		return -EINVAL; /* start is beyond the end of the fs */

	while (1) {

		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret)
			goto out;

		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
			/* Trim each bitmap in the rgrp */
			for (x = 0; x < rgd->rd_length; x++) {
				struct gfs2_bitmap *bi = rgd->rd_bits + x;
				ret = gfs2_rgrp_send_discards(sdp,
						rgd->rd_data0, NULL, bi, minlen,
						&amt);
				if (ret) {
					gfs2_glock_dq_uninit(&gh);
					goto out;
				}
				trimmed += amt;
			}

			/* Mark rgrp as having been trimmed */
			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
			if (ret == 0) {
				bh = rgd->rd_bits[0].bi_bh;
				rgd->rd_flags |= GFS2_RGF_TRIMMED;
				gfs2_trans_add_meta(rgd->rd_gl, bh);
				gfs2_rgrp_out(rgd, bh->b_data);
				gfs2_trans_end(sdp);
			}
		}
		gfs2_glock_dq_uninit(&gh);

		if (rgd == rgd_end)
			break;

		rgd = gfs2_rgrpd_get_next(rgd);
	}

out:
	r.len = trimmed << bs_shift;
	if (copy_to_user(argp, &r, sizeof(r)))
		return -EFAULT;

	return ret;
}

/**
 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
 * @ip: the inode structure
 *
 */
static void rs_insert(struct gfs2_inode *ip)
{
	struct rb_node **newn, *parent = NULL;
	int rc;
	struct gfs2_blkreserv *rs = &ip->i_res;
	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);

	BUG_ON(gfs2_rs_active(rs));

	spin_lock(&rgd->rd_rsspin);
	newn = &rgd->rd_rstree.rb_node;
	while (*newn) {
		struct gfs2_blkreserv *cur =
			rb_entry(*newn, struct gfs2_blkreserv, rs_node);

		parent = *newn;
		rc = rs_cmp(fsblock, rs->rs_free, cur);
		if (rc > 0)
			newn = &((*newn)->rb_right);
		else if (rc < 0)
			newn = &((*newn)->rb_left);
		else {
			spin_unlock(&rgd->rd_rsspin);
			WARN_ON(1);
			return;
		}
	}

	rb_link_node(&rs->rs_node, parent, newn);
	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);

	/* Do our rgrp accounting for the reservation */
	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
	spin_unlock(&rgd->rd_rsspin);
	trace_gfs2_rs(rs, TRACE_RS_INSERT);
}

/**
 * rgd_free - return the number of free blocks we can allocate.
 * @rgd: the resource group
 * @rs: our own reservation within @rgd
 *
 * This function returns the number of free blocks for an rgrp.
 * That's the clone-free blocks (blocks that are free, not including those
 * still being used for unlinked files that haven't been deleted.)
 *
 * It also subtracts any blocks reserved by someone else, but does not
 * include free blocks that are still part of our current reservation,
 * because obviously we can (and will) allocate them.
 */
static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
{
	u32 tot_reserved, tot_free;

	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
		return 0;
	tot_reserved = rgd->rd_reserved - rs->rs_free;

	if (rgd->rd_free_clone < tot_reserved)
		tot_reserved = 0;

	tot_free = rgd->rd_free_clone - tot_reserved;

	return tot_free;
}
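
/*
 * Arithmetic sketch (added; hypothetical numbers): if rd_free_clone is
 * 1000 blocks, rd_reserved is 300, and our own reservation's rs_free is
 * 100, then other inodes hold 300 - 100 = 200 reserved blocks, and
 * rgd_free() returns 1000 - 200 = 800: everything free in the clone
 * bitmap except what someone else has already claimed.
 */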

/**
 * rg_mblk_search - find a group of multiple free blocks to form a reservation
 * @rgd: the resource group descriptor
 * @ip: pointer to the inode for which we're reserving blocks
 * @ap: the allocation parameters
 *
 */

static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
			   const struct gfs2_alloc_parms *ap)
{
	struct gfs2_rbm rbm = { .rgd = rgd, };
	u64 goal;
	struct gfs2_blkreserv *rs = &ip->i_res;
	u32 extlen;
	u32 free_blocks = rgd_free(rgd, rs);
	int ret;
	struct inode *inode = &ip->i_inode;

	if (S_ISDIR(inode->i_mode))
		extlen = 1;
	else {
		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
	}
	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
		return;

	/* Find bitmap block that contains bits for goal block */
	if (rgrp_contains_block(rgd, ip->i_goal))
		goal = ip->i_goal;
	else
		goal = rgd->rd_last_alloc + rgd->rd_data0;

	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
		return;

	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
	if (ret == 0) {
		rs->rs_rbm = rbm;
		rs->rs_free = extlen;
		rs_insert(ip);
	} else {
		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
			rgd->rd_last_alloc = 0;
	}
}

/**
 * gfs2_next_unreserved_block - Return next block that is not reserved
 * @rgd: The resource group
 * @block: The starting block
 * @length: The required length
 * @ip: Ignore any reservations for this inode
 *
 * If the block does not appear in any reservation, then return the
 * block number unchanged. If it does appear in the reservation, then
 * keep looking through the tree of reservations in order to find the
 * first block number which is not reserved.
 */

static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
				      u32 length,
				      const struct gfs2_inode *ip)
{
	struct gfs2_blkreserv *rs;
	struct rb_node *n;
	int rc;

	spin_lock(&rgd->rd_rsspin);
	n = rgd->rd_rstree.rb_node;
	while (n) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		rc = rs_cmp(block, length, rs);
		if (rc < 0)
			n = n->rb_left;
		else if (rc > 0)
			n = n->rb_right;
		else
			break;
	}

	if (n) {
		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
			n = n->rb_right;
			if (n == NULL)
				break;
			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		}
	}

	spin_unlock(&rgd->rd_rsspin);
	return block;
}
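
/*
 * Example walk (added; hypothetical numbers): if reservations cover
 * blocks [100, 149] and [150, 199] and we ask for 10 blocks starting
 * at 140, the lookup finds the first overlap, bumps block to 150 (the
 * end of that reservation), then overlaps the second reservation and
 * bumps block to 200, which is what gets returned.  The walk stops at
 * a reservation belonging to @ip itself, since an inode may allocate
 * from its own reservation.
 */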
1622 
1623 /**
1624  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1625  * @rbm: The current position in the resource group
1626  * @ip: The inode for which we are searching for blocks
1627  * @minext: The minimum extent length
1628  * @maxext: A pointer to the maximum extent structure
1629  *
1630  * This checks the current position in the rgrp to see whether there is
1631  * a reservation covering this block. If not then this function is a
1632  * no-op. If there is, then the position is moved to the end of the
1633  * contiguous reservation(s) so that we are pointing at the first
1634  * non-reserved block.
1635  *
1636  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1637  */
1638 
gfs2_reservation_check_and_update(struct gfs2_rbm * rbm,const struct gfs2_inode * ip,u32 minext,struct gfs2_extent * maxext)1639 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1640 					     const struct gfs2_inode *ip,
1641 					     u32 minext,
1642 					     struct gfs2_extent *maxext)
1643 {
1644 	u64 block = gfs2_rbm_to_block(rbm);
1645 	u32 extlen = 1;
1646 	u64 nblock;
1647 	int ret;
1648 
1649 	/*
1650 	 * If we have a minimum extent length, then skip over any extent
1651 	 * which is less than the min extent length in size.
1652 	 */
1653 	if (minext > 1) {
1654 		extlen = gfs2_free_extlen(rbm, minext);
1655 		if (extlen <= maxext->len)
1656 			goto fail;
1657 	}
1658 
1659 	/*
1660 	 * Check the extent which has been found against the reservations
1661 	 * and skip if parts of it are already reserved
1662 	 */
1663 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1664 	if (nblock == block) {
1665 		if (!minext || extlen >= minext)
1666 			return 0;
1667 
1668 		if (extlen > maxext->len) {
1669 			maxext->len = extlen;
1670 			maxext->rbm = *rbm;
1671 		}
1672 fail:
1673 		nblock = block + extlen;
1674 	}
1675 	ret = gfs2_rbm_from_block(rbm, nblock);
1676 	if (ret < 0)
1677 		return ret;
1678 	return 1;
1679 }
1680 
1681 /**
1682  * gfs2_rbm_find - Look for blocks of a particular state
1683  * @rbm: Value/result starting position and final position
1684  * @state: The state which we want to find
1685  * @minext: Pointer to the requested extent length
1686  *          This is updated to be the actual reservation size.
1687  * @ip: If set, check for reservations
1688  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1689  *          around until we've reached the starting point.
1690  *
1691  * Side effects:
1692  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1693  *   has no free blocks in it.
1694  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1695  *   has come up short on a free block search.
1696  *
1697  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1698  */
1699 
gfs2_rbm_find(struct gfs2_rbm * rbm,u8 state,u32 * minext,const struct gfs2_inode * ip,bool nowrap)1700 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1701 			 const struct gfs2_inode *ip, bool nowrap)
1702 {
1703 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1704 	struct buffer_head *bh;
1705 	int last_bii;
1706 	u32 offset;
1707 	u8 *buffer;
1708 	bool wrapped = false;
1709 	int ret;
1710 	struct gfs2_bitmap *bi;
1711 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1712 
1713 	/*
1714 	 * Determine the last bitmap to search.  If we're not starting at the
1715 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1716 	 * the entire resource group.
1717 	 */
1718 	last_bii = rbm->bii - (rbm->offset == 0);
1719 
1720 	while(1) {
1721 		bi = rbm_bi(rbm);
1722 		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1723 		    (state == GFS2_BLKST_FREE))
1724 			goto next_bitmap;
1725 
1726 		bh = bi->bi_bh;
1727 		buffer = bh->b_data + bi->bi_offset;
1728 		WARN_ON(!buffer_uptodate(bh));
1729 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1730 			buffer = bi->bi_clone + bi->bi_offset;
1731 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1732 		if (offset == BFITNOENT) {
1733 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1734 				set_bit(GBF_FULL, &bi->bi_flags);
1735 			goto next_bitmap;
1736 		}
1737 		rbm->offset = offset;
1738 		if (ip == NULL)
1739 			return 0;
1740 
1741 		ret = gfs2_reservation_check_and_update(rbm, ip, *minext,
1742 							&maxext);
1743 		if (ret == 0)
1744 			return 0;
1745 		if (ret > 0)
1746 			goto next_iter;
1747 		if (ret == -E2BIG) {
1748 			rbm->bii = 0;
1749 			rbm->offset = 0;
1750 			goto res_covered_end_of_rgrp;
1751 		}
1752 		return ret;
1753 
1754 next_bitmap:	/* Find next bitmap in the rgrp */
1755 		rbm->offset = 0;
1756 		rbm->bii++;
1757 		if (rbm->bii == rbm->rgd->rd_length)
1758 			rbm->bii = 0;
1759 res_covered_end_of_rgrp:
1760 		if (rbm->bii == 0) {
1761 			if (wrapped)
1762 				break;
1763 			wrapped = true;
1764 			if (nowrap)
1765 				break;
1766 		}
1767 next_iter:
1768 		/* Have we scanned the entire resource group? */
1769 		if (wrapped && rbm->bii > last_bii)
1770 			break;
1771 	}
1772 
1773 	if (state != GFS2_BLKST_FREE)
1774 		return -ENOSPC;
1775 
1776 	/* If the extent was too small, and it's smaller than the smallest
1777 	   to have failed before, remember for future reference that it's
1778 	   useless to search this rgrp again for this amount or more. */
1779 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1780 	    *minext < rbm->rgd->rd_extfail_pt)
1781 		rbm->rgd->rd_extfail_pt = *minext - 1;
1782 
1783 	/* If the maximum extent we found is big enough to fulfill the
1784 	   minimum requirements, use it anyway. */
1785 	if (maxext.len) {
1786 		*rbm = maxext.rbm;
1787 		*minext = maxext.len;
1788 		return 0;
1789 	}
1790 
1791 	return -ENOSPC;
1792 }
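/*
 * Editorial illustration, not part of this file: gfs2_rbm_find() above
 * walks bitmaps in which each block is described by two bits, so a byte
 * packs four block states. This self-contained userspace C sketch models
 * a simplified, byte-at-a-time gfs2_bitfit(); the demo_* names are
 * invented here, and the real routine scans a machine word at a time.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_NBBY     4                 /* blocks per byte, two bits each */
#define DEMO_BITNOENT ((uint32_t)~0)    /* "nothing found", as BFITNOENT */

/* Return the 2-bit state of block 'blk'. */
static unsigned demo_getstate(const uint8_t *bits, uint32_t blk)
{
	return (bits[blk / DEMO_NBBY] >> ((blk % DEMO_NBBY) * 2)) & 3;
}

/* Find the first block at or after 'goal' that is in 'state'. */
static uint32_t demo_bitfit(const uint8_t *bits, uint32_t nblocks,
			    uint32_t goal, unsigned state)
{
	uint32_t blk;

	for (blk = goal; blk < nblocks; blk++)
		if (demo_getstate(bits, blk) == state)
			return blk;
	return DEMO_BITNOENT;
}

int main(void)
{
	uint8_t bits[2];

	memset(bits, 0x55, sizeof(bits)); /* every block "used" (state 1) */
	bits[1] &= (uint8_t)~(3 << 4);    /* mark block 6 free (state 0) */

	printf("first free block: %u\n", demo_bitfit(bits, 8, 0, 0));
	return 0;
}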
1793 
1794 /**
1795  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1796  * @rgd: The rgrp
1797  * @last_unlinked: block address of the last dinode we unlinked
1798  * @skip: block address we should explicitly not unlink
1799  *
1800  * Queues delete work for any eligible unlinked inodes found, so their
1801  * space can be reclaimed; there is no return value.
1802  */
1803 
1804 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1805 {
1806 	u64 block;
1807 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1808 	struct gfs2_glock *gl;
1809 	struct gfs2_inode *ip;
1810 	int error;
1811 	int found = 0;
1812 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1813 
1814 	while (1) {
1815 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1816 				      true);
1817 		if (error == -ENOSPC)
1818 			break;
1819 		if (WARN_ON_ONCE(error))
1820 			break;
1821 
1822 		block = gfs2_rbm_to_block(&rbm);
1823 		if (gfs2_rbm_from_block(&rbm, block + 1))
1824 			break;
1825 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1826 			continue;
1827 		if (block == skip)
1828 			continue;
1829 		*last_unlinked = block;
1830 
1831 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1832 		if (error)
1833 			continue;
1834 
1835 		/* If the inode is already in cache, we can ignore it here
1836 		 * because the existing inode disposal code will deal with
1837 		 * it when all refs have gone away. Accessing gl_object like
1838 		 * this is not safe in general. Here it is ok because we do
1839 		 * not dereference the pointer, and we only need an approx
1840 		 * answer to whether it is NULL or not.
1841 		 */
1842 		ip = gl->gl_object;
1843 
1844 		if (ip || !gfs2_queue_delete_work(gl, 0))
1845 			gfs2_glock_put(gl);
1846 		else
1847 			found++;
1848 
1849 		/* Limit reclaim to sensible number of tasks */
1850 		if (found > NR_CPUS)
1851 			return;
1852 	}
1853 
1854 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1855 	return;
1856 }
1857 
1858 /**
1859  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1860  * @rgd: The rgrp in question
1861  * @loops: An indication of how picky we can be (0=very, 1=less so)
1862  *
1863  * This function uses the recently added glock statistics in order to
1864  * figure out whether a particular resource group is suffering from
1865  * contention from multiple nodes. This is done purely on the basis
1866  * of timings, since this is the only data we have to work with and
1867  * our aim here is to reject a resource group which is highly contended
1868  * but (very important) not to do this too often in order to ensure that
1869  * we do not end up introducing fragmentation by changing resource
1870  * groups when not actually required.
1871  *
1872  * The calculation is fairly simple, we want to know whether the SRTTB
1873  * (i.e. smoothed round trip time for blocking operations) to acquire
1874  * the lock for this rgrp's glock is significantly greater than the
1875  * time taken for resource groups on average. We introduce a margin in
1876  * the form of the variable @var which is computed as the sum of the two
1877  * respective variances, and multiplied by a factor depending on @loops
1878  * and whether we have a lot of data to base the decision on. This is
1879  * then tested against the square difference of the means in order to
1880  * decide whether the result is statistically significant or not.
1881  *
1882  * Returns: A boolean verdict on the congestion status
1883  */
1884 
1885 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1886 {
1887 	const struct gfs2_glock *gl = rgd->rd_gl;
1888 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1889 	struct gfs2_lkstats *st;
1890 	u64 r_dcount, l_dcount;
1891 	u64 l_srttb, a_srttb = 0;
1892 	s64 srttb_diff;
1893 	u64 sqr_diff;
1894 	u64 var;
1895 	int cpu, nonzero = 0;
1896 
1897 	preempt_disable();
1898 	for_each_present_cpu(cpu) {
1899 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1900 		if (st->stats[GFS2_LKS_SRTTB]) {
1901 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1902 			nonzero++;
1903 		}
1904 	}
1905 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1906 	if (nonzero)
1907 		do_div(a_srttb, nonzero);
1908 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1909 	var = st->stats[GFS2_LKS_SRTTVARB] +
1910 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1911 	preempt_enable();
1912 
1913 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1914 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1915 
1916 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1917 		return false;
1918 
1919 	srttb_diff = a_srttb - l_srttb;
1920 	sqr_diff = srttb_diff * srttb_diff;
1921 
1922 	var *= 2;
1923 	if (l_dcount < 8 || r_dcount < 8)
1924 		var *= 2;
1925 	if (loops == 1)
1926 		var *= 2;
1927 
1928 	return ((srttb_diff < 0) && (sqr_diff > var));
1929 }
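/*
 * Editorial illustration, not part of this file: the verdict above boils
 * down to "is the squared difference of the mean round-trip times larger
 * than a margin built from the summed variances?". This hedged userspace
 * sketch re-creates only that arithmetic; demo_congested() and the sample
 * figures are invented, and no glock statistics are involved.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Is the local smoothed RTT (l_srttb) significantly worse than the
 * all-rgrp average (a_srttb), given summed variances and demand counts? */
static bool demo_congested(uint64_t l_srttb, uint64_t a_srttb, uint64_t var,
			   uint64_t l_dcount, uint64_t r_dcount, int loops)
{
	int64_t srttb_diff;
	uint64_t sqr_diff;

	if (l_dcount < 1 || r_dcount < 1 || a_srttb == 0)
		return false;              /* too little data: assume fine */

	srttb_diff = (int64_t)a_srttb - (int64_t)l_srttb;
	sqr_diff = (uint64_t)(srttb_diff * srttb_diff);

	var *= 2;                          /* base margin */
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;                  /* few samples: widen the margin */
	if (loops == 1)
		var *= 2;                  /* later pass: be less picky */

	/* Congested only if we are slower AND the gap is significant. */
	return srttb_diff < 0 && sqr_diff > var;
}

int main(void)
{
	printf("congested: %d\n", demo_congested(900, 300, 10000, 20, 20, 0));
	return 0;
}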
1930 
1931 /**
1932  * gfs2_rgrp_used_recently - check how recently the rgrp glock was used
1933  * @rs: The block reservation with the rgrp to test
1934  * @msecs: The time limit in milliseconds
1935  *
1936  * Returns: True if the rgrp glock has been used within the time limit
1937  */
1938 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1939 				    u64 msecs)
1940 {
1941 	u64 tdiff;
1942 
1943 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1944                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1945 
1946 	return tdiff > (msecs * 1000 * 1000);
1947 }
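/*
 * Editorial illustration, not part of this file: the test above converts
 * a millisecond limit to nanoseconds and compares it with the time since
 * the glock's last activity stamp. A minimal userspace equivalent of that
 * conversion, using CLOCK_REALTIME in place of ktime_get_real();
 * demo_elapsed_over() is an invented name.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* True if more than 'msecs' milliseconds have passed since 'stamp_ns'. */
static bool demo_elapsed_over(uint64_t stamp_ns, uint64_t msecs)
{
	struct timespec now;
	uint64_t now_ns;

	clock_gettime(CLOCK_REALTIME, &now);
	now_ns = (uint64_t)now.tv_sec * 1000000000ULL + (uint64_t)now.tv_nsec;

	return now_ns - stamp_ns > msecs * 1000 * 1000;
}

int main(void)
{
	struct timespec now;
	uint64_t stamp;

	clock_gettime(CLOCK_REALTIME, &now);
	stamp = (uint64_t)now.tv_sec * 1000000000ULL + (uint64_t)now.tv_nsec;

	printf("over 1000ms already? %d\n", demo_elapsed_over(stamp, 1000));
	return 0;
}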
1948 
1949 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1950 {
1951 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1952 	u32 skip;
1953 
1954 	get_random_bytes(&skip, sizeof(skip));
1955 	return skip % sdp->sd_rgrps;
1956 }
1957 
1958 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1959 {
1960 	struct gfs2_rgrpd *rgd = *pos;
1961 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1962 
1963 	rgd = gfs2_rgrpd_get_next(rgd);
1964 	if (rgd == NULL)
1965 		rgd = gfs2_rgrpd_get_first(sdp);
1966 	*pos = rgd;
1967 	if (rgd != begin) /* If we didn't wrap */
1968 		return true;
1969 	return false;
1970 }
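/*
 * Editorial illustration, not part of this file: gfs2_select_rgrp() is a
 * round-robin step with wrap detection. The sketch below reproduces that
 * pattern over a plain singly linked list; the demo_* names are invented
 * and there is no rgrp machinery here.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_rgrp {
	int id;
	struct demo_rgrp *next;          /* NULL at the end of the list */
};

/* Advance *pos to the next group, wrapping to 'first' at the end;
 * return false once we arrive back at 'begin'. */
static bool demo_select(struct demo_rgrp **pos, struct demo_rgrp *first,
			const struct demo_rgrp *begin)
{
	struct demo_rgrp *rgd = (*pos)->next;

	if (rgd == NULL)
		rgd = first;
	*pos = rgd;
	return rgd != begin;             /* true while we have not wrapped */
}

int main(void)
{
	struct demo_rgrp c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct demo_rgrp *pos = &b;      /* start mid-list, as a goal might */

	do {
		printf("visiting rgrp %d\n", pos->id);
	} while (demo_select(&pos, &a, &b));
	return 0;
}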
1971 
1972 /**
1973  * fast_to_acquire - determine if a resource group will be fast to acquire
1974  * @rgd: The rgrp to test
1975  * If this is one of our preferred rgrps, it should be quicker to acquire,
1976  * because we tried to set ourselves up as dlm lock master.
1977  */
1978 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1979 {
1980 	struct gfs2_glock *gl = rgd->rd_gl;
1981 
1982 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1983 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1984 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1985 		return 1;
1986 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1987 		return 1;
1988 	return 0;
1989 }
1990 
1991 /**
1992  * gfs2_inplace_reserve - Reserve space in the filesystem
1993  * @ip: the inode to reserve space for
1994  * @ap: the allocation parameters
1995  *
1996  * We try our best to find an rgrp that has at least ap->target blocks
1997  * available. After a couple of passes (loops == 2), the prospects of finding
1998  * such an rgrp diminish. At this stage, we return the first rgrp that has
1999  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2000  * the number of blocks available in the chosen rgrp.
2001  *
2002  * Returns: 0 on success,
2003  *          -ENOSPC if a suitable rgrp can't be found
2004  *          errno otherwise
2005  */
2006 
2007 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2008 {
2009 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2010 	struct gfs2_rgrpd *begin = NULL;
2011 	struct gfs2_blkreserv *rs = &ip->i_res;
2012 	int error = 0, rg_locked, flags = 0;
2013 	u64 last_unlinked = NO_BLOCK;
2014 	int loops = 0;
2015 	u32 free_blocks, skip = 0;
2016 
2017 	if (sdp->sd_args.ar_rgrplvb)
2018 		flags |= GL_SKIP;
2019 	if (gfs2_assert_warn(sdp, ap->target))
2020 		return -EINVAL;
2021 	if (gfs2_rs_active(rs)) {
2022 		begin = rs->rs_rbm.rgd;
2023 	} else if (rs->rs_rbm.rgd &&
2024 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2025 		begin = rs->rs_rbm.rgd;
2026 	} else {
2027 		check_and_update_goal(ip);
2028 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2029 	}
2030 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2031 		skip = gfs2_orlov_skip(ip);
2032 	if (rs->rs_rbm.rgd == NULL)
2033 		return -EBADSLT;
2034 
2035 	while (loops < 3) {
2036 		rg_locked = 1;
2037 
2038 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2039 			rg_locked = 0;
2040 			if (skip && skip--)
2041 				goto next_rgrp;
2042 			if (!gfs2_rs_active(rs)) {
2043 				if (loops == 0 &&
2044 				    !fast_to_acquire(rs->rs_rbm.rgd))
2045 					goto next_rgrp;
2046 				if ((loops < 2) &&
2047 				    gfs2_rgrp_used_recently(rs, 1000) &&
2048 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2049 					goto next_rgrp;
2050 			}
2051 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2052 						   LM_ST_EXCLUSIVE, flags,
2053 						   &ip->i_rgd_gh);
2054 			if (unlikely(error))
2055 				return error;
2056 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2057 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2058 				goto skip_rgrp;
2059 			if (sdp->sd_args.ar_rgrplvb) {
2060 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2061 				if (unlikely(error)) {
2062 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2063 					return error;
2064 				}
2065 			}
2066 		}
2067 
2068 		/* Skip unusable resource groups */
2069 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2070 						 GFS2_RDF_ERROR)) ||
2071 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2072 			goto skip_rgrp;
2073 
2074 		if (sdp->sd_args.ar_rgrplvb)
2075 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2076 
2077 		/* Get a reservation if we don't already have one */
2078 		if (!gfs2_rs_active(rs))
2079 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2080 
2081 		/* Skip rgrps when we can't get a reservation on first pass */
2082 		if (!gfs2_rs_active(rs) && (loops < 1))
2083 			goto check_rgrp;
2084 
2085 		/* If rgrp has enough free space, use it */
2086 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2087 		if (free_blocks >= ap->target ||
2088 		    (loops == 2 && ap->min_target &&
2089 		     free_blocks >= ap->min_target)) {
2090 			ap->allowed = free_blocks;
2091 			return 0;
2092 		}
2093 check_rgrp:
2094 		/* Check for unlinked inodes which can be reclaimed */
2095 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2096 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2097 					ip->i_no_addr);
2098 skip_rgrp:
2099 		/* Drop reservation, if we couldn't use reserved rgrp */
2100 		if (gfs2_rs_active(rs))
2101 			gfs2_rs_deltree(rs);
2102 
2103 		/* Unlock rgrp if required */
2104 		if (!rg_locked)
2105 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2106 next_rgrp:
2107 		/* Find the next rgrp, and continue looking */
2108 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2109 			continue;
2110 		if (skip)
2111 			continue;
2112 
2113 		/* If we've scanned all the rgrps, but found no free blocks
2114 		 * then this checks for some less likely conditions before
2115 		 * trying again.
2116 		 */
2117 		loops++;
2118 		/* Check that fs hasn't grown if writing to rindex */
2119 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2120 			error = gfs2_ri_update(ip);
2121 			if (error)
2122 				return error;
2123 		}
2124 		/* Flushing the log may release space */
2125 		if (loops == 2)
2126 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2127 				       GFS2_LFC_INPLACE_RESERVE);
2128 	}
2129 
2130 	return -ENOSPC;
2131 }
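/*
 * Editorial illustration, not part of this file: the loop above gets less
 * picky on each of its three passes. This hedged sketch keeps only that
 * escalation skeleton (skip congested groups on early passes, honour the
 * extent-failure point on pass 0, accept min_target on pass 2) and drops
 * glocks, reservations and LVBs entirely; all demo_* names and the sample
 * figures are invented.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_rg {
	uint32_t free;        /* free blocks */
	uint32_t extfail_pt;  /* searches >= this length have failed before */
	bool congested;       /* other nodes are hammering this group */
};

/* Return the index of a usable group, or -1 (roughly -ENOSPC). */
static int demo_reserve(struct demo_rg *rgs, int n,
			uint32_t target, uint32_t min_target)
{
	for (int loops = 0; loops < 3; loops++) {
		for (int i = 0; i < n; i++) {
			struct demo_rg *rg = &rgs[i];

			if (loops < 2 && rg->congested)
				continue;           /* picky passes */
			if (loops == 0 && target > rg->extfail_pt)
				continue;           /* known lost cause */
			if (rg->free >= target)
				return i;
			if (loops == 2 && min_target &&
			    rg->free >= min_target)
				return i;           /* last resort */
		}
	}
	return -1;
}

int main(void)
{
	struct demo_rg rgs[] = {
		{ .free = 10, .extfail_pt = 4,  .congested = false },
		{ .free = 50, .extfail_pt = 64, .congested = true  },
	};

	printf("chose rgrp %d\n", demo_reserve(rgs, 2, 32, 8));
	return 0;
}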
2132 
2133 /**
2134  * gfs2_inplace_release - release an inplace reservation
2135  * @ip: the inode the reservation was taken out on
2136  *
2137  * Release a reservation made by gfs2_inplace_reserve().
2138  */
2139 
2140 void gfs2_inplace_release(struct gfs2_inode *ip)
2141 {
2142 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2143 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2144 }
2145 
2146 /**
2147  * gfs2_alloc_extent - allocate an extent from a given bitmap
2148  * @rbm: the resource group information
2149  * @dinode: TRUE if the first block we allocate is for a dinode
2150  * @n: The extent length (value/result)
2151  *
2152  * Add the bitmap buffer to the transaction.
2153  * Set the found bits to the new allocation state (dinode or data).
2154  */
2155 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2156 			     unsigned int *n)
2157 {
2158 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2159 	const unsigned int elen = *n;
2160 	u64 block;
2161 	int ret;
2162 
2163 	*n = 1;
2164 	block = gfs2_rbm_to_block(rbm);
2165 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2166 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2167 	block++;
2168 	while (*n < elen) {
2169 		ret = gfs2_rbm_from_block(&pos, block);
2170 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2171 			break;
2172 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2173 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2174 		(*n)++;
2175 		block++;
2176 	}
2177 }
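/*
 * Editorial illustration, not part of this file: gfs2_alloc_extent()
 * marks the block it was given and then keeps marking while the following
 * blocks are still free, returning the achieved length through the
 * value/result parameter. Below is a self-contained userspace sketch of
 * that behaviour over a two-bit bitmap; the get2/set2/demo_* helpers are
 * invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define NBBY 4   /* four 2-bit block states per byte */

static unsigned get2(const uint8_t *b, uint32_t blk)
{
	return (b[blk / NBBY] >> ((blk % NBBY) * 2)) & 3;
}

static void set2(uint8_t *b, uint32_t blk, unsigned st)
{
	unsigned shift = (blk % NBBY) * 2;

	b[blk / NBBY] = (uint8_t)((b[blk / NBBY] & ~(3u << shift)) |
				  (st << shift));
}

/* Mark the first block, then extend while the next block is still free
 * (state 0), up to 'want' blocks; return how many we actually got. */
static unsigned demo_alloc_extent(uint8_t *b, uint32_t nblocks,
				  uint32_t start, unsigned want)
{
	unsigned n = 0;

	while (n < want && start + n < nblocks &&
	       (n == 0 || get2(b, start + n) == 0)) {
		set2(b, start + n, 1);    /* 1 = used, as in GFS2 */
		n++;
	}
	return n;
}

int main(void)
{
	uint8_t bits[2] = { 0, 0 };       /* 8 free blocks */

	set2(bits, 3, 3);                 /* block 3 already metadata */
	printf("got %u of 4 blocks\n", demo_alloc_extent(bits, 8, 1, 4));
	return 0;
}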
2178 
2179 /**
2180  * rgblk_free - Change alloc state of given block(s)
2181  * @sdp: the filesystem
2182  * @rgd: the resource group the blocks are in
2183  * @bstart: the start of a run of blocks to free
2184  * @blen: the length of the block run (all must lie within ONE RG!)
2185  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2186  */
2187 
2188 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2189 		       u64 bstart, u32 blen, unsigned char new_state)
2190 {
2191 	struct gfs2_rbm rbm;
2192 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2193 
2194 	rbm.rgd = rgd;
2195 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2196 		return;
2197 	while (blen--) {
2198 		bi = rbm_bi(&rbm);
2199 		if (bi != bi_prev) {
2200 			if (!bi->bi_clone) {
2201 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2202 						      GFP_NOFS | __GFP_NOFAIL);
2203 				memcpy(bi->bi_clone + bi->bi_offset,
2204 				       bi->bi_bh->b_data + bi->bi_offset,
2205 				       bi->bi_bytes);
2206 			}
2207 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2208 			bi_prev = bi;
2209 		}
2210 		gfs2_setbit(&rbm, false, new_state);
2211 		gfs2_rbm_incr(&rbm);
2212 	}
2213 }
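/*
 * Editorial illustration, not part of this file: rgblk_free() lazily
 * duplicates a bitmap buffer into bi_clone before its first modification,
 * so searches can keep honouring the pre-free state until the change is
 * safely journaled. A minimal userspace sketch of that copy-on-first-write
 * idea; struct demo_buf and demo_modify() are invented names.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_buf {
	uint8_t data[8];     /* live bitmap data */
	uint8_t *clone;      /* lazily made pre-modification copy */
};

/* Snapshot the buffer before its first change; later readers that want
 * the old state can look at the clone instead. */
static void demo_modify(struct demo_buf *b, unsigned i, uint8_t v)
{
	if (!b->clone) {
		b->clone = malloc(sizeof(b->data));
		if (!b->clone)
			abort();          /* the kernel uses __GFP_NOFAIL */
		memcpy(b->clone, b->data, sizeof(b->data));
	}
	b->data[i] = v;                   /* clone keeps the old value */
}

int main(void)
{
	struct demo_buf b = { .data = { 1, 2, 3 }, .clone = NULL };

	demo_modify(&b, 0, 9);
	printf("live=%d snapshot=%d\n", b.data[0], b.clone[0]);
	free(b.clone);
	return 0;
}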
2214 
2215 /**
2216  * gfs2_rgrp_dump - print out an rgrp
2217  * @seq: The iterator
2218  * @rgd: The rgrp in question
2219  * @fs_id_buf: pointer to file system id (if requested)
2220  *
2221  */
2222 
2223 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2224 		    const char *fs_id_buf)
2225 {
2226 	struct gfs2_blkreserv *trs;
2227 	const struct rb_node *n;
2228 
2229 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2230 		       fs_id_buf,
2231 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2232 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2233 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2234 	if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) {
2235 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2236 
2237 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2238 			       be32_to_cpu(rgl->rl_flags),
2239 			       be32_to_cpu(rgl->rl_free),
2240 			       be32_to_cpu(rgl->rl_dinodes));
2241 	}
2242 	spin_lock(&rgd->rd_rsspin);
2243 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2244 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2245 		dump_rs(seq, trs, fs_id_buf);
2246 	}
2247 	spin_unlock(&rgd->rd_rsspin);
2248 }
2249 
2250 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2251 {
2252 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2253 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2254 
2255 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2256 		(unsigned long long)rgd->rd_addr);
2257 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2258 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2259 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2260 	rgd->rd_flags |= GFS2_RDF_ERROR;
2261 }
2262 
2263 /**
2264  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2265  * @ip: The inode we have just allocated blocks for
2266  * @rbm: The start of the allocated blocks
2267  * @len: The extent length
2268  *
2269  * Adjusts a reservation after an allocation has taken place. If the
2270  * reservation does not match the allocation, or if it is now empty
2271  * then it is removed.
2272  */
2273 
2274 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2275 				    const struct gfs2_rbm *rbm, unsigned len)
2276 {
2277 	struct gfs2_blkreserv *rs = &ip->i_res;
2278 	struct gfs2_rgrpd *rgd = rbm->rgd;
2279 	unsigned rlen;
2280 	u64 block;
2281 	int ret;
2282 
2283 	spin_lock(&rgd->rd_rsspin);
2284 	if (gfs2_rs_active(rs)) {
2285 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2286 			block = gfs2_rbm_to_block(rbm);
2287 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2288 			rlen = min(rs->rs_free, len);
2289 			rs->rs_free -= rlen;
2290 			rgd->rd_reserved -= rlen;
2291 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2292 			if (rs->rs_free && !ret)
2293 				goto out;
2294 			/* We used up our block reservation, so we should
2295 			   reserve more blocks next time. */
2296 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2297 		}
2298 		__rs_deltree(rs);
2299 	}
2300 out:
2301 	spin_unlock(&rgd->rd_rsspin);
2302 }
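/*
 * Editorial illustration, not part of this file: the adjustment above
 * shrinks a reservation by what was just claimed and advances it past the
 * allocated run, dropping it when it no longer matches or is empty. A
 * hedged userspace sketch of that bookkeeping, without locking or
 * rbtrees; struct demo_rs and demo_claim() are invented names.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_rs {
	uint64_t start;                  /* first still-reserved block */
	uint32_t free;                   /* blocks still reserved */
};

/* After allocating 'len' blocks at 'block', shrink a matching
 * reservation and advance it past the allocated run; return nonzero if
 * the reservation should now be dropped. */
static int demo_claim(struct demo_rs *rs, uint64_t block, uint32_t len)
{
	uint32_t rlen;

	if (rs->free == 0 || block != rs->start)
		return 1;                /* mismatch: drop the reservation */

	rlen = len < rs->free ? len : rs->free;
	rs->start = block + len;         /* skip past what was used */
	rs->free -= rlen;

	return rs->free == 0;            /* empty: drop it */
}

int main(void)
{
	struct demo_rs rs = { .start = 100, .free = 32 };
	int drop;

	drop = demo_claim(&rs, 100, 8);
	printf("drop? %d (left %u)\n", drop, rs.free);
	drop = demo_claim(&rs, 108, 24);
	printf("drop? %d (left %u)\n", drop, rs.free);
	return 0;
}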
2303 
2304 /**
2305  * gfs2_set_alloc_start - Set starting point for block allocation
2306  * @rbm: The rbm which will be set to the required location
2307  * @ip: The gfs2 inode
2308  * @dinode: Flag to say if allocation includes a new inode
2309  *
2310  * This sets the starting point from the reservation if one is active
2311  * otherwise it falls back to guessing a start point based on the
2312  * inode's goal block or the last allocation point in the rgrp.
2313  */
2314 
2315 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2316 				 const struct gfs2_inode *ip, bool dinode)
2317 {
2318 	u64 goal;
2319 
2320 	if (gfs2_rs_active(&ip->i_res)) {
2321 		*rbm = ip->i_res.rs_rbm;
2322 		return;
2323 	}
2324 
2325 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2326 		goal = ip->i_goal;
2327 	else
2328 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2329 
2330 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2331 		rbm->bii = 0;
2332 		rbm->offset = 0;
2333 	}
2334 }
2335 
2336 /**
2337  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2338  * @ip: the inode to allocate the block for
2339  * @bn: Used to return the starting block number
2340  * @nblocks: requested number of blocks/extent length (value/result)
2341  * @dinode: 1 if we're allocating a dinode block, else 0
2342  * @generation: Used to return the generation number of the new dinode
2343  *
2344  * Returns: 0 or error
2345  */
2346 
2347 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2348 		      bool dinode, u64 *generation)
2349 {
2350 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2351 	struct buffer_head *dibh;
2352 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2353 	unsigned int ndata;
2354 	u64 block; /* block, within the file system scope */
2355 	u32 minext = 1;
2356 	int error;
2357 
2358 	gfs2_set_alloc_start(&rbm, ip, dinode);
2359 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, ip, false);
2360 
2361 	if (error == -ENOSPC) {
2362 		gfs2_set_alloc_start(&rbm, ip, dinode);
2363 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false);
2364 	}
2365 
2366 	/* Since all blocks are reserved in advance, this shouldn't happen */
2367 	if (error) {
2368 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2369 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2370 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2371 			rbm.rgd->rd_extfail_pt);
2372 		goto rgrp_error;
2373 	}
2374 
2375 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2376 	block = gfs2_rbm_to_block(&rbm);
2377 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2378 	if (gfs2_rs_active(&ip->i_res))
2379 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2380 	ndata = *nblocks;
2381 	if (dinode)
2382 		ndata--;
2383 
2384 	if (!dinode) {
2385 		ip->i_goal = block + ndata - 1;
2386 		error = gfs2_meta_inode_buffer(ip, &dibh);
2387 		if (error == 0) {
2388 			struct gfs2_dinode *di =
2389 				(struct gfs2_dinode *)dibh->b_data;
2390 			gfs2_trans_add_meta(ip->i_gl, dibh);
2391 			di->di_goal_meta = di->di_goal_data =
2392 				cpu_to_be64(ip->i_goal);
2393 			brelse(dibh);
2394 		}
2395 	}
2396 	if (rbm.rgd->rd_free < *nblocks) {
2397 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2398 		goto rgrp_error;
2399 	}
2400 
2401 	rbm.rgd->rd_free -= *nblocks;
2402 	if (dinode) {
2403 		rbm.rgd->rd_dinodes++;
2404 		*generation = rbm.rgd->rd_igeneration++;
2405 		if (*generation == 0)
2406 			*generation = rbm.rgd->rd_igeneration++;
2407 	}
2408 
2409 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2410 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2411 
2412 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2413 	if (dinode)
2414 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2415 
2416 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2417 
2418 	rbm.rgd->rd_free_clone -= *nblocks;
2419 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2420 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2421 	*bn = block;
2422 	return 0;
2423 
2424 rgrp_error:
2425 	gfs2_rgrp_error(rbm.rgd);
2426 	return -EIO;
2427 }
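/*
 * Editorial illustration, not part of this file: note how the code above
 * reads rd_igeneration twice when the first post-increment yields 0, so a
 * dinode generation of zero is never handed out. A tiny userspace sketch
 * of that skip-zero counter; demo_next_generation() is an invented name.
 */
#include <stdint.h>
#include <stdio.h>

/* Hand out a post-incremented counter value, but never 0. */
static uint64_t demo_next_generation(uint64_t *igeneration)
{
	uint64_t gen = (*igeneration)++;

	if (gen == 0)
		gen = (*igeneration)++;
	return gen;
}

int main(void)
{
	uint64_t counter = UINT64_MAX;    /* about to wrap to 0 */

	printf("%llu\n", (unsigned long long)demo_next_generation(&counter));
	printf("%llu\n", (unsigned long long)demo_next_generation(&counter));
	return 0;                         /* prints the max value, then 1 */
}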
2428 
2429 /**
2430  * __gfs2_free_blocks - free a contiguous run of block(s)
2431  * @ip: the inode these blocks are being freed from
2432  * @rgd: the resource group the blocks are in
2433  * @bstart: first block of a run of contiguous blocks
2434  * @blen: the length of the block run
2435  * @meta: 1 if the blocks represent metadata
2436  *
2437  */
2438 
2439 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2440 			u64 bstart, u32 blen, int meta)
2441 {
2442 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2443 
2444 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2445 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2446 	rgd->rd_free += blen;
2447 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2448 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2449 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2450 
2451 	/* Directories keep their data in the metadata address space */
2452 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2453 		gfs2_journal_wipe(ip, bstart, blen);
2454 }
2455 
2456 /**
2457  * gfs2_free_meta - free a contiguous run of metadata block(s)
2458  * @ip: the inode these blocks are being freed from
2459  * @rgd: the resource group the blocks are in
2460  * @bstart: first block of a run of contiguous blocks
2461  * @blen: the length of the block run
2462  *
2463  */
2464 
2465 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2466 		    u64 bstart, u32 blen)
2467 {
2468 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2469 
2470 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2471 	gfs2_statfs_change(sdp, 0, +blen, 0);
2472 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2473 }
2474 
2475 void gfs2_unlink_di(struct inode *inode)
2476 {
2477 	struct gfs2_inode *ip = GFS2_I(inode);
2478 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2479 	struct gfs2_rgrpd *rgd;
2480 	u64 blkno = ip->i_no_addr;
2481 
2482 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2483 	if (!rgd)
2484 		return;
2485 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2486 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2487 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2488 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2489 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2490 }
2491 
2492 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2493 {
2494 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2495 
2496 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2497 	if (!rgd->rd_dinodes)
2498 		gfs2_consist_rgrpd(rgd);
2499 	rgd->rd_dinodes--;
2500 	rgd->rd_free++;
2501 
2502 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2503 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2504 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2505 
2506 	gfs2_statfs_change(sdp, 0, +1, -1);
2507 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2508 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2509 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2510 }
2511 
2512 /**
2513  * gfs2_check_blk_type - Check the type of a block
2514  * @sdp: The superblock
2515  * @no_addr: The block number to check
2516  * @type: The block type we are looking for
2517  *
2518  * Returns: 0 if the block type matches the expected type
2519  *          -ESTALE if it doesn't match
2520  *          or -ve errno if something went wrong while checking
2521  */
2522 
2523 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2524 {
2525 	struct gfs2_rgrpd *rgd;
2526 	struct gfs2_holder rgd_gh;
2527 	struct gfs2_rbm rbm;
2528 	int error = -EINVAL;
2529 
2530 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2531 	if (!rgd)
2532 		goto fail;
2533 
2534 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2535 	if (error)
2536 		goto fail;
2537 
2538 	rbm.rgd = rgd;
2539 	error = gfs2_rbm_from_block(&rbm, no_addr);
2540 	if (!WARN_ON_ONCE(error)) {
2541 		if (gfs2_testbit(&rbm, false) != type)
2542 			error = -ESTALE;
2543 	}
2544 
2545 	gfs2_glock_dq_uninit(&rgd_gh);
2546 
2547 fail:
2548 	return error;
2549 }
2550 
2551 /**
2552  * gfs2_rlist_add - add a RG to a list of RGs
2553  * @ip: the inode
2554  * @rlist: the list of resource groups
2555  * @block: the block
2556  *
2557  * Figure out what RG a block belongs to and add that RG to the list
2558  *
2559  * FIXME: Don't use NOFAIL
2560  *
2561  */
2562 
2563 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2564 		    u64 block)
2565 {
2566 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2567 	struct gfs2_rgrpd *rgd;
2568 	struct gfs2_rgrpd **tmp;
2569 	unsigned int new_space;
2570 	unsigned int x;
2571 
2572 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2573 		return;
2574 
2575 	/*
2576 	 * The resource group last accessed is kept in the last position.
2577 	 */
2578 
2579 	if (rlist->rl_rgrps) {
2580 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2581 		if (rgrp_contains_block(rgd, block))
2582 			return;
2583 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2584 	} else {
2585 		rgd = ip->i_res.rs_rbm.rgd;
2586 		if (!rgd || !rgrp_contains_block(rgd, block))
2587 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2588 	}
2589 
2590 	if (!rgd) {
2591 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2592 		       (unsigned long long)block);
2593 		return;
2594 	}
2595 
2596 	for (x = 0; x < rlist->rl_rgrps; x++) {
2597 		if (rlist->rl_rgd[x] == rgd) {
2598 			swap(rlist->rl_rgd[x],
2599 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2600 			return;
2601 		}
2602 	}
2603 
2604 	if (rlist->rl_rgrps == rlist->rl_space) {
2605 		new_space = rlist->rl_space + 10;
2606 
2607 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2608 			      GFP_NOFS | __GFP_NOFAIL);
2609 
2610 		if (rlist->rl_rgd) {
2611 			memcpy(tmp, rlist->rl_rgd,
2612 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2613 			kfree(rlist->rl_rgd);
2614 		}
2615 
2616 		rlist->rl_space = new_space;
2617 		rlist->rl_rgd = tmp;
2618 	}
2619 
2620 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2621 }
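/*
 * Editorial illustration, not part of this file: gfs2_rlist_add() keeps
 * the most recently used group in the last slot for a cheap hit check,
 * swaps rediscovered entries into that slot, and grows its array in
 * chunks of 10. A self-contained userspace sketch of the same pattern
 * over plain ints; the demo_* names are invented, and realloc() stands in
 * for the kernel's kcalloc-and-copy.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_list {
	int *items;
	unsigned count, space;
};

static void demo_add(struct demo_list *l, int item)
{
	if (l->count && l->items[l->count - 1] == item)
		return;                           /* fast path: same as last */

	for (unsigned i = 0; i < l->count; i++) {
		if (l->items[i] == item) {        /* already present: */
			int t = l->items[i];      /* move to the MRU slot */
			l->items[i] = l->items[l->count - 1];
			l->items[l->count - 1] = t;
			return;
		}
	}

	if (l->count == l->space) {               /* grow in chunks of 10 */
		l->space += 10;
		l->items = realloc(l->items, l->space * sizeof(*l->items));
		if (!l->items)
			abort();                  /* kernel: __GFP_NOFAIL */
	}
	l->items[l->count++] = item;
}

int main(void)
{
	struct demo_list l = { NULL, 0, 0 };

	demo_add(&l, 7);
	demo_add(&l, 3);
	demo_add(&l, 7);                          /* swapped to the end */
	printf("count=%u last=%d\n", l.count, l.items[l.count - 1]);
	free(l.items);
	return 0;
}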
2622 
2623 /**
2624  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2625  *      and initialize an array of glock holders for them
2626  * @rlist: the list of resource groups
2627  *
2628  * FIXME: Don't use NOFAIL
2629  *
2630  */
2631 
2632 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2633 {
2634 	unsigned int x;
2635 
2636 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2637 				      sizeof(struct gfs2_holder),
2638 				      GFP_NOFS | __GFP_NOFAIL);
2639 	for (x = 0; x < rlist->rl_rgrps; x++)
2640 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2641 				LM_ST_EXCLUSIVE, 0,
2642 				&rlist->rl_ghs[x]);
2643 }
2644 
2645 /**
2646  * gfs2_rlist_free - free a resource group list
2647  * @rlist: the list of resource groups
2648  *
2649  */
2650 
2651 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2652 {
2653 	unsigned int x;
2654 
2655 	kfree(rlist->rl_rgd);
2656 
2657 	if (rlist->rl_ghs) {
2658 		for (x = 0; x < rlist->rl_rgrps; x++)
2659 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2660 		kfree(rlist->rl_ghs);
2661 		rlist->rl_ghs = NULL;
2662 	}
2663 }
2664 
2665