1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License version 2 as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
17  */
18 
19 
20 /*
21  * mballoc.c contains the multiblocks allocation routines
22  */
23 
24 #include "ext4_jbd2.h"
25 #include "mballoc.h"
26 #include <linux/log2.h>
27 #include <linux/module.h>
28 #include <linux/slab.h>
29 #include <linux/nospec.h>
30 #include <linux/backing-dev.h>
31 #include <trace/events/ext4.h>
32 
33 #ifdef CONFIG_EXT4_DEBUG
34 ushort ext4_mballoc_debug __read_mostly;
35 
36 module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
37 MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
38 #endif
39 
40 /*
41  * MUSTDO:
42  *   - test ext4_ext_search_left() and ext4_ext_search_right()
43  *   - search for metadata in few groups
44  *
45  * TODO v4:
46  *   - normalization should take into account whether file is still open
47  *   - discard preallocations if no free space left (policy?)
48  *   - don't normalize tails
49  *   - quota
50  *   - reservation for superuser
51  *
52  * TODO v3:
53  *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
54  *   - track min/max extents in each group for better group selection
55  *   - mb_mark_used() may allocate chunk right after splitting buddy
56  *   - tree of groups sorted by number of free blocks
57  *   - error handling
58  */
59 
60 /*
61  * The allocation request involves a request for multiple blocks near
62  * the specified goal (block) value.
63  *
64  * During initialization phase of the allocator we decide to use the
65  * group preallocation or inode preallocation depending on the size of
66  * the file. The size of the file could be the resulting file size we
67  * would have after allocation, or the current file size, whichever
68  * is larger. If the size is less than sbi->s_mb_stream_request we
69  * select to use the group preallocation. The default value of
70  * s_mb_stream_request is 16 blocks. This can also be tuned via
71  * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
72  * terms of number of blocks.
73  *
74  * The main motivation for having small files use group preallocation is to
75  * ensure that we have small files closer together on the disk.
76  *
77  * In the first stage the allocator looks at the inode prealloc list,
78  * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
79  * spaces for this particular inode. The inode prealloc space is
80  * represented as:
81  *
82  * pa_lstart -> the logical start block for this prealloc space
83  * pa_pstart -> the physical start block for this prealloc space
84  * pa_len    -> length for this prealloc space (in clusters)
85  * pa_free   ->  free space available in this prealloc space (in clusters)
86  *
87  * The inode preallocation space is used by looking at the _logical_ start
88  * block. Only if the logical file block falls within the range of a prealloc
89  * space do we consume that particular prealloc space. This makes sure that
90  * we have contiguous physical blocks representing the file blocks.
91  *
92  * The important thing to note about inode prealloc space is that
93  * we don't modify the values associated with it except
94  * pa_free.
95  *
96  * If we are not able to find blocks in the inode prealloc space and if we
97  * have the group allocation flag set then we look at the locality group
98  * prealloc space. This is a per-CPU prealloc list, represented as
99  *
100  * ext4_sb_info.s_locality_groups[smp_processor_id()]
101  *
102  * The reason for having a per cpu locality group is to reduce the contention
103  * between CPUs. It is possible to get scheduled at this point.
104  *
105  * The locality group prealloc space is used by looking at whether we have
106  * enough free space (pa_free) within the prealloc space.
107  *
108  * If we can't allocate blocks via inode prealloc and/or locality group
109  * prealloc then we look at the buddy cache. The buddy cache is represented
110  * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
111  * mapped to the buddy and bitmap information regarding different
112  * groups. The buddy information is attached to buddy cache inode so that
113  * we can access them through the page cache. The information regarding
114  * each group is loaded via ext4_mb_load_buddy.  The information involves the
115  * block bitmap and the buddy information, which are stored in the
116  * inode as:
117  *
118  *  {                        page                        }
119  *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
120  *
121  *
122  * one block each for bitmap and buddy information.  So for each group we
123  * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
124  * blocksize) blocks.  So it can hold information regarding groups_per_page
125  * groups, which is blocks_per_page/2.
126  *
127  * The buddy cache inode is not stored on disk. The inode is thrown
128  * away when the filesystem is unmounted.
129  *
130  * We look for count blocks in the buddy cache. If we were able
131  * to locate that many free blocks we return with additional information
132  * regarding the rest of the contiguous physical blocks available.
133  *
134  * Before allocating blocks via the buddy cache we normalize the request
135  * blocks. This ensures we ask for more blocks than we need. The extra
136  * blocks that we get after allocation are added to the respective prealloc
137  * list. In case of inode preallocation we follow a list of heuristics
138  * based on file size. This can be found in ext4_mb_normalize_request. If
139  * we are doing a group prealloc we try to normalize the request to
140  * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
141  * dependent on the cluster size; for non-bigalloc file systems, it is
142  * 512 blocks. This can be tuned via
143  * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
144  * terms of number of blocks. If we have mounted the file system with -O
145  * stripe=<value> option the group prealloc request is normalized to the
146  * smallest multiple of the stripe value (sbi->s_stripe) which is
147  * greater than the default mb_group_prealloc.
148  *
149  * The regular allocator (using the buddy cache) supports a few tunables.
150  *
151  * /sys/fs/ext4/<partition>/mb_min_to_scan
152  * /sys/fs/ext4/<partition>/mb_max_to_scan
153  * /sys/fs/ext4/<partition>/mb_order2_req
154  *
155  * The regular allocator uses buddy scan only if the request len is a power of
156  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
157  * value of s_mb_order2_reqs can be tuned via
158  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
159  * stripe size (sbi->s_stripe), we try to search for contiguous blocks of
160  * stripe size. This should result in better allocation on RAID setups. If
161  * not, we search in the specific group using the bitmap for the best extents. The
162  * tunables min_to_scan and max_to_scan control the behaviour here.
163  * min_to_scan indicates how long mballoc __must__ look for a best
164  * extent and max_to_scan indicates how long mballoc __can__ look for a
165  * best extent in the found extents. Searching for the blocks starts with
166  * the group specified as the goal value in allocation context via
167  * ac_g_ex. Each group is first checked based on the criteria whether it
168  * can be used for allocation. ext4_mb_good_group explains how the groups are
169  * checked.
170  *
171  * Both prealloc spaces are populated as described above. So for the first
172  * request we will hit the buddy cache, which will result in the prealloc
173  * space getting filled. The prealloc space is then used for
174  * subsequent requests.
175  */
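
/*
 * Editor's illustrative sketch (not part of the original source): the
 * group-vs-inode preallocation decision described above, reduced to its
 * core.  The helper name and plain-int parameters are hypothetical; the
 * real code makes this choice on the allocation context using
 * sbi->s_mb_stream_request (16 blocks by default, tunable via
 * /sys/fs/ext4/<partition>/mb_stream_req).
 */
static inline int example_use_group_prealloc(unsigned int resulting_size,
					     unsigned int current_size,
					     unsigned int mb_stream_request)
{
	/* the file size considered is the larger of the two, in blocks */
	unsigned int size = resulting_size > current_size ?
				resulting_size : current_size;

	/* small files go to the per-CPU locality group preallocation */
	return size < mb_stream_request;
}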
176 
177 /*
178  * mballoc operates on the following data:
179  *  - on-disk bitmap
180  *  - in-core buddy (actually includes buddy and bitmap)
181  *  - preallocation descriptors (PAs)
182  *
183  * there are two types of preallocations:
184  *  - inode
185  *    assigned to a specific inode and can be used for this inode only.
186  *    it describes part of the inode's space preallocated to specific
187  *    physical blocks. any block from that preallocation can be used
188  *    independently. the descriptor just tracks the number of blocks left
189  *    unused. so, before taking some block from the descriptor, one must
190  *    make sure the corresponding logical block isn't allocated yet. this
191  *    also means that freeing any block within the descriptor's range
192  *    must discard all preallocated blocks.
193  *  - locality group
194  *    assigned to a specific locality group which does not translate to a
195  *    permanent set of inodes: an inode can join and leave the group. space
196  *    from this type of preallocation can be used for any inode. thus
197  *    it's consumed from the beginning to the end.
198  *
199  * relation between them can be expressed as:
200  *    in-core buddy = on-disk bitmap + preallocation descriptors
201  *
202  * this means the blocks mballoc considers used are:
203  *  - allocated blocks (persistent)
204  *  - preallocated blocks (non-persistent)
205  *
206  * consistency in mballoc world means that at any time a block is either
207  * free or used in ALL structures. notice: "any time" should not be read
208  * literally -- time is discrete and delimited by locks.
209  *
210  *  to keep it simple, we don't use block numbers, instead we count number of
211  *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
212  *
213  * all operations can be expressed as:
214  *  - init buddy:			buddy = on-disk + PAs
215  *  - new PA:				buddy += N; PA = N
216  *  - use inode PA:			on-disk += N; PA -= N
217  *  - discard inode PA			buddy -= on-disk - PA; PA = 0
218  *  - use locality group PA		on-disk += N; PA -= N
219  *  - discard locality group PA		buddy -= PA; PA = 0
220  *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
221  *        is used in real operation because we can't know actual used
222  *        bits from PA, only from on-disk bitmap
223  *
224  * if we follow this strict logic, then all operations above should be atomic.
225  * given some of them can block, we'd have to use something like semaphores
226  * killing performance on high-end SMP hardware. let's try to relax it using
227  * the following knowledge:
228  *  1) if buddy is referenced, it's already initialized
229  *  2) while block is used in buddy and the buddy is referenced,
230  *     nobody can re-allocate that block
231  *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has a
232  *     bit set and a PA claims the same block, it's OK. IOW, one can set a bit
233  *     in the on-disk bitmap if the buddy has the same bit set and/or a PA
234  *     covers the corresponding block
235  *
236  * so, now we're building a concurrency table:
237  *  - init buddy vs.
238  *    - new PA
239  *      blocks for PA are allocated in the buddy, buddy must be referenced
240  *      until PA is linked to allocation group to avoid concurrent buddy init
241  *    - use inode PA
242  *      we need to make sure that either on-disk bitmap or PA has uptodate data
243  *      given (3) we care that PA-=N operation doesn't interfere with init
244  *    - discard inode PA
245  *      the simplest way would be to have buddy initialized by the discard
246  *    - use locality group PA
247  *      again PA-=N must be serialized with init
248  *    - discard locality group PA
249  *      the simplest way would be to have buddy initialized by the discard
250  *  - new PA vs.
251  *    - use inode PA
252  *      i_data_sem serializes them
253  *    - discard inode PA
254  *      discard process must wait until PA isn't used by another process
255  *    - use locality group PA
256  *      some mutex should serialize them
257  *    - discard locality group PA
258  *      discard process must wait until PA isn't used by another process
259  *  - use inode PA
260  *    - use inode PA
261  *      i_data_sem or another mutex should serialize them
262  *    - discard inode PA
263  *      discard process must wait until PA isn't used by another process
264  *    - use locality group PA
265  *      nothing wrong here -- they're different PAs covering different blocks
266  *    - discard locality group PA
267  *      discard process must wait until PA isn't used by another process
268  *
269  * now we're ready to draw a few conclusions:
270  *  - while a PA is referenced, no discard is possible
271  *  - a PA is referenced until the block is marked in the on-disk bitmap
272  *  - PA changes only after on-disk bitmap
273  *  - discard must not compete with init. either init is done before
274  *    any discard or they're serialized somehow
275  *  - buddy init as sum of on-disk bitmap and PAs is done atomically
276  *
277  * a special case is when we've used a PA down to emptiness. no need to modify
278  * the buddy in this case, but we should take care about concurrent init
279  *
280  */
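
/*
 * Editor's illustrative sketch (not part of the original source): the
 * accounting identities listed above, restated over plain counters.  The
 * struct and helper names are hypothetical; they only encode the rule
 * "in-core buddy = on-disk bitmap + preallocation descriptors".
 */
struct example_mb_counters {
	int ondisk;	/* blocks marked used in the on-disk bitmap */
	int buddy;	/* blocks marked used in the in-core buddy */
	int pa;		/* blocks held by preallocation descriptors */
};

/* new PA: buddy += N; PA += N (the on-disk bitmap is untouched) */
static inline void example_new_pa(struct example_mb_counters *c, int n)
{
	c->buddy += n;
	c->pa += n;
}

/* use a PA: on-disk += N; PA -= N (the buddy already counted these blocks) */
static inline void example_use_pa(struct example_mb_counters *c, int n)
{
	c->ondisk += n;
	c->pa -= n;
}

/* the consistency rule stated above: buddy == on-disk + PA at all times */
static inline int example_counters_consistent(const struct example_mb_counters *c)
{
	return c->buddy == c->ondisk + c->pa;
}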
281 
282  /*
283  * Logic in a few words:
284  *
285  *  - allocation:
286  *    load group
287  *    find blocks
288  *    mark bits in on-disk bitmap
289  *    release group
290  *
291  *  - use preallocation:
292  *    find proper PA (per-inode or group)
293  *    load group
294  *    mark bits in on-disk bitmap
295  *    release group
296  *    release PA
297  *
298  *  - free:
299  *    load group
300  *    mark bits in on-disk bitmap
301  *    release group
302  *
303  *  - discard preallocations in group:
304  *    mark PAs deleted
305  *    move them onto local list
306  *    load on-disk bitmap
307  *    load group
308  *    remove PA from object (inode or locality group)
309  *    mark free blocks in-core
310  *
311  *  - discard inode's preallocations:
312  */
313 
314 /*
315  * Locking rules
316  *
317  * Locks:
318  *  - bitlock on a group	(group)
319  *  - object (inode/locality)	(object)
320  *  - per-pa lock		(pa)
321  *
322  * Paths:
323  *  - new pa
324  *    object
325  *    group
326  *
327  *  - find and use pa:
328  *    pa
329  *
330  *  - release consumed pa:
331  *    pa
332  *    group
333  *    object
334  *
335  *  - generate in-core bitmap:
336  *    group
337  *        pa
338  *
339  *  - discard all for given object (inode, locality group):
340  *    object
341  *        pa
342  *    group
343  *
344  *  - discard all for given group:
345  *    group
346  *        pa
347  *    group
348  *        object
349  *
350  */
351 static struct kmem_cache *ext4_pspace_cachep;
352 static struct kmem_cache *ext4_ac_cachep;
353 static struct kmem_cache *ext4_free_data_cachep;
354 
355 /* We create slab caches for groupinfo data structures based on the
356  * superblock block size.  There will be one per mounted filesystem for
357  * each unique s_blocksize_bits */
358 #define NR_GRPINFO_CACHES 8
359 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
360 
361 static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
362 	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
363 	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
364 	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
365 };
366 
367 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
368 					ext4_group_t group);
369 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
370 						ext4_group_t group);
371 static void ext4_free_data_callback(struct super_block *sb,
372 				struct ext4_journal_cb_entry *jce, int rc);
373 
374 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
375 {
376 #if BITS_PER_LONG == 64
377 	*bit += ((unsigned long) addr & 7UL) << 3;
378 	addr = (void *) ((unsigned long) addr & ~7UL);
379 #elif BITS_PER_LONG == 32
380 	*bit += ((unsigned long) addr & 3UL) << 3;
381 	addr = (void *) ((unsigned long) addr & ~3UL);
382 #else
383 #error "how many bits you are?!"
384 #endif
385 	return addr;
386 }
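
/*
 * Editor's worked example (illustrative, not part of the original source):
 * on a 64-bit build, if addr sits 5 bytes past an 8-byte boundary
 * ((unsigned long)addr & 7UL == 5) and the caller asked for bit 2, the
 * corrected position is 5 * 8 + 2 == 42 counted from the aligned address.
 * The hypothetical helper below repeats that arithmetic with plain numbers.
 */
static inline int example_corrected_bit_64(unsigned long misalignment, int bit)
{
	/* same shift as mb_correct_addr_and_bit() for BITS_PER_LONG == 64 */
	return bit + ((int)(misalignment & 7UL) << 3);
}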
387 
388 static inline int mb_test_bit(int bit, void *addr)
389 {
390 	/*
391 	 * ext4_test_bit on architecture like powerpc
392 	 * needs unsigned long aligned address
393 	 */
394 	addr = mb_correct_addr_and_bit(&bit, addr);
395 	return ext4_test_bit(bit, addr);
396 }
397 
398 static inline void mb_set_bit(int bit, void *addr)
399 {
400 	addr = mb_correct_addr_and_bit(&bit, addr);
401 	ext4_set_bit(bit, addr);
402 }
403 
404 static inline void mb_clear_bit(int bit, void *addr)
405 {
406 	addr = mb_correct_addr_and_bit(&bit, addr);
407 	ext4_clear_bit(bit, addr);
408 }
409 
410 static inline int mb_test_and_clear_bit(int bit, void *addr)
411 {
412 	addr = mb_correct_addr_and_bit(&bit, addr);
413 	return ext4_test_and_clear_bit(bit, addr);
414 }
415 
416 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
417 {
418 	int fix = 0, ret, tmpmax;
419 	addr = mb_correct_addr_and_bit(&fix, addr);
420 	tmpmax = max + fix;
421 	start += fix;
422 
423 	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
424 	if (ret > max)
425 		return max;
426 	return ret;
427 }
428 
429 static inline int mb_find_next_bit(void *addr, int max, int start)
430 {
431 	int fix = 0, ret, tmpmax;
432 	addr = mb_correct_addr_and_bit(&fix, addr);
433 	tmpmax = max + fix;
434 	start += fix;
435 
436 	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
437 	if (ret > max)
438 		return max;
439 	return ret;
440 }
441 
442 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
443 {
444 	char *bb;
445 
446 	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
447 	BUG_ON(max == NULL);
448 
449 	if (order > e4b->bd_blkbits + 1) {
450 		*max = 0;
451 		return NULL;
452 	}
453 
454 	/* at order 0 we see each particular block */
455 	if (order == 0) {
456 		*max = 1 << (e4b->bd_blkbits + 3);
457 		return e4b->bd_bitmap;
458 	}
459 
460 	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
461 	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
462 
463 	return bb;
464 }
465 
466 #ifdef DOUBLE_CHECK
467 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
468 			   int first, int count)
469 {
470 	int i;
471 	struct super_block *sb = e4b->bd_sb;
472 
473 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
474 		return;
475 	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
476 	for (i = 0; i < count; i++) {
477 		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
478 			ext4_fsblk_t blocknr;
479 
480 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
481 			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
482 			ext4_grp_locked_error(sb, e4b->bd_group,
483 					      inode ? inode->i_ino : 0,
484 					      blocknr,
485 					      "freeing block already freed "
486 					      "(bit %u)",
487 					      first + i);
488 		}
489 		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
490 	}
491 }
492 
493 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
494 {
495 	int i;
496 
497 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
498 		return;
499 	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
500 	for (i = 0; i < count; i++) {
501 		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
502 		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
503 	}
504 }
505 
506 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
507 {
508 	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
509 		unsigned char *b1, *b2;
510 		int i;
511 		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
512 		b2 = (unsigned char *) bitmap;
513 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
514 			if (b1[i] != b2[i]) {
515 				ext4_msg(e4b->bd_sb, KERN_ERR,
516 					 "corruption in group %u "
517 					 "at byte %u(%u): %x in copy != %x "
518 					 "on disk/prealloc",
519 					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
520 				BUG();
521 			}
522 		}
523 	}
524 }
525 
526 #else
527 static inline void mb_free_blocks_double(struct inode *inode,
528 				struct ext4_buddy *e4b, int first, int count)
529 {
530 	return;
531 }
532 static inline void mb_mark_used_double(struct ext4_buddy *e4b,
533 						int first, int count)
534 {
535 	return;
536 }
537 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
538 {
539 	return;
540 }
541 #endif
542 
543 #ifdef AGGRESSIVE_CHECK
544 
545 #define MB_CHECK_ASSERT(assert)						\
546 do {									\
547 	if (!(assert)) {						\
548 		printk(KERN_EMERG					\
549 			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
550 			function, file, line, # assert);		\
551 		BUG();							\
552 	}								\
553 } while (0)
554 
555 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
556 				const char *function, int line)
557 {
558 	struct super_block *sb = e4b->bd_sb;
559 	int order = e4b->bd_blkbits + 1;
560 	int max;
561 	int max2;
562 	int i;
563 	int j;
564 	int k;
565 	int count;
566 	struct ext4_group_info *grp;
567 	int fragments = 0;
568 	int fstart;
569 	struct list_head *cur;
570 	void *buddy;
571 	void *buddy2;
572 
573 	{
574 		static int mb_check_counter;
575 		if (mb_check_counter++ % 100 != 0)
576 			return 0;
577 	}
578 
579 	while (order > 1) {
580 		buddy = mb_find_buddy(e4b, order, &max);
581 		MB_CHECK_ASSERT(buddy);
582 		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
583 		MB_CHECK_ASSERT(buddy2);
584 		MB_CHECK_ASSERT(buddy != buddy2);
585 		MB_CHECK_ASSERT(max * 2 == max2);
586 
587 		count = 0;
588 		for (i = 0; i < max; i++) {
589 
590 			if (mb_test_bit(i, buddy)) {
591 				/* only single bit in buddy2 may be 1 */
592 				if (!mb_test_bit(i << 1, buddy2)) {
593 					MB_CHECK_ASSERT(
594 						mb_test_bit((i<<1)+1, buddy2));
595 				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
596 					MB_CHECK_ASSERT(
597 						mb_test_bit(i << 1, buddy2));
598 				}
599 				continue;
600 			}
601 
602 			/* both bits in buddy2 must be 1 */
603 			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
604 			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
605 
606 			for (j = 0; j < (1 << order); j++) {
607 				k = (i * (1 << order)) + j;
608 				MB_CHECK_ASSERT(
609 					!mb_test_bit(k, e4b->bd_bitmap));
610 			}
611 			count++;
612 		}
613 		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
614 		order--;
615 	}
616 
617 	fstart = -1;
618 	buddy = mb_find_buddy(e4b, 0, &max);
619 	for (i = 0; i < max; i++) {
620 		if (!mb_test_bit(i, buddy)) {
621 			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
622 			if (fstart == -1) {
623 				fragments++;
624 				fstart = i;
625 			}
626 			continue;
627 		}
628 		fstart = -1;
629 		/* check used bits only */
630 		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
631 			buddy2 = mb_find_buddy(e4b, j, &max2);
632 			k = i >> j;
633 			MB_CHECK_ASSERT(k < max2);
634 			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
635 		}
636 	}
637 	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
638 	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
639 
640 	grp = ext4_get_group_info(sb, e4b->bd_group);
641 	list_for_each(cur, &grp->bb_prealloc_list) {
642 		ext4_group_t groupnr;
643 		struct ext4_prealloc_space *pa;
644 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
645 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
646 		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
647 		for (i = 0; i < pa->pa_len; i++)
648 			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
649 	}
650 	return 0;
651 }
652 #undef MB_CHECK_ASSERT
653 #define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
654 					__FILE__, __func__, __LINE__)
655 #else
656 #define mb_check_buddy(e4b)
657 #endif
658 
659 /*
660  * Divide the blocks starting at @first with length @len into
661  * smaller chunks with power-of-2 sizes.
662  * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
663  * then increase bb_counters[] for the corresponding chunk size.
664  */
665 static void ext4_mb_mark_free_simple(struct super_block *sb,
666 				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
667 					struct ext4_group_info *grp)
668 {
669 	struct ext4_sb_info *sbi = EXT4_SB(sb);
670 	ext4_grpblk_t min;
671 	ext4_grpblk_t max;
672 	ext4_grpblk_t chunk;
673 	unsigned int border;
674 
675 	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
676 
677 	border = 2 << sb->s_blocksize_bits;
678 
679 	while (len > 0) {
680 		/* find how many blocks can be covered since this position */
681 		max = ffs(first | border) - 1;
682 
683 		/* find how many blocks of power 2 we need to mark */
684 		min = fls(len) - 1;
685 
686 		if (max < min)
687 			min = max;
688 		chunk = 1 << min;
689 
690 		/* mark multiblock chunks only */
691 		grp->bb_counters[min]++;
692 		if (min > 0)
693 			mb_clear_bit(first >> min,
694 				     buddy + sbi->s_mb_offsets[min]);
695 
696 		len -= chunk;
697 		first += chunk;
698 	}
699 }
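
/*
 * Editor's worked example (illustrative, not part of the original source):
 * with a large @border, splitting first == 5, len == 11 proceeds as
 * order 0 at 5, order 1 at 6, order 3 at 8: alignment (ffs) limits the
 * first two chunks and the remaining length (fls) limits the last one.
 * The hypothetical helper below computes the order chosen in one round.
 */
static inline int example_free_chunk_order(int first, int len, int border)
{
	int max = ffs(first | border) - 1;	/* largest order the alignment allows */
	int min = fls(len) - 1;			/* largest order that fits in len */

	return min < max ? min : max;		/* order of the chunk marked this round */
}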
700 
701 /*
702  * Cache the order of the largest free extent we have available in this block
703  * group.
704  */
705 static void
706 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
707 {
708 	int i;
709 	int bits;
710 
711 	grp->bb_largest_free_order = -1; /* uninit */
712 
713 	bits = sb->s_blocksize_bits + 1;
714 	for (i = bits; i >= 0; i--) {
715 		if (grp->bb_counters[i] > 0) {
716 			grp->bb_largest_free_order = i;
717 			break;
718 		}
719 	}
720 }
721 
722 static noinline_for_stack
723 void ext4_mb_generate_buddy(struct super_block *sb,
724 				void *buddy, void *bitmap, ext4_group_t group)
725 {
726 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
727 	struct ext4_sb_info *sbi = EXT4_SB(sb);
728 	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
729 	ext4_grpblk_t i = 0;
730 	ext4_grpblk_t first;
731 	ext4_grpblk_t len;
732 	unsigned free = 0;
733 	unsigned fragments = 0;
734 	unsigned long long period = get_cycles();
735 
736 	/* initialize buddy from bitmap which is an aggregation
737 	 * of on-disk bitmap and preallocations */
738 	i = mb_find_next_zero_bit(bitmap, max, 0);
739 	grp->bb_first_free = i;
740 	while (i < max) {
741 		fragments++;
742 		first = i;
743 		i = mb_find_next_bit(bitmap, max, i);
744 		len = i - first;
745 		free += len;
746 		if (len > 1)
747 			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
748 		else
749 			grp->bb_counters[0]++;
750 		if (i < max)
751 			i = mb_find_next_zero_bit(bitmap, max, i);
752 	}
753 	grp->bb_fragments = fragments;
754 
755 	if (free != grp->bb_free) {
756 		ext4_grp_locked_error(sb, group, 0, 0,
757 				      "block bitmap and bg descriptor "
758 				      "inconsistent: %u vs %u free clusters",
759 				      free, grp->bb_free);
760 		/*
761 		 * If we intend to continue, we consider group descriptor
762 		 * corrupt and update bb_free using bitmap value
763 		 */
764 		grp->bb_free = free;
765 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
766 			percpu_counter_sub(&sbi->s_freeclusters_counter,
767 					   grp->bb_free);
768 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
769 	}
770 	mb_set_largest_free_order(sb, grp);
771 
772 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
773 
774 	period = get_cycles() - period;
775 	spin_lock(&EXT4_SB(sb)->s_bal_lock);
776 	EXT4_SB(sb)->s_mb_buddies_generated++;
777 	EXT4_SB(sb)->s_mb_generation_time += period;
778 	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
779 }
780 
781 static void mb_regenerate_buddy(struct ext4_buddy *e4b)
782 {
783 	int count;
784 	int order = 1;
785 	void *buddy;
786 
787 	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
788 		ext4_set_bits(buddy, 0, count);
789 	}
790 	e4b->bd_info->bb_fragments = 0;
791 	memset(e4b->bd_info->bb_counters, 0,
792 		sizeof(*e4b->bd_info->bb_counters) *
793 		(e4b->bd_sb->s_blocksize_bits + 2));
794 
795 	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
796 		e4b->bd_bitmap, e4b->bd_group);
797 }
798 
799 /* The buddy information is attached to the buddy cache inode
800  * for convenience. The information regarding each group
801  * is loaded via ext4_mb_load_buddy. The information involves the
802  * block bitmap and the buddy information, which are
803  * stored in the inode as
804  *
805  * {                        page                        }
806  * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
807  *
808  *
809  * one block each for bitmap and buddy information.
810  * So for each group we take up 2 blocks. A page can
811  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
812  * So it can have information regarding groups_per_page which
813  * is blocks_per_page/2
814  *
815  * Locking note:  This routine takes the block group lock of all groups
816  * for this page; do not hold this lock when calling this routine!
817  */
818 
819 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
820 {
821 	ext4_group_t ngroups;
822 	int blocksize;
823 	int blocks_per_page;
824 	int groups_per_page;
825 	int err = 0;
826 	int i;
827 	ext4_group_t first_group, group;
828 	int first_block;
829 	struct super_block *sb;
830 	struct buffer_head *bhs;
831 	struct buffer_head **bh = NULL;
832 	struct inode *inode;
833 	char *data;
834 	char *bitmap;
835 	struct ext4_group_info *grinfo;
836 
837 	mb_debug(1, "init page %lu\n", page->index);
838 
839 	inode = page->mapping->host;
840 	sb = inode->i_sb;
841 	ngroups = ext4_get_groups_count(sb);
842 	blocksize = 1 << inode->i_blkbits;
843 	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
844 
845 	groups_per_page = blocks_per_page >> 1;
846 	if (groups_per_page == 0)
847 		groups_per_page = 1;
848 
849 	/* allocate buffer_heads to read bitmaps */
850 	if (groups_per_page > 1) {
851 		i = sizeof(struct buffer_head *) * groups_per_page;
852 		bh = kzalloc(i, gfp);
853 		if (bh == NULL) {
854 			err = -ENOMEM;
855 			goto out;
856 		}
857 	} else
858 		bh = &bhs;
859 
860 	first_group = page->index * blocks_per_page / 2;
861 
862 	/* read all groups the page covers into the cache */
863 	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
864 		if (group >= ngroups)
865 			break;
866 
867 		grinfo = ext4_get_group_info(sb, group);
868 		/*
869 		 * If page is uptodate then we came here after online resize
870 		 * which added some new uninitialized group info structs, so
871 		 * we must skip all initialized uptodate buddies on the page,
872 		 * which may be currently in use by an allocating task.
873 		 */
874 		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
875 			bh[i] = NULL;
876 			continue;
877 		}
878 		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
879 		if (IS_ERR(bh[i])) {
880 			err = PTR_ERR(bh[i]);
881 			bh[i] = NULL;
882 			goto out;
883 		}
884 		mb_debug(1, "read bitmap for group %u\n", group);
885 	}
886 
887 	/* wait for I/O completion */
888 	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
889 		int err2;
890 
891 		if (!bh[i])
892 			continue;
893 		err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
894 		if (!err)
895 			err = err2;
896 	}
897 
898 	first_block = page->index * blocks_per_page;
899 	for (i = 0; i < blocks_per_page; i++) {
900 		group = (first_block + i) >> 1;
901 		if (group >= ngroups)
902 			break;
903 
904 		if (!bh[group - first_group])
905 			/* skip initialized uptodate buddy */
906 			continue;
907 
908 		if (!buffer_verified(bh[group - first_group]))
909 			/* Skip faulty bitmaps */
910 			continue;
911 		err = 0;
912 
913 		/*
914 		 * data carries information regarding this
915 		 * particular group in the format specified
916 		 * above
917 		 *
918 		 */
919 		data = page_address(page) + (i * blocksize);
920 		bitmap = bh[group - first_group]->b_data;
921 
922 		/*
923 		 * We place the buddy block and bitmap block
924 		 * close together
925 		 */
926 		if ((first_block + i) & 1) {
927 			/* this is block of buddy */
928 			BUG_ON(incore == NULL);
929 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
930 				group, page->index, i * blocksize);
931 			trace_ext4_mb_buddy_bitmap_load(sb, group);
932 			grinfo = ext4_get_group_info(sb, group);
933 			grinfo->bb_fragments = 0;
934 			memset(grinfo->bb_counters, 0,
935 			       sizeof(*grinfo->bb_counters) *
936 				(sb->s_blocksize_bits+2));
937 			/*
938 			 * incore got set to the group block bitmap below
939 			 */
940 			ext4_lock_group(sb, group);
941 			/* init the buddy */
942 			memset(data, 0xff, blocksize);
943 			ext4_mb_generate_buddy(sb, data, incore, group);
944 			ext4_unlock_group(sb, group);
945 			incore = NULL;
946 		} else {
947 			/* this is block of bitmap */
948 			BUG_ON(incore != NULL);
949 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
950 				group, page->index, i * blocksize);
951 			trace_ext4_mb_bitmap_load(sb, group);
952 
953 			/* see comments in ext4_mb_put_pa() */
954 			ext4_lock_group(sb, group);
955 			memcpy(data, bitmap, blocksize);
956 
957 			/* mark all preallocated blks used in in-core bitmap */
958 			ext4_mb_generate_from_pa(sb, data, group);
959 			ext4_mb_generate_from_freelist(sb, data, group);
960 			ext4_unlock_group(sb, group);
961 
962 			/* set incore so that the buddy information can be
963 			 * generated using this
964 			 */
965 			incore = data;
966 		}
967 	}
968 	SetPageUptodate(page);
969 
970 out:
971 	if (bh) {
972 		for (i = 0; i < groups_per_page; i++)
973 			brelse(bh[i]);
974 		if (bh != &bhs)
975 			kfree(bh);
976 	}
977 	return err;
978 }
979 
980 /*
981  * Lock the buddy and bitmap pages. This makes sure that a parallel init_group
982  * on the same buddy page doesn't happen while holding the buddy page lock.
983  * Return the locked buddy and bitmap pages in the e4b struct. If the buddy and
984  * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
985  */
986 static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
987 		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
988 {
989 	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
990 	int block, pnum, poff;
991 	int blocks_per_page;
992 	struct page *page;
993 
994 	e4b->bd_buddy_page = NULL;
995 	e4b->bd_bitmap_page = NULL;
996 
997 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
998 	/*
999 	 * the buddy cache inode stores the block bitmap
1000 	 * and buddy information in consecutive blocks.
1001 	 * So for each group we need two blocks.
1002 	 */
1003 	block = group * 2;
1004 	pnum = block / blocks_per_page;
1005 	poff = block % blocks_per_page;
1006 	page = find_or_create_page(inode->i_mapping, pnum, gfp);
1007 	if (!page)
1008 		return -ENOMEM;
1009 	BUG_ON(page->mapping != inode->i_mapping);
1010 	e4b->bd_bitmap_page = page;
1011 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1012 
1013 	if (blocks_per_page >= 2) {
1014 		/* buddy and bitmap are on the same page */
1015 		return 0;
1016 	}
1017 
1018 	block++;
1019 	pnum = block / blocks_per_page;
1020 	page = find_or_create_page(inode->i_mapping, pnum, gfp);
1021 	if (!page)
1022 		return -ENOMEM;
1023 	BUG_ON(page->mapping != inode->i_mapping);
1024 	e4b->bd_buddy_page = page;
1025 	return 0;
1026 }
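
/*
 * Editor's illustrative sketch (not part of the original source): the
 * group -> page mapping used above, assuming a 4k page and a 1k block size
 * purely for the example.  Group 5 then owns blocks 10 (bitmap) and 11
 * (buddy) of the buddy cache inode, i.e. page 2 at offsets 2 and 3.
 */
static inline void example_buddy_cache_position(int group, int blocksize,
						int *pnum, int *poff)
{
	int blocks_per_page = 4096 / blocksize;	/* page size assumed to be 4k */
	int block = group * 2;			/* bitmap block; the buddy is block + 1 */

	*pnum = block / blocks_per_page;	/* page index within s_buddy_cache */
	*poff = block % blocks_per_page;	/* block offset inside that page */
}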
1027 
1028 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1029 {
1030 	if (e4b->bd_bitmap_page) {
1031 		unlock_page(e4b->bd_bitmap_page);
1032 		page_cache_release(e4b->bd_bitmap_page);
1033 	}
1034 	if (e4b->bd_buddy_page) {
1035 		unlock_page(e4b->bd_buddy_page);
1036 		page_cache_release(e4b->bd_buddy_page);
1037 	}
1038 }
1039 
1040 /*
1041  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1042  * block group lock of all groups for this page; do not hold the BG lock when
1043  * calling this routine!
1044  */
1045 static noinline_for_stack
1046 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1047 {
1048 
1049 	struct ext4_group_info *this_grp;
1050 	struct ext4_buddy e4b;
1051 	struct page *page;
1052 	int ret = 0;
1053 
1054 	might_sleep();
1055 	mb_debug(1, "init group %u\n", group);
1056 	this_grp = ext4_get_group_info(sb, group);
1057 	/*
1058 	 * This ensures that we don't reinit the buddy cache
1059 	 * page which maps to the group from which we are already
1060 	 * allocating. If we are looking at the buddy cache we would
1061 	 * have taken a reference using ext4_mb_load_buddy and that
1062 	 * would have pinned buddy page to page cache.
1063 	 * The call to ext4_mb_get_buddy_page_lock will mark the
1064 	 * page accessed.
1065 	 */
1066 	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1067 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1068 		/*
1069 		 * somebody initialized the group
1070 		 * return without doing anything
1071 		 */
1072 		goto err;
1073 	}
1074 
1075 	page = e4b.bd_bitmap_page;
1076 	ret = ext4_mb_init_cache(page, NULL, gfp);
1077 	if (ret)
1078 		goto err;
1079 	if (!PageUptodate(page)) {
1080 		ret = -EIO;
1081 		goto err;
1082 	}
1083 
1084 	if (e4b.bd_buddy_page == NULL) {
1085 		/*
1086 		 * If both the bitmap and buddy are in
1087 		 * the same page we don't need to force
1088 		 * init the buddy
1089 		 */
1090 		ret = 0;
1091 		goto err;
1092 	}
1093 	/* init buddy cache */
1094 	page = e4b.bd_buddy_page;
1095 	ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1096 	if (ret)
1097 		goto err;
1098 	if (!PageUptodate(page)) {
1099 		ret = -EIO;
1100 		goto err;
1101 	}
1102 err:
1103 	ext4_mb_put_buddy_page_lock(&e4b);
1104 	return ret;
1105 }
1106 
1107 /*
1108  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1109  * block group lock of all groups for this page; do not hold the BG lock when
1110  * calling this routine!
1111  */
1112 static noinline_for_stack int
1113 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1114 		       struct ext4_buddy *e4b, gfp_t gfp)
1115 {
1116 	int blocks_per_page;
1117 	int block;
1118 	int pnum;
1119 	int poff;
1120 	struct page *page;
1121 	int ret;
1122 	struct ext4_group_info *grp;
1123 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1124 	struct inode *inode = sbi->s_buddy_cache;
1125 
1126 	might_sleep();
1127 	mb_debug(1, "load group %u\n", group);
1128 
1129 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1130 	grp = ext4_get_group_info(sb, group);
1131 
1132 	e4b->bd_blkbits = sb->s_blocksize_bits;
1133 	e4b->bd_info = grp;
1134 	e4b->bd_sb = sb;
1135 	e4b->bd_group = group;
1136 	e4b->bd_buddy_page = NULL;
1137 	e4b->bd_bitmap_page = NULL;
1138 
1139 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1140 		/*
1141 		 * we need full data about the group
1142 		 * to make a good selection
1143 		 */
1144 		ret = ext4_mb_init_group(sb, group, gfp);
1145 		if (ret)
1146 			return ret;
1147 	}
1148 
1149 	/*
1150 	 * the buddy cache inode stores the block bitmap
1151 	 * and buddy information in consecutive blocks.
1152 	 * So for each group we need two blocks.
1153 	 */
1154 	block = group * 2;
1155 	pnum = block / blocks_per_page;
1156 	poff = block % blocks_per_page;
1157 
1158 	/* we could use find_or_create_page(), but it locks the page,
1159 	 * which we'd like to avoid in the fast path ... */
1160 	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1161 	if (page == NULL || !PageUptodate(page)) {
1162 		if (page)
1163 			/*
1164 			 * drop the page reference and try
1165 			 * to get the page with lock. If we
1166 			 * are not uptodate that implies
1167 			 * somebody just created the page but
1168 			 * is yet to initialize the same. So
1169 			 * wait for it to initialize.
1170 			 */
1171 			page_cache_release(page);
1172 		page = find_or_create_page(inode->i_mapping, pnum, gfp);
1173 		if (page) {
1174 			BUG_ON(page->mapping != inode->i_mapping);
1175 			if (!PageUptodate(page)) {
1176 				ret = ext4_mb_init_cache(page, NULL, gfp);
1177 				if (ret) {
1178 					unlock_page(page);
1179 					goto err;
1180 				}
1181 				mb_cmp_bitmaps(e4b, page_address(page) +
1182 					       (poff * sb->s_blocksize));
1183 			}
1184 			unlock_page(page);
1185 		}
1186 	}
1187 	if (page == NULL) {
1188 		ret = -ENOMEM;
1189 		goto err;
1190 	}
1191 	if (!PageUptodate(page)) {
1192 		ret = -EIO;
1193 		goto err;
1194 	}
1195 
1196 	/* Pages marked accessed already */
1197 	e4b->bd_bitmap_page = page;
1198 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1199 
1200 	block++;
1201 	pnum = block / blocks_per_page;
1202 	poff = block % blocks_per_page;
1203 
1204 	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1205 	if (page == NULL || !PageUptodate(page)) {
1206 		if (page)
1207 			page_cache_release(page);
1208 		page = find_or_create_page(inode->i_mapping, pnum, gfp);
1209 		if (page) {
1210 			BUG_ON(page->mapping != inode->i_mapping);
1211 			if (!PageUptodate(page)) {
1212 				ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1213 							 gfp);
1214 				if (ret) {
1215 					unlock_page(page);
1216 					goto err;
1217 				}
1218 			}
1219 			unlock_page(page);
1220 		}
1221 	}
1222 	if (page == NULL) {
1223 		ret = -ENOMEM;
1224 		goto err;
1225 	}
1226 	if (!PageUptodate(page)) {
1227 		ret = -EIO;
1228 		goto err;
1229 	}
1230 
1231 	/* Pages marked accessed already */
1232 	e4b->bd_buddy_page = page;
1233 	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1234 
1235 	BUG_ON(e4b->bd_bitmap_page == NULL);
1236 	BUG_ON(e4b->bd_buddy_page == NULL);
1237 
1238 	return 0;
1239 
1240 err:
1241 	if (page)
1242 		page_cache_release(page);
1243 	if (e4b->bd_bitmap_page)
1244 		page_cache_release(e4b->bd_bitmap_page);
1245 	if (e4b->bd_buddy_page)
1246 		page_cache_release(e4b->bd_buddy_page);
1247 	e4b->bd_buddy = NULL;
1248 	e4b->bd_bitmap = NULL;
1249 	return ret;
1250 }
1251 
1252 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1253 			      struct ext4_buddy *e4b)
1254 {
1255 	return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1256 }
1257 
1258 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1259 {
1260 	if (e4b->bd_bitmap_page)
1261 		page_cache_release(e4b->bd_bitmap_page);
1262 	if (e4b->bd_buddy_page)
1263 		page_cache_release(e4b->bd_buddy_page);
1264 }
1265 
1266 
1267 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1268 {
1269 	int order = 1;
1270 	int bb_incr = 1 << (e4b->bd_blkbits - 1);
1271 	void *bb;
1272 
1273 	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1274 	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1275 
1276 	bb = e4b->bd_buddy;
1277 	while (order <= e4b->bd_blkbits + 1) {
1278 		block = block >> 1;
1279 		if (!mb_test_bit(block, bb)) {
1280 			/* this block is part of buddy of order 'order' */
1281 			return order;
1282 		}
1283 		bb += bb_incr;
1284 		bb_incr >>= 1;
1285 		order++;
1286 	}
1287 	return 0;
1288 }
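
/*
 * Editor's illustrative sketch (not part of the original source): where the
 * per-order bitmaps that the walk above steps through live inside the buddy
 * block.  With 4k blocks (bd_blkbits == 12) the order-1 bitmap starts at
 * byte 0 and takes 2048 bytes, order-2 starts at 2048 and takes 1024, and
 * so on, each level half the size of the previous one.
 */
static inline int example_buddy_order_offset(int blkbits, int order)
{
	int bb_incr = 1 << (blkbits - 1);	/* bytes used by the order-1 bitmap */
	int off = 0;
	int o;

	for (o = 1; o < order; o++) {		/* skip the lower-order bitmaps */
		off += bb_incr;
		bb_incr >>= 1;
	}
	return off;				/* byte offset of the order-@order bitmap */
}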
1289 
1290 static void mb_clear_bits(void *bm, int cur, int len)
1291 {
1292 	__u32 *addr;
1293 
1294 	len = cur + len;
1295 	while (cur < len) {
1296 		if ((cur & 31) == 0 && (len - cur) >= 32) {
1297 			/* fast path: clear whole word at once */
1298 			addr = bm + (cur >> 3);
1299 			*addr = 0;
1300 			cur += 32;
1301 			continue;
1302 		}
1303 		mb_clear_bit(cur, bm);
1304 		cur++;
1305 	}
1306 }
1307 
1308 /* clear bits in given range
1309  * will return first found zero bit if any, -1 otherwise
1310  */
1311 static int mb_test_and_clear_bits(void *bm, int cur, int len)
1312 {
1313 	__u32 *addr;
1314 	int zero_bit = -1;
1315 
1316 	len = cur + len;
1317 	while (cur < len) {
1318 		if ((cur & 31) == 0 && (len - cur) >= 32) {
1319 			/* fast path: clear whole word at once */
1320 			addr = bm + (cur >> 3);
1321 			if (*addr != (__u32)(-1) && zero_bit == -1)
1322 				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1323 			*addr = 0;
1324 			cur += 32;
1325 			continue;
1326 		}
1327 		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1328 			zero_bit = cur;
1329 		cur++;
1330 	}
1331 
1332 	return zero_bit;
1333 }
1334 
1335 void ext4_set_bits(void *bm, int cur, int len)
1336 {
1337 	__u32 *addr;
1338 
1339 	len = cur + len;
1340 	while (cur < len) {
1341 		if ((cur & 31) == 0 && (len - cur) >= 32) {
1342 			/* fast path: set whole word at once */
1343 			addr = bm + (cur >> 3);
1344 			*addr = 0xffffffff;
1345 			cur += 32;
1346 			continue;
1347 		}
1348 		mb_set_bit(cur, bm);
1349 		cur++;
1350 	}
1351 }
1352 
1353 /*
1354  * _________________________________________________________________ */
1355 
1356 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1357 {
1358 	if (mb_test_bit(*bit + side, bitmap)) {
1359 		mb_clear_bit(*bit, bitmap);
1360 		(*bit) -= side;
1361 		return 1;
1362 	}
1363 	else {
1364 		(*bit) += side;
1365 		mb_set_bit(*bit, bitmap);
1366 		return -1;
1367 	}
1368 }
1369 
1370 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1371 {
1372 	int max;
1373 	int order = 1;
1374 	void *buddy = mb_find_buddy(e4b, order, &max);
1375 
1376 	while (buddy) {
1377 		void *buddy2;
1378 
1379 		/* Bits in range [first; last] are known to be set since
1380 		 * corresponding blocks were allocated. Bits in range
1381 		 * (first; last) will stay set because they form buddies on
1382 		 * upper layer. We just deal with borders if they don't
1383 		 * align with upper layer and then go up.
1384 		 * Releasing entire group is all about clearing
1385 		 * single bit of highest order buddy.
1386 		 */
1387 
1388 		/* Example:
1389 		 * ---------------------------------
1390 		 * |   1   |   1   |   1   |   1   |
1391 		 * ---------------------------------
1392 		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1393 		 * ---------------------------------
1394 		 *   0   1   2   3   4   5   6   7
1395 		 *      \_____________________/
1396 		 *
1397 		 * Neither [1] nor [6] is aligned to above layer.
1398 		 * Left neighbour [0] is free, so mark it busy,
1399 		 * decrease bb_counters and extend range to
1400 		 * [0; 6]
1401 		 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1402 		 * mark [6] free, increase bb_counters and shrink range to
1403 		 * [0; 5].
1404 		 * Then shift range to [0; 2], go up and do the same.
1405 		 */
1406 
1407 
1408 		if (first & 1)
1409 			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1410 		if (!(last & 1))
1411 			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1412 		if (first > last)
1413 			break;
1414 		order++;
1415 
1416 		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1417 			mb_clear_bits(buddy, first, last - first + 1);
1418 			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1419 			break;
1420 		}
1421 		first >>= 1;
1422 		last >>= 1;
1423 		buddy = buddy2;
1424 	}
1425 }
1426 
1427 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1428 			   int first, int count)
1429 {
1430 	int left_is_free = 0;
1431 	int right_is_free = 0;
1432 	int block;
1433 	int last = first + count - 1;
1434 	struct super_block *sb = e4b->bd_sb;
1435 
1436 	if (WARN_ON(count == 0))
1437 		return;
1438 	BUG_ON(last >= (sb->s_blocksize << 3));
1439 	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1440 	/* Don't bother if the block group is corrupt. */
1441 	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1442 		return;
1443 
1444 	mb_check_buddy(e4b);
1445 	mb_free_blocks_double(inode, e4b, first, count);
1446 
1447 	e4b->bd_info->bb_free += count;
1448 	if (first < e4b->bd_info->bb_first_free)
1449 		e4b->bd_info->bb_first_free = first;
1450 
1451 	/* access memory sequentially: check left neighbour,
1452 	 * clear range and then check right neighbour
1453 	 */
1454 	if (first != 0)
1455 		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1456 	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1457 	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1458 		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1459 
1460 	if (unlikely(block != -1)) {
1461 		struct ext4_sb_info *sbi = EXT4_SB(sb);
1462 		ext4_fsblk_t blocknr;
1463 
1464 		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1465 		blocknr += EXT4_C2B(EXT4_SB(sb), block);
1466 		ext4_grp_locked_error(sb, e4b->bd_group,
1467 				      inode ? inode->i_ino : 0,
1468 				      blocknr,
1469 				      "freeing already freed block "
1470 				      "(bit %u); block bitmap corrupt.",
1471 				      block);
1472 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1473 			percpu_counter_sub(&sbi->s_freeclusters_counter,
1474 					   e4b->bd_info->bb_free);
1475 		/* Mark the block group as corrupt. */
1476 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1477 			&e4b->bd_info->bb_state);
1478 		mb_regenerate_buddy(e4b);
1479 		goto done;
1480 	}
1481 
1482 	/* let's maintain fragments counter */
1483 	if (left_is_free && right_is_free)
1484 		e4b->bd_info->bb_fragments--;
1485 	else if (!left_is_free && !right_is_free)
1486 		e4b->bd_info->bb_fragments++;
1487 
1488 	/* buddy[0] == bd_bitmap is a special case, so handle
1489 	 * it right away and let mb_buddy_mark_free stay free of
1490 	 * zero order checks.
1491 	 * Check if neighbours are to be coalesced,
1492 	 * adjust bitmap bb_counters and borders appropriately.
1493 	 */
1494 	if (first & 1) {
1495 		first += !left_is_free;
1496 		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1497 	}
1498 	if (!(last & 1)) {
1499 		last -= !right_is_free;
1500 		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1501 	}
1502 
1503 	if (first <= last)
1504 		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1505 
1506 done:
1507 	mb_set_largest_free_order(sb, e4b->bd_info);
1508 	mb_check_buddy(e4b);
1509 }
1510 
1511 static int mb_find_extent(struct ext4_buddy *e4b, int block,
1512 				int needed, struct ext4_free_extent *ex)
1513 {
1514 	int next = block;
1515 	int max, order;
1516 	void *buddy;
1517 
1518 	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1519 	BUG_ON(ex == NULL);
1520 
1521 	buddy = mb_find_buddy(e4b, 0, &max);
1522 	BUG_ON(buddy == NULL);
1523 	BUG_ON(block >= max);
1524 	if (mb_test_bit(block, buddy)) {
1525 		ex->fe_len = 0;
1526 		ex->fe_start = 0;
1527 		ex->fe_group = 0;
1528 		return 0;
1529 	}
1530 
1531 	/* find actual order */
1532 	order = mb_find_order_for_block(e4b, block);
1533 	block = block >> order;
1534 
1535 	ex->fe_len = 1 << order;
1536 	ex->fe_start = block << order;
1537 	ex->fe_group = e4b->bd_group;
1538 
1539 	/* calc difference from given start */
1540 	next = next - ex->fe_start;
1541 	ex->fe_len -= next;
1542 	ex->fe_start += next;
1543 
1544 	while (needed > ex->fe_len &&
1545 	       mb_find_buddy(e4b, order, &max)) {
1546 
1547 		if (block + 1 >= max)
1548 			break;
1549 
1550 		next = (block + 1) * (1 << order);
1551 		if (mb_test_bit(next, e4b->bd_bitmap))
1552 			break;
1553 
1554 		order = mb_find_order_for_block(e4b, next);
1555 
1556 		block = next >> order;
1557 		ex->fe_len += 1 << order;
1558 	}
1559 
1560 	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1561 	return ex->fe_len;
1562 }
1563 
1564 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1565 {
1566 	int ord;
1567 	int mlen = 0;
1568 	int max = 0;
1569 	int cur;
1570 	int start = ex->fe_start;
1571 	int len = ex->fe_len;
1572 	unsigned ret = 0;
1573 	int len0 = len;
1574 	void *buddy;
1575 
1576 	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1577 	BUG_ON(e4b->bd_group != ex->fe_group);
1578 	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1579 	mb_check_buddy(e4b);
1580 	mb_mark_used_double(e4b, start, len);
1581 
1582 	e4b->bd_info->bb_free -= len;
1583 	if (e4b->bd_info->bb_first_free == start)
1584 		e4b->bd_info->bb_first_free += len;
1585 
1586 	/* let's maintain fragments counter */
1587 	if (start != 0)
1588 		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1589 	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1590 		max = !mb_test_bit(start + len, e4b->bd_bitmap);
1591 	if (mlen && max)
1592 		e4b->bd_info->bb_fragments++;
1593 	else if (!mlen && !max)
1594 		e4b->bd_info->bb_fragments--;
1595 
1596 	/* let's maintain buddy itself */
1597 	while (len) {
1598 		ord = mb_find_order_for_block(e4b, start);
1599 
1600 		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1601 			/* the whole chunk may be allocated at once! */
1602 			mlen = 1 << ord;
1603 			buddy = mb_find_buddy(e4b, ord, &max);
1604 			BUG_ON((start >> ord) >= max);
1605 			mb_set_bit(start >> ord, buddy);
1606 			e4b->bd_info->bb_counters[ord]--;
1607 			start += mlen;
1608 			len -= mlen;
1609 			BUG_ON(len < 0);
1610 			continue;
1611 		}
1612 
1613 		/* store for history */
1614 		if (ret == 0)
1615 			ret = len | (ord << 16);
1616 
1617 		/* we have to split large buddy */
1618 		BUG_ON(ord <= 0);
1619 		buddy = mb_find_buddy(e4b, ord, &max);
1620 		mb_set_bit(start >> ord, buddy);
1621 		e4b->bd_info->bb_counters[ord]--;
1622 
1623 		ord--;
1624 		cur = (start >> ord) & ~1U;
1625 		buddy = mb_find_buddy(e4b, ord, &max);
1626 		mb_clear_bit(cur, buddy);
1627 		mb_clear_bit(cur + 1, buddy);
1628 		e4b->bd_info->bb_counters[ord]++;
1629 		e4b->bd_info->bb_counters[ord]++;
1630 	}
1631 	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1632 
1633 	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1634 	mb_check_buddy(e4b);
1635 
1636 	return ret;
1637 }
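
/*
 * Editor's illustrative sketch (not part of the original source): the
 * bb_counters[] bookkeeping for one split step in mb_mark_used() above.
 * Consuming part of an order-@ord buddy removes that buddy from
 * bb_counters[ord] and exposes its two order-(ord - 1) halves, which the
 * loop then continues to carve up.
 */
static inline void example_split_counters(int *bb_counters, int ord)
{
	bb_counters[ord]--;		/* the whole order-ord buddy is no longer free */
	bb_counters[ord - 1] += 2;	/* ...but both of its halves are, for now */
}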
1638 
1639 /*
1640  * Must be called under group lock!
1641  */
1642 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1643 					struct ext4_buddy *e4b)
1644 {
1645 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1646 	int ret;
1647 
1648 	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1649 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1650 
1651 	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1652 	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1653 	ret = mb_mark_used(e4b, &ac->ac_b_ex);
1654 
1655 	/* preallocation can change ac_b_ex, thus we store actually
1656 	 * allocated blocks for history */
1657 	ac->ac_f_ex = ac->ac_b_ex;
1658 
1659 	ac->ac_status = AC_STATUS_FOUND;
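	/* mb_mark_used() packed the leftover (tail) length in the low 16
	 * bits and the order of the buddy it had to split in the upper
	 * bits; keep both for allocation history */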
1660 	ac->ac_tail = ret & 0xffff;
1661 	ac->ac_buddy = ret >> 16;
1662 
1663 	/*
1664 	 * take the page reference. We want the page to be pinned
1665 	 * so that we don't get an ext4_mb_init_cache() call for this
1666 	 * group until we update the bitmap. That would mean we
1667 	 * double allocate blocks. The reference is dropped
1668 	 * in ext4_mb_release_context
1669 	 */
1670 	ac->ac_bitmap_page = e4b->bd_bitmap_page;
1671 	get_page(ac->ac_bitmap_page);
1672 	ac->ac_buddy_page = e4b->bd_buddy_page;
1673 	get_page(ac->ac_buddy_page);
1674 	/* store last allocated for subsequent stream allocation */
1675 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1676 		spin_lock(&sbi->s_md_lock);
1677 		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1678 		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1679 		spin_unlock(&sbi->s_md_lock);
1680 	}
1681 }
1682 
1683 /*
1684  * regular allocator, for general purposes allocation
1685  */
1686 
1687 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1688 					struct ext4_buddy *e4b,
1689 					int finish_group)
1690 {
1691 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1692 	struct ext4_free_extent *bex = &ac->ac_b_ex;
1693 	struct ext4_free_extent *gex = &ac->ac_g_ex;
1694 	struct ext4_free_extent ex;
1695 	int max;
1696 
1697 	if (ac->ac_status == AC_STATUS_FOUND)
1698 		return;
1699 	/*
1700 	 * We don't want to scan for a whole year
1701 	 */
1702 	if (ac->ac_found > sbi->s_mb_max_to_scan &&
1703 			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1704 		ac->ac_status = AC_STATUS_BREAK;
1705 		return;
1706 	}
1707 
1708 	/*
1709 	 * Haven't found a good chunk so far, let's continue
1710 	 */
1711 	if (bex->fe_len < gex->fe_len)
1712 		return;
1713 
1714 	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1715 			&& bex->fe_group == e4b->bd_group) {
1716 		/* recheck chunk's availability - we don't know
1717 		 * when it was found (within this lock-unlock
1718 		 * period or not) */
1719 		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1720 		if (max >= gex->fe_len) {
1721 			ext4_mb_use_best_found(ac, e4b);
1722 			return;
1723 		}
1724 	}
1725 }
1726 
1727 /*
1728  * The routine checks whether the found extent is good enough. If it is,
1729  * then the extent gets marked used and a flag is set in the context
1730  * to stop scanning. Otherwise, the extent is compared with the
1731  * previously found extent, and if the new one is better, it is stored
1732  * in the context. Later, the best found extent will be used if
1733  * mballoc can't find a good enough extent.
1734  *
1735  * FIXME: the real allocation policy is yet to be designed!
1736  */
1737 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1738 					struct ext4_free_extent *ex,
1739 					struct ext4_buddy *e4b)
1740 {
1741 	struct ext4_free_extent *bex = &ac->ac_b_ex;
1742 	struct ext4_free_extent *gex = &ac->ac_g_ex;
1743 
1744 	BUG_ON(ex->fe_len <= 0);
1745 	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1746 	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1747 	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1748 
1749 	ac->ac_found++;
1750 
1751 	/*
1752 	 * The special case - take what you catch first
1753 	 */
1754 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1755 		*bex = *ex;
1756 		ext4_mb_use_best_found(ac, e4b);
1757 		return;
1758 	}
1759 
1760 	/*
1761 	 * Let's check whether the chunk is good enough
1762 	 */
1763 	if (ex->fe_len == gex->fe_len) {
1764 		*bex = *ex;
1765 		ext4_mb_use_best_found(ac, e4b);
1766 		return;
1767 	}
1768 
1769 	/*
1770 	 * If this is the first found extent, just store it in the context
1771 	 */
1772 	if (bex->fe_len == 0) {
1773 		*bex = *ex;
1774 		return;
1775 	}
1776 
1777 	/*
1778 	 * If new found extent is better, store it in the context
1779 	 */
1780 	if (bex->fe_len < gex->fe_len) {
1781 		/* if the request isn't satisfied, any found extent
1782 		 * larger than previous best one is better */
1783 		if (ex->fe_len > bex->fe_len)
1784 			*bex = *ex;
1785 	} else if (ex->fe_len > gex->fe_len) {
1786 		/* if the request is satisfied, then we try to find
1787 		 * an extent that still satisfies the request, but is
1788 		 * smaller than the previous one */
1789 		if (ex->fe_len < bex->fe_len)
1790 			*bex = *ex;
1791 	}
1792 
1793 	ext4_mb_check_limits(ac, e4b, 0);
1794 }
1795 
1796 static noinline_for_stack
1797 int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1798 					struct ext4_buddy *e4b)
1799 {
1800 	struct ext4_free_extent ex = ac->ac_b_ex;
1801 	ext4_group_t group = ex.fe_group;
1802 	int max;
1803 	int err;
1804 
1805 	BUG_ON(ex.fe_len <= 0);
1806 	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1807 	if (err)
1808 		return err;
1809 
1810 	ext4_lock_group(ac->ac_sb, group);
1811 	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1812 
1813 	if (max > 0) {
1814 		ac->ac_b_ex = ex;
1815 		ext4_mb_use_best_found(ac, e4b);
1816 	}
1817 
1818 	ext4_unlock_group(ac->ac_sb, group);
1819 	ext4_mb_unload_buddy(e4b);
1820 
1821 	return 0;
1822 }
1823 
1824 static noinline_for_stack
1825 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1826 				struct ext4_buddy *e4b)
1827 {
1828 	ext4_group_t group = ac->ac_g_ex.fe_group;
1829 	int max;
1830 	int err;
1831 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1832 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1833 	struct ext4_free_extent ex;
1834 
1835 	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1836 		return 0;
1837 	if (grp->bb_free == 0)
1838 		return 0;
1839 
1840 	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1841 	if (err)
1842 		return err;
1843 
1844 	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1845 		ext4_mb_unload_buddy(e4b);
1846 		return 0;
1847 	}
1848 
1849 	ext4_lock_group(ac->ac_sb, group);
1850 	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1851 			     ac->ac_g_ex.fe_len, &ex);
1852 	ex.fe_logical = 0xDEADFA11; /* debug value */
1853 
1854 	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1855 		ext4_fsblk_t start;
1856 
1857 		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1858 			ex.fe_start;
1859 		/* use do_div to get remainder (would be 64-bit modulo) */
1860 		if (do_div(start, sbi->s_stripe) == 0) {
1861 			ac->ac_found++;
1862 			ac->ac_b_ex = ex;
1863 			ext4_mb_use_best_found(ac, e4b);
1864 		}
1865 	} else if (max >= ac->ac_g_ex.fe_len) {
1866 		BUG_ON(ex.fe_len <= 0);
1867 		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1868 		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1869 		ac->ac_found++;
1870 		ac->ac_b_ex = ex;
1871 		ext4_mb_use_best_found(ac, e4b);
1872 	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1873 		/* Sometimes, the caller may want to merge even a small
1874 		 * number of blocks into an existing extent */
1875 		BUG_ON(ex.fe_len <= 0);
1876 		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1877 		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1878 		ac->ac_found++;
1879 		ac->ac_b_ex = ex;
1880 		ext4_mb_use_best_found(ac, e4b);
1881 	}
1882 	ext4_unlock_group(ac->ac_sb, group);
1883 	ext4_mb_unload_buddy(e4b);
1884 
1885 	return 0;
1886 }
1887 
1888 /*
1889  * The routine scans the buddy structures (not the bitmap!) from the given
1890  * order to the max order and tries to find a big enough chunk to satisfy the request
1891  */
1892 static noinline_for_stack
1893 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1894 					struct ext4_buddy *e4b)
1895 {
1896 	struct super_block *sb = ac->ac_sb;
1897 	struct ext4_group_info *grp = e4b->bd_info;
1898 	void *buddy;
1899 	int i;
1900 	int k;
1901 	int max;
1902 
1903 	BUG_ON(ac->ac_2order <= 0);
1904 	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1905 		if (grp->bb_counters[i] == 0)
1906 			continue;
1907 
1908 		buddy = mb_find_buddy(e4b, i, &max);
1909 		BUG_ON(buddy == NULL);
1910 
1911 		k = mb_find_next_zero_bit(buddy, max, 0);
1912 		BUG_ON(k >= max);
1913 
1914 		ac->ac_found++;
1915 
1916 		ac->ac_b_ex.fe_len = 1 << i;
1917 		ac->ac_b_ex.fe_start = k << i;
1918 		ac->ac_b_ex.fe_group = e4b->bd_group;
1919 
1920 		ext4_mb_use_best_found(ac, e4b);
1921 
1922 		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1923 
1924 		if (EXT4_SB(sb)->s_mb_stats)
1925 			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1926 
1927 		break;
1928 	}
1929 }
1930 
1931 /*
1932  * The routine scans the group and measures all found extents.
1933  * In order to optimize scanning, the caller must pass the number of
1934  * free blocks in the group, so the routine can know the upper limit.
1935  */
1936 static noinline_for_stack
1937 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1938 					struct ext4_buddy *e4b)
1939 {
1940 	struct super_block *sb = ac->ac_sb;
1941 	void *bitmap = e4b->bd_bitmap;
1942 	struct ext4_free_extent ex;
1943 	int i;
1944 	int free;
1945 
1946 	free = e4b->bd_info->bb_free;
1947 	if (WARN_ON(free <= 0))
1948 		return;
1949 
1950 	i = e4b->bd_info->bb_first_free;
1951 
1952 	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1953 		i = mb_find_next_zero_bit(bitmap,
1954 						EXT4_CLUSTERS_PER_GROUP(sb), i);
1955 		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1956 			/*
1957 			 * If we have a corrupt bitmap, we won't find any
1958 			 * free blocks even though group info says we
1959 			 * have free blocks
1960 			 */
1961 			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1962 					"%d free clusters as per "
1963 					"group info. But bitmap says 0",
1964 					free);
1965 			break;
1966 		}
1967 
1968 		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1969 		if (WARN_ON(ex.fe_len <= 0))
1970 			break;
1971 		if (free < ex.fe_len) {
1972 			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1973 					"%d free clusters as per "
1974 					"group info. But got %d blocks",
1975 					free, ex.fe_len);
1976 			/*
1977 			 * The number of free blocks differs. This most likely
1978 			 * indicates that the bitmap is corrupt. So exit
1979 			 * without claiming the space.
1980 			 */
1981 			break;
1982 		}
1983 		ex.fe_logical = 0xDEADC0DE; /* debug value */
1984 		ext4_mb_measure_extent(ac, &ex, e4b);
1985 
1986 		i += ex.fe_len;
1987 		free -= ex.fe_len;
1988 	}
1989 
1990 	ext4_mb_check_limits(ac, e4b, 1);
1991 }
1992 
1993 /*
1994  * This is a special case for storage like RAID5;
1995  * we try to find stripe-aligned chunks for stripe-size-multiple requests
1996  */
1997 static noinline_for_stack
1998 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1999 				 struct ext4_buddy *e4b)
2000 {
2001 	struct super_block *sb = ac->ac_sb;
2002 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2003 	void *bitmap = e4b->bd_bitmap;
2004 	struct ext4_free_extent ex;
2005 	ext4_fsblk_t first_group_block;
2006 	ext4_fsblk_t a;
2007 	ext4_grpblk_t i;
2008 	int max;
2009 
2010 	BUG_ON(sbi->s_stripe == 0);
2011 
2012 	/* find first stripe-aligned block in group */
2013 	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2014 
2015 	a = first_group_block + sbi->s_stripe - 1;
2016 	do_div(a, sbi->s_stripe);
2017 	i = (a * sbi->s_stripe) - first_group_block;
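	/*
	 * i is now the group-relative offset of the first stripe-aligned
	 * block; e.g. first_group_block = 100 and s_stripe = 16 give
	 * a = 7, so i = 7 * 16 - 100 = 12 (absolute block 112).
	 */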
2018 
2019 	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2020 		if (!mb_test_bit(i, bitmap)) {
2021 			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2022 			if (max >= sbi->s_stripe) {
2023 				ac->ac_found++;
2024 				ex.fe_logical = 0xDEADF00D; /* debug value */
2025 				ac->ac_b_ex = ex;
2026 				ext4_mb_use_best_found(ac, e4b);
2027 				break;
2028 			}
2029 		}
2030 		i += sbi->s_stripe;
2031 	}
2032 }
2033 
2034 /*
2035  * This is now called BEFORE we load the buddy bitmap.
2036  * Returns 1 if the group is suitable for the allocation and 0 if it
2037  * is not. In addition it can also return a negative error code
2038  * when something goes wrong.
2039  */
2040 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2041 				ext4_group_t group, int cr)
2042 {
2043 	unsigned free, fragments;
2044 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2045 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2046 
2047 	BUG_ON(cr < 0 || cr >= 4);
2048 
2049 	free = grp->bb_free;
2050 	if (free == 0)
2051 		return 0;
2052 	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2053 		return 0;
2054 
2055 	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2056 		return 0;
2057 
2058 	/* We only do this if the grp has never been initialized */
2059 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2060 		int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2061 		if (ret)
2062 			return ret;
2063 	}
2064 
2065 	fragments = grp->bb_fragments;
2066 	if (fragments == 0)
2067 		return 0;
2068 
2069 	switch (cr) {
2070 	case 0:
2071 		BUG_ON(ac->ac_2order == 0);
2072 
2073 		/* Avoid using the first bg of a flexgroup for data files */
2074 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2075 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2076 		    ((group % flex_size) == 0))
2077 			return 0;
2078 
2079 		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2080 		    (free / fragments) >= ac->ac_g_ex.fe_len)
2081 			return 1;
2082 
2083 		if (grp->bb_largest_free_order < ac->ac_2order)
2084 			return 0;
2085 
2086 		return 1;
2087 	case 1:
2088 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
2089 			return 1;
2090 		break;
2091 	case 2:
2092 		if (free >= ac->ac_g_ex.fe_len)
2093 			return 1;
2094 		break;
2095 	case 3:
2096 		return 1;
2097 	default:
2098 		BUG();
2099 	}
2100 
2101 	return 0;
2102 }
2103 
2104 static noinline_for_stack int
2105 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2106 {
2107 	ext4_group_t ngroups, group, i;
2108 	int cr;
2109 	int err = 0, first_err = 0;
2110 	struct ext4_sb_info *sbi;
2111 	struct super_block *sb;
2112 	struct ext4_buddy e4b;
2113 
2114 	sb = ac->ac_sb;
2115 	sbi = EXT4_SB(sb);
2116 	ngroups = ext4_get_groups_count(sb);
2117 	/* non-extent files are limited to low blocks/groups */
2118 	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2119 		ngroups = sbi->s_blockfile_groups;
2120 
2121 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2122 
2123 	/* first, try the goal */
2124 	err = ext4_mb_find_by_goal(ac, &e4b);
2125 	if (err || ac->ac_status == AC_STATUS_FOUND)
2126 		goto out;
2127 
2128 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2129 		goto out;
2130 
2131 	/*
2132 	 * ac->ac_2order is set only if the fe_len is a power of 2;
2133 	 * if ac_2order is set we also set the criteria to 0 so that we
2134 	 * try exact allocation using the buddy data.
2135 	 */
2136 	i = fls(ac->ac_g_ex.fe_len);
2137 	ac->ac_2order = 0;
2138 	/*
2139 	 * We search using buddy data only if the order of the request
2140 	 * is greater than or equal to sbi->s_mb_order2_reqs.
2141 	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2142 	 * We also support searching for power-of-two requests only for
2143 	 * requests up to the maximum buddy size we have constructed.
2144 	 */
2145 	if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
2146 		/*
2147 		 * This should tell if fe_len is exactly a power of 2
2148 		 */
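		/*
		 * e.g. fe_len == 16: fls() returned i == 5, 16 & ~(1 << 4)
		 * is 0, so ac_2order becomes 4 and 1 << ac_2order == fe_len.
		 */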
2149 		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2150 			ac->ac_2order = array_index_nospec(i - 1,
2151 							   sb->s_blocksize_bits + 2);
2152 	}
2153 
2154 	/* if stream allocation is enabled, use global goal */
2155 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2156 		/* TBD: may be hot point */
2157 		spin_lock(&sbi->s_md_lock);
2158 		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2159 		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2160 		spin_unlock(&sbi->s_md_lock);
2161 	}
2162 
2163 	/* Let's just scan groups to find more or less suitable blocks */
2164 	cr = ac->ac_2order ? 0 : 1;
2165 	/*
2166 	 * cr == 0 tries to get an exact (power-of-two) allocation, cr == 1
2167 	 * and cr == 2 progressively relax the checks, cr == 3 takes anything
2168 	 */
2169 repeat:
2170 	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2171 		ac->ac_criteria = cr;
2172 		/*
2173 		 * searching for the right group start
2174 		 * from the goal value specified
2175 		 */
2176 		group = ac->ac_g_ex.fe_group;
2177 
2178 		for (i = 0; i < ngroups; group++, i++) {
2179 			int ret = 0;
2180 			cond_resched();
2181 			/*
2182 			 * Artificially restricted ngroups for non-extent
2183 			 * files makes group > ngroups possible on first loop.
2184 			 */
2185 			if (group >= ngroups)
2186 				group = 0;
2187 
2188 			/* This now checks without needing the buddy page */
2189 			ret = ext4_mb_good_group(ac, group, cr);
2190 			if (ret <= 0) {
2191 				if (!first_err)
2192 					first_err = ret;
2193 				continue;
2194 			}
2195 
2196 			err = ext4_mb_load_buddy(sb, group, &e4b);
2197 			if (err)
2198 				goto out;
2199 
2200 			ext4_lock_group(sb, group);
2201 
2202 			/*
2203 			 * We need to check again after locking the
2204 			 * block group
2205 			 */
2206 			ret = ext4_mb_good_group(ac, group, cr);
2207 			if (ret <= 0) {
2208 				ext4_unlock_group(sb, group);
2209 				ext4_mb_unload_buddy(&e4b);
2210 				if (!first_err)
2211 					first_err = ret;
2212 				continue;
2213 			}
2214 
2215 			ac->ac_groups_scanned++;
2216 			if (cr == 0)
2217 				ext4_mb_simple_scan_group(ac, &e4b);
2218 			else if (cr == 1 && sbi->s_stripe &&
2219 					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
2220 				ext4_mb_scan_aligned(ac, &e4b);
2221 			else
2222 				ext4_mb_complex_scan_group(ac, &e4b);
2223 
2224 			ext4_unlock_group(sb, group);
2225 			ext4_mb_unload_buddy(&e4b);
2226 
2227 			if (ac->ac_status != AC_STATUS_CONTINUE)
2228 				break;
2229 		}
2230 	}
2231 
2232 	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2233 	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2234 		/*
2235 		 * We've been searching too long. Let's try to allocate
2236 		 * the best chunk we've found so far
2237 		 */
2238 
2239 		ext4_mb_try_best_found(ac, &e4b);
2240 		if (ac->ac_status != AC_STATUS_FOUND) {
2241 			/*
2242 			 * Someone more lucky has already allocated it.
2243 			 * The only thing we can do is just take first
2244 			 * found block(s)
2245 			printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2246 			 */
2247 			ac->ac_b_ex.fe_group = 0;
2248 			ac->ac_b_ex.fe_start = 0;
2249 			ac->ac_b_ex.fe_len = 0;
2250 			ac->ac_status = AC_STATUS_CONTINUE;
2251 			ac->ac_flags |= EXT4_MB_HINT_FIRST;
2252 			cr = 3;
2253 			atomic_inc(&sbi->s_mb_lost_chunks);
2254 			goto repeat;
2255 		}
2256 	}
2257 out:
2258 	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2259 		err = first_err;
2260 	return err;
2261 }
2262 
2263 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2264 {
2265 	struct super_block *sb = seq->private;
2266 	ext4_group_t group;
2267 
2268 	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2269 		return NULL;
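	/* encode the group as value + 1 so that group 0 is distinguishable
	 * from a NULL (end-of-sequence) iterator */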
2270 	group = *pos + 1;
2271 	return (void *) ((unsigned long) group);
2272 }
2273 
2274 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2275 {
2276 	struct super_block *sb = seq->private;
2277 	ext4_group_t group;
2278 
2279 	++*pos;
2280 	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2281 		return NULL;
2282 	group = *pos + 1;
2283 	return (void *) ((unsigned long) group);
2284 }
2285 
2286 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2287 {
2288 	struct super_block *sb = seq->private;
2289 	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2290 	int i;
2291 	int err, buddy_loaded = 0;
2292 	struct ext4_buddy e4b;
2293 	struct ext4_group_info *grinfo;
2294 	struct sg {
2295 		struct ext4_group_info info;
2296 		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2297 	} sg;
2298 
2299 	group--;
2300 	if (group == 0)
2301 		seq_puts(seq, "#group: free  frags first ["
2302 			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2303 			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
2304 
2305 	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2306 		sizeof(struct ext4_group_info);
2307 	grinfo = ext4_get_group_info(sb, group);
2308 	/* Load the group info in memory only if not already loaded. */
2309 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2310 		err = ext4_mb_load_buddy(sb, group, &e4b);
2311 		if (err) {
2312 			seq_printf(seq, "#%-5u: I/O error\n", group);
2313 			return 0;
2314 		}
2315 		buddy_loaded = 1;
2316 	}
2317 
2318 	memcpy(&sg, ext4_get_group_info(sb, group), i);
2319 
2320 	if (buddy_loaded)
2321 		ext4_mb_unload_buddy(&e4b);
2322 
2323 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2324 			sg.info.bb_fragments, sg.info.bb_first_free);
2325 	for (i = 0; i <= 13; i++)
2326 		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2327 				sg.info.bb_counters[i] : 0);
2328 	seq_printf(seq, " ]\n");
2329 
2330 	return 0;
2331 }
2332 
2333 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2334 {
2335 }
2336 
2337 static const struct seq_operations ext4_mb_seq_groups_ops = {
2338 	.start  = ext4_mb_seq_groups_start,
2339 	.next   = ext4_mb_seq_groups_next,
2340 	.stop   = ext4_mb_seq_groups_stop,
2341 	.show   = ext4_mb_seq_groups_show,
2342 };
2343 
2344 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2345 {
2346 	struct super_block *sb = PDE_DATA(inode);
2347 	int rc;
2348 
2349 	rc = seq_open(file, &ext4_mb_seq_groups_ops);
2350 	if (rc == 0) {
2351 		struct seq_file *m = file->private_data;
2352 		m->private = sb;
2353 	}
2354 	return rc;
2355 
2356 }
2357 
2358 const struct file_operations ext4_seq_mb_groups_fops = {
2359 	.owner		= THIS_MODULE,
2360 	.open		= ext4_mb_seq_groups_open,
2361 	.read		= seq_read,
2362 	.llseek		= seq_lseek,
2363 	.release	= seq_release,
2364 };
2365 
2366 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2367 {
2368 	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2369 	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2370 
2371 	BUG_ON(!cachep);
2372 	return cachep;
2373 }
2374 
2375 /*
2376  * Allocate the top-level s_group_info array for the specified number
2377  * of groups
2378  */
2379 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2380 {
2381 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2382 	unsigned size;
2383 	struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
2384 
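	/* s_group_info is a two-level table: one second-level block of
	 * pointers covers EXT4_DESC_PER_BLOCK(sb) groups, so compute how
	 * many such blocks ngroups needs */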
2385 	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2386 		EXT4_DESC_PER_BLOCK_BITS(sb);
2387 	if (size <= sbi->s_group_info_size)
2388 		return 0;
2389 
2390 	size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2391 	new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2392 	if (!new_groupinfo) {
2393 		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2394 		return -ENOMEM;
2395 	}
2396 	rcu_read_lock();
2397 	old_groupinfo = rcu_dereference(sbi->s_group_info);
2398 	if (old_groupinfo)
2399 		memcpy(new_groupinfo, old_groupinfo,
2400 		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2401 	rcu_read_unlock();
2402 	rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
2403 	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2404 	if (old_groupinfo)
2405 		ext4_kvfree_array_rcu(old_groupinfo);
2406 	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2407 		   sbi->s_group_info_size);
2408 	return 0;
2409 }
2410 
2411 /* Create and initialize ext4_group_info data for the given group. */
2412 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2413 			  struct ext4_group_desc *desc)
2414 {
2415 	int i;
2416 	int metalen = 0;
2417 	int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2418 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2419 	struct ext4_group_info **meta_group_info;
2420 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2421 
2422 	/*
2423 	 * First check if this group is the first of a descriptor block's
2424 	 * worth of groups. If so, we have to allocate a new table of
2425 	 * pointers to ext4_group_info structures
2426 	 */
2427 	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2428 		metalen = sizeof(*meta_group_info) <<
2429 			EXT4_DESC_PER_BLOCK_BITS(sb);
2430 		meta_group_info = kmalloc(metalen, GFP_NOFS);
2431 		if (meta_group_info == NULL) {
2432 			ext4_msg(sb, KERN_ERR, "can't allocate mem "
2433 				 "for a buddy group");
2434 			goto exit_meta_group_info;
2435 		}
2436 		rcu_read_lock();
2437 		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
2438 		rcu_read_unlock();
2439 	}
2440 
2441 	meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
2442 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2443 
2444 	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2445 	if (meta_group_info[i] == NULL) {
2446 		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2447 		goto exit_group_info;
2448 	}
2449 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2450 		&(meta_group_info[i]->bb_state));
2451 
2452 	/*
2453 	 * initialize bb_free to be able to skip
2454 	 * empty groups without initialization
2455 	 */
2456 	if (ext4_has_group_desc_csum(sb) &&
2457 	    (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2458 		meta_group_info[i]->bb_free =
2459 			ext4_free_clusters_after_init(sb, group, desc);
2460 	} else {
2461 		meta_group_info[i]->bb_free =
2462 			ext4_free_group_clusters(sb, desc);
2463 	}
2464 
2465 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2466 	init_rwsem(&meta_group_info[i]->alloc_sem);
2467 	meta_group_info[i]->bb_free_root = RB_ROOT;
2468 	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2469 
2470 #ifdef DOUBLE_CHECK
2471 	{
2472 		struct buffer_head *bh;
2473 		meta_group_info[i]->bb_bitmap =
2474 			kmalloc(sb->s_blocksize, GFP_NOFS);
2475 		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2476 		bh = ext4_read_block_bitmap(sb, group);
2477 		BUG_ON(IS_ERR_OR_NULL(bh));
2478 		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2479 			sb->s_blocksize);
2480 		put_bh(bh);
2481 	}
2482 #endif
2483 
2484 	return 0;
2485 
2486 exit_group_info:
2487 	/* If a meta_group_info table has been allocated, release it now */
2488 	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2489 		struct ext4_group_info ***group_info;
2490 
2491 		rcu_read_lock();
2492 		group_info = rcu_dereference(sbi->s_group_info);
2493 		kfree(group_info[idx]);
2494 		group_info[idx] = NULL;
2495 		rcu_read_unlock();
2496 	}
2497 exit_meta_group_info:
2498 	return -ENOMEM;
2499 } /* ext4_mb_add_groupinfo */
2500 
2501 static int ext4_mb_init_backend(struct super_block *sb)
2502 {
2503 	ext4_group_t ngroups = ext4_get_groups_count(sb);
2504 	ext4_group_t i;
2505 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2506 	int err;
2507 	struct ext4_group_desc *desc;
2508 	struct ext4_group_info ***group_info;
2509 	struct kmem_cache *cachep;
2510 
2511 	err = ext4_mb_alloc_groupinfo(sb, ngroups);
2512 	if (err)
2513 		return err;
2514 
2515 	sbi->s_buddy_cache = new_inode(sb);
2516 	if (sbi->s_buddy_cache == NULL) {
2517 		ext4_msg(sb, KERN_ERR, "can't get new inode");
2518 		goto err_freesgi;
2519 	}
2520 	/* To avoid potentially colliding with a valid on-disk inode number,
2521 	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2522 	 * not in the inode hash, so it should never be found by iget(), but
2523 	 * this will avoid confusion if it ever shows up during debugging. */
2524 	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2525 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2526 	for (i = 0; i < ngroups; i++) {
2527 		desc = ext4_get_group_desc(sb, i, NULL);
2528 		if (desc == NULL) {
2529 			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2530 			goto err_freebuddy;
2531 		}
2532 		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2533 			goto err_freebuddy;
2534 	}
2535 
2536 	return 0;
2537 
2538 err_freebuddy:
2539 	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2540 	while (i-- > 0)
2541 		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2542 	i = sbi->s_group_info_size;
2543 	rcu_read_lock();
2544 	group_info = rcu_dereference(sbi->s_group_info);
2545 	while (i-- > 0)
2546 		kfree(group_info[i]);
2547 	rcu_read_unlock();
2548 	iput(sbi->s_buddy_cache);
2549 err_freesgi:
2550 	rcu_read_lock();
2551 	kvfree(rcu_dereference(sbi->s_group_info));
2552 	rcu_read_unlock();
2553 	return -ENOMEM;
2554 }
2555 
2556 static void ext4_groupinfo_destroy_slabs(void)
2557 {
2558 	int i;
2559 
2560 	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2561 		if (ext4_groupinfo_caches[i])
2562 			kmem_cache_destroy(ext4_groupinfo_caches[i]);
2563 		ext4_groupinfo_caches[i] = NULL;
2564 	}
2565 }
2566 
2567 static int ext4_groupinfo_create_slab(size_t size)
2568 {
2569 	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2570 	int slab_size;
2571 	int blocksize_bits = order_base_2(size);
2572 	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2573 	struct kmem_cache *cachep;
2574 
2575 	if (cache_index >= NR_GRPINFO_CACHES)
2576 		return -EINVAL;
2577 
2578 	if (unlikely(cache_index < 0))
2579 		cache_index = 0;
2580 
2581 	mutex_lock(&ext4_grpinfo_slab_create_mutex);
2582 	if (ext4_groupinfo_caches[cache_index]) {
2583 		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2584 		return 0;	/* Already created */
2585 	}
2586 
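	/* bb_counters[] holds one counter per buddy order, 0 through
	 * blocksize_bits + 1, hence the "+ 2" in the object size below */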
2587 	slab_size = offsetof(struct ext4_group_info,
2588 				bb_counters[blocksize_bits + 2]);
2589 
2590 	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2591 					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2592 					NULL);
2593 
2594 	ext4_groupinfo_caches[cache_index] = cachep;
2595 
2596 	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2597 	if (!cachep) {
2598 		printk(KERN_EMERG
2599 		       "EXT4-fs: no memory for groupinfo slab cache\n");
2600 		return -ENOMEM;
2601 	}
2602 
2603 	return 0;
2604 }
2605 
2606 int ext4_mb_init(struct super_block *sb)
2607 {
2608 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2609 	unsigned i, j;
2610 	unsigned offset, offset_incr;
2611 	unsigned max;
2612 	int ret;
2613 
2614 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2615 
2616 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2617 	if (sbi->s_mb_offsets == NULL) {
2618 		ret = -ENOMEM;
2619 		goto out;
2620 	}
2621 
2622 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2623 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2624 	if (sbi->s_mb_maxs == NULL) {
2625 		ret = -ENOMEM;
2626 		goto out;
2627 	}
2628 
2629 	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2630 	if (ret < 0)
2631 		goto out;
2632 
2633 	/* order 0 is regular bitmap */
2634 	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2635 	sbi->s_mb_offsets[0] = 0;
2636 
2637 	i = 1;
2638 	offset = 0;
2639 	offset_incr = 1 << (sb->s_blocksize_bits - 1);
2640 	max = sb->s_blocksize << 2;
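	/*
	 * s_mb_offsets[i] is the byte offset of the order-i buddy bitmap
	 * within the buddy block and s_mb_maxs[i] is its size in bits;
	 * each successive order halves both the offset increment and the
	 * bit count.
	 */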
2641 	do {
2642 		sbi->s_mb_offsets[i] = offset;
2643 		sbi->s_mb_maxs[i] = max;
2644 		offset += offset_incr;
2645 		offset_incr = offset_incr >> 1;
2646 		max = max >> 1;
2647 		i++;
2648 	} while (i <= sb->s_blocksize_bits + 1);
2649 
2650 	spin_lock_init(&sbi->s_md_lock);
2651 	spin_lock_init(&sbi->s_bal_lock);
2652 
2653 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2654 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2655 	sbi->s_mb_stats = MB_DEFAULT_STATS;
2656 	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2657 	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2658 	/*
2659 	 * The default group preallocation is 512, which for 4k block
2660 	 * sizes translates to 2 megabytes.  However for bigalloc file
2661 	 * systems, this is probably too big (i.e., if the cluster size
2662 	 * is 1 megabyte, then the group preallocation size becomes half a
2663 	 * gigabyte!).  As a default, we will keep a two megabyte
2664 	 * group prealloc size for cluster sizes up to 64k, and after
2665 	 * that, we will force a minimum group preallocation size of
2666 	 * 32 clusters.  This translates to 8 megs when the cluster
2667 	 * size is 256k, and 32 megs when the cluster size is 1 meg,
2668 	 * which seems reasonable as a default.
2669 	 */
2670 	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2671 				       sbi->s_cluster_bits, 32);
2672 	/*
2673 	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2674 	 * to the lowest multiple of s_stripe which is bigger than
2675 	 * the s_mb_group_prealloc as determined above. We want
2676 	 * the preallocation size to be an exact multiple of the
2677 	 * RAID stripe size so that preallocations don't fragment
2678 	 * the stripes.
2679 	 */
2680 	if (sbi->s_stripe > 1) {
2681 		sbi->s_mb_group_prealloc = roundup(
2682 			sbi->s_mb_group_prealloc, sbi->s_stripe);
2683 	}
2684 
2685 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2686 	if (sbi->s_locality_groups == NULL) {
2687 		ret = -ENOMEM;
2688 		goto out;
2689 	}
2690 	for_each_possible_cpu(i) {
2691 		struct ext4_locality_group *lg;
2692 		lg = per_cpu_ptr(sbi->s_locality_groups, i);
2693 		mutex_init(&lg->lg_mutex);
2694 		for (j = 0; j < PREALLOC_TB_SIZE; j++)
2695 			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2696 		spin_lock_init(&lg->lg_prealloc_lock);
2697 	}
2698 
2699 	/* init file for buddy data */
2700 	ret = ext4_mb_init_backend(sb);
2701 	if (ret != 0)
2702 		goto out_free_locality_groups;
2703 
2704 	return 0;
2705 
2706 out_free_locality_groups:
2707 	free_percpu(sbi->s_locality_groups);
2708 	sbi->s_locality_groups = NULL;
2709 out:
2710 	kfree(sbi->s_mb_offsets);
2711 	sbi->s_mb_offsets = NULL;
2712 	kfree(sbi->s_mb_maxs);
2713 	sbi->s_mb_maxs = NULL;
2714 	return ret;
2715 }
2716 
2717 /* needs to be called with the ext4 group lock held */
2718 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2719 {
2720 	struct ext4_prealloc_space *pa;
2721 	struct list_head *cur, *tmp;
2722 	int count = 0;
2723 
2724 	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2725 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2726 		list_del(&pa->pa_group_list);
2727 		count++;
2728 		kmem_cache_free(ext4_pspace_cachep, pa);
2729 	}
2730 	if (count)
2731 		mb_debug(1, "mballoc: %u PAs left\n", count);
2732 
2733 }
2734 
2735 int ext4_mb_release(struct super_block *sb)
2736 {
2737 	ext4_group_t ngroups = ext4_get_groups_count(sb);
2738 	ext4_group_t i;
2739 	int num_meta_group_infos;
2740 	struct ext4_group_info *grinfo, ***group_info;
2741 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2742 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2743 
2744 	if (sbi->s_group_info) {
2745 		for (i = 0; i < ngroups; i++) {
2746 			grinfo = ext4_get_group_info(sb, i);
2747 #ifdef DOUBLE_CHECK
2748 			kfree(grinfo->bb_bitmap);
2749 #endif
2750 			ext4_lock_group(sb, i);
2751 			ext4_mb_cleanup_pa(grinfo);
2752 			ext4_unlock_group(sb, i);
2753 			kmem_cache_free(cachep, grinfo);
2754 		}
2755 		num_meta_group_infos = (ngroups +
2756 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
2757 			EXT4_DESC_PER_BLOCK_BITS(sb);
2758 		rcu_read_lock();
2759 		group_info = rcu_dereference(sbi->s_group_info);
2760 		for (i = 0; i < num_meta_group_infos; i++)
2761 			kfree(group_info[i]);
2762 		kvfree(group_info);
2763 		rcu_read_unlock();
2764 	}
2765 	kfree(sbi->s_mb_offsets);
2766 	kfree(sbi->s_mb_maxs);
2767 	iput(sbi->s_buddy_cache);
2768 	if (sbi->s_mb_stats) {
2769 		ext4_msg(sb, KERN_INFO,
2770 		       "mballoc: %u blocks %u reqs (%u success)",
2771 				atomic_read(&sbi->s_bal_allocated),
2772 				atomic_read(&sbi->s_bal_reqs),
2773 				atomic_read(&sbi->s_bal_success));
2774 		ext4_msg(sb, KERN_INFO,
2775 		      "mballoc: %u extents scanned, %u goal hits, "
2776 				"%u 2^N hits, %u breaks, %u lost",
2777 				atomic_read(&sbi->s_bal_ex_scanned),
2778 				atomic_read(&sbi->s_bal_goals),
2779 				atomic_read(&sbi->s_bal_2orders),
2780 				atomic_read(&sbi->s_bal_breaks),
2781 				atomic_read(&sbi->s_mb_lost_chunks));
2782 		ext4_msg(sb, KERN_INFO,
2783 		       "mballoc: %lu generated and it took %Lu",
2784 				sbi->s_mb_buddies_generated,
2785 				sbi->s_mb_generation_time);
2786 		ext4_msg(sb, KERN_INFO,
2787 		       "mballoc: %u preallocated, %u discarded",
2788 				atomic_read(&sbi->s_mb_preallocated),
2789 				atomic_read(&sbi->s_mb_discarded));
2790 	}
2791 
2792 	free_percpu(sbi->s_locality_groups);
2793 
2794 	return 0;
2795 }
2796 
2797 static inline int ext4_issue_discard(struct super_block *sb,
2798 		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
2799 		unsigned long flags)
2800 {
2801 	ext4_fsblk_t discard_block;
2802 
2803 	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2804 			 ext4_group_first_block_no(sb, block_group));
2805 	count = EXT4_C2B(EXT4_SB(sb), count);
2806 	trace_ext4_discard_blocks(sb,
2807 			(unsigned long long) discard_block, count);
2808 	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
2809 }
2810 
2811 /*
2812  * This function is called by the jbd2 layer once the commit has finished,
2813  * so we know we can free the blocks that were released with that commit.
2814  */
2815 static void ext4_free_data_callback(struct super_block *sb,
2816 				    struct ext4_journal_cb_entry *jce,
2817 				    int rc)
2818 {
2819 	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2820 	struct ext4_buddy e4b;
2821 	struct ext4_group_info *db;
2822 	int err, count = 0, count2 = 0;
2823 
2824 	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2825 		 entry->efd_count, entry->efd_group, entry);
2826 
2827 	if (test_opt(sb, DISCARD)) {
2828 		err = ext4_issue_discard(sb, entry->efd_group,
2829 					 entry->efd_start_cluster,
2830 					 entry->efd_count, 0);
2831 		if (err && err != -EOPNOTSUPP)
2832 			ext4_msg(sb, KERN_WARNING, "discard request in"
2833 				 " group:%d block:%d count:%d failed"
2834 				 " with %d", entry->efd_group,
2835 				 entry->efd_start_cluster,
2836 				 entry->efd_count, err);
2837 	}
2838 
2839 	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2840 	/* we expect to find existing buddy because it's pinned */
2841 	BUG_ON(err != 0);
2842 
2843 
2844 	db = e4b.bd_info;
2845 	/* there are blocks to put in buddy to make them really free */
2846 	count += entry->efd_count;
2847 	count2++;
2848 	ext4_lock_group(sb, entry->efd_group);
2849 	/* Take it out of per group rb tree */
2850 	rb_erase(&entry->efd_node, &(db->bb_free_root));
2851 	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2852 
2853 	/*
2854 	 * Clear the trimmed flag for the group so that the next
2855 	 * ext4_trim_fs can trim it.
2856 	 * If the volume is mounted with -o discard, online discard
2857 	 * is supported and the free blocks will be trimmed online.
2858 	 */
2859 	if (!test_opt(sb, DISCARD))
2860 		EXT4_MB_GRP_CLEAR_TRIMMED(db);
2861 
2862 	if (!db->bb_free_root.rb_node) {
2863 		/* No more items in the per group rb tree
2864 		 * balance refcounts from ext4_mb_free_metadata()
2865 		 */
2866 		page_cache_release(e4b.bd_buddy_page);
2867 		page_cache_release(e4b.bd_bitmap_page);
2868 	}
2869 	ext4_unlock_group(sb, entry->efd_group);
2870 	kmem_cache_free(ext4_free_data_cachep, entry);
2871 	ext4_mb_unload_buddy(&e4b);
2872 
2873 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2874 }
2875 
2876 int __init ext4_init_mballoc(void)
2877 {
2878 	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2879 					SLAB_RECLAIM_ACCOUNT);
2880 	if (ext4_pspace_cachep == NULL)
2881 		return -ENOMEM;
2882 
2883 	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2884 				    SLAB_RECLAIM_ACCOUNT);
2885 	if (ext4_ac_cachep == NULL) {
2886 		kmem_cache_destroy(ext4_pspace_cachep);
2887 		return -ENOMEM;
2888 	}
2889 
2890 	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2891 					   SLAB_RECLAIM_ACCOUNT);
2892 	if (ext4_free_data_cachep == NULL) {
2893 		kmem_cache_destroy(ext4_pspace_cachep);
2894 		kmem_cache_destroy(ext4_ac_cachep);
2895 		return -ENOMEM;
2896 	}
2897 	return 0;
2898 }
2899 
2900 void ext4_exit_mballoc(void)
2901 {
2902 	/*
2903 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2904 	 * before destroying the slab cache.
2905 	 */
2906 	rcu_barrier();
2907 	kmem_cache_destroy(ext4_pspace_cachep);
2908 	kmem_cache_destroy(ext4_ac_cachep);
2909 	kmem_cache_destroy(ext4_free_data_cachep);
2910 	ext4_groupinfo_destroy_slabs();
2911 }
2912 
2913 
2914 /*
2915  * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2916  * Returns 0 if success or error code
2917  */
2918 static noinline_for_stack int
2919 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2920 				handle_t *handle, unsigned int reserv_clstrs)
2921 {
2922 	struct buffer_head *bitmap_bh = NULL;
2923 	struct ext4_group_desc *gdp;
2924 	struct buffer_head *gdp_bh;
2925 	struct ext4_sb_info *sbi;
2926 	struct super_block *sb;
2927 	ext4_fsblk_t block;
2928 	int err, len;
2929 
2930 	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2931 	BUG_ON(ac->ac_b_ex.fe_len <= 0);
2932 
2933 	sb = ac->ac_sb;
2934 	sbi = EXT4_SB(sb);
2935 
2936 	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2937 	if (IS_ERR(bitmap_bh)) {
2938 		err = PTR_ERR(bitmap_bh);
2939 		bitmap_bh = NULL;
2940 		goto out_err;
2941 	}
2942 
2943 	BUFFER_TRACE(bitmap_bh, "getting write access");
2944 	err = ext4_journal_get_write_access(handle, bitmap_bh);
2945 	if (err)
2946 		goto out_err;
2947 
2948 	err = -EIO;
2949 	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2950 	if (!gdp)
2951 		goto out_err;
2952 
2953 	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2954 			ext4_free_group_clusters(sb, gdp));
2955 
2956 	BUFFER_TRACE(gdp_bh, "get_write_access");
2957 	err = ext4_journal_get_write_access(handle, gdp_bh);
2958 	if (err)
2959 		goto out_err;
2960 
2961 	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2962 
2963 	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2964 	if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
2965 		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2966 			   "fs metadata", block, block+len);
2967 		/* The file system was mounted not to panic on error, so
2968 		 * fix the bitmap and return EFSCORRUPTED.
2969 		 * We leak some of the blocks here.
2970 		 */
2971 		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2972 		ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2973 			      ac->ac_b_ex.fe_len);
2974 		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2975 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2976 		if (!err)
2977 			err = -EFSCORRUPTED;
2978 		goto out_err;
2979 	}
2980 
2981 	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2982 #ifdef AGGRESSIVE_CHECK
2983 	{
2984 		int i;
2985 		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2986 			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2987 						bitmap_bh->b_data));
2988 		}
2989 	}
2990 #endif
2991 	ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2992 		      ac->ac_b_ex.fe_len);
2993 	if (ext4_has_group_desc_csum(sb) &&
2994 	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2995 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2996 		ext4_free_group_clusters_set(sb, gdp,
2997 					     ext4_free_clusters_after_init(sb,
2998 						ac->ac_b_ex.fe_group, gdp));
2999 	}
3000 	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3001 	ext4_free_group_clusters_set(sb, gdp, len);
3002 	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3003 	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3004 
3005 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3006 	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3007 	/*
3008 	 * Now reduce the dirty block count also. Should not go negative
3009 	 */
3010 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3011 		/* release all the reserved blocks if non delalloc */
3012 		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3013 				   reserv_clstrs);
3014 
3015 	if (sbi->s_log_groups_per_flex) {
3016 		ext4_group_t flex_group = ext4_flex_group(sbi,
3017 							  ac->ac_b_ex.fe_group);
3018 		atomic64_sub(ac->ac_b_ex.fe_len,
3019 			     &sbi_array_rcu_deref(sbi, s_flex_groups,
3020 						  flex_group)->free_clusters);
3021 	}
3022 
3023 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3024 	if (err)
3025 		goto out_err;
3026 	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3027 
3028 out_err:
3029 	brelse(bitmap_bh);
3030 	return err;
3031 }
3032 
3033 /*
3034  * Here we normalize the request for a locality group.
3035  * Group requests are normalized to s_mb_group_prealloc, which is rounded
3036  * up to a multiple of s_stripe if a stripe size was set via mount option.
3037  * s_mb_group_prealloc can be configured via
3038  * /sys/fs/ext4/<partition>/mb_group_prealloc
3039  *
3040  * XXX: should we try to preallocate more than the group has now?
3041  */
3042 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3043 {
3044 	struct super_block *sb = ac->ac_sb;
3045 	struct ext4_locality_group *lg = ac->ac_lg;
3046 
3047 	BUG_ON(lg == NULL);
3048 	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3049 	mb_debug(1, "#%u: goal %u blocks for locality group\n",
3050 		current->pid, ac->ac_g_ex.fe_len);
3051 }
3052 
3053 /*
3054  * Normalization means making the request better in terms of
3055  * size and alignment
3056  */
3057 static noinline_for_stack void
3058 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3059 				struct ext4_allocation_request *ar)
3060 {
3061 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3062 	int bsbits, max;
3063 	ext4_lblk_t end;
3064 	loff_t size, start_off;
3065 	loff_t orig_size __maybe_unused;
3066 	ext4_lblk_t start;
3067 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3068 	struct ext4_prealloc_space *pa;
3069 
3070 	/* only normalize data requests; metadata requests
3071 	   do not need preallocation */
3072 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3073 		return;
3074 
3075 	/* sometimes the caller may want exact blocks */
3076 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3077 		return;
3078 
3079 	/* caller may indicate that preallocation isn't
3080 	 * required (it's a tail, for example) */
3081 	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3082 		return;
3083 
3084 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3085 		ext4_mb_normalize_group_request(ac);
3086 		return ;
3087 	}
3088 
3089 	bsbits = ac->ac_sb->s_blocksize_bits;
3090 
3091 	/* first, let's learn the actual file size
3092 	 * we would have once the current request is allocated */
3093 	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3094 	size = size << bsbits;
3095 	if (size < i_size_read(ac->ac_inode))
3096 		size = i_size_read(ac->ac_inode);
3097 	orig_size = size;
3098 
3099 	/* max size of free chunks */
3100 	max = 2 << bsbits;
3101 
3102 #define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
3103 		(req <= (size) || max <= (chunk_size))
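	/*
	 * Roughly: pick the corresponding bucket when the request (req) is
	 * no larger than "size", or when the largest free chunk we could
	 * hope for (max) is no larger than "chunk_size".
	 */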
3104 
3105 	/* first, try to predict filesize */
3106 	/* XXX: should this table be tunable? */
3107 	start_off = 0;
3108 	if (size <= 16 * 1024) {
3109 		size = 16 * 1024;
3110 	} else if (size <= 32 * 1024) {
3111 		size = 32 * 1024;
3112 	} else if (size <= 64 * 1024) {
3113 		size = 64 * 1024;
3114 	} else if (size <= 128 * 1024) {
3115 		size = 128 * 1024;
3116 	} else if (size <= 256 * 1024) {
3117 		size = 256 * 1024;
3118 	} else if (size <= 512 * 1024) {
3119 		size = 512 * 1024;
3120 	} else if (size <= 1024 * 1024) {
3121 		size = 1024 * 1024;
3122 	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3123 		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3124 						(21 - bsbits)) << 21;
3125 		size = 2 * 1024 * 1024;
3126 	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3127 		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3128 							(22 - bsbits)) << 22;
3129 		size = 4 * 1024 * 1024;
3130 	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3131 					(8<<20)>>bsbits, max, 8 * 1024)) {
3132 		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3133 							(23 - bsbits)) << 23;
3134 		size = 8 * 1024 * 1024;
3135 	} else {
3136 		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3137 		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3138 					      ac->ac_o_ex.fe_len) << bsbits;
3139 	}
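	/* the prediction table above works in bytes; convert size and
	 * start_off back to blocks */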
3140 	size = size >> bsbits;
3141 	start = start_off >> bsbits;
3142 
3143 	/* don't cover already allocated blocks in selected range */
3144 	if (ar->pleft && start <= ar->lleft) {
3145 		size -= ar->lleft + 1 - start;
3146 		start = ar->lleft + 1;
3147 	}
3148 	if (ar->pright && start + size - 1 >= ar->lright)
3149 		size -= start + size - ar->lright;
3150 
3151 	/*
3152 	 * Trim allocation request for filesystems with artificially small
3153 	 * groups.
3154 	 */
3155 	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
3156 		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
3157 
3158 	end = start + size;
3159 
3160 	/* check we don't cross already preallocated blocks */
3161 	rcu_read_lock();
3162 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3163 		ext4_lblk_t pa_end;
3164 
3165 		if (pa->pa_deleted)
3166 			continue;
3167 		spin_lock(&pa->pa_lock);
3168 		if (pa->pa_deleted) {
3169 			spin_unlock(&pa->pa_lock);
3170 			continue;
3171 		}
3172 
3173 		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3174 						  pa->pa_len);
3175 
3176 		/* PA must not overlap original request */
3177 		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3178 			ac->ac_o_ex.fe_logical < pa->pa_lstart));
3179 
3180 		/* skip PAs this normalized request doesn't overlap with */
3181 		if (pa->pa_lstart >= end || pa_end <= start) {
3182 			spin_unlock(&pa->pa_lock);
3183 			continue;
3184 		}
3185 		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3186 
3187 		/* adjust start or end to be adjacent to this pa */
3188 		if (pa_end <= ac->ac_o_ex.fe_logical) {
3189 			BUG_ON(pa_end < start);
3190 			start = pa_end;
3191 		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3192 			BUG_ON(pa->pa_lstart > end);
3193 			end = pa->pa_lstart;
3194 		}
3195 		spin_unlock(&pa->pa_lock);
3196 	}
3197 	rcu_read_unlock();
3198 	size = end - start;
3199 
3200 	/* XXX: extra loop to check we really don't overlap preallocations */
3201 	rcu_read_lock();
3202 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3203 		ext4_lblk_t pa_end;
3204 
3205 		spin_lock(&pa->pa_lock);
3206 		if (pa->pa_deleted == 0) {
3207 			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3208 							  pa->pa_len);
3209 			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3210 		}
3211 		spin_unlock(&pa->pa_lock);
3212 	}
3213 	rcu_read_unlock();
3214 
3215 	if (start + size <= ac->ac_o_ex.fe_logical &&
3216 			start > ac->ac_o_ex.fe_logical) {
3217 		ext4_msg(ac->ac_sb, KERN_ERR,
3218 			 "start %lu, size %lu, fe_logical %lu",
3219 			 (unsigned long) start, (unsigned long) size,
3220 			 (unsigned long) ac->ac_o_ex.fe_logical);
3221 		BUG();
3222 	}
3223 	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3224 
3225 	/* now prepare goal request */
3226 
3227 	/* XXX: is it better to align blocks WRT to logical
3228 	 * placement or satisfy big request as is */
3229 	ac->ac_g_ex.fe_logical = start;
3230 	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3231 
3232 	/* define goal start in order to merge */
3233 	if (ar->pright && (ar->lright == (start + size))) {
3234 		/* merge to the right */
3235 		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3236 						&ac->ac_f_ex.fe_group,
3237 						&ac->ac_f_ex.fe_start);
3238 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3239 	}
3240 	if (ar->pleft && (ar->lleft + 1 == start)) {
3241 		/* merge to the left */
3242 		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3243 						&ac->ac_f_ex.fe_group,
3244 						&ac->ac_f_ex.fe_start);
3245 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3246 	}
3247 
3248 	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3249 		(unsigned) orig_size, (unsigned) start);
3250 }
3251 
3252 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3253 {
3254 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3255 
3256 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3257 		atomic_inc(&sbi->s_bal_reqs);
3258 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3259 		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3260 			atomic_inc(&sbi->s_bal_success);
3261 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3262 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3263 				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3264 			atomic_inc(&sbi->s_bal_goals);
3265 		if (ac->ac_found > sbi->s_mb_max_to_scan)
3266 			atomic_inc(&sbi->s_bal_breaks);
3267 	}
3268 
3269 	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3270 		trace_ext4_mballoc_alloc(ac);
3271 	else
3272 		trace_ext4_mballoc_prealloc(ac);
3273 }
3274 
3275 /*
3276  * Called on failure; free up any blocks from the inode PA for this
3277  * context.  We don't need this for MB_GROUP_PA because we only change
3278  * pa_free in ext4_mb_release_context(), but on failure, we've already
3279  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3280  */
3281 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3282 {
3283 	struct ext4_prealloc_space *pa = ac->ac_pa;
3284 	struct ext4_buddy e4b;
3285 	int err;
3286 
3287 	if (pa == NULL) {
3288 		if (ac->ac_f_ex.fe_len == 0)
3289 			return;
3290 		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3291 		if (err) {
3292 			/*
3293 			 * This should never happen since we pin the
3294 			 * pages in the ext4_allocation_context so
3295 			 * ext4_mb_load_buddy() should never fail.
3296 			 */
3297 			WARN(1, "mb_load_buddy failed (%d)", err);
3298 			return;
3299 		}
3300 		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3301 		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3302 			       ac->ac_f_ex.fe_len);
3303 		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3304 		ext4_mb_unload_buddy(&e4b);
3305 		return;
3306 	}
3307 	if (pa->pa_type == MB_INODE_PA)
3308 		pa->pa_free += ac->ac_b_ex.fe_len;
3309 }
3310 
3311 /*
3312  * use blocks preallocated to inode
3313  */
3314 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3315 				struct ext4_prealloc_space *pa)
3316 {
3317 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3318 	ext4_fsblk_t start;
3319 	ext4_fsblk_t end;
3320 	int len;
3321 
3322 	/* found preallocated blocks, use them */
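	/* map the request's logical offset onto the PA's physical range
	 * and clamp the length so it does not run past the end of the PA */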
3323 	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3324 	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3325 		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3326 	len = EXT4_NUM_B2C(sbi, end - start);
3327 	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3328 					&ac->ac_b_ex.fe_start);
3329 	ac->ac_b_ex.fe_len = len;
3330 	ac->ac_status = AC_STATUS_FOUND;
3331 	ac->ac_pa = pa;
3332 
3333 	BUG_ON(start < pa->pa_pstart);
3334 	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3335 	BUG_ON(pa->pa_free < len);
3336 	pa->pa_free -= len;
3337 
3338 	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3339 }
3340 
3341 /*
3342  * use blocks preallocated to locality group
3343  */
3344 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3345 				struct ext4_prealloc_space *pa)
3346 {
3347 	unsigned int len = ac->ac_o_ex.fe_len;
3348 
3349 	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3350 					&ac->ac_b_ex.fe_group,
3351 					&ac->ac_b_ex.fe_start);
3352 	ac->ac_b_ex.fe_len = len;
3353 	ac->ac_status = AC_STATUS_FOUND;
3354 	ac->ac_pa = pa;
3355 
3356 	/* we don't correct pa_pstart or pa_len here to avoid a
3357 	 * possible race when the group is being loaded concurrently;
3358 	 * instead we correct the pa later, after blocks are marked
3359 	 * in the on-disk bitmap -- see ext4_mb_release_context().
3360 	 * Other CPUs are prevented from allocating from this pa by lg_mutex.
3361 	 */
3362 	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3363 }
3364 
3365 /*
3366  * Return the prealloc space that has the minimal
3367  * distance from the goal block. @cpa is the prealloc
3368  * space with the currently known minimal distance
3369  * from the goal block.
3370  */
3371 static struct ext4_prealloc_space *
3372 ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3373 			struct ext4_prealloc_space *pa,
3374 			struct ext4_prealloc_space *cpa)
3375 {
3376 	ext4_fsblk_t cur_distance, new_distance;
3377 
3378 	if (cpa == NULL) {
3379 		atomic_inc(&pa->pa_count);
3380 		return pa;
3381 	}
3382 	cur_distance = abs(goal_block - cpa->pa_pstart);
3383 	new_distance = abs(goal_block - pa->pa_pstart);
3384 
3385 	if (cur_distance <= new_distance)
3386 		return cpa;
3387 
3388 	/* drop the previous reference */
3389 	atomic_dec(&cpa->pa_count);
3390 	atomic_inc(&pa->pa_count);
3391 	return pa;
3392 }
3393 
3394 /*
3395  * search goal blocks in preallocated space
3396  */
3397 static noinline_for_stack int
3398 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3399 {
3400 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3401 	int order, i;
3402 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3403 	struct ext4_locality_group *lg;
3404 	struct ext4_prealloc_space *pa, *cpa = NULL;
3405 	ext4_fsblk_t goal_block;
3406 
3407 	/* only data can be preallocated */
3408 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3409 		return 0;
3410 
3411 	/* first, try per-file preallocation */
3412 	rcu_read_lock();
3413 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3414 
3415 		/* all fields in this condition don't change,
3416 		 * so we can skip locking for them */
3417 		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3418 		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3419 					       EXT4_C2B(sbi, pa->pa_len)))
3420 			continue;
3421 
3422 		/* non-extent files can't have physical blocks past 2^32 */
3423 		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3424 		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3425 		     EXT4_MAX_BLOCK_FILE_PHYS))
3426 			continue;
3427 
3428 		/* found preallocated blocks, use them */
3429 		spin_lock(&pa->pa_lock);
3430 		if (pa->pa_deleted == 0 && pa->pa_free) {
3431 			atomic_inc(&pa->pa_count);
3432 			ext4_mb_use_inode_pa(ac, pa);
3433 			spin_unlock(&pa->pa_lock);
3434 			ac->ac_criteria = 10;
3435 			rcu_read_unlock();
3436 			return 1;
3437 		}
3438 		spin_unlock(&pa->pa_lock);
3439 	}
3440 	rcu_read_unlock();
3441 
3442 	/* can we use group allocation? */
3443 	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3444 		return 0;
3445 
3446 	/* inode may have no locality group for some reason */
3447 	lg = ac->ac_lg;
3448 	if (lg == NULL)
3449 		return 0;
3450 	order  = fls(ac->ac_o_ex.fe_len) - 1;
3451 	if (order > PREALLOC_TB_SIZE - 1)
3452 		/* The max size of hash table is PREALLOC_TB_SIZE */
3453 		order = PREALLOC_TB_SIZE - 1;
3454 
3455 	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3456 	/*
3457 	 * search for the prealloc space that has the
3458 	 * minimal distance from the goal block.
3459 	 */
3460 	for (i = order; i < PREALLOC_TB_SIZE; i++) {
3461 		rcu_read_lock();
3462 		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3463 					pa_inode_list) {
3464 			spin_lock(&pa->pa_lock);
3465 			if (pa->pa_deleted == 0 &&
3466 					pa->pa_free >= ac->ac_o_ex.fe_len) {
3467 
3468 				cpa = ext4_mb_check_group_pa(goal_block,
3469 								pa, cpa);
3470 			}
3471 			spin_unlock(&pa->pa_lock);
3472 		}
3473 		rcu_read_unlock();
3474 	}
3475 	if (cpa) {
3476 		ext4_mb_use_group_pa(ac, cpa);
3477 		ac->ac_criteria = 20;
3478 		return 1;
3479 	}
3480 	return 0;
3481 }
3482 
3483 /*
3484  * the function goes through all blocks freed in the group
3485  * but not yet committed and marks them used in the in-core bitmap.
3486  * The buddy must be generated from this bitmap.
3487  * Must be called with the ext4 group lock held.
3488  */
3489 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3490 						ext4_group_t group)
3491 {
3492 	struct rb_node *n;
3493 	struct ext4_group_info *grp;
3494 	struct ext4_free_data *entry;
3495 
3496 	grp = ext4_get_group_info(sb, group);
3497 	n = rb_first(&(grp->bb_free_root));
3498 
3499 	while (n) {
3500 		entry = rb_entry(n, struct ext4_free_data, efd_node);
3501 		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3502 		n = rb_next(n);
3503 	}
3504 	return;
3505 }
3506 
3507 /*
3508  * the function goes through all preallocations in this group and marks them
3509  * used in the in-core bitmap. The buddy must be generated from this bitmap.
3510  * Must be called with the ext4 group lock held.
3511  */
3512 static noinline_for_stack
3513 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3514 					ext4_group_t group)
3515 {
3516 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3517 	struct ext4_prealloc_space *pa;
3518 	struct list_head *cur;
3519 	ext4_group_t groupnr;
3520 	ext4_grpblk_t start;
3521 	int preallocated = 0;
3522 	int len;
3523 
3524 	/* all forms of preallocation discard first load the group,
3525 	 * so the only competing code is preallocation use.
3526 	 * We don't need any locking here.
3527 	 * Note that we do NOT ignore preallocations with pa_deleted set;
3528 	 * otherwise we could leave used blocks available for
3529 	 * allocation in the buddy when a concurrent ext4_mb_put_pa()
3530 	 * is dropping the preallocation.
3531 	 */
3532 	list_for_each(cur, &grp->bb_prealloc_list) {
3533 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3534 		spin_lock(&pa->pa_lock);
3535 		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3536 					     &groupnr, &start);
3537 		len = pa->pa_len;
3538 		spin_unlock(&pa->pa_lock);
3539 		if (unlikely(len == 0))
3540 			continue;
3541 		BUG_ON(groupnr != group);
3542 		ext4_set_bits(bitmap, start, len);
3543 		preallocated += len;
3544 	}
3545 	mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3546 }
3547 
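/*
 * RCU callback: free a preallocation descriptor once no reader can
 * still be traversing the RCU-protected lists it was removed from.
 */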
3548 static void ext4_mb_pa_callback(struct rcu_head *head)
3549 {
3550 	struct ext4_prealloc_space *pa;
3551 	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3552 
3553 	BUG_ON(atomic_read(&pa->pa_count));
3554 	BUG_ON(pa->pa_deleted == 0);
3555 	kmem_cache_free(ext4_pspace_cachep, pa);
3556 }
3557 
3558 /*
3559  * drops a reference to preallocated space descriptor
3560  * if this was the last reference and the space is consumed
3561  */
3562 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3563 			struct super_block *sb, struct ext4_prealloc_space *pa)
3564 {
3565 	ext4_group_t grp;
3566 	ext4_fsblk_t grp_blk;
3567 
3568 	/* in this short window concurrent discard can set pa_deleted */
3569 	spin_lock(&pa->pa_lock);
3570 	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3571 		spin_unlock(&pa->pa_lock);
3572 		return;
3573 	}
3574 
3575 	if (pa->pa_deleted == 1) {
3576 		spin_unlock(&pa->pa_lock);
3577 		return;
3578 	}
3579 
3580 	pa->pa_deleted = 1;
3581 	spin_unlock(&pa->pa_lock);
3582 
3583 	grp_blk = pa->pa_pstart;
3584 	/*
3585 	 * If doing group-based preallocation, pa_pstart may be in the
3586 	 * next group when pa is used up
3587 	 */
3588 	if (pa->pa_type == MB_GROUP_PA)
3589 		grp_blk--;
3590 
3591 	grp = ext4_get_group_number(sb, grp_blk);
3592 
3593 	/*
3594 	 * possible race:
3595 	 *
3596 	 *  P1 (buddy init)			P2 (regular allocation)
3597 	 *					find block B in PA
3598 	 *  copy on-disk bitmap to buddy
3599 	 *  					mark B in on-disk bitmap
3600 	 *					drop PA from group
3601 	 *  mark all PAs in buddy
3602 	 *
3603 	 * thus, P1 initializes buddy with B available. to prevent this
3604 	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3605 	 * against that pair
3606 	 */
3607 	ext4_lock_group(sb, grp);
3608 	list_del(&pa->pa_group_list);
3609 	ext4_unlock_group(sb, grp);
3610 
3611 	spin_lock(pa->pa_obj_lock);
3612 	list_del_rcu(&pa->pa_inode_list);
3613 	spin_unlock(pa->pa_obj_lock);
3614 
3615 	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3616 }
3617 
3618 /*
3619  * creates new preallocated space for given inode
3620  */
3621 static noinline_for_stack int
3622 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3623 {
3624 	struct super_block *sb = ac->ac_sb;
3625 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3626 	struct ext4_prealloc_space *pa;
3627 	struct ext4_group_info *grp;
3628 	struct ext4_inode_info *ei;
3629 
3630 	/* preallocate only when found space is larger than requested */
3631 	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3632 	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3633 	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3634 
3635 	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3636 	if (pa == NULL)
3637 		return -ENOMEM;
3638 
3639 	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3640 		int winl;
3641 		int wins;
3642 		int win;
3643 		int offs;
3644 
3645 		/* we can't allocate as much as the normalizer wants,
3646 		 * so the found space must get a proper lstart
3647 		 * to cover the original request */
3648 		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3649 		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3650 
3651 		/* we're limited by the original request in that its
3652 		 * logical block must be covered either way;
3653 		 * winl is the window we can move our chunk within */
3654 		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3655 
3656 		/* also, we should cover whole original request */
3657 		wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3658 
3659 		/* the smallest one defines real window */
3660 		win = min(winl, wins);
3661 
3662 		offs = ac->ac_o_ex.fe_logical %
3663 			EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3664 		if (offs && offs < win)
3665 			win = offs;
3666 
3667 		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3668 			EXT4_NUM_B2C(sbi, win);
3669 		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3670 		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3671 	}
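	/*
	 * Illustrative example of the window calculation above (assuming
	 * one block per cluster): original request at logical 100, len 8;
	 * the normalized goal started at 96 but only 16 blocks were found.
	 * Then winl = 100 - 96 = 4, wins = 16 - 8 = 8, win = 4,
	 * offs = 100 % 16 = 4 (not < win), so fe_logical = 100 - 4 = 96
	 * and the original request is still fully covered.
	 */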
3672 
3673 	/* preallocation can change ac_b_ex, thus we store actually
3674 	 * allocated blocks for history */
3675 	ac->ac_f_ex = ac->ac_b_ex;
3676 
3677 	pa->pa_lstart = ac->ac_b_ex.fe_logical;
3678 	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3679 	pa->pa_len = ac->ac_b_ex.fe_len;
3680 	pa->pa_free = pa->pa_len;
3681 	atomic_set(&pa->pa_count, 1);
3682 	spin_lock_init(&pa->pa_lock);
3683 	INIT_LIST_HEAD(&pa->pa_inode_list);
3684 	INIT_LIST_HEAD(&pa->pa_group_list);
3685 	pa->pa_deleted = 0;
3686 	pa->pa_type = MB_INODE_PA;
3687 
3688 	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3689 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3690 	trace_ext4_mb_new_inode_pa(ac, pa);
3691 
3692 	ext4_mb_use_inode_pa(ac, pa);
3693 	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3694 
3695 	ei = EXT4_I(ac->ac_inode);
3696 	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3697 
3698 	pa->pa_obj_lock = &ei->i_prealloc_lock;
3699 	pa->pa_inode = ac->ac_inode;
3700 
3701 	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3702 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3703 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3704 
3705 	spin_lock(pa->pa_obj_lock);
3706 	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3707 	spin_unlock(pa->pa_obj_lock);
3708 
3709 	return 0;
3710 }
3711 
3712 /*
3713  * creates new preallocated space for the locality group the inode belongs to
3714  */
3715 static noinline_for_stack int
3716 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3717 {
3718 	struct super_block *sb = ac->ac_sb;
3719 	struct ext4_locality_group *lg;
3720 	struct ext4_prealloc_space *pa;
3721 	struct ext4_group_info *grp;
3722 
3723 	/* preallocate only when found space is larger than requested */
3724 	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3725 	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3726 	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3727 
3728 	BUG_ON(ext4_pspace_cachep == NULL);
3729 	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3730 	if (pa == NULL)
3731 		return -ENOMEM;
3732 
3733 	/* preallocation can change ac_b_ex, thus we store actually
3734 	 * allocated blocks for history */
3735 	ac->ac_f_ex = ac->ac_b_ex;
3736 
3737 	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3738 	pa->pa_lstart = pa->pa_pstart;
3739 	pa->pa_len = ac->ac_b_ex.fe_len;
3740 	pa->pa_free = pa->pa_len;
3741 	atomic_set(&pa->pa_count, 1);
3742 	spin_lock_init(&pa->pa_lock);
3743 	INIT_LIST_HEAD(&pa->pa_inode_list);
3744 	INIT_LIST_HEAD(&pa->pa_group_list);
3745 	pa->pa_deleted = 0;
3746 	pa->pa_type = MB_GROUP_PA;
3747 
3748 	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3749 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3750 	trace_ext4_mb_new_group_pa(ac, pa);
3751 
3752 	ext4_mb_use_group_pa(ac, pa);
3753 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3754 
3755 	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3756 	lg = ac->ac_lg;
3757 	BUG_ON(lg == NULL);
3758 
3759 	pa->pa_obj_lock = &lg->lg_prealloc_lock;
3760 	pa->pa_inode = NULL;
3761 
3762 	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3763 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3764 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3765 
3766 	/*
3767 	 * We will later add the new pa to the right bucket
3768 	 * after updating the pa_free in ext4_mb_release_context
3769 	 */
3770 	return 0;
3771 }
3772 
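/*
 * Turn the surplus of the just-found extent into a new preallocation:
 * a locality-group PA for group allocations, an inode PA otherwise.
 */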
3773 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3774 {
3775 	int err;
3776 
3777 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3778 		err = ext4_mb_new_group_pa(ac);
3779 	else
3780 		err = ext4_mb_new_inode_pa(ac);
3781 	return err;
3782 }
3783 
3784 /*
3785  * finds all unused blocks in on-disk bitmap, frees them in
3786  * in-core bitmap and buddy.
3787  * @pa must be unlinked from inode and group lists, so that
3788  * nobody else can find/use it.
3789  * the caller MUST hold group/inode locks.
3790  * TODO: optimize the case when there are no in-core structures yet
3791  */
3792 static noinline_for_stack int
3793 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3794 			struct ext4_prealloc_space *pa)
3795 {
3796 	struct super_block *sb = e4b->bd_sb;
3797 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3798 	unsigned int end;
3799 	unsigned int next;
3800 	ext4_group_t group;
3801 	ext4_grpblk_t bit;
3802 	unsigned long long grp_blk_start;
3803 	int err = 0;
3804 	int free = 0;
3805 
3806 	BUG_ON(pa->pa_deleted == 0);
3807 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3808 	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3809 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3810 	end = bit + pa->pa_len;
3811 
3812 	while (bit < end) {
3813 		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3814 		if (bit >= end)
3815 			break;
3816 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3817 		mb_debug(1, "    free preallocated %u/%u in group %u\n",
3818 			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3819 			 (unsigned) next - bit, (unsigned) group);
3820 		free += next - bit;
3821 
3822 		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3823 		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3824 						    EXT4_C2B(sbi, bit)),
3825 					       next - bit);
3826 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3827 		bit = next + 1;
3828 	}
3829 	if (free != pa->pa_free) {
3830 		ext4_msg(e4b->bd_sb, KERN_CRIT,
3831 			 "pa %p: logic %lu, phys. %lu, len %lu",
3832 			 pa, (unsigned long) pa->pa_lstart,
3833 			 (unsigned long) pa->pa_pstart,
3834 			 (unsigned long) pa->pa_len);
3835 		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3836 					free, pa->pa_free);
3837 		/*
3838 		 * pa is already deleted so we use the value obtained
3839 		 * from the bitmap and continue.
3840 		 */
3841 	}
3842 	atomic_add(free, &sbi->s_mb_discarded);
3843 
3844 	return err;
3845 }
3846 
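/*
 * Release an already-deleted locality group preallocation: give its
 * unused clusters back to the in-core buddy and account them as
 * discarded.
 */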
3847 static noinline_for_stack int
3848 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3849 				struct ext4_prealloc_space *pa)
3850 {
3851 	struct super_block *sb = e4b->bd_sb;
3852 	ext4_group_t group;
3853 	ext4_grpblk_t bit;
3854 
3855 	trace_ext4_mb_release_group_pa(sb, pa);
3856 	BUG_ON(pa->pa_deleted == 0);
3857 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3858 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3859 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3860 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3861 	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3862 
3863 	return 0;
3864 }
3865 
3866 /*
3867  * releases all preallocations in given group
3868  *
3869  * first, we need to decide discard policy:
3870  * - when do we discard
3871  *   1) ENOSPC
3872  * - how many do we discard
3873  *   1) how many requested
3874  */
3875 static noinline_for_stack int
3876 ext4_mb_discard_group_preallocations(struct super_block *sb,
3877 					ext4_group_t group, int needed)
3878 {
3879 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3880 	struct buffer_head *bitmap_bh = NULL;
3881 	struct ext4_prealloc_space *pa, *tmp;
3882 	struct list_head list;
3883 	struct ext4_buddy e4b;
3884 	int err;
3885 	int busy = 0;
3886 	int free = 0;
3887 
3888 	mb_debug(1, "discard preallocation for group %u\n", group);
3889 
3890 	if (list_empty(&grp->bb_prealloc_list))
3891 		return 0;
3892 
3893 	bitmap_bh = ext4_read_block_bitmap(sb, group);
3894 	if (IS_ERR(bitmap_bh)) {
3895 		err = PTR_ERR(bitmap_bh);
3896 		ext4_error(sb, "Error %d reading block bitmap for %u",
3897 			   err, group);
3898 		return 0;
3899 	}
3900 
3901 	err = ext4_mb_load_buddy(sb, group, &e4b);
3902 	if (err) {
3903 		ext4_warning(sb, "Error %d loading buddy information for %u",
3904 			     err, group);
3905 		put_bh(bitmap_bh);
3906 		return 0;
3907 	}
3908 
3909 	if (needed == 0)
3910 		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3911 
3912 	INIT_LIST_HEAD(&list);
3913 repeat:
3914 	ext4_lock_group(sb, group);
3915 	list_for_each_entry_safe(pa, tmp,
3916 				&grp->bb_prealloc_list, pa_group_list) {
3917 		spin_lock(&pa->pa_lock);
3918 		if (atomic_read(&pa->pa_count)) {
3919 			spin_unlock(&pa->pa_lock);
3920 			busy = 1;
3921 			continue;
3922 		}
3923 		if (pa->pa_deleted) {
3924 			spin_unlock(&pa->pa_lock);
3925 			continue;
3926 		}
3927 
3928 		/* seems this one can be freed ... */
3929 		pa->pa_deleted = 1;
3930 
3931 		/* we can trust pa_free ... */
3932 		free += pa->pa_free;
3933 
3934 		spin_unlock(&pa->pa_lock);
3935 
3936 		list_del(&pa->pa_group_list);
3937 		list_add(&pa->u.pa_tmp_list, &list);
3938 	}
3939 
3940 	/* if we still need more blocks and some PAs were used, try again */
3941 	if (free < needed && busy) {
3942 		busy = 0;
3943 		ext4_unlock_group(sb, group);
3944 		cond_resched();
3945 		goto repeat;
3946 	}
3947 
3948 	/* found anything to free? */
3949 	if (list_empty(&list)) {
3950 		BUG_ON(free != 0);
3951 		goto out;
3952 	}
3953 
3954 	/* now free all selected PAs */
3955 	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3956 
3957 		/* remove from object (inode or locality group) */
3958 		spin_lock(pa->pa_obj_lock);
3959 		list_del_rcu(&pa->pa_inode_list);
3960 		spin_unlock(pa->pa_obj_lock);
3961 
3962 		if (pa->pa_type == MB_GROUP_PA)
3963 			ext4_mb_release_group_pa(&e4b, pa);
3964 		else
3965 			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3966 
3967 		list_del(&pa->u.pa_tmp_list);
3968 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3969 	}
3970 
3971 out:
3972 	ext4_unlock_group(sb, group);
3973 	ext4_mb_unload_buddy(&e4b);
3974 	put_bh(bitmap_bh);
3975 	return free;
3976 }
3977 
3978 /*
3979  * releases all unused preallocated blocks for the given inode
3980  *
3981  * It's important to discard preallocations under i_data_sem
3982  * We don't want another block to be served from the prealloc
3983  * space when we are discarding the inode prealloc space.
3984  *
3985  * FIXME!! Make sure it is valid at all the call sites
3986  */
3987 void ext4_discard_preallocations(struct inode *inode)
3988 {
3989 	struct ext4_inode_info *ei = EXT4_I(inode);
3990 	struct super_block *sb = inode->i_sb;
3991 	struct buffer_head *bitmap_bh = NULL;
3992 	struct ext4_prealloc_space *pa, *tmp;
3993 	ext4_group_t group = 0;
3994 	struct list_head list;
3995 	struct ext4_buddy e4b;
3996 	int err;
3997 
3998 	if (!S_ISREG(inode->i_mode)) {
3999 		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
4000 		return;
4001 	}
4002 
4003 	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
4004 	trace_ext4_discard_preallocations(inode);
4005 
4006 	INIT_LIST_HEAD(&list);
4007 
4008 repeat:
4009 	/* first, collect all pa's in the inode */
4010 	spin_lock(&ei->i_prealloc_lock);
4011 	while (!list_empty(&ei->i_prealloc_list)) {
4012 		pa = list_entry(ei->i_prealloc_list.next,
4013 				struct ext4_prealloc_space, pa_inode_list);
4014 		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
4015 		spin_lock(&pa->pa_lock);
4016 		if (atomic_read(&pa->pa_count)) {
4017 			/* this shouldn't happen often - nobody should
4018 			 * use preallocation while we're discarding it */
4019 			spin_unlock(&pa->pa_lock);
4020 			spin_unlock(&ei->i_prealloc_lock);
4021 			ext4_msg(sb, KERN_ERR,
4022 				 "uh-oh! used pa while discarding");
4023 			WARN_ON(1);
4024 			schedule_timeout_uninterruptible(HZ);
4025 			goto repeat;
4026 
4027 		}
4028 		if (pa->pa_deleted == 0) {
4029 			pa->pa_deleted = 1;
4030 			spin_unlock(&pa->pa_lock);
4031 			list_del_rcu(&pa->pa_inode_list);
4032 			list_add(&pa->u.pa_tmp_list, &list);
4033 			continue;
4034 		}
4035 
4036 		/* someone is deleting pa right now */
4037 		spin_unlock(&pa->pa_lock);
4038 		spin_unlock(&ei->i_prealloc_lock);
4039 
4040 		/* we have to wait here because pa_deleted
4041 		 * doesn't mean pa is already unlinked from
4042 		 * the list. As we might be called from
4043 		 * ->clear_inode(), the inode will get freed
4044 		 * and a concurrent thread which is unlinking
4045 		 * pa from the inode's list may access already
4046 		 * freed memory -- bad-bad-bad */
4047 
4048 		/* XXX: if this happens too often, we can
4049 		 * add a flag to force wait only in case
4050 		 * of ->clear_inode(), but not in case of
4051 		 * regular truncate */
4052 		schedule_timeout_uninterruptible(HZ);
4053 		goto repeat;
4054 	}
4055 	spin_unlock(&ei->i_prealloc_lock);
4056 
4057 	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4058 		BUG_ON(pa->pa_type != MB_INODE_PA);
4059 		group = ext4_get_group_number(sb, pa->pa_pstart);
4060 
4061 		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4062 					     GFP_NOFS|__GFP_NOFAIL);
4063 		if (err) {
4064 			ext4_error(sb, "Error %d loading buddy information for %u",
4065 				   err, group);
4066 			continue;
4067 		}
4068 
4069 		bitmap_bh = ext4_read_block_bitmap(sb, group);
4070 		if (IS_ERR(bitmap_bh)) {
4071 			err = PTR_ERR(bitmap_bh);
4072 			ext4_error(sb, "Error %d reading block bitmap for %u",
4073 					err, group);
4074 			ext4_mb_unload_buddy(&e4b);
4075 			continue;
4076 		}
4077 
4078 		ext4_lock_group(sb, group);
4079 		list_del(&pa->pa_group_list);
4080 		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4081 		ext4_unlock_group(sb, group);
4082 
4083 		ext4_mb_unload_buddy(&e4b);
4084 		put_bh(bitmap_bh);
4085 
4086 		list_del(&pa->u.pa_tmp_list);
4087 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4088 	}
4089 }
4090 
4091 #ifdef CONFIG_EXT4_DEBUG
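/*
 * Dump the allocation context (original, goal and best-found extents)
 * plus each group's preallocations and free/fragment counts to the
 * kernel log; used to diagnose failed allocations.
 */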
4092 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4093 {
4094 	struct super_block *sb = ac->ac_sb;
4095 	ext4_group_t ngroups, i;
4096 
4097 	if (!ext4_mballoc_debug ||
4098 	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4099 		return;
4100 
4101 	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4102 			" Allocation context details:");
4103 	ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4104 			ac->ac_status, ac->ac_flags);
4105 	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4106 		 	"goal %lu/%lu/%lu@%lu, "
4107 			"best %lu/%lu/%lu@%lu cr %d",
4108 			(unsigned long)ac->ac_o_ex.fe_group,
4109 			(unsigned long)ac->ac_o_ex.fe_start,
4110 			(unsigned long)ac->ac_o_ex.fe_len,
4111 			(unsigned long)ac->ac_o_ex.fe_logical,
4112 			(unsigned long)ac->ac_g_ex.fe_group,
4113 			(unsigned long)ac->ac_g_ex.fe_start,
4114 			(unsigned long)ac->ac_g_ex.fe_len,
4115 			(unsigned long)ac->ac_g_ex.fe_logical,
4116 			(unsigned long)ac->ac_b_ex.fe_group,
4117 			(unsigned long)ac->ac_b_ex.fe_start,
4118 			(unsigned long)ac->ac_b_ex.fe_len,
4119 			(unsigned long)ac->ac_b_ex.fe_logical,
4120 			(int)ac->ac_criteria);
4121 	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4122 	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4123 	ngroups = ext4_get_groups_count(sb);
4124 	for (i = 0; i < ngroups; i++) {
4125 		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4126 		struct ext4_prealloc_space *pa;
4127 		ext4_grpblk_t start;
4128 		struct list_head *cur;
4129 		ext4_lock_group(sb, i);
4130 		list_for_each(cur, &grp->bb_prealloc_list) {
4131 			pa = list_entry(cur, struct ext4_prealloc_space,
4132 					pa_group_list);
4133 			spin_lock(&pa->pa_lock);
4134 			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4135 						     NULL, &start);
4136 			spin_unlock(&pa->pa_lock);
4137 			printk(KERN_ERR "PA:%u:%d:%u \n", i,
4138 			       start, pa->pa_len);
4139 		}
4140 		ext4_unlock_group(sb, i);
4141 
4142 		if (grp->bb_free == 0)
4143 			continue;
4144 		printk(KERN_ERR "%u: %d/%d \n",
4145 		       i, grp->bb_free, grp->bb_fragments);
4146 	}
4147 	printk(KERN_ERR "\n");
4148 }
4149 #else
4150 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4151 {
4152 	return;
4153 }
4154 #endif
4155 
4156 /*
4157  * We use locality group preallocation for small files. The size of the
4158  * file is determined by the current size or the resulting size after
4159  * allocation, whichever is larger.
4160  *
4161  * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4162  */
4163 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4164 {
4165 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4166 	int bsbits = ac->ac_sb->s_blocksize_bits;
4167 	loff_t size, isize;
4168 
4169 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4170 		return;
4171 
4172 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4173 		return;
4174 
4175 	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4176 	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4177 		>> bsbits;
4178 
4179 	if ((size == isize) &&
4180 	    !ext4_fs_is_busy(sbi) &&
4181 	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4182 		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 		return;
4184 	}
4185 
4186 	if (sbi->s_mb_group_prealloc <= 0) {
4187 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4188 		return;
4189 	}
4190 
4191 	/* don't use group allocation for large files */
4192 	size = max(size, isize);
4193 	if (size > sbi->s_mb_stream_request) {
4194 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4195 		return;
4196 	}
4197 
4198 	BUG_ON(ac->ac_lg != NULL);
4199 	/*
4200 	 * locality group prealloc space is per-CPU. The reason for having
4201 	 * a per-CPU locality group is to reduce the contention between block
4202 	 * requests from multiple CPUs.
4203 	 */
4204 	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4205 
4206 	/* we're going to use group allocation */
4207 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4208 
4209 	/* serialize all allocations in the group */
4210 	mutex_lock(&ac->ac_lg->lg_mutex);
4211 }
4212 
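/*
 * Set up an allocation context from the caller's request: clamp the
 * length to one group, sanitize the goal block, fill in the original
 * and goal extents, and pick the per-file vs. locality-group policy
 * via ext4_mb_group_or_file().
 */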
4213 static noinline_for_stack int
4214 ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4215 				struct ext4_allocation_request *ar)
4216 {
4217 	struct super_block *sb = ar->inode->i_sb;
4218 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4219 	struct ext4_super_block *es = sbi->s_es;
4220 	ext4_group_t group;
4221 	unsigned int len;
4222 	ext4_fsblk_t goal;
4223 	ext4_grpblk_t block;
4224 
4225 	/* we can't allocate > group size */
4226 	len = ar->len;
4227 
4228 	/* just a dirty hack to filter too big requests  */
4229 	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4230 		len = EXT4_CLUSTERS_PER_GROUP(sb);
4231 
4232 	/* start searching from the goal */
4233 	goal = ar->goal;
4234 	if (goal < le32_to_cpu(es->s_first_data_block) ||
4235 			goal >= ext4_blocks_count(es))
4236 		goal = le32_to_cpu(es->s_first_data_block);
4237 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
4238 
4239 	/* set up allocation goals */
4240 	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4241 	ac->ac_status = AC_STATUS_CONTINUE;
4242 	ac->ac_sb = sb;
4243 	ac->ac_inode = ar->inode;
4244 	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4245 	ac->ac_o_ex.fe_group = group;
4246 	ac->ac_o_ex.fe_start = block;
4247 	ac->ac_o_ex.fe_len = len;
4248 	ac->ac_g_ex = ac->ac_o_ex;
4249 	ac->ac_flags = ar->flags;
4250 
4251 	/* we have to define the context: will we work with a file or a
4252 	 * locality group? this is a policy decision, actually */
4253 	ext4_mb_group_or_file(ac);
4254 
4255 	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4256 			"left: %u/%u, right %u/%u to %swritable\n",
4257 			(unsigned) ar->len, (unsigned) ar->logical,
4258 			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4259 			(unsigned) ar->lleft, (unsigned) ar->pleft,
4260 			(unsigned) ar->lright, (unsigned) ar->pright,
4261 			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4262 	return 0;
4263 
4264 }
4265 
4266 static noinline_for_stack void
4267 ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 					struct ext4_locality_group *lg,
4269 					int order, int total_entries)
4270 {
4271 	ext4_group_t group = 0;
4272 	struct ext4_buddy e4b;
4273 	struct list_head discard_list;
4274 	struct ext4_prealloc_space *pa, *tmp;
4275 
4276 	mb_debug(1, "discard locality group preallocation\n");
4277 
4278 	INIT_LIST_HEAD(&discard_list);
4279 
4280 	spin_lock(&lg->lg_prealloc_lock);
4281 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4282 						pa_inode_list) {
4283 		spin_lock(&pa->pa_lock);
4284 		if (atomic_read(&pa->pa_count)) {
4285 			/*
4286 			 * This is the pa that we just used
4287 			 * for block allocation. So don't
4288 			 * free that
4289 			 */
4290 			spin_unlock(&pa->pa_lock);
4291 			continue;
4292 		}
4293 		if (pa->pa_deleted) {
4294 			spin_unlock(&pa->pa_lock);
4295 			continue;
4296 		}
4297 		/* only lg prealloc space */
4298 		BUG_ON(pa->pa_type != MB_GROUP_PA);
4299 
4300 		/* seems this one can be freed ... */
4301 		pa->pa_deleted = 1;
4302 		spin_unlock(&pa->pa_lock);
4303 
4304 		list_del_rcu(&pa->pa_inode_list);
4305 		list_add(&pa->u.pa_tmp_list, &discard_list);
4306 
4307 		total_entries--;
4308 		if (total_entries <= 5) {
4309 			/*
4310 			 * we want to keep only 5 entries
4311 			 * allowing it to grow to 8. This
4312 			 * makes sure we don't call discard
4313 			 * soon for this list.
4314 			 */
4315 			break;
4316 		}
4317 	}
4318 	spin_unlock(&lg->lg_prealloc_lock);
4319 
4320 	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4321 		int err;
4322 
4323 		group = ext4_get_group_number(sb, pa->pa_pstart);
4324 		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4325 					     GFP_NOFS|__GFP_NOFAIL);
4326 		if (err) {
4327 			ext4_error(sb, "Error %d loading buddy information for %u",
4328 				   err, group);
4329 			continue;
4330 		}
4331 		ext4_lock_group(sb, group);
4332 		list_del(&pa->pa_group_list);
4333 		ext4_mb_release_group_pa(&e4b, pa);
4334 		ext4_unlock_group(sb, group);
4335 
4336 		ext4_mb_unload_buddy(&e4b);
4337 		list_del(&pa->u.pa_tmp_list);
4338 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4339 	}
4340 }
4341 
4342 /*
4343  * We have incremented pa_count. So it cannot be freed at this
4344  * point. Also we hold lg_mutex. So no parallel allocation is
4345  * possible from this lg. That means pa_free cannot be updated.
4346  *
4347  * A parallel ext4_mb_discard_group_preallocations is possible,
4348  * which can cause the lg_prealloc_list to be updated.
4349  */
4350 
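/*
 * Put the group PA back into its locality group list: pick the bucket
 * by fls(pa_free), keep the list roughly sorted by pa_free, and if the
 * bucket has grown beyond 8 entries trim it back via
 * ext4_mb_discard_lg_preallocations().
 */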
4351 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4352 {
4353 	int order, added = 0, lg_prealloc_count = 1;
4354 	struct super_block *sb = ac->ac_sb;
4355 	struct ext4_locality_group *lg = ac->ac_lg;
4356 	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4357 
4358 	order = fls(pa->pa_free) - 1;
4359 	if (order > PREALLOC_TB_SIZE - 1)
4360 		/* The max size of hash table is PREALLOC_TB_SIZE */
4361 		order = PREALLOC_TB_SIZE - 1;
4362 	/* Add the prealloc space to lg */
4363 	spin_lock(&lg->lg_prealloc_lock);
4364 	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4365 						pa_inode_list) {
4366 		spin_lock(&tmp_pa->pa_lock);
4367 		if (tmp_pa->pa_deleted) {
4368 			spin_unlock(&tmp_pa->pa_lock);
4369 			continue;
4370 		}
4371 		if (!added && pa->pa_free < tmp_pa->pa_free) {
4372 			/* Add to the tail of the previous entry */
4373 			list_add_tail_rcu(&pa->pa_inode_list,
4374 						&tmp_pa->pa_inode_list);
4375 			added = 1;
4376 			/*
4377 			 * we want to count the total
4378 			 * number of entries in the list
4379 			 */
4380 		}
4381 		spin_unlock(&tmp_pa->pa_lock);
4382 		lg_prealloc_count++;
4383 	}
4384 	if (!added)
4385 		list_add_tail_rcu(&pa->pa_inode_list,
4386 					&lg->lg_prealloc_list[order]);
4387 	spin_unlock(&lg->lg_prealloc_lock);
4388 
4389 	/* Now trim the list to be not more than 8 elements */
4390 	if (lg_prealloc_count > 8) {
4391 		ext4_mb_discard_lg_preallocations(sb, lg,
4392 						  order, lg_prealloc_count);
4393 		return;
4394 	}
4395 	return;
4396 }
4397 
4398 /*
4399  * release all resources we used in the allocation
4400  */
4401 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4402 {
4403 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4404 	struct ext4_prealloc_space *pa = ac->ac_pa;
4405 	if (pa) {
4406 		if (pa->pa_type == MB_GROUP_PA) {
4407 			/* see comment in ext4_mb_use_group_pa() */
4408 			spin_lock(&pa->pa_lock);
4409 			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4410 			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4411 			pa->pa_free -= ac->ac_b_ex.fe_len;
4412 			pa->pa_len -= ac->ac_b_ex.fe_len;
4413 			spin_unlock(&pa->pa_lock);
4414 		}
4415 	}
4416 	if (pa) {
4417 		/*
4418 		 * We want to add the pa to the right bucket.
4419 		 * Remove it from the list and while adding
4420 		 * make sure the list to which we are adding
4421 		 * doesn't grow big.
4422 		 */
4423 		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4424 			spin_lock(pa->pa_obj_lock);
4425 			list_del_rcu(&pa->pa_inode_list);
4426 			spin_unlock(pa->pa_obj_lock);
4427 			ext4_mb_add_n_trim(ac);
4428 		}
4429 		ext4_mb_put_pa(ac, ac->ac_sb, pa);
4430 	}
4431 	if (ac->ac_bitmap_page)
4432 		page_cache_release(ac->ac_bitmap_page);
4433 	if (ac->ac_buddy_page)
4434 		page_cache_release(ac->ac_buddy_page);
4435 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4436 		mutex_unlock(&ac->ac_lg->lg_mutex);
4437 	ext4_mb_collect_stats(ac);
4438 	return 0;
4439 }
4440 
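/*
 * Discard group preallocations across the filesystem until at least
 * 'needed' clusters have been freed or every group has been scanned;
 * returns the number of clusters actually freed.
 */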
4441 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4442 {
4443 	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4444 	int ret;
4445 	int freed = 0;
4446 
4447 	trace_ext4_mb_discard_preallocations(sb, needed);
4448 	for (i = 0; i < ngroups && needed > 0; i++) {
4449 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4450 		freed += ret;
4451 		needed -= ret;
4452 	}
4453 
4454 	return freed;
4455 }
4456 
4457 /*
4458  * Main entry point into mballoc to allocate blocks.
4459  * It tries to use preallocation first, then falls back
4460  * to the usual allocation path.
4461  */
4462 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4463 				struct ext4_allocation_request *ar, int *errp)
4464 {
4465 	int freed;
4466 	struct ext4_allocation_context *ac = NULL;
4467 	struct ext4_sb_info *sbi;
4468 	struct super_block *sb;
4469 	ext4_fsblk_t block = 0;
4470 	unsigned int inquota = 0;
4471 	unsigned int reserv_clstrs = 0;
4472 
4473 	might_sleep();
4474 	sb = ar->inode->i_sb;
4475 	sbi = EXT4_SB(sb);
4476 
4477 	trace_ext4_request_blocks(ar);
4478 
4479 	/* Allow the quota file to use the superuser reservation */
4480 	if (IS_NOQUOTA(ar->inode))
4481 		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4482 
4483 	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4484 		/* Without delayed allocation we need to verify
4485 		 * there are enough free blocks to do block allocation
4486 		 * and verify allocation doesn't exceed the quota limits.
4487 		 */
4488 		while (ar->len &&
4489 			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4490 
4491 			/* let others free the space */
4492 			cond_resched();
4493 			ar->len = ar->len >> 1;
4494 		}
4495 		if (!ar->len) {
4496 			*errp = -ENOSPC;
4497 			return 0;
4498 		}
4499 		reserv_clstrs = ar->len;
4500 		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4501 			dquot_alloc_block_nofail(ar->inode,
4502 						 EXT4_C2B(sbi, ar->len));
4503 		} else {
4504 			while (ar->len &&
4505 				dquot_alloc_block(ar->inode,
4506 						  EXT4_C2B(sbi, ar->len))) {
4507 
4508 				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4509 				ar->len--;
4510 			}
4511 		}
4512 		inquota = ar->len;
4513 		if (ar->len == 0) {
4514 			*errp = -EDQUOT;
4515 			goto out;
4516 		}
4517 	}
4518 
4519 	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4520 	if (!ac) {
4521 		ar->len = 0;
4522 		*errp = -ENOMEM;
4523 		goto out;
4524 	}
4525 
4526 	*errp = ext4_mb_initialize_context(ac, ar);
4527 	if (*errp) {
4528 		ar->len = 0;
4529 		goto out;
4530 	}
4531 
4532 	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4533 	if (!ext4_mb_use_preallocated(ac)) {
4534 		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4535 		ext4_mb_normalize_request(ac, ar);
4536 repeat:
4537 		/* allocate space in core */
4538 		*errp = ext4_mb_regular_allocator(ac);
4539 		if (*errp)
4540 			goto discard_and_exit;
4541 
4542 		/* as we've just preallocated more space than the
4543 		 * user originally requested, we store the allocated
4544 		 * space in a special descriptor */
4545 		if (ac->ac_status == AC_STATUS_FOUND &&
4546 		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4547 			*errp = ext4_mb_new_preallocation(ac);
4548 		if (*errp) {
4549 		discard_and_exit:
4550 			ext4_discard_allocated_blocks(ac);
4551 			goto errout;
4552 		}
4553 	}
4554 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4555 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4556 		if (*errp) {
4557 			ext4_discard_allocated_blocks(ac);
4558 			goto errout;
4559 		} else {
4560 			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4561 			ar->len = ac->ac_b_ex.fe_len;
4562 		}
4563 	} else {
4564 		freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4565 		if (freed)
4566 			goto repeat;
4567 		*errp = -ENOSPC;
4568 	}
4569 
4570 errout:
4571 	if (*errp) {
4572 		ac->ac_b_ex.fe_len = 0;
4573 		ar->len = 0;
4574 		ext4_mb_show_ac(ac);
4575 	}
4576 	ext4_mb_release_context(ac);
4577 out:
4578 	if (ac)
4579 		kmem_cache_free(ext4_ac_cachep, ac);
4580 	if (inquota && ar->len < inquota)
4581 		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4582 	if (!ar->len) {
4583 		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4584 			/* release all the reserved blocks if non delalloc */
4585 			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4586 						reserv_clstrs);
4587 	}
4588 
4589 	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4590 
4591 	return block;
4592 }
4593 
4594 /*
4595  * We can merge two free data extents only if the physical blocks
4596  * are contiguous, AND the extents were freed by the same transaction,
4597  * AND the blocks are associated with the same group.
4598  */
4599 static int can_merge(struct ext4_free_data *entry1,
4600 			struct ext4_free_data *entry2)
4601 {
4602 	if ((entry1->efd_tid == entry2->efd_tid) &&
4603 	    (entry1->efd_group == entry2->efd_group) &&
4604 	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4605 		return 1;
4606 	return 0;
4607 }
4608 
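/*
 * Record a freed extent in the group's red-black tree of blocks that
 * must not be reused before the current transaction commits, merging
 * with adjacent entries freed in the same transaction, and register
 * the journal callback that releases them at commit time.
 */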
4609 static noinline_for_stack int
4610 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4611 		      struct ext4_free_data *new_entry)
4612 {
4613 	ext4_group_t group = e4b->bd_group;
4614 	ext4_grpblk_t cluster;
4615 	struct ext4_free_data *entry;
4616 	struct ext4_group_info *db = e4b->bd_info;
4617 	struct super_block *sb = e4b->bd_sb;
4618 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4619 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
4620 	struct rb_node *parent = NULL, *new_node;
4621 
4622 	BUG_ON(!ext4_handle_valid(handle));
4623 	BUG_ON(e4b->bd_bitmap_page == NULL);
4624 	BUG_ON(e4b->bd_buddy_page == NULL);
4625 
4626 	new_node = &new_entry->efd_node;
4627 	cluster = new_entry->efd_start_cluster;
4628 
4629 	if (!*n) {
4630 		/* first free block extent. We need to
4631 		 * protect the buddy cache from being freed,
4632 		 * otherwise we'll refresh it from the
4633 		 * on-disk bitmap and lose not-yet-available
4634 		 * blocks */
4635 		page_cache_get(e4b->bd_buddy_page);
4636 		page_cache_get(e4b->bd_bitmap_page);
4637 	}
4638 	while (*n) {
4639 		parent = *n;
4640 		entry = rb_entry(parent, struct ext4_free_data, efd_node);
4641 		if (cluster < entry->efd_start_cluster)
4642 			n = &(*n)->rb_left;
4643 		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4644 			n = &(*n)->rb_right;
4645 		else {
4646 			ext4_grp_locked_error(sb, group, 0,
4647 				ext4_group_first_block_no(sb, group) +
4648 				EXT4_C2B(sbi, cluster),
4649 				"Block already on to-be-freed list");
4650 			kmem_cache_free(ext4_free_data_cachep, new_entry);
4651 			return 0;
4652 		}
4653 	}
4654 
4655 	rb_link_node(new_node, parent, n);
4656 	rb_insert_color(new_node, &db->bb_free_root);
4657 
4658 	/* Now try to see if the extent can be merged to the left and right */
4659 	node = rb_prev(new_node);
4660 	if (node) {
4661 		entry = rb_entry(node, struct ext4_free_data, efd_node);
4662 		if (can_merge(entry, new_entry) &&
4663 		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4664 			new_entry->efd_start_cluster = entry->efd_start_cluster;
4665 			new_entry->efd_count += entry->efd_count;
4666 			rb_erase(node, &(db->bb_free_root));
4667 			kmem_cache_free(ext4_free_data_cachep, entry);
4668 		}
4669 	}
4670 
4671 	node = rb_next(new_node);
4672 	if (node) {
4673 		entry = rb_entry(node, struct ext4_free_data, efd_node);
4674 		if (can_merge(new_entry, entry) &&
4675 		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4676 			new_entry->efd_count += entry->efd_count;
4677 			rb_erase(node, &(db->bb_free_root));
4678 			kmem_cache_free(ext4_free_data_cachep, entry);
4679 		}
4680 	}
4681 	/* Add the extent to transaction's private list */
4682 	ext4_journal_callback_add(handle, ext4_free_data_callback,
4683 				  &new_entry->efd_jce);
4684 	return 0;
4685 }
4686 
4687 /**
4688  * ext4_free_blocks() -- Free given blocks and update quota
4689  * @handle:		handle for this transaction
4690  * @inode:		inode
4691  * @block:		start physical block to free
4692  * @count:		number of blocks to free
4693  * @flags:		flags used by ext4_free_blocks
4694  */
4695 void ext4_free_blocks(handle_t *handle, struct inode *inode,
4696 		      struct buffer_head *bh, ext4_fsblk_t block,
4697 		      unsigned long count, int flags)
4698 {
4699 	struct buffer_head *bitmap_bh = NULL;
4700 	struct super_block *sb = inode->i_sb;
4701 	struct ext4_group_desc *gdp;
4702 	unsigned int overflow;
4703 	ext4_grpblk_t bit;
4704 	struct buffer_head *gd_bh;
4705 	ext4_group_t block_group;
4706 	struct ext4_sb_info *sbi;
4707 	struct ext4_buddy e4b;
4708 	unsigned int count_clusters;
4709 	int err = 0;
4710 	int ret;
4711 
4712 	might_sleep();
4713 	if (bh) {
4714 		if (block)
4715 			BUG_ON(block != bh->b_blocknr);
4716 		else
4717 			block = bh->b_blocknr;
4718 	}
4719 
4720 	sbi = EXT4_SB(sb);
4721 	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4722 	    !ext4_inode_block_valid(inode, block, count)) {
4723 		ext4_error(sb, "Freeing blocks not in datazone - "
4724 			   "block = %llu, count = %lu", block, count);
4725 		goto error_return;
4726 	}
4727 
4728 	ext4_debug("freeing block %llu\n", block);
4729 	trace_ext4_free_blocks(inode, block, count, flags);
4730 
4731 	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4732 		BUG_ON(count > 1);
4733 
4734 		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4735 			    inode, bh, block);
4736 	}
4737 
4738 	/*
4739 	 * We need to make sure we don't reuse the freed block until
4740 	 * after the transaction is committed, which we can do by
4741 	 * treating the block as metadata, below.  We make an
4742 	 * exception if the inode is to be written in writeback mode
4743 	 * since writeback mode has weak data consistency guarantees.
4744 	 */
4745 	if (!ext4_should_writeback_data(inode))
4746 		flags |= EXT4_FREE_BLOCKS_METADATA;
4747 
4748 	/*
4749 	 * If the extent to be freed does not begin on a cluster
4750 	 * boundary, we need to deal with partial clusters at the
4751 	 * beginning and end of the extent.  Normally we will free
4752 	 * blocks at the beginning or the end unless we are explicitly
4753 	 * requested to avoid doing so.
4754 	 */
4755 	overflow = EXT4_PBLK_COFF(sbi, block);
4756 	if (overflow) {
4757 		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4758 			overflow = sbi->s_cluster_ratio - overflow;
4759 			block += overflow;
4760 			if (count > overflow)
4761 				count -= overflow;
4762 			else
4763 				return;
4764 		} else {
4765 			block -= overflow;
4766 			count += overflow;
4767 		}
4768 	}
4769 	overflow = EXT4_LBLK_COFF(sbi, count);
4770 	if (overflow) {
4771 		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4772 			if (count > overflow)
4773 				count -= overflow;
4774 			else
4775 				return;
4776 		} else
4777 			count += sbi->s_cluster_ratio - overflow;
4778 	}
4779 
4780 	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4781 		int i;
4782 
4783 		for (i = 0; i < count; i++) {
4784 			cond_resched();
4785 			bh = sb_find_get_block(inode->i_sb, block + i);
4786 			if (!bh)
4787 				continue;
4788 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4789 				    inode, bh, block + i);
4790 		}
4791 	}
4792 
4793 do_more:
4794 	overflow = 0;
4795 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4796 
4797 	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4798 			ext4_get_group_info(sb, block_group))))
4799 		return;
4800 
4801 	/*
4802 	 * Check to see if we are freeing blocks across a group
4803 	 * boundary.
4804 	 */
4805 	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4806 		overflow = EXT4_C2B(sbi, bit) + count -
4807 			EXT4_BLOCKS_PER_GROUP(sb);
4808 		count -= overflow;
4809 	}
4810 	count_clusters = EXT4_NUM_B2C(sbi, count);
4811 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4812 	if (IS_ERR(bitmap_bh)) {
4813 		err = PTR_ERR(bitmap_bh);
4814 		bitmap_bh = NULL;
4815 		goto error_return;
4816 	}
4817 	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4818 	if (!gdp) {
4819 		err = -EIO;
4820 		goto error_return;
4821 	}
4822 
4823 	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4824 	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4825 	    in_range(block, ext4_inode_table(sb, gdp),
4826 		     EXT4_SB(sb)->s_itb_per_group) ||
4827 	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
4828 		     EXT4_SB(sb)->s_itb_per_group)) {
4829 
4830 		ext4_error(sb, "Freeing blocks in system zone - "
4831 			   "Block = %llu, count = %lu", block, count);
4832 		/* err = 0. ext4_std_error should be a no op */
4833 		goto error_return;
4834 	}
4835 
4836 	BUFFER_TRACE(bitmap_bh, "getting write access");
4837 	err = ext4_journal_get_write_access(handle, bitmap_bh);
4838 	if (err)
4839 		goto error_return;
4840 
4841 	/*
4842 	 * We are about to modify some metadata.  Call the journal APIs
4843 	 * to unshare ->b_data if a currently-committing transaction is
4844 	 * using it
4845 	 */
4846 	BUFFER_TRACE(gd_bh, "get_write_access");
4847 	err = ext4_journal_get_write_access(handle, gd_bh);
4848 	if (err)
4849 		goto error_return;
4850 #ifdef AGGRESSIVE_CHECK
4851 	{
4852 		int i;
4853 		for (i = 0; i < count_clusters; i++)
4854 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4855 	}
4856 #endif
4857 	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4858 
4859 	/* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4860 	err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4861 				     GFP_NOFS|__GFP_NOFAIL);
4862 	if (err)
4863 		goto error_return;
4864 
4865 	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4866 		struct ext4_free_data *new_entry;
4867 		/*
4868 		 * blocks being freed are metadata. these blocks shouldn't
4869 		 * be used until this transaction is committed
4870 		 *
4871 		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4872 		 * to fail.
4873 		 */
4874 		new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4875 				GFP_NOFS|__GFP_NOFAIL);
4876 		new_entry->efd_start_cluster = bit;
4877 		new_entry->efd_group = block_group;
4878 		new_entry->efd_count = count_clusters;
4879 		new_entry->efd_tid = handle->h_transaction->t_tid;
4880 
4881 		ext4_lock_group(sb, block_group);
4882 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4883 		ext4_mb_free_metadata(handle, &e4b, new_entry);
4884 	} else {
4885 		/* need to update group_info->bb_free and bitmap
4886 		 * with group lock held. generate_buddy looks at
4887 		 * them with the group lock held
4888 		 */
4889 		if (test_opt(sb, DISCARD)) {
4890 			err = ext4_issue_discard(sb, block_group, bit, count,
4891 						 0);
4892 			if (err && err != -EOPNOTSUPP)
4893 				ext4_msg(sb, KERN_WARNING, "discard request in"
4894 					 " group:%d block:%d count:%lu failed"
4895 					 " with %d", block_group, bit, count,
4896 					 err);
4897 		} else
4898 			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4899 
4900 		ext4_lock_group(sb, block_group);
4901 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4902 		mb_free_blocks(inode, &e4b, bit, count_clusters);
4903 	}
4904 
4905 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4906 	ext4_free_group_clusters_set(sb, gdp, ret);
4907 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4908 	ext4_group_desc_csum_set(sb, block_group, gdp);
4909 	ext4_unlock_group(sb, block_group);
4910 
4911 	if (sbi->s_log_groups_per_flex) {
4912 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4913 		atomic64_add(count_clusters,
4914 			     &sbi_array_rcu_deref(sbi, s_flex_groups,
4915 						  flex_group)->free_clusters);
4916 	}
4917 
4918 	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4919 		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4920 	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4921 
4922 	ext4_mb_unload_buddy(&e4b);
4923 
4924 	/* We dirtied the bitmap block */
4925 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4926 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4927 
4928 	/* And the group descriptor block */
4929 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4930 	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4931 	if (!err)
4932 		err = ret;
4933 
4934 	if (overflow && !err) {
4935 		block += count;
4936 		count = overflow;
4937 		put_bh(bitmap_bh);
4938 		goto do_more;
4939 	}
4940 error_return:
4941 	brelse(bitmap_bh);
4942 	ext4_std_error(sb, err);
4943 	return;
4944 }
4945 
4946 /**
4947  * ext4_group_add_blocks() -- Add given blocks to an existing group
4948  * @handle:			handle to this transaction
4949  * @sb:				super block
4950  * @block:			start physical block to add to the block group
4951  * @count:			number of blocks to free
4952  *
4953  * This marks the blocks as free in the bitmap and buddy.
4954  */
4955 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4956 			 ext4_fsblk_t block, unsigned long count)
4957 {
4958 	struct buffer_head *bitmap_bh = NULL;
4959 	struct buffer_head *gd_bh;
4960 	ext4_group_t block_group;
4961 	ext4_grpblk_t bit;
4962 	unsigned int i;
4963 	struct ext4_group_desc *desc;
4964 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4965 	struct ext4_buddy e4b;
4966 	int err = 0, ret, blk_free_count;
4967 	ext4_grpblk_t blocks_freed;
4968 
4969 	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4970 
4971 	if (count == 0)
4972 		return 0;
4973 
4974 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4975 	/*
4976 	 * Check to see if we are freeing blocks across a group
4977 	 * boundary.
4978 	 */
4979 	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4980 		ext4_warning(sb, "too many blocks added to group %u\n",
4981 			     block_group);
4982 		err = -EINVAL;
4983 		goto error_return;
4984 	}
4985 
4986 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4987 	if (IS_ERR(bitmap_bh)) {
4988 		err = PTR_ERR(bitmap_bh);
4989 		bitmap_bh = NULL;
4990 		goto error_return;
4991 	}
4992 
4993 	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4994 	if (!desc) {
4995 		err = -EIO;
4996 		goto error_return;
4997 	}
4998 
4999 	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
5000 	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
5001 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
5002 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
5003 		     sbi->s_itb_per_group)) {
5004 		ext4_error(sb, "Adding blocks in system zones - "
5005 			   "Block = %llu, count = %lu",
5006 			   block, count);
5007 		err = -EINVAL;
5008 		goto error_return;
5009 	}
5010 
5011 	BUFFER_TRACE(bitmap_bh, "getting write access");
5012 	err = ext4_journal_get_write_access(handle, bitmap_bh);
5013 	if (err)
5014 		goto error_return;
5015 
5016 	/*
5017 	 * We are about to modify some metadata.  Call the journal APIs
5018 	 * to unshare ->b_data if a currently-committing transaction is
5019 	 * using it
5020 	 */
5021 	BUFFER_TRACE(gd_bh, "get_write_access");
5022 	err = ext4_journal_get_write_access(handle, gd_bh);
5023 	if (err)
5024 		goto error_return;
5025 
5026 	for (i = 0, blocks_freed = 0; i < count; i++) {
5027 		BUFFER_TRACE(bitmap_bh, "clear bit");
5028 		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
5029 			ext4_error(sb, "bit already cleared for block %llu",
5030 				   (ext4_fsblk_t)(block + i));
5031 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
5032 		} else {
5033 			blocks_freed++;
5034 		}
5035 	}
5036 
5037 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
5038 	if (err)
5039 		goto error_return;
5040 
5041 	/*
5042 	 * We need to update group_info->bb_free and the bitmap
5043 	 * with the group lock held; generate_buddy looks at
5044 	 * them with the group lock held as well.
5045 	 */
5046 	ext4_lock_group(sb, block_group);
5047 	mb_clear_bits(bitmap_bh->b_data, bit, count);
5048 	mb_free_blocks(NULL, &e4b, bit, count);
5049 	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
5050 	ext4_free_group_clusters_set(sb, desc, blk_free_count);
5051 	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5052 	ext4_group_desc_csum_set(sb, block_group, desc);
5053 	ext4_unlock_group(sb, block_group);
5054 	percpu_counter_add(&sbi->s_freeclusters_counter,
5055 			   EXT4_NUM_B2C(sbi, blocks_freed));
5056 
5057 	if (sbi->s_log_groups_per_flex) {
5058 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5059 		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
5060 			     &sbi_array_rcu_deref(sbi, s_flex_groups,
5061 						  flex_group)->free_clusters);
5062 	}
5063 
5064 	ext4_mb_unload_buddy(&e4b);
5065 
5066 	/* We dirtied the bitmap block */
5067 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5068 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5069 
5070 	/* And the group descriptor block */
5071 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5072 	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5073 	if (!err)
5074 		err = ret;
5075 
5076 error_return:
5077 	brelse(bitmap_bh);
5078 	ext4_std_error(sb, err);
5079 	return err;
5080 }
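/*
 * Editor's note -- illustrative sketch only, not part of mballoc.c.
 * ext4_group_add_blocks() is meant to be called with a running journal
 * handle, e.g. from the online-resize path.  A caller would look roughly
 * like this; the credit count and the names first_new_block/nr_blocks
 * are assumptions made for the example:
 *
 *	handle_t *handle;
 *	int err, ret;
 *
 *	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = ext4_group_add_blocks(handle, sb, first_new_block, nr_blocks);
 *	ret = ext4_journal_stop(handle);
 *	if (!err)
 *		err = ret;
 *
 * On success the blocks show up as free clusters in the group's bitmap,
 * buddy cache, group descriptor and the global free-cluster counters.
 */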
5081 
5082 /**
5083  * ext4_trim_extent -- function to TRIM one single free extent in the group
5084  * @sb:		super block for the file system
5085  * @start:	starting block of the free extent in the alloc. group
5086  * @count:	number of blocks to TRIM
5087  * @group:	alloc. group we are working with
5088  * @e4b:	ext4 buddy for the group
5089  * @blkdev_flags: flags for the block device
5090  *
5091  * Trim "count" blocks starting at "start" in the "group". To ensure that no
5092  * one will allocate those blocks, mark them as used in the buddy bitmap. This
5093  * must be called under the group lock.
5094  */
5095 static int ext4_trim_extent(struct super_block *sb, int start, int count,
5096 			    ext4_group_t group, struct ext4_buddy *e4b,
5097 			    unsigned long blkdev_flags)
5098 __releases(bitlock)
5099 __acquires(bitlock)
5100 {
5101 	struct ext4_free_extent ex;
5102 	int ret = 0;
5103 
5104 	trace_ext4_trim_extent(sb, group, start, count);
5105 
5106 	assert_spin_locked(ext4_group_lock_ptr(sb, group));
5107 
5108 	ex.fe_start = start;
5109 	ex.fe_group = group;
5110 	ex.fe_len = count;
5111 
5112 	/*
5113 	 * Mark blocks used, so no one can reuse them while
5114 	 * being trimmed.
5115 	 */
5116 	mb_mark_used(e4b, &ex);
5117 	ext4_unlock_group(sb, group);
5118 	ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
5119 	ext4_lock_group(sb, group);
5120 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
5121 	return ret;
5122 }
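/*
 * Editor's note: ext4_trim_extent() temporarily drops the group lock
 * around ext4_issue_discard() because issuing a discard performs block
 * I/O and may sleep, which is not permitted under the group spinlock.
 * Marking the extent as used beforehand keeps the allocator from handing
 * it out while the lock is released; the extent is freed again once the
 * discard has been issued.
 */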
5123 
5124 /**
5125  * ext4_trim_all_free -- function to trim all free space in alloc. group
5126  * @sb:			super block for file system
5127  * @group:		group to be trimmed
5128  * @start:		first group block to examine
5129  * @max:		last group block to examine
5130  * @minblocks:		minimum extent block count
5131  * @blkdev_flags:	flags for the block device
5132  *
5133  * ext4_trim_all_free walks through the group's block bitmap searching for
5134  * free extents. When a free extent of at least @minblocks blocks is found,
5135  * ext4_trim_extent is called to TRIM it.
5136  *
5137  * To keep an extent from being reused while it is being trimmed, it is
5138  * first marked as used in the group buddy bitmap; the TRIM command is then
5139  * issued, and the extent is freed again in the buddy bitmap. This is
5140  * repeated until the whole group has been scanned or fewer than @minblocks
5141  * free blocks remain in the group.
5142  */
5143 static ext4_grpblk_t
5144 ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5145 		   ext4_grpblk_t start, ext4_grpblk_t max,
5146 		   ext4_grpblk_t minblocks, unsigned long blkdev_flags)
5147 {
5148 	void *bitmap;
5149 	ext4_grpblk_t next, count = 0, free_count = 0;
5150 	struct ext4_buddy e4b;
5151 	int ret = 0;
5152 
5153 	trace_ext4_trim_all_free(sb, group, start, max);
5154 
5155 	ret = ext4_mb_load_buddy(sb, group, &e4b);
5156 	if (ret) {
5157 		ext4_warning(sb, "Error %d loading buddy information for %u",
5158 			     ret, group);
5159 		return ret;
5160 	}
5161 	bitmap = e4b.bd_bitmap;
5162 
5163 	ext4_lock_group(sb, group);
5164 	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5165 	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5166 		goto out;
5167 
5168 	start = (e4b.bd_info->bb_first_free > start) ?
5169 		e4b.bd_info->bb_first_free : start;
5170 
5171 	while (start <= max) {
5172 		start = mb_find_next_zero_bit(bitmap, max + 1, start);
5173 		if (start > max)
5174 			break;
5175 		next = mb_find_next_bit(bitmap, max + 1, start);
5176 
5177 		if ((next - start) >= minblocks) {
5178 			ret = ext4_trim_extent(sb, start,
5179 					       next - start, group, &e4b,
5180 					       blkdev_flags);
5181 			if (ret && ret != -EOPNOTSUPP)
5182 				break;
5183 			ret = 0;
5184 			count += next - start;
5185 		}
5186 		free_count += next - start;
5187 		start = next + 1;
5188 
5189 		if (fatal_signal_pending(current)) {
5190 			count = -ERESTARTSYS;
5191 			break;
5192 		}
5193 
5194 		if (need_resched()) {
5195 			ext4_unlock_group(sb, group);
5196 			cond_resched();
5197 			ext4_lock_group(sb, group);
5198 		}
5199 
5200 		if ((e4b.bd_info->bb_free - free_count) < minblocks)
5201 			break;
5202 	}
5203 
5204 	if (!ret) {
5205 		ret = count;
5206 		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5207 	}
5208 out:
5209 	ext4_unlock_group(sb, group);
5210 	ext4_mb_unload_buddy(&e4b);
5211 
5212 	ext4_debug("trimmed %d blocks in the group %d\n",
5213 		count, group);
5214 
5215 	return ret;
5216 }
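/*
 * Editor's note: the EXT4_MB_GRP_WAS_TRIMMED check at the top of this
 * function lets repeated FITRIM runs skip groups that have already been
 * trimmed with an equal or smaller minimum extent size.  The flag is
 * cleared again when blocks are freed in the group (see the
 * EXT4_MB_GRP_CLEAR_TRIMMED call in the free path above), so such groups
 * are picked up again by the next trim run.
 */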
5217 
5218 /**
5219  * ext4_trim_fs() -- trim ioctl handler function
5220  * @sb:			superblock for filesystem
5221  * @range:		fstrim_range structure
5222  * @blkdev_flags:	flags for the block device
5223  *
5224  * start:	first byte to trim
5225  * len:		number of bytes to trim from start
5226  * minlen:	minimum extent length in bytes
5227  * ext4_trim_fs goes through all allocation groups containing bytes from
5228  * start to start+len. For each such group the ext4_trim_all_free function
5229  * is invoked to trim all free space.
5230  */
5231 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
5232 			unsigned long blkdev_flags)
5233 {
5234 	struct request_queue *q = bdev_get_queue(sb->s_bdev);
5235 	struct ext4_group_info *grp;
5236 	ext4_group_t group, first_group, last_group;
5237 	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5238 	uint64_t start, end, minlen, trimmed = 0;
5239 	ext4_fsblk_t first_data_blk =
5240 			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5241 	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5242 	int ret = 0;
5243 
5244 	start = range->start >> sb->s_blocksize_bits;
5245 	end = start + (range->len >> sb->s_blocksize_bits) - 1;
5246 	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5247 			      range->minlen >> sb->s_blocksize_bits);
5248 
5249 	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5250 	    start >= max_blks ||
5251 	    range->len < sb->s_blocksize)
5252 		return -EINVAL;
5253 	/* No point in trying to trim less than the discard granularity */
5254 	if (range->minlen < q->limits.discard_granularity) {
5255 		minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5256 			q->limits.discard_granularity >> sb->s_blocksize_bits);
5257 		if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
5258 			goto out;
5259 	}
5260 	if (end >= max_blks)
5261 		end = max_blks - 1;
5262 	if (end <= first_data_blk)
5263 		goto out;
5264 	if (start < first_data_blk)
5265 		start = first_data_blk;
5266 
5267 	/* Determine first and last group to examine based on start and end */
5268 	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5269 				     &first_group, &first_cluster);
5270 	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5271 				     &last_group, &last_cluster);
5272 
5273 	/* end now represents the last cluster to discard in this group */
5274 	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5275 
5276 	for (group = first_group; group <= last_group; group++) {
5277 		grp = ext4_get_group_info(sb, group);
5278 		/* We only do this if the grp has never been initialized */
5279 		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5280 			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5281 			if (ret)
5282 				break;
5283 		}
5284 
5285 		/*
5286 		 * For all the groups except the last one, last cluster will
5287 		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5288 		 * change it for the last group; note that last_cluster is
5289 		 * already computed earlier by ext4_get_group_no_and_offset()
5290 		 */
5291 		if (group == last_group)
5292 			end = last_cluster;
5293 
5294 		if (grp->bb_free >= minlen) {
5295 			cnt = ext4_trim_all_free(sb, group, first_cluster,
5296 						end, minlen, blkdev_flags);
5297 			if (cnt < 0) {
5298 				ret = cnt;
5299 				break;
5300 			}
5301 			trimmed += cnt;
5302 		}
5303 
5304 		/*
5305 		 * For every group except the first one, we are sure
5306 		 * that the first cluster to discard will be cluster #0.
5307 		 */
5308 		first_cluster = 0;
5309 	}
5310 
5311 	if (!ret)
5312 		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5313 
5314 out:
5315 	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5316 	return ret;
5317 }
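/*
 * Editor's note -- illustrative userspace sketch, not part of mballoc.c.
 * ext4_trim_fs() is reached through the FITRIM ioctl; a minimal caller
 * (error handling omitted, the mount point is an assumption) looks
 * roughly like this:
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>		// FITRIM, struct fstrim_range
 *
 *	struct fstrim_range range = {
 *		.start  = 0,
 *		.len    = (__u64)-1,	// trim the whole filesystem
 *		.minlen = 0,		// rounded up to discard granularity
 *	};
 *	int fd = open("/mnt/ext4", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, FITRIM, &range) == 0)
 *		printf("%llu bytes trimmed\n",
 *		       (unsigned long long)range.len);
 *
 * On return the number of bytes actually trimmed is written back into
 * range.len, mirroring the assignment at the end of ext4_trim_fs().
 */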
5318