1  /*
2   * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3   * Written by Alex Tomas <alex@clusterfs.com>
4   *
5   * This program is free software; you can redistribute it and/or modify
6   * it under the terms of the GNU General Public License version 2 as
7   * published by the Free Software Foundation.
8   *
9   * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License for more details.
13   *
14   * You should have received a copy of the GNU General Public License
15   * along with this program; if not, write to the Free Software
16   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
17   */
18  
19  
20  /*
21   * mballoc.c contains the multiblock allocation routines
22   */
23  
24  #include "ext4_jbd2.h"
25  #include "mballoc.h"
26  #include <linux/log2.h>
27  #include <linux/module.h>
28  #include <linux/slab.h>
29  #include <linux/nospec.h>
30  #include <linux/backing-dev.h>
31  #include <trace/events/ext4.h>
32  
33  #ifdef CONFIG_EXT4_DEBUG
34  ushort ext4_mballoc_debug __read_mostly;
35  
36  module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
37  MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
38  #endif
39  
40  /*
41   * MUSTDO:
42   *   - test ext4_ext_search_left() and ext4_ext_search_right()
43   *   - search for metadata in a few groups
44   *
45   * TODO v4:
46   *   - normalization should take into account whether file is still open
47   *   - discard preallocations if no free space left (policy?)
48   *   - don't normalize tails
49   *   - quota
50   *   - reservation for superuser
51   *
52   * TODO v3:
53   *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
54   *   - track min/max extents in each group for better group selection
55   *   - mb_mark_used() may allocate chunk right after splitting buddy
56   *   - tree of groups sorted by number of free blocks
57   *   - error handling
58   */
59  
60  /*
61   * An allocation request asks for multiple blocks near the specified
62   * goal (block) value.
63   *
64   * During the initialization phase of the allocator we decide to use
65   * group preallocation or inode preallocation depending on the size of
66   * the file. The size of the file could be the resulting file size we
67   * would have after allocation, or the current file size, whichever
68   * is larger. If the size is less than sbi->s_mb_stream_request we
69   * use group preallocation. The default value of
70   * s_mb_stream_request is 16 blocks. This can also be tuned via
71   * /sys/fs/ext4/<partition>/mb_stream_req. The value is expressed in
72   * terms of number of blocks.
73   *
74   * The main motivation for having small files use group preallocation is
75   * to keep small files close together on the disk.
76   *
77   * In the first stage the allocator looks at the inode prealloc list,
78   * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
79   * spaces for this particular inode. The inode prealloc space is
80   * represented as:
81   *
82   * pa_lstart -> the logical start block for this prealloc space
83   * pa_pstart -> the physical start block for this prealloc space
84   * pa_len    -> length for this prealloc space (in clusters)
85   * pa_free   ->  free space available in this prealloc space (in clusters)
86   *
87   * The inode preallocation space is selected based on the _logical_ start
88   * block. Only if the logical file block falls within the range of a prealloc
89   * space do we consume that particular prealloc space. This makes sure that
90   * we have contiguous physical blocks representing the file blocks.
91   *
92   * The important thing to note about inode prealloc space is that we
93   * never modify the values associated with it except for
94   * pa_free.
95   *
96   * If we are not able to find blocks in the inode prealloc space and we
97   * have the group allocation flag set, then we look at the locality group
98   * prealloc space. These are per-CPU prealloc lists represented as
99   *
100   * ext4_sb_info.s_locality_groups[smp_processor_id()]
101   *
102   * The reason for having a per-CPU locality group is to reduce contention
103   * between CPUs. It is possible to get scheduled at this point.
104   *
105   * The locality group prealloc space is used by checking whether we have
106   * enough free space (pa_free) within the prealloc space.
107   *
108   * If we can't allocate blocks via inode prealloc and/or locality group
109   * prealloc then we look at the buddy cache. The buddy cache is represented
110   * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
111   * mapped to the buddy and bitmap information regarding different
112   * groups. The buddy information is attached to the buddy cache inode so
113   * that we can access it through the page cache. The information regarding
114   * each group is loaded via ext4_mb_load_buddy.  The information involves
115   * the block bitmap and the buddy information, which are stored in the
116   * inode as:
117   *
118   *  {                        page                        }
119   *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
120   *
121   *
122   * one block each for bitmap and buddy information.  So for each group we
123   * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
124   * blocksize) blocks.  So it can hold information for groups_per_page
125   * groups, which is blocks_per_page/2.
126   *
127   * The buddy cache inode is not stored on disk. The inode is thrown
128   * away when the filesystem is unmounted.
129   *
130   * We look for count number of blocks in the buddy cache. If we were able
131   * to locate that many free blocks we return with additional information
132   * regarding the rest of the contiguous physical blocks available.
133   *
134   * Before allocating blocks via the buddy cache we normalize the request
135   * blocks. This ensures we ask for more blocks than we need. The extra
136   * blocks that we get after allocation are added to the respective prealloc
137   * list. In case of inode preallocation we follow a list of heuristics
138   * based on file size. This can be found in ext4_mb_normalize_request. If
139   * we are doing a group prealloc we try to normalize the request to
140   * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
141   * dependent on the cluster size; for non-bigalloc file systems, it is
142   * 512 blocks. This can be tuned via
143   * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
144   * terms of number of blocks. If we have mounted the file system with -O
145   * stripe=<value> option the group prealloc request is normalized to
146   * the smallest multiple of the stripe value (sbi->s_stripe) which is
147   * greater than the default mb_group_prealloc.
148   *
149   * The regular allocator (using the buddy cache) supports a few tunables.
150   *
151   * /sys/fs/ext4/<partition>/mb_min_to_scan
152   * /sys/fs/ext4/<partition>/mb_max_to_scan
153   * /sys/fs/ext4/<partition>/mb_order2_req
154   *
155   * The regular allocator uses buddy scan only if the request len is a power
156   * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
157   * value of s_mb_order2_reqs can be tuned via
158   * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
159   * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
160   * of stripe size. This should result in better allocation on RAID setups. If
161   * not, we search in the specific group using the bitmap for best extents. The
162   * tunables min_to_scan and max_to_scan control the behaviour here.
163   * min_to_scan indicates how long mballoc __must__ look for a best
164   * extent and max_to_scan indicates how long mballoc __can__ look for a
165   * best extent among the found extents. Searching for the blocks starts with
166   * the group specified as the goal value in the allocation context via
167   * ac_g_ex. Each group is first checked against the criteria of whether it
168   * can be used for allocation. ext4_mb_good_group explains how the groups are
169   * checked.
170   *
171   * Both prealloc spaces are populated as described above. So for the first
172   * request we will hit the buddy cache, which will result in this prealloc
173   * space getting filled. The prealloc space is then later used for
174   * subsequent requests.
175   */
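/*
 * A minimal sketch of the strategy above (added for illustration; this is
 * not code from mballoc, and the helper name and plain integer types are
 * hypothetical):
 *
 *	static int use_group_preallocation(unsigned long cur_size,
 *					   unsigned long final_size,
 *					   unsigned long stream_request)
 *	{
 *		// "size" is the larger of the current file size and the
 *		// size the file would have after this allocation.
 *		unsigned long size = cur_size > final_size ? cur_size : final_size;
 *
 *		// Files below s_mb_stream_request (16 blocks by default,
 *		// tunable via mb_stream_req) use the per-CPU locality
 *		// group so small files stay close together on disk;
 *		// larger files use per-inode preallocation.
 *		return size < stream_request;
 *	}
 */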
176  
177  /*
178   * mballoc operates on the following data:
179   *  - on-disk bitmap
180   *  - in-core buddy (actually includes buddy and bitmap)
181   *  - preallocation descriptors (PAs)
182   *
183   * there are two types of preallocations:
184   *  - inode
185   *    assigned to a specific inode and can be used for this inode only.
186   *    it describes part of the inode's space preallocated to specific
187   *    physical blocks. any block from that preallocation can be used
188   *    independently. the descriptor just tracks the number of blocks left
189   *    unused. so, before taking some block from the descriptor, one must
190   *    make sure the corresponding logical block isn't allocated yet. this
191   *    also means that freeing any block within the descriptor's range
192   *    must discard all preallocated blocks.
193   *  - locality group
194   *    assigned to a specific locality group, which does not translate to
195   *    a permanent set of inodes: an inode can join and leave the group. space
196   *    from this type of preallocation can be used for any inode. thus
197   *    it's consumed from the beginning to the end.
198   *
199   * relation between them can be expressed as:
200   *    in-core buddy = on-disk bitmap + preallocation descriptors
201   *
202   * this means the blocks mballoc considers used are:
203   *  - allocated blocks (persistent)
204   *  - preallocated blocks (non-persistent)
205   *
206   * consistency in mballoc world means that at any time a block is either
207   * free or used in ALL structures. notice: "any time" should not be read
208   * literally -- time is discrete and delimited by locks.
209   *
210   *  to keep it simple, we don't use block numbers, instead we count number of
211   *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
212   *
213   * all operations can be expressed as:
214   *  - init buddy:			buddy = on-disk + PAs
215   *  - new PA:				buddy += N; PA = N
216   *  - use inode PA:			on-disk += N; PA -= N
217   *  - discard inode PA			buddy -= on-disk - PA; PA = 0
218   *  - use locality group PA		on-disk += N; PA -= N
219   *  - discard locality group PA		buddy -= PA; PA = 0
220   *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
221   *        is used in real operation because we can't know actual used
222   *        bits from PA, only from on-disk bitmap
223   *
224   * if we follow this strict logic, then all operations above should be atomic.
225   * given some of them can block, we'd have to use something like semaphores
226   * killing performance on high-end SMP hardware. let's try to relax it using
227   * the following knowledge:
228   *  1) if buddy is referenced, it's already initialized
229   *  2) while block is used in buddy and the buddy is referenced,
230   *     nobody can re-allocate that block
231   *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
232   *     bit set and PA claims the same block, it's OK. IOW, one can set a bit in
233   *     the on-disk bitmap if the buddy has the same bit set and/or a PA covers the
234   *     corresponding block
235   *
236   * so, now we're building a concurrency table:
237   *  - init buddy vs.
238   *    - new PA
239   *      blocks for PA are allocated in the buddy, buddy must be referenced
240   *      until PA is linked to allocation group to avoid concurrent buddy init
241   *    - use inode PA
242   *      we need to make sure that either on-disk bitmap or PA has uptodate data
243   *      given (3) we care that PA-=N operation doesn't interfere with init
244   *    - discard inode PA
245   *      the simplest way would be to have buddy initialized by the discard
246   *    - use locality group PA
247   *      again PA-=N must be serialized with init
248   *    - discard locality group PA
249   *      the simplest way would be to have buddy initialized by the discard
250   *  - new PA vs.
251   *    - use inode PA
252   *      i_data_sem serializes them
253   *    - discard inode PA
254   *      discard process must wait until PA isn't used by another process
255   *    - use locality group PA
256   *      some mutex should serialize them
257   *    - discard locality group PA
258   *      discard process must wait until PA isn't used by another process
259   *  - use inode PA
260   *    - use inode PA
261   *      i_data_sem or another mutex should serialize them
262   *    - discard inode PA
263   *      discard process must wait until PA isn't used by another process
264   *    - use locality group PA
265   *      nothing wrong here -- they're different PAs covering different blocks
266   *    - discard locality group PA
267   *      discard process must wait until PA isn't used by another process
268   *
269   * now we're ready to draw a few conclusions:
270   *  - while a PA is referenced, no discard of it is possible
271   *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
272   *  - PA changes only after on-disk bitmap
273   *  - discard must not compete with init. either init is done before
274   *    any discard or they're serialized somehow
275   *  - buddy init as sum of on-disk bitmap and PAs is done atomically
276   *
277   * a special case is when we've used a PA to emptiness. no need to modify the
278   * buddy in this case, but we still have to care about concurrent init
279   *
280   */
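/*
 * A small worked example of the accounting identities above (the numbers
 * are made up purely for illustration):
 *
 *	on-disk bitmap: 100 blocks used;  one inode PA with pa_len = 8
 *	init buddy:        buddy = on-disk + PA       -> 108 blocks used
 *	use inode PA (3):  on-disk += 3 -> 103;  PA -= 3 -> 5;  buddy stays 108
 *	discard inode PA:  the 5 still-unused PA blocks are freed in the buddy
 *	                   buddy -> 103;  PA -> 0
 *
 * at every step a block counted as used in the buddy is used either in the
 * on-disk bitmap or in some PA, which is exactly the consistency rule above.
 */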
281  
282   /*
283   * Logic in a few words:
284   *
285   *  - allocation:
286   *    load group
287   *    find blocks
288   *    mark bits in on-disk bitmap
289   *    release group
290   *
291   *  - use preallocation:
292   *    find proper PA (per-inode or group)
293   *    load group
294   *    mark bits in on-disk bitmap
295   *    release group
296   *    release PA
297   *
298   *  - free:
299   *    load group
300   *    mark bits in on-disk bitmap
301   *    release group
302   *
303   *  - discard preallocations in group:
304   *    mark PAs deleted
305   *    move them onto local list
306   *    load on-disk bitmap
307   *    load group
308   *    remove PA from object (inode or locality group)
309   *    mark free blocks in-core
310   *
311   *  - discard inode's preallocations:
312   */
313  
314  /*
315   * Locking rules
316   *
317   * Locks:
318   *  - bitlock on a group	(group)
319   *  - object (inode/locality)	(object)
320   *  - per-pa lock		(pa)
321   *
322   * Paths:
323   *  - new pa
324   *    object
325   *    group
326   *
327   *  - find and use pa:
328   *    pa
329   *
330   *  - release consumed pa:
331   *    pa
332   *    group
333   *    object
334   *
335   *  - generate in-core bitmap:
336   *    group
337   *        pa
338   *
339   *  - discard all for given object (inode, locality group):
340   *    object
341   *        pa
342   *    group
343   *
344   *  - discard all for given group:
345   *    group
346   *        pa
347   *    group
348   *        object
349   *
350   */
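/*
 * Sketch of the "new pa" path ordering from the table above (illustrative
 * only; "object_lock" is a placeholder name, not an actual mballoc field):
 * the object lock is taken before the group bitlock.
 *
 *	spin_lock(&object_lock);	// inode or locality group ("object")
 *	ext4_lock_group(sb, group);	// bitlock of the block group ("group")
 *	... link the new PA to the object and the group ...
 *	ext4_unlock_group(sb, group);
 *	spin_unlock(&object_lock);
 */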
351  static struct kmem_cache *ext4_pspace_cachep;
352  static struct kmem_cache *ext4_ac_cachep;
353  static struct kmem_cache *ext4_free_data_cachep;
354  
355  /* We create slab caches for groupinfo data structures based on the
356   * superblock block size.  There will be one per mounted filesystem for
357   * each unique s_blocksize_bits */
358  #define NR_GRPINFO_CACHES 8
359  static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
360  
361  static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
362  	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
363  	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
364  	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
365  };
366  
367  static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
368  					ext4_group_t group);
369  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
370  						ext4_group_t group);
371  static void ext4_free_data_callback(struct super_block *sb,
372  				struct ext4_journal_cb_entry *jce, int rc);
373  
374  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
375  {
376  #if BITS_PER_LONG == 64
377  	*bit += ((unsigned long) addr & 7UL) << 3;
378  	addr = (void *) ((unsigned long) addr & ~7UL);
379  #elif BITS_PER_LONG == 32
380  	*bit += ((unsigned long) addr & 3UL) << 3;
381  	addr = (void *) ((unsigned long) addr & ~3UL);
382  #else
383  #error "how many bits you are?!"
384  #endif
385  	return addr;
386  }
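/*
 * Worked example for mb_correct_addr_and_bit() on a 64-bit machine
 * (the address is made up for illustration): with addr = 0x1003, bit = 2,
 *
 *	(unsigned long) addr & 7UL   = 3       three bytes past alignment
 *	*bit += 3 << 3               = 26      (2 + 24)
 *	addr &= ~7UL                 = 0x1000  aligned for ext4_test_bit()
 *
 * i.e. bit 2 of the byte at 0x1003 becomes bit 26 of the aligned
 * unsigned long at 0x1000, which is what the bit helpers below expect.
 */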
387  
388  static inline int mb_test_bit(int bit, void *addr)
389  {
390  	/*
391  	 * ext4_test_bit on architectures like powerpc
392  	 * needs an unsigned long aligned address
393  	 */
394  	addr = mb_correct_addr_and_bit(&bit, addr);
395  	return ext4_test_bit(bit, addr);
396  }
397  
398  static inline void mb_set_bit(int bit, void *addr)
399  {
400  	addr = mb_correct_addr_and_bit(&bit, addr);
401  	ext4_set_bit(bit, addr);
402  }
403  
404  static inline void mb_clear_bit(int bit, void *addr)
405  {
406  	addr = mb_correct_addr_and_bit(&bit, addr);
407  	ext4_clear_bit(bit, addr);
408  }
409  
410  static inline int mb_test_and_clear_bit(int bit, void *addr)
411  {
412  	addr = mb_correct_addr_and_bit(&bit, addr);
413  	return ext4_test_and_clear_bit(bit, addr);
414  }
415  
416  static inline int mb_find_next_zero_bit(void *addr, int max, int start)
417  {
418  	int fix = 0, ret, tmpmax;
419  	addr = mb_correct_addr_and_bit(&fix, addr);
420  	tmpmax = max + fix;
421  	start += fix;
422  
423  	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
424  	if (ret > max)
425  		return max;
426  	return ret;
427  }
428  
429  static inline int mb_find_next_bit(void *addr, int max, int start)
430  {
431  	int fix = 0, ret, tmpmax;
432  	addr = mb_correct_addr_and_bit(&fix, addr);
433  	tmpmax = max + fix;
434  	start += fix;
435  
436  	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
437  	if (ret > max)
438  		return max;
439  	return ret;
440  }
441  
442  static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
443  {
444  	char *bb;
445  
446  	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
447  	BUG_ON(max == NULL);
448  
449  	if (order > e4b->bd_blkbits + 1) {
450  		*max = 0;
451  		return NULL;
452  	}
453  
454  	/* at order 0 we see each particular block */
455  	if (order == 0) {
456  		*max = 1 << (e4b->bd_blkbits + 3);
457  		return e4b->bd_bitmap;
458  	}
459  
460  	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
461  	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
462  
463  	return bb;
464  }
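/*
 * Illustrative layout for mb_find_buddy(), assuming 4KiB blocks
 * (bd_blkbits == 12); the s_mb_offsets[]/s_mb_maxs[] tables are filled in
 * at mount time (not shown in this excerpt), but the shape is:
 *
 *	order 0: bd_bitmap            1 << (12 + 3) = 32768 bits
 *	order 1: bd_buddy + 0                         16384 bits
 *	order 2: bd_buddy + 2048                       8192 bits
 *	order 3: bd_buddy + 3072                       4096 bits
 *	...
 *
 * each higher order has half as many bits as the previous one, which is
 * also the stride that mb_find_order_for_block() walks further below.
 */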
465  
466  #ifdef DOUBLE_CHECK
467  static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
468  			   int first, int count)
469  {
470  	int i;
471  	struct super_block *sb = e4b->bd_sb;
472  
473  	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
474  		return;
475  	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
476  	for (i = 0; i < count; i++) {
477  		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
478  			ext4_fsblk_t blocknr;
479  
480  			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
481  			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
482  			ext4_grp_locked_error(sb, e4b->bd_group,
483  					      inode ? inode->i_ino : 0,
484  					      blocknr,
485  					      "freeing block already freed "
486  					      "(bit %u)",
487  					      first + i);
488  		}
489  		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
490  	}
491  }
492  
493  static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
494  {
495  	int i;
496  
497  	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
498  		return;
499  	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
500  	for (i = 0; i < count; i++) {
501  		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
502  		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
503  	}
504  }
505  
506  static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
507  {
508  	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
509  		unsigned char *b1, *b2;
510  		int i;
511  		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
512  		b2 = (unsigned char *) bitmap;
513  		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
514  			if (b1[i] != b2[i]) {
515  				ext4_msg(e4b->bd_sb, KERN_ERR,
516  					 "corruption in group %u "
517  					 "at byte %u(%u): %x in copy != %x "
518  					 "on disk/prealloc",
519  					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
520  				BUG();
521  			}
522  		}
523  	}
524  }
525  
526  #else
527  static inline void mb_free_blocks_double(struct inode *inode,
528  				struct ext4_buddy *e4b, int first, int count)
529  {
530  	return;
531  }
532  static inline void mb_mark_used_double(struct ext4_buddy *e4b,
533  						int first, int count)
534  {
535  	return;
536  }
537  static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
538  {
539  	return;
540  }
541  #endif
542  
543  #ifdef AGGRESSIVE_CHECK
544  
545  #define MB_CHECK_ASSERT(assert)						\
546  do {									\
547  	if (!(assert)) {						\
548  		printk(KERN_EMERG					\
549  			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
550  			function, file, line, # assert);		\
551  		BUG();							\
552  	}								\
553  } while (0)
554  
555  static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
556  				const char *function, int line)
557  {
558  	struct super_block *sb = e4b->bd_sb;
559  	int order = e4b->bd_blkbits + 1;
560  	int max;
561  	int max2;
562  	int i;
563  	int j;
564  	int k;
565  	int count;
566  	struct ext4_group_info *grp;
567  	int fragments = 0;
568  	int fstart;
569  	struct list_head *cur;
570  	void *buddy;
571  	void *buddy2;
572  
573  	{
574  		static int mb_check_counter;
575  		if (mb_check_counter++ % 100 != 0)
576  			return 0;
577  	}
578  
579  	while (order > 1) {
580  		buddy = mb_find_buddy(e4b, order, &max);
581  		MB_CHECK_ASSERT(buddy);
582  		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
583  		MB_CHECK_ASSERT(buddy2);
584  		MB_CHECK_ASSERT(buddy != buddy2);
585  		MB_CHECK_ASSERT(max * 2 == max2);
586  
587  		count = 0;
588  		for (i = 0; i < max; i++) {
589  
590  			if (mb_test_bit(i, buddy)) {
591  				/* only single bit in buddy2 may be 1 */
592  				if (!mb_test_bit(i << 1, buddy2)) {
593  					MB_CHECK_ASSERT(
594  						mb_test_bit((i<<1)+1, buddy2));
595  				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
596  					MB_CHECK_ASSERT(
597  						mb_test_bit(i << 1, buddy2));
598  				}
599  				continue;
600  			}
601  
602  			/* both bits in buddy2 must be 1 */
603  			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
604  			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
605  
606  			for (j = 0; j < (1 << order); j++) {
607  				k = (i * (1 << order)) + j;
608  				MB_CHECK_ASSERT(
609  					!mb_test_bit(k, e4b->bd_bitmap));
610  			}
611  			count++;
612  		}
613  		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
614  		order--;
615  	}
616  
617  	fstart = -1;
618  	buddy = mb_find_buddy(e4b, 0, &max);
619  	for (i = 0; i < max; i++) {
620  		if (!mb_test_bit(i, buddy)) {
621  			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
622  			if (fstart == -1) {
623  				fragments++;
624  				fstart = i;
625  			}
626  			continue;
627  		}
628  		fstart = -1;
629  		/* check used bits only */
630  		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
631  			buddy2 = mb_find_buddy(e4b, j, &max2);
632  			k = i >> j;
633  			MB_CHECK_ASSERT(k < max2);
634  			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
635  		}
636  	}
637  	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
638  	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
639  
640  	grp = ext4_get_group_info(sb, e4b->bd_group);
641  	list_for_each(cur, &grp->bb_prealloc_list) {
642  		ext4_group_t groupnr;
643  		struct ext4_prealloc_space *pa;
644  		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
645  		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
646  		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
647  		for (i = 0; i < pa->pa_len; i++)
648  			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
649  	}
650  	return 0;
651  }
652  #undef MB_CHECK_ASSERT
653  #define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
654  					__FILE__, __func__, __LINE__)
655  #else
656  #define mb_check_buddy(e4b)
657  #endif
658  
659  /*
660   * Divide the blocks starting at @first with length @len into
661   * smaller chunks with power-of-2 block counts.
662   * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
663   * then increase bb_counters[] for the corresponding chunk size.
664   */
665  static void ext4_mb_mark_free_simple(struct super_block *sb,
666  				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
667  					struct ext4_group_info *grp)
668  {
669  	struct ext4_sb_info *sbi = EXT4_SB(sb);
670  	ext4_grpblk_t min;
671  	ext4_grpblk_t max;
672  	ext4_grpblk_t chunk;
673  	unsigned int border;
674  
675  	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
676  
677  	border = 2 << sb->s_blocksize_bits;
678  
679  	while (len > 0) {
680  		/* find how many blocks can be covered since this position */
681  		max = ffs(first | border) - 1;
682  
683  		/* find how many blocks of power 2 we need to mark */
684  		min = fls(len) - 1;
685  
686  		if (max < min)
687  			min = max;
688  		chunk = 1 << min;
689  
690  		/* mark multiblock chunks only */
691  		grp->bb_counters[min]++;
692  		if (min > 0)
693  			mb_clear_bit(first >> min,
694  				     buddy + sbi->s_mb_offsets[min]);
695  
696  		len -= chunk;
697  		first += chunk;
698  	}
699  }
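/*
 * Worked example for ext4_mb_mark_free_simple() (values chosen only for
 * illustration): first = 20, len = 11, i.e. free clusters 20..30.
 *
 *	iter 1: max = ffs(20|border)-1 = 2, min = fls(11)-1 = 3 -> use 2
 *	        chunk = 4, bb_counters[2]++, clear bit 20>>2 = 5 (order 2)
 *	iter 2: first = 24, len = 7:  max = 3, min = 2 -> chunk = 4
 *	        bb_counters[2]++, clear bit 24>>2 = 6 (order 2)
 *	iter 3: first = 28, len = 3:  max = 2, min = 1 -> chunk = 2
 *	        bb_counters[1]++, clear bit 28>>1 = 14 (order 1)
 *	iter 4: first = 30, len = 1:  min = 0 -> chunk = 1
 *	        bb_counters[0]++ only (order-0 free space lives in bd_bitmap)
 *
 * 4 + 4 + 2 + 1 = 11 clusters accounted for, in aligned power-of-2 pieces.
 */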
700  
701  /*
702   * Cache the order of the largest free extent we have available in this block
703   * group.
704   */
705  static void
706  mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
707  {
708  	int i;
709  	int bits;
710  
711  	grp->bb_largest_free_order = -1; /* uninit */
712  
713  	bits = sb->s_blocksize_bits + 1;
714  	for (i = bits; i >= 0; i--) {
715  		if (grp->bb_counters[i] > 0) {
716  			grp->bb_largest_free_order = i;
717  			break;
718  		}
719  	}
720  }
721  
722  static noinline_for_stack
723  void ext4_mb_generate_buddy(struct super_block *sb,
724  				void *buddy, void *bitmap, ext4_group_t group)
725  {
726  	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
727  	struct ext4_sb_info *sbi = EXT4_SB(sb);
728  	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
729  	ext4_grpblk_t i = 0;
730  	ext4_grpblk_t first;
731  	ext4_grpblk_t len;
732  	unsigned free = 0;
733  	unsigned fragments = 0;
734  	unsigned long long period = get_cycles();
735  
736  	/* initialize buddy from bitmap which is the aggregation
737  	 * of the on-disk bitmap and preallocations */
738  	i = mb_find_next_zero_bit(bitmap, max, 0);
739  	grp->bb_first_free = i;
740  	while (i < max) {
741  		fragments++;
742  		first = i;
743  		i = mb_find_next_bit(bitmap, max, i);
744  		len = i - first;
745  		free += len;
746  		if (len > 1)
747  			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
748  		else
749  			grp->bb_counters[0]++;
750  		if (i < max)
751  			i = mb_find_next_zero_bit(bitmap, max, i);
752  	}
753  	grp->bb_fragments = fragments;
754  
755  	if (free != grp->bb_free) {
756  		ext4_grp_locked_error(sb, group, 0, 0,
757  				      "block bitmap and bg descriptor "
758  				      "inconsistent: %u vs %u free clusters",
759  				      free, grp->bb_free);
760  		/*
761  		 * If we intend to continue, we consider group descriptor
762  		 * corrupt and update bb_free using bitmap value
763  		 */
764  		grp->bb_free = free;
765  		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
766  			percpu_counter_sub(&sbi->s_freeclusters_counter,
767  					   grp->bb_free);
768  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
769  	}
770  	mb_set_largest_free_order(sb, grp);
771  
772  	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
773  
774  	period = get_cycles() - period;
775  	spin_lock(&EXT4_SB(sb)->s_bal_lock);
776  	EXT4_SB(sb)->s_mb_buddies_generated++;
777  	EXT4_SB(sb)->s_mb_generation_time += period;
778  	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
779  }
780  
781  static void mb_regenerate_buddy(struct ext4_buddy *e4b)
782  {
783  	int count;
784  	int order = 1;
785  	void *buddy;
786  
787  	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
788  		ext4_set_bits(buddy, 0, count);
789  	}
790  	e4b->bd_info->bb_fragments = 0;
791  	memset(e4b->bd_info->bb_counters, 0,
792  		sizeof(*e4b->bd_info->bb_counters) *
793  		(e4b->bd_sb->s_blocksize_bits + 2));
794  
795  	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
796  		e4b->bd_bitmap, e4b->bd_group);
797  }
798  
799  /* The buddy information is attached to the buddy cache inode
800   * for convenience. The information regarding each group
801   * is loaded via ext4_mb_load_buddy. The information involves
802   * the block bitmap and the buddy information, which are
803   * stored in the inode as
804   *
805   * {                        page                        }
806   * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
807   *
808   *
809   * one block each for bitmap and buddy information.
810   * So for each group we take up 2 blocks. A page can
811   * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
812   * So it can hold information for groups_per_page groups, which
813   * is blocks_per_page/2.
814   *
815   * Locking note:  This routine takes the block group lock of all groups
816   * for this page; do not hold this lock when calling this routine!
817   */
818  
819  static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
820  {
821  	ext4_group_t ngroups;
822  	int blocksize;
823  	int blocks_per_page;
824  	int groups_per_page;
825  	int err = 0;
826  	int i;
827  	ext4_group_t first_group, group;
828  	int first_block;
829  	struct super_block *sb;
830  	struct buffer_head *bhs;
831  	struct buffer_head **bh = NULL;
832  	struct inode *inode;
833  	char *data;
834  	char *bitmap;
835  	struct ext4_group_info *grinfo;
836  
837  	mb_debug(1, "init page %lu\n", page->index);
838  
839  	inode = page->mapping->host;
840  	sb = inode->i_sb;
841  	ngroups = ext4_get_groups_count(sb);
842  	blocksize = 1 << inode->i_blkbits;
843  	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
844  
845  	groups_per_page = blocks_per_page >> 1;
846  	if (groups_per_page == 0)
847  		groups_per_page = 1;
848  
849  	/* allocate buffer_heads to read bitmaps */
850  	if (groups_per_page > 1) {
851  		i = sizeof(struct buffer_head *) * groups_per_page;
852  		bh = kzalloc(i, gfp);
853  		if (bh == NULL) {
854  			err = -ENOMEM;
855  			goto out;
856  		}
857  	} else
858  		bh = &bhs;
859  
860  	first_group = page->index * blocks_per_page / 2;
861  
862  	/* read all groups the page covers into the cache */
863  	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
864  		if (group >= ngroups)
865  			break;
866  
867  		grinfo = ext4_get_group_info(sb, group);
868  		/*
869  		 * If page is uptodate then we came here after online resize
870  		 * which added some new uninitialized group info structs, so
871  		 * we must skip all initialized uptodate buddies on the page,
872  		 * which may be currently in use by an allocating task.
873  		 */
874  		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
875  			bh[i] = NULL;
876  			continue;
877  		}
878  		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
879  		if (IS_ERR(bh[i])) {
880  			err = PTR_ERR(bh[i]);
881  			bh[i] = NULL;
882  			goto out;
883  		}
884  		mb_debug(1, "read bitmap for group %u\n", group);
885  	}
886  
887  	/* wait for I/O completion */
888  	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
889  		int err2;
890  
891  		if (!bh[i])
892  			continue;
893  		err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
894  		if (!err)
895  			err = err2;
896  	}
897  
898  	first_block = page->index * blocks_per_page;
899  	for (i = 0; i < blocks_per_page; i++) {
900  		group = (first_block + i) >> 1;
901  		if (group >= ngroups)
902  			break;
903  
904  		if (!bh[group - first_group])
905  			/* skip initialized uptodate buddy */
906  			continue;
907  
908  		if (!buffer_verified(bh[group - first_group]))
909  			/* Skip faulty bitmaps */
910  			continue;
911  		err = 0;
912  
913  		/*
914  	 * data carries information regarding this
915  		 * particular group in the format specified
916  		 * above
917  		 *
918  		 */
919  		data = page_address(page) + (i * blocksize);
920  		bitmap = bh[group - first_group]->b_data;
921  
922  		/*
923  		 * We place the buddy block and bitmap block
924  		 * close together
925  		 */
926  		if ((first_block + i) & 1) {
927  			/* this is block of buddy */
928  			BUG_ON(incore == NULL);
929  			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
930  				group, page->index, i * blocksize);
931  			trace_ext4_mb_buddy_bitmap_load(sb, group);
932  			grinfo = ext4_get_group_info(sb, group);
933  			grinfo->bb_fragments = 0;
934  			memset(grinfo->bb_counters, 0,
935  			       sizeof(*grinfo->bb_counters) *
936  				(sb->s_blocksize_bits+2));
937  			/*
938  			 * incore got set to the group block bitmap below
939  			 */
940  			ext4_lock_group(sb, group);
941  			/* init the buddy */
942  			memset(data, 0xff, blocksize);
943  			ext4_mb_generate_buddy(sb, data, incore, group);
944  			ext4_unlock_group(sb, group);
945  			incore = NULL;
946  		} else {
947  			/* this is block of bitmap */
948  			BUG_ON(incore != NULL);
949  			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
950  				group, page->index, i * blocksize);
951  			trace_ext4_mb_bitmap_load(sb, group);
952  
953  			/* see comments in ext4_mb_put_pa() */
954  			ext4_lock_group(sb, group);
955  			memcpy(data, bitmap, blocksize);
956  
957  			/* mark all preallocated blks used in in-core bitmap */
958  			ext4_mb_generate_from_pa(sb, data, group);
959  			ext4_mb_generate_from_freelist(sb, data, group);
960  			ext4_unlock_group(sb, group);
961  
962  			/* set incore so that the buddy information can be
963  			 * generated using this
964  			 */
965  			incore = data;
966  		}
967  	}
968  	SetPageUptodate(page);
969  
970  out:
971  	if (bh) {
972  		for (i = 0; i < groups_per_page; i++)
973  			brelse(bh[i]);
974  		if (bh != &bhs)
975  			kfree(bh);
976  	}
977  	return err;
978  }
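/*
 * Worked example of the page layout built above (numbers assume a 4KiB
 * PAGE_CACHE_SIZE and are for illustration only):
 *
 *	blocksize 1KiB: blocks_per_page = 4, groups_per_page = 2
 *	  page 0: [ g0 bitmap ][ g0 buddy ][ g1 bitmap ][ g1 buddy ]
 *	  page 1: [ g2 bitmap ][ g2 buddy ][ g3 bitmap ][ g3 buddy ]
 *
 *	blocksize 4KiB: blocks_per_page = 1, groups_per_page = 1
 *	  page 0: [ g0 bitmap ]   page 1: [ g0 buddy ]
 *	  page 2: [ g1 bitmap ]   page 3: [ g1 buddy ]
 *
 * even block slots within a page hold bitmaps and odd slots hold the
 * buddies generated from them, matching the (first_block + i) & 1 test above.
 */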
979  
980  /*
981   * Lock the buddy and bitmap pages. This makes sure other parallel init_group
982   * on the same buddy page doesn't happen while holding the buddy page lock.
983   * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
984   * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
985   */
986  static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
987  		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
988  {
989  	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
990  	int block, pnum, poff;
991  	int blocks_per_page;
992  	struct page *page;
993  
994  	e4b->bd_buddy_page = NULL;
995  	e4b->bd_bitmap_page = NULL;
996  
997  	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
998  	/*
999  	 * the buddy cache inode stores the block bitmap
1000  	 * and buddy information in consecutive blocks.
1001  	 * So for each group we need two blocks.
1002  	 */
1003  	block = group * 2;
1004  	pnum = block / blocks_per_page;
1005  	poff = block % blocks_per_page;
1006  	page = find_or_create_page(inode->i_mapping, pnum, gfp);
1007  	if (!page)
1008  		return -ENOMEM;
1009  	BUG_ON(page->mapping != inode->i_mapping);
1010  	e4b->bd_bitmap_page = page;
1011  	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1012  
1013  	if (blocks_per_page >= 2) {
1014  		/* buddy and bitmap are on the same page */
1015  		return 0;
1016  	}
1017  
1018  	block++;
1019  	pnum = block / blocks_per_page;
1020  	page = find_or_create_page(inode->i_mapping, pnum, gfp);
1021  	if (!page)
1022  		return -ENOMEM;
1023  	BUG_ON(page->mapping != inode->i_mapping);
1024  	e4b->bd_buddy_page = page;
1025  	return 0;
1026  }
1027  
1028  static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1029  {
1030  	if (e4b->bd_bitmap_page) {
1031  		unlock_page(e4b->bd_bitmap_page);
1032  		page_cache_release(e4b->bd_bitmap_page);
1033  	}
1034  	if (e4b->bd_buddy_page) {
1035  		unlock_page(e4b->bd_buddy_page);
1036  		page_cache_release(e4b->bd_buddy_page);
1037  	}
1038  }
1039  
1040  /*
1041   * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1042   * block group lock of all groups for this page; do not hold the BG lock when
1043   * calling this routine!
1044   */
1045  static noinline_for_stack
1046  int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1047  {
1048  
1049  	struct ext4_group_info *this_grp;
1050  	struct ext4_buddy e4b;
1051  	struct page *page;
1052  	int ret = 0;
1053  
1054  	might_sleep();
1055  	mb_debug(1, "init group %u\n", group);
1056  	this_grp = ext4_get_group_info(sb, group);
1057  	/*
1058  	 * This ensures that we don't reinit the buddy cache
1059  	 * page which maps to the group from which we are already
1060  	 * allocating. If we are looking at the buddy cache we would
1061  	 * have taken a reference using ext4_mb_load_buddy and that
1062  	 * would have pinned buddy page to page cache.
1063  	 * The call to ext4_mb_get_buddy_page_lock will mark the
1064  	 * page accessed.
1065  	 */
1066  	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1067  	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1068  		/*
1069  		 * somebody initialized the group
1070  		 * return without doing anything
1071  		 */
1072  		goto err;
1073  	}
1074  
1075  	page = e4b.bd_bitmap_page;
1076  	ret = ext4_mb_init_cache(page, NULL, gfp);
1077  	if (ret)
1078  		goto err;
1079  	if (!PageUptodate(page)) {
1080  		ret = -EIO;
1081  		goto err;
1082  	}
1083  
1084  	if (e4b.bd_buddy_page == NULL) {
1085  		/*
1086  		 * If both the bitmap and buddy are in
1087  		 * the same page we don't need to force
1088  		 * init the buddy
1089  		 */
1090  		ret = 0;
1091  		goto err;
1092  	}
1093  	/* init buddy cache */
1094  	page = e4b.bd_buddy_page;
1095  	ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1096  	if (ret)
1097  		goto err;
1098  	if (!PageUptodate(page)) {
1099  		ret = -EIO;
1100  		goto err;
1101  	}
1102  err:
1103  	ext4_mb_put_buddy_page_lock(&e4b);
1104  	return ret;
1105  }
1106  
1107  /*
1108   * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1109   * block group lock of all groups for this page; do not hold the BG lock when
1110   * calling this routine!
1111   */
1112  static noinline_for_stack int
1113  ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1114  		       struct ext4_buddy *e4b, gfp_t gfp)
1115  {
1116  	int blocks_per_page;
1117  	int block;
1118  	int pnum;
1119  	int poff;
1120  	struct page *page;
1121  	int ret;
1122  	struct ext4_group_info *grp;
1123  	struct ext4_sb_info *sbi = EXT4_SB(sb);
1124  	struct inode *inode = sbi->s_buddy_cache;
1125  
1126  	might_sleep();
1127  	mb_debug(1, "load group %u\n", group);
1128  
1129  	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1130  	grp = ext4_get_group_info(sb, group);
1131  
1132  	e4b->bd_blkbits = sb->s_blocksize_bits;
1133  	e4b->bd_info = grp;
1134  	e4b->bd_sb = sb;
1135  	e4b->bd_group = group;
1136  	e4b->bd_buddy_page = NULL;
1137  	e4b->bd_bitmap_page = NULL;
1138  
1139  	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1140  		/*
1141  		 * we need full data about the group
1142  		 * to make a good selection
1143  		 */
1144  		ret = ext4_mb_init_group(sb, group, gfp);
1145  		if (ret)
1146  			return ret;
1147  	}
1148  
1149  	/*
1150  	 * the buddy cache inode stores the block bitmap
1151  	 * and buddy information in consecutive blocks.
1152  	 * So for each group we need two blocks.
1153  	 */
1154  	block = group * 2;
1155  	pnum = block / blocks_per_page;
1156  	poff = block % blocks_per_page;
1157  
1158  	/* we could use find_or_create_page(), but it locks the page,
1159  	 * which we'd like to avoid in the fast path ... */
1160  	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1161  	if (page == NULL || !PageUptodate(page)) {
1162  		if (page)
1163  			/*
1164  			 * drop the page reference and try
1165  			 * to get the page with lock. If it is
1166  			 * not uptodate that implies
1167  			 * somebody just created the page but
1168  			 * has yet to initialize it. So
1169  			 * wait for it to be initialized.
1170  			 */
1171  			page_cache_release(page);
1172  		page = find_or_create_page(inode->i_mapping, pnum, gfp);
1173  		if (page) {
1174  			BUG_ON(page->mapping != inode->i_mapping);
1175  			if (!PageUptodate(page)) {
1176  				ret = ext4_mb_init_cache(page, NULL, gfp);
1177  				if (ret) {
1178  					unlock_page(page);
1179  					goto err;
1180  				}
1181  				mb_cmp_bitmaps(e4b, page_address(page) +
1182  					       (poff * sb->s_blocksize));
1183  			}
1184  			unlock_page(page);
1185  		}
1186  	}
1187  	if (page == NULL) {
1188  		ret = -ENOMEM;
1189  		goto err;
1190  	}
1191  	if (!PageUptodate(page)) {
1192  		ret = -EIO;
1193  		goto err;
1194  	}
1195  
1196  	/* Pages marked accessed already */
1197  	e4b->bd_bitmap_page = page;
1198  	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1199  
1200  	block++;
1201  	pnum = block / blocks_per_page;
1202  	poff = block % blocks_per_page;
1203  
1204  	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1205  	if (page == NULL || !PageUptodate(page)) {
1206  		if (page)
1207  			page_cache_release(page);
1208  		page = find_or_create_page(inode->i_mapping, pnum, gfp);
1209  		if (page) {
1210  			BUG_ON(page->mapping != inode->i_mapping);
1211  			if (!PageUptodate(page)) {
1212  				ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1213  							 gfp);
1214  				if (ret) {
1215  					unlock_page(page);
1216  					goto err;
1217  				}
1218  			}
1219  			unlock_page(page);
1220  		}
1221  	}
1222  	if (page == NULL) {
1223  		ret = -ENOMEM;
1224  		goto err;
1225  	}
1226  	if (!PageUptodate(page)) {
1227  		ret = -EIO;
1228  		goto err;
1229  	}
1230  
1231  	/* Pages marked accessed already */
1232  	e4b->bd_buddy_page = page;
1233  	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1234  
1235  	BUG_ON(e4b->bd_bitmap_page == NULL);
1236  	BUG_ON(e4b->bd_buddy_page == NULL);
1237  
1238  	return 0;
1239  
1240  err:
1241  	if (page)
1242  		page_cache_release(page);
1243  	if (e4b->bd_bitmap_page)
1244  		page_cache_release(e4b->bd_bitmap_page);
1245  	if (e4b->bd_buddy_page)
1246  		page_cache_release(e4b->bd_buddy_page);
1247  	e4b->bd_buddy = NULL;
1248  	e4b->bd_bitmap = NULL;
1249  	return ret;
1250  }
1251  
1252  static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1253  			      struct ext4_buddy *e4b)
1254  {
1255  	return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1256  }
1257  
1258  static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1259  {
1260  	if (e4b->bd_bitmap_page)
1261  		page_cache_release(e4b->bd_bitmap_page);
1262  	if (e4b->bd_buddy_page)
1263  		page_cache_release(e4b->bd_buddy_page);
1264  }
1265  
1266  
1267  static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1268  {
1269  	int order = 1;
1270  	int bb_incr = 1 << (e4b->bd_blkbits - 1);
1271  	void *bb;
1272  
1273  	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1274  	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1275  
1276  	bb = e4b->bd_buddy;
1277  	while (order <= e4b->bd_blkbits + 1) {
1278  		block = block >> 1;
1279  		if (!mb_test_bit(block, bb)) {
1280  			/* this block is part of buddy of order 'order' */
1281  			return order;
1282  		}
1283  		bb += bb_incr;
1284  		bb_incr >>= 1;
1285  		order++;
1286  	}
1287  	return 0;
1288  }
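/*
 * Worked example for mb_find_order_for_block(), 4KiB blocks (bd_blkbits
 * == 12), looking up block 52 (illustrative only):
 *
 *	order 1: test bit 52>>1 = 26 at bd_buddy + 0     -> set, keep going
 *	order 2: test bit 26>>1 = 13 at bd_buddy + 2048  -> clear
 *
 * a clear bit at order 2 means block 52 sits inside a free order-2 buddy
 * (blocks 52..55), so the function returns 2.
 */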
1289  
1290  static void mb_clear_bits(void *bm, int cur, int len)
1291  {
1292  	__u32 *addr;
1293  
1294  	len = cur + len;
1295  	while (cur < len) {
1296  		if ((cur & 31) == 0 && (len - cur) >= 32) {
1297  			/* fast path: clear whole word at once */
1298  			addr = bm + (cur >> 3);
1299  			*addr = 0;
1300  			cur += 32;
1301  			continue;
1302  		}
1303  		mb_clear_bit(cur, bm);
1304  		cur++;
1305  	}
1306  }
1307  
1308  /* clear bits in the given range;
1309   * returns the first bit found already zero, if any, -1 otherwise
1310   */
1311  static int mb_test_and_clear_bits(void *bm, int cur, int len)
1312  {
1313  	__u32 *addr;
1314  	int zero_bit = -1;
1315  
1316  	len = cur + len;
1317  	while (cur < len) {
1318  		if ((cur & 31) == 0 && (len - cur) >= 32) {
1319  			/* fast path: clear whole word at once */
1320  			addr = bm + (cur >> 3);
1321  			if (*addr != (__u32)(-1) && zero_bit == -1)
1322  				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1323  			*addr = 0;
1324  			cur += 32;
1325  			continue;
1326  		}
1327  		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1328  			zero_bit = cur;
1329  		cur++;
1330  	}
1331  
1332  	return zero_bit;
1333  }
1334  
1335  void ext4_set_bits(void *bm, int cur, int len)
1336  {
1337  	__u32 *addr;
1338  
1339  	len = cur + len;
1340  	while (cur < len) {
1341  		if ((cur & 31) == 0 && (len - cur) >= 32) {
1342  			/* fast path: set whole word at once */
1343  			addr = bm + (cur >> 3);
1344  			*addr = 0xffffffff;
1345  			cur += 32;
1346  			continue;
1347  		}
1348  		mb_set_bit(cur, bm);
1349  		cur++;
1350  	}
1351  }
1352  
1353  /*
1354   * _________________________________________________________________ */
1355  
1356  static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1357  {
1358  	if (mb_test_bit(*bit + side, bitmap)) {
1359  		mb_clear_bit(*bit, bitmap);
1360  		(*bit) -= side;
1361  		return 1;
1362  	}
1363  	else {
1364  		(*bit) += side;
1365  		mb_set_bit(*bit, bitmap);
1366  		return -1;
1367  	}
1368  }
1369  
1370  static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1371  {
1372  	int max;
1373  	int order = 1;
1374  	void *buddy = mb_find_buddy(e4b, order, &max);
1375  
1376  	while (buddy) {
1377  		void *buddy2;
1378  
1379  		/* Bits in range [first; last] are known to be set since
1380  		 * corresponding blocks were allocated. Bits in range
1381  		 * (first; last) will stay set because they form buddies on
1382  		 * upper layer. We just deal with borders if they don't
1383  		 * align with upper layer and then go up.
1384  		 * Releasing entire group is all about clearing
1385  		 * single bit of highest order buddy.
1386  		 */
1387  
1388  		/* Example:
1389  		 * ---------------------------------
1390  		 * |   1   |   1   |   1   |   1   |
1391  		 * ---------------------------------
1392  		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1393  		 * ---------------------------------
1394  		 *   0   1   2   3   4   5   6   7
1395  		 *      \_____________________/
1396  		 *
1397  		 * Neither [1] nor [6] is aligned to above layer.
1398  		 * Left neighbour [0] is free, so mark it busy,
1399  		 * decrease bb_counters and extend range to
1400  		 * [0; 6]
1401  		 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1402  		 * mark [6] free, increase bb_counters and shrink range to
1403  		 * [0; 5].
1404  		 * Then shift range to [0; 2], go up and do the same.
1405  		 */
1406  
1407  
1408  		if (first & 1)
1409  			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1410  		if (!(last & 1))
1411  			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1412  		if (first > last)
1413  			break;
1414  		order++;
1415  
1416  		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1417  			mb_clear_bits(buddy, first, last - first + 1);
1418  			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1419  			break;
1420  		}
1421  		first >>= 1;
1422  		last >>= 1;
1423  		buddy = buddy2;
1424  	}
1425  }
1426  
1427  static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1428  			   int first, int count)
1429  {
1430  	int left_is_free = 0;
1431  	int right_is_free = 0;
1432  	int block;
1433  	int last = first + count - 1;
1434  	struct super_block *sb = e4b->bd_sb;
1435  
1436  	if (WARN_ON(count == 0))
1437  		return;
1438  	BUG_ON(last >= (sb->s_blocksize << 3));
1439  	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1440  	/* Don't bother if the block group is corrupt. */
1441  	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1442  		return;
1443  
1444  	mb_check_buddy(e4b);
1445  	mb_free_blocks_double(inode, e4b, first, count);
1446  
1447  	e4b->bd_info->bb_free += count;
1448  	if (first < e4b->bd_info->bb_first_free)
1449  		e4b->bd_info->bb_first_free = first;
1450  
1451  	/* access memory sequentially: check left neighbour,
1452  	 * clear range and then check right neighbour
1453  	 */
1454  	if (first != 0)
1455  		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1456  	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1457  	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1458  		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1459  
1460  	if (unlikely(block != -1)) {
1461  		struct ext4_sb_info *sbi = EXT4_SB(sb);
1462  		ext4_fsblk_t blocknr;
1463  
1464  		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1465  		blocknr += EXT4_C2B(EXT4_SB(sb), block);
1466  		ext4_grp_locked_error(sb, e4b->bd_group,
1467  				      inode ? inode->i_ino : 0,
1468  				      blocknr,
1469  				      "freeing already freed block "
1470  				      "(bit %u); block bitmap corrupt.",
1471  				      block);
1472  		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1473  			percpu_counter_sub(&sbi->s_freeclusters_counter,
1474  					   e4b->bd_info->bb_free);
1475  		/* Mark the block group as corrupt. */
1476  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1477  			&e4b->bd_info->bb_state);
1478  		mb_regenerate_buddy(e4b);
1479  		goto done;
1480  	}
1481  
1482  	/* let's maintain fragments counter */
1483  	if (left_is_free && right_is_free)
1484  		e4b->bd_info->bb_fragments--;
1485  	else if (!left_is_free && !right_is_free)
1486  		e4b->bd_info->bb_fragments++;
1487  
1488  	/* buddy[0] == bd_bitmap is a special case, so handle
1489  	 * it right away and let mb_buddy_mark_free stay free of
1490  	 * zero order checks.
1491  	 * Check if neighbours are to be coalesced,
1492  	 * adjust bitmap bb_counters and borders appropriately.
1493  	 */
1494  	if (first & 1) {
1495  		first += !left_is_free;
1496  		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1497  	}
1498  	if (!(last & 1)) {
1499  		last -= !right_is_free;
1500  		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1501  	}
1502  
1503  	if (first <= last)
1504  		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1505  
1506  done:
1507  	mb_set_largest_free_order(sb, e4b->bd_info);
1508  	mb_check_buddy(e4b);
1509  }
1510  
1511  static int mb_find_extent(struct ext4_buddy *e4b, int block,
1512  				int needed, struct ext4_free_extent *ex)
1513  {
1514  	int next = block;
1515  	int max, order;
1516  	void *buddy;
1517  
1518  	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1519  	BUG_ON(ex == NULL);
1520  
1521  	buddy = mb_find_buddy(e4b, 0, &max);
1522  	BUG_ON(buddy == NULL);
1523  	BUG_ON(block >= max);
1524  	if (mb_test_bit(block, buddy)) {
1525  		ex->fe_len = 0;
1526  		ex->fe_start = 0;
1527  		ex->fe_group = 0;
1528  		return 0;
1529  	}
1530  
1531  	/* find actual order */
1532  	order = mb_find_order_for_block(e4b, block);
1533  	block = block >> order;
1534  
1535  	ex->fe_len = 1 << order;
1536  	ex->fe_start = block << order;
1537  	ex->fe_group = e4b->bd_group;
1538  
1539  	/* calc difference from given start */
1540  	next = next - ex->fe_start;
1541  	ex->fe_len -= next;
1542  	ex->fe_start += next;
1543  
1544  	while (needed > ex->fe_len &&
1545  	       mb_find_buddy(e4b, order, &max)) {
1546  
1547  		if (block + 1 >= max)
1548  			break;
1549  
1550  		next = (block + 1) * (1 << order);
1551  		if (mb_test_bit(next, e4b->bd_bitmap))
1552  			break;
1553  
1554  		order = mb_find_order_for_block(e4b, next);
1555  
1556  		block = next >> order;
1557  		ex->fe_len += 1 << order;
1558  	}
1559  
1560  	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1561  	return ex->fe_len;
1562  }
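/*
 * Worked example for mb_find_extent() (illustrative numbers): suppose
 * clusters 16..21 are free, 22..23 are in use, and the caller asks for
 * needed = 8 starting at block = 16.
 *
 *	mb_find_order_for_block(16) = 2 -> ex = { start 16, len 4 }
 *	next = (4 + 1) << 2 = 20, bit 20 clear -> extend:
 *	mb_find_order_for_block(20) = 1 -> ex.fe_len += 2 -> 6
 *	next = (10 + 1) << 1 = 22, bit 22 set  -> stop
 *
 * the function returns 6: the best it can do starting at 16, even though
 * 8 were wanted. Had the caller started at block 17 instead, the initial
 * order-2 chunk would first be trimmed to { start 17, len 3 }.
 */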
1563  
1564  static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1565  {
1566  	int ord;
1567  	int mlen = 0;
1568  	int max = 0;
1569  	int cur;
1570  	int start = ex->fe_start;
1571  	int len = ex->fe_len;
1572  	unsigned ret = 0;
1573  	int len0 = len;
1574  	void *buddy;
1575  
1576  	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1577  	BUG_ON(e4b->bd_group != ex->fe_group);
1578  	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1579  	mb_check_buddy(e4b);
1580  	mb_mark_used_double(e4b, start, len);
1581  
1582  	e4b->bd_info->bb_free -= len;
1583  	if (e4b->bd_info->bb_first_free == start)
1584  		e4b->bd_info->bb_first_free += len;
1585  
1586  	/* let's maintain fragments counter */
1587  	if (start != 0)
1588  		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1589  	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1590  		max = !mb_test_bit(start + len, e4b->bd_bitmap);
1591  	if (mlen && max)
1592  		e4b->bd_info->bb_fragments++;
1593  	else if (!mlen && !max)
1594  		e4b->bd_info->bb_fragments--;
1595  
1596  	/* let's maintain buddy itself */
1597  	while (len) {
1598  		ord = mb_find_order_for_block(e4b, start);
1599  
1600  		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1601  			/* the whole chunk may be allocated at once! */
1602  			mlen = 1 << ord;
1603  			buddy = mb_find_buddy(e4b, ord, &max);
1604  			BUG_ON((start >> ord) >= max);
1605  			mb_set_bit(start >> ord, buddy);
1606  			e4b->bd_info->bb_counters[ord]--;
1607  			start += mlen;
1608  			len -= mlen;
1609  			BUG_ON(len < 0);
1610  			continue;
1611  		}
1612  
1613  		/* store for history */
1614  		if (ret == 0)
1615  			ret = len | (ord << 16);
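		/*
		 * Illustrative decoding of 'ret': the low 16 bits hold the
		 * length still left to mark when the first split happened
		 * and the high 16 bits hold the order being split, e.g.
		 * len = 5 with ord = 3 gives ret = 5 | (3 << 16) = 0x30005.
		 * ext4_mb_use_best_found() later unpacks these into
		 * ac_tail and ac_buddy.
		 */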
1616  
1617  		/* we have to split large buddy */
1618  		BUG_ON(ord <= 0);
1619  		buddy = mb_find_buddy(e4b, ord, &max);
1620  		mb_set_bit(start >> ord, buddy);
1621  		e4b->bd_info->bb_counters[ord]--;
1622  
1623  		ord--;
1624  		cur = (start >> ord) & ~1U;
1625  		buddy = mb_find_buddy(e4b, ord, &max);
1626  		mb_clear_bit(cur, buddy);
1627  		mb_clear_bit(cur + 1, buddy);
1628  		e4b->bd_info->bb_counters[ord]++;
1629  		e4b->bd_info->bb_counters[ord]++;
1630  	}
1631  	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1632  
1633  	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1634  	mb_check_buddy(e4b);
1635  
1636  	return ret;
1637  }
1638  
1639  /*
1640   * Must be called under group lock!
1641   */
1642  static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1643  					struct ext4_buddy *e4b)
1644  {
1645  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1646  	int ret;
1647  
1648  	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1649  	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1650  
1651  	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1652  	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1653  	ret = mb_mark_used(e4b, &ac->ac_b_ex);
1654  
1655  	/* preallocation can change ac_b_ex, thus we store actually
1656  	 * allocated blocks for history */
1657  	ac->ac_f_ex = ac->ac_b_ex;
1658  
1659  	ac->ac_status = AC_STATUS_FOUND;
1660  	ac->ac_tail = ret & 0xffff;
1661  	ac->ac_buddy = ret >> 16;
1662  
1663  	/*
1664  	 * take the page reference. We want the page to be pinned
1665  	 * so that we don't get an ext4_mb_init_cache() call for this
1666  	 * group until we update the bitmap. That would mean we
1667  	 * double-allocate blocks. The reference is dropped
1668  	 * in ext4_mb_release_context
1669  	 */
1670  	ac->ac_bitmap_page = e4b->bd_bitmap_page;
1671  	get_page(ac->ac_bitmap_page);
1672  	ac->ac_buddy_page = e4b->bd_buddy_page;
1673  	get_page(ac->ac_buddy_page);
1674  	/* store last allocated for subsequent stream allocation */
1675  	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1676  		spin_lock(&sbi->s_md_lock);
1677  		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1678  		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1679  		spin_unlock(&sbi->s_md_lock);
1680  	}
1681  }
1682  
1683  /*
1684   * regular allocator, for general purposes allocation
1685   */
1686  
1687  static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1688  					struct ext4_buddy *e4b,
1689  					int finish_group)
1690  {
1691  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1692  	struct ext4_free_extent *bex = &ac->ac_b_ex;
1693  	struct ext4_free_extent *gex = &ac->ac_g_ex;
1694  	struct ext4_free_extent ex;
1695  	int max;
1696  
1697  	if (ac->ac_status == AC_STATUS_FOUND)
1698  		return;
1699  	/*
1700  	 * We don't want to scan for a whole year
1701  	 */
1702  	if (ac->ac_found > sbi->s_mb_max_to_scan &&
1703  			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1704  		ac->ac_status = AC_STATUS_BREAK;
1705  		return;
1706  	}
1707  
1708  	/*
1709  	 * Haven't found good chunk so far, let's continue
1710  	 */
1711  	if (bex->fe_len < gex->fe_len)
1712  		return;
1713  
1714  	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1715  			&& bex->fe_group == e4b->bd_group) {
1716  		/* recheck chunk's availability - we don't know
1717  		 * when it was found (within this lock-unlock
1718  		 * period or not) */
1719  		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1720  		if (max >= gex->fe_len) {
1721  			ext4_mb_use_best_found(ac, e4b);
1722  			return;
1723  		}
1724  	}
1725  }
1726  
1727  /*
1728   * The routine checks whether the found extent is good enough. If it is,
1729   * the extent gets marked used and a flag is set in the context to stop
1730   * scanning. Otherwise, the extent is compared with the previously found
1731   * extent and, if the new one is better, it is stored in the context.
1732   * Later, the best found extent will be used if mballoc can't find a
1733   * good enough extent.
1734   *
1735   * FIXME: real allocation policy is to be designed yet!
1736   */
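/*
 * A hypothetical walk-through of the policy above: with a goal of 8
 * blocks, a first find of 6 blocks is remembered as the best extent; a
 * later find of 12 blocks replaces it (the goal is now satisfied); a
 * find of 9 blocks replaces the 12 (still satisfies the goal, wastes
 * less); and an exact 8-block find is taken immediately via
 * ext4_mb_use_best_found().
 */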
1737  static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1738  					struct ext4_free_extent *ex,
1739  					struct ext4_buddy *e4b)
1740  {
1741  	struct ext4_free_extent *bex = &ac->ac_b_ex;
1742  	struct ext4_free_extent *gex = &ac->ac_g_ex;
1743  
1744  	BUG_ON(ex->fe_len <= 0);
1745  	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1746  	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1747  	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1748  
1749  	ac->ac_found++;
1750  
1751  	/*
1752  	 * The special case - take what you catch first
1753  	 */
1754  	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1755  		*bex = *ex;
1756  		ext4_mb_use_best_found(ac, e4b);
1757  		return;
1758  	}
1759  
1760  	/*
1761  	 * Let's check whether the chunk is good enough
1762  	 */
1763  	if (ex->fe_len == gex->fe_len) {
1764  		*bex = *ex;
1765  		ext4_mb_use_best_found(ac, e4b);
1766  		return;
1767  	}
1768  
1769  	/*
1770  	 * If this is the first found extent, just store it in the context
1771  	 */
1772  	if (bex->fe_len == 0) {
1773  		*bex = *ex;
1774  		return;
1775  	}
1776  
1777  	/*
1778  	 * If the newly found extent is better, store it in the context
1779  	 */
1780  	if (bex->fe_len < gex->fe_len) {
1781  		/* if the request isn't satisfied, any found extent
1782  		 * larger than previous best one is better */
1783  		if (ex->fe_len > bex->fe_len)
1784  			*bex = *ex;
1785  	} else if (ex->fe_len > gex->fe_len) {
1786  		/* if the request is satisfied, then we try to find
1787  		 * an extent that still satisfies the request, but is
1788  		 * smaller than the previous one */
1789  		if (ex->fe_len < bex->fe_len)
1790  			*bex = *ex;
1791  	}
1792  
1793  	ext4_mb_check_limits(ac, e4b, 0);
1794  }
1795  
1796  static noinline_for_stack
1797  int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1798  					struct ext4_buddy *e4b)
1799  {
1800  	struct ext4_free_extent ex = ac->ac_b_ex;
1801  	ext4_group_t group = ex.fe_group;
1802  	int max;
1803  	int err;
1804  
1805  	BUG_ON(ex.fe_len <= 0);
1806  	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1807  	if (err)
1808  		return err;
1809  
1810  	ext4_lock_group(ac->ac_sb, group);
1811  	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1812  
1813  	if (max > 0) {
1814  		ac->ac_b_ex = ex;
1815  		ext4_mb_use_best_found(ac, e4b);
1816  	}
1817  
1818  	ext4_unlock_group(ac->ac_sb, group);
1819  	ext4_mb_unload_buddy(e4b);
1820  
1821  	return 0;
1822  }
1823  
1824  static noinline_for_stack
1825  int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1826  				struct ext4_buddy *e4b)
1827  {
1828  	ext4_group_t group = ac->ac_g_ex.fe_group;
1829  	int max;
1830  	int err;
1831  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1832  	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1833  	struct ext4_free_extent ex;
1834  
1835  	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1836  		return 0;
1837  	if (grp->bb_free == 0)
1838  		return 0;
1839  
1840  	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1841  	if (err)
1842  		return err;
1843  
1844  	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1845  		ext4_mb_unload_buddy(e4b);
1846  		return 0;
1847  	}
1848  
1849  	ext4_lock_group(ac->ac_sb, group);
1850  	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1851  			     ac->ac_g_ex.fe_len, &ex);
1852  	ex.fe_logical = 0xDEADFA11; /* debug value */
1853  
1854  	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1855  		ext4_fsblk_t start;
1856  
1857  		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1858  			ex.fe_start;
1859  		/* use do_div to get remainder (would be 64-bit modulo) */
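		/*
		 * E.g. (assuming s_stripe == 16): a goal extent whose
		 * physical start works out to block 32781 gives
		 * do_div(start, 16) == 13, so it is not stripe aligned and
		 * the hit is not taken; a start of 32784 gives remainder 0
		 * and the extent is used as the best found.
		 */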
1860  		if (do_div(start, sbi->s_stripe) == 0) {
1861  			ac->ac_found++;
1862  			ac->ac_b_ex = ex;
1863  			ext4_mb_use_best_found(ac, e4b);
1864  		}
1865  	} else if (max >= ac->ac_g_ex.fe_len) {
1866  		BUG_ON(ex.fe_len <= 0);
1867  		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1868  		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1869  		ac->ac_found++;
1870  		ac->ac_b_ex = ex;
1871  		ext4_mb_use_best_found(ac, e4b);
1872  	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1873  		/* Sometimes, the caller may want to merge even a small
1874  		 * number of blocks into an existing extent */
1875  		BUG_ON(ex.fe_len <= 0);
1876  		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1877  		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1878  		ac->ac_found++;
1879  		ac->ac_b_ex = ex;
1880  		ext4_mb_use_best_found(ac, e4b);
1881  	}
1882  	ext4_unlock_group(ac->ac_sb, group);
1883  	ext4_mb_unload_buddy(e4b);
1884  
1885  	return 0;
1886  }
1887  
1888  /*
1889   * The routine scans buddy structures (not the bitmap!) from the given order
1890   * up to the max order and tries to find a big enough chunk to satisfy the request
1891   */
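/*
 * For illustration: with ac_2order == 4 the loop below starts at the
 * order-4 buddy bitmap of this group.  If bb_counters[4] is non-zero,
 * some order-4 bit is clear, say bit k == 7; that bit describes a free
 * chunk of 1 << 4 == 16 blocks starting at block 7 << 4 == 112, which
 * becomes ac_b_ex directly.
 */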
1892  static noinline_for_stack
1893  void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1894  					struct ext4_buddy *e4b)
1895  {
1896  	struct super_block *sb = ac->ac_sb;
1897  	struct ext4_group_info *grp = e4b->bd_info;
1898  	void *buddy;
1899  	int i;
1900  	int k;
1901  	int max;
1902  
1903  	BUG_ON(ac->ac_2order <= 0);
1904  	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1905  		if (grp->bb_counters[i] == 0)
1906  			continue;
1907  
1908  		buddy = mb_find_buddy(e4b, i, &max);
1909  		BUG_ON(buddy == NULL);
1910  
1911  		k = mb_find_next_zero_bit(buddy, max, 0);
1912  		BUG_ON(k >= max);
1913  
1914  		ac->ac_found++;
1915  
1916  		ac->ac_b_ex.fe_len = 1 << i;
1917  		ac->ac_b_ex.fe_start = k << i;
1918  		ac->ac_b_ex.fe_group = e4b->bd_group;
1919  
1920  		ext4_mb_use_best_found(ac, e4b);
1921  
1922  		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1923  
1924  		if (EXT4_SB(sb)->s_mb_stats)
1925  			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1926  
1927  		break;
1928  	}
1929  }
1930  
1931  /*
1932   * The routine scans the group and measures all found extents.
1933   * In order to optimize scanning, the caller must pass the number of
1934   * free blocks in the group, so the routine can know the upper limit.
1935   */
1936  static noinline_for_stack
1937  void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1938  					struct ext4_buddy *e4b)
1939  {
1940  	struct super_block *sb = ac->ac_sb;
1941  	void *bitmap = e4b->bd_bitmap;
1942  	struct ext4_free_extent ex;
1943  	int i;
1944  	int free;
1945  
1946  	free = e4b->bd_info->bb_free;
1947  	if (WARN_ON(free <= 0))
1948  		return;
1949  
1950  	i = e4b->bd_info->bb_first_free;
1951  
1952  	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1953  		i = mb_find_next_zero_bit(bitmap,
1954  						EXT4_CLUSTERS_PER_GROUP(sb), i);
1955  		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1956  			/*
1957  			 * If we have a corrupt bitmap, we won't find any
1958  			 * free blocks even though the group info says we
1959  			 * have free blocks
1960  			 */
1961  			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1962  					"%d free clusters as per "
1963  					"group info. But bitmap says 0",
1964  					free);
1965  			break;
1966  		}
1967  
1968  		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1969  		if (WARN_ON(ex.fe_len <= 0))
1970  			break;
1971  		if (free < ex.fe_len) {
1972  			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1973  					"%d free clusters as per "
1974  					"group info. But got %d blocks",
1975  					free, ex.fe_len);
1976  			/*
1977  			 * The number of free blocks differs. This mostly
1978  			 * indicates that the bitmap is corrupt. So exit
1979  			 * without claiming the space.
1980  			 */
1981  			break;
1982  		}
1983  		ex.fe_logical = 0xDEADC0DE; /* debug value */
1984  		ext4_mb_measure_extent(ac, &ex, e4b);
1985  
1986  		i += ex.fe_len;
1987  		free -= ex.fe_len;
1988  	}
1989  
1990  	ext4_mb_check_limits(ac, e4b, 1);
1991  }
1992  
1993  /*
1994   * This is a special case for storage like RAID5:
1995   * we try to find stripe-aligned chunks for stripe-size-multiple requests
1996   */
1997  static noinline_for_stack
1998  void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1999  				 struct ext4_buddy *e4b)
2000  {
2001  	struct super_block *sb = ac->ac_sb;
2002  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2003  	void *bitmap = e4b->bd_bitmap;
2004  	struct ext4_free_extent ex;
2005  	ext4_fsblk_t first_group_block;
2006  	ext4_fsblk_t a;
2007  	ext4_grpblk_t i;
2008  	int max;
2009  
2010  	BUG_ON(sbi->s_stripe == 0);
2011  
2012  	/* find first stripe-aligned block in group */
2013  	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2014  
2015  	a = first_group_block + sbi->s_stripe - 1;
2016  	do_div(a, sbi->s_stripe);
2017  	i = (a * sbi->s_stripe) - first_group_block;
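	/*
	 * E.g. with s_stripe == 16 and a group starting at block 32770:
	 * a = (32770 + 15) / 16 = 2049 after do_div(), so the scan starts
	 * at i = 2049 * 16 - 32770 = 14, the group-relative offset of the
	 * first stripe-aligned block; for a group starting at 32768, i is
	 * simply 0.
	 */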
2018  
2019  	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2020  		if (!mb_test_bit(i, bitmap)) {
2021  			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2022  			if (max >= sbi->s_stripe) {
2023  				ac->ac_found++;
2024  				ex.fe_logical = 0xDEADF00D; /* debug value */
2025  				ac->ac_b_ex = ex;
2026  				ext4_mb_use_best_found(ac, e4b);
2027  				break;
2028  			}
2029  		}
2030  		i += sbi->s_stripe;
2031  	}
2032  }
2033  
2034  /*
2035   * This is now called BEFORE we load the buddy bitmap.
2036   * Returns either 1 or 0, indicating whether the group is suitable
2037   * for the allocation or not. In addition it can also return a negative
2038   * error code when something goes wrong.
2039   */
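/*
 * Roughly, the criteria below get progressively less picky: cr == 0 only
 * takes groups that look able to satisfy a power-of-2 request from the
 * buddy data, cr == 1 wants the average fragment (free / fragments) to be
 * at least the goal length, cr == 2 merely wants enough free clusters in
 * total, and cr == 3 accepts any group that has free space at all.
 */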
2040  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2041  				ext4_group_t group, int cr)
2042  {
2043  	unsigned free, fragments;
2044  	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2045  	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2046  
2047  	BUG_ON(cr < 0 || cr >= 4);
2048  
2049  	free = grp->bb_free;
2050  	if (free == 0)
2051  		return 0;
2052  	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2053  		return 0;
2054  
2055  	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2056  		return 0;
2057  
2058  	/* We only do this if the grp has never been initialized */
2059  	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2060  		int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2061  		if (ret)
2062  			return ret;
2063  	}
2064  
2065  	fragments = grp->bb_fragments;
2066  	if (fragments == 0)
2067  		return 0;
2068  
2069  	switch (cr) {
2070  	case 0:
2071  		BUG_ON(ac->ac_2order == 0);
2072  
2073  		/* Avoid using the first bg of a flexgroup for data files */
2074  		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2075  		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2076  		    ((group % flex_size) == 0))
2077  			return 0;
2078  
2079  		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2080  		    (free / fragments) >= ac->ac_g_ex.fe_len)
2081  			return 1;
2082  
2083  		if (grp->bb_largest_free_order < ac->ac_2order)
2084  			return 0;
2085  
2086  		return 1;
2087  	case 1:
2088  		if ((free / fragments) >= ac->ac_g_ex.fe_len)
2089  			return 1;
2090  		break;
2091  	case 2:
2092  		if (free >= ac->ac_g_ex.fe_len)
2093  			return 1;
2094  		break;
2095  	case 3:
2096  		return 1;
2097  	default:
2098  		BUG();
2099  	}
2100  
2101  	return 0;
2102  }
2103  
2104  static noinline_for_stack int
2105  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2106  {
2107  	ext4_group_t ngroups, group, i;
2108  	int cr;
2109  	int err = 0, first_err = 0;
2110  	struct ext4_sb_info *sbi;
2111  	struct super_block *sb;
2112  	struct ext4_buddy e4b;
2113  
2114  	sb = ac->ac_sb;
2115  	sbi = EXT4_SB(sb);
2116  	ngroups = ext4_get_groups_count(sb);
2117  	/* non-extent files are limited to low blocks/groups */
2118  	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2119  		ngroups = sbi->s_blockfile_groups;
2120  
2121  	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2122  
2123  	/* first, try the goal */
2124  	err = ext4_mb_find_by_goal(ac, &e4b);
2125  	if (err || ac->ac_status == AC_STATUS_FOUND)
2126  		goto out;
2127  
2128  	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2129  		goto out;
2130  
2131  	/*
2132  	 * ac->ac_2order is set only if the fe_len is a power of 2;
2133  	 * if ac_2order is set we also set the criteria to 0 so that we
2134  	 * try an exact allocation using the buddy data.
2135  	 */
2136  	i = fls(ac->ac_g_ex.fe_len);
2137  	ac->ac_2order = 0;
2138  	/*
2139  	 * We search using buddy data only if the order of the request
2140  	 * is greater than or equal to sbi->s_mb_order2_reqs.
2141  	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2142  	 * We also support searching for power-of-two requests only for
2143  	 * requests up to the maximum buddy size we have constructed.
2144  	 */
2145  	if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
2146  		/*
2147  		 * This should tell if fe_len is exactly a power of 2
2148  		 */
2149  		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2150  			ac->ac_2order = array_index_nospec(i - 1,
2151  							   sb->s_blocksize_bits + 2);
2152  	}
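	/*
	 * Worked examples of the power-of-2 test above: fe_len == 64 gives
	 * i = fls(64) = 7 and 64 & ~(1 << 6) == 0, so ac_2order = 6 and the
	 * scan can start at cr == 0; fe_len == 96 gives i = 7 but
	 * 96 & ~64 == 32, so ac_2order stays 0 and the scan starts at cr == 1.
	 */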
2153  
2154  	/* if stream allocation is enabled, use global goal */
2155  	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2156  		/* TBD: may be a hot spot */
2157  		spin_lock(&sbi->s_md_lock);
2158  		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2159  		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2160  		spin_unlock(&sbi->s_md_lock);
2161  	}
2162  
2163  	/* Let's just scan groups to find more or less suitable blocks */
2164  	cr = ac->ac_2order ? 0 : 1;
2165  	/*
2166  	 * cr == 0 tries to get an exact allocation,
2167  	 * cr == 3 tries to get anything
2168  	 */
2169  repeat:
2170  	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2171  		ac->ac_criteria = cr;
2172  		/*
2173  		 * the search for the right group starts
2174  		 * from the goal value specified
2175  		 */
2176  		group = ac->ac_g_ex.fe_group;
2177  
2178  		for (i = 0; i < ngroups; group++, i++) {
2179  			int ret = 0;
2180  			cond_resched();
2181  			/*
2182  			 * Artificially restricted ngroups for non-extent
2183  			 * files makes group >= ngroups possible on the first loop.
2184  			 */
2185  			if (group >= ngroups)
2186  				group = 0;
2187  
2188  			/* This now checks without needing the buddy page */
2189  			ret = ext4_mb_good_group(ac, group, cr);
2190  			if (ret <= 0) {
2191  				if (!first_err)
2192  					first_err = ret;
2193  				continue;
2194  			}
2195  
2196  			err = ext4_mb_load_buddy(sb, group, &e4b);
2197  			if (err)
2198  				goto out;
2199  
2200  			ext4_lock_group(sb, group);
2201  
2202  			/*
2203  			 * We need to check again after locking the
2204  			 * block group
2205  			 */
2206  			ret = ext4_mb_good_group(ac, group, cr);
2207  			if (ret <= 0) {
2208  				ext4_unlock_group(sb, group);
2209  				ext4_mb_unload_buddy(&e4b);
2210  				if (!first_err)
2211  					first_err = ret;
2212  				continue;
2213  			}
2214  
2215  			ac->ac_groups_scanned++;
2216  			if (cr == 0)
2217  				ext4_mb_simple_scan_group(ac, &e4b);
2218  			else if (cr == 1 && sbi->s_stripe &&
2219  					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
2220  				ext4_mb_scan_aligned(ac, &e4b);
2221  			else
2222  				ext4_mb_complex_scan_group(ac, &e4b);
2223  
2224  			ext4_unlock_group(sb, group);
2225  			ext4_mb_unload_buddy(&e4b);
2226  
2227  			if (ac->ac_status != AC_STATUS_CONTINUE)
2228  				break;
2229  		}
2230  	}
2231  
2232  	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2233  	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2234  		/*
2235  		 * We've been searching too long. Let's try to allocate
2236  		 * the best chunk we've found so far
2237  		 */
2238  
2239  		ext4_mb_try_best_found(ac, &e4b);
2240  		if (ac->ac_status != AC_STATUS_FOUND) {
2241  			/*
2242  			 * Someone luckier has already allocated it.
2243  			 * The only thing we can do is just take the first
2244  			 * found block(s)
2245  			printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2246  			 */
2247  			ac->ac_b_ex.fe_group = 0;
2248  			ac->ac_b_ex.fe_start = 0;
2249  			ac->ac_b_ex.fe_len = 0;
2250  			ac->ac_status = AC_STATUS_CONTINUE;
2251  			ac->ac_flags |= EXT4_MB_HINT_FIRST;
2252  			cr = 3;
2253  			atomic_inc(&sbi->s_mb_lost_chunks);
2254  			goto repeat;
2255  		}
2256  	}
2257  out:
2258  	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2259  		err = first_err;
2260  	return err;
2261  }
2262  
2263  static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2264  {
2265  	struct super_block *sb = seq->private;
2266  	ext4_group_t group;
2267  
2268  	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2269  		return NULL;
2270  	group = *pos + 1;
2271  	return (void *) ((unsigned long) group);
2272  }
2273  
2274  static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2275  {
2276  	struct super_block *sb = seq->private;
2277  	ext4_group_t group;
2278  
2279  	++*pos;
2280  	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2281  		return NULL;
2282  	group = *pos + 1;
2283  	return (void *) ((unsigned long) group);
2284  }
2285  
2286  static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2287  {
2288  	struct super_block *sb = seq->private;
2289  	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2290  	int i;
2291  	int err, buddy_loaded = 0;
2292  	struct ext4_buddy e4b;
2293  	struct ext4_group_info *grinfo;
2294  	struct sg {
2295  		struct ext4_group_info info;
2296  		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2297  	} sg;
2298  
2299  	group--;
2300  	if (group == 0)
2301  		seq_puts(seq, "#group: free  frags first ["
2302  			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2303  			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
2304  
2305  	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2306  		sizeof(struct ext4_group_info);
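	/*
	 * Roughly: 'i' is the number of bytes memcpy()ed into the on-stack
	 * 'sg' below -- the fixed part of ext4_group_info plus its trailing
	 * bb_counters[] for orders 0 .. s_blocksize_bits + 1 (e.g. 14
	 * counters for 4k blocks) -- so the buddy can be unloaded before
	 * the numbers are printed.
	 */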
2307  	grinfo = ext4_get_group_info(sb, group);
2308  	/* Load the group info in memory only if not already loaded. */
2309  	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2310  		err = ext4_mb_load_buddy(sb, group, &e4b);
2311  		if (err) {
2312  			seq_printf(seq, "#%-5u: I/O error\n", group);
2313  			return 0;
2314  		}
2315  		buddy_loaded = 1;
2316  	}
2317  
2318  	memcpy(&sg, ext4_get_group_info(sb, group), i);
2319  
2320  	if (buddy_loaded)
2321  		ext4_mb_unload_buddy(&e4b);
2322  
2323  	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2324  			sg.info.bb_fragments, sg.info.bb_first_free);
2325  	for (i = 0; i <= 13; i++)
2326  		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2327  				sg.info.bb_counters[i] : 0);
2328  	seq_printf(seq, " ]\n");
2329  
2330  	return 0;
2331  }
2332  
2333  static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2334  {
2335  }
2336  
2337  static const struct seq_operations ext4_mb_seq_groups_ops = {
2338  	.start  = ext4_mb_seq_groups_start,
2339  	.next   = ext4_mb_seq_groups_next,
2340  	.stop   = ext4_mb_seq_groups_stop,
2341  	.show   = ext4_mb_seq_groups_show,
2342  };
2343  
2344  static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2345  {
2346  	struct super_block *sb = PDE_DATA(inode);
2347  	int rc;
2348  
2349  	rc = seq_open(file, &ext4_mb_seq_groups_ops);
2350  	if (rc == 0) {
2351  		struct seq_file *m = file->private_data;
2352  		m->private = sb;
2353  	}
2354  	return rc;
2355  
2356  }
2357  
2358  const struct file_operations ext4_seq_mb_groups_fops = {
2359  	.owner		= THIS_MODULE,
2360  	.open		= ext4_mb_seq_groups_open,
2361  	.read		= seq_read,
2362  	.llseek		= seq_lseek,
2363  	.release	= seq_release,
2364  };
2365  
2366  static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2367  {
2368  	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2369  	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2370  
2371  	BUG_ON(!cachep);
2372  	return cachep;
2373  }
2374  
2375  /*
2376   * Allocate the top-level s_group_info array for the specified number
2377   * of groups
2378   */
2379  int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2380  {
2381  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2382  	unsigned size;
2383  	struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
2384  
2385  	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2386  		EXT4_DESC_PER_BLOCK_BITS(sb);
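	/*
	 * E.g. with 4k blocks and 32-byte descriptors (assumed here),
	 * EXT4_DESC_PER_BLOCK(sb) == 128, so a filesystem with 40000
	 * groups needs ceil(40000 / 128) == 313 second-level tables of
	 * group-info pointers.
	 */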
2387  	if (size <= sbi->s_group_info_size)
2388  		return 0;
2389  
2390  	size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2391  	new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2392  	if (!new_groupinfo) {
2393  		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2394  		return -ENOMEM;
2395  	}
2396  	rcu_read_lock();
2397  	old_groupinfo = rcu_dereference(sbi->s_group_info);
2398  	if (old_groupinfo)
2399  		memcpy(new_groupinfo, old_groupinfo,
2400  		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2401  	rcu_read_unlock();
2402  	rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
2403  	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2404  	if (old_groupinfo)
2405  		ext4_kvfree_array_rcu(old_groupinfo);
2406  	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2407  		   sbi->s_group_info_size);
2408  	return 0;
2409  }
2410  
2411  /* Create and initialize ext4_group_info data for the given group. */
2412  int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2413  			  struct ext4_group_desc *desc)
2414  {
2415  	int i;
2416  	int metalen = 0;
2417  	int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2418  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2419  	struct ext4_group_info **meta_group_info;
2420  	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2421  
2422  	/*
2423  	 * First check if this group is the first of a reserved block.
2424  	 * If so, we have to allocate a new table of pointers
2425  	 * to ext4_group_info structures
2426  	 */
2427  	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2428  		metalen = sizeof(*meta_group_info) <<
2429  			EXT4_DESC_PER_BLOCK_BITS(sb);
2430  		meta_group_info = kmalloc(metalen, GFP_NOFS);
2431  		if (meta_group_info == NULL) {
2432  			ext4_msg(sb, KERN_ERR, "can't allocate mem "
2433  				 "for a buddy group");
2434  			goto exit_meta_group_info;
2435  		}
2436  		rcu_read_lock();
2437  		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
2438  		rcu_read_unlock();
2439  	}
2440  
2441  	meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
2442  	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
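	/*
	 * Two-level lookup example (again assuming 128 descriptors per
	 * block): group 300 lands in table idx = 300 >> 7 = 2 at slot
	 * i = 300 & 127 = 44, i.e. s_group_info[2][44].
	 */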
2443  
2444  	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2445  	if (meta_group_info[i] == NULL) {
2446  		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2447  		goto exit_group_info;
2448  	}
2449  	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2450  		&(meta_group_info[i]->bb_state));
2451  
2452  	/*
2453  	 * initialize bb_free to be able to skip
2454  	 * empty groups without initialization
2455  	 */
2456  	if (ext4_has_group_desc_csum(sb) &&
2457  	    (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2458  		meta_group_info[i]->bb_free =
2459  			ext4_free_clusters_after_init(sb, group, desc);
2460  	} else {
2461  		meta_group_info[i]->bb_free =
2462  			ext4_free_group_clusters(sb, desc);
2463  	}
2464  
2465  	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2466  	init_rwsem(&meta_group_info[i]->alloc_sem);
2467  	meta_group_info[i]->bb_free_root = RB_ROOT;
2468  	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2469  
2470  #ifdef DOUBLE_CHECK
2471  	{
2472  		struct buffer_head *bh;
2473  		meta_group_info[i]->bb_bitmap =
2474  			kmalloc(sb->s_blocksize, GFP_NOFS);
2475  		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2476  		bh = ext4_read_block_bitmap(sb, group);
2477  		BUG_ON(IS_ERR_OR_NULL(bh));
2478  		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2479  			sb->s_blocksize);
2480  		put_bh(bh);
2481  	}
2482  #endif
2483  
2484  	return 0;
2485  
2486  exit_group_info:
2487  	/* If a meta_group_info table has been allocated, release it now */
2488  	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2489  		struct ext4_group_info ***group_info;
2490  
2491  		rcu_read_lock();
2492  		group_info = rcu_dereference(sbi->s_group_info);
2493  		kfree(group_info[idx]);
2494  		group_info[idx] = NULL;
2495  		rcu_read_unlock();
2496  	}
2497  exit_meta_group_info:
2498  	return -ENOMEM;
2499  } /* ext4_mb_add_groupinfo */
2500  
2501  static int ext4_mb_init_backend(struct super_block *sb)
2502  {
2503  	ext4_group_t ngroups = ext4_get_groups_count(sb);
2504  	ext4_group_t i;
2505  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2506  	int err;
2507  	struct ext4_group_desc *desc;
2508  	struct ext4_group_info ***group_info;
2509  	struct kmem_cache *cachep;
2510  
2511  	err = ext4_mb_alloc_groupinfo(sb, ngroups);
2512  	if (err)
2513  		return err;
2514  
2515  	sbi->s_buddy_cache = new_inode(sb);
2516  	if (sbi->s_buddy_cache == NULL) {
2517  		ext4_msg(sb, KERN_ERR, "can't get new inode");
2518  		goto err_freesgi;
2519  	}
2520  	/* To avoid potentially colliding with a valid on-disk inode number,
2521  	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2522  	 * not in the inode hash, so it should never be found by iget(), but
2523  	 * this will avoid confusion if it ever shows up during debugging. */
2524  	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2525  	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2526  	for (i = 0; i < ngroups; i++) {
2527  		desc = ext4_get_group_desc(sb, i, NULL);
2528  		if (desc == NULL) {
2529  			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2530  			goto err_freebuddy;
2531  		}
2532  		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2533  			goto err_freebuddy;
2534  	}
2535  
2536  	return 0;
2537  
2538  err_freebuddy:
2539  	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2540  	while (i-- > 0)
2541  		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2542  	i = sbi->s_group_info_size;
2543  	rcu_read_lock();
2544  	group_info = rcu_dereference(sbi->s_group_info);
2545  	while (i-- > 0)
2546  		kfree(group_info[i]);
2547  	rcu_read_unlock();
2548  	iput(sbi->s_buddy_cache);
2549  err_freesgi:
2550  	rcu_read_lock();
2551  	kvfree(rcu_dereference(sbi->s_group_info));
2552  	rcu_read_unlock();
2553  	return -ENOMEM;
2554  }
2555  
2556  static void ext4_groupinfo_destroy_slabs(void)
2557  {
2558  	int i;
2559  
2560  	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2561  		if (ext4_groupinfo_caches[i])
2562  			kmem_cache_destroy(ext4_groupinfo_caches[i]);
2563  		ext4_groupinfo_caches[i] = NULL;
2564  	}
2565  }
2566  
2567  static int ext4_groupinfo_create_slab(size_t size)
2568  {
2569  	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2570  	int slab_size;
2571  	int blocksize_bits = order_base_2(size);
2572  	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2573  	struct kmem_cache *cachep;
2574  
2575  	if (cache_index >= NR_GRPINFO_CACHES)
2576  		return -EINVAL;
2577  
2578  	if (unlikely(cache_index < 0))
2579  		cache_index = 0;
2580  
2581  	mutex_lock(&ext4_grpinfo_slab_create_mutex);
2582  	if (ext4_groupinfo_caches[cache_index]) {
2583  		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2584  		return 0;	/* Already created */
2585  	}
2586  
2587  	slab_size = offsetof(struct ext4_group_info,
2588  				bb_counters[blocksize_bits + 2]);
2589  
2590  	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2591  					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2592  					NULL);
2593  
2594  	ext4_groupinfo_caches[cache_index] = cachep;
2595  
2596  	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2597  	if (!cachep) {
2598  		printk(KERN_EMERG
2599  		       "EXT4-fs: no memory for groupinfo slab cache\n");
2600  		return -ENOMEM;
2601  	}
2602  
2603  	return 0;
2604  }
2605  
2606  int ext4_mb_init(struct super_block *sb)
2607  {
2608  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2609  	unsigned i, j;
2610  	unsigned offset, offset_incr;
2611  	unsigned max;
2612  	int ret;
2613  
2614  	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2615  
2616  	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2617  	if (sbi->s_mb_offsets == NULL) {
2618  		ret = -ENOMEM;
2619  		goto out;
2620  	}
2621  
2622  	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2623  	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2624  	if (sbi->s_mb_maxs == NULL) {
2625  		ret = -ENOMEM;
2626  		goto out;
2627  	}
2628  
2629  	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2630  	if (ret < 0)
2631  		goto out;
2632  
2633  	/* order 0 is regular bitmap */
2634  	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2635  	sbi->s_mb_offsets[0] = 0;
2636  
2637  	i = 1;
2638  	offset = 0;
2639  	offset_incr = 1 << (sb->s_blocksize_bits - 1);
2640  	max = sb->s_blocksize << 2;
2641  	do {
2642  		sbi->s_mb_offsets[i] = offset;
2643  		sbi->s_mb_maxs[i] = max;
2644  		offset += offset_incr;
2645  		offset_incr = offset_incr >> 1;
2646  		max = max >> 1;
2647  		i++;
2648  	} while (i <= sb->s_blocksize_bits + 1);
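	/*
	 * For 4k blocks this builds (illustratively): maxs[0] = 32768 bits
	 * for the on-disk block bitmap, then byte offsets into the buddy
	 * block of offsets[1] = 0, offsets[2] = 2048, offsets[3] = 3072,
	 * offsets[4] = 3584, ... with maxs halving from 16384 order-1
	 * buddies down to 4 buddies at the top order i = 13.
	 */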
2649  
2650  	spin_lock_init(&sbi->s_md_lock);
2651  	spin_lock_init(&sbi->s_bal_lock);
2652  
2653  	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2654  	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2655  	sbi->s_mb_stats = MB_DEFAULT_STATS;
2656  	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2657  	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2658  	/*
2659  	 * The default group preallocation is 512, which for 4k block
2660  	 * sizes translates to 2 megabytes.  However for bigalloc file
2661   * systems, this is probably too big (i.e., if the cluster size
2662  	 * is 1 megabyte, then group preallocation size becomes half a
2663  	 * gigabyte!).  As a default, we will keep a two megabyte
2664   * group prealloc size for cluster sizes up to 64k, and after
2665  	 * that, we will force a minimum group preallocation size of
2666  	 * 32 clusters.  This translates to 8 megs when the cluster
2667  	 * size is 256k, and 32 megs when the cluster size is 1 meg,
2668  	 * which seems reasonable as a default.
2669  	 */
2670  	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2671  				       sbi->s_cluster_bits, 32);
2672  	/*
2673  	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2674  	 * to the lowest multiple of s_stripe which is bigger than
2675  	 * the s_mb_group_prealloc as determined above. We want
2676  	 * the preallocation size to be an exact multiple of the
2677  	 * RAID stripe size so that preallocations don't fragment
2678  	 * the stripes.
2679  	 */
2680  	if (sbi->s_stripe > 1) {
2681  		sbi->s_mb_group_prealloc = roundup(
2682  			sbi->s_mb_group_prealloc, sbi->s_stripe);
2683  	}
2684  
2685  	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2686  	if (sbi->s_locality_groups == NULL) {
2687  		ret = -ENOMEM;
2688  		goto out;
2689  	}
2690  	for_each_possible_cpu(i) {
2691  		struct ext4_locality_group *lg;
2692  		lg = per_cpu_ptr(sbi->s_locality_groups, i);
2693  		mutex_init(&lg->lg_mutex);
2694  		for (j = 0; j < PREALLOC_TB_SIZE; j++)
2695  			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2696  		spin_lock_init(&lg->lg_prealloc_lock);
2697  	}
2698  
2699  	/* init file for buddy data */
2700  	ret = ext4_mb_init_backend(sb);
2701  	if (ret != 0)
2702  		goto out_free_locality_groups;
2703  
2704  	return 0;
2705  
2706  out_free_locality_groups:
2707  	free_percpu(sbi->s_locality_groups);
2708  	sbi->s_locality_groups = NULL;
2709  out:
2710  	kfree(sbi->s_mb_offsets);
2711  	sbi->s_mb_offsets = NULL;
2712  	kfree(sbi->s_mb_maxs);
2713  	sbi->s_mb_maxs = NULL;
2714  	return ret;
2715  }
2716  
2717  /* needs to be called with the ext4 group lock held */
2718  static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2719  {
2720  	struct ext4_prealloc_space *pa;
2721  	struct list_head *cur, *tmp;
2722  	int count = 0;
2723  
2724  	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2725  		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2726  		list_del(&pa->pa_group_list);
2727  		count++;
2728  		kmem_cache_free(ext4_pspace_cachep, pa);
2729  	}
2730  	if (count)
2731  		mb_debug(1, "mballoc: %u PAs left\n", count);
2732  
2733  }
2734  
2735  int ext4_mb_release(struct super_block *sb)
2736  {
2737  	ext4_group_t ngroups = ext4_get_groups_count(sb);
2738  	ext4_group_t i;
2739  	int num_meta_group_infos;
2740  	struct ext4_group_info *grinfo, ***group_info;
2741  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2742  	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2743  
2744  	if (sbi->s_group_info) {
2745  		for (i = 0; i < ngroups; i++) {
2746  			grinfo = ext4_get_group_info(sb, i);
2747  #ifdef DOUBLE_CHECK
2748  			kfree(grinfo->bb_bitmap);
2749  #endif
2750  			ext4_lock_group(sb, i);
2751  			ext4_mb_cleanup_pa(grinfo);
2752  			ext4_unlock_group(sb, i);
2753  			kmem_cache_free(cachep, grinfo);
2754  		}
2755  		num_meta_group_infos = (ngroups +
2756  				EXT4_DESC_PER_BLOCK(sb) - 1) >>
2757  			EXT4_DESC_PER_BLOCK_BITS(sb);
2758  		rcu_read_lock();
2759  		group_info = rcu_dereference(sbi->s_group_info);
2760  		for (i = 0; i < num_meta_group_infos; i++)
2761  			kfree(group_info[i]);
2762  		kvfree(group_info);
2763  		rcu_read_unlock();
2764  	}
2765  	kfree(sbi->s_mb_offsets);
2766  	kfree(sbi->s_mb_maxs);
2767  	iput(sbi->s_buddy_cache);
2768  	if (sbi->s_mb_stats) {
2769  		ext4_msg(sb, KERN_INFO,
2770  		       "mballoc: %u blocks %u reqs (%u success)",
2771  				atomic_read(&sbi->s_bal_allocated),
2772  				atomic_read(&sbi->s_bal_reqs),
2773  				atomic_read(&sbi->s_bal_success));
2774  		ext4_msg(sb, KERN_INFO,
2775  		      "mballoc: %u extents scanned, %u goal hits, "
2776  				"%u 2^N hits, %u breaks, %u lost",
2777  				atomic_read(&sbi->s_bal_ex_scanned),
2778  				atomic_read(&sbi->s_bal_goals),
2779  				atomic_read(&sbi->s_bal_2orders),
2780  				atomic_read(&sbi->s_bal_breaks),
2781  				atomic_read(&sbi->s_mb_lost_chunks));
2782  		ext4_msg(sb, KERN_INFO,
2783  		       "mballoc: %lu generated and it took %Lu",
2784  				sbi->s_mb_buddies_generated,
2785  				sbi->s_mb_generation_time);
2786  		ext4_msg(sb, KERN_INFO,
2787  		       "mballoc: %u preallocated, %u discarded",
2788  				atomic_read(&sbi->s_mb_preallocated),
2789  				atomic_read(&sbi->s_mb_discarded));
2790  	}
2791  
2792  	free_percpu(sbi->s_locality_groups);
2793  
2794  	return 0;
2795  }
2796  
2797  static inline int ext4_issue_discard(struct super_block *sb,
2798  		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
2799  		unsigned long flags)
2800  {
2801  	ext4_fsblk_t discard_block;
2802  
2803  	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2804  			 ext4_group_first_block_no(sb, block_group));
2805  	count = EXT4_C2B(EXT4_SB(sb), count);
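	/*
	 * E.g. on a bigalloc filesystem with 16 blocks per cluster
	 * (assumed for illustration): cluster 10 of a group whose first
	 * block is 32768 maps to discard_block = 32768 + 160, and a
	 * count of 4 clusters becomes 64 blocks for sb_issue_discard().
	 */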
2806  	trace_ext4_discard_blocks(sb,
2807  			(unsigned long long) discard_block, count);
2808  	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
2809  }
2810  
2811  /*
2812   * This function is called by the jbd2 layer once the commit has finished,
2813   * so we know we can free the blocks that were released with that commit.
2814   */
2815  static void ext4_free_data_callback(struct super_block *sb,
2816  				    struct ext4_journal_cb_entry *jce,
2817  				    int rc)
2818  {
2819  	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2820  	struct ext4_buddy e4b;
2821  	struct ext4_group_info *db;
2822  	int err, count = 0, count2 = 0;
2823  
2824  	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2825  		 entry->efd_count, entry->efd_group, entry);
2826  
2827  	if (test_opt(sb, DISCARD)) {
2828  		err = ext4_issue_discard(sb, entry->efd_group,
2829  					 entry->efd_start_cluster,
2830  					 entry->efd_count, 0);
2831  		if (err && err != -EOPNOTSUPP)
2832  			ext4_msg(sb, KERN_WARNING, "discard request in"
2833  				 " group:%d block:%d count:%d failed"
2834  				 " with %d", entry->efd_group,
2835  				 entry->efd_start_cluster,
2836  				 entry->efd_count, err);
2837  	}
2838  
2839  	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2840  	/* we expect to find an existing buddy because it's pinned */
2841  	BUG_ON(err != 0);
2842  
2843  
2844  	db = e4b.bd_info;
2845  	/* there are blocks to put in buddy to make them really free */
2846  	count += entry->efd_count;
2847  	count2++;
2848  	ext4_lock_group(sb, entry->efd_group);
2849  	/* Take it out of per group rb tree */
2850  	rb_erase(&entry->efd_node, &(db->bb_free_root));
2851  	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2852  
2853  	/*
2854  	 * Clear the trimmed flag for the group so that the next
2855  	 * ext4_trim_fs can trim it.
2856  	 * If the volume is mounted with -o discard, online discard
2857  	 * is supported and the free blocks will be trimmed online.
2858  	 */
2859  	if (!test_opt(sb, DISCARD))
2860  		EXT4_MB_GRP_CLEAR_TRIMMED(db);
2861  
2862  	if (!db->bb_free_root.rb_node) {
2863  		/* No more items in the per-group rb tree;
2864  		 * balance refcounts from ext4_mb_free_metadata()
2865  		 */
2866  		page_cache_release(e4b.bd_buddy_page);
2867  		page_cache_release(e4b.bd_bitmap_page);
2868  	}
2869  	ext4_unlock_group(sb, entry->efd_group);
2870  	kmem_cache_free(ext4_free_data_cachep, entry);
2871  	ext4_mb_unload_buddy(&e4b);
2872  
2873  	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2874  }
2875  
2876  int __init ext4_init_mballoc(void)
2877  {
2878  	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2879  					SLAB_RECLAIM_ACCOUNT);
2880  	if (ext4_pspace_cachep == NULL)
2881  		return -ENOMEM;
2882  
2883  	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2884  				    SLAB_RECLAIM_ACCOUNT);
2885  	if (ext4_ac_cachep == NULL) {
2886  		kmem_cache_destroy(ext4_pspace_cachep);
2887  		return -ENOMEM;
2888  	}
2889  
2890  	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2891  					   SLAB_RECLAIM_ACCOUNT);
2892  	if (ext4_free_data_cachep == NULL) {
2893  		kmem_cache_destroy(ext4_pspace_cachep);
2894  		kmem_cache_destroy(ext4_ac_cachep);
2895  		return -ENOMEM;
2896  	}
2897  	return 0;
2898  }
2899  
2900  void ext4_exit_mballoc(void)
2901  {
2902  	/*
2903  	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2904  	 * before destroying the slab cache.
2905  	 */
2906  	rcu_barrier();
2907  	kmem_cache_destroy(ext4_pspace_cachep);
2908  	kmem_cache_destroy(ext4_ac_cachep);
2909  	kmem_cache_destroy(ext4_free_data_cachep);
2910  	ext4_groupinfo_destroy_slabs();
2911  }
2912  
2913  
2914  /*
2915   * Check quota and mark the chosen space (ac->ac_b_ex) non-free in bitmaps.
2916   * Returns 0 on success or an error code.
2917   */
2918  static noinline_for_stack int
2919  ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2920  				handle_t *handle, unsigned int reserv_clstrs)
2921  {
2922  	struct buffer_head *bitmap_bh = NULL;
2923  	struct ext4_group_desc *gdp;
2924  	struct buffer_head *gdp_bh;
2925  	struct ext4_sb_info *sbi;
2926  	struct super_block *sb;
2927  	ext4_fsblk_t block;
2928  	int err, len;
2929  
2930  	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2931  	BUG_ON(ac->ac_b_ex.fe_len <= 0);
2932  
2933  	sb = ac->ac_sb;
2934  	sbi = EXT4_SB(sb);
2935  
2936  	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2937  	if (IS_ERR(bitmap_bh)) {
2938  		err = PTR_ERR(bitmap_bh);
2939  		bitmap_bh = NULL;
2940  		goto out_err;
2941  	}
2942  
2943  	BUFFER_TRACE(bitmap_bh, "getting write access");
2944  	err = ext4_journal_get_write_access(handle, bitmap_bh);
2945  	if (err)
2946  		goto out_err;
2947  
2948  	err = -EIO;
2949  	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2950  	if (!gdp)
2951  		goto out_err;
2952  
2953  	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2954  			ext4_free_group_clusters(sb, gdp));
2955  
2956  	BUFFER_TRACE(gdp_bh, "get_write_access");
2957  	err = ext4_journal_get_write_access(handle, gdp_bh);
2958  	if (err)
2959  		goto out_err;
2960  
2961  	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2962  
2963  	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2964  	if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
2965  		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2966  			   "fs metadata", block, block+len);
2967  		/* The file system is mounted not to panic on errors;
2968  		 * fix the bitmap and return EFSCORRUPTED.
2969  		 * We leak some of the blocks here.
2970  		 */
2971  		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2972  		ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2973  			      ac->ac_b_ex.fe_len);
2974  		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2975  		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2976  		if (!err)
2977  			err = -EFSCORRUPTED;
2978  		goto out_err;
2979  	}
2980  
2981  	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2982  #ifdef AGGRESSIVE_CHECK
2983  	{
2984  		int i;
2985  		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2986  			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2987  						bitmap_bh->b_data));
2988  		}
2989  	}
2990  #endif
2991  	ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2992  		      ac->ac_b_ex.fe_len);
2993  	if (ext4_has_group_desc_csum(sb) &&
2994  	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2995  		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2996  		ext4_free_group_clusters_set(sb, gdp,
2997  					     ext4_free_clusters_after_init(sb,
2998  						ac->ac_b_ex.fe_group, gdp));
2999  	}
3000  	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3001  	ext4_free_group_clusters_set(sb, gdp, len);
3002  	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3003  	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3004  
3005  	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3006  	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3007  	/*
3008  	 * Now reduce the dirty block count also. Should not go negative
3009  	 */
3010  	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3011  		/* release all the reserved blocks if non delalloc */
3012  		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3013  				   reserv_clstrs);
3014  
3015  	if (sbi->s_log_groups_per_flex) {
3016  		ext4_group_t flex_group = ext4_flex_group(sbi,
3017  							  ac->ac_b_ex.fe_group);
3018  		atomic64_sub(ac->ac_b_ex.fe_len,
3019  			     &sbi_array_rcu_deref(sbi, s_flex_groups,
3020  						  flex_group)->free_clusters);
3021  	}
3022  
3023  	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3024  	if (err)
3025  		goto out_err;
3026  	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3027  
3028  out_err:
3029  	brelse(bitmap_bh);
3030  	return err;
3031  }
3032  
3033  /*
3034   * Here we normalize the request for a locality group.
3035   * Group requests are normalized to s_mb_group_prealloc, which goes to
3036   * s_stripe if we set the same via the mount option.
3037   * s_mb_group_prealloc can be configured via
3038   * /sys/fs/ext4/<partition>/mb_group_prealloc
3039   *
3040   * XXX: should we try to preallocate more than the group has now?
3041   */
3042  static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3043  {
3044  	struct super_block *sb = ac->ac_sb;
3045  	struct ext4_locality_group *lg = ac->ac_lg;
3046  
3047  	BUG_ON(lg == NULL);
3048  	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3049  	mb_debug(1, "#%u: goal %u blocks for locality group\n",
3050  		current->pid, ac->ac_g_ex.fe_len);
3051  }
3052  
3053  /*
3054   * Normalization means making the request better in terms of
3055   * size and alignment
3056   */
3057  static noinline_for_stack void
3058  ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3059  				struct ext4_allocation_request *ar)
3060  {
3061  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3062  	int bsbits, max;
3063  	ext4_lblk_t end;
3064  	loff_t size, start_off;
3065  	loff_t orig_size __maybe_unused;
3066  	ext4_lblk_t start;
3067  	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3068  	struct ext4_prealloc_space *pa;
3069  
3070  	/* only normalize data requests; metadata requests
3071  	   do not need preallocation */
3072  	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3073  		return;
3074  
3075  	/* sometimes the caller may want exact blocks */
3076  	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3077  		return;
3078  
3079  	/* caller may indicate that preallocation isn't
3080  	 * required (it's a tail, for example) */
3081  	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3082  		return;
3083  
3084  	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3085  		ext4_mb_normalize_group_request(ac);
3086  		return ;
3087  	}
3088  
3089  	bsbits = ac->ac_sb->s_blocksize_bits;
3090  
3091  	/* first, let's learn the actual file size
3092  	 * once the current request is allocated */
3093  	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3094  	size = size << bsbits;
3095  	if (size < i_size_read(ac->ac_inode))
3096  		size = i_size_read(ac->ac_inode);
3097  	orig_size = size;
3098  
3099  	/* max size of free chunks */
3100  	max = 2 << bsbits;
3101  
3102  #define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
3103  		(req <= (size) || max <= (chunk_size))
3104  
3105  	/* first, try to predict filesize */
3106  	/* XXX: should this table be tunable? */
3107  	start_off = 0;
3108  	if (size <= 16 * 1024) {
3109  		size = 16 * 1024;
3110  	} else if (size <= 32 * 1024) {
3111  		size = 32 * 1024;
3112  	} else if (size <= 64 * 1024) {
3113  		size = 64 * 1024;
3114  	} else if (size <= 128 * 1024) {
3115  		size = 128 * 1024;
3116  	} else if (size <= 256 * 1024) {
3117  		size = 256 * 1024;
3118  	} else if (size <= 512 * 1024) {
3119  		size = 512 * 1024;
3120  	} else if (size <= 1024 * 1024) {
3121  		size = 1024 * 1024;
3122  	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3123  		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3124  						(21 - bsbits)) << 21;
3125  		size = 2 * 1024 * 1024;
3126  	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3127  		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3128  							(22 - bsbits)) << 22;
3129  		size = 4 * 1024 * 1024;
3130  	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3131  					(8<<20)>>bsbits, max, 8 * 1024)) {
3132  		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3133  							(23 - bsbits)) << 23;
3134  		size = 8 * 1024 * 1024;
3135  	} else {
3136  		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3137  		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3138  					      ac->ac_o_ex.fe_len) << bsbits;
3139  	}
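	/*
	 * A hypothetical pass through the table above (4k blocks with one
	 * block per cluster, so bsbits == 12): a write of 10 blocks at
	 * logical block 100 into a ~300k file yields size = 110 << 12 =
	 * 450560 bytes, which falls into the "<= 512k" bucket, so the goal
	 * becomes 512k with start_off left at 0; after the shifts below
	 * that is a 128-block goal starting at logical block 0, before it
	 * is trimmed against neighbours and existing preallocations.
	 */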
3140  	size = size >> bsbits;
3141  	start = start_off >> bsbits;
3142  
3143  	/* don't cover already allocated blocks in selected range */
3144  	if (ar->pleft && start <= ar->lleft) {
3145  		size -= ar->lleft + 1 - start;
3146  		start = ar->lleft + 1;
3147  	}
3148  	if (ar->pright && start + size - 1 >= ar->lright)
3149  		size -= start + size - ar->lright;
3150  
3151  	/*
3152  	 * Trim allocation request for filesystems with artificially small
3153  	 * groups.
3154  	 */
3155  	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
3156  		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
3157  
3158  	end = start + size;
3159  
3160  	/* check we don't cross already preallocated blocks */
3161  	rcu_read_lock();
3162  	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3163  		ext4_lblk_t pa_end;
3164  
3165  		if (pa->pa_deleted)
3166  			continue;
3167  		spin_lock(&pa->pa_lock);
3168  		if (pa->pa_deleted) {
3169  			spin_unlock(&pa->pa_lock);
3170  			continue;
3171  		}
3172  
3173  		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3174  						  pa->pa_len);
3175  
3176  		/* PA must not overlap original request */
3177  		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3178  			ac->ac_o_ex.fe_logical < pa->pa_lstart));
3179  
3180  		/* skip PAs this normalized request doesn't overlap with */
3181  		if (pa->pa_lstart >= end || pa_end <= start) {
3182  			spin_unlock(&pa->pa_lock);
3183  			continue;
3184  		}
3185  		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3186  
3187  		/* adjust start or end to be adjacent to this pa */
3188  		if (pa_end <= ac->ac_o_ex.fe_logical) {
3189  			BUG_ON(pa_end < start);
3190  			start = pa_end;
3191  		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3192  			BUG_ON(pa->pa_lstart > end);
3193  			end = pa->pa_lstart;
3194  		}
3195  		spin_unlock(&pa->pa_lock);
3196  	}
3197  	rcu_read_unlock();
3198  	size = end - start;
3199  
3200  	/* XXX: extra loop to check we really don't overlap preallocations */
3201  	rcu_read_lock();
3202  	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3203  		ext4_lblk_t pa_end;
3204  
3205  		spin_lock(&pa->pa_lock);
3206  		if (pa->pa_deleted == 0) {
3207  			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3208  							  pa->pa_len);
3209  			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3210  		}
3211  		spin_unlock(&pa->pa_lock);
3212  	}
3213  	rcu_read_unlock();
3214  
3215  	if (start + size <= ac->ac_o_ex.fe_logical &&
3216  			start > ac->ac_o_ex.fe_logical) {
3217  		ext4_msg(ac->ac_sb, KERN_ERR,
3218  			 "start %lu, size %lu, fe_logical %lu",
3219  			 (unsigned long) start, (unsigned long) size,
3220  			 (unsigned long) ac->ac_o_ex.fe_logical);
3221  		BUG();
3222  	}
3223  	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3224  
3225  	/* now prepare goal request */
3226  
3227  	/* XXX: is it better to align blocks with respect to logical
3228  	 * placement or to satisfy a big request as is */
3229  	ac->ac_g_ex.fe_logical = start;
3230  	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3231  
3232  	/* define goal start in order to merge */
3233  	if (ar->pright && (ar->lright == (start + size))) {
3234  		/* merge to the right */
3235  		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3236  						&ac->ac_f_ex.fe_group,
3237  						&ac->ac_f_ex.fe_start);
3238  		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3239  	}
3240  	if (ar->pleft && (ar->lleft + 1 == start)) {
3241  		/* merge to the left */
3242  		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3243  						&ac->ac_f_ex.fe_group,
3244  						&ac->ac_f_ex.fe_start);
3245  		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3246  	}
3247  
3248  	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3249  		(unsigned) orig_size, (unsigned) start);
3250  }
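/*
 * Illustrative sketch (made-up numbers, not part of the original source):
 * assume a 1:1 cluster-to-block ratio, a normalized window of start = 96,
 * size = 64 and an original request of 8 blocks at logical block 100.  If
 * an existing inode PA covers logical blocks [128, 160), it overlaps the
 * window but not the original request, so the loop above shrinks the
 * window to end = 128 and size = 32.  If the caller also passed
 * ar->lright == 128 together with a physical hint in ar->pright, the goal
 * is additionally placed at ar->pright - size so the new extent can merge
 * with its right neighbour.
 */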
3251  
3252  static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3253  {
3254  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3255  
3256  	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3257  		atomic_inc(&sbi->s_bal_reqs);
3258  		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3259  		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3260  			atomic_inc(&sbi->s_bal_success);
3261  		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3262  		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3263  				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3264  			atomic_inc(&sbi->s_bal_goals);
3265  		if (ac->ac_found > sbi->s_mb_max_to_scan)
3266  			atomic_inc(&sbi->s_bal_breaks);
3267  	}
3268  
3269  	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3270  		trace_ext4_mballoc_alloc(ac);
3271  	else
3272  		trace_ext4_mballoc_prealloc(ac);
3273  }
3274  
3275  /*
3276   * Called on failure; free up any blocks from the inode PA for this
3277   * context.  We don't need this for MB_GROUP_PA because we only change
3278   * pa_free in ext4_mb_release_context(), but on failure, we've already
3279   * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3280   */
3281  static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3282  {
3283  	struct ext4_prealloc_space *pa = ac->ac_pa;
3284  	struct ext4_buddy e4b;
3285  	int err;
3286  
3287  	if (pa == NULL) {
3288  		if (ac->ac_f_ex.fe_len == 0)
3289  			return;
3290  		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3291  		if (err) {
3292  			/*
3293  			 * This should never happen since we pin the
3294  			 * pages in the ext4_allocation_context so
3295  			 * ext4_mb_load_buddy() should never fail.
3296  			 */
3297  			WARN(1, "mb_load_buddy failed (%d)", err);
3298  			return;
3299  		}
3300  		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3301  		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3302  			       ac->ac_f_ex.fe_len);
3303  		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3304  		ext4_mb_unload_buddy(&e4b);
3305  		return;
3306  	}
3307  	if (pa->pa_type == MB_INODE_PA)
3308  		pa->pa_free += ac->ac_b_ex.fe_len;
3309  }
3310  
3311  /*
3312   * use blocks preallocated to inode
3313   */
3314  static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3315  				struct ext4_prealloc_space *pa)
3316  {
3317  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3318  	ext4_fsblk_t start;
3319  	ext4_fsblk_t end;
3320  	int len;
3321  
3322  	/* found preallocated blocks, use them */
3323  	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3324  	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3325  		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3326  	len = EXT4_NUM_B2C(sbi, end - start);
3327  	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3328  					&ac->ac_b_ex.fe_start);
3329  	ac->ac_b_ex.fe_len = len;
3330  	ac->ac_status = AC_STATUS_FOUND;
3331  	ac->ac_pa = pa;
3332  
3333  	BUG_ON(start < pa->pa_pstart);
3334  	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3335  	BUG_ON(pa->pa_free < len);
3336  	pa->pa_free -= len;
3337  
3338  	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3339  }
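/*
 * Illustrative sketch (made-up values, not part of the original source),
 * assuming a 1:1 cluster-to-block ratio: with pa_pstart = 20000,
 * pa_lstart = 50, pa_len = 32, ac_o_ex.fe_logical = 58 and
 * ac_o_ex.fe_len = 8 the computation above gives
 *
 *	start = 20000 + (58 - 50)          = 20008
 *	end   = min(20000 + 32, 20008 + 8) = 20016
 *	len   = 20016 - 20008              = 8
 *
 * i.e. the best extent is the 8-block slice of the PA that lines up with
 * the requested logical range, and pa_free drops by 8.
 */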
3340  
3341  /*
3342   * use blocks preallocated to locality group
3343   */
3344  static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3345  				struct ext4_prealloc_space *pa)
3346  {
3347  	unsigned int len = ac->ac_o_ex.fe_len;
3348  
3349  	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3350  					&ac->ac_b_ex.fe_group,
3351  					&ac->ac_b_ex.fe_start);
3352  	ac->ac_b_ex.fe_len = len;
3353  	ac->ac_status = AC_STATUS_FOUND;
3354  	ac->ac_pa = pa;
3355  
3356  	/* we don't correct pa_pstart or pa_len here to avoid a
3357  	 * possible race when the group is being loaded concurrently;
3358  	 * instead we correct the pa later, after blocks are marked
3359  	 * in the on-disk bitmap -- see ext4_mb_release_context().
3360  	 * Other CPUs are prevented from allocating from this pa by lg_mutex
3361  	 */
3362  	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3363  }
3364  
3365  /*
3366   * Return the prealloc space that has the minimal distance
3367   * from the goal block. @cpa is the prealloc
3368   * space with the currently known minimal distance
3369   * from the goal block.
3370   */
3371  static struct ext4_prealloc_space *
3372  ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3373  			struct ext4_prealloc_space *pa,
3374  			struct ext4_prealloc_space *cpa)
3375  {
3376  	ext4_fsblk_t cur_distance, new_distance;
3377  
3378  	if (cpa == NULL) {
3379  		atomic_inc(&pa->pa_count);
3380  		return pa;
3381  	}
3382  	cur_distance = abs(goal_block - cpa->pa_pstart);
3383  	new_distance = abs(goal_block - pa->pa_pstart);
3384  
3385  	if (cur_distance <= new_distance)
3386  		return cpa;
3387  
3388  	/* drop the previous reference */
3389  	atomic_dec(&cpa->pa_count);
3390  	atomic_inc(&pa->pa_count);
3391  	return pa;
3392  }
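/*
 * Illustrative sketch (made-up numbers, not part of the original source):
 * with goal_block = 5000, a current best cpa at pa_pstart = 5600
 * (distance 600) and a candidate pa at pa_pstart = 4800 (distance 200),
 * the candidate wins: cpa's reference is dropped, a reference on pa is
 * taken, and pa becomes the new closest preallocation.
 */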
3393  
3394  /*
3395   * search goal blocks in preallocated space
3396   */
3397  static noinline_for_stack int
3398  ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3399  {
3400  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3401  	int order, i;
3402  	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3403  	struct ext4_locality_group *lg;
3404  	struct ext4_prealloc_space *pa, *cpa = NULL;
3405  	ext4_fsblk_t goal_block;
3406  
3407  	/* only data can be preallocated */
3408  	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3409  		return 0;
3410  
3411  	/* first, try per-file preallocation */
3412  	rcu_read_lock();
3413  	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3414  
3415  		/* none of the fields in this condition change,
3416  		 * so we can skip locking for them */
3417  		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3418  		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3419  					       EXT4_C2B(sbi, pa->pa_len)))
3420  			continue;
3421  
3422  		/* non-extent files can't have physical blocks past 2^32 */
3423  		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3424  		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3425  		     EXT4_MAX_BLOCK_FILE_PHYS))
3426  			continue;
3427  
3428  		/* found preallocated blocks, use them */
3429  		spin_lock(&pa->pa_lock);
3430  		if (pa->pa_deleted == 0 && pa->pa_free) {
3431  			atomic_inc(&pa->pa_count);
3432  			ext4_mb_use_inode_pa(ac, pa);
3433  			spin_unlock(&pa->pa_lock);
3434  			ac->ac_criteria = 10;
3435  			rcu_read_unlock();
3436  			return 1;
3437  		}
3438  		spin_unlock(&pa->pa_lock);
3439  	}
3440  	rcu_read_unlock();
3441  
3442  	/* can we use group allocation? */
3443  	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3444  		return 0;
3445  
3446  	/* inode may have no locality group for some reason */
3447  	lg = ac->ac_lg;
3448  	if (lg == NULL)
3449  		return 0;
3450  	order  = fls(ac->ac_o_ex.fe_len) - 1;
3451  	if (order > PREALLOC_TB_SIZE - 1)
3452  		/* The max size of hash table is PREALLOC_TB_SIZE */
3453  		order = PREALLOC_TB_SIZE - 1;
3454  
3455  	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3456  	/*
3457  	 * search for the prealloc space with the minimal
3458  	 * distance from the goal block.
3459  	 */
3460  	for (i = order; i < PREALLOC_TB_SIZE; i++) {
3461  		rcu_read_lock();
3462  		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3463  					pa_inode_list) {
3464  			spin_lock(&pa->pa_lock);
3465  			if (pa->pa_deleted == 0 &&
3466  					pa->pa_free >= ac->ac_o_ex.fe_len) {
3467  
3468  				cpa = ext4_mb_check_group_pa(goal_block,
3469  								pa, cpa);
3470  			}
3471  			spin_unlock(&pa->pa_lock);
3472  		}
3473  		rcu_read_unlock();
3474  	}
3475  	if (cpa) {
3476  		ext4_mb_use_group_pa(ac, cpa);
3477  		ac->ac_criteria = 20;
3478  		return 1;
3479  	}
3480  	return 0;
3481  }
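/*
 * Illustrative sketch of how this function is driven (see
 * ext4_mb_new_blocks() further below): the caller tries preallocated
 * space first and only normalizes the request and runs the regular
 * allocator when nothing suitable is found, roughly:
 *
 *	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
 *	if (!ext4_mb_use_preallocated(ac)) {
 *		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
 *		ext4_mb_normalize_request(ac, ar);
 *		ext4_mb_regular_allocator(ac);
 *	}
 */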
3482  
3483  /*
3484   * the function goes through all blocks freed in the group
3485   * but not yet committed and marks them as used in the in-core bitmap.
3486   * the buddy must be generated from this bitmap
3487   * Needs to be called with the ext4 group lock held
3488   */
3489  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3490  						ext4_group_t group)
3491  {
3492  	struct rb_node *n;
3493  	struct ext4_group_info *grp;
3494  	struct ext4_free_data *entry;
3495  
3496  	grp = ext4_get_group_info(sb, group);
3497  	n = rb_first(&(grp->bb_free_root));
3498  
3499  	while (n) {
3500  		entry = rb_entry(n, struct ext4_free_data, efd_node);
3501  		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3502  		n = rb_next(n);
3503  	}
3504  	return;
3505  }
3506  
3507  /*
3508   * the function goes through all preallocations in this group and marks them
3509   * as used in the in-core bitmap. the buddy must be generated from this bitmap
3510   * Needs to be called with the ext4 group lock held
3511   */
3512  static noinline_for_stack
3513  void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3514  					ext4_group_t group)
3515  {
3516  	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3517  	struct ext4_prealloc_space *pa;
3518  	struct list_head *cur;
3519  	ext4_group_t groupnr;
3520  	ext4_grpblk_t start;
3521  	int preallocated = 0;
3522  	int len;
3523  
3524  	/* all forms of preallocation discard first load the group,
3525  	 * so the only competing code is preallocation use.
3526  	 * we don't need any locking here.
3527  	 * note that we do NOT ignore preallocations with pa_deleted set;
3528  	 * otherwise we could leave used blocks available for
3529  	 * allocation in the buddy when a concurrent ext4_mb_put_pa()
3530  	 * is dropping the preallocation
3531  	 */
3532  	list_for_each(cur, &grp->bb_prealloc_list) {
3533  		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3534  		spin_lock(&pa->pa_lock);
3535  		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3536  					     &groupnr, &start);
3537  		len = pa->pa_len;
3538  		spin_unlock(&pa->pa_lock);
3539  		if (unlikely(len == 0))
3540  			continue;
3541  		BUG_ON(groupnr != group);
3542  		ext4_set_bits(bitmap, start, len);
3543  		preallocated += len;
3544  	}
3545  	mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3546  }
3547  
3548  static void ext4_mb_pa_callback(struct rcu_head *head)
3549  {
3550  	struct ext4_prealloc_space *pa;
3551  	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3552  
3553  	BUG_ON(atomic_read(&pa->pa_count));
3554  	BUG_ON(pa->pa_deleted == 0);
3555  	kmem_cache_free(ext4_pspace_cachep, pa);
3556  }
3557  
3558  /*
3559   * drops a reference to preallocated space descriptor
3560   * if this was the last reference and the space is consumed
3561   */
3562  static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3563  			struct super_block *sb, struct ext4_prealloc_space *pa)
3564  {
3565  	ext4_group_t grp;
3566  	ext4_fsblk_t grp_blk;
3567  
3568  	/* in this short window concurrent discard can set pa_deleted */
3569  	spin_lock(&pa->pa_lock);
3570  	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3571  		spin_unlock(&pa->pa_lock);
3572  		return;
3573  	}
3574  
3575  	if (pa->pa_deleted == 1) {
3576  		spin_unlock(&pa->pa_lock);
3577  		return;
3578  	}
3579  
3580  	pa->pa_deleted = 1;
3581  	spin_unlock(&pa->pa_lock);
3582  
3583  	grp_blk = pa->pa_pstart;
3584  	/*
3585  	 * If doing group-based preallocation, pa_pstart may be in the
3586  	 * next group when pa is used up
3587  	 */
3588  	if (pa->pa_type == MB_GROUP_PA)
3589  		grp_blk--;
3590  
3591  	grp = ext4_get_group_number(sb, grp_blk);
3592  
3593  	/*
3594  	 * possible race:
3595  	 *
3596  	 *  P1 (buddy init)			P2 (regular allocation)
3597  	 *					find block B in PA
3598  	 *  copy on-disk bitmap to buddy
3599  	 *  					mark B in on-disk bitmap
3600  	 *					drop PA from group
3601  	 *  mark all PAs in buddy
3602  	 *
3603  	 * thus, P1 initializes buddy with B available. to prevent this
3604  	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3605  	 * against that pair
3606  	 */
3607  	ext4_lock_group(sb, grp);
3608  	list_del(&pa->pa_group_list);
3609  	ext4_unlock_group(sb, grp);
3610  
3611  	spin_lock(pa->pa_obj_lock);
3612  	list_del_rcu(&pa->pa_inode_list);
3613  	spin_unlock(pa->pa_obj_lock);
3614  
3615  	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3616  }
3617  
3618  /*
3619   * creates new preallocated space for given inode
3620   */
3621  static noinline_for_stack int
3622  ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3623  {
3624  	struct super_block *sb = ac->ac_sb;
3625  	struct ext4_sb_info *sbi = EXT4_SB(sb);
3626  	struct ext4_prealloc_space *pa;
3627  	struct ext4_group_info *grp;
3628  	struct ext4_inode_info *ei;
3629  
3630  	/* preallocate only when found space is larger than requested */
3631  	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3632  	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3633  	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3634  
3635  	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3636  	if (pa == NULL)
3637  		return -ENOMEM;
3638  
3639  	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3640  		int winl;
3641  		int wins;
3642  		int win;
3643  		int offs;
3644  
3645  		/* we can't allocate as much as the normalizer wants,
3646  		 * so the found space must get a proper lstart
3647  		 * to cover the original request */
3648  		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3649  		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3650  
3651  		/* we're limited by the original request in that
3652  		 * its logical block must be covered anyway;
3653  		 * winl is the window we can move our chunk within */
3654  		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3655  
3656  		/* also, we should cover whole original request */
3657  		wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3658  
3659  		/* the smallest one defines real window */
3660  		win = min(winl, wins);
3661  
3662  		offs = ac->ac_o_ex.fe_logical %
3663  			EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3664  		if (offs && offs < win)
3665  			win = offs;
3666  
3667  		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3668  			EXT4_NUM_B2C(sbi, win);
3669  		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3670  		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3671  	}
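	/*
	 * Illustrative sketch (made-up numbers, not part of the original
	 * source), assuming a 1:1 cluster-to-block ratio: with
	 * ac_o_ex.fe_logical = 110, ac_o_ex.fe_len = 6,
	 * ac_g_ex.fe_logical = 96 and a best extent of only 16 blocks,
	 * winl = 110 - 96 = 14 and wins = 16 - 6 = 10, so win = 10;
	 * offs = 110 % 16 = 14 is not smaller than win, so win stays 10
	 * and fe_logical becomes 110 - 10 = 100, i.e. the 16-block chunk
	 * covers logical blocks [100, 116) and therefore the whole
	 * original [110, 116) request.
	 */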
3672  
3673  	/* preallocation can change ac_b_ex, thus we store actually
3674  	 * allocated blocks for history */
3675  	ac->ac_f_ex = ac->ac_b_ex;
3676  
3677  	pa->pa_lstart = ac->ac_b_ex.fe_logical;
3678  	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3679  	pa->pa_len = ac->ac_b_ex.fe_len;
3680  	pa->pa_free = pa->pa_len;
3681  	atomic_set(&pa->pa_count, 1);
3682  	spin_lock_init(&pa->pa_lock);
3683  	INIT_LIST_HEAD(&pa->pa_inode_list);
3684  	INIT_LIST_HEAD(&pa->pa_group_list);
3685  	pa->pa_deleted = 0;
3686  	pa->pa_type = MB_INODE_PA;
3687  
3688  	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3689  			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3690  	trace_ext4_mb_new_inode_pa(ac, pa);
3691  
3692  	ext4_mb_use_inode_pa(ac, pa);
3693  	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3694  
3695  	ei = EXT4_I(ac->ac_inode);
3696  	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3697  
3698  	pa->pa_obj_lock = &ei->i_prealloc_lock;
3699  	pa->pa_inode = ac->ac_inode;
3700  
3701  	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3702  	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3703  	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3704  
3705  	spin_lock(pa->pa_obj_lock);
3706  	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3707  	spin_unlock(pa->pa_obj_lock);
3708  
3709  	return 0;
3710  }
3711  
3712  /*
3713   * creates new preallocated space for the locality group this inode belongs to
3714   */
3715  static noinline_for_stack int
3716  ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3717  {
3718  	struct super_block *sb = ac->ac_sb;
3719  	struct ext4_locality_group *lg;
3720  	struct ext4_prealloc_space *pa;
3721  	struct ext4_group_info *grp;
3722  
3723  	/* preallocate only when found space is larger than requested */
3724  	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3725  	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3726  	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3727  
3728  	BUG_ON(ext4_pspace_cachep == NULL);
3729  	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3730  	if (pa == NULL)
3731  		return -ENOMEM;
3732  
3733  	/* preallocation can change ac_b_ex, thus we store actually
3734  	 * allocated blocks for history */
3735  	ac->ac_f_ex = ac->ac_b_ex;
3736  
3737  	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3738  	pa->pa_lstart = pa->pa_pstart;
3739  	pa->pa_len = ac->ac_b_ex.fe_len;
3740  	pa->pa_free = pa->pa_len;
3741  	atomic_set(&pa->pa_count, 1);
3742  	spin_lock_init(&pa->pa_lock);
3743  	INIT_LIST_HEAD(&pa->pa_inode_list);
3744  	INIT_LIST_HEAD(&pa->pa_group_list);
3745  	pa->pa_deleted = 0;
3746  	pa->pa_type = MB_GROUP_PA;
3747  
3748  	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3749  			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3750  	trace_ext4_mb_new_group_pa(ac, pa);
3751  
3752  	ext4_mb_use_group_pa(ac, pa);
3753  	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3754  
3755  	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3756  	lg = ac->ac_lg;
3757  	BUG_ON(lg == NULL);
3758  
3759  	pa->pa_obj_lock = &lg->lg_prealloc_lock;
3760  	pa->pa_inode = NULL;
3761  
3762  	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3763  	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3764  	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3765  
3766  	/*
3767  	 * We will later add the new pa to the right bucket
3768  	 * after updating the pa_free in ext4_mb_release_context
3769  	 */
3770  	return 0;
3771  }
3772  
3773  static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3774  {
3775  	int err;
3776  
3777  	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3778  		err = ext4_mb_new_group_pa(ac);
3779  	else
3780  		err = ext4_mb_new_inode_pa(ac);
3781  	return err;
3782  }
3783  
3784  /*
3785   * finds all unused blocks in on-disk bitmap, frees them in
3786   * in-core bitmap and buddy.
3787   * @pa must be unlinked from inode and group lists, so that
3788   * nobody else can find/use it.
3789   * the caller MUST hold group/inode locks.
3790   * TODO: optimize the case when there are no in-core structures yet
3791   */
3792  static noinline_for_stack int
3793  ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3794  			struct ext4_prealloc_space *pa)
3795  {
3796  	struct super_block *sb = e4b->bd_sb;
3797  	struct ext4_sb_info *sbi = EXT4_SB(sb);
3798  	unsigned int end;
3799  	unsigned int next;
3800  	ext4_group_t group;
3801  	ext4_grpblk_t bit;
3802  	unsigned long long grp_blk_start;
3803  	int err = 0;
3804  	int free = 0;
3805  
3806  	BUG_ON(pa->pa_deleted == 0);
3807  	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3808  	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3809  	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3810  	end = bit + pa->pa_len;
3811  
3812  	while (bit < end) {
3813  		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3814  		if (bit >= end)
3815  			break;
3816  		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3817  		mb_debug(1, "    free preallocated %u/%u in group %u\n",
3818  			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3819  			 (unsigned) next - bit, (unsigned) group);
3820  		free += next - bit;
3821  
3822  		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3823  		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3824  						    EXT4_C2B(sbi, bit)),
3825  					       next - bit);
3826  		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3827  		bit = next + 1;
3828  	}
3829  	if (free != pa->pa_free) {
3830  		ext4_msg(e4b->bd_sb, KERN_CRIT,
3831  			 "pa %p: logic %lu, phys. %lu, len %lu",
3832  			 pa, (unsigned long) pa->pa_lstart,
3833  			 (unsigned long) pa->pa_pstart,
3834  			 (unsigned long) pa->pa_len);
3835  		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3836  					free, pa->pa_free);
3837  		/*
3838  		 * pa is already deleted so we use the value obtained
3839  		 * from the bitmap and continue.
3840  		 */
3841  	}
3842  	atomic_add(free, &sbi->s_mb_discarded);
3843  
3844  	return err;
3845  }
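/*
 * Illustrative sketch (made-up numbers, not part of the original source):
 * if the PA covers bits [64, 96) of the group bitmap and only bits
 * [64, 72) were actually allocated, the loop above finds the zero run
 * starting at bit 72, sees the next set bit at 96 (== end), frees the
 * 24-cluster run [72, 96) back into the buddy and ends up with
 * free == 24, which is expected to match pa_free.
 */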
3846  
3847  static noinline_for_stack int
3848  ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3849  				struct ext4_prealloc_space *pa)
3850  {
3851  	struct super_block *sb = e4b->bd_sb;
3852  	ext4_group_t group;
3853  	ext4_grpblk_t bit;
3854  
3855  	trace_ext4_mb_release_group_pa(sb, pa);
3856  	BUG_ON(pa->pa_deleted == 0);
3857  	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3858  	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3859  	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3860  	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3861  	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3862  
3863  	return 0;
3864  }
3865  
3866  /*
3867   * releases all preallocations in given group
3868   *
3869   * first, we need to decide discard policy:
3870   * - when do we discard
3871   *   1) ENOSPC
3872   * - how many do we discard
3873   *   1) how many requested
3874   */
3875  static noinline_for_stack int
3876  ext4_mb_discard_group_preallocations(struct super_block *sb,
3877  					ext4_group_t group, int needed)
3878  {
3879  	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3880  	struct buffer_head *bitmap_bh = NULL;
3881  	struct ext4_prealloc_space *pa, *tmp;
3882  	struct list_head list;
3883  	struct ext4_buddy e4b;
3884  	int err;
3885  	int busy = 0;
3886  	int free = 0;
3887  
3888  	mb_debug(1, "discard preallocation for group %u\n", group);
3889  
3890  	if (list_empty(&grp->bb_prealloc_list))
3891  		return 0;
3892  
3893  	bitmap_bh = ext4_read_block_bitmap(sb, group);
3894  	if (IS_ERR(bitmap_bh)) {
3895  		err = PTR_ERR(bitmap_bh);
3896  		ext4_error(sb, "Error %d reading block bitmap for %u",
3897  			   err, group);
3898  		return 0;
3899  	}
3900  
3901  	err = ext4_mb_load_buddy(sb, group, &e4b);
3902  	if (err) {
3903  		ext4_warning(sb, "Error %d loading buddy information for %u",
3904  			     err, group);
3905  		put_bh(bitmap_bh);
3906  		return 0;
3907  	}
3908  
3909  	if (needed == 0)
3910  		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3911  
3912  	INIT_LIST_HEAD(&list);
3913  repeat:
3914  	ext4_lock_group(sb, group);
3915  	list_for_each_entry_safe(pa, tmp,
3916  				&grp->bb_prealloc_list, pa_group_list) {
3917  		spin_lock(&pa->pa_lock);
3918  		if (atomic_read(&pa->pa_count)) {
3919  			spin_unlock(&pa->pa_lock);
3920  			busy = 1;
3921  			continue;
3922  		}
3923  		if (pa->pa_deleted) {
3924  			spin_unlock(&pa->pa_lock);
3925  			continue;
3926  		}
3927  
3928  		/* seems this one can be freed ... */
3929  		pa->pa_deleted = 1;
3930  
3931  		/* we can trust pa_free ... */
3932  		free += pa->pa_free;
3933  
3934  		spin_unlock(&pa->pa_lock);
3935  
3936  		list_del(&pa->pa_group_list);
3937  		list_add(&pa->u.pa_tmp_list, &list);
3938  	}
3939  
3940  	/* if we still need more blocks and some PAs were used, try again */
3941  	if (free < needed && busy) {
3942  		busy = 0;
3943  		ext4_unlock_group(sb, group);
3944  		cond_resched();
3945  		goto repeat;
3946  	}
3947  
3948  	/* found anything to free? */
3949  	if (list_empty(&list)) {
3950  		BUG_ON(free != 0);
3951  		goto out;
3952  	}
3953  
3954  	/* now free all selected PAs */
3955  	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3956  
3957  		/* remove from object (inode or locality group) */
3958  		spin_lock(pa->pa_obj_lock);
3959  		list_del_rcu(&pa->pa_inode_list);
3960  		spin_unlock(pa->pa_obj_lock);
3961  
3962  		if (pa->pa_type == MB_GROUP_PA)
3963  			ext4_mb_release_group_pa(&e4b, pa);
3964  		else
3965  			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3966  
3967  		list_del(&pa->u.pa_tmp_list);
3968  		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3969  	}
3970  
3971  out:
3972  	ext4_unlock_group(sb, group);
3973  	ext4_mb_unload_buddy(&e4b);
3974  	put_bh(bitmap_bh);
3975  	return free;
3976  }
3977  
3978  /*
3979   * releases all unused preallocated blocks for the given inode
3980   *
3981   * It's important to discard preallocations under i_data_sem
3982   * We don't want another block to be served from the prealloc
3983   * space when we are discarding the inode prealloc space.
3984   *
3985   * FIXME!! Make sure it is valid at all the call sites
3986   */
3987  void ext4_discard_preallocations(struct inode *inode)
3988  {
3989  	struct ext4_inode_info *ei = EXT4_I(inode);
3990  	struct super_block *sb = inode->i_sb;
3991  	struct buffer_head *bitmap_bh = NULL;
3992  	struct ext4_prealloc_space *pa, *tmp;
3993  	ext4_group_t group = 0;
3994  	struct list_head list;
3995  	struct ext4_buddy e4b;
3996  	int err;
3997  
3998  	if (!S_ISREG(inode->i_mode)) {
3999  		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
4000  		return;
4001  	}
4002  
4003  	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
4004  	trace_ext4_discard_preallocations(inode);
4005  
4006  	INIT_LIST_HEAD(&list);
4007  
4008  repeat:
4009  	/* first, collect all pa's in the inode */
4010  	spin_lock(&ei->i_prealloc_lock);
4011  	while (!list_empty(&ei->i_prealloc_list)) {
4012  		pa = list_entry(ei->i_prealloc_list.next,
4013  				struct ext4_prealloc_space, pa_inode_list);
4014  		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
4015  		spin_lock(&pa->pa_lock);
4016  		if (atomic_read(&pa->pa_count)) {
4017  			/* this shouldn't happen often - nobody should
4018  			 * use preallocation while we're discarding it */
4019  			spin_unlock(&pa->pa_lock);
4020  			spin_unlock(&ei->i_prealloc_lock);
4021  			ext4_msg(sb, KERN_ERR,
4022  				 "uh-oh! used pa while discarding");
4023  			WARN_ON(1);
4024  			schedule_timeout_uninterruptible(HZ);
4025  			goto repeat;
4026  
4027  		}
4028  		if (pa->pa_deleted == 0) {
4029  			pa->pa_deleted = 1;
4030  			spin_unlock(&pa->pa_lock);
4031  			list_del_rcu(&pa->pa_inode_list);
4032  			list_add(&pa->u.pa_tmp_list, &list);
4033  			continue;
4034  		}
4035  
4036  		/* someone is deleting pa right now */
4037  		spin_unlock(&pa->pa_lock);
4038  		spin_unlock(&ei->i_prealloc_lock);
4039  
4040  		/* we have to wait here because pa_deleted
4041  		 * doesn't mean the pa is already unlinked from
4042  		 * the list. as we might be called from
4043  		 * ->clear_inode(), the inode will get freed
4044  		 * and a concurrent thread which is unlinking
4045  		 * the pa from the inode's list may access already
4046  		 * freed memory, bad-bad-bad */
4047  
4048  		/* XXX: if this happens too often, we can
4049  		 * add a flag to force wait only in case
4050  		 * of ->clear_inode(), but not in case of
4051  		 * regular truncate */
4052  		schedule_timeout_uninterruptible(HZ);
4053  		goto repeat;
4054  	}
4055  	spin_unlock(&ei->i_prealloc_lock);
4056  
4057  	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4058  		BUG_ON(pa->pa_type != MB_INODE_PA);
4059  		group = ext4_get_group_number(sb, pa->pa_pstart);
4060  
4061  		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4062  					     GFP_NOFS|__GFP_NOFAIL);
4063  		if (err) {
4064  			ext4_error(sb, "Error %d loading buddy information for %u",
4065  				   err, group);
4066  			continue;
4067  		}
4068  
4069  		bitmap_bh = ext4_read_block_bitmap(sb, group);
4070  		if (IS_ERR(bitmap_bh)) {
4071  			err = PTR_ERR(bitmap_bh);
4072  			ext4_error(sb, "Error %d reading block bitmap for %u",
4073  					err, group);
4074  			ext4_mb_unload_buddy(&e4b);
4075  			continue;
4076  		}
4077  
4078  		ext4_lock_group(sb, group);
4079  		list_del(&pa->pa_group_list);
4080  		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4081  		ext4_unlock_group(sb, group);
4082  
4083  		ext4_mb_unload_buddy(&e4b);
4084  		put_bh(bitmap_bh);
4085  
4086  		list_del(&pa->u.pa_tmp_list);
4087  		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4088  	}
4089  }
4090  
4091  #ifdef CONFIG_EXT4_DEBUG
4092  static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4093  {
4094  	struct super_block *sb = ac->ac_sb;
4095  	ext4_group_t ngroups, i;
4096  
4097  	if (!ext4_mballoc_debug ||
4098  	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4099  		return;
4100  
4101  	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4102  			" Allocation context details:");
4103  	ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4104  			ac->ac_status, ac->ac_flags);
4105  	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4106  		 	"goal %lu/%lu/%lu@%lu, "
4107  			"best %lu/%lu/%lu@%lu cr %d",
4108  			(unsigned long)ac->ac_o_ex.fe_group,
4109  			(unsigned long)ac->ac_o_ex.fe_start,
4110  			(unsigned long)ac->ac_o_ex.fe_len,
4111  			(unsigned long)ac->ac_o_ex.fe_logical,
4112  			(unsigned long)ac->ac_g_ex.fe_group,
4113  			(unsigned long)ac->ac_g_ex.fe_start,
4114  			(unsigned long)ac->ac_g_ex.fe_len,
4115  			(unsigned long)ac->ac_g_ex.fe_logical,
4116  			(unsigned long)ac->ac_b_ex.fe_group,
4117  			(unsigned long)ac->ac_b_ex.fe_start,
4118  			(unsigned long)ac->ac_b_ex.fe_len,
4119  			(unsigned long)ac->ac_b_ex.fe_logical,
4120  			(int)ac->ac_criteria);
4121  	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4122  	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4123  	ngroups = ext4_get_groups_count(sb);
4124  	for (i = 0; i < ngroups; i++) {
4125  		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4126  		struct ext4_prealloc_space *pa;
4127  		ext4_grpblk_t start;
4128  		struct list_head *cur;
4129  		ext4_lock_group(sb, i);
4130  		list_for_each(cur, &grp->bb_prealloc_list) {
4131  			pa = list_entry(cur, struct ext4_prealloc_space,
4132  					pa_group_list);
4133  			spin_lock(&pa->pa_lock);
4134  			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4135  						     NULL, &start);
4136  			spin_unlock(&pa->pa_lock);
4137  			printk(KERN_ERR "PA:%u:%d:%u \n", i,
4138  			       start, pa->pa_len);
4139  		}
4140  		ext4_unlock_group(sb, i);
4141  
4142  		if (grp->bb_free == 0)
4143  			continue;
4144  		printk(KERN_ERR "%u: %d/%d \n",
4145  		       i, grp->bb_free, grp->bb_fragments);
4146  	}
4147  	printk(KERN_ERR "\n");
4148  }
4149  #else
4150  static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4151  {
4152  	return;
4153  }
4154  #endif
4155  
4156  /*
4157   * We use locality group preallocation for small files. The size of the
4158   * file is determined by the current size or the resulting size after
4159   * allocation, whichever is larger
4160   *
4161   * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4162   */
4163  static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4164  {
4165  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4166  	int bsbits = ac->ac_sb->s_blocksize_bits;
4167  	loff_t size, isize;
4168  
4169  	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4170  		return;
4171  
4172  	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4173  		return;
4174  
4175  	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4176  	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4177  		>> bsbits;
4178  
4179  	if ((size == isize) &&
4180  	    !ext4_fs_is_busy(sbi) &&
4181  	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4182  		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183  		return;
4184  	}
4185  
4186  	if (sbi->s_mb_group_prealloc <= 0) {
4187  		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4188  		return;
4189  	}
4190  
4191  	/* don't use group allocation for large files */
4192  	size = max(size, isize);
4193  	if (size > sbi->s_mb_stream_request) {
4194  		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4195  		return;
4196  	}
4197  
4198  	BUG_ON(ac->ac_lg != NULL);
4199  	/*
4200  	 * locality group prealloc space is per-CPU. The reason for having
4201  	 * a per-CPU locality group is to reduce the contention between block
4202  	 * requests from multiple CPUs.
4203  	 */
4204  	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4205  
4206  	/* we're going to use group allocation */
4207  	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4208  
4209  	/* serialize all allocations in the group */
4210  	mutex_lock(&ac->ac_lg->lg_mutex);
4211  }
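/*
 * Illustrative sketch (an assumption, not part of the original source):
 * for a small append -- say the request would leave the file at 12 blocks
 * while mb_stream_req is tuned above that -- size stays at or below
 * s_mb_stream_request, so the allocation takes the per-CPU locality-group
 * path and lg_mutex is taken here; a request that grows the file well past
 * the threshold gets EXT4_MB_STREAM_ALLOC and uses per-inode
 * preallocation instead.
 */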
4212  
4213  static noinline_for_stack int
4214  ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4215  				struct ext4_allocation_request *ar)
4216  {
4217  	struct super_block *sb = ar->inode->i_sb;
4218  	struct ext4_sb_info *sbi = EXT4_SB(sb);
4219  	struct ext4_super_block *es = sbi->s_es;
4220  	ext4_group_t group;
4221  	unsigned int len;
4222  	ext4_fsblk_t goal;
4223  	ext4_grpblk_t block;
4224  
4225  	/* we can't allocate > group size */
4226  	len = ar->len;
4227  
4228  	/* just a dirty hack to filter too big requests  */
4229  	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4230  		len = EXT4_CLUSTERS_PER_GROUP(sb);
4231  
4232  	/* start searching from the goal */
4233  	goal = ar->goal;
4234  	if (goal < le32_to_cpu(es->s_first_data_block) ||
4235  			goal >= ext4_blocks_count(es))
4236  		goal = le32_to_cpu(es->s_first_data_block);
4237  	ext4_get_group_no_and_offset(sb, goal, &group, &block);
4238  
4239  	/* set up allocation goals */
4240  	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4241  	ac->ac_status = AC_STATUS_CONTINUE;
4242  	ac->ac_sb = sb;
4243  	ac->ac_inode = ar->inode;
4244  	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4245  	ac->ac_o_ex.fe_group = group;
4246  	ac->ac_o_ex.fe_start = block;
4247  	ac->ac_o_ex.fe_len = len;
4248  	ac->ac_g_ex = ac->ac_o_ex;
4249  	ac->ac_flags = ar->flags;
4250  
4251  	/* we have to define the context: will we work with a file or
4252  	 * a locality group. this is a policy, actually */
4253  	ext4_mb_group_or_file(ac);
4254  
4255  	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4256  			"left: %u/%u, right %u/%u to %swritable\n",
4257  			(unsigned) ar->len, (unsigned) ar->logical,
4258  			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4259  			(unsigned) ar->lleft, (unsigned) ar->pleft,
4260  			(unsigned) ar->lright, (unsigned) ar->pright,
4261  			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4262  	return 0;
4263  
4264  }
4265  
4266  static noinline_for_stack void
4267  ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268  					struct ext4_locality_group *lg,
4269  					int order, int total_entries)
4270  {
4271  	ext4_group_t group = 0;
4272  	struct ext4_buddy e4b;
4273  	struct list_head discard_list;
4274  	struct ext4_prealloc_space *pa, *tmp;
4275  
4276  	mb_debug(1, "discard locality group preallocation\n");
4277  
4278  	INIT_LIST_HEAD(&discard_list);
4279  
4280  	spin_lock(&lg->lg_prealloc_lock);
4281  	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4282  						pa_inode_list) {
4283  		spin_lock(&pa->pa_lock);
4284  		if (atomic_read(&pa->pa_count)) {
4285  			/*
4286  			 * This is the pa that we just used
4287  			 * for block allocation, so don't
4288  			 * free it
4289  			 */
4290  			spin_unlock(&pa->pa_lock);
4291  			continue;
4292  		}
4293  		if (pa->pa_deleted) {
4294  			spin_unlock(&pa->pa_lock);
4295  			continue;
4296  		}
4297  		/* only lg prealloc space */
4298  		BUG_ON(pa->pa_type != MB_GROUP_PA);
4299  
4300  		/* seems this one can be freed ... */
4301  		pa->pa_deleted = 1;
4302  		spin_unlock(&pa->pa_lock);
4303  
4304  		list_del_rcu(&pa->pa_inode_list);
4305  		list_add(&pa->u.pa_tmp_list, &discard_list);
4306  
4307  		total_entries--;
4308  		if (total_entries <= 5) {
4309  			/*
4310  			 * we want to keep only 5 entries
4311  			 * allowing it to grow to 8. This
4312  			 * makes sure we don't call discard
4313  			 * again soon for this list.
4314  			 */
4315  			break;
4316  		}
4317  	}
4318  	spin_unlock(&lg->lg_prealloc_lock);
4319  
4320  	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4321  		int err;
4322  
4323  		group = ext4_get_group_number(sb, pa->pa_pstart);
4324  		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4325  					     GFP_NOFS|__GFP_NOFAIL);
4326  		if (err) {
4327  			ext4_error(sb, "Error %d loading buddy information for %u",
4328  				   err, group);
4329  			continue;
4330  		}
4331  		ext4_lock_group(sb, group);
4332  		list_del(&pa->pa_group_list);
4333  		ext4_mb_release_group_pa(&e4b, pa);
4334  		ext4_unlock_group(sb, group);
4335  
4336  		ext4_mb_unload_buddy(&e4b);
4337  		list_del(&pa->u.pa_tmp_list);
4338  		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4339  	}
4340  }
4341  
4342  /*
4343   * We have incremented pa_count. So it cannot be freed at this
4344   * point. Also we hold lg_mutex. So no parallel allocation is
4345   * possible from this lg. That means pa_free cannot be updated.
4346   *
4347   * A parallel ext4_mb_discard_group_preallocations is possible,
4348   * which can cause the lg_prealloc_list to be updated.
4349   */
4350  
4351  static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4352  {
4353  	int order, added = 0, lg_prealloc_count = 1;
4354  	struct super_block *sb = ac->ac_sb;
4355  	struct ext4_locality_group *lg = ac->ac_lg;
4356  	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4357  
4358  	order = fls(pa->pa_free) - 1;
4359  	if (order > PREALLOC_TB_SIZE - 1)
4360  		/* The max size of hash table is PREALLOC_TB_SIZE */
4361  		order = PREALLOC_TB_SIZE - 1;
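	/*
	 * Illustrative sketch (made-up value, not part of the original
	 * source): a group PA with pa_free = 48 gives fls(48) - 1 = 5,
	 * so it is filed on lg_prealloc_list[5]; anything bigger than the
	 * largest bucket is clamped to PREALLOC_TB_SIZE - 1.
	 */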
4362  	/* Add the prealloc space to lg */
4363  	spin_lock(&lg->lg_prealloc_lock);
4364  	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4365  						pa_inode_list) {
4366  		spin_lock(&tmp_pa->pa_lock);
4367  		if (tmp_pa->pa_deleted) {
4368  			spin_unlock(&tmp_pa->pa_lock);
4369  			continue;
4370  		}
4371  		if (!added && pa->pa_free < tmp_pa->pa_free) {
4372  			/* Add to the tail of the previous entry */
4373  			list_add_tail_rcu(&pa->pa_inode_list,
4374  						&tmp_pa->pa_inode_list);
4375  			added = 1;
4376  			/*
4377  			 * we want to count the total
4378  			 * number of entries in the list
4379  			 */
4380  		}
4381  		spin_unlock(&tmp_pa->pa_lock);
4382  		lg_prealloc_count++;
4383  	}
4384  	if (!added)
4385  		list_add_tail_rcu(&pa->pa_inode_list,
4386  					&lg->lg_prealloc_list[order]);
4387  	spin_unlock(&lg->lg_prealloc_lock);
4388  
4389  	/* Now trim the list to be not more than 8 elements */
4390  	if (lg_prealloc_count > 8) {
4391  		ext4_mb_discard_lg_preallocations(sb, lg,
4392  						  order, lg_prealloc_count);
4393  		return;
4394  	}
4395  	return;
4396  }
4397  
4398  /*
4399   * release all the resources we used in the allocation
4400   */
4401  static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4402  {
4403  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4404  	struct ext4_prealloc_space *pa = ac->ac_pa;
4405  	if (pa) {
4406  		if (pa->pa_type == MB_GROUP_PA) {
4407  			/* see comment in ext4_mb_use_group_pa() */
4408  			spin_lock(&pa->pa_lock);
4409  			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4410  			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4411  			pa->pa_free -= ac->ac_b_ex.fe_len;
4412  			pa->pa_len -= ac->ac_b_ex.fe_len;
4413  			spin_unlock(&pa->pa_lock);
4414  		}
4415  	}
4416  	if (pa) {
4417  		/*
4418  		 * We want to add the pa to the right bucket.
4419  		 * Remove it from the list and while adding
4420  		 * make sure the list to which we are adding
4421  		 * doesn't grow big.
4422  		 */
4423  		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4424  			spin_lock(pa->pa_obj_lock);
4425  			list_del_rcu(&pa->pa_inode_list);
4426  			spin_unlock(pa->pa_obj_lock);
4427  			ext4_mb_add_n_trim(ac);
4428  		}
4429  		ext4_mb_put_pa(ac, ac->ac_sb, pa);
4430  	}
4431  	if (ac->ac_bitmap_page)
4432  		page_cache_release(ac->ac_bitmap_page);
4433  	if (ac->ac_buddy_page)
4434  		page_cache_release(ac->ac_buddy_page);
4435  	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4436  		mutex_unlock(&ac->ac_lg->lg_mutex);
4437  	ext4_mb_collect_stats(ac);
4438  	return 0;
4439  }
4440  
4441  static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4442  {
4443  	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4444  	int ret;
4445  	int freed = 0;
4446  
4447  	trace_ext4_mb_discard_preallocations(sb, needed);
4448  	for (i = 0; i < ngroups && needed > 0; i++) {
4449  		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4450  		freed += ret;
4451  		needed -= ret;
4452  	}
4453  
4454  	return freed;
4455  }
4456  
4457  /*
4458   * Main entry point into mballoc to allocate blocks;
4459   * it tries to use preallocation first, then falls back
4460   * to the usual allocation
4461   */
4462  ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4463  				struct ext4_allocation_request *ar, int *errp)
4464  {
4465  	int freed;
4466  	struct ext4_allocation_context *ac = NULL;
4467  	struct ext4_sb_info *sbi;
4468  	struct super_block *sb;
4469  	ext4_fsblk_t block = 0;
4470  	unsigned int inquota = 0;
4471  	unsigned int reserv_clstrs = 0;
4472  
4473  	might_sleep();
4474  	sb = ar->inode->i_sb;
4475  	sbi = EXT4_SB(sb);
4476  
4477  	trace_ext4_request_blocks(ar);
4478  
4479  	/* Allow the quota file to use the superuser reservation */
4480  	if (IS_NOQUOTA(ar->inode))
4481  		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4482  
4483  	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4484  		/* Without delayed allocation we need to verify
4485  		 * there are enough free blocks to do block allocation
4486  		 * and verify allocation doesn't exceed the quota limits.
4487  		 */
4488  		while (ar->len &&
4489  			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4490  
4491  			/* let others free the space */
4492  			cond_resched();
4493  			ar->len = ar->len >> 1;
4494  		}
4495  		if (!ar->len) {
4496  			*errp = -ENOSPC;
4497  			return 0;
4498  		}
4499  		reserv_clstrs = ar->len;
4500  		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4501  			dquot_alloc_block_nofail(ar->inode,
4502  						 EXT4_C2B(sbi, ar->len));
4503  		} else {
4504  			while (ar->len &&
4505  				dquot_alloc_block(ar->inode,
4506  						  EXT4_C2B(sbi, ar->len))) {
4507  
4508  				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4509  				ar->len--;
4510  			}
4511  		}
4512  		inquota = ar->len;
4513  		if (ar->len == 0) {
4514  			*errp = -EDQUOT;
4515  			goto out;
4516  		}
4517  	}
4518  
4519  	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4520  	if (!ac) {
4521  		ar->len = 0;
4522  		*errp = -ENOMEM;
4523  		goto out;
4524  	}
4525  
4526  	*errp = ext4_mb_initialize_context(ac, ar);
4527  	if (*errp) {
4528  		ar->len = 0;
4529  		goto out;
4530  	}
4531  
4532  	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4533  	if (!ext4_mb_use_preallocated(ac)) {
4534  		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4535  		ext4_mb_normalize_request(ac, ar);
4536  repeat:
4537  		/* allocate space in core */
4538  		*errp = ext4_mb_regular_allocator(ac);
4539  		if (*errp)
4540  			goto discard_and_exit;
4541  
4542  		/* as we've just preallocated more space than
4543  		 * the user originally requested, we store the allocated
4544  		 * space in a special descriptor */
4545  		if (ac->ac_status == AC_STATUS_FOUND &&
4546  		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4547  			*errp = ext4_mb_new_preallocation(ac);
4548  		if (*errp) {
4549  		discard_and_exit:
4550  			ext4_discard_allocated_blocks(ac);
4551  			goto errout;
4552  		}
4553  	}
4554  	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4555  		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4556  		if (*errp) {
4557  			ext4_discard_allocated_blocks(ac);
4558  			goto errout;
4559  		} else {
4560  			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4561  			ar->len = ac->ac_b_ex.fe_len;
4562  		}
4563  	} else {
4564  		freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4565  		if (freed)
4566  			goto repeat;
4567  		*errp = -ENOSPC;
4568  	}
4569  
4570  errout:
4571  	if (*errp) {
4572  		ac->ac_b_ex.fe_len = 0;
4573  		ar->len = 0;
4574  		ext4_mb_show_ac(ac);
4575  	}
4576  	ext4_mb_release_context(ac);
4577  out:
4578  	if (ac)
4579  		kmem_cache_free(ext4_ac_cachep, ac);
4580  	if (inquota && ar->len < inquota)
4581  		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4582  	if (!ar->len) {
4583  		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4584  			/* release all the reserved blocks if non delalloc */
4585  			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4586  						reserv_clstrs);
4587  	}
4588  
4589  	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4590  
4591  	return block;
4592  }
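/*
 * Illustrative caller sketch (an assumption, not taken from this file):
 * a minimal user of this entry point fills in an ext4_allocation_request
 * and checks *errp, roughly:
 *
 *	struct ext4_allocation_request ar = {
 *		.inode   = inode,
 *		.logical = lblk,
 *		.goal    = goal_pblk,
 *		.len     = 1,
 *		.flags   = EXT4_MB_HINT_DATA,
 *	};
 *	int err;
 *	ext4_fsblk_t pblk = ext4_mb_new_blocks(handle, &ar, &err);
 *	if (err)
 *		return err;
 *
 * On success ar.len holds how many blocks were actually allocated, which
 * may be fewer than requested.
 */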
4593  
4594  /*
4595   * We can merge two free data extents only if the physical blocks
4596   * are contiguous, AND the extents were freed by the same transaction,
4597   * AND the blocks are associated with the same group.
4598   */
4599  static int can_merge(struct ext4_free_data *entry1,
4600  			struct ext4_free_data *entry2)
4601  {
4602  	if ((entry1->efd_tid == entry2->efd_tid) &&
4603  	    (entry1->efd_group == entry2->efd_group) &&
4604  	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4605  		return 1;
4606  	return 0;
4607  }
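/*
 * Illustrative sketch (made-up numbers, not part of the original source):
 * two entries freed by the same transaction in the same group, one
 * covering clusters [100, 108) and the other starting at cluster 108,
 * satisfy all three conditions and are merged by ext4_mb_free_metadata()
 * below into a single entry covering both runs, provided the journal
 * callback of the absorbed neighbour can still be removed.
 */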
4608  
4609  static noinline_for_stack int
4610  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4611  		      struct ext4_free_data *new_entry)
4612  {
4613  	ext4_group_t group = e4b->bd_group;
4614  	ext4_grpblk_t cluster;
4615  	struct ext4_free_data *entry;
4616  	struct ext4_group_info *db = e4b->bd_info;
4617  	struct super_block *sb = e4b->bd_sb;
4618  	struct ext4_sb_info *sbi = EXT4_SB(sb);
4619  	struct rb_node **n = &db->bb_free_root.rb_node, *node;
4620  	struct rb_node *parent = NULL, *new_node;
4621  
4622  	BUG_ON(!ext4_handle_valid(handle));
4623  	BUG_ON(e4b->bd_bitmap_page == NULL);
4624  	BUG_ON(e4b->bd_buddy_page == NULL);
4625  
4626  	new_node = &new_entry->efd_node;
4627  	cluster = new_entry->efd_start_cluster;
4628  
4629  	if (!*n) {
4630  		/* first free block extent. We need to
4631  		 * protect the buddy cache from being freed,
4632  		 * otherwise we'll refresh it from the
4633  		 * on-disk bitmap and lose not-yet-available
4634  		 * blocks */
4635  		page_cache_get(e4b->bd_buddy_page);
4636  		page_cache_get(e4b->bd_bitmap_page);
4637  	}
4638  	while (*n) {
4639  		parent = *n;
4640  		entry = rb_entry(parent, struct ext4_free_data, efd_node);
4641  		if (cluster < entry->efd_start_cluster)
4642  			n = &(*n)->rb_left;
4643  		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4644  			n = &(*n)->rb_right;
4645  		else {
4646  			ext4_grp_locked_error(sb, group, 0,
4647  				ext4_group_first_block_no(sb, group) +
4648  				EXT4_C2B(sbi, cluster),
4649  				"Block already on to-be-freed list");
4650  			kmem_cache_free(ext4_free_data_cachep, new_entry);
4651  			return 0;
4652  		}
4653  	}
4654  
4655  	rb_link_node(new_node, parent, n);
4656  	rb_insert_color(new_node, &db->bb_free_root);
4657  
4658  	/* Now see if the extent can be merged to the left and right */
4659  	node = rb_prev(new_node);
4660  	if (node) {
4661  		entry = rb_entry(node, struct ext4_free_data, efd_node);
4662  		if (can_merge(entry, new_entry) &&
4663  		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4664  			new_entry->efd_start_cluster = entry->efd_start_cluster;
4665  			new_entry->efd_count += entry->efd_count;
4666  			rb_erase(node, &(db->bb_free_root));
4667  			kmem_cache_free(ext4_free_data_cachep, entry);
4668  		}
4669  	}
4670  
4671  	node = rb_next(new_node);
4672  	if (node) {
4673  		entry = rb_entry(node, struct ext4_free_data, efd_node);
4674  		if (can_merge(new_entry, entry) &&
4675  		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4676  			new_entry->efd_count += entry->efd_count;
4677  			rb_erase(node, &(db->bb_free_root));
4678  			kmem_cache_free(ext4_free_data_cachep, entry);
4679  		}
4680  	}
4681  	/* Add the extent to transaction's private list */
4682  	ext4_journal_callback_add(handle, ext4_free_data_callback,
4683  				  &new_entry->efd_jce);
4684  	return 0;
4685  }
4686  
4687  /**
4688   * ext4_free_blocks() -- Free given blocks and update quota
4689   * @handle:		handle for this transaction
4690   * @inode:		inode
4691   * @block:		start physical block to free
4692   * @count:		number of blocks to free
4693   * @flags:		flags used by ext4_free_blocks
4694   */
4695  void ext4_free_blocks(handle_t *handle, struct inode *inode,
4696  		      struct buffer_head *bh, ext4_fsblk_t block,
4697  		      unsigned long count, int flags)
4698  {
4699  	struct buffer_head *bitmap_bh = NULL;
4700  	struct super_block *sb = inode->i_sb;
4701  	struct ext4_group_desc *gdp;
4702  	unsigned int overflow;
4703  	ext4_grpblk_t bit;
4704  	struct buffer_head *gd_bh;
4705  	ext4_group_t block_group;
4706  	struct ext4_sb_info *sbi;
4707  	struct ext4_buddy e4b;
4708  	unsigned int count_clusters;
4709  	int err = 0;
4710  	int ret;
4711  
4712  	might_sleep();
4713  	if (bh) {
4714  		if (block)
4715  			BUG_ON(block != bh->b_blocknr);
4716  		else
4717  			block = bh->b_blocknr;
4718  	}
4719  
4720  	sbi = EXT4_SB(sb);
4721  	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4722  	    !ext4_inode_block_valid(inode, block, count)) {
4723  		ext4_error(sb, "Freeing blocks not in datazone - "
4724  			   "block = %llu, count = %lu", block, count);
4725  		goto error_return;
4726  	}
4727  
4728  	ext4_debug("freeing block %llu\n", block);
4729  	trace_ext4_free_blocks(inode, block, count, flags);
4730  
4731  	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4732  		BUG_ON(count > 1);
4733  
4734  		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4735  			    inode, bh, block);
4736  	}
4737  
4738  	/*
4739  	 * We need to make sure we don't reuse the freed block until
4740  	 * after the transaction is committed, which we can do by
4741  	 * treating the block as metadata, below.  We make an
4742  	 * exception if the inode is to be written in writeback mode
4743  	 * since writeback mode has weak data consistency guarantees.
4744  	 */
4745  	if (!ext4_should_writeback_data(inode))
4746  		flags |= EXT4_FREE_BLOCKS_METADATA;
4747  
4748  	/*
4749  	 * If the extent to be freed does not begin on a cluster
4750  	 * boundary, we need to deal with partial clusters at the
4751  	 * beginning and end of the extent.  Normally we will free
4752  	 * blocks at the beginning or the end unless we are explicitly
4753  	 * requested to avoid doing so.
4754  	 */
4755  	overflow = EXT4_PBLK_COFF(sbi, block);
4756  	if (overflow) {
4757  		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4758  			overflow = sbi->s_cluster_ratio - overflow;
4759  			block += overflow;
4760  			if (count > overflow)
4761  				count -= overflow;
4762  			else
4763  				return;
4764  		} else {
4765  			block -= overflow;
4766  			count += overflow;
4767  		}
4768  	}
4769  	overflow = EXT4_LBLK_COFF(sbi, count);
4770  	if (overflow) {
4771  		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4772  			if (count > overflow)
4773  				count -= overflow;
4774  			else
4775  				return;
4776  		} else
4777  			count += sbi->s_cluster_ratio - overflow;
4778  	}
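	/*
	 * Illustrative sketch (made-up numbers, not part of the original
	 * source): on a bigalloc filesystem with a cluster ratio of 4,
	 * freeing block = 1022, count = 8 first rounds the start down to
	 * its cluster boundary (block = 1020, count = 10) and then rounds
	 * the length up (count = 12), so whole clusters [255, 258) are
	 * freed unless the NOFREE_FIRST/LAST_CLUSTER flags forbid touching
	 * the partial clusters at either end.
	 */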
4779  
4780  	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4781  		int i;
4782  
4783  		for (i = 0; i < count; i++) {
4784  			cond_resched();
4785  			bh = sb_find_get_block(inode->i_sb, block + i);
4786  			if (!bh)
4787  				continue;
4788  			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4789  				    inode, bh, block + i);
4790  		}
4791  	}
4792  
4793  do_more:
4794  	overflow = 0;
4795  	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4796  
4797  	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4798  			ext4_get_group_info(sb, block_group))))
4799  		return;
4800  
4801  	/*
4802  	 * Check to see if we are freeing blocks across a group
4803  	 * boundary.
4804  	 */
4805  	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4806  		overflow = EXT4_C2B(sbi, bit) + count -
4807  			EXT4_BLOCKS_PER_GROUP(sb);
4808  		count -= overflow;
4809  	}
4810  	count_clusters = EXT4_NUM_B2C(sbi, count);
4811  	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4812  	if (IS_ERR(bitmap_bh)) {
4813  		err = PTR_ERR(bitmap_bh);
4814  		bitmap_bh = NULL;
4815  		goto error_return;
4816  	}
4817  	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4818  	if (!gdp) {
4819  		err = -EIO;
4820  		goto error_return;
4821  	}
4822  
4823  	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4824  	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4825  	    in_range(block, ext4_inode_table(sb, gdp),
4826  		     EXT4_SB(sb)->s_itb_per_group) ||
4827  	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
4828  		     EXT4_SB(sb)->s_itb_per_group)) {
4829  
4830  		ext4_error(sb, "Freeing blocks in system zone - "
4831  			   "Block = %llu, count = %lu", block, count);
4832  		/* err = 0. ext4_std_error should be a no op */
4833  		goto error_return;
4834  	}
4835  
4836  	BUFFER_TRACE(bitmap_bh, "getting write access");
4837  	err = ext4_journal_get_write_access(handle, bitmap_bh);
4838  	if (err)
4839  		goto error_return;
4840  
4841  	/*
4842  	 * We are about to modify some metadata.  Call the journal APIs
4843  	 * to unshare ->b_data if a currently-committing transaction is
4844  	 * using it
4845  	 */
4846  	BUFFER_TRACE(gd_bh, "get_write_access");
4847  	err = ext4_journal_get_write_access(handle, gd_bh);
4848  	if (err)
4849  		goto error_return;
4850  #ifdef AGGRESSIVE_CHECK
4851  	{
4852  		int i;
4853  		for (i = 0; i < count_clusters; i++)
4854  			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4855  	}
4856  #endif
4857  	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4858  
4859  	/* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4860  	err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4861  				     GFP_NOFS|__GFP_NOFAIL);
4862  	if (err)
4863  		goto error_return;
4864  
4865  	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4866  		struct ext4_free_data *new_entry;
4867  		/*
4868  		 * The blocks being freed are metadata; these blocks shouldn't
4869  		 * be reused until this transaction is committed.
4870  		 *
4871  		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4872  		 * to fail.
4873  		 */
4874  		new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4875  				GFP_NOFS|__GFP_NOFAIL);
4876  		new_entry->efd_start_cluster = bit;
4877  		new_entry->efd_group = block_group;
4878  		new_entry->efd_count = count_clusters;
4879  		new_entry->efd_tid = handle->h_transaction->t_tid;
4880  
4881  		ext4_lock_group(sb, block_group);
4882  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4883  		ext4_mb_free_metadata(handle, &e4b, new_entry);
4884  	} else {
4885  		/* group_info->bb_free and the bitmap must be updated
4886  		 * with the group lock held; ext4_mb_generate_buddy()
4887  		 * looks at them under the same lock
4888  		 */
4889  		if (test_opt(sb, DISCARD)) {
4890  			err = ext4_issue_discard(sb, block_group, bit, count,
4891  						 0);
4892  			if (err && err != -EOPNOTSUPP)
4893  				ext4_msg(sb, KERN_WARNING, "discard request in"
4894  					 " group:%d block:%d count:%lu failed"
4895  					 " with %d", block_group, bit, count,
4896  					 err);
4897  		} else
4898  			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4899  
4900  		ext4_lock_group(sb, block_group);
4901  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4902  		mb_free_blocks(inode, &e4b, bit, count_clusters);
4903  	}
4904  
4905  	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4906  	ext4_free_group_clusters_set(sb, gdp, ret);
4907  	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4908  	ext4_group_desc_csum_set(sb, block_group, gdp);
4909  	ext4_unlock_group(sb, block_group);
4910  
4911  	if (sbi->s_log_groups_per_flex) {
4912  		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4913  		atomic64_add(count_clusters,
4914  			     &sbi_array_rcu_deref(sbi, s_flex_groups,
4915  						  flex_group)->free_clusters);
4916  	}
4917  
4918  	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4919  		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4920  	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4921  
4922  	ext4_mb_unload_buddy(&e4b);
4923  
4924  	/* We dirtied the bitmap block */
4925  	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4926  	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4927  
4928  	/* And the group descriptor block */
4929  	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4930  	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4931  	if (!err)
4932  		err = ret;
4933  
4934  	if (overflow && !err) {
4935  		block += count;
4936  		count = overflow;
4937  		put_bh(bitmap_bh);
4938  		goto do_more;
4939  	}
4940  error_return:
4941  	brelse(bitmap_bh);
4942  	ext4_std_error(sb, err);
4943  	return;
4944  }
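
/*
 * Hedged usage sketch (not part of mballoc.c): a caller releasing a single
 * metadata block, e.g. an extent tree block during truncate, is assumed to
 * invoke ext4_free_blocks() roughly like this; "pblk" is a placeholder for
 * the physical block number being freed.
 */
#if 0
	ext4_free_blocks(handle, inode, NULL /* no buffer_head */, pblk, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
#endif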
4945  
4946  /**
4947   * ext4_group_add_blocks() -- Add given blocks to an existing group
4948   * @handle:			handle to this transaction
4949   * @sb:				super block
4950   * @block:			start physical block to add to the block group
4951   * @count:			number of blocks to add
4952   *
4953   * This marks the blocks as free in the bitmap and buddy.
4954   */
4955  int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4956  			 ext4_fsblk_t block, unsigned long count)
4957  {
4958  	struct buffer_head *bitmap_bh = NULL;
4959  	struct buffer_head *gd_bh;
4960  	ext4_group_t block_group;
4961  	ext4_grpblk_t bit;
4962  	unsigned int i;
4963  	struct ext4_group_desc *desc;
4964  	struct ext4_sb_info *sbi = EXT4_SB(sb);
4965  	struct ext4_buddy e4b;
4966  	int err = 0, ret, blk_free_count;
4967  	ext4_grpblk_t blocks_freed;
4968  
4969  	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4970  
4971  	if (count == 0)
4972  		return 0;
4973  
4974  	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4975  	/*
4976  	 * Check to see if we are freeing blocks across a group
4977  	 * boundary.
4978  	 */
4979  	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4980  		ext4_warning(sb, "too many blocks added to group %u\n",
4981  			     block_group);
4982  		err = -EINVAL;
4983  		goto error_return;
4984  	}
4985  
4986  	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4987  	if (IS_ERR(bitmap_bh)) {
4988  		err = PTR_ERR(bitmap_bh);
4989  		bitmap_bh = NULL;
4990  		goto error_return;
4991  	}
4992  
4993  	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4994  	if (!desc) {
4995  		err = -EIO;
4996  		goto error_return;
4997  	}
4998  
4999  	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
5000  	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
5001  	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
5002  	    in_range(block + count - 1, ext4_inode_table(sb, desc),
5003  		     sbi->s_itb_per_group)) {
5004  		ext4_error(sb, "Adding blocks in system zones - "
5005  			   "Block = %llu, count = %lu",
5006  			   block, count);
5007  		err = -EINVAL;
5008  		goto error_return;
5009  	}
5010  
5011  	BUFFER_TRACE(bitmap_bh, "getting write access");
5012  	err = ext4_journal_get_write_access(handle, bitmap_bh);
5013  	if (err)
5014  		goto error_return;
5015  
5016  	/*
5017  	 * We are about to modify some metadata.  Call the journal APIs
5018  	 * to unshare ->b_data if a currently-committing transaction is
5019  	 * using it
5020  	 */
5021  	BUFFER_TRACE(gd_bh, "get_write_access");
5022  	err = ext4_journal_get_write_access(handle, gd_bh);
5023  	if (err)
5024  		goto error_return;
5025  
5026  	for (i = 0, blocks_freed = 0; i < count; i++) {
5027  		BUFFER_TRACE(bitmap_bh, "clear bit");
5028  		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
5029  			ext4_error(sb, "bit already cleared for block %llu",
5030  				   (ext4_fsblk_t)(block + i));
5031  			BUFFER_TRACE(bitmap_bh, "bit already cleared");
5032  		} else {
5033  			blocks_freed++;
5034  		}
5035  	}
5036  
5037  	err = ext4_mb_load_buddy(sb, block_group, &e4b);
5038  	if (err)
5039  		goto error_return;
5040  
5041  	/*
5042  	 * group_info->bb_free and the bitmap must be updated
5043  	 * with the group lock held; ext4_mb_generate_buddy()
5044  	 * looks at them under the same lock
5045  	 */
5046  	ext4_lock_group(sb, block_group);
5047  	mb_clear_bits(bitmap_bh->b_data, bit, count);
5048  	mb_free_blocks(NULL, &e4b, bit, count);
5049  	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
5050  	ext4_free_group_clusters_set(sb, desc, blk_free_count);
5051  	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5052  	ext4_group_desc_csum_set(sb, block_group, desc);
5053  	ext4_unlock_group(sb, block_group);
5054  	percpu_counter_add(&sbi->s_freeclusters_counter,
5055  			   EXT4_NUM_B2C(sbi, blocks_freed));
5056  
5057  	if (sbi->s_log_groups_per_flex) {
5058  		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5059  		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
5060  			     &sbi_array_rcu_deref(sbi, s_flex_groups,
5061  						  flex_group)->free_clusters);
5062  	}
5063  
5064  	ext4_mb_unload_buddy(&e4b);
5065  
5066  	/* We dirtied the bitmap block */
5067  	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5068  	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5069  
5070  	/* And the group descriptor block */
5071  	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5072  	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5073  	if (!err)
5074  		err = ret;
5075  
5076  error_return:
5077  	brelse(bitmap_bh);
5078  	ext4_std_error(sb, err);
5079  	return err;
5080  }
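
/*
 * Hedged caller sketch (illustrative only): the online-resize path is assumed
 * to hand newly added space to mballoc roughly like this, where
 * "o_blocks_count" stands for the old filesystem size in blocks and "add" for
 * the number of blocks appended to the last group.
 */
#if 0
	err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
#endif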
5081  
5082  /**
5083   * ext4_trim_extent -- function to TRIM one single free extent in the group
5084   * @sb:		super block for the file system
5085   * @start:	starting block of the free extent in the alloc. group
5086   * @count:	number of blocks to TRIM
5087   * @group:	alloc. group we are working with
5088   * @e4b:	ext4 buddy for the group
5089   * @blkdev_flags: flags for the block device
5090   *
5091   * Trim "count" blocks starting at "start" in the "group". To ensure that no
5092   * one will allocate those blocks, mark them as used in the buddy bitmap. This
5093   * must be called under the group lock.
5094   */
5095  static int ext4_trim_extent(struct super_block *sb, int start, int count,
5096  			    ext4_group_t group, struct ext4_buddy *e4b,
5097  			    unsigned long blkdev_flags)
5098  __releases(bitlock)
5099  __acquires(bitlock)
5100  {
5101  	struct ext4_free_extent ex;
5102  	int ret = 0;
5103  
5104  	trace_ext4_trim_extent(sb, group, start, count);
5105  
5106  	assert_spin_locked(ext4_group_lock_ptr(sb, group));
5107  
5108  	ex.fe_start = start;
5109  	ex.fe_group = group;
5110  	ex.fe_len = count;
5111  
5112  	/*
5113  	 * Mark blocks used, so no one can reuse them while
5114  	 * being trimmed.
5115  	 */
5116  	mb_mark_used(e4b, &ex);
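	/*
	 * The group lock is dropped around ext4_issue_discard() below because
	 * issuing the discard to the block device may sleep; the extent stays
	 * marked used in the buddy bitmap in the meantime.
	 */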
5117  	ext4_unlock_group(sb, group);
5118  	ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
5119  	ext4_lock_group(sb, group);
5120  	mb_free_blocks(NULL, e4b, start, ex.fe_len);
5121  	return ret;
5122  }
5123  
5124  /**
5125   * ext4_trim_all_free -- function to trim all free space in alloc. group
5126   * @sb:			super block for file system
5127   * @group:		group to be trimmed
5128   * @start:		first group block to examine
5129   * @max:		last group block to examine
5130   * @minblocks:		minimum extent block count
5131   * @blkdev_flags:	flags for the block device
5132   *
5133   * ext4_trim_all_free walks through the group's buddy bitmap searching for
5134   * free extents. When a free extent of at least @minblocks clusters is
5135   * found, it is first marked as used in the group buddy bitmap so that
5136   * nobody can allocate it while it is being trimmed, then a discard (TRIM)
5137   * is issued for the extent via ext4_trim_extent(), and finally the extent
5138   * is freed again in the buddy bitmap. This is repeated until the whole
5139   * range from @start to @max has been scanned.
5140   *
5141   * Returns the number of clusters trimmed, or a negative error code.
5142   */
5143  static ext4_grpblk_t
5144  ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5145  		   ext4_grpblk_t start, ext4_grpblk_t max,
5146  		   ext4_grpblk_t minblocks, unsigned long blkdev_flags)
5147  {
5148  	void *bitmap;
5149  	ext4_grpblk_t next, count = 0, free_count = 0;
5150  	struct ext4_buddy e4b;
5151  	int ret = 0;
5152  
5153  	trace_ext4_trim_all_free(sb, group, start, max);
5154  
5155  	ret = ext4_mb_load_buddy(sb, group, &e4b);
5156  	if (ret) {
5157  		ext4_warning(sb, "Error %d loading buddy information for %u",
5158  			     ret, group);
5159  		return ret;
5160  	}
5161  	bitmap = e4b.bd_bitmap;
5162  
5163  	ext4_lock_group(sb, group);
5164  	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5165  	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5166  		goto out;
5167  
5168  	start = (e4b.bd_info->bb_first_free > start) ?
5169  		e4b.bd_info->bb_first_free : start;
5170  
5171  	while (start <= max) {
5172  		start = mb_find_next_zero_bit(bitmap, max + 1, start);
5173  		if (start > max)
5174  			break;
5175  		next = mb_find_next_bit(bitmap, max + 1, start);
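		/*
		 * [start, next) is now a run of free clusters. For example,
		 * with bitmap bits 1,1,0,0,0,1,... and start == 0, the zero
		 * bit search returns 2 and the set bit search returns 5, so
		 * the candidate extent is clusters 2-4 (length 3).
		 */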
5176  
5177  		if ((next - start) >= minblocks) {
5178  			ret = ext4_trim_extent(sb, start,
5179  					       next - start, group, &e4b,
5180  					       blkdev_flags);
5181  			if (ret && ret != -EOPNOTSUPP)
5182  				break;
5183  			ret = 0;
5184  			count += next - start;
5185  		}
5186  		free_count += next - start;
5187  		start = next + 1;
5188  
5189  		if (fatal_signal_pending(current)) {
5190  			count = -ERESTARTSYS;
5191  			break;
5192  		}
5193  
5194  		if (need_resched()) {
5195  			ext4_unlock_group(sb, group);
5196  			cond_resched();
5197  			ext4_lock_group(sb, group);
5198  		}
5199  
5200  		if ((e4b.bd_info->bb_free - free_count) < minblocks)
5201  			break;
5202  	}
5203  
5204  	if (!ret) {
5205  		ret = count;
5206  		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5207  	}
5208  out:
5209  	ext4_unlock_group(sb, group);
5210  	ext4_mb_unload_buddy(&e4b);
5211  
5212  	ext4_debug("trimmed %d blocks in the group %d\n",
5213  		count, group);
5214  
5215  	return ret;
5216  }
5217  
5218  /**
5219   * ext4_trim_fs() -- trim ioctl handle function
5220   * @sb:			superblock for filesystem
5221   * @range:		fstrim_range structure
5222   * @blkdev_flags:	flags for the block device
5223   *
5224   * start:	first byte to trim
5225   * len:		number of bytes to trim from start
5226   * minlen:	minimum extent length in bytes
5227   * ext4_trim_fs goes through all the allocation groups containing bytes from
5228   * start to start+len. For each such group, ext4_trim_all_free() is invoked
5229   * to trim all free space in the group.
5230   */
5231  int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
5232  			unsigned long blkdev_flags)
5233  {
5234  	struct request_queue *q = bdev_get_queue(sb->s_bdev);
5235  	struct ext4_group_info *grp;
5236  	ext4_group_t group, first_group, last_group;
5237  	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5238  	uint64_t start, end, minlen, trimmed = 0;
5239  	ext4_fsblk_t first_data_blk =
5240  			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5241  	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5242  	int ret = 0;
5243  
5244  	start = range->start >> sb->s_blocksize_bits;
5245  	end = start + (range->len >> sb->s_blocksize_bits) - 1;
5246  	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5247  			      range->minlen >> sb->s_blocksize_bits);
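	/*
	 * Example (assuming 4 KiB blocks, i.e. s_blocksize_bits == 12, and no
	 * bigalloc): range->start = 1 MiB gives start = 256, and
	 * range->minlen = 64 KiB gives minlen = 16 clusters.
	 */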
5248  
5249  	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5250  	    start >= max_blks ||
5251  	    range->len < sb->s_blocksize)
5252  		return -EINVAL;
5253  	/* No point to try to trim less than discard granularity */
5254  	if (range->minlen < q->limits.discard_granularity) {
5255  		minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5256  			q->limits.discard_granularity >> sb->s_blocksize_bits);
5257  		if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
5258  			goto out;
5259  	}
5260  	if (end >= max_blks)
5261  		end = max_blks - 1;
5262  	if (end <= first_data_blk)
5263  		goto out;
5264  	if (start < first_data_blk)
5265  		start = first_data_blk;
5266  
5267  	/* Determine first and last group to examine based on start and end */
5268  	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5269  				     &first_group, &first_cluster);
5270  	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5271  				     &last_group, &last_cluster);
5272  
5273  	/* end now represents the last cluster to discard in this group */
5274  	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5275  
5276  	for (group = first_group; group <= last_group; group++) {
5277  		grp = ext4_get_group_info(sb, group);
5278  		/* We only do this if the grp has never been initialized */
5279  		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5280  			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5281  			if (ret)
5282  				break;
5283  		}
5284  
5285  		/*
5286  		 * For all the groups except the last one, last cluster will
5287  		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5288  		 * change it for the last group, note that last_cluster is
5289  		 * already computed earlier by ext4_get_group_no_and_offset()
5290  		 */
5291  		if (group == last_group)
5292  			end = last_cluster;
5293  
5294  		if (grp->bb_free >= minlen) {
5295  			cnt = ext4_trim_all_free(sb, group, first_cluster,
5296  						end, minlen, blkdev_flags);
5297  			if (cnt < 0) {
5298  				ret = cnt;
5299  				break;
5300  			}
5301  			trimmed += cnt;
5302  		}
5303  
5304  		/*
5305  		 * For every group except the first one, we are sure
5306  		 * that the first cluster to discard will be cluster #0.
5307  		 */
5308  		first_cluster = 0;
5309  	}
5310  
5311  	if (!ret)
5312  		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5313  
5314  out:
5315  	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5316  	return ret;
5317  }
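
/*
 * Hedged caller sketch (illustrative only): the FITRIM ioctl handler is
 * assumed to drive ext4_trim_fs() roughly like this -- copy the fstrim_range
 * from user space, trim, and report the trimmed length back. "arg" is the
 * ioctl argument; 0 is passed for blkdev_flags, i.e. a plain (non-secure)
 * discard.
 */
#if 0
	struct fstrim_range range;
	int ret;

	if (copy_from_user(&range, (struct fstrim_range __user *)arg,
			   sizeof(range)))
		return -EFAULT;

	ret = ext4_trim_fs(sb, &range, 0 /* blkdev_flags */);
	if (ret < 0)
		return ret;

	if (copy_to_user((struct fstrim_range __user *)arg, &range,
			 sizeof(range)))
		return -EFAULT;

	return 0;
#endif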
5318