1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 * Written by Alex Tomas <alex@clusterfs.com>
5 */
6
7
8 /*
9 * mballoc.c contains the multiblocks allocation routines
10 */
11
12 #include "ext4_jbd2.h"
13 #include "mballoc.h"
14 #include <linux/log2.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/nospec.h>
18 #include <linux/backing-dev.h>
19 #include <linux/freezer.h>
20 #include <trace/events/ext4.h>
21
22 /*
23 * MUSTDO:
24 * - test ext4_ext_search_left() and ext4_ext_search_right()
25 * - search for metadata in few groups
26 *
27 * TODO v4:
28 * - normalization should take into account whether file is still open
29 * - discard preallocations if no free space left (policy?)
30 * - don't normalize tails
31 * - quota
32 * - reservation for superuser
33 *
34 * TODO v3:
35 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
36 * - track min/max extents in each group for better group selection
37 * - mb_mark_used() may allocate chunk right after splitting buddy
38 * - tree of groups sorted by number of free blocks
39 * - error handling
40 */
41
42 /*
43 * The allocation request involves a request for multiple blocks near
44 * the specified goal (block) value.
45 *
46 * During the initialization phase of the allocator we decide to use the
47 * group preallocation or inode preallocation depending on the size of
48 * the file. The size of the file could be the resulting file size we
49 * would have after allocation, or the current file size, whichever
50 * is larger. If the size is less than sbi->s_mb_stream_request we
51 * select to use the group preallocation. The default value of
52 * s_mb_stream_request is 16 blocks. This can also be tuned via
53 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
54 * terms of number of blocks.
55 *
56 * The main motivation for having small files use group preallocation is to
57 * ensure that small files are kept close together on the disk.
58 *
59 * In the first stage, the allocator looks at the inode prealloc list,
60 * ext4_inode_info->i_prealloc_list, which contains a list of prealloc
61 * spaces for this particular inode. The inode prealloc space is
62 * represented as:
63 *
64 * pa_lstart -> the logical start block for this prealloc space
65 * pa_pstart -> the physical start block for this prealloc space
66 * pa_len -> length for this prealloc space (in clusters)
67 * pa_free -> free space available in this prealloc space (in clusters)
68 *
69 * The inode preallocation space is used by looking at the _logical_ start
70 * block. Only if the logical file block falls within the range of a prealloc
71 * space do we consume that particular prealloc space. This makes sure that
72 * we have contiguous physical blocks representing the file blocks.
73 *
74 * The important thing to note about inode prealloc space is that
75 * we don't modify the values associated with it except
76 * pa_free.
77 *
78 * If we are not able to find blocks in the inode prealloc space and if we
79 * have the group allocation flag set then we look at the locality group
80 * prealloc space. These are per-CPU prealloc lists represented as
81 *
82 * ext4_sb_info.s_locality_groups[smp_processor_id()]
83 *
84 * The reason for having a per-CPU locality group is to reduce the contention
85 * between CPUs. It is possible to get scheduled at this point.
86 *
87 * The locality group prealloc space is used by looking at whether we have
88 * enough free space (pa_free) within the prealloc space.
89 *
90 * If we can't allocate blocks via inode prealloc and/or locality group
91 * prealloc then we look at the buddy cache. The buddy cache is represented
92 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
93 * mapped to the buddy and bitmap information regarding different
94 * groups. The buddy information is attached to buddy cache inode so that
95 * we can access them through the page cache. The information regarding
96 * each group is loaded via ext4_mb_load_buddy. The information consists
97 * of the block bitmap and buddy information, and is stored in the
98 * inode as:
99 *
100 * { page }
101 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
102 *
103 *
104 * one block each for bitmap and buddy information. So for each group we
105 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
106 * blocksize) blocks. So it can have information regarding groups_per_page
107 * which is blocks_per_page/2
108 *
109 * The buddy cache inode is not stored on disk. The inode is thrown
110 * away when the filesystem is unmounted.
111 *
112 * We look for count number of blocks in the buddy cache. If we were able
113 * to locate that many free blocks we return with additional information
114 * regarding the rest of the contiguous physical blocks available.
115 *
116 * Before allocating blocks via buddy cache we normalize the request
117 * blocks. This ensures we ask for more blocks than we need. The extra
118 * blocks that we get after allocation are added to the respective prealloc
119 * list. In case of inode preallocation we follow a list of heuristics
120 * based on file size. This can be found in ext4_mb_normalize_request. If
121 * we are doing a group prealloc we try to normalize the request to
122 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
123 * dependent on the cluster size; for non-bigalloc file systems, it is
124 * 512 blocks. This can be tuned via
125 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
126 * terms of number of blocks. If we have mounted the file system with -O
127 * stripe=<value> option the group prealloc request is normalized to the
128 * smallest multiple of the stripe value (sbi->s_stripe) which is
129 * greater than the default mb_group_prealloc.
130 *
131 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
132 * structures in two data structures:
133 *
134 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
135 *
136 * Locking: sbi->s_mb_largest_free_orders_locks (array of rw locks)
137 *
138 * This is an array of lists where the index in the array represents the
139 * largest free order in the buddy bitmap of the participating group infos of
140 * that list. So, there are exactly MB_NUM_ORDERS(sb) (i.e. the total
141 * number of possible buddy bitmap orders) lists. Group-infos are
142 * placed in the appropriate lists.
143 *
144 * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
145 *
146 * Locking: sbi->s_mb_rb_lock (rwlock)
147 *
148 * This is a red black tree consisting of group infos and the tree is sorted
149 * by average fragment sizes (which is calculated as ext4_group_info->bb_free
150 * / ext4_group_info->bb_fragments).
151 *
152 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
153 * structures to decide the order in which groups are to be traversed for
154 * fulfilling an allocation request.
155 *
156 * At CR = 0, we look for groups which have the largest_free_order >= the order
157 * of the request. We directly look at the largest free order list in the data
158 * structure (1) above where largest_free_order = order of the request. If that
159 * list is empty, we look at the remaining lists in increasing order of
160 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
161 *
162 * At CR = 1, we only consider groups where average fragment size > request
163 * size. So, we look up a group which has an average fragment size just above or
164 * equal to request size using our rb tree (data structure 2) in O(log N) time.
165 *
166 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
167 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
168 *
169 * The regular allocator (using the buddy cache) supports a few tunables.
170 *
171 * /sys/fs/ext4/<partition>/mb_min_to_scan
172 * /sys/fs/ext4/<partition>/mb_max_to_scan
173 * /sys/fs/ext4/<partition>/mb_order2_req
174 * /sys/fs/ext4/<partition>/mb_linear_limit
175 *
176 * The regular allocator uses buddy scan only if the request len is a power
177 * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
178 * value of s_mb_order2_reqs can be tuned via
179 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
180 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
181 * stripe size. This should result in better allocation on RAID setups. If
182 * not, we search in the specific group using bitmap for best extents. The
183 * tunables min_to_scan and max_to_scan control the behaviour here.
184 * min_to_scan indicates how long mballoc __must__ look for a best
185 * extent and max_to_scan indicates how long mballoc __can__ look for a
186 * best extent in the found extents. Searching for the blocks starts with
187 * the group specified as the goal value in allocation context via
188 * ac_g_ex. Each group is first checked based on the criteria whether it
189 * can be used for allocation. ext4_mb_good_group explains how the groups are
190 * checked.
191 *
192 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
193 * get traversed linearly. That may result in subsequent allocations not being
194 * close to each other. And so, the underlying device may get filled up in a
195 * non-linear fashion. While that may not matter on non-rotational devices, for
196 * rotational devices that may result in higher seek times. "mb_linear_limit"
197 * tells mballoc how many groups mballoc should search linearly before
198 * consulting the above data structures for more efficient lookups. For
199 * non-rotational devices, this value defaults to 0 and for rotational devices
200 * this is set to MB_DEFAULT_LINEAR_LIMIT.
201 *
202 * Both types of prealloc space are populated as above. So for the first
203 * request we will hit the buddy cache, which will result in this prealloc
204 * space getting filled. The prealloc space is then later used for
205 * subsequent requests.
206 */
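
/*
 * A rough illustration of the buddy cache layout described above (numbers
 * assume a 4k PAGE_SIZE and are for illustration only):
 *
 *	blocksize 1k:  blocks_per_page = 4096 / 1024 = 4, groups_per_page = 2
 *	               page 0 = [grp 0 bitmap][grp 0 buddy][grp 1 bitmap][grp 1 buddy]
 *	blocksize 4k:  blocks_per_page = 1, groups_per_page = 1
 *	               page 0 = [grp 0 bitmap], page 1 = [grp 0 buddy], ...
 */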
207
208 /*
209 * mballoc operates on the following data:
210 * - on-disk bitmap
211 * - in-core buddy (actually includes buddy and bitmap)
212 * - preallocation descriptors (PAs)
213 *
214 * there are two types of preallocations:
215 * - inode
216 * assigned to a specific inode and can be used for this inode only.
217 * it describes part of the inode's space preallocated to specific
218 * physical blocks. any block from that preallocation can be used
219 * independently. the descriptor just tracks the number of blocks left
220 * unused. so, before taking some block from the descriptor, one must
221 * make sure the corresponding logical block isn't allocated yet. this
222 * also means that freeing any block within descriptor's range
223 * must discard all preallocated blocks.
224 * - locality group
225 * assigned to a specific locality group which does not translate to a
226 * permanent set of inodes: an inode can join and leave the group. space
227 * from this type of preallocation can be used for any inode. thus
228 * it's consumed from the beginning to the end.
229 *
230 * relation between them can be expressed as:
231 * in-core buddy = on-disk bitmap + preallocation descriptors
232 *
233 * this means the blocks mballoc considers used are:
234 * - allocated blocks (persistent)
235 * - preallocated blocks (non-persistent)
236 *
237 * consistency in mballoc world means that at any time a block is either
238 * free or used in ALL structures. notice: "any time" should not be read
239 * literally -- time is discrete and delimited by locks.
240 *
241 * to keep it simple, we don't use block numbers, instead we count number of
242 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
243 *
244 * all operations can be expressed as:
245 * - init buddy: buddy = on-disk + PAs
246 * - new PA: buddy += N; PA = N
247 * - use inode PA: on-disk += N; PA -= N
248 * - discard inode PA: buddy -= on-disk - PA; PA = 0
249 * - use locality group PA: on-disk += N; PA -= N
250 * - discard locality group PA: buddy -= PA; PA = 0
251 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
252 * is used in real operation because we can't know actual used
253 * bits from PA, only from on-disk bitmap
254 *
255 * if we follow this strict logic, then all operations above should be atomic.
256 * given some of them can block, we'd have to use something like semaphores
257 * killing performance on high-end SMP hardware. let's try to relax it using
258 * the following knowledge:
259 * 1) if buddy is referenced, it's already initialized
260 * 2) while block is used in buddy and the buddy is referenced,
261 * nobody can re-allocate that block
262 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
263 * a bit set and a PA claims the same block, it's OK. IOW, one can set a
264 * bit in the on-disk bitmap if the buddy has the same bit set and/or a PA
265 * covers the corresponding block
266 *
267 * so, now we're building a concurrency table:
268 * - init buddy vs.
269 * - new PA
270 * blocks for PA are allocated in the buddy, buddy must be referenced
271 * until PA is linked to allocation group to avoid concurrent buddy init
272 * - use inode PA
273 * we need to make sure that either on-disk bitmap or PA has uptodate data
274 * given (3) we care that PA-=N operation doesn't interfere with init
275 * - discard inode PA
276 * the simplest way would be to have buddy initialized by the discard
277 * - use locality group PA
278 * again PA-=N must be serialized with init
279 * - discard locality group PA
280 * the simplest way would be to have buddy initialized by the discard
281 * - new PA vs.
282 * - use inode PA
283 * i_data_sem serializes them
284 * - discard inode PA
285 * discard process must wait until PA isn't used by another process
286 * - use locality group PA
287 * some mutex should serialize them
288 * - discard locality group PA
289 * discard process must wait until PA isn't used by another process
290 * - use inode PA
291 * - use inode PA
292 * i_data_sem or another mutex should serialize them
293 * - discard inode PA
294 * discard process must wait until PA isn't used by another process
295 * - use locality group PA
296 * nothing wrong here -- they're different PAs covering different blocks
297 * - discard locality group PA
298 * discard process must wait until PA isn't used by another process
299 *
300 * now we're ready to draw a few conclusions:
301 * - while a PA is referenced, no discard of it is possible
302 * - a PA stays referenced until the block is marked in the on-disk bitmap
303 * - PA changes only after on-disk bitmap
304 * - discard must not compete with init. either init is done before
305 * any discard or they're serialized somehow
306 * - buddy init as sum of on-disk bitmap and PAs is done atomically
307 *
308 * a special case is when we've used a PA down to emptiness. no need to modify
309 * the buddy in this case, but we should take care about concurrent init
310 *
311 */
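
/*
 * A rough worked example of the accounting above (illustrative numbers only):
 *
 *	new inode PA of N = 16 clusters: buddy marks 16 clusters used, pa_free = 16
 *	allocate 6 clusters from the PA: 6 bits get set on-disk, pa_free = 10
 *	discard the PA:                  the 10 still-unused clusters are freed in
 *	                                 the buddy (consulting the on-disk bitmap to
 *	                                 see which bits are really in use)
 */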
312
313 /*
314 * Logic in few words:
315 *
316 * - allocation:
317 * load group
318 * find blocks
319 * mark bits in on-disk bitmap
320 * release group
321 *
322 * - use preallocation:
323 * find proper PA (per-inode or group)
324 * load group
325 * mark bits in on-disk bitmap
326 * release group
327 * release PA
328 *
329 * - free:
330 * load group
331 * mark bits in on-disk bitmap
332 * release group
333 *
334 * - discard preallocations in group:
335 * mark PAs deleted
336 * move them onto local list
337 * load on-disk bitmap
338 * load group
339 * remove PA from object (inode or locality group)
340 * mark free blocks in-core
341 *
342 * - discard inode's preallocations:
343 */
344
345 /*
346 * Locking rules
347 *
348 * Locks:
349 * - bitlock on a group (group)
350 * - object (inode/locality) (object)
351 * - per-pa lock (pa)
352 * - cr0 lists lock (cr0)
353 * - cr1 tree lock (cr1)
354 *
355 * Paths:
356 * - new pa
357 * object
358 * group
359 *
360 * - find and use pa:
361 * pa
362 *
363 * - release consumed pa:
364 * pa
365 * group
366 * object
367 *
368 * - generate in-core bitmap:
369 * group
370 * pa
371 *
372 * - discard all for given object (inode, locality group):
373 * object
374 * pa
375 * group
376 *
377 * - discard all for given group:
378 * group
379 * pa
380 * group
381 * object
382 *
383 * - allocation path (ext4_mb_regular_allocator)
384 * group
385 * cr0/cr1
386 */
387 static struct kmem_cache *ext4_pspace_cachep;
388 static struct kmem_cache *ext4_ac_cachep;
389 static struct kmem_cache *ext4_free_data_cachep;
390
391 /* We create slab caches for groupinfo data structures based on the
392 * superblock block size. There will be one per mounted filesystem for
393 * each unique s_blocksize_bits */
394 #define NR_GRPINFO_CACHES 8
395 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
396
397 static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
398 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
399 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
400 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
401 };
402
403 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
404 ext4_group_t group);
405 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
406 ext4_group_t group);
407 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
408
409 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
410 ext4_group_t group, int cr);
411
412 static int ext4_try_to_trim_range(struct super_block *sb,
413 struct ext4_buddy *e4b, ext4_grpblk_t start,
414 ext4_grpblk_t max, ext4_grpblk_t minblocks);
415
416 /*
417 * The algorithm using this percpu seq counter goes as follows:
418 * 1. We sample the percpu discard_pa_seq counter before trying for block
419 * allocation in ext4_mb_new_blocks().
420 * 2. We increment this percpu discard_pa_seq counter when we either allocate
421 * or free these blocks i.e. while marking those blocks as used/free in
422 * mb_mark_used()/mb_free_blocks().
423 * 3. We also increment this percpu seq counter when we successfully identify
424 * that the bb_prealloc_list is not empty and hence proceed for discarding
425 * of those PAs inside ext4_mb_discard_group_preallocations().
426 *
427 * Now to make sure that the regular fast path of block allocation is not
428 * affected, as a small optimization we only sample the percpu seq counter
429 * on that cpu. Only when the block allocation fails and the number of freed
430 * blocks found was 0 do we sample the percpu seq counter for all cpus using
431 * the function ext4_get_discard_pa_seq_sum() below. This happens after making
432 * sure that all the PAs on grp->bb_prealloc_list got freed or that it's empty.
433 */
434 static DEFINE_PER_CPU(u64, discard_pa_seq);
435 static inline u64 ext4_get_discard_pa_seq_sum(void)
436 {
437 int __cpu;
438 u64 __seq = 0;
439
440 for_each_possible_cpu(__cpu)
441 __seq += per_cpu(discard_pa_seq, __cpu);
442 return __seq;
443 }
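
/*
 * Simplified sketch of how the counter above is meant to be consulted (this is
 * not the exact retry logic used by the allocator, just an illustration):
 *
 *	seq = this_cpu_read(discard_pa_seq);        <- cheap per-cpu sample
 *	... attempt the allocation ...
 *	if (allocation failed && freed == 0 &&
 *	    ext4_get_discard_pa_seq_sum() != seq)   <- something changed meanwhile
 *		retry the allocation;
 */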
444
445 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
446 {
447 #if BITS_PER_LONG == 64
448 *bit += ((unsigned long) addr & 7UL) << 3;
449 addr = (void *) ((unsigned long) addr & ~7UL);
450 #elif BITS_PER_LONG == 32
451 *bit += ((unsigned long) addr & 3UL) << 3;
452 addr = (void *) ((unsigned long) addr & ~3UL);
453 #else
454 #error "how many bits you are?!"
455 #endif
456 return addr;
457 }
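
/*
 * Example (64-bit, illustrative): for addr = base + 3 and *bit = 5 the helper
 * above returns addr = base and *bit = 5 + 3 * 8 = 29, i.e. the same bit
 * expressed against the containing unsigned-long-aligned address.
 */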
458
459 static inline int mb_test_bit(int bit, void *addr)
460 {
461 /*
462 * ext4_test_bit on architecture like powerpc
463 * needs unsigned long aligned address
464 */
465 addr = mb_correct_addr_and_bit(&bit, addr);
466 return ext4_test_bit(bit, addr);
467 }
468
469 static inline void mb_set_bit(int bit, void *addr)
470 {
471 addr = mb_correct_addr_and_bit(&bit, addr);
472 ext4_set_bit(bit, addr);
473 }
474
475 static inline void mb_clear_bit(int bit, void *addr)
476 {
477 addr = mb_correct_addr_and_bit(&bit, addr);
478 ext4_clear_bit(bit, addr);
479 }
480
481 static inline int mb_test_and_clear_bit(int bit, void *addr)
482 {
483 addr = mb_correct_addr_and_bit(&bit, addr);
484 return ext4_test_and_clear_bit(bit, addr);
485 }
486
487 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
488 {
489 int fix = 0, ret, tmpmax;
490 addr = mb_correct_addr_and_bit(&fix, addr);
491 tmpmax = max + fix;
492 start += fix;
493
494 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
495 if (ret > max)
496 return max;
497 return ret;
498 }
499
500 static inline int mb_find_next_bit(void *addr, int max, int start)
501 {
502 int fix = 0, ret, tmpmax;
503 addr = mb_correct_addr_and_bit(&fix, addr);
504 tmpmax = max + fix;
505 start += fix;
506
507 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
508 if (ret > max)
509 return max;
510 return ret;
511 }
512
513 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
514 {
515 char *bb;
516
517 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
518 BUG_ON(max == NULL);
519
520 if (order > e4b->bd_blkbits + 1) {
521 *max = 0;
522 return NULL;
523 }
524
525 /* at order 0 we see each particular block */
526 if (order == 0) {
527 *max = 1 << (e4b->bd_blkbits + 3);
528 return e4b->bd_bitmap;
529 }
530
531 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
532 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
533
534 return bb;
535 }
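
/*
 * E.g. for a 4k block size (bd_blkbits = 12, illustrative), order 0 above
 * returns the cluster bitmap with 1 << (12 + 3) = 32768 bits, while higher
 * orders return the per-order buddy bitmaps stored in bd_buddy at
 * s_mb_offsets[order], with s_mb_maxs[order] valid bits each.
 */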
536
537 #ifdef DOUBLE_CHECK
538 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
539 int first, int count)
540 {
541 int i;
542 struct super_block *sb = e4b->bd_sb;
543
544 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
545 return;
546 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
547 for (i = 0; i < count; i++) {
548 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
549 ext4_fsblk_t blocknr;
550
551 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
552 blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
553 ext4_grp_locked_error(sb, e4b->bd_group,
554 inode ? inode->i_ino : 0,
555 blocknr,
556 "freeing block already freed "
557 "(bit %u)",
558 first + i);
559 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
560 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
561 }
562 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
563 }
564 }
565
566 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
567 {
568 int i;
569
570 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
571 return;
572 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
573 for (i = 0; i < count; i++) {
574 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
575 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
576 }
577 }
578
579 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
580 {
581 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
582 return;
583 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
584 unsigned char *b1, *b2;
585 int i;
586 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
587 b2 = (unsigned char *) bitmap;
588 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
589 if (b1[i] != b2[i]) {
590 ext4_msg(e4b->bd_sb, KERN_ERR,
591 "corruption in group %u "
592 "at byte %u(%u): %x in copy != %x "
593 "on disk/prealloc",
594 e4b->bd_group, i, i * 8, b1[i], b2[i]);
595 BUG();
596 }
597 }
598 }
599 }
600
601 static void mb_group_bb_bitmap_alloc(struct super_block *sb,
602 struct ext4_group_info *grp, ext4_group_t group)
603 {
604 struct buffer_head *bh;
605
606 grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
607 if (!grp->bb_bitmap)
608 return;
609
610 bh = ext4_read_block_bitmap(sb, group);
611 if (IS_ERR_OR_NULL(bh)) {
612 kfree(grp->bb_bitmap);
613 grp->bb_bitmap = NULL;
614 return;
615 }
616
617 memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
618 put_bh(bh);
619 }
620
621 static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
622 {
623 kfree(grp->bb_bitmap);
624 }
625
626 #else
627 static inline void mb_free_blocks_double(struct inode *inode,
628 struct ext4_buddy *e4b, int first, int count)
629 {
630 return;
631 }
632 static inline void mb_mark_used_double(struct ext4_buddy *e4b,
633 int first, int count)
634 {
635 return;
636 }
637 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
638 {
639 return;
640 }
641
642 static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
643 struct ext4_group_info *grp, ext4_group_t group)
644 {
645 return;
646 }
647
648 static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
649 {
650 return;
651 }
652 #endif
653
654 #ifdef AGGRESSIVE_CHECK
655
656 #define MB_CHECK_ASSERT(assert) \
657 do { \
658 if (!(assert)) { \
659 printk(KERN_EMERG \
660 "Assertion failure in %s() at %s:%d: \"%s\"\n", \
661 function, file, line, # assert); \
662 BUG(); \
663 } \
664 } while (0)
665
666 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
667 const char *function, int line)
668 {
669 struct super_block *sb = e4b->bd_sb;
670 int order = e4b->bd_blkbits + 1;
671 int max;
672 int max2;
673 int i;
674 int j;
675 int k;
676 int count;
677 struct ext4_group_info *grp;
678 int fragments = 0;
679 int fstart;
680 struct list_head *cur;
681 void *buddy;
682 void *buddy2;
683
684 if (e4b->bd_info->bb_check_counter++ % 10)
685 return 0;
686
687 while (order > 1) {
688 buddy = mb_find_buddy(e4b, order, &max);
689 MB_CHECK_ASSERT(buddy);
690 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
691 MB_CHECK_ASSERT(buddy2);
692 MB_CHECK_ASSERT(buddy != buddy2);
693 MB_CHECK_ASSERT(max * 2 == max2);
694
695 count = 0;
696 for (i = 0; i < max; i++) {
697
698 if (mb_test_bit(i, buddy)) {
699 /* only single bit in buddy2 may be 1 */
700 if (!mb_test_bit(i << 1, buddy2)) {
701 MB_CHECK_ASSERT(
702 mb_test_bit((i<<1)+1, buddy2));
703 } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
704 MB_CHECK_ASSERT(
705 mb_test_bit(i << 1, buddy2));
706 }
707 continue;
708 }
709
710 /* both bits in buddy2 must be 1 */
711 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
712 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
713
714 for (j = 0; j < (1 << order); j++) {
715 k = (i * (1 << order)) + j;
716 MB_CHECK_ASSERT(
717 !mb_test_bit(k, e4b->bd_bitmap));
718 }
719 count++;
720 }
721 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
722 order--;
723 }
724
725 fstart = -1;
726 buddy = mb_find_buddy(e4b, 0, &max);
727 for (i = 0; i < max; i++) {
728 if (!mb_test_bit(i, buddy)) {
729 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
730 if (fstart == -1) {
731 fragments++;
732 fstart = i;
733 }
734 continue;
735 }
736 fstart = -1;
737 /* check used bits only */
738 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
739 buddy2 = mb_find_buddy(e4b, j, &max2);
740 k = i >> j;
741 MB_CHECK_ASSERT(k < max2);
742 MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
743 }
744 }
745 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
746 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
747
748 grp = ext4_get_group_info(sb, e4b->bd_group);
749 if (!grp)
750 return 0;
751 list_for_each(cur, &grp->bb_prealloc_list) {
752 ext4_group_t groupnr;
753 struct ext4_prealloc_space *pa;
754 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
755 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
756 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
757 for (i = 0; i < pa->pa_len; i++)
758 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
759 }
760 return 0;
761 }
762 #undef MB_CHECK_ASSERT
763 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
764 __FILE__, __func__, __LINE__)
765 #else
766 #define mb_check_buddy(e4b)
767 #endif
768
769 /*
770 * Divide blocks started from @first with length @len into
771 * smaller chunks with power of 2 blocks.
772 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
773 * then increase bb_counters[] for corresponded chunk size.
774 */
775 static void ext4_mb_mark_free_simple(struct super_block *sb,
776 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
777 struct ext4_group_info *grp)
778 {
779 struct ext4_sb_info *sbi = EXT4_SB(sb);
780 ext4_grpblk_t min;
781 ext4_grpblk_t max;
782 ext4_grpblk_t chunk;
783 unsigned int border;
784
785 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
786
787 border = 2 << sb->s_blocksize_bits;
788
789 while (len > 0) {
790 /* find how many blocks can be covered since this position */
791 max = ffs(first | border) - 1;
792
793 /* find how many blocks of power 2 we need to mark */
794 min = fls(len) - 1;
795
796 if (max < min)
797 min = max;
798 chunk = 1 << min;
799
800 /* mark multiblock chunks only */
801 grp->bb_counters[min]++;
802 if (min > 0)
803 mb_clear_bit(first >> min,
804 buddy + sbi->s_mb_offsets[min]);
805
806 len -= chunk;
807 first += chunk;
808 }
809 }
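
/*
 * Worked example (illustrative): a free extent with first = 5, len = 7 is
 * split into power-of-2 chunks as follows:
 *
 *	[5]      order 0: bb_counters[0]++ (no buddy bit cleared for order 0)
 *	[6..7]   order 1: bb_counters[1]++, clear bit 6 >> 1 = 3 in order-1 buddy
 *	[8..11]  order 2: bb_counters[2]++, clear bit 8 >> 2 = 2 in order-2 buddy
 */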
810
811 static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
812 int (*cmp)(struct rb_node *, struct rb_node *))
813 {
814 struct rb_node **iter = &root->rb_node, *parent = NULL;
815
816 while (*iter) {
817 parent = *iter;
818 if (cmp(new, *iter) > 0)
819 iter = &((*iter)->rb_left);
820 else
821 iter = &((*iter)->rb_right);
822 }
823
824 rb_link_node(new, parent, iter);
825 rb_insert_color(new, root);
826 }
827
828 static int
829 ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
830 {
831 struct ext4_group_info *grp1 = rb_entry(rb1,
832 struct ext4_group_info,
833 bb_avg_fragment_size_rb);
834 struct ext4_group_info *grp2 = rb_entry(rb2,
835 struct ext4_group_info,
836 bb_avg_fragment_size_rb);
837 int num_frags_1, num_frags_2;
838
839 num_frags_1 = grp1->bb_fragments ?
840 grp1->bb_free / grp1->bb_fragments : 0;
841 num_frags_2 = grp2->bb_fragments ?
842 grp2->bb_free / grp2->bb_fragments : 0;
843
844 return (num_frags_2 - num_frags_1);
845 }
846
847 /*
848 * Reinsert grpinfo into the avg_fragment_size tree with new average
849 * fragment size.
850 */
851 static void
852 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
853 {
854 struct ext4_sb_info *sbi = EXT4_SB(sb);
855
856 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
857 return;
858
859 write_lock(&sbi->s_mb_rb_lock);
860 if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
861 rb_erase(&grp->bb_avg_fragment_size_rb,
862 &sbi->s_mb_avg_fragment_size_root);
863 RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
864 }
865
866 ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
867 &grp->bb_avg_fragment_size_rb,
868 ext4_mb_avg_fragment_size_cmp);
869 write_unlock(&sbi->s_mb_rb_lock);
870 }
871
872 /*
873 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
874 * cr level needs an update.
875 */
876 static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
877 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
878 {
879 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
880 struct ext4_group_info *iter, *grp;
881 int i;
882
883 if (ac->ac_status == AC_STATUS_FOUND)
884 return;
885
886 if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
887 atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
888
889 grp = NULL;
890 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
891 if (list_empty(&sbi->s_mb_largest_free_orders[i]))
892 continue;
893 read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
894 if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
895 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
896 continue;
897 }
898 grp = NULL;
899 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
900 bb_largest_free_order_node) {
901 if (sbi->s_mb_stats)
902 atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
903 if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
904 grp = iter;
905 break;
906 }
907 }
908 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
909 if (grp)
910 break;
911 }
912
913 if (!grp) {
914 /* Increment cr and search again */
915 *new_cr = 1;
916 } else {
917 *group = grp->bb_group;
918 ac->ac_last_optimal_group = *group;
919 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
920 }
921 }
922
923 /*
924 * Choose next group by traversing average fragment size tree. Updates *new_cr
925 * if cr level needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
926 * the linear search should continue for one iteration since there's lock
927 * contention on the rb tree lock.
928 */
929 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
930 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
931 {
932 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
933 int avg_fragment_size, best_so_far;
934 struct rb_node *node, *found;
935 struct ext4_group_info *grp;
936
937 /*
938 * If there is contention on the lock, instead of waiting for the lock
939 * to become available, just continue searching linearly. We'll resume
940 * our rb tree search later starting at ac->ac_last_optimal_group.
941 */
942 if (!read_trylock(&sbi->s_mb_rb_lock)) {
943 ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
944 return;
945 }
946
947 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
948 if (sbi->s_mb_stats)
949 atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
950 /* We have found something at CR 1 in the past */
951 grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
952 for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
953 found = rb_next(found)) {
954 grp = rb_entry(found, struct ext4_group_info,
955 bb_avg_fragment_size_rb);
956 if (sbi->s_mb_stats)
957 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
958 if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
959 break;
960 }
961 goto done;
962 }
963
964 node = sbi->s_mb_avg_fragment_size_root.rb_node;
965 best_so_far = 0;
966 found = NULL;
967
968 while (node) {
969 grp = rb_entry(node, struct ext4_group_info,
970 bb_avg_fragment_size_rb);
971 avg_fragment_size = 0;
972 if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
973 avg_fragment_size = grp->bb_fragments ?
974 grp->bb_free / grp->bb_fragments : 0;
975 if (!best_so_far || avg_fragment_size < best_so_far) {
976 best_so_far = avg_fragment_size;
977 found = node;
978 }
979 }
980 if (avg_fragment_size > ac->ac_g_ex.fe_len)
981 node = node->rb_right;
982 else
983 node = node->rb_left;
984 }
985
986 done:
987 if (found) {
988 grp = rb_entry(found, struct ext4_group_info,
989 bb_avg_fragment_size_rb);
990 *group = grp->bb_group;
991 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
992 } else {
993 *new_cr = 2;
994 }
995
996 read_unlock(&sbi->s_mb_rb_lock);
997 ac->ac_last_optimal_group = *group;
998 }
999
1000 static inline int should_optimize_scan(struct ext4_allocation_context *ac)
1001 {
1002 if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
1003 return 0;
1004 if (ac->ac_criteria >= 2)
1005 return 0;
1006 if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
1007 return 0;
1008 return 1;
1009 }
1010
1011 /*
1012 * Return next linear group for allocation. If linear traversal should not be
1013 * performed, this function just returns the same group
1014 */
1015 static ext4_group_t
1016 next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
1017 ext4_group_t ngroups)
1018 {
1019 if (!should_optimize_scan(ac))
1020 goto inc_and_return;
1021
1022 if (ac->ac_groups_linear_remaining) {
1023 ac->ac_groups_linear_remaining--;
1024 goto inc_and_return;
1025 }
1026
1027 if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
1028 ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
1029 goto inc_and_return;
1030 }
1031
1032 return group;
1033 inc_and_return:
1034 /*
1035 * Artificially restricted ngroups for non-extent
1036 * files makes group > ngroups possible on first loop.
1037 */
1038 return group + 1 >= ngroups ? 0 : group + 1;
1039 }
1040
1041 /*
1042 * ext4_mb_choose_next_group: choose next group for allocation.
1043 *
1044 * @ac Allocation Context
1045 * @new_cr This is an output parameter. If there is no good group
1046 * available at current CR level, this field is updated to indicate
1047 * the new cr level that should be used.
1048 * @group This is an input / output parameter. As an input it indicates the
1049 * next group that the allocator intends to use for allocation. As
1050 * output, this field indicates the next group that should be used as
1051 * determined by the optimization functions.
1052 * @ngroups Total number of groups
1053 */
1054 static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
1055 int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
1056 {
1057 *new_cr = ac->ac_criteria;
1058
1059 if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
1060 *group = next_linear_group(ac, *group, ngroups);
1061 return;
1062 }
1063
1064 if (*new_cr == 0) {
1065 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
1066 } else if (*new_cr == 1) {
1067 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
1068 } else {
1069 /*
1070 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
1071 * bb_free. But until that happens, we should never come here.
1072 */
1073 WARN_ON(1);
1074 }
1075 }
1076
1077 /*
1078 * Cache the order of the largest free extent we have available in this block
1079 * group.
1080 */
1081 static void
1082 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
1083 {
1084 struct ext4_sb_info *sbi = EXT4_SB(sb);
1085 int i;
1086
1087 for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
1088 if (grp->bb_counters[i] > 0)
1089 break;
1090 /* No need to move between order lists? */
1091 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
1092 i == grp->bb_largest_free_order) {
1093 grp->bb_largest_free_order = i;
1094 return;
1095 }
1096
1097 if (grp->bb_largest_free_order >= 0) {
1098 write_lock(&sbi->s_mb_largest_free_orders_locks[
1099 grp->bb_largest_free_order]);
1100 list_del_init(&grp->bb_largest_free_order_node);
1101 write_unlock(&sbi->s_mb_largest_free_orders_locks[
1102 grp->bb_largest_free_order]);
1103 }
1104 grp->bb_largest_free_order = i;
1105 if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
1106 write_lock(&sbi->s_mb_largest_free_orders_locks[
1107 grp->bb_largest_free_order]);
1108 list_add_tail(&grp->bb_largest_free_order_node,
1109 &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
1110 write_unlock(&sbi->s_mb_largest_free_orders_locks[
1111 grp->bb_largest_free_order]);
1112 }
1113 }
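
/*
 * E.g. (illustrative) a group whose largest free extent is 6 clusters has that
 * extent split into order-2 and order-1 buddy chunks, so bb_counters[2] > 0,
 * bb_largest_free_order becomes 2, and with mb_optimize_scan the group sits on
 * sbi->s_mb_largest_free_orders[2].
 */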
1114
1115 static noinline_for_stack
1116 void ext4_mb_generate_buddy(struct super_block *sb,
1117 void *buddy, void *bitmap, ext4_group_t group,
1118 struct ext4_group_info *grp)
1119 {
1120 struct ext4_sb_info *sbi = EXT4_SB(sb);
1121 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
1122 ext4_grpblk_t i = 0;
1123 ext4_grpblk_t first;
1124 ext4_grpblk_t len;
1125 unsigned free = 0;
1126 unsigned fragments = 0;
1127 unsigned long long period = get_cycles();
1128
1129 /* initialize buddy from bitmap which is aggregation
1130 * of on-disk bitmap and preallocations */
1131 i = mb_find_next_zero_bit(bitmap, max, 0);
1132 grp->bb_first_free = i;
1133 while (i < max) {
1134 fragments++;
1135 first = i;
1136 i = mb_find_next_bit(bitmap, max, i);
1137 len = i - first;
1138 free += len;
1139 if (len > 1)
1140 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
1141 else
1142 grp->bb_counters[0]++;
1143 if (i < max)
1144 i = mb_find_next_zero_bit(bitmap, max, i);
1145 }
1146 grp->bb_fragments = fragments;
1147
1148 if (free != grp->bb_free) {
1149 ext4_grp_locked_error(sb, group, 0, 0,
1150 "block bitmap and bg descriptor "
1151 "inconsistent: %u vs %u free clusters",
1152 free, grp->bb_free);
1153 /*
1154 * If we intend to continue, we consider group descriptor
1155 * corrupt and update bb_free using bitmap value
1156 */
1157 grp->bb_free = free;
1158 ext4_mark_group_bitmap_corrupted(sb, group,
1159 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1160 }
1161 mb_set_largest_free_order(sb, grp);
1162
1163 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
1164
1165 period = get_cycles() - period;
1166 atomic_inc(&sbi->s_mb_buddies_generated);
1167 atomic64_add(period, &sbi->s_mb_generation_time);
1168 mb_update_avg_fragment_size(sb, grp);
1169 }
1170
1171 /* The buddy information is attached to the buddy cache inode
1172 * for convenience. The information regarding each group
1173 * is loaded via ext4_mb_load_buddy. The information consists of
1174 * the block bitmap and buddy information, and it is
1175 * stored in the inode as
1176 *
1177 * { page }
1178 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1179 *
1180 *
1181 * one block each for bitmap and buddy information.
1182 * So for each group we take up 2 blocks. A page can
1183 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
1184 * So it can have information regarding groups_per_page which
1185 * is blocks_per_page/2
1186 *
1187 * Locking note: This routine takes the block group lock of all groups
1188 * for this page; do not hold this lock when calling this routine!
1189 */
1190
1191 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
1192 {
1193 ext4_group_t ngroups;
1194 int blocksize;
1195 int blocks_per_page;
1196 int groups_per_page;
1197 int err = 0;
1198 int i;
1199 ext4_group_t first_group, group;
1200 int first_block;
1201 struct super_block *sb;
1202 struct buffer_head *bhs;
1203 struct buffer_head **bh = NULL;
1204 struct inode *inode;
1205 char *data;
1206 char *bitmap;
1207 struct ext4_group_info *grinfo;
1208
1209 inode = page->mapping->host;
1210 sb = inode->i_sb;
1211 ngroups = ext4_get_groups_count(sb);
1212 blocksize = i_blocksize(inode);
1213 blocks_per_page = PAGE_SIZE / blocksize;
1214
1215 mb_debug(sb, "init page %lu\n", page->index);
1216
1217 groups_per_page = blocks_per_page >> 1;
1218 if (groups_per_page == 0)
1219 groups_per_page = 1;
1220
1221 /* allocate buffer_heads to read bitmaps */
1222 if (groups_per_page > 1) {
1223 i = sizeof(struct buffer_head *) * groups_per_page;
1224 bh = kzalloc(i, gfp);
1225 if (bh == NULL) {
1226 err = -ENOMEM;
1227 goto out;
1228 }
1229 } else
1230 bh = &bhs;
1231
1232 first_group = page->index * blocks_per_page / 2;
1233
1234 /* read all groups the page covers into the cache */
1235 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1236 if (group >= ngroups)
1237 break;
1238
1239 grinfo = ext4_get_group_info(sb, group);
1240 if (!grinfo)
1241 continue;
1242 /*
1243 * If page is uptodate then we came here after online resize
1244 * which added some new uninitialized group info structs, so
1245 * we must skip all initialized uptodate buddies on the page,
1246 * which may be currently in use by an allocating task.
1247 */
1248 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
1249 bh[i] = NULL;
1250 continue;
1251 }
1252 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
1253 if (IS_ERR(bh[i])) {
1254 err = PTR_ERR(bh[i]);
1255 bh[i] = NULL;
1256 goto out;
1257 }
1258 mb_debug(sb, "read bitmap for group %u\n", group);
1259 }
1260
1261 /* wait for I/O completion */
1262 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1263 int err2;
1264
1265 if (!bh[i])
1266 continue;
1267 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
1268 if (!err)
1269 err = err2;
1270 }
1271
1272 first_block = page->index * blocks_per_page;
1273 for (i = 0; i < blocks_per_page; i++) {
1274 group = (first_block + i) >> 1;
1275 if (group >= ngroups)
1276 break;
1277
1278 if (!bh[group - first_group])
1279 /* skip initialized uptodate buddy */
1280 continue;
1281
1282 if (!buffer_verified(bh[group - first_group]))
1283 /* Skip faulty bitmaps */
1284 continue;
1285 err = 0;
1286
1287 /*
1288 * data carries information regarding this
1289 * particular group in the format specified
1290 * above
1291 *
1292 */
1293 data = page_address(page) + (i * blocksize);
1294 bitmap = bh[group - first_group]->b_data;
1295
1296 /*
1297 * We place the buddy block and bitmap block
1298 * close together
1299 */
1300 if ((first_block + i) & 1) {
1301 /* this is block of buddy */
1302 BUG_ON(incore == NULL);
1303 mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
1304 group, page->index, i * blocksize);
1305 trace_ext4_mb_buddy_bitmap_load(sb, group);
1306 grinfo = ext4_get_group_info(sb, group);
1307 if (!grinfo) {
1308 err = -EFSCORRUPTED;
1309 goto out;
1310 }
1311 grinfo->bb_fragments = 0;
1312 memset(grinfo->bb_counters, 0,
1313 sizeof(*grinfo->bb_counters) *
1314 (MB_NUM_ORDERS(sb)));
1315 /*
1316 * incore got set to the group block bitmap below
1317 */
1318 ext4_lock_group(sb, group);
1319 /* init the buddy */
1320 memset(data, 0xff, blocksize);
1321 ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
1322 ext4_unlock_group(sb, group);
1323 incore = NULL;
1324 } else {
1325 /* this is block of bitmap */
1326 BUG_ON(incore != NULL);
1327 mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
1328 group, page->index, i * blocksize);
1329 trace_ext4_mb_bitmap_load(sb, group);
1330
1331 /* see comments in ext4_mb_put_pa() */
1332 ext4_lock_group(sb, group);
1333 memcpy(data, bitmap, blocksize);
1334
1335 /* mark all preallocated blks used in in-core bitmap */
1336 ext4_mb_generate_from_pa(sb, data, group);
1337 ext4_mb_generate_from_freelist(sb, data, group);
1338 ext4_unlock_group(sb, group);
1339
1340 /* set incore so that the buddy information can be
1341 * generated using this
1342 */
1343 incore = data;
1344 }
1345 }
1346 SetPageUptodate(page);
1347
1348 out:
1349 if (bh) {
1350 for (i = 0; i < groups_per_page; i++)
1351 brelse(bh[i]);
1352 if (bh != &bhs)
1353 kfree(bh);
1354 }
1355 return err;
1356 }
1357
1358 /*
1359 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
1360 * on the same buddy page doesn't happen while holding the buddy page lock.
1361 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
1362 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
1363 */
1364 static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1365 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
1366 {
1367 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
1368 int block, pnum, poff;
1369 int blocks_per_page;
1370 struct page *page;
1371
1372 e4b->bd_buddy_page = NULL;
1373 e4b->bd_bitmap_page = NULL;
1374
1375 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1376 /*
1377 * the buddy cache inode stores the block bitmap
1378 * and buddy information in consecutive blocks.
1379 * So for each group we need two blocks.
1380 */
1381 block = group * 2;
1382 pnum = block / blocks_per_page;
1383 poff = block % blocks_per_page;
1384 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1385 if (!page)
1386 return -ENOMEM;
1387 BUG_ON(page->mapping != inode->i_mapping);
1388 e4b->bd_bitmap_page = page;
1389 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1390
1391 if (blocks_per_page >= 2) {
1392 /* buddy and bitmap are on the same page */
1393 return 0;
1394 }
1395
1396 block++;
1397 pnum = block / blocks_per_page;
1398 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1399 if (!page)
1400 return -ENOMEM;
1401 BUG_ON(page->mapping != inode->i_mapping);
1402 e4b->bd_buddy_page = page;
1403 return 0;
1404 }
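
/*
 * Illustration (4k PAGE_SIZE, 1k block size, numbers for illustration only):
 * blocks_per_page = 4, so for group 5 above block = 10, pnum = 10 / 4 = 2 and
 * poff = 10 % 4 = 2; the bitmap (block 10) and buddy (block 11) then share
 * page 2, which is why the function returns early with bd_buddy_page == NULL.
 * Only when the block size equals PAGE_SIZE (blocks_per_page == 1) do the
 * bitmap and buddy land on two different pages.
 */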
1405
1406 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1407 {
1408 if (e4b->bd_bitmap_page) {
1409 unlock_page(e4b->bd_bitmap_page);
1410 put_page(e4b->bd_bitmap_page);
1411 }
1412 if (e4b->bd_buddy_page) {
1413 unlock_page(e4b->bd_buddy_page);
1414 put_page(e4b->bd_buddy_page);
1415 }
1416 }
1417
1418 /*
1419 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1420 * block group lock of all groups for this page; do not hold the BG lock when
1421 * calling this routine!
1422 */
1423 static noinline_for_stack
1424 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1425 {
1426
1427 struct ext4_group_info *this_grp;
1428 struct ext4_buddy e4b;
1429 struct page *page;
1430 int ret = 0;
1431
1432 might_sleep();
1433 mb_debug(sb, "init group %u\n", group);
1434 this_grp = ext4_get_group_info(sb, group);
1435 if (!this_grp)
1436 return -EFSCORRUPTED;
1437
1438 /*
1439 * This ensures that we don't reinit the buddy cache
1440 * page which maps to the group from which we are already
1441 * allocating. If we are looking at the buddy cache we would
1442 * have taken a reference using ext4_mb_load_buddy and that
1443 * would have pinned buddy page to page cache.
1444 * The call to ext4_mb_get_buddy_page_lock will mark the
1445 * page accessed.
1446 */
1447 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1448 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1449 /*
1450 * somebody initialized the group
1451 * return without doing anything
1452 */
1453 goto err;
1454 }
1455
1456 page = e4b.bd_bitmap_page;
1457 ret = ext4_mb_init_cache(page, NULL, gfp);
1458 if (ret)
1459 goto err;
1460 if (!PageUptodate(page)) {
1461 ret = -EIO;
1462 goto err;
1463 }
1464
1465 if (e4b.bd_buddy_page == NULL) {
1466 /*
1467 * If both the bitmap and buddy are in
1468 * the same page we don't need to force
1469 * init the buddy
1470 */
1471 ret = 0;
1472 goto err;
1473 }
1474 /* init buddy cache */
1475 page = e4b.bd_buddy_page;
1476 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1477 if (ret)
1478 goto err;
1479 if (!PageUptodate(page)) {
1480 ret = -EIO;
1481 goto err;
1482 }
1483 err:
1484 ext4_mb_put_buddy_page_lock(&e4b);
1485 return ret;
1486 }
1487
1488 /*
1489 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1490 * block group lock of all groups for this page; do not hold the BG lock when
1491 * calling this routine!
1492 */
1493 static noinline_for_stack int
1494 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1495 struct ext4_buddy *e4b, gfp_t gfp)
1496 {
1497 int blocks_per_page;
1498 int block;
1499 int pnum;
1500 int poff;
1501 struct page *page;
1502 int ret;
1503 struct ext4_group_info *grp;
1504 struct ext4_sb_info *sbi = EXT4_SB(sb);
1505 struct inode *inode = sbi->s_buddy_cache;
1506
1507 might_sleep();
1508 mb_debug(sb, "load group %u\n", group);
1509
1510 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1511 grp = ext4_get_group_info(sb, group);
1512 if (!grp)
1513 return -EFSCORRUPTED;
1514
1515 e4b->bd_blkbits = sb->s_blocksize_bits;
1516 e4b->bd_info = grp;
1517 e4b->bd_sb = sb;
1518 e4b->bd_group = group;
1519 e4b->bd_buddy_page = NULL;
1520 e4b->bd_bitmap_page = NULL;
1521
1522 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1523 /*
1524 * we need full data about the group
1525 * to make a good selection
1526 */
1527 ret = ext4_mb_init_group(sb, group, gfp);
1528 if (ret)
1529 return ret;
1530 }
1531
1532 /*
1533 * the buddy cache inode stores the block bitmap
1534 * and buddy information in consecutive blocks.
1535 * So for each group we need two blocks.
1536 */
1537 block = group * 2;
1538 pnum = block / blocks_per_page;
1539 poff = block % blocks_per_page;
1540
1541 /* we could use find_or_create_page(), but it locks the page,
1542 * which we'd like to avoid in the fast path ... */
1543 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1544 if (page == NULL || !PageUptodate(page)) {
1545 if (page)
1546 /*
1547 * drop the page reference and try
1548 * to get the page with lock. If we
1549 * are not uptodate that implies
1550 * somebody just created the page but
1551 * is yet to initialize the same. So
1552 * wait for it to initialize.
1553 */
1554 put_page(page);
1555 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1556 if (page) {
1557 BUG_ON(page->mapping != inode->i_mapping);
1558 if (!PageUptodate(page)) {
1559 ret = ext4_mb_init_cache(page, NULL, gfp);
1560 if (ret) {
1561 unlock_page(page);
1562 goto err;
1563 }
1564 mb_cmp_bitmaps(e4b, page_address(page) +
1565 (poff * sb->s_blocksize));
1566 }
1567 unlock_page(page);
1568 }
1569 }
1570 if (page == NULL) {
1571 ret = -ENOMEM;
1572 goto err;
1573 }
1574 if (!PageUptodate(page)) {
1575 ret = -EIO;
1576 goto err;
1577 }
1578
1579 /* Pages marked accessed already */
1580 e4b->bd_bitmap_page = page;
1581 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1582
1583 block++;
1584 pnum = block / blocks_per_page;
1585 poff = block % blocks_per_page;
1586
1587 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1588 if (page == NULL || !PageUptodate(page)) {
1589 if (page)
1590 put_page(page);
1591 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1592 if (page) {
1593 BUG_ON(page->mapping != inode->i_mapping);
1594 if (!PageUptodate(page)) {
1595 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1596 gfp);
1597 if (ret) {
1598 unlock_page(page);
1599 goto err;
1600 }
1601 }
1602 unlock_page(page);
1603 }
1604 }
1605 if (page == NULL) {
1606 ret = -ENOMEM;
1607 goto err;
1608 }
1609 if (!PageUptodate(page)) {
1610 ret = -EIO;
1611 goto err;
1612 }
1613
1614 /* Pages marked accessed already */
1615 e4b->bd_buddy_page = page;
1616 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1617
1618 return 0;
1619
1620 err:
1621 if (page)
1622 put_page(page);
1623 if (e4b->bd_bitmap_page)
1624 put_page(e4b->bd_bitmap_page);
1625 if (e4b->bd_buddy_page)
1626 put_page(e4b->bd_buddy_page);
1627 e4b->bd_buddy = NULL;
1628 e4b->bd_bitmap = NULL;
1629 return ret;
1630 }
1631
1632 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1633 struct ext4_buddy *e4b)
1634 {
1635 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1636 }
1637
1638 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1639 {
1640 if (e4b->bd_bitmap_page)
1641 put_page(e4b->bd_bitmap_page);
1642 if (e4b->bd_buddy_page)
1643 put_page(e4b->bd_buddy_page);
1644 }
1645
1646
1647 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1648 {
1649 int order = 1, max;
1650 void *bb;
1651
1652 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1653 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1654
1655 while (order <= e4b->bd_blkbits + 1) {
1656 bb = mb_find_buddy(e4b, order, &max);
1657 if (!mb_test_bit(block >> order, bb)) {
1658 /* this block is part of buddy of order 'order' */
1659 return order;
1660 }
1661 order++;
1662 }
1663 return 0;
1664 }
1665
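/*
 * Clear @len bits in @bm starting at bit @cur.  Runs that are 32-bit
 * aligned are cleared a whole word at a time; the remaining bits are
 * cleared one by one.  E.g. cur = 0, len = 40 clears bits 0-31 with a
 * single word store and bits 32-39 individually.
 */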
1666 static void mb_clear_bits(void *bm, int cur, int len)
1667 {
1668 __u32 *addr;
1669
1670 len = cur + len;
1671 while (cur < len) {
1672 if ((cur & 31) == 0 && (len - cur) >= 32) {
1673 /* fast path: clear whole word at once */
1674 addr = bm + (cur >> 3);
1675 *addr = 0;
1676 cur += 32;
1677 continue;
1678 }
1679 mb_clear_bit(cur, bm);
1680 cur++;
1681 }
1682 }
1683
1684 /* clear bits in given range
1685 * will return first found zero bit if any, -1 otherwise
1686 */
1687 static int mb_test_and_clear_bits(void *bm, int cur, int len)
1688 {
1689 __u32 *addr;
1690 int zero_bit = -1;
1691
1692 len = cur + len;
1693 while (cur < len) {
1694 if ((cur & 31) == 0 && (len - cur) >= 32) {
1695 /* fast path: clear whole word at once */
1696 addr = bm + (cur >> 3);
1697 if (*addr != (__u32)(-1) && zero_bit == -1)
1698 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1699 *addr = 0;
1700 cur += 32;
1701 continue;
1702 }
1703 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1704 zero_bit = cur;
1705 cur++;
1706 }
1707
1708 return zero_bit;
1709 }
1710
1711 void ext4_set_bits(void *bm, int cur, int len)
1712 {
1713 __u32 *addr;
1714
1715 len = cur + len;
1716 while (cur < len) {
1717 if ((cur & 31) == 0 && (len - cur) >= 32) {
1718 /* fast path: set whole word at once */
1719 addr = bm + (cur >> 3);
1720 *addr = 0xffffffff;
1721 cur += 32;
1722 continue;
1723 }
1724 mb_set_bit(cur, bm);
1725 cur++;
1726 }
1727 }
1728
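/*
 * Helper for mb_buddy_mark_free(): fix up one border of the range being
 * freed at the current buddy order.  @side is -1 for the left border and
 * +1 for the right one.  If the neighbouring buddy is busy (bit set),
 * the border chunk cannot merge with it, so its own bit is cleared (a
 * new free chunk at this order) and the range shrinks away from that
 * border; if the neighbour is free, it is absorbed: its bit is set and
 * the range is extended over it.  The return value (+1 or -1) is the
 * adjustment the caller applies to bb_counters[order].
 */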
1729 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1730 {
1731 if (mb_test_bit(*bit + side, bitmap)) {
1732 mb_clear_bit(*bit, bitmap);
1733 (*bit) -= side;
1734 return 1;
1735 }
1736 else {
1737 (*bit) += side;
1738 mb_set_bit(*bit, bitmap);
1739 return -1;
1740 }
1741 }
1742
1743 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1744 {
1745 int max;
1746 int order = 1;
1747 void *buddy = mb_find_buddy(e4b, order, &max);
1748
1749 while (buddy) {
1750 void *buddy2;
1751
1752 /* Bits in range [first; last] are known to be set since
1753 * corresponding blocks were allocated. Bits in range
1754 * (first; last) will stay set because they form buddies on
1755 * upper layer. We just deal with borders if they don't
1756 * align with upper layer and then go up.
1757 * Releasing entire group is all about clearing
1758 * single bit of highest order buddy.
1759 */
1760
1761 /* Example:
1762 * ---------------------------------
1763 * | 1 | 1 | 1 | 1 |
1764 * ---------------------------------
1765 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1766 * ---------------------------------
1767 * 0 1 2 3 4 5 6 7
1768 * \_____________________/
1769 *
1770 * Neither [1] nor [6] is aligned to above layer.
1771 * Left neighbour [0] is free, so mark it busy,
1772 * decrease bb_counters and extend range to
1773 * [0; 6]
1774 	 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1775 * mark [6] free, increase bb_counters and shrink range to
1776 * [0; 5].
1777 * Then shift range to [0; 2], go up and do the same.
1778 */
1779
1780
1781 if (first & 1)
1782 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1783 if (!(last & 1))
1784 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1785 if (first > last)
1786 break;
1787 order++;
1788
1789 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1790 mb_clear_bits(buddy, first, last - first + 1);
1791 e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1792 break;
1793 }
1794 first >>= 1;
1795 last >>= 1;
1796 buddy = buddy2;
1797 }
1798 }
1799
1800 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1801 int first, int count)
1802 {
1803 int left_is_free = 0;
1804 int right_is_free = 0;
1805 int block;
1806 int last = first + count - 1;
1807 struct super_block *sb = e4b->bd_sb;
1808
1809 if (WARN_ON(count == 0))
1810 return;
1811 BUG_ON(last >= (sb->s_blocksize << 3));
1812 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1813 /* Don't bother if the block group is corrupt. */
1814 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1815 return;
1816
1817 mb_check_buddy(e4b);
1818 mb_free_blocks_double(inode, e4b, first, count);
1819
1820 this_cpu_inc(discard_pa_seq);
1821 e4b->bd_info->bb_free += count;
1822 if (first < e4b->bd_info->bb_first_free)
1823 e4b->bd_info->bb_first_free = first;
1824
1825 /* access memory sequentially: check left neighbour,
1826 * clear range and then check right neighbour
1827 */
1828 if (first != 0)
1829 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1830 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1831 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1832 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1833
1834 if (unlikely(block != -1)) {
1835 struct ext4_sb_info *sbi = EXT4_SB(sb);
1836 ext4_fsblk_t blocknr;
1837
1838 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1839 blocknr += EXT4_C2B(sbi, block);
1840 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1841 ext4_grp_locked_error(sb, e4b->bd_group,
1842 inode ? inode->i_ino : 0,
1843 blocknr,
1844 "freeing already freed block (bit %u); block bitmap corrupt.",
1845 block);
1846 ext4_mark_group_bitmap_corrupted(
1847 sb, e4b->bd_group,
1848 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1849 }
1850 goto done;
1851 }
1852
1853 /* let's maintain fragments counter */
1854 if (left_is_free && right_is_free)
1855 e4b->bd_info->bb_fragments--;
1856 else if (!left_is_free && !right_is_free)
1857 e4b->bd_info->bb_fragments++;
1858
1859 /* buddy[0] == bd_bitmap is a special case, so handle
1860 * it right away and let mb_buddy_mark_free stay free of
1861 * zero order checks.
1862 	 * Check if neighbours are to be coalesced,
1863 * adjust bitmap bb_counters and borders appropriately.
1864 */
1865 if (first & 1) {
1866 first += !left_is_free;
1867 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1868 }
1869 if (!(last & 1)) {
1870 last -= !right_is_free;
1871 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1872 }
1873
1874 if (first <= last)
1875 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1876
1877 done:
1878 mb_set_largest_free_order(sb, e4b->bd_info);
1879 mb_update_avg_fragment_size(sb, e4b->bd_info);
1880 mb_check_buddy(e4b);
1881 }
1882
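/*
 * Find the free extent that contains @block (a group-relative cluster
 * number known to be free) and describe it in @ex.  The extent starts at
 * @block and is extended to the right, buddy order by buddy order, until
 * it is at least @needed clusters long or no further free buddies
 * follow.  Returns the resulting extent length (0 if @block is in fact
 * in use or corruption is detected).
 */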
1883 static int mb_find_extent(struct ext4_buddy *e4b, int block,
1884 int needed, struct ext4_free_extent *ex)
1885 {
1886 int next = block;
1887 int max, order;
1888 void *buddy;
1889
1890 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1891 BUG_ON(ex == NULL);
1892
1893 buddy = mb_find_buddy(e4b, 0, &max);
1894 BUG_ON(buddy == NULL);
1895 BUG_ON(block >= max);
1896 if (mb_test_bit(block, buddy)) {
1897 ex->fe_len = 0;
1898 ex->fe_start = 0;
1899 ex->fe_group = 0;
1900 return 0;
1901 }
1902
1903 /* find actual order */
1904 order = mb_find_order_for_block(e4b, block);
1905 block = block >> order;
1906
1907 ex->fe_len = 1 << order;
1908 ex->fe_start = block << order;
1909 ex->fe_group = e4b->bd_group;
1910
1911 /* calc difference from given start */
1912 next = next - ex->fe_start;
1913 ex->fe_len -= next;
1914 ex->fe_start += next;
1915
1916 while (needed > ex->fe_len &&
1917 mb_find_buddy(e4b, order, &max)) {
1918
1919 if (block + 1 >= max)
1920 break;
1921
1922 next = (block + 1) * (1 << order);
1923 if (mb_test_bit(next, e4b->bd_bitmap))
1924 break;
1925
1926 order = mb_find_order_for_block(e4b, next);
1927
1928 block = next >> order;
1929 ex->fe_len += 1 << order;
1930 }
1931
1932 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
1933 /* Should never happen! (but apparently sometimes does?!?) */
1934 WARN_ON(1);
1935 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
1936 "corruption or bug in mb_find_extent "
1937 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
1938 block, order, needed, ex->fe_group, ex->fe_start,
1939 ex->fe_len, ex->fe_logical);
1940 ex->fe_len = 0;
1941 ex->fe_start = 0;
1942 ex->fe_group = 0;
1943 }
1944 return ex->fe_len;
1945 }
1946
1947 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1948 {
1949 int ord;
1950 int mlen = 0;
1951 int max = 0;
1952 int cur;
1953 int start = ex->fe_start;
1954 int len = ex->fe_len;
1955 unsigned ret = 0;
1956 int len0 = len;
1957 void *buddy;
1958
1959 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1960 BUG_ON(e4b->bd_group != ex->fe_group);
1961 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1962 mb_check_buddy(e4b);
1963 mb_mark_used_double(e4b, start, len);
1964
1965 this_cpu_inc(discard_pa_seq);
1966 e4b->bd_info->bb_free -= len;
1967 if (e4b->bd_info->bb_first_free == start)
1968 e4b->bd_info->bb_first_free += len;
1969
1970 /* let's maintain fragments counter */
1971 if (start != 0)
1972 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1973 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1974 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1975 if (mlen && max)
1976 e4b->bd_info->bb_fragments++;
1977 else if (!mlen && !max)
1978 e4b->bd_info->bb_fragments--;
1979
1980 /* let's maintain buddy itself */
1981 while (len) {
1982 ord = mb_find_order_for_block(e4b, start);
1983
1984 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1985 /* the whole chunk may be allocated at once! */
1986 mlen = 1 << ord;
1987 buddy = mb_find_buddy(e4b, ord, &max);
1988 BUG_ON((start >> ord) >= max);
1989 mb_set_bit(start >> ord, buddy);
1990 e4b->bd_info->bb_counters[ord]--;
1991 start += mlen;
1992 len -= mlen;
1993 BUG_ON(len < 0);
1994 continue;
1995 }
1996
1997 /* store for history */
1998 if (ret == 0)
1999 ret = len | (ord << 16);
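		/*
		 * ret encodes the first split for history: the low 16 bits
		 * hold the tail length still to be allocated at that point
		 * and the upper bits the order of the buddy being split;
		 * ext4_mb_use_best_found() unpacks these into ac_tail and
		 * ac_buddy.
		 */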
2000
2001 /* we have to split large buddy */
2002 BUG_ON(ord <= 0);
2003 buddy = mb_find_buddy(e4b, ord, &max);
2004 mb_set_bit(start >> ord, buddy);
2005 e4b->bd_info->bb_counters[ord]--;
2006
2007 ord--;
2008 cur = (start >> ord) & ~1U;
2009 buddy = mb_find_buddy(e4b, ord, &max);
2010 mb_clear_bit(cur, buddy);
2011 mb_clear_bit(cur + 1, buddy);
2012 e4b->bd_info->bb_counters[ord]++;
2013 e4b->bd_info->bb_counters[ord]++;
2014 }
2015 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
2016
2017 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
2018 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
2019 mb_check_buddy(e4b);
2020
2021 return ret;
2022 }
2023
2024 /*
2025 * Must be called under group lock!
2026 */
2027 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2028 struct ext4_buddy *e4b)
2029 {
2030 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2031 int ret;
2032
2033 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
2034 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2035
2036 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
2037 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
2038 ret = mb_mark_used(e4b, &ac->ac_b_ex);
2039
2040 /* preallocation can change ac_b_ex, thus we store actually
2041 * allocated blocks for history */
2042 ac->ac_f_ex = ac->ac_b_ex;
2043
2044 ac->ac_status = AC_STATUS_FOUND;
2045 ac->ac_tail = ret & 0xffff;
2046 ac->ac_buddy = ret >> 16;
2047
2048 /*
2049 * take the page reference. We want the page to be pinned
2050 	 * so that we don't get an ext4_mb_init_cache() call for this
2051 * group until we update the bitmap. That would mean we
2052 * double allocate blocks. The reference is dropped
2053 * in ext4_mb_release_context
2054 */
2055 ac->ac_bitmap_page = e4b->bd_bitmap_page;
2056 get_page(ac->ac_bitmap_page);
2057 ac->ac_buddy_page = e4b->bd_buddy_page;
2058 get_page(ac->ac_buddy_page);
2059 /* store last allocated for subsequent stream allocation */
2060 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2061 spin_lock(&sbi->s_md_lock);
2062 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2063 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2064 spin_unlock(&sbi->s_md_lock);
2065 }
2066 /*
2067 * As we've just preallocated more space than
2068 * user requested originally, we store allocated
2069 * space in a special descriptor.
2070 */
2071 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2072 ext4_mb_new_preallocation(ac);
2073
2074 }
2075
2076 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
2077 struct ext4_buddy *e4b,
2078 int finish_group)
2079 {
2080 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2081 struct ext4_free_extent *bex = &ac->ac_b_ex;
2082 struct ext4_free_extent *gex = &ac->ac_g_ex;
2083 struct ext4_free_extent ex;
2084 int max;
2085
2086 if (ac->ac_status == AC_STATUS_FOUND)
2087 return;
2088 /*
2089 * We don't want to scan for a whole year
2090 */
2091 if (ac->ac_found > sbi->s_mb_max_to_scan &&
2092 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2093 ac->ac_status = AC_STATUS_BREAK;
2094 return;
2095 }
2096
2097 /*
2098 * Haven't found good chunk so far, let's continue
2099 */
2100 if (bex->fe_len < gex->fe_len)
2101 return;
2102
2103 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
2104 && bex->fe_group == e4b->bd_group) {
2105 /* recheck chunk's availability - we don't know
2106 * when it was found (within this lock-unlock
2107 * period or not) */
2108 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
2109 if (max >= gex->fe_len) {
2110 ext4_mb_use_best_found(ac, e4b);
2111 return;
2112 }
2113 }
2114 }
2115
2116 /*
2117  * The routine checks whether the found extent is good enough. If it is,
2118  * then the extent gets marked used and a flag is set in the context
2119  * to stop scanning. Otherwise, the extent is compared with the
2120  * previously found extent and, if the new one is better, it is stored
2121  * in the context. Later, the best found extent will be used if
2122  * mballoc can't find a good enough extent.
2123 *
2124 * FIXME: real allocation policy is to be designed yet!
2125 */
2126 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
2127 struct ext4_free_extent *ex,
2128 struct ext4_buddy *e4b)
2129 {
2130 struct ext4_free_extent *bex = &ac->ac_b_ex;
2131 struct ext4_free_extent *gex = &ac->ac_g_ex;
2132
2133 BUG_ON(ex->fe_len <= 0);
2134 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2135 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2136 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
2137
2138 ac->ac_found++;
2139
2140 /*
2141 * The special case - take what you catch first
2142 */
2143 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2144 *bex = *ex;
2145 ext4_mb_use_best_found(ac, e4b);
2146 return;
2147 }
2148
2149 /*
2150 	 * Let's check whether the chunk is good enough
2151 */
2152 if (ex->fe_len == gex->fe_len) {
2153 *bex = *ex;
2154 ext4_mb_use_best_found(ac, e4b);
2155 return;
2156 }
2157
2158 /*
2159 	 * If this is the first found extent, just store it in the context
2160 */
2161 if (bex->fe_len == 0) {
2162 *bex = *ex;
2163 return;
2164 }
2165
2166 /*
2167 * If new found extent is better, store it in the context
2168 */
2169 if (bex->fe_len < gex->fe_len) {
2170 /* if the request isn't satisfied, any found extent
2171 		 * larger than the previous best one is better */
2172 if (ex->fe_len > bex->fe_len)
2173 *bex = *ex;
2174 } else if (ex->fe_len > gex->fe_len) {
2175 /* if the request is satisfied, then we try to find
2176 		 * an extent that still satisfies the request, but is
2177 		 * smaller than the previous one */
2178 if (ex->fe_len < bex->fe_len)
2179 *bex = *ex;
2180 }
2181
2182 ext4_mb_check_limits(ac, e4b, 0);
2183 }
2184
2185 static noinline_for_stack
2186 int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
2187 struct ext4_buddy *e4b)
2188 {
2189 struct ext4_free_extent ex = ac->ac_b_ex;
2190 ext4_group_t group = ex.fe_group;
2191 int max;
2192 int err;
2193
2194 BUG_ON(ex.fe_len <= 0);
2195 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2196 if (err)
2197 return err;
2198
2199 ext4_lock_group(ac->ac_sb, group);
2200 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
2201
2202 if (max > 0) {
2203 ac->ac_b_ex = ex;
2204 ext4_mb_use_best_found(ac, e4b);
2205 }
2206
2207 ext4_unlock_group(ac->ac_sb, group);
2208 ext4_mb_unload_buddy(e4b);
2209
2210 return 0;
2211 }
2212
2213 static noinline_for_stack
2214 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
2215 struct ext4_buddy *e4b)
2216 {
2217 ext4_group_t group = ac->ac_g_ex.fe_group;
2218 int max;
2219 int err;
2220 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2221 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2222 struct ext4_free_extent ex;
2223
2224 if (!grp)
2225 return -EFSCORRUPTED;
2226 if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
2227 return 0;
2228 if (grp->bb_free == 0)
2229 return 0;
2230
2231 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2232 if (err)
2233 return err;
2234
2235 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
2236 ext4_mb_unload_buddy(e4b);
2237 return 0;
2238 }
2239
2240 ext4_lock_group(ac->ac_sb, group);
2241 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
2242 ac->ac_g_ex.fe_len, &ex);
2243 ex.fe_logical = 0xDEADFA11; /* debug value */
2244
2245 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
2246 ext4_fsblk_t start;
2247
2248 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
2249 ex.fe_start;
2250 /* use do_div to get remainder (would be 64-bit modulo) */
2251 if (do_div(start, sbi->s_stripe) == 0) {
2252 ac->ac_found++;
2253 ac->ac_b_ex = ex;
2254 ext4_mb_use_best_found(ac, e4b);
2255 }
2256 } else if (max >= ac->ac_g_ex.fe_len) {
2257 BUG_ON(ex.fe_len <= 0);
2258 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2259 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2260 ac->ac_found++;
2261 ac->ac_b_ex = ex;
2262 ext4_mb_use_best_found(ac, e4b);
2263 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
2264 		/* Sometimes, the caller may want to merge even a small
2265 		 * number of blocks into an existing extent */
2266 BUG_ON(ex.fe_len <= 0);
2267 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2268 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2269 ac->ac_found++;
2270 ac->ac_b_ex = ex;
2271 ext4_mb_use_best_found(ac, e4b);
2272 }
2273 ext4_unlock_group(ac->ac_sb, group);
2274 ext4_mb_unload_buddy(e4b);
2275
2276 return 0;
2277 }
2278
2279 /*
2280  * The routine scans buddy structures (not the bitmap!) from the given order
2281  * up to the max order and tries to find a big enough chunk to satisfy the request
2282 */
2283 static noinline_for_stack
2284 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
2285 struct ext4_buddy *e4b)
2286 {
2287 struct super_block *sb = ac->ac_sb;
2288 struct ext4_group_info *grp = e4b->bd_info;
2289 void *buddy;
2290 int i;
2291 int k;
2292 int max;
2293
2294 BUG_ON(ac->ac_2order <= 0);
2295 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
2296 if (grp->bb_counters[i] == 0)
2297 continue;
2298
2299 buddy = mb_find_buddy(e4b, i, &max);
2300 BUG_ON(buddy == NULL);
2301
2302 k = mb_find_next_zero_bit(buddy, max, 0);
2303 if (k >= max) {
2304 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
2305 "%d free clusters of order %d. But found 0",
2306 grp->bb_counters[i], i);
2307 ext4_mark_group_bitmap_corrupted(ac->ac_sb,
2308 e4b->bd_group,
2309 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2310 break;
2311 }
2312 ac->ac_found++;
2313
2314 ac->ac_b_ex.fe_len = 1 << i;
2315 ac->ac_b_ex.fe_start = k << i;
2316 ac->ac_b_ex.fe_group = e4b->bd_group;
2317
2318 ext4_mb_use_best_found(ac, e4b);
2319
2320 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
2321
2322 if (EXT4_SB(sb)->s_mb_stats)
2323 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2324
2325 break;
2326 }
2327 }
2328
2329 /*
2330 * The routine scans the group and measures all found extents.
2331  * In order to optimize scanning, the routine uses the group's free
2332  * block count (bb_free) as an upper limit on how much to scan.
2333 */
2334 static noinline_for_stack
2335 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
2336 struct ext4_buddy *e4b)
2337 {
2338 struct super_block *sb = ac->ac_sb;
2339 void *bitmap = e4b->bd_bitmap;
2340 struct ext4_free_extent ex;
2341 int i;
2342 int free;
2343
2344 free = e4b->bd_info->bb_free;
2345 if (WARN_ON(free <= 0))
2346 return;
2347
2348 i = e4b->bd_info->bb_first_free;
2349
2350 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
2351 i = mb_find_next_zero_bit(bitmap,
2352 EXT4_CLUSTERS_PER_GROUP(sb), i);
2353 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
2354 /*
2355 			 * If we have a corrupt bitmap, we won't find any
2356 			 * free blocks even though the group info says we
2357 			 * have free blocks
2358 */
2359 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2360 "%d free clusters as per "
2361 "group info. But bitmap says 0",
2362 free);
2363 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2364 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2365 break;
2366 }
2367
2368 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
2369 if (WARN_ON(ex.fe_len <= 0))
2370 break;
2371 if (free < ex.fe_len) {
2372 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2373 "%d free clusters as per "
2374 "group info. But got %d blocks",
2375 free, ex.fe_len);
2376 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2377 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2378 /*
2379 * The number of free blocks differs. This mostly
2380 			 * indicates that the bitmap is corrupt. So exit
2381 * without claiming the space.
2382 */
2383 break;
2384 }
2385 ex.fe_logical = 0xDEADC0DE; /* debug value */
2386 ext4_mb_measure_extent(ac, &ex, e4b);
2387
2388 i += ex.fe_len;
2389 free -= ex.fe_len;
2390 }
2391
2392 ext4_mb_check_limits(ac, e4b, 1);
2393 }
2394
2395 /*
2396  * This is a special case for storage like raid5:
2397 * we try to find stripe-aligned chunks for stripe-size-multiple requests
2398 */
2399 static noinline_for_stack
2400 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2401 struct ext4_buddy *e4b)
2402 {
2403 struct super_block *sb = ac->ac_sb;
2404 struct ext4_sb_info *sbi = EXT4_SB(sb);
2405 void *bitmap = e4b->bd_bitmap;
2406 struct ext4_free_extent ex;
2407 ext4_fsblk_t first_group_block;
2408 ext4_fsblk_t a;
2409 ext4_grpblk_t i;
2410 int max;
2411
2412 BUG_ON(sbi->s_stripe == 0);
2413
2414 /* find first stripe-aligned block in group */
2415 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2416
2417 a = first_group_block + sbi->s_stripe - 1;
2418 do_div(a, sbi->s_stripe);
2419 i = (a * sbi->s_stripe) - first_group_block;
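	/*
	 * Example (hypothetical numbers): with first_group_block = 8193
	 * and s_stripe = 16, a = 8208 / 16 = 513, so i = 513 * 16 - 8193
	 * = 15 is the first cluster in this group whose absolute block
	 * number is stripe aligned.
	 */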
2420
2421 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2422 if (!mb_test_bit(i, bitmap)) {
2423 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2424 if (max >= sbi->s_stripe) {
2425 ac->ac_found++;
2426 ex.fe_logical = 0xDEADF00D; /* debug value */
2427 ac->ac_b_ex = ex;
2428 ext4_mb_use_best_found(ac, e4b);
2429 break;
2430 }
2431 }
2432 i += sbi->s_stripe;
2433 }
2434 }
2435
2436 /*
2437 * This is also called BEFORE we load the buddy bitmap.
2438  * Returns true if the group is suitable for the allocation,
2439  * false otherwise.
2440 */
2441 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
2442 ext4_group_t group, int cr)
2443 {
2444 ext4_grpblk_t free, fragments;
2445 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2446 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2447
2448 BUG_ON(cr < 0 || cr >= 4);
2449
2450 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2451 return false;
2452
2453 free = grp->bb_free;
2454 if (free == 0)
2455 return false;
2456
2457 fragments = grp->bb_fragments;
2458 if (fragments == 0)
2459 return false;
2460
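	/*
	 * The criteria get progressively less picky: cr 0 only accepts
	 * groups whose buddy data promises a free chunk of the requested
	 * power-of-two order, cr 1 requires the average fragment size to
	 * cover the request, cr 2 merely needs enough free clusters in
	 * total, and cr 3 takes any group with free space.
	 */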
2461 switch (cr) {
2462 case 0:
2463 BUG_ON(ac->ac_2order == 0);
2464
2465 /* Avoid using the first bg of a flexgroup for data files */
2466 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2467 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2468 ((group % flex_size) == 0))
2469 return false;
2470
2471 if (free < ac->ac_g_ex.fe_len)
2472 return false;
2473
2474 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
2475 return true;
2476
2477 if (grp->bb_largest_free_order < ac->ac_2order)
2478 return false;
2479
2480 return true;
2481 case 1:
2482 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2483 return true;
2484 break;
2485 case 2:
2486 if (free >= ac->ac_g_ex.fe_len)
2487 return true;
2488 break;
2489 case 3:
2490 return true;
2491 default:
2492 BUG();
2493 }
2494
2495 return false;
2496 }
2497
2498 /*
2499  * This could return a negative error code if something goes wrong
2500 * during ext4_mb_init_group(). This should not be called with
2501 * ext4_lock_group() held.
2502 *
2503 * Note: because we are conditionally operating with the group lock in
2504 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
2505 * function using __acquire and __release. This means we need to be
2506 * super careful before messing with the error path handling via "goto
2507 * out"!
2508 */
2509 static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2510 ext4_group_t group, int cr)
2511 {
2512 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2513 struct super_block *sb = ac->ac_sb;
2514 struct ext4_sb_info *sbi = EXT4_SB(sb);
2515 bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2516 ext4_grpblk_t free;
2517 int ret = 0;
2518
2519 if (!grp)
2520 return -EFSCORRUPTED;
2521 if (sbi->s_mb_stats)
2522 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2523 if (should_lock) {
2524 ext4_lock_group(sb, group);
2525 __release(ext4_group_lock_ptr(sb, group));
2526 }
2527 free = grp->bb_free;
2528 if (free == 0)
2529 goto out;
2530 if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2531 goto out;
2532 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2533 goto out;
2534 if (should_lock) {
2535 __acquire(ext4_group_lock_ptr(sb, group));
2536 ext4_unlock_group(sb, group);
2537 }
2538
2539 /* We only do this if the grp has never been initialized */
2540 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2541 struct ext4_group_desc *gdp =
2542 ext4_get_group_desc(sb, group, NULL);
2543 int ret;
2544
2545 /* cr=0/1 is a very optimistic search to find large
2546 * good chunks almost for free. If buddy data is not
2547 * ready, then this optimization makes no sense. But
2548 * we never skip the first block group in a flex_bg,
2549 * since this gets used for metadata block allocation,
2550 * and we want to make sure we locate metadata blocks
2551 * in the first block group in the flex_bg if possible.
2552 */
2553 if (cr < 2 &&
2554 (!sbi->s_log_groups_per_flex ||
2555 ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2556 !(ext4_has_group_desc_csum(sb) &&
2557 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2558 return 0;
2559 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2560 if (ret)
2561 return ret;
2562 }
2563
2564 if (should_lock) {
2565 ext4_lock_group(sb, group);
2566 __release(ext4_group_lock_ptr(sb, group));
2567 }
2568 ret = ext4_mb_good_group(ac, group, cr);
2569 out:
2570 if (should_lock) {
2571 __acquire(ext4_group_lock_ptr(sb, group));
2572 ext4_unlock_group(sb, group);
2573 }
2574 return ret;
2575 }
2576
2577 /*
2578 * Start prefetching @nr block bitmaps starting at @group.
2579 * Return the next group which needs to be prefetched.
2580 */
2581 ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2582 unsigned int nr, int *cnt)
2583 {
2584 ext4_group_t ngroups = ext4_get_groups_count(sb);
2585 struct buffer_head *bh;
2586 struct blk_plug plug;
2587
2588 blk_start_plug(&plug);
2589 while (nr-- > 0) {
2590 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2591 NULL);
2592 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2593
2594 /*
2595 * Prefetch block groups with free blocks; but don't
2596 		 * bother if the group is marked uninitialized on disk, since
2597 		 * reading it won't require any I/O. Also only try to
2598 		 * prefetch once, so we avoid the getblk() call, which can
2599 * be expensive.
2600 */
2601 if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2602 EXT4_MB_GRP_NEED_INIT(grp) &&
2603 ext4_free_group_clusters(sb, gdp) > 0 &&
2604 !(ext4_has_group_desc_csum(sb) &&
2605 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2606 bh = ext4_read_block_bitmap_nowait(sb, group, true);
2607 if (bh && !IS_ERR(bh)) {
2608 if (!buffer_uptodate(bh) && cnt)
2609 (*cnt)++;
2610 brelse(bh);
2611 }
2612 }
2613 if (++group >= ngroups)
2614 group = 0;
2615 }
2616 blk_finish_plug(&plug);
2617 return group;
2618 }
2619
2620 /*
2621 * Prefetching reads the block bitmap into the buffer cache; but we
2622 * need to make sure that the buddy bitmap in the page cache has been
2623 * initialized. Note that ext4_mb_init_group() will block if the I/O
2624  * is not yet completed, or indeed if the I/O was never
2625  * initiated by ext4_mb_prefetch at all.
2626 *
2627 * TODO: We should actually kick off the buddy bitmap setup in a work
2628 * queue when the buffer I/O is completed, so that we don't block
2629 * waiting for the block allocation bitmap read to finish when
2630 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2631 */
2632 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2633 unsigned int nr)
2634 {
2635 	while (nr-- > 0) {
2636 		struct ext4_group_desc *gdp;
2637 		struct ext4_group_info *grp;
2638 
2639 		if (!group)
2640 			group = ext4_get_groups_count(sb);
2641 		group--;
2642 		gdp = ext4_get_group_desc(sb, group, NULL);
2643 		grp = ext4_get_group_info(sb, group);
2644
2645 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
2646 ext4_free_group_clusters(sb, gdp) > 0 &&
2647 !(ext4_has_group_desc_csum(sb) &&
2648 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2649 if (ext4_mb_init_group(sb, group, GFP_NOFS))
2650 break;
2651 }
2652 }
2653 }
2654
2655 static noinline_for_stack int
2656 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2657 {
2658 ext4_group_t prefetch_grp = 0, ngroups, group, i;
2659 int cr = -1, new_cr;
2660 int err = 0, first_err = 0;
2661 unsigned int nr = 0, prefetch_ios = 0;
2662 struct ext4_sb_info *sbi;
2663 struct super_block *sb;
2664 struct ext4_buddy e4b;
2665 int lost;
2666
2667 sb = ac->ac_sb;
2668 sbi = EXT4_SB(sb);
2669 ngroups = ext4_get_groups_count(sb);
2670 /* non-extent files are limited to low blocks/groups */
2671 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2672 ngroups = sbi->s_blockfile_groups;
2673
2674 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2675
2676 /* first, try the goal */
2677 err = ext4_mb_find_by_goal(ac, &e4b);
2678 if (err || ac->ac_status == AC_STATUS_FOUND)
2679 goto out;
2680
2681 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2682 goto out;
2683
2684 /*
2685 	 * ac->ac_2order is set only if the fe_len is a power of 2.
2686 	 * If ac->ac_2order is set, we also set the criteria to 0 so that we
2687 	 * try an exact allocation using the buddy data.
2688 */
2689 i = fls(ac->ac_g_ex.fe_len);
2690 ac->ac_2order = 0;
2691 /*
2692 * We search using buddy data only if the order of the request
2693 	 * is greater than or equal to sbi->s_mb_order2_reqs.
2694 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2695 * We also support searching for power-of-two requests only for
2696 	 * requests up to the maximum buddy size we have constructed.
2697 */
2698 if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
2699 /*
2700 		 * This should tell us if fe_len is exactly a power of 2
2701 */
2702 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2703 ac->ac_2order = array_index_nospec(i - 1,
2704 MB_NUM_ORDERS(sb));
2705 }
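	/*
	 * E.g. a goal length of 64 clusters gives i = fls(64) = 7;
	 * 64 & ~(1 << 6) == 0, so the request is a power of two and
	 * ac_2order becomes 6.  A length of 48 fails the test and is
	 * handled by the regular (non order-2) scan instead.
	 */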
2706
2707 /* if stream allocation is enabled, use global goal */
2708 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2709 /* TBD: may be hot point */
2710 spin_lock(&sbi->s_md_lock);
2711 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2712 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2713 spin_unlock(&sbi->s_md_lock);
2714 }
2715
2716 	/* Let's just scan groups to find more or less suitable blocks */
2717 cr = ac->ac_2order ? 0 : 1;
2718 /*
2719 * cr == 0 try to get exact allocation,
2720 * cr == 3 try to get anything
2721 */
2722 repeat:
2723 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2724 ac->ac_criteria = cr;
2725 /*
2726 * searching for the right group start
2727 * from the goal value specified
2728 */
2729 group = ac->ac_g_ex.fe_group;
2730 ac->ac_last_optimal_group = group;
2731 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
2732 prefetch_grp = group;
2733
2734 for (i = 0, new_cr = cr; i < ngroups; i++,
2735 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
2736 int ret = 0;
2737
2738 cond_resched();
2739 if (new_cr != cr) {
2740 cr = new_cr;
2741 goto repeat;
2742 }
2743
2744 /*
2745 * Batch reads of the block allocation bitmaps
2746 * to get multiple READs in flight; limit
2747 * prefetching at cr=0/1, otherwise mballoc can
2748 * spend a lot of time loading imperfect groups
2749 */
2750 if ((prefetch_grp == group) &&
2751 (cr > 1 ||
2752 prefetch_ios < sbi->s_mb_prefetch_limit)) {
2753 unsigned int curr_ios = prefetch_ios;
2754
2755 nr = sbi->s_mb_prefetch;
2756 if (ext4_has_feature_flex_bg(sb)) {
2757 nr = 1 << sbi->s_log_groups_per_flex;
2758 nr -= group & (nr - 1);
2759 nr = min(nr, sbi->s_mb_prefetch);
2760 }
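				/*
				 * E.g. with s_log_groups_per_flex = 4,
				 * nr starts at 16; for group 21 it is
				 * trimmed to 16 - (21 & 15) = 11 so the
				 * prefetch stops at the flex_bg boundary,
				 * and is then capped by s_mb_prefetch.
				 */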
2761 prefetch_grp = ext4_mb_prefetch(sb, group,
2762 nr, &prefetch_ios);
2763 if (prefetch_ios == curr_ios)
2764 nr = 0;
2765 }
2766
2767 /* This now checks without needing the buddy page */
2768 ret = ext4_mb_good_group_nolock(ac, group, cr);
2769 if (ret <= 0) {
2770 if (!first_err)
2771 first_err = ret;
2772 continue;
2773 }
2774
2775 err = ext4_mb_load_buddy(sb, group, &e4b);
2776 if (err)
2777 goto out;
2778
2779 ext4_lock_group(sb, group);
2780
2781 /*
2782 * We need to check again after locking the
2783 * block group
2784 */
2785 ret = ext4_mb_good_group(ac, group, cr);
2786 if (ret == 0) {
2787 ext4_unlock_group(sb, group);
2788 ext4_mb_unload_buddy(&e4b);
2789 continue;
2790 }
2791
2792 ac->ac_groups_scanned++;
2793 if (cr == 0)
2794 ext4_mb_simple_scan_group(ac, &e4b);
2795 else if (cr == 1 && sbi->s_stripe &&
2796 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2797 ext4_mb_scan_aligned(ac, &e4b);
2798 else
2799 ext4_mb_complex_scan_group(ac, &e4b);
2800
2801 ext4_unlock_group(sb, group);
2802 ext4_mb_unload_buddy(&e4b);
2803
2804 if (ac->ac_status != AC_STATUS_CONTINUE)
2805 break;
2806 }
2807 /* Processed all groups and haven't found blocks */
2808 if (sbi->s_mb_stats && i == ngroups)
2809 atomic64_inc(&sbi->s_bal_cX_failed[cr]);
2810 }
2811
2812 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2813 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2814 /*
2815 * We've been searching too long. Let's try to allocate
2816 * the best chunk we've found so far
2817 */
2818 ext4_mb_try_best_found(ac, &e4b);
2819 if (ac->ac_status != AC_STATUS_FOUND) {
2820 /*
2821 * Someone more lucky has already allocated it.
2822 * The only thing we can do is just take first
2823 * found block(s)
2824 */
2825 lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2826 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2827 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2828 ac->ac_b_ex.fe_len, lost);
2829
2830 ac->ac_b_ex.fe_group = 0;
2831 ac->ac_b_ex.fe_start = 0;
2832 ac->ac_b_ex.fe_len = 0;
2833 ac->ac_status = AC_STATUS_CONTINUE;
2834 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2835 cr = 3;
2836 goto repeat;
2837 }
2838 }
2839
2840 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2841 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
2842 out:
2843 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2844 err = first_err;
2845
2846 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2847 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2848 ac->ac_flags, cr, err);
2849
2850 if (nr)
2851 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2852
2853 return err;
2854 }
2855
2856 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2857 {
2858 struct super_block *sb = PDE_DATA(file_inode(seq->file));
2859 ext4_group_t group;
2860
2861 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2862 return NULL;
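	/* offset by one so that group 0 is not confused with the NULL
	 * end-of-sequence marker; ->show() undoes this with group-- */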
2863 group = *pos + 1;
2864 return (void *) ((unsigned long) group);
2865 }
2866
2867 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2868 {
2869 struct super_block *sb = PDE_DATA(file_inode(seq->file));
2870 ext4_group_t group;
2871
2872 ++*pos;
2873 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2874 return NULL;
2875 group = *pos + 1;
2876 return (void *) ((unsigned long) group);
2877 }
2878
2879 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2880 {
2881 struct super_block *sb = PDE_DATA(file_inode(seq->file));
2882 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2883 int i;
2884 int err, buddy_loaded = 0;
2885 struct ext4_buddy e4b;
2886 struct ext4_group_info *grinfo;
2887 unsigned char blocksize_bits = min_t(unsigned char,
2888 sb->s_blocksize_bits,
2889 EXT4_MAX_BLOCK_LOG_SIZE);
2890 struct sg {
2891 struct ext4_group_info info;
2892 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2893 } sg;
2894
2895 group--;
2896 if (group == 0)
2897 seq_puts(seq, "#group: free frags first ["
2898 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
2899 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
2900
2901 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2902 sizeof(struct ext4_group_info);
2903
2904 grinfo = ext4_get_group_info(sb, group);
2905 if (!grinfo)
2906 return 0;
2907 /* Load the group info in memory only if not already loaded. */
2908 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2909 err = ext4_mb_load_buddy(sb, group, &e4b);
2910 if (err) {
2911 seq_printf(seq, "#%-5u: I/O error\n", group);
2912 return 0;
2913 }
2914 buddy_loaded = 1;
2915 }
2916
2917 memcpy(&sg, grinfo, i);
2918
2919 if (buddy_loaded)
2920 ext4_mb_unload_buddy(&e4b);
2921
2922 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2923 sg.info.bb_fragments, sg.info.bb_first_free);
2924 for (i = 0; i <= 13; i++)
2925 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
2926 sg.info.bb_counters[i] : 0);
2927 seq_puts(seq, " ]\n");
2928
2929 return 0;
2930 }
2931
2932 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2933 {
2934 }
2935
2936 const struct seq_operations ext4_mb_seq_groups_ops = {
2937 .start = ext4_mb_seq_groups_start,
2938 .next = ext4_mb_seq_groups_next,
2939 .stop = ext4_mb_seq_groups_stop,
2940 .show = ext4_mb_seq_groups_show,
2941 };
2942
2943 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
2944 {
2945 struct super_block *sb = (struct super_block *)seq->private;
2946 struct ext4_sb_info *sbi = EXT4_SB(sb);
2947
2948 seq_puts(seq, "mballoc:\n");
2949 if (!sbi->s_mb_stats) {
2950 seq_puts(seq, "\tmb stats collection turned off.\n");
2951 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
2952 return 0;
2953 }
2954 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
2955 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
2956
2957 seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
2958
2959 seq_puts(seq, "\tcr0_stats:\n");
2960 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
2961 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2962 atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
2963 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2964 atomic64_read(&sbi->s_bal_cX_failed[0]));
2965 seq_printf(seq, "\t\tbad_suggestions: %u\n",
2966 atomic_read(&sbi->s_bal_cr0_bad_suggestions));
2967
2968 seq_puts(seq, "\tcr1_stats:\n");
2969 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
2970 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2971 atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
2972 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2973 atomic64_read(&sbi->s_bal_cX_failed[1]));
2974 seq_printf(seq, "\t\tbad_suggestions: %u\n",
2975 atomic_read(&sbi->s_bal_cr1_bad_suggestions));
2976
2977 seq_puts(seq, "\tcr2_stats:\n");
2978 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
2979 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2980 atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
2981 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2982 atomic64_read(&sbi->s_bal_cX_failed[2]));
2983
2984 seq_puts(seq, "\tcr3_stats:\n");
2985 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
2986 seq_printf(seq, "\t\tgroups_considered: %llu\n",
2987 atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
2988 seq_printf(seq, "\t\tuseless_loops: %llu\n",
2989 atomic64_read(&sbi->s_bal_cX_failed[3]));
2990 seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
2991 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
2992 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
2993 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
2994 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
2995
2996 seq_printf(seq, "\tbuddies_generated: %u/%u\n",
2997 atomic_read(&sbi->s_mb_buddies_generated),
2998 ext4_get_groups_count(sb));
2999 seq_printf(seq, "\tbuddies_time_used: %llu\n",
3000 atomic64_read(&sbi->s_mb_generation_time));
3001 seq_printf(seq, "\tpreallocated: %u\n",
3002 atomic_read(&sbi->s_mb_preallocated));
3003 seq_printf(seq, "\tdiscarded: %u\n",
3004 atomic_read(&sbi->s_mb_discarded));
3005 return 0;
3006 }
3007
3008 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
3009 __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
3010 {
3011 struct super_block *sb = PDE_DATA(file_inode(seq->file));
3012 unsigned long position;
3013
3014 read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
3015
3016 if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
3017 return NULL;
3018 position = *pos + 1;
3019 return (void *) ((unsigned long) position);
3020 }
3021
3022 static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
3023 {
3024 struct super_block *sb = PDE_DATA(file_inode(seq->file));
3025 unsigned long position;
3026
3027 ++*pos;
3028 if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
3029 return NULL;
3030 position = *pos + 1;
3031 return (void *) ((unsigned long) position);
3032 }
3033
3034 static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
3035 {
3036 struct super_block *sb = PDE_DATA(file_inode(seq->file));
3037 struct ext4_sb_info *sbi = EXT4_SB(sb);
3038 unsigned long position = ((unsigned long) v);
3039 struct ext4_group_info *grp;
3040 struct rb_node *n;
3041 unsigned int count, min, max;
3042
3043 position--;
3044 if (position >= MB_NUM_ORDERS(sb)) {
3045 seq_puts(seq, "fragment_size_tree:\n");
3046 n = rb_first(&sbi->s_mb_avg_fragment_size_root);
3047 if (!n) {
3048 seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
3049 return 0;
3050 }
3051 grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
3052 min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
3053 count = 1;
3054 while (rb_next(n)) {
3055 count++;
3056 n = rb_next(n);
3057 }
3058 grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
3059 max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
3060
3061 seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
3062 min, max, count);
3063 return 0;
3064 }
3065
3066 if (position == 0) {
3067 seq_printf(seq, "optimize_scan: %d\n",
3068 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
3069 seq_puts(seq, "max_free_order_lists:\n");
3070 }
3071 count = 0;
3072 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
3073 bb_largest_free_order_node)
3074 count++;
3075 seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3076 (unsigned int)position, count);
3077
3078 return 0;
3079 }
3080
3081 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
3082 __releases(&EXT4_SB(sb)->s_mb_rb_lock)
3083 {
3084 struct super_block *sb = PDE_DATA(file_inode(seq->file));
3085
3086 read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
3087 }
3088
3089 const struct seq_operations ext4_mb_seq_structs_summary_ops = {
3090 .start = ext4_mb_seq_structs_summary_start,
3091 .next = ext4_mb_seq_structs_summary_next,
3092 .stop = ext4_mb_seq_structs_summary_stop,
3093 .show = ext4_mb_seq_structs_summary_show,
3094 };
3095
3096 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
3097 {
3098 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3099 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
3100
3101 BUG_ON(!cachep);
3102 return cachep;
3103 }
3104
3105 /*
3106 * Allocate the top-level s_group_info array for the specified number
3107 * of groups
3108 */
3109 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
3110 {
3111 struct ext4_sb_info *sbi = EXT4_SB(sb);
3112 unsigned size;
3113 struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
3114
3115 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
3116 EXT4_DESC_PER_BLOCK_BITS(sb);
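	/*
	 * size is now the number of meta-group pointer blocks needed:
	 * e.g. (assuming 4 KiB blocks and 32-byte group descriptors, so
	 * EXT4_DESC_PER_BLOCK == 128) a filesystem with 1000 groups needs
	 * DIV_ROUND_UP(1000, 128) = 8 entries.
	 */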
3117 if (size <= sbi->s_group_info_size)
3118 return 0;
3119
3120 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
3121 new_groupinfo = kvzalloc(size, GFP_KERNEL);
3122 if (!new_groupinfo) {
3123 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
3124 return -ENOMEM;
3125 }
3126 rcu_read_lock();
3127 old_groupinfo = rcu_dereference(sbi->s_group_info);
3128 if (old_groupinfo)
3129 memcpy(new_groupinfo, old_groupinfo,
3130 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
3131 rcu_read_unlock();
3132 rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
3133 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
3134 if (old_groupinfo)
3135 ext4_kvfree_array_rcu(old_groupinfo);
3136 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
3137 sbi->s_group_info_size);
3138 return 0;
3139 }
3140
3141 /* Create and initialize ext4_group_info data for the given group. */
3142 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
3143 struct ext4_group_desc *desc)
3144 {
3145 int i;
3146 int metalen = 0;
3147 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
3148 struct ext4_sb_info *sbi = EXT4_SB(sb);
3149 struct ext4_group_info **meta_group_info;
3150 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3151
3152 /*
3153 	 * First check if this group is the first of a group-descriptor block
3154 	 * (meta group). If so, we have to allocate a new table of pointers
3155 	 * to ext4_group_info structures
3156 */
3157 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3158 metalen = sizeof(*meta_group_info) <<
3159 EXT4_DESC_PER_BLOCK_BITS(sb);
3160 meta_group_info = kmalloc(metalen, GFP_NOFS);
3161 if (meta_group_info == NULL) {
3162 ext4_msg(sb, KERN_ERR, "can't allocate mem "
3163 "for a buddy group");
3164 goto exit_meta_group_info;
3165 }
3166 rcu_read_lock();
3167 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
3168 rcu_read_unlock();
3169 }
3170
3171 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
3172 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
3173
3174 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
3175 if (meta_group_info[i] == NULL) {
3176 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
3177 goto exit_group_info;
3178 }
3179 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
3180 &(meta_group_info[i]->bb_state));
3181
3182 /*
3183 * initialize bb_free to be able to skip
3184 * empty groups without initialization
3185 */
3186 if (ext4_has_group_desc_csum(sb) &&
3187 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3188 meta_group_info[i]->bb_free =
3189 ext4_free_clusters_after_init(sb, group, desc);
3190 } else {
3191 meta_group_info[i]->bb_free =
3192 ext4_free_group_clusters(sb, desc);
3193 }
3194
3195 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
3196 init_rwsem(&meta_group_info[i]->alloc_sem);
3197 meta_group_info[i]->bb_free_root = RB_ROOT;
3198 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
3199 RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
3200 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
3201 meta_group_info[i]->bb_group = group;
3202
3203 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
3204 return 0;
3205
3206 exit_group_info:
3207 /* If a meta_group_info table has been allocated, release it now */
3208 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3209 struct ext4_group_info ***group_info;
3210
3211 rcu_read_lock();
3212 group_info = rcu_dereference(sbi->s_group_info);
3213 kfree(group_info[idx]);
3214 group_info[idx] = NULL;
3215 rcu_read_unlock();
3216 }
3217 exit_meta_group_info:
3218 return -ENOMEM;
3219 } /* ext4_mb_add_groupinfo */
3220
3221 static int ext4_mb_init_backend(struct super_block *sb)
3222 {
3223 ext4_group_t ngroups = ext4_get_groups_count(sb);
3224 ext4_group_t i;
3225 struct ext4_sb_info *sbi = EXT4_SB(sb);
3226 int err;
3227 struct ext4_group_desc *desc;
3228 struct ext4_group_info ***group_info;
3229 struct kmem_cache *cachep;
3230
3231 err = ext4_mb_alloc_groupinfo(sb, ngroups);
3232 if (err)
3233 return err;
3234
3235 sbi->s_buddy_cache = new_inode(sb);
3236 if (sbi->s_buddy_cache == NULL) {
3237 ext4_msg(sb, KERN_ERR, "can't get new inode");
3238 goto err_freesgi;
3239 }
3240 	/* To avoid potentially colliding with a valid on-disk inode number,
3241 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
3242 * not in the inode hash, so it should never be found by iget(), but
3243 * this will avoid confusion if it ever shows up during debugging. */
3244 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
3245 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
3246 for (i = 0; i < ngroups; i++) {
3247 cond_resched();
3248 desc = ext4_get_group_desc(sb, i, NULL);
3249 if (desc == NULL) {
3250 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
3251 goto err_freebuddy;
3252 }
3253 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
3254 goto err_freebuddy;
3255 }
3256
3257 if (ext4_has_feature_flex_bg(sb)) {
3258 /* a single flex group is supposed to be read by a single IO.
3259 		 * 2 ^ s_log_groups_per_flex must fit in s_mb_prefetch, which is
3260 		 * an unsigned integer, so shifts of 32 or more are rejected below.
3261 */
3262 if (sbi->s_es->s_log_groups_per_flex >= 32) {
3263 ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
3264 goto err_freebuddy;
3265 }
3266 sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
3267 BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
3268 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
3269 } else {
3270 sbi->s_mb_prefetch = 32;
3271 }
3272 if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
3273 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
3274 	/* how many real IOs to prefetch within a single allocation at cr=0;
3275 	 * given that cr=0 is a CPU-related optimization we shouldn't try to
3276 * load too many groups, at some point we should start to use what
3277 * we've got in memory.
3278 	 * with an average random access time of 5 ms, it'd take a second to get
3279 * 200 groups (* N with flex_bg), so let's make this limit 4
3280 */
3281 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
3282 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
3283 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
3284
3285 return 0;
3286
3287 err_freebuddy:
3288 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3289 while (i-- > 0) {
3290 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3291
3292 if (grp)
3293 kmem_cache_free(cachep, grp);
3294 }
3295 i = sbi->s_group_info_size;
3296 rcu_read_lock();
3297 group_info = rcu_dereference(sbi->s_group_info);
3298 while (i-- > 0)
3299 kfree(group_info[i]);
3300 rcu_read_unlock();
3301 iput(sbi->s_buddy_cache);
3302 err_freesgi:
3303 rcu_read_lock();
3304 kvfree(rcu_dereference(sbi->s_group_info));
3305 rcu_read_unlock();
3306 return -ENOMEM;
3307 }
3308
3309 static void ext4_groupinfo_destroy_slabs(void)
3310 {
3311 int i;
3312
3313 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
3314 kmem_cache_destroy(ext4_groupinfo_caches[i]);
3315 ext4_groupinfo_caches[i] = NULL;
3316 }
3317 }
3318
3319 static int ext4_groupinfo_create_slab(size_t size)
3320 {
3321 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
3322 int slab_size;
3323 int blocksize_bits = order_base_2(size);
3324 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3325 struct kmem_cache *cachep;
3326
3327 if (cache_index >= NR_GRPINFO_CACHES)
3328 return -EINVAL;
3329
3330 if (unlikely(cache_index < 0))
3331 cache_index = 0;
3332
3333 mutex_lock(&ext4_grpinfo_slab_create_mutex);
3334 if (ext4_groupinfo_caches[cache_index]) {
3335 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3336 return 0; /* Already created */
3337 }
3338
3339 slab_size = offsetof(struct ext4_group_info,
3340 bb_counters[blocksize_bits + 2]);
3341
3342 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
3343 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
3344 NULL);
3345
3346 ext4_groupinfo_caches[cache_index] = cachep;
3347
3348 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3349 if (!cachep) {
3350 printk(KERN_EMERG
3351 "EXT4-fs: no memory for groupinfo slab cache\n");
3352 return -ENOMEM;
3353 }
3354
3355 return 0;
3356 }
3357
3358 static void ext4_discard_work(struct work_struct *work)
3359 {
3360 struct ext4_sb_info *sbi = container_of(work,
3361 struct ext4_sb_info, s_discard_work);
3362 struct super_block *sb = sbi->s_sb;
3363 struct ext4_free_data *fd, *nfd;
3364 struct ext4_buddy e4b;
3365 struct list_head discard_list;
3366 ext4_group_t grp, load_grp;
3367 int err = 0;
3368
3369 INIT_LIST_HEAD(&discard_list);
3370 spin_lock(&sbi->s_md_lock);
3371 list_splice_init(&sbi->s_discard_list, &discard_list);
3372 spin_unlock(&sbi->s_md_lock);
3373
3374 load_grp = UINT_MAX;
3375 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
3376 /*
3377 		 * If the filesystem is unmounting, we are out of memory, or we
3378 		 * are suffering from lack of space, give up on the discard
3379 */
3380 if ((sb->s_flags & SB_ACTIVE) && !err &&
3381 !atomic_read(&sbi->s_retry_alloc_pending)) {
3382 grp = fd->efd_group;
3383 if (grp != load_grp) {
3384 if (load_grp != UINT_MAX)
3385 ext4_mb_unload_buddy(&e4b);
3386
3387 err = ext4_mb_load_buddy(sb, grp, &e4b);
3388 if (err) {
3389 kmem_cache_free(ext4_free_data_cachep, fd);
3390 load_grp = UINT_MAX;
3391 continue;
3392 } else {
3393 load_grp = grp;
3394 }
3395 }
3396
3397 ext4_lock_group(sb, grp);
3398 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
3399 fd->efd_start_cluster + fd->efd_count - 1, 1);
3400 ext4_unlock_group(sb, grp);
3401 }
3402 kmem_cache_free(ext4_free_data_cachep, fd);
3403 }
3404
3405 if (load_grp != UINT_MAX)
3406 ext4_mb_unload_buddy(&e4b);
3407 }
3408
3409 int ext4_mb_init(struct super_block *sb)
3410 {
3411 struct ext4_sb_info *sbi = EXT4_SB(sb);
3412 unsigned i, j;
3413 unsigned offset, offset_incr;
3414 unsigned max;
3415 int ret;
3416
3417 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
3418
3419 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
3420 if (sbi->s_mb_offsets == NULL) {
3421 ret = -ENOMEM;
3422 goto out;
3423 }
3424
3425 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
3426 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
3427 if (sbi->s_mb_maxs == NULL) {
3428 ret = -ENOMEM;
3429 goto out;
3430 }
3431
3432 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
3433 if (ret < 0)
3434 goto out;
3435
3436 /* order 0 is regular bitmap */
3437 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
3438 sbi->s_mb_offsets[0] = 0;
3439
3440 i = 1;
3441 offset = 0;
3442 offset_incr = 1 << (sb->s_blocksize_bits - 1);
3443 max = sb->s_blocksize << 2;
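	/*
	 * For example, with a 4k block size (s_blocksize_bits == 12) the
	 * order-0 "buddy" is the regular bitmap of 32768 bits; within the
	 * buddy block, order 1 starts at bit offset 0 with 16384 usable
	 * bits, order 2 at offset 2048 with 8192 bits, order 3 at 3072
	 * with 4096 bits, and so on: each step halves both the offset
	 * increment and the bit count.
	 */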
3444 do {
3445 sbi->s_mb_offsets[i] = offset;
3446 sbi->s_mb_maxs[i] = max;
3447 offset += offset_incr;
3448 offset_incr = offset_incr >> 1;
3449 max = max >> 1;
3450 i++;
3451 } while (i < MB_NUM_ORDERS(sb));
3452
3453 sbi->s_mb_avg_fragment_size_root = RB_ROOT;
3454 sbi->s_mb_largest_free_orders =
3455 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3456 GFP_KERNEL);
3457 if (!sbi->s_mb_largest_free_orders) {
3458 ret = -ENOMEM;
3459 goto out;
3460 }
3461 sbi->s_mb_largest_free_orders_locks =
3462 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3463 GFP_KERNEL);
3464 if (!sbi->s_mb_largest_free_orders_locks) {
3465 ret = -ENOMEM;
3466 goto out;
3467 }
3468 for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3469 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
3470 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
3471 }
3472 rwlock_init(&sbi->s_mb_rb_lock);
3473
3474 spin_lock_init(&sbi->s_md_lock);
3475 sbi->s_mb_free_pending = 0;
3476 INIT_LIST_HEAD(&sbi->s_freed_data_list);
3477 INIT_LIST_HEAD(&sbi->s_discard_list);
3478 INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
3479 atomic_set(&sbi->s_retry_alloc_pending, 0);
3480
3481 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
3482 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
3483 sbi->s_mb_stats = MB_DEFAULT_STATS;
3484 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
3485 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
3486 sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
3487 /*
3488 * The default group preallocation is 512, which for 4k block
3489 * sizes translates to 2 megabytes. However for bigalloc file
3490 	 * systems, this is probably too big (i.e., if the cluster size
3491 * is 1 megabyte, then group preallocation size becomes half a
3492 * gigabyte!). As a default, we will keep a two megabyte
3493 	 * group prealloc size for cluster sizes up to 64k, and after
3494 * that, we will force a minimum group preallocation size of
3495 * 32 clusters. This translates to 8 megs when the cluster
3496 * size is 256k, and 32 megs when the cluster size is 1 meg,
3497 * which seems reasonable as a default.
3498 */
3499 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
3500 sbi->s_cluster_bits, 32);
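	/*
	 * For example, with 4k blocks and a 64k cluster (s_cluster_bits == 4)
	 * this yields 512 >> 4 == 32 clusters, i.e. the 2MB group prealloc
	 * described above; larger cluster sizes hit the 32-cluster floor.
	 */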
3501 /*
3502 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
3503 * to the lowest multiple of s_stripe which is bigger than
3504 * the s_mb_group_prealloc as determined above. We want
3505 * the preallocation size to be an exact multiple of the
3506 * RAID stripe size so that preallocations don't fragment
3507 * the stripes.
3508 */
3509 if (sbi->s_stripe > 1) {
3510 sbi->s_mb_group_prealloc = roundup(
3511 sbi->s_mb_group_prealloc, sbi->s_stripe);
3512 }
3513
3514 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
3515 if (sbi->s_locality_groups == NULL) {
3516 ret = -ENOMEM;
3517 goto out;
3518 }
3519 for_each_possible_cpu(i) {
3520 struct ext4_locality_group *lg;
3521 lg = per_cpu_ptr(sbi->s_locality_groups, i);
3522 mutex_init(&lg->lg_mutex);
3523 for (j = 0; j < PREALLOC_TB_SIZE; j++)
3524 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
3525 spin_lock_init(&lg->lg_prealloc_lock);
3526 }
3527
3528 if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
3529 sbi->s_mb_max_linear_groups = 0;
3530 else
3531 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
3532 /* init file for buddy data */
3533 ret = ext4_mb_init_backend(sb);
3534 if (ret != 0)
3535 goto out_free_locality_groups;
3536
3537 return 0;
3538
3539 out_free_locality_groups:
3540 free_percpu(sbi->s_locality_groups);
3541 sbi->s_locality_groups = NULL;
3542 out:
3543 kfree(sbi->s_mb_largest_free_orders);
3544 kfree(sbi->s_mb_largest_free_orders_locks);
3545 kfree(sbi->s_mb_offsets);
3546 sbi->s_mb_offsets = NULL;
3547 kfree(sbi->s_mb_maxs);
3548 sbi->s_mb_maxs = NULL;
3549 return ret;
3550 }
3551
3552 /* needs to be called with the ext4 group lock held */
3553 static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
3554 {
3555 struct ext4_prealloc_space *pa;
3556 struct list_head *cur, *tmp;
3557 int count = 0;
3558
3559 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
3560 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3561 list_del(&pa->pa_group_list);
3562 count++;
3563 kmem_cache_free(ext4_pspace_cachep, pa);
3564 }
3565 return count;
3566 }
3567
3568 int ext4_mb_release(struct super_block *sb)
3569 {
3570 ext4_group_t ngroups = ext4_get_groups_count(sb);
3571 ext4_group_t i;
3572 int num_meta_group_infos;
3573 struct ext4_group_info *grinfo, ***group_info;
3574 struct ext4_sb_info *sbi = EXT4_SB(sb);
3575 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3576 int count;
3577
3578 if (test_opt(sb, DISCARD)) {
3579 /*
3580 		 * wait for the discard work to drain all of the ext4_free_data entries
3581 */
3582 flush_work(&sbi->s_discard_work);
3583 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
3584 }
3585
3586 if (sbi->s_group_info) {
3587 for (i = 0; i < ngroups; i++) {
3588 cond_resched();
3589 grinfo = ext4_get_group_info(sb, i);
3590 if (!grinfo)
3591 continue;
3592 mb_group_bb_bitmap_free(grinfo);
3593 ext4_lock_group(sb, i);
3594 count = ext4_mb_cleanup_pa(grinfo);
3595 if (count)
3596 mb_debug(sb, "mballoc: %d PAs left\n",
3597 count);
3598 ext4_unlock_group(sb, i);
3599 kmem_cache_free(cachep, grinfo);
3600 }
3601 num_meta_group_infos = (ngroups +
3602 EXT4_DESC_PER_BLOCK(sb) - 1) >>
3603 EXT4_DESC_PER_BLOCK_BITS(sb);
3604 rcu_read_lock();
3605 group_info = rcu_dereference(sbi->s_group_info);
3606 for (i = 0; i < num_meta_group_infos; i++)
3607 kfree(group_info[i]);
3608 kvfree(group_info);
3609 rcu_read_unlock();
3610 }
3611 kfree(sbi->s_mb_largest_free_orders);
3612 kfree(sbi->s_mb_largest_free_orders_locks);
3613 kfree(sbi->s_mb_offsets);
3614 kfree(sbi->s_mb_maxs);
3615 iput(sbi->s_buddy_cache);
3616 if (sbi->s_mb_stats) {
3617 ext4_msg(sb, KERN_INFO,
3618 "mballoc: %u blocks %u reqs (%u success)",
3619 atomic_read(&sbi->s_bal_allocated),
3620 atomic_read(&sbi->s_bal_reqs),
3621 atomic_read(&sbi->s_bal_success));
3622 ext4_msg(sb, KERN_INFO,
3623 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
3624 "%u 2^N hits, %u breaks, %u lost",
3625 atomic_read(&sbi->s_bal_ex_scanned),
3626 atomic_read(&sbi->s_bal_groups_scanned),
3627 atomic_read(&sbi->s_bal_goals),
3628 atomic_read(&sbi->s_bal_2orders),
3629 atomic_read(&sbi->s_bal_breaks),
3630 atomic_read(&sbi->s_mb_lost_chunks));
3631 ext4_msg(sb, KERN_INFO,
3632 "mballoc: %u generated and it took %llu",
3633 atomic_read(&sbi->s_mb_buddies_generated),
3634 atomic64_read(&sbi->s_mb_generation_time));
3635 ext4_msg(sb, KERN_INFO,
3636 "mballoc: %u preallocated, %u discarded",
3637 atomic_read(&sbi->s_mb_preallocated),
3638 atomic_read(&sbi->s_mb_discarded));
3639 }
3640
3641 free_percpu(sbi->s_locality_groups);
3642
3643 return 0;
3644 }
3645
3646 static inline int ext4_issue_discard(struct super_block *sb,
3647 ext4_group_t block_group, ext4_grpblk_t cluster, int count,
3648 struct bio **biop)
3649 {
3650 ext4_fsblk_t discard_block;
3651
3652 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
3653 ext4_group_first_block_no(sb, block_group));
3654 count = EXT4_C2B(EXT4_SB(sb), count);
3655 trace_ext4_discard_blocks(sb,
3656 (unsigned long long) discard_block, count);
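	/*
	 * Shifting by (s_blocksize_bits - 9) below converts filesystem
	 * blocks to 512-byte sectors, e.g. one 4k block becomes 8 sectors.
	 */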
3657 if (biop) {
3658 return __blkdev_issue_discard(sb->s_bdev,
3659 (sector_t)discard_block << (sb->s_blocksize_bits - 9),
3660 (sector_t)count << (sb->s_blocksize_bits - 9),
3661 GFP_NOFS, 0, biop);
3662 } else
3663 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
3664 }
3665
3666 static void ext4_free_data_in_buddy(struct super_block *sb,
3667 struct ext4_free_data *entry)
3668 {
3669 struct ext4_buddy e4b;
3670 struct ext4_group_info *db;
3671 int err, count = 0, count2 = 0;
3672
3673 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
3674 entry->efd_count, entry->efd_group, entry);
3675
3676 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
3677 /* we expect to find existing buddy because it's pinned */
3678 BUG_ON(err != 0);
3679
3680 spin_lock(&EXT4_SB(sb)->s_md_lock);
3681 EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
3682 spin_unlock(&EXT4_SB(sb)->s_md_lock);
3683
3684 db = e4b.bd_info;
3685 /* there are blocks to put in buddy to make them really free */
3686 count += entry->efd_count;
3687 count2++;
3688 ext4_lock_group(sb, entry->efd_group);
3689 /* Take it out of per group rb tree */
3690 rb_erase(&entry->efd_node, &(db->bb_free_root));
3691 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
3692
3693 /*
3694 * Clear the trimmed flag for the group so that the next
3695 * ext4_trim_fs can trim it.
3696 * If the volume is mounted with -o discard, online discard
3697 * is supported and the free blocks will be trimmed online.
3698 */
3699 if (!test_opt(sb, DISCARD))
3700 EXT4_MB_GRP_CLEAR_TRIMMED(db);
3701
3702 if (!db->bb_free_root.rb_node) {
3703 /* No more items in the per group rb tree
3704 * balance refcounts from ext4_mb_free_metadata()
3705 */
3706 put_page(e4b.bd_buddy_page);
3707 put_page(e4b.bd_bitmap_page);
3708 }
3709 ext4_unlock_group(sb, entry->efd_group);
3710 ext4_mb_unload_buddy(&e4b);
3711
3712 mb_debug(sb, "freed %d blocks in %d structures\n", count,
3713 count2);
3714 }
3715
3716 /*
3717 * This function is called by the jbd2 layer once the commit has finished,
3718 * so we know we can free the blocks that were released with that commit.
3719 */
3720 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
3721 {
3722 struct ext4_sb_info *sbi = EXT4_SB(sb);
3723 struct ext4_free_data *entry, *tmp;
3724 struct list_head freed_data_list;
3725 struct list_head *cut_pos = NULL;
3726 bool wake;
3727
3728 INIT_LIST_HEAD(&freed_data_list);
3729
3730 spin_lock(&sbi->s_md_lock);
3731 list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
3732 if (entry->efd_tid != commit_tid)
3733 break;
3734 cut_pos = &entry->efd_list;
3735 }
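	/*
	 * s_freed_data_list is ordered by commit tid, so the loop above
	 * stops at the first entry belonging to a later commit; everything
	 * up to and including cut_pos can be spliced off in one go.
	 */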
3736 if (cut_pos)
3737 list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
3738 cut_pos);
3739 spin_unlock(&sbi->s_md_lock);
3740
3741 list_for_each_entry(entry, &freed_data_list, efd_list)
3742 ext4_free_data_in_buddy(sb, entry);
3743
3744 if (test_opt(sb, DISCARD)) {
3745 spin_lock(&sbi->s_md_lock);
3746 wake = list_empty(&sbi->s_discard_list);
3747 list_splice_tail(&freed_data_list, &sbi->s_discard_list);
3748 spin_unlock(&sbi->s_md_lock);
3749 if (wake)
3750 queue_work(system_unbound_wq, &sbi->s_discard_work);
3751 } else {
3752 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
3753 kmem_cache_free(ext4_free_data_cachep, entry);
3754 }
3755 }
3756
3757 int __init ext4_init_mballoc(void)
3758 {
3759 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
3760 SLAB_RECLAIM_ACCOUNT);
3761 if (ext4_pspace_cachep == NULL)
3762 goto out;
3763
3764 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
3765 SLAB_RECLAIM_ACCOUNT);
3766 if (ext4_ac_cachep == NULL)
3767 goto out_pa_free;
3768
3769 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
3770 SLAB_RECLAIM_ACCOUNT);
3771 if (ext4_free_data_cachep == NULL)
3772 goto out_ac_free;
3773
3774 return 0;
3775
3776 out_ac_free:
3777 kmem_cache_destroy(ext4_ac_cachep);
3778 out_pa_free:
3779 kmem_cache_destroy(ext4_pspace_cachep);
3780 out:
3781 return -ENOMEM;
3782 }
3783
3784 void ext4_exit_mballoc(void)
3785 {
3786 /*
3787 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3788 * before destroying the slab cache.
3789 */
3790 rcu_barrier();
3791 kmem_cache_destroy(ext4_pspace_cachep);
3792 kmem_cache_destroy(ext4_ac_cachep);
3793 kmem_cache_destroy(ext4_free_data_cachep);
3794 ext4_groupinfo_destroy_slabs();
3795 }
3796
3797
3798 /*
3799 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
3800  * Returns 0 on success or an error code on failure
3801 */
3802 static noinline_for_stack int
3803 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3804 handle_t *handle, unsigned int reserv_clstrs)
3805 {
3806 struct buffer_head *bitmap_bh = NULL;
3807 struct ext4_group_desc *gdp;
3808 struct buffer_head *gdp_bh;
3809 struct ext4_sb_info *sbi;
3810 struct super_block *sb;
3811 ext4_fsblk_t block;
3812 int err, len;
3813
3814 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3815 BUG_ON(ac->ac_b_ex.fe_len <= 0);
3816
3817 sb = ac->ac_sb;
3818 sbi = EXT4_SB(sb);
3819
3820 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3821 if (IS_ERR(bitmap_bh)) {
3822 err = PTR_ERR(bitmap_bh);
3823 bitmap_bh = NULL;
3824 goto out_err;
3825 }
3826
3827 BUFFER_TRACE(bitmap_bh, "getting write access");
3828 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
3829 EXT4_JTR_NONE);
3830 if (err)
3831 goto out_err;
3832
3833 err = -EIO;
3834 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3835 if (!gdp)
3836 goto out_err;
3837
3838 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3839 ext4_free_group_clusters(sb, gdp));
3840
3841 BUFFER_TRACE(gdp_bh, "get_write_access");
3842 err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
3843 if (err)
3844 goto out_err;
3845
3846 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3847
3848 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3849 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
3850 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
3851 "fs metadata", block, block+len);
3852 		/* The file system was mounted not to panic on error,
3853 		 * so fix the bitmap and return EFSCORRUPTED.
3854 * We leak some of the blocks here.
3855 */
3856 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3857 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3858 ac->ac_b_ex.fe_len);
3859 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3860 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3861 if (!err)
3862 err = -EFSCORRUPTED;
3863 goto out_err;
3864 }
3865
3866 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3867 #ifdef AGGRESSIVE_CHECK
3868 {
3869 int i;
3870 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3871 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3872 bitmap_bh->b_data));
3873 }
3874 }
3875 #endif
3876 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3877 ac->ac_b_ex.fe_len);
3878 if (ext4_has_group_desc_csum(sb) &&
3879 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3880 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3881 ext4_free_group_clusters_set(sb, gdp,
3882 ext4_free_clusters_after_init(sb,
3883 ac->ac_b_ex.fe_group, gdp));
3884 }
3885 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3886 ext4_free_group_clusters_set(sb, gdp, len);
3887 ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3888 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3889
3890 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3891 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3892 /*
3893 * Now reduce the dirty block count also. Should not go negative
3894 */
3895 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3896 /* release all the reserved blocks if non delalloc */
3897 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3898 reserv_clstrs);
3899
3900 if (sbi->s_log_groups_per_flex) {
3901 ext4_group_t flex_group = ext4_flex_group(sbi,
3902 ac->ac_b_ex.fe_group);
3903 atomic64_sub(ac->ac_b_ex.fe_len,
3904 &sbi_array_rcu_deref(sbi, s_flex_groups,
3905 flex_group)->free_clusters);
3906 }
3907
3908 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3909 if (err)
3910 goto out_err;
3911 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3912
3913 out_err:
3914 brelse(bitmap_bh);
3915 return err;
3916 }
3917
3918 /*
3919 * Idempotent helper for Ext4 fast commit replay path to set the state of
3920 * blocks in bitmaps and update counters.
3921 */
3922 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3923 int len, int state)
3924 {
3925 struct buffer_head *bitmap_bh = NULL;
3926 struct ext4_group_desc *gdp;
3927 struct buffer_head *gdp_bh;
3928 struct ext4_sb_info *sbi = EXT4_SB(sb);
3929 ext4_group_t group;
3930 ext4_grpblk_t blkoff;
3931 int i, err;
3932 int already;
3933 unsigned int clen, clen_changed, thisgrp_len;
3934
3935 while (len > 0) {
3936 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
3937
3938 /*
3939 * Check to see if we are freeing blocks across a group
3940 * boundary.
3941 		 * With flex_bg, (block, len) may span more than one group.
3942 		 * In that case we need to get the corresponding group's
3943 		 * metadata to work with, which is why we process one
3944 		 * group per iteration of this loop.
3945 */
3946 thisgrp_len = min_t(unsigned int, (unsigned int)len,
3947 EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
3948 clen = EXT4_NUM_B2C(sbi, thisgrp_len);
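		/*
		 * thisgrp_len is the part of the range that fits in this
		 * group, in blocks; clen is the same length rounded up to
		 * whole clusters for the bitmap update.
		 */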
3949
3950 bitmap_bh = ext4_read_block_bitmap(sb, group);
3951 if (IS_ERR(bitmap_bh)) {
3952 err = PTR_ERR(bitmap_bh);
3953 bitmap_bh = NULL;
3954 break;
3955 }
3956
3957 err = -EIO;
3958 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3959 if (!gdp)
3960 break;
3961
3962 ext4_lock_group(sb, group);
3963 already = 0;
3964 for (i = 0; i < clen; i++)
3965 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
3966 !state)
3967 already++;
3968
3969 clen_changed = clen - already;
3970 if (state)
3971 ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
3972 else
3973 mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
3974 if (ext4_has_group_desc_csum(sb) &&
3975 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3976 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3977 ext4_free_group_clusters_set(sb, gdp,
3978 ext4_free_clusters_after_init(sb, group, gdp));
3979 }
3980 if (state)
3981 clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
3982 else
3983 clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
3984
3985 ext4_free_group_clusters_set(sb, gdp, clen);
3986 ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
3987 ext4_group_desc_csum_set(sb, group, gdp);
3988
3989 ext4_unlock_group(sb, group);
3990
3991 if (sbi->s_log_groups_per_flex) {
3992 ext4_group_t flex_group = ext4_flex_group(sbi, group);
3993 struct flex_groups *fg = sbi_array_rcu_deref(sbi,
3994 s_flex_groups, flex_group);
3995
3996 if (state)
3997 atomic64_sub(clen_changed, &fg->free_clusters);
3998 else
3999 atomic64_add(clen_changed, &fg->free_clusters);
4000
4001 }
4002
4003 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
4004 if (err)
4005 break;
4006 sync_dirty_buffer(bitmap_bh);
4007 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
4008 sync_dirty_buffer(gdp_bh);
4009 if (err)
4010 break;
4011
4012 block += thisgrp_len;
4013 len -= thisgrp_len;
4014 brelse(bitmap_bh);
4015 BUG_ON(len < 0);
4016 }
4017
4018 if (err)
4019 brelse(bitmap_bh);
4020 }
4021
4022 /*
4023  * here we normalize the request for the locality group
4024  * Group requests are normalized to s_mb_group_prealloc, which is rounded
4025  * up to a multiple of s_stripe when a stripe size is set via mount option.
4026 * s_mb_group_prealloc can be configured via
4027 * /sys/fs/ext4/<partition>/mb_group_prealloc
4028 *
4029 * XXX: should we try to preallocate more than the group has now?
4030 */
4031 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
4032 {
4033 struct super_block *sb = ac->ac_sb;
4034 struct ext4_locality_group *lg = ac->ac_lg;
4035
4036 BUG_ON(lg == NULL);
4037 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
4038 mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
4039 }
4040
4041 /*
4042  * Normalization means making the request better in terms of
4043 * size and alignment
4044 */
4045 static noinline_for_stack void
4046 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
4047 struct ext4_allocation_request *ar)
4048 {
4049 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4050 struct ext4_super_block *es = sbi->s_es;
4051 int bsbits, max;
4052 loff_t size, start_off, end;
4053 loff_t orig_size __maybe_unused;
4054 ext4_lblk_t start;
4055 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4056 struct ext4_prealloc_space *pa;
4057
4058 	/* only normalize data requests; metadata requests
4059 	   do not need preallocation */
4060 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4061 return;
4062
4063 	/* sometimes the caller may want exact blocks */
4064 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4065 return;
4066
4067 /* caller may indicate that preallocation isn't
4068 * required (it's a tail, for example) */
4069 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
4070 return;
4071
4072 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
4073 ext4_mb_normalize_group_request(ac);
4074 		return;
4075 }
4076
4077 bsbits = ac->ac_sb->s_blocksize_bits;
4078
4079 	/* first, let's learn the actual file size
4080 	 * assuming the current request is allocated */
4081 size = extent_logical_end(sbi, &ac->ac_o_ex);
4082 size = size << bsbits;
4083 if (size < i_size_read(ac->ac_inode))
4084 size = i_size_read(ac->ac_inode);
4085 orig_size = size;
4086
4087 /* max size of free chunks */
4088 max = 2 << bsbits;
4089
4090 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
4091 (req <= (size) || max <= (chunk_size))
4092
4093 /* first, try to predict filesize */
4094 /* XXX: should this table be tunable? */
4095 start_off = 0;
4096 if (size <= 16 * 1024) {
4097 size = 16 * 1024;
4098 } else if (size <= 32 * 1024) {
4099 size = 32 * 1024;
4100 } else if (size <= 64 * 1024) {
4101 size = 64 * 1024;
4102 } else if (size <= 128 * 1024) {
4103 size = 128 * 1024;
4104 } else if (size <= 256 * 1024) {
4105 size = 256 * 1024;
4106 } else if (size <= 512 * 1024) {
4107 size = 512 * 1024;
4108 } else if (size <= 1024 * 1024) {
4109 size = 1024 * 1024;
4110 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
4111 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4112 (21 - bsbits)) << 21;
4113 size = 2 * 1024 * 1024;
4114 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
4115 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4116 (22 - bsbits)) << 22;
4117 size = 4 * 1024 * 1024;
4118 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
4119 (8<<20)>>bsbits, max, 8 * 1024)) {
4120 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4121 (23 - bsbits)) << 23;
4122 size = 8 * 1024 * 1024;
4123 } else {
4124 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
4125 size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
4126 ac->ac_o_ex.fe_len) << bsbits;
4127 }
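	/*
	 * At this point "size" is the predicted allocation window in bytes
	 * and "start_off" its aligned start (e.g. a 100k file is rounded up
	 * to a 128k window); both are converted back to blocks below.
	 */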
4128 size = size >> bsbits;
4129 start = start_off >> bsbits;
4130
4131 /*
4132 * For tiny groups (smaller than 8MB) the chosen allocation
4133 * alignment may be larger than group size. Make sure the
4134 * alignment does not move allocation to a different group which
4135 * makes mballoc fail assertions later.
4136 */
4137 start = max(start, rounddown(ac->ac_o_ex.fe_logical,
4138 (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
4139
4140 /* avoid unnecessary preallocation that may trigger assertions */
4141 if (start + size > EXT_MAX_BLOCKS)
4142 size = EXT_MAX_BLOCKS - start;
4143
4144 /* don't cover already allocated blocks in selected range */
4145 if (ar->pleft && start <= ar->lleft) {
4146 size -= ar->lleft + 1 - start;
4147 start = ar->lleft + 1;
4148 }
4149 if (ar->pright && start + size - 1 >= ar->lright)
4150 size -= start + size - ar->lright;
4151
4152 /*
4153 * Trim allocation request for filesystems with artificially small
4154 * groups.
4155 */
4156 if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
4157 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
4158
4159 end = start + size;
4160
4161 /* check we don't cross already preallocated blocks */
4162 rcu_read_lock();
4163 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4164 loff_t pa_end;
4165
4166 if (pa->pa_deleted)
4167 continue;
4168 spin_lock(&pa->pa_lock);
4169 if (pa->pa_deleted) {
4170 spin_unlock(&pa->pa_lock);
4171 continue;
4172 }
4173
4174 pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
4175
4176 /* PA must not overlap original request */
4177 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
4178 ac->ac_o_ex.fe_logical < pa->pa_lstart));
4179
4180 /* skip PAs this normalized request doesn't overlap with */
4181 if (pa->pa_lstart >= end || pa_end <= start) {
4182 spin_unlock(&pa->pa_lock);
4183 continue;
4184 }
4185 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
4186
4187 /* adjust start or end to be adjacent to this pa */
4188 if (pa_end <= ac->ac_o_ex.fe_logical) {
4189 BUG_ON(pa_end < start);
4190 start = pa_end;
4191 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
4192 BUG_ON(pa->pa_lstart > end);
4193 end = pa->pa_lstart;
4194 }
4195 spin_unlock(&pa->pa_lock);
4196 }
4197 rcu_read_unlock();
4198 size = end - start;
4199
4200 /* XXX: extra loop to check we really don't overlap preallocations */
4201 rcu_read_lock();
4202 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4203 loff_t pa_end;
4204
4205 spin_lock(&pa->pa_lock);
4206 if (pa->pa_deleted == 0) {
4207 pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
4208 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
4209 }
4210 spin_unlock(&pa->pa_lock);
4211 }
4212 rcu_read_unlock();
4213
4214 if (start + size <= ac->ac_o_ex.fe_logical &&
4215 start > ac->ac_o_ex.fe_logical) {
4216 ext4_msg(ac->ac_sb, KERN_ERR,
4217 "start %lu, size %lu, fe_logical %lu",
4218 (unsigned long) start, (unsigned long) size,
4219 (unsigned long) ac->ac_o_ex.fe_logical);
4220 BUG();
4221 }
4222 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
4223
4224 /* now prepare goal request */
4225
4226 	/* XXX: is it better to align blocks with respect to logical
4227 	 * placement or to satisfy a big request as is */
4228 ac->ac_g_ex.fe_logical = start;
4229 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
4230
4231 /* define goal start in order to merge */
4232 if (ar->pright && (ar->lright == (start + size)) &&
4233 ar->pright >= size &&
4234 ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
4235 /* merge to the right */
4236 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
4237 &ac->ac_g_ex.fe_group,
4238 &ac->ac_g_ex.fe_start);
4239 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4240 }
4241 if (ar->pleft && (ar->lleft + 1 == start) &&
4242 ar->pleft + 1 < ext4_blocks_count(es)) {
4243 /* merge to the left */
4244 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
4245 &ac->ac_g_ex.fe_group,
4246 &ac->ac_g_ex.fe_start);
4247 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4248 }
4249
4250 mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
4251 orig_size, start);
4252 }
4253
4254 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
4255 {
4256 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4257
4258 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
4259 atomic_inc(&sbi->s_bal_reqs);
4260 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
4261 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
4262 atomic_inc(&sbi->s_bal_success);
4263 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
4264 atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
4265 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
4266 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
4267 atomic_inc(&sbi->s_bal_goals);
4268 if (ac->ac_found > sbi->s_mb_max_to_scan)
4269 atomic_inc(&sbi->s_bal_breaks);
4270 }
4271
4272 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
4273 trace_ext4_mballoc_alloc(ac);
4274 else
4275 trace_ext4_mballoc_prealloc(ac);
4276 }
4277
4278 /*
4279 * Called on failure; free up any blocks from the inode PA for this
4280 * context. We don't need this for MB_GROUP_PA because we only change
4281 * pa_free in ext4_mb_release_context(), but on failure, we've already
4282 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
4283 */
4284 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
4285 {
4286 struct ext4_prealloc_space *pa = ac->ac_pa;
4287 struct ext4_buddy e4b;
4288 int err;
4289
4290 if (pa == NULL) {
4291 if (ac->ac_f_ex.fe_len == 0)
4292 return;
4293 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
4294 if (err) {
4295 /*
4296 * This should never happen since we pin the
4297 * pages in the ext4_allocation_context so
4298 * ext4_mb_load_buddy() should never fail.
4299 */
4300 WARN(1, "mb_load_buddy failed (%d)", err);
4301 return;
4302 }
4303 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4304 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
4305 ac->ac_f_ex.fe_len);
4306 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4307 ext4_mb_unload_buddy(&e4b);
4308 return;
4309 }
4310 if (pa->pa_type == MB_INODE_PA)
4311 pa->pa_free += ac->ac_b_ex.fe_len;
4312 }
4313
4314 /*
4315 * use blocks preallocated to inode
4316 */
4317 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
4318 struct ext4_prealloc_space *pa)
4319 {
4320 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4321 ext4_fsblk_t start;
4322 ext4_fsblk_t end;
4323 int len;
4324
4325 /* found preallocated blocks, use them */
4326 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
4327 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
4328 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
4329 len = EXT4_NUM_B2C(sbi, end - start);
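	/*
	 * The allocation is the overlap of the request with the PA: start
	 * is offset into the PA by the same amount the request is offset
	 * into its logical range, and end is clamped to whichever of the
	 * PA or the request ends first.
	 */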
4330 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
4331 &ac->ac_b_ex.fe_start);
4332 ac->ac_b_ex.fe_len = len;
4333 ac->ac_status = AC_STATUS_FOUND;
4334 ac->ac_pa = pa;
4335
4336 BUG_ON(start < pa->pa_pstart);
4337 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
4338 BUG_ON(pa->pa_free < len);
4339 BUG_ON(ac->ac_b_ex.fe_len <= 0);
4340 pa->pa_free -= len;
4341
4342 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
4343 }
4344
4345 /*
4346 * use blocks preallocated to locality group
4347 */
4348 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
4349 struct ext4_prealloc_space *pa)
4350 {
4351 unsigned int len = ac->ac_o_ex.fe_len;
4352
4353 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
4354 &ac->ac_b_ex.fe_group,
4355 &ac->ac_b_ex.fe_start);
4356 ac->ac_b_ex.fe_len = len;
4357 ac->ac_status = AC_STATUS_FOUND;
4358 ac->ac_pa = pa;
4359
4360 	/* we don't correct pa_pstart or pa_len here to avoid a
4361 	 * possible race when the group is being loaded concurrently;
4362 	 * instead we correct the pa later, after blocks are marked
4363 * in on-disk bitmap -- see ext4_mb_release_context()
4364 * Other CPUs are prevented from allocating from this pa by lg_mutex
4365 */
4366 mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
4367 pa->pa_lstart-len, len, pa);
4368 }
4369
4370 /*
4371  * Return the prealloc space that has the minimal distance
4372  * from the goal block. @cpa is the prealloc
4373  * space with the currently known minimal distance
4374 * from the goal block.
4375 */
4376 static struct ext4_prealloc_space *
4377 ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
4378 struct ext4_prealloc_space *pa,
4379 struct ext4_prealloc_space *cpa)
4380 {
4381 ext4_fsblk_t cur_distance, new_distance;
4382
4383 if (cpa == NULL) {
4384 atomic_inc(&pa->pa_count);
4385 return pa;
4386 }
4387 cur_distance = abs(goal_block - cpa->pa_pstart);
4388 new_distance = abs(goal_block - pa->pa_pstart);
4389
4390 if (cur_distance <= new_distance)
4391 return cpa;
4392
4393 /* drop the previous reference */
4394 atomic_dec(&cpa->pa_count);
4395 atomic_inc(&pa->pa_count);
4396 return pa;
4397 }
4398
4399 /*
4400 * search goal blocks in preallocated space
4401 */
4402 static noinline_for_stack bool
4403 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
4404 {
4405 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4406 int order, i;
4407 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4408 struct ext4_locality_group *lg;
4409 struct ext4_prealloc_space *pa, *cpa = NULL;
4410 ext4_fsblk_t goal_block;
4411
4412 /* only data can be preallocated */
4413 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4414 return false;
4415
4416 /* first, try per-file preallocation */
4417 rcu_read_lock();
4418 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4419
4420 /* all fields in this condition don't change,
4421 * so we can skip locking for them */
4422 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
4423 ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, pa))
4424 continue;
4425
4426 /* non-extent files can't have physical blocks past 2^32 */
4427 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
4428 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
4429 EXT4_MAX_BLOCK_FILE_PHYS))
4430 continue;
4431
4432 /* found preallocated blocks, use them */
4433 spin_lock(&pa->pa_lock);
4434 if (pa->pa_deleted == 0 && pa->pa_free) {
4435 atomic_inc(&pa->pa_count);
4436 ext4_mb_use_inode_pa(ac, pa);
4437 spin_unlock(&pa->pa_lock);
4438 ac->ac_criteria = 10;
4439 rcu_read_unlock();
4440 return true;
4441 }
4442 spin_unlock(&pa->pa_lock);
4443 }
4444 rcu_read_unlock();
4445
4446 /* can we use group allocation? */
4447 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
4448 return false;
4449
4450 /* inode may have no locality group for some reason */
4451 lg = ac->ac_lg;
4452 if (lg == NULL)
4453 return false;
4454 order = fls(ac->ac_o_ex.fe_len) - 1;
4455 if (order > PREALLOC_TB_SIZE - 1)
4456 /* The max size of hash table is PREALLOC_TB_SIZE */
4457 order = PREALLOC_TB_SIZE - 1;
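	/*
	 * fls() returns the position of the highest set bit, so requests
	 * are bucketed by floor(log2(len)); e.g. a 24-block request starts
	 * searching at the order-4 list.
	 */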
4458
4459 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
4460 /*
4461 	 * search for the prealloc space that has the
4462 * minimal distance from the goal block.
4463 */
4464 for (i = order; i < PREALLOC_TB_SIZE; i++) {
4465 rcu_read_lock();
4466 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
4467 pa_inode_list) {
4468 spin_lock(&pa->pa_lock);
4469 if (pa->pa_deleted == 0 &&
4470 pa->pa_free >= ac->ac_o_ex.fe_len) {
4471
4472 cpa = ext4_mb_check_group_pa(goal_block,
4473 pa, cpa);
4474 }
4475 spin_unlock(&pa->pa_lock);
4476 }
4477 rcu_read_unlock();
4478 }
4479 if (cpa) {
4480 ext4_mb_use_group_pa(ac, cpa);
4481 ac->ac_criteria = 20;
4482 return true;
4483 }
4484 return false;
4485 }
4486
4487 /*
4488  * the function goes through all blocks freed in the group
4489  * but not yet committed and marks them as used in the in-core bitmap.
4490  * buddy must be generated from this bitmap
4491  * Needs to be called with the ext4 group lock held
4492 */
4493 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
4494 ext4_group_t group)
4495 {
4496 struct rb_node *n;
4497 struct ext4_group_info *grp;
4498 struct ext4_free_data *entry;
4499
4500 grp = ext4_get_group_info(sb, group);
4501 if (!grp)
4502 return;
4503 n = rb_first(&(grp->bb_free_root));
4504
4505 while (n) {
4506 entry = rb_entry(n, struct ext4_free_data, efd_node);
4507 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
4508 n = rb_next(n);
4509 }
4510 return;
4511 }
4512
4513 /*
4514  * the function goes through all preallocations in this group and marks them
4515  * as used in the in-core bitmap. buddy must be generated from this bitmap
4516  * Needs to be called with the ext4 group lock held
4517 */
4518 static noinline_for_stack
4519 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
4520 ext4_group_t group)
4521 {
4522 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4523 struct ext4_prealloc_space *pa;
4524 struct list_head *cur;
4525 ext4_group_t groupnr;
4526 ext4_grpblk_t start;
4527 int preallocated = 0;
4528 int len;
4529
4530 if (!grp)
4531 return;
4532
4533 	/* all forms of preallocation discard first load the group,
4534 	 * so the only competing code is preallocation use.
4535 	 * we don't need any locking here.
4536 * notice we do NOT ignore preallocations with pa_deleted
4537 * otherwise we could leave used blocks available for
4538 * allocation in buddy when concurrent ext4_mb_put_pa()
4539 * is dropping preallocation
4540 */
4541 list_for_each(cur, &grp->bb_prealloc_list) {
4542 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
4543 spin_lock(&pa->pa_lock);
4544 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4545 &groupnr, &start);
4546 len = pa->pa_len;
4547 spin_unlock(&pa->pa_lock);
4548 if (unlikely(len == 0))
4549 continue;
4550 BUG_ON(groupnr != group);
4551 ext4_set_bits(bitmap, start, len);
4552 preallocated += len;
4553 }
4554 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
4555 }
4556
4557 static void ext4_mb_mark_pa_deleted(struct super_block *sb,
4558 struct ext4_prealloc_space *pa)
4559 {
4560 struct ext4_inode_info *ei;
4561
4562 if (pa->pa_deleted) {
4563 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
4564 pa->pa_type, pa->pa_pstart, pa->pa_lstart,
4565 pa->pa_len);
4566 return;
4567 }
4568
4569 pa->pa_deleted = 1;
4570
4571 if (pa->pa_type == MB_INODE_PA) {
4572 ei = EXT4_I(pa->pa_inode);
4573 atomic_dec(&ei->i_prealloc_active);
4574 }
4575 }
4576
4577 static void ext4_mb_pa_callback(struct rcu_head *head)
4578 {
4579 struct ext4_prealloc_space *pa;
4580 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
4581
4582 BUG_ON(atomic_read(&pa->pa_count));
4583 BUG_ON(pa->pa_deleted == 0);
4584 kmem_cache_free(ext4_pspace_cachep, pa);
4585 }
4586
4587 /*
4588 * drops a reference to preallocated space descriptor
4589 * if this was the last reference and the space is consumed
4590 */
4591 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
4592 struct super_block *sb, struct ext4_prealloc_space *pa)
4593 {
4594 ext4_group_t grp;
4595 ext4_fsblk_t grp_blk;
4596
4597 /* in this short window concurrent discard can set pa_deleted */
4598 spin_lock(&pa->pa_lock);
4599 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
4600 spin_unlock(&pa->pa_lock);
4601 return;
4602 }
4603
4604 if (pa->pa_deleted == 1) {
4605 spin_unlock(&pa->pa_lock);
4606 return;
4607 }
4608
4609 ext4_mb_mark_pa_deleted(sb, pa);
4610 spin_unlock(&pa->pa_lock);
4611
4612 grp_blk = pa->pa_pstart;
4613 /*
4614 * If doing group-based preallocation, pa_pstart may be in the
4615 * next group when pa is used up
4616 */
4617 if (pa->pa_type == MB_GROUP_PA)
4618 grp_blk--;
4619
4620 grp = ext4_get_group_number(sb, grp_blk);
4621
4622 /*
4623 * possible race:
4624 *
4625 * P1 (buddy init) P2 (regular allocation)
4626 * find block B in PA
4627 * copy on-disk bitmap to buddy
4628 * mark B in on-disk bitmap
4629 * drop PA from group
4630 * mark all PAs in buddy
4631 *
4632 * thus, P1 initializes buddy with B available. to prevent this
4633 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
4634 * against that pair
4635 */
4636 ext4_lock_group(sb, grp);
4637 list_del(&pa->pa_group_list);
4638 ext4_unlock_group(sb, grp);
4639
4640 spin_lock(pa->pa_obj_lock);
4641 list_del_rcu(&pa->pa_inode_list);
4642 spin_unlock(pa->pa_obj_lock);
4643
4644 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4645 }
4646
4647 /*
4648 * creates new preallocated space for given inode
4649 */
4650 static noinline_for_stack void
4651 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
4652 {
4653 struct super_block *sb = ac->ac_sb;
4654 struct ext4_sb_info *sbi = EXT4_SB(sb);
4655 struct ext4_prealloc_space *pa;
4656 struct ext4_group_info *grp;
4657 struct ext4_inode_info *ei;
4658
4659 	/* preallocate only when the found space is larger than requested */
4660 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4661 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4662 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4663 BUG_ON(ac->ac_pa == NULL);
4664
4665 pa = ac->ac_pa;
4666
4667 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
4668 struct ext4_free_extent ex = {
4669 .fe_logical = ac->ac_g_ex.fe_logical,
4670 .fe_len = ac->ac_g_ex.fe_len,
4671 };
4672 loff_t orig_goal_end = extent_logical_end(sbi, &ex);
4673
4674 		/* we can't allocate as much as the normalizer wants,
4675 		 * so the found space must get a proper lstart
4676 		 * to cover the original request */
4677 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
4678 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
4679
4680 /*
4681 * Use the below logic for adjusting best extent as it keeps
4682 * fragmentation in check while ensuring logical range of best
4683 * extent doesn't overflow out of goal extent:
4684 *
4685 * 1. Check if best ex can be kept at end of goal and still
4686 * cover original start
4687 * 2. Else, check if best ex can be kept at start of goal and
4688 * still cover original start
4689 * 3. Else, keep the best ex at start of original request.
4690 */
4691 ex.fe_len = ac->ac_b_ex.fe_len;
4692
4693 ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
4694 if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
4695 goto adjust_bex;
4696
4697 ex.fe_logical = ac->ac_g_ex.fe_logical;
4698 if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
4699 goto adjust_bex;
4700
4701 ex.fe_logical = ac->ac_o_ex.fe_logical;
4702 adjust_bex:
4703 ac->ac_b_ex.fe_logical = ex.fe_logical;
4704
4705 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
4706 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
4707 BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
4708 }
4709
4710 /* preallocation can change ac_b_ex, thus we store actually
4711 * allocated blocks for history */
4712 ac->ac_f_ex = ac->ac_b_ex;
4713
4714 pa->pa_lstart = ac->ac_b_ex.fe_logical;
4715 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4716 pa->pa_len = ac->ac_b_ex.fe_len;
4717 pa->pa_free = pa->pa_len;
4718 spin_lock_init(&pa->pa_lock);
4719 INIT_LIST_HEAD(&pa->pa_inode_list);
4720 INIT_LIST_HEAD(&pa->pa_group_list);
4721 pa->pa_deleted = 0;
4722 pa->pa_type = MB_INODE_PA;
4723
4724 mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4725 pa->pa_len, pa->pa_lstart);
4726 trace_ext4_mb_new_inode_pa(ac, pa);
4727
4728 ext4_mb_use_inode_pa(ac, pa);
4729 atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
4730
4731 ei = EXT4_I(ac->ac_inode);
4732 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4733 if (!grp)
4734 return;
4735
4736 pa->pa_obj_lock = &ei->i_prealloc_lock;
4737 pa->pa_inode = ac->ac_inode;
4738
4739 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
4740
4741 spin_lock(pa->pa_obj_lock);
4742 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
4743 spin_unlock(pa->pa_obj_lock);
4744 atomic_inc(&ei->i_prealloc_active);
4745 }
4746
4747 /*
4748  * creates new preallocated space for the locality group this inode belongs to
4749 */
4750 static noinline_for_stack void
4751 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
4752 {
4753 struct super_block *sb = ac->ac_sb;
4754 struct ext4_locality_group *lg;
4755 struct ext4_prealloc_space *pa;
4756 struct ext4_group_info *grp;
4757
4758 	/* preallocate only when the found space is larger than requested */
4759 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4760 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4761 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4762 BUG_ON(ac->ac_pa == NULL);
4763
4764 pa = ac->ac_pa;
4765
4766 /* preallocation can change ac_b_ex, thus we store actually
4767 * allocated blocks for history */
4768 ac->ac_f_ex = ac->ac_b_ex;
4769
4770 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4771 pa->pa_lstart = pa->pa_pstart;
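	/*
	 * A group PA has no meaningful logical placement, so pa_lstart
	 * simply mirrors pa_pstart; both are corrected later, in
	 * ext4_mb_release_context(), once blocks are handed out.
	 */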
4772 pa->pa_len = ac->ac_b_ex.fe_len;
4773 pa->pa_free = pa->pa_len;
4774 spin_lock_init(&pa->pa_lock);
4775 INIT_LIST_HEAD(&pa->pa_inode_list);
4776 INIT_LIST_HEAD(&pa->pa_group_list);
4777 pa->pa_deleted = 0;
4778 pa->pa_type = MB_GROUP_PA;
4779
4780 mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4781 pa->pa_len, pa->pa_lstart);
4782 trace_ext4_mb_new_group_pa(ac, pa);
4783
4784 ext4_mb_use_group_pa(ac, pa);
4785 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
4786
4787 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4788 if (!grp)
4789 return;
4790 lg = ac->ac_lg;
4791 BUG_ON(lg == NULL);
4792
4793 pa->pa_obj_lock = &lg->lg_prealloc_lock;
4794 pa->pa_inode = NULL;
4795
4796 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
4797
4798 /*
4799 * We will later add the new pa to the right bucket
4800 * after updating the pa_free in ext4_mb_release_context
4801 */
4802 }
4803
4804 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
4805 {
4806 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4807 ext4_mb_new_group_pa(ac);
4808 else
4809 ext4_mb_new_inode_pa(ac);
4810 }
4811
4812  * finds all unused blocks in the on-disk bitmap, frees them in the
4813 * finds all unused blocks in on-disk bitmap, frees them in
4814 * in-core bitmap and buddy.
4815 * @pa must be unlinked from inode and group lists, so that
4816 * nobody else can find/use it.
4817 * the caller MUST hold group/inode locks.
4818 * TODO: optimize the case when there are no in-core structures yet
4819 */
4820 static noinline_for_stack int
4821 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
4822 struct ext4_prealloc_space *pa)
4823 {
4824 struct super_block *sb = e4b->bd_sb;
4825 struct ext4_sb_info *sbi = EXT4_SB(sb);
4826 unsigned int end;
4827 unsigned int next;
4828 ext4_group_t group;
4829 ext4_grpblk_t bit;
4830 unsigned long long grp_blk_start;
4831 int free = 0;
4832
4833 BUG_ON(pa->pa_deleted == 0);
4834 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
4835 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
4836 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
4837 end = bit + pa->pa_len;
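	/*
	 * Scan the on-disk bitmap over [bit, end): each run of clear bits
	 * is space the PA reserved but never handed out, and is returned
	 * to the buddy here.
	 */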
4838
4839 while (bit < end) {
4840 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
4841 if (bit >= end)
4842 break;
4843 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
4844 mb_debug(sb, "free preallocated %u/%u in group %u\n",
4845 (unsigned) ext4_group_first_block_no(sb, group) + bit,
4846 (unsigned) next - bit, (unsigned) group);
4847 free += next - bit;
4848
4849 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
4850 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
4851 EXT4_C2B(sbi, bit)),
4852 next - bit);
4853 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
4854 bit = next + 1;
4855 }
4856 if (free != pa->pa_free) {
4857 ext4_msg(e4b->bd_sb, KERN_CRIT,
4858 "pa %p: logic %lu, phys. %lu, len %d",
4859 pa, (unsigned long) pa->pa_lstart,
4860 (unsigned long) pa->pa_pstart,
4861 pa->pa_len);
4862 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
4863 free, pa->pa_free);
4864 /*
4865 * pa is already deleted so we use the value obtained
4866 * from the bitmap and continue.
4867 */
4868 }
4869 atomic_add(free, &sbi->s_mb_discarded);
4870
4871 return 0;
4872 }
4873
4874 static noinline_for_stack int
4875 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
4876 struct ext4_prealloc_space *pa)
4877 {
4878 struct super_block *sb = e4b->bd_sb;
4879 ext4_group_t group;
4880 ext4_grpblk_t bit;
4881
4882 trace_ext4_mb_release_group_pa(sb, pa);
4883 BUG_ON(pa->pa_deleted == 0);
4884 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
4885 if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
4886 ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
4887 e4b->bd_group, group, pa->pa_pstart);
4888 return 0;
4889 }
4890 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
4891 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
4892 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
4893
4894 return 0;
4895 }
4896
4897 /*
4898 * releases all preallocations in given group
4899 *
4900 * first, we need to decide discard policy:
4901 * - when do we discard
4902 * 1) ENOSPC
4903 * - how many do we discard
4904 * 1) how many requested
4905 */
4906 static noinline_for_stack int
4907 ext4_mb_discard_group_preallocations(struct super_block *sb,
4908 ext4_group_t group, int *busy)
4909 {
4910 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4911 struct buffer_head *bitmap_bh = NULL;
4912 struct ext4_prealloc_space *pa, *tmp;
4913 struct list_head list;
4914 struct ext4_buddy e4b;
4915 int err;
4916 int free = 0;
4917
4918 if (!grp)
4919 return 0;
4920 mb_debug(sb, "discard preallocation for group %u\n", group);
4921 if (list_empty(&grp->bb_prealloc_list))
4922 goto out_dbg;
4923
4924 bitmap_bh = ext4_read_block_bitmap(sb, group);
4925 if (IS_ERR(bitmap_bh)) {
4926 err = PTR_ERR(bitmap_bh);
4927 ext4_error_err(sb, -err,
4928 "Error %d reading block bitmap for %u",
4929 err, group);
4930 goto out_dbg;
4931 }
4932
4933 err = ext4_mb_load_buddy(sb, group, &e4b);
4934 if (err) {
4935 ext4_warning(sb, "Error %d loading buddy information for %u",
4936 err, group);
4937 put_bh(bitmap_bh);
4938 goto out_dbg;
4939 }
4940
4941 INIT_LIST_HEAD(&list);
4942 ext4_lock_group(sb, group);
4943 list_for_each_entry_safe(pa, tmp,
4944 &grp->bb_prealloc_list, pa_group_list) {
4945 spin_lock(&pa->pa_lock);
4946 if (atomic_read(&pa->pa_count)) {
4947 spin_unlock(&pa->pa_lock);
4948 *busy = 1;
4949 continue;
4950 }
4951 if (pa->pa_deleted) {
4952 spin_unlock(&pa->pa_lock);
4953 continue;
4954 }
4955
4956 /* seems this one can be freed ... */
4957 ext4_mb_mark_pa_deleted(sb, pa);
4958
4959 if (!free)
4960 this_cpu_inc(discard_pa_seq);
4961
4962 /* we can trust pa_free ... */
4963 free += pa->pa_free;
4964
4965 spin_unlock(&pa->pa_lock);
4966
4967 list_del(&pa->pa_group_list);
4968 list_add(&pa->u.pa_tmp_list, &list);
4969 }
4970
4971 /* now free all selected PAs */
4972 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4973
4974 /* remove from object (inode or locality group) */
4975 spin_lock(pa->pa_obj_lock);
4976 list_del_rcu(&pa->pa_inode_list);
4977 spin_unlock(pa->pa_obj_lock);
4978
4979 if (pa->pa_type == MB_GROUP_PA)
4980 ext4_mb_release_group_pa(&e4b, pa);
4981 else
4982 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4983
4984 list_del(&pa->u.pa_tmp_list);
4985 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4986 }
4987
4988 ext4_unlock_group(sb, group);
4989 ext4_mb_unload_buddy(&e4b);
4990 put_bh(bitmap_bh);
4991 out_dbg:
4992 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
4993 free, group, grp->bb_free);
4994 return free;
4995 }
4996
4997 /*
4998  * releases all unused preallocated blocks for the given inode
4999 *
5000 * It's important to discard preallocations under i_data_sem
5001 * We don't want another block to be served from the prealloc
5002 * space when we are discarding the inode prealloc space.
5003 *
5004 * FIXME!! Make sure it is valid at all the call sites
5005 */
5006 void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
5007 {
5008 struct ext4_inode_info *ei = EXT4_I(inode);
5009 struct super_block *sb = inode->i_sb;
5010 struct buffer_head *bitmap_bh = NULL;
5011 struct ext4_prealloc_space *pa, *tmp;
5012 ext4_group_t group = 0;
5013 struct list_head list;
5014 struct ext4_buddy e4b;
5015 int err;
5016
5017 if (!S_ISREG(inode->i_mode)) {
5018 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
5019 return;
5020 }
5021
5022 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
5023 return;
5024
5025 mb_debug(sb, "discard preallocation for inode %lu\n",
5026 inode->i_ino);
5027 trace_ext4_discard_preallocations(inode,
5028 atomic_read(&ei->i_prealloc_active), needed);
5029
5030 INIT_LIST_HEAD(&list);
5031
5032 if (needed == 0)
5033 needed = UINT_MAX;
5034
5035 repeat:
5036 /* first, collect all pa's in the inode */
5037 spin_lock(&ei->i_prealloc_lock);
5038 while (!list_empty(&ei->i_prealloc_list) && needed) {
5039 pa = list_entry(ei->i_prealloc_list.prev,
5040 struct ext4_prealloc_space, pa_inode_list);
5041 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
5042 spin_lock(&pa->pa_lock);
5043 if (atomic_read(&pa->pa_count)) {
5044 /* this shouldn't happen often - nobody should
5045 * use preallocation while we're discarding it */
5046 spin_unlock(&pa->pa_lock);
5047 spin_unlock(&ei->i_prealloc_lock);
5048 ext4_msg(sb, KERN_ERR,
5049 "uh-oh! used pa while discarding");
5050 WARN_ON(1);
5051 schedule_timeout_uninterruptible(HZ);
5052 goto repeat;
5053
5054 }
5055 if (pa->pa_deleted == 0) {
5056 ext4_mb_mark_pa_deleted(sb, pa);
5057 spin_unlock(&pa->pa_lock);
5058 list_del_rcu(&pa->pa_inode_list);
5059 list_add(&pa->u.pa_tmp_list, &list);
5060 needed--;
5061 continue;
5062 }
5063
5064 /* someone is deleting pa right now */
5065 spin_unlock(&pa->pa_lock);
5066 spin_unlock(&ei->i_prealloc_lock);
5067
5068 		/* we have to wait here because pa_deleted
5069 		 * doesn't mean the pa is already unlinked from
5070 		 * the list. As we might be called from
5071 		 * ->clear_inode(), the inode will get freed
5072 		 * and a concurrent thread which is unlinking the
5073 		 * pa from the inode's list may access already
5074 		 * freed memory, which would be very bad */
5075
5076 /* XXX: if this happens too often, we can
5077 * add a flag to force wait only in case
5078 * of ->clear_inode(), but not in case of
5079 * regular truncate */
5080 schedule_timeout_uninterruptible(HZ);
5081 goto repeat;
5082 }
5083 spin_unlock(&ei->i_prealloc_lock);
5084
5085 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
5086 BUG_ON(pa->pa_type != MB_INODE_PA);
5087 group = ext4_get_group_number(sb, pa->pa_pstart);
5088
5089 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5090 GFP_NOFS|__GFP_NOFAIL);
5091 if (err) {
5092 ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5093 err, group);
5094 continue;
5095 }
5096
5097 bitmap_bh = ext4_read_block_bitmap(sb, group);
5098 if (IS_ERR(bitmap_bh)) {
5099 err = PTR_ERR(bitmap_bh);
5100 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
5101 err, group);
5102 ext4_mb_unload_buddy(&e4b);
5103 continue;
5104 }
5105
5106 ext4_lock_group(sb, group);
5107 list_del(&pa->pa_group_list);
5108 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
5109 ext4_unlock_group(sb, group);
5110
5111 ext4_mb_unload_buddy(&e4b);
5112 put_bh(bitmap_bh);
5113
5114 list_del(&pa->u.pa_tmp_list);
5115 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5116 }
5117 }
5118
5119 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
5120 {
5121 struct ext4_prealloc_space *pa;
5122
5123 BUG_ON(ext4_pspace_cachep == NULL);
5124 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
5125 if (!pa)
5126 return -ENOMEM;
5127 atomic_set(&pa->pa_count, 1);
5128 ac->ac_pa = pa;
5129 return 0;
5130 }
5131
5132 static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
5133 {
5134 struct ext4_prealloc_space *pa = ac->ac_pa;
5135
5136 BUG_ON(!pa);
5137 ac->ac_pa = NULL;
5138 WARN_ON(!atomic_dec_and_test(&pa->pa_count));
5139 kmem_cache_free(ext4_pspace_cachep, pa);
5140 }
5141
5142 #ifdef CONFIG_EXT4_DEBUG
5143 static inline void ext4_mb_show_pa(struct super_block *sb)
5144 {
5145 ext4_group_t i, ngroups;
5146
5147 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5148 return;
5149
5150 ngroups = ext4_get_groups_count(sb);
5151 mb_debug(sb, "groups: ");
5152 for (i = 0; i < ngroups; i++) {
5153 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
5154 struct ext4_prealloc_space *pa;
5155 ext4_grpblk_t start;
5156 struct list_head *cur;
5157
5158 if (!grp)
5159 continue;
5160 ext4_lock_group(sb, i);
5161 list_for_each(cur, &grp->bb_prealloc_list) {
5162 pa = list_entry(cur, struct ext4_prealloc_space,
5163 pa_group_list);
5164 spin_lock(&pa->pa_lock);
5165 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
5166 NULL, &start);
5167 spin_unlock(&pa->pa_lock);
5168 mb_debug(sb, "PA:%u:%d:%d\n", i, start,
5169 pa->pa_len);
5170 }
5171 ext4_unlock_group(sb, i);
5172 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
5173 grp->bb_fragments);
5174 }
5175 }
5176
5177 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5178 {
5179 struct super_block *sb = ac->ac_sb;
5180
5181 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5182 return;
5183
5184 mb_debug(sb, "Can't allocate:"
5185 " Allocation context details:");
5186 mb_debug(sb, "status %u flags 0x%x",
5187 ac->ac_status, ac->ac_flags);
5188 mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
5189 "goal %lu/%lu/%lu@%lu, "
5190 "best %lu/%lu/%lu@%lu cr %d",
5191 (unsigned long)ac->ac_o_ex.fe_group,
5192 (unsigned long)ac->ac_o_ex.fe_start,
5193 (unsigned long)ac->ac_o_ex.fe_len,
5194 (unsigned long)ac->ac_o_ex.fe_logical,
5195 (unsigned long)ac->ac_g_ex.fe_group,
5196 (unsigned long)ac->ac_g_ex.fe_start,
5197 (unsigned long)ac->ac_g_ex.fe_len,
5198 (unsigned long)ac->ac_g_ex.fe_logical,
5199 (unsigned long)ac->ac_b_ex.fe_group,
5200 (unsigned long)ac->ac_b_ex.fe_start,
5201 (unsigned long)ac->ac_b_ex.fe_len,
5202 (unsigned long)ac->ac_b_ex.fe_logical,
5203 (int)ac->ac_criteria);
5204 mb_debug(sb, "%u found", ac->ac_found);
5205 ext4_mb_show_pa(sb);
5206 }
5207 #else
5208 static inline void ext4_mb_show_pa(struct super_block *sb)
5209 {
5210 return;
5211 }
5212 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
5213 {
5214 ext4_mb_show_pa(ac->ac_sb);
5215 return;
5216 }
5217 #endif
5218
5219 /*
5220 * We use locality group preallocation for small files. The size of the
5221 * file is determined by the current size or the resulting size after
5222 * allocation, whichever is larger.
5223 *
5224 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
5225 */
5226 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
5227 {
5228 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
5229 int bsbits = ac->ac_sb->s_blocksize_bits;
5230 loff_t size, isize;
5231 bool inode_pa_eligible, group_pa_eligible;
5232
5233 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5234 return;
5235
5236 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
5237 return;
5238
5239 group_pa_eligible = sbi->s_mb_group_prealloc > 0;
5240 inode_pa_eligible = true;
5241 size = extent_logical_end(sbi, &ac->ac_o_ex);
5242 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
5243 >> bsbits;
5244
5245 /* No point in using inode preallocation for closed files */
5246 if ((size == isize) && !ext4_fs_is_busy(sbi) &&
5247 !inode_is_open_for_write(ac->ac_inode))
5248 inode_pa_eligible = false;
5249
5250 size = max(size, isize);
5251 /* Don't use group allocation for large files */
5252 if (size > sbi->s_mb_stream_request)
5253 group_pa_eligible = false;
5254
5255 if (!group_pa_eligible) {
5256 if (inode_pa_eligible)
5257 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
5258 else
5259 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
5260 return;
5261 }
5262
5263 BUG_ON(ac->ac_lg != NULL);
5264 /*
5265 * locality group prealloc space is per cpu. The reason for having a
5266 * per cpu locality group is to reduce the contention between block
5267 * requests from multiple CPUs.
5268 */
5269 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
5270
5271 /* we're going to use group allocation */
5272 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
5273
5274 /* serialize all allocations in the group */
5275 mutex_lock(&ac->ac_lg->lg_mutex);
5276 }
5277
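/*
 * Illustrative sketch (not part of mballoc): a simplified userspace model of
 * the size-based policy implemented by ext4_mb_group_or_file() above.  The
 * block size and mb_stream_req value used below are hypothetical; the real
 * decision also depends on the allocation flags and on s_mb_group_prealloc
 * being non-zero.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

/* true when the per-CPU locality group preallocation would be chosen */
static bool use_group_prealloc(unsigned long long request_end_bytes,
			       unsigned long long i_size_bytes,
			       unsigned int block_size,
			       unsigned int mb_stream_req_blocks)
{
	unsigned long long size = (request_end_bytes + block_size - 1) / block_size;
	unsigned long long isize = (i_size_bytes + block_size - 1) / block_size;

	if (isize > size)
		size = isize;
	/* small files stay in the locality group, large files use inode PA */
	return size <= mb_stream_req_blocks;
}

int main(void)
{
	/* with 4KiB blocks and mb_stream_req = 16, a 32KiB file (8 blocks)
	 * would still use group preallocation */
	printf("%d\n", use_group_prealloc(32768, 32768, 4096, 16));
	return 0;
}
#endif
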
5278 static noinline_for_stack int
5279 ext4_mb_initialize_context(struct ext4_allocation_context *ac,
5280 struct ext4_allocation_request *ar)
5281 {
5282 struct super_block *sb = ar->inode->i_sb;
5283 struct ext4_sb_info *sbi = EXT4_SB(sb);
5284 struct ext4_super_block *es = sbi->s_es;
5285 ext4_group_t group;
5286 unsigned int len;
5287 ext4_fsblk_t goal;
5288 ext4_grpblk_t block;
5289
5290 /* we can't allocate > group size */
5291 len = ar->len;
5292
5293 /* just a dirty hack to filter too big requests */
5294 if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
5295 len = EXT4_CLUSTERS_PER_GROUP(sb);
5296
5297 /* start searching from the goal */
5298 goal = ar->goal;
5299 if (goal < le32_to_cpu(es->s_first_data_block) ||
5300 goal >= ext4_blocks_count(es))
5301 goal = le32_to_cpu(es->s_first_data_block);
5302 ext4_get_group_no_and_offset(sb, goal, &group, &block);
5303
5304 /* set up allocation goals */
5305 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
5306 ac->ac_status = AC_STATUS_CONTINUE;
5307 ac->ac_sb = sb;
5308 ac->ac_inode = ar->inode;
5309 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
5310 ac->ac_o_ex.fe_group = group;
5311 ac->ac_o_ex.fe_start = block;
5312 ac->ac_o_ex.fe_len = len;
5313 ac->ac_g_ex = ac->ac_o_ex;
5314 ac->ac_flags = ar->flags;
5315
5316 /* we have to define context: we'll work with a file or
5317 * locality group. this is a policy, actually */
5318 ext4_mb_group_or_file(ac);
5319
5320 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
5321 "left: %u/%u, right %u/%u to %swritable\n",
5322 (unsigned) ar->len, (unsigned) ar->logical,
5323 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
5324 (unsigned) ar->lleft, (unsigned) ar->pleft,
5325 (unsigned) ar->lright, (unsigned) ar->pright,
5326 inode_is_open_for_write(ar->inode) ? "" : "non-");
5327 return 0;
5328
5329 }
5330
5331 static noinline_for_stack void
5332 ext4_mb_discard_lg_preallocations(struct super_block *sb,
5333 struct ext4_locality_group *lg,
5334 int order, int total_entries)
5335 {
5336 ext4_group_t group = 0;
5337 struct ext4_buddy e4b;
5338 struct list_head discard_list;
5339 struct ext4_prealloc_space *pa, *tmp;
5340
5341 mb_debug(sb, "discard locality group preallocation\n");
5342
5343 INIT_LIST_HEAD(&discard_list);
5344
5345 spin_lock(&lg->lg_prealloc_lock);
5346 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5347 pa_inode_list,
5348 lockdep_is_held(&lg->lg_prealloc_lock)) {
5349 spin_lock(&pa->pa_lock);
5350 if (atomic_read(&pa->pa_count)) {
5351 /*
5352 * This is the pa that we just used
5353 * for block allocation. So don't
5354 * free that
5355 */
5356 spin_unlock(&pa->pa_lock);
5357 continue;
5358 }
5359 if (pa->pa_deleted) {
5360 spin_unlock(&pa->pa_lock);
5361 continue;
5362 }
5363 /* only lg prealloc space */
5364 BUG_ON(pa->pa_type != MB_GROUP_PA);
5365
5366 /* seems this one can be freed ... */
5367 ext4_mb_mark_pa_deleted(sb, pa);
5368 spin_unlock(&pa->pa_lock);
5369
5370 list_del_rcu(&pa->pa_inode_list);
5371 list_add(&pa->u.pa_tmp_list, &discard_list);
5372
5373 total_entries--;
5374 if (total_entries <= 5) {
5375 /*
5376 * we want to keep only 5 entries
5377 * allowing it to grow to 8. This
5378 * makes sure we don't call discard
5379 * again soon for this list.
5380 */
5381 break;
5382 }
5383 }
5384 spin_unlock(&lg->lg_prealloc_lock);
5385
5386 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5387 int err;
5388
5389 group = ext4_get_group_number(sb, pa->pa_pstart);
5390 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5391 GFP_NOFS|__GFP_NOFAIL);
5392 if (err) {
5393 ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5394 err, group);
5395 continue;
5396 }
5397 ext4_lock_group(sb, group);
5398 list_del(&pa->pa_group_list);
5399 ext4_mb_release_group_pa(&e4b, pa);
5400 ext4_unlock_group(sb, group);
5401
5402 ext4_mb_unload_buddy(&e4b);
5403 list_del(&pa->u.pa_tmp_list);
5404 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5405 }
5406 }
5407
5408 /*
5409 * We have incremented pa_count. So it cannot be freed at this
5410 * point. Also we hold lg_mutex. So no parallel allocation is
5411 * possible from this lg. That means pa_free cannot be updated.
5412 *
5413 * A parallel ext4_mb_discard_group_preallocations is possible,
5414 * which can cause the lg_prealloc_list to be updated.
5415 */
5416
5417 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5418 {
5419 int order, added = 0, lg_prealloc_count = 1;
5420 struct super_block *sb = ac->ac_sb;
5421 struct ext4_locality_group *lg = ac->ac_lg;
5422 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5423
5424 order = fls(pa->pa_free) - 1;
5425 if (order > PREALLOC_TB_SIZE - 1)
5426 /* The max size of hash table is PREALLOC_TB_SIZE */
5427 order = PREALLOC_TB_SIZE - 1;
5428 /* Add the prealloc space to lg */
5429 spin_lock(&lg->lg_prealloc_lock);
5430 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5431 pa_inode_list,
5432 lockdep_is_held(&lg->lg_prealloc_lock)) {
5433 spin_lock(&tmp_pa->pa_lock);
5434 if (tmp_pa->pa_deleted) {
5435 spin_unlock(&tmp_pa->pa_lock);
5436 continue;
5437 }
5438 if (!added && pa->pa_free < tmp_pa->pa_free) {
5439 /* Add to the tail of the previous entry */
5440 list_add_tail_rcu(&pa->pa_inode_list,
5441 &tmp_pa->pa_inode_list);
5442 added = 1;
5443 /*
5444 * we want to count the total
5445 * number of entries in the list
5446 */
5447 }
5448 spin_unlock(&tmp_pa->pa_lock);
5449 lg_prealloc_count++;
5450 }
5451 if (!added)
5452 list_add_tail_rcu(&pa->pa_inode_list,
5453 &lg->lg_prealloc_list[order]);
5454 spin_unlock(&lg->lg_prealloc_lock);
5455
5456 /* Now trim the list to be not more than 8 elements */
5457 if (lg_prealloc_count > 8) {
5458 ext4_mb_discard_lg_preallocations(sb, lg,
5459 order, lg_prealloc_count);
5460 return;
5461 }
5462 return ;
5463 }
5464
5465 /*
5466 * if per-inode prealloc list is too long, trim some PA
5467 */
5468 static void ext4_mb_trim_inode_pa(struct inode *inode)
5469 {
5470 struct ext4_inode_info *ei = EXT4_I(inode);
5471 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5472 int count, delta;
5473
5474 count = atomic_read(&ei->i_prealloc_active);
5475 delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
5476 if (count > sbi->s_mb_max_inode_prealloc + delta) {
5477 count -= sbi->s_mb_max_inode_prealloc;
5478 ext4_discard_preallocations(inode, count);
5479 }
5480 }
5481
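/*
 * Worked example for ext4_mb_trim_inode_pa() above (illustrative only, the
 * limit value is hypothetical): with s_mb_max_inode_prealloc = 512, delta is
 * (512 >> 2) + 1 = 129, so trimming only starts once the inode holds more
 * than 512 + 129 = 641 PAs, and it then discards count - 512 of the least
 * recently used ones.
 */
#if 0
static int pa_to_discard(int count, int max_inode_prealloc)
{
	int delta = (max_inode_prealloc >> 2) + 1;

	if (count > max_inode_prealloc + delta)
		return count - max_inode_prealloc;	/* e.g. 650 -> 138 */
	return 0;					/* below the threshold */
}
#endif
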
5482 /*
5483 * release all resources we used in allocation
5484 */
5485 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5486 {
5487 struct inode *inode = ac->ac_inode;
5488 struct ext4_inode_info *ei = EXT4_I(inode);
5489 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
5490 struct ext4_prealloc_space *pa = ac->ac_pa;
5491 if (pa) {
5492 if (pa->pa_type == MB_GROUP_PA) {
5493 /* see comment in ext4_mb_use_group_pa() */
5494 spin_lock(&pa->pa_lock);
5495 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5496 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5497 pa->pa_free -= ac->ac_b_ex.fe_len;
5498 pa->pa_len -= ac->ac_b_ex.fe_len;
5499 spin_unlock(&pa->pa_lock);
5500
5501 /*
5502 * We want to add the pa to the right bucket.
5503 * Remove it from the list and while adding
5504 * make sure the list to which we are adding
5505 * doesn't grow big.
5506 */
5507 if (likely(pa->pa_free)) {
5508 spin_lock(pa->pa_obj_lock);
5509 list_del_rcu(&pa->pa_inode_list);
5510 spin_unlock(pa->pa_obj_lock);
5511 ext4_mb_add_n_trim(ac);
5512 }
5513 }
5514
5515 if (pa->pa_type == MB_INODE_PA) {
5516 /*
5517 * treat per-inode prealloc list as a lru list, then try
5518 * to trim the least recently used PA.
5519 */
5520 spin_lock(pa->pa_obj_lock);
5521 list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
5522 spin_unlock(pa->pa_obj_lock);
5523 }
5524
5525 ext4_mb_put_pa(ac, ac->ac_sb, pa);
5526 }
5527 if (ac->ac_bitmap_page)
5528 put_page(ac->ac_bitmap_page);
5529 if (ac->ac_buddy_page)
5530 put_page(ac->ac_buddy_page);
5531 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
5532 mutex_unlock(&ac->ac_lg->lg_mutex);
5533 ext4_mb_collect_stats(ac);
5534 ext4_mb_trim_inode_pa(inode);
5535 return 0;
5536 }
5537
5538 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
5539 {
5540 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
5541 int ret;
5542 int freed = 0, busy = 0;
5543 int retry = 0;
5544
5545 trace_ext4_mb_discard_preallocations(sb, needed);
5546
5547 if (needed == 0)
5548 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
5549 repeat:
5550 for (i = 0; i < ngroups && needed > 0; i++) {
5551 ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
5552 freed += ret;
5553 needed -= ret;
5554 cond_resched();
5555 }
5556
5557 if (needed > 0 && busy && ++retry < 3) {
5558 busy = 0;
5559 goto repeat;
5560 }
5561
5562 return freed;
5563 }
5564
5565 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
5566 struct ext4_allocation_context *ac, u64 *seq)
5567 {
5568 int freed;
5569 u64 seq_retry = 0;
5570 bool ret = false;
5571
5572 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
5573 if (freed) {
5574 ret = true;
5575 goto out_dbg;
5576 }
5577 seq_retry = ext4_get_discard_pa_seq_sum();
5578 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
5579 ac->ac_flags |= EXT4_MB_STRICT_CHECK;
5580 *seq = seq_retry;
5581 ret = true;
5582 }
5583
5584 out_dbg:
5585 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
5586 return ret;
5587 }
5588
5589 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5590 struct ext4_allocation_request *ar, int *errp);
5591
5592 /*
5593 * Main entry point into mballoc to allocate blocks.
5594 * It tries to use preallocation first, then falls back
5595 * to the usual allocation.
5596 */
5597 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5598 struct ext4_allocation_request *ar, int *errp)
5599 {
5600 struct ext4_allocation_context *ac = NULL;
5601 struct ext4_sb_info *sbi;
5602 struct super_block *sb;
5603 ext4_fsblk_t block = 0;
5604 unsigned int inquota = 0;
5605 unsigned int reserv_clstrs = 0;
5606 int retries = 0;
5607 u64 seq;
5608
5609 might_sleep();
5610 sb = ar->inode->i_sb;
5611 sbi = EXT4_SB(sb);
5612
5613 trace_ext4_request_blocks(ar);
5614 if (sbi->s_mount_state & EXT4_FC_REPLAY)
5615 return ext4_mb_new_blocks_simple(handle, ar, errp);
5616
5617 /* Allow to use superuser reservation for quota file */
5618 if (ext4_is_quota_file(ar->inode))
5619 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
5620
5621 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
5622 /* Without delayed allocation we need to verify
5623 * there are enough free blocks to do block allocation
5624 * and verify allocation doesn't exceed the quota limits.
5625 */
5626 while (ar->len &&
5627 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
5628
5629 /* let others free the space */
5630 cond_resched();
5631 ar->len = ar->len >> 1;
5632 }
5633 if (!ar->len) {
5634 ext4_mb_show_pa(sb);
5635 *errp = -ENOSPC;
5636 return 0;
5637 }
5638 reserv_clstrs = ar->len;
5639 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
5640 dquot_alloc_block_nofail(ar->inode,
5641 EXT4_C2B(sbi, ar->len));
5642 } else {
5643 while (ar->len &&
5644 dquot_alloc_block(ar->inode,
5645 EXT4_C2B(sbi, ar->len))) {
5646
5647 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5648 ar->len--;
5649 }
5650 }
5651 inquota = ar->len;
5652 if (ar->len == 0) {
5653 *errp = -EDQUOT;
5654 goto out;
5655 }
5656 }
5657
5658 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
5659 if (!ac) {
5660 ar->len = 0;
5661 *errp = -ENOMEM;
5662 goto out;
5663 }
5664
5665 *errp = ext4_mb_initialize_context(ac, ar);
5666 if (*errp) {
5667 ar->len = 0;
5668 goto out;
5669 }
5670
5671 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5672 seq = this_cpu_read(discard_pa_seq);
5673 if (!ext4_mb_use_preallocated(ac)) {
5674 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5675 ext4_mb_normalize_request(ac, ar);
5676
5677 *errp = ext4_mb_pa_alloc(ac);
5678 if (*errp)
5679 goto errout;
5680 repeat:
5681 /* allocate space in core */
5682 *errp = ext4_mb_regular_allocator(ac);
5683 /*
5684 * The pa allocated above is added to grp->bb_prealloc_list only
5685 * when we were able to allocate some blocks, i.e. when
5686 * ac->ac_status == AC_STATUS_FOUND.
5687 * An error from above means ac->ac_status != AC_STATUS_FOUND,
5688 * so we have to free this pa here ourselves.
5689 */
5690 if (*errp) {
5691 ext4_mb_pa_free(ac);
5692 ext4_discard_allocated_blocks(ac);
5693 goto errout;
5694 }
5695 if (ac->ac_status == AC_STATUS_FOUND &&
5696 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
5697 ext4_mb_pa_free(ac);
5698 }
5699 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
5700 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
5701 if (*errp) {
5702 ext4_discard_allocated_blocks(ac);
5703 goto errout;
5704 } else {
5705 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5706 ar->len = ac->ac_b_ex.fe_len;
5707 }
5708 } else {
5709 if (++retries < 3 &&
5710 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
5711 goto repeat;
5712 /*
5713 * If block allocation fails then the pa allocated above
5714 * needs to be freed here itself.
5715 */
5716 ext4_mb_pa_free(ac);
5717 *errp = -ENOSPC;
5718 }
5719
5720 errout:
5721 if (*errp) {
5722 ac->ac_b_ex.fe_len = 0;
5723 ar->len = 0;
5724 ext4_mb_show_ac(ac);
5725 }
5726 ext4_mb_release_context(ac);
5727 out:
5728 if (ac)
5729 kmem_cache_free(ext4_ac_cachep, ac);
5730 if (inquota && ar->len < inquota)
5731 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
5732 if (!ar->len) {
5733 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
5734 /* release all the reserved blocks if non delalloc */
5735 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
5736 reserv_clstrs);
5737 }
5738
5739 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
5740
5741 return block;
5742 }
5743
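/*
 * Illustrative caller sketch for ext4_mb_new_blocks() (hypothetical, not
 * taken from this file): real callers such as the extent mapping code build
 * the request from the map they are resolving, pass a journal handle
 * appropriate for the operation, and usually set further hint fields
 * (lleft/lright/pleft/pright) as well.
 */
#if 0
static ext4_fsblk_t example_allocate(handle_t *handle, struct inode *inode,
				     ext4_lblk_t lblk, ext4_fsblk_t goal,
				     unsigned int nr_blocks, int *errp)
{
	struct ext4_allocation_request ar = {
		.inode   = inode,
		.logical = lblk,	/* logical block being mapped */
		.goal    = goal,	/* preferred physical block */
		.len     = nr_blocks,	/* number of blocks wanted */
		.flags   = EXT4_MB_HINT_DATA,
	};

	/* returns the first allocated physical block, or 0 with *errp set on
	 * failure; ar.len is updated to the number of blocks actually given */
	return ext4_mb_new_blocks(handle, &ar, errp);
}
#endif
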
5744 /*
5745 * We can merge two free data extents only if the physical blocks
5746 * are contiguous, AND the extents were freed by the same transaction,
5747 * AND the blocks are associated with the same group.
5748 */
5749 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
5750 struct ext4_free_data *entry,
5751 struct ext4_free_data *new_entry,
5752 struct rb_root *entry_rb_root)
5753 {
5754 if ((entry->efd_tid != new_entry->efd_tid) ||
5755 (entry->efd_group != new_entry->efd_group))
5756 return;
5757 if (entry->efd_start_cluster + entry->efd_count ==
5758 new_entry->efd_start_cluster) {
5759 new_entry->efd_start_cluster = entry->efd_start_cluster;
5760 new_entry->efd_count += entry->efd_count;
5761 } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
5762 entry->efd_start_cluster) {
5763 new_entry->efd_count += entry->efd_count;
5764 } else
5765 return;
5766 spin_lock(&sbi->s_md_lock);
5767 list_del(&entry->efd_list);
5768 spin_unlock(&sbi->s_md_lock);
5769 rb_erase(&entry->efd_node, entry_rb_root);
5770 kmem_cache_free(ext4_free_data_cachep, entry);
5771 }
5772
5773 static noinline_for_stack int
5774 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
5775 struct ext4_free_data *new_entry)
5776 {
5777 ext4_group_t group = e4b->bd_group;
5778 ext4_grpblk_t cluster;
5779 ext4_grpblk_t clusters = new_entry->efd_count;
5780 struct ext4_free_data *entry;
5781 struct ext4_group_info *db = e4b->bd_info;
5782 struct super_block *sb = e4b->bd_sb;
5783 struct ext4_sb_info *sbi = EXT4_SB(sb);
5784 struct rb_node **n = &db->bb_free_root.rb_node, *node;
5785 struct rb_node *parent = NULL, *new_node;
5786
5787 BUG_ON(!ext4_handle_valid(handle));
5788 BUG_ON(e4b->bd_bitmap_page == NULL);
5789 BUG_ON(e4b->bd_buddy_page == NULL);
5790
5791 new_node = &new_entry->efd_node;
5792 cluster = new_entry->efd_start_cluster;
5793
5794 if (!*n) {
5795 /* first free block extent. We need to
5796 * protect buddy cache from being freed,
5797 * otherwise we'll refresh it from
5798 * on-disk bitmap and lose not-yet-available
5799 * blocks */
5800 get_page(e4b->bd_buddy_page);
5801 get_page(e4b->bd_bitmap_page);
5802 }
5803 while (*n) {
5804 parent = *n;
5805 entry = rb_entry(parent, struct ext4_free_data, efd_node);
5806 if (cluster < entry->efd_start_cluster)
5807 n = &(*n)->rb_left;
5808 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
5809 n = &(*n)->rb_right;
5810 else {
5811 ext4_grp_locked_error(sb, group, 0,
5812 ext4_group_first_block_no(sb, group) +
5813 EXT4_C2B(sbi, cluster),
5814 "Block already on to-be-freed list");
5815 kmem_cache_free(ext4_free_data_cachep, new_entry);
5816 return 0;
5817 }
5818 }
5819
5820 rb_link_node(new_node, parent, n);
5821 rb_insert_color(new_node, &db->bb_free_root);
5822
5823 /* Now try to see if the extent can be merged to the left and right */
5824 node = rb_prev(new_node);
5825 if (node) {
5826 entry = rb_entry(node, struct ext4_free_data, efd_node);
5827 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5828 &(db->bb_free_root));
5829 }
5830
5831 node = rb_next(new_node);
5832 if (node) {
5833 entry = rb_entry(node, struct ext4_free_data, efd_node);
5834 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5835 &(db->bb_free_root));
5836 }
5837
5838 spin_lock(&sbi->s_md_lock);
5839 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
5840 sbi->s_mb_free_pending += clusters;
5841 spin_unlock(&sbi->s_md_lock);
5842 return 0;
5843 }
5844
5845 /*
5846 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5847 * linearly starting at the goal block and also excludes the blocks which
5848 * are going to be in use after fast commit replay.
5849 */
5850 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5851 struct ext4_allocation_request *ar, int *errp)
5852 {
5853 struct buffer_head *bitmap_bh;
5854 struct super_block *sb = ar->inode->i_sb;
5855 ext4_group_t group;
5856 ext4_grpblk_t blkoff;
5857 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
5858 ext4_grpblk_t i = 0;
5859 ext4_fsblk_t goal, block;
5860 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
5861
5862 goal = ar->goal;
5863 if (goal < le32_to_cpu(es->s_first_data_block) ||
5864 goal >= ext4_blocks_count(es))
5865 goal = le32_to_cpu(es->s_first_data_block);
5866
5867 ar->len = 0;
5868 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
5869 for (; group < ext4_get_groups_count(sb); group++) {
5870 bitmap_bh = ext4_read_block_bitmap(sb, group);
5871 if (IS_ERR(bitmap_bh)) {
5872 *errp = PTR_ERR(bitmap_bh);
5873 pr_warn("Failed to read block bitmap\n");
5874 return 0;
5875 }
5876
5877 ext4_get_group_no_and_offset(sb,
5878 max(ext4_group_first_block_no(sb, group), goal),
5879 NULL, &blkoff);
5880 while (1) {
5881 i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
5882 blkoff);
5883 if (i >= max)
5884 break;
5885 if (ext4_fc_replay_check_excluded(sb,
5886 ext4_group_first_block_no(sb, group) + i)) {
5887 blkoff = i + 1;
5888 } else
5889 break;
5890 }
5891 brelse(bitmap_bh);
5892 if (i < max)
5893 break;
5894 }
5895
5896 if (group >= ext4_get_groups_count(sb) || i >= max) {
5897 *errp = -ENOSPC;
5898 return 0;
5899 }
5900
5901 block = ext4_group_first_block_no(sb, group) + i;
5902 ext4_mb_mark_bb(sb, block, 1, 1);
5903 ar->len = 1;
5904
5905 return block;
5906 }
5907
5908 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
5909 unsigned long count)
5910 {
5911 struct buffer_head *bitmap_bh;
5912 struct super_block *sb = inode->i_sb;
5913 struct ext4_group_desc *gdp;
5914 struct buffer_head *gdp_bh;
5915 ext4_group_t group;
5916 ext4_grpblk_t blkoff;
5917 int already_freed = 0, err, i;
5918
5919 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
5920 bitmap_bh = ext4_read_block_bitmap(sb, group);
5921 if (IS_ERR(bitmap_bh)) {
5922 err = PTR_ERR(bitmap_bh);
5923 pr_warn("Failed to read block bitmap\n");
5924 return;
5925 }
5926 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
5927 if (!gdp)
5928 return;
5929
5930 for (i = 0; i < count; i++) {
5931 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
5932 already_freed++;
5933 }
5934 mb_clear_bits(bitmap_bh->b_data, blkoff, count);
5935 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
5936 if (err)
5937 return;
5938 ext4_free_group_clusters_set(
5939 sb, gdp, ext4_free_group_clusters(sb, gdp) +
5940 count - already_freed);
5941 ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
5942 ext4_group_desc_csum_set(sb, group, gdp);
5943 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
5944 sync_dirty_buffer(bitmap_bh);
5945 sync_dirty_buffer(gdp_bh);
5946 brelse(bitmap_bh);
5947 }
5948
5949 /**
5950 * ext4_mb_clear_bb() -- helper function for freeing blocks.
5951 * Used by ext4_free_blocks()
5952 * @handle: handle for this transaction
5953 * @inode: inode
5954 * @bh: optional buffer of the block to be freed
5955 * @block: starting physical block to be freed
5956 * @count: number of blocks to be freed
5957 * @flags: flags used by ext4_free_blocks
5958 */
5959 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
5960 ext4_fsblk_t block, unsigned long count,
5961 int flags)
5962 {
5963 struct buffer_head *bitmap_bh = NULL;
5964 struct super_block *sb = inode->i_sb;
5965 struct ext4_group_desc *gdp;
5966 struct ext4_group_info *grp;
5967 unsigned int overflow;
5968 ext4_grpblk_t bit;
5969 struct buffer_head *gd_bh;
5970 ext4_group_t block_group;
5971 struct ext4_sb_info *sbi;
5972 struct ext4_buddy e4b;
5973 unsigned int count_clusters;
5974 int err = 0;
5975 int ret;
5976
5977 sbi = EXT4_SB(sb);
5978
5979 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5980 !ext4_inode_block_valid(inode, block, count)) {
5981 ext4_error(sb, "Freeing blocks in system zone - "
5982 "Block = %llu, count = %lu", block, count);
5983 /* err = 0. ext4_std_error should be a no op */
5984 goto error_return;
5985 }
5986 flags |= EXT4_FREE_BLOCKS_VALIDATED;
5987
5988 do_more:
5989 overflow = 0;
5990 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
5991
5992 grp = ext4_get_group_info(sb, block_group);
5993 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
5994 return;
5995
5996 /*
5997 * Check to see if we are freeing blocks across a group
5998 * boundary.
5999 */
6000 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
6001 overflow = EXT4_C2B(sbi, bit) + count -
6002 EXT4_BLOCKS_PER_GROUP(sb);
6003 count -= overflow;
6004 /* The range changed so it's no longer validated */
6005 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6006 }
6007 count_clusters = EXT4_NUM_B2C(sbi, count);
6008 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
6009 if (IS_ERR(bitmap_bh)) {
6010 err = PTR_ERR(bitmap_bh);
6011 bitmap_bh = NULL;
6012 goto error_return;
6013 }
6014 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
6015 if (!gdp) {
6016 err = -EIO;
6017 goto error_return;
6018 }
6019
6020 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6021 !ext4_inode_block_valid(inode, block, count)) {
6022 ext4_error(sb, "Freeing blocks in system zone - "
6023 "Block = %llu, count = %lu", block, count);
6024 /* err = 0. ext4_std_error should be a no op */
6025 goto error_return;
6026 }
6027
6028 BUFFER_TRACE(bitmap_bh, "getting write access");
6029 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6030 EXT4_JTR_NONE);
6031 if (err)
6032 goto error_return;
6033
6034 /*
6035 * We are about to modify some metadata. Call the journal APIs
6036 * to unshare ->b_data if a currently-committing transaction is
6037 * using it
6038 */
6039 BUFFER_TRACE(gd_bh, "get_write_access");
6040 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
6041 if (err)
6042 goto error_return;
6043 #ifdef AGGRESSIVE_CHECK
6044 {
6045 int i;
6046 for (i = 0; i < count_clusters; i++)
6047 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
6048 }
6049 #endif
6050 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
6051
6052 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
6053 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
6054 GFP_NOFS|__GFP_NOFAIL);
6055 if (err)
6056 goto error_return;
6057
6058 /*
6059 * We need to make sure we don't reuse the freed block until after the
6060 * transaction is committed. We make an exception if the inode is to be
6061 * written in writeback mode since writeback mode has weak data
6062 * consistency guarantees.
6063 */
6064 if (ext4_handle_valid(handle) &&
6065 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
6066 !ext4_should_writeback_data(inode))) {
6067 struct ext4_free_data *new_entry;
6068 /*
6069 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
6070 * to fail.
6071 */
6072 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
6073 GFP_NOFS|__GFP_NOFAIL);
6074 new_entry->efd_start_cluster = bit;
6075 new_entry->efd_group = block_group;
6076 new_entry->efd_count = count_clusters;
6077 new_entry->efd_tid = handle->h_transaction->t_tid;
6078
6079 ext4_lock_group(sb, block_group);
6080 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6081 ext4_mb_free_metadata(handle, &e4b, new_entry);
6082 } else {
6083 /* need to update group_info->bb_free and bitmap
6084 * with group lock held. generate_buddy looks at
6085 * them with the group lock held
6086 */
6087 if (test_opt(sb, DISCARD)) {
6088 err = ext4_issue_discard(sb, block_group, bit,
6089 count_clusters, NULL);
6090 if (err && err != -EOPNOTSUPP)
6091 ext4_msg(sb, KERN_WARNING, "discard request in"
6092 " group:%u block:%d count:%lu failed"
6093 " with %d", block_group, bit, count,
6094 err);
6095 } else
6096 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
6097
6098 ext4_lock_group(sb, block_group);
6099 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6100 mb_free_blocks(inode, &e4b, bit, count_clusters);
6101 }
6102
6103 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
6104 ext4_free_group_clusters_set(sb, gdp, ret);
6105 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
6106 ext4_group_desc_csum_set(sb, block_group, gdp);
6107 ext4_unlock_group(sb, block_group);
6108
6109 if (sbi->s_log_groups_per_flex) {
6110 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6111 atomic64_add(count_clusters,
6112 &sbi_array_rcu_deref(sbi, s_flex_groups,
6113 flex_group)->free_clusters);
6114 }
6115
6116 /*
6117 * on a bigalloc file system, defer the s_freeclusters_counter
6118 * update to the caller (ext4_remove_space and friends) so they
6119 * can determine if a cluster freed here should be rereserved
6120 */
6121 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
6122 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
6123 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
6124 percpu_counter_add(&sbi->s_freeclusters_counter,
6125 count_clusters);
6126 }
6127
6128 ext4_mb_unload_buddy(&e4b);
6129
6130 /* We dirtied the bitmap block */
6131 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6132 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6133
6134 /* And the group descriptor block */
6135 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6136 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6137 if (!err)
6138 err = ret;
6139
6140 if (overflow && !err) {
6141 block += count;
6142 count = overflow;
6143 put_bh(bitmap_bh);
6144 /* The range changed so it's no longer validated */
6145 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6146 goto do_more;
6147 }
6148 error_return:
6149 brelse(bitmap_bh);
6150 ext4_std_error(sb, err);
6151 return;
6152 }
6153
6154 /**
6155 * ext4_free_blocks() -- Free given blocks and update quota
6156 * @handle: handle for this transaction
6157 * @inode: inode
6158 * @bh: optional buffer of the block to be freed
6159 * @block: starting physical block to be freed
6160 * @count: number of blocks to be freed
6161 * @flags: flags used by ext4_free_blocks
6162 */
6163 void ext4_free_blocks(handle_t *handle, struct inode *inode,
6164 struct buffer_head *bh, ext4_fsblk_t block,
6165 unsigned long count, int flags)
6166 {
6167 struct super_block *sb = inode->i_sb;
6168 unsigned int overflow;
6169 struct ext4_sb_info *sbi;
6170
6171 sbi = EXT4_SB(sb);
6172
6173 if (bh) {
6174 if (block)
6175 BUG_ON(block != bh->b_blocknr);
6176 else
6177 block = bh->b_blocknr;
6178 }
6179
6180 if (sbi->s_mount_state & EXT4_FC_REPLAY) {
6181 ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
6182 return;
6183 }
6184
6185 might_sleep();
6186
6187 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
6188 !ext4_inode_block_valid(inode, block, count)) {
6189 ext4_error(sb, "Freeing blocks not in datazone - "
6190 "block = %llu, count = %lu", block, count);
6191 return;
6192 }
6193 flags |= EXT4_FREE_BLOCKS_VALIDATED;
6194
6195 ext4_debug("freeing block %llu\n", block);
6196 trace_ext4_free_blocks(inode, block, count, flags);
6197
6198 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6199 BUG_ON(count > 1);
6200
6201 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
6202 inode, bh, block);
6203 }
6204
6205 /*
6206 * If the extent to be freed does not begin on a cluster
6207 * boundary, we need to deal with partial clusters at the
6208 * beginning and end of the extent. Normally we will free
6209 * blocks at the beginning or the end unless we are explicitly
6210 * requested to avoid doing so.
6211 */
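	/*
	 * Worked example (hypothetical bigalloc geometry, cluster ratio 16):
	 * freeing from block 100 gives EXT4_PBLK_COFF() = 100 & 15 = 4, i.e.
	 * the extent starts 4 blocks into its cluster.  With
	 * EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER the first 16 - 4 = 12 blocks
	 * are skipped (freeing starts at block 112 and count shrinks by 12);
	 * otherwise the start is rounded down to block 96 and count grows by
	 * 4 so the partial cluster is covered in full.
	 */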
6212 overflow = EXT4_PBLK_COFF(sbi, block);
6213 if (overflow) {
6214 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
6215 overflow = sbi->s_cluster_ratio - overflow;
6216 block += overflow;
6217 if (count > overflow)
6218 count -= overflow;
6219 else
6220 return;
6221 } else {
6222 block -= overflow;
6223 count += overflow;
6224 }
6225 /* The range changed so it's no longer validated */
6226 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6227 }
6228 overflow = EXT4_LBLK_COFF(sbi, count);
6229 if (overflow) {
6230 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
6231 if (count > overflow)
6232 count -= overflow;
6233 else
6234 return;
6235 } else
6236 count += sbi->s_cluster_ratio - overflow;
6237 /* The range changed so it's no longer validated */
6238 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6239 }
6240
6241 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
6242 int i;
6243 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
6244
6245 for (i = 0; i < count; i++) {
6246 cond_resched();
6247 if (is_metadata)
6248 bh = sb_find_get_block(inode->i_sb, block + i);
6249 ext4_forget(handle, is_metadata, inode, bh, block + i);
6250 }
6251 }
6252
6253 ext4_mb_clear_bb(handle, inode, block, count, flags);
6254 return;
6255 }
6256
6257 /**
6258 * ext4_group_add_blocks() -- Add given blocks to an existing group
6259 * @handle: handle to this transaction
6260 * @sb: super block
6261 * @block: start physical block to add to the block group
6262 * @count: number of blocks to free
6263 *
6264 * This marks the blocks as free in the bitmap and buddy.
6265 */
6266 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
6267 ext4_fsblk_t block, unsigned long count)
6268 {
6269 struct buffer_head *bitmap_bh = NULL;
6270 struct buffer_head *gd_bh;
6271 ext4_group_t block_group;
6272 ext4_grpblk_t bit;
6273 unsigned int i;
6274 struct ext4_group_desc *desc;
6275 struct ext4_sb_info *sbi = EXT4_SB(sb);
6276 struct ext4_buddy e4b;
6277 int err = 0, ret, free_clusters_count;
6278 ext4_grpblk_t clusters_freed;
6279 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
6280 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
6281 unsigned long cluster_count = last_cluster - first_cluster + 1;
6282
6283 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
6284
6285 if (count == 0)
6286 return 0;
6287
6288 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
6289 /*
6290 * Check to see if we are freeing blocks across a group
6291 * boundary.
6292 */
6293 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
6294 ext4_warning(sb, "too many blocks added to group %u",
6295 block_group);
6296 err = -EINVAL;
6297 goto error_return;
6298 }
6299
6300 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
6301 if (IS_ERR(bitmap_bh)) {
6302 err = PTR_ERR(bitmap_bh);
6303 bitmap_bh = NULL;
6304 goto error_return;
6305 }
6306
6307 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
6308 if (!desc) {
6309 err = -EIO;
6310 goto error_return;
6311 }
6312
6313 if (!ext4_sb_block_valid(sb, NULL, block, count)) {
6314 ext4_error(sb, "Adding blocks in system zones - "
6315 "Block = %llu, count = %lu",
6316 block, count);
6317 err = -EINVAL;
6318 goto error_return;
6319 }
6320
6321 BUFFER_TRACE(bitmap_bh, "getting write access");
6322 err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
6323 EXT4_JTR_NONE);
6324 if (err)
6325 goto error_return;
6326
6327 /*
6328 * We are about to modify some metadata. Call the journal APIs
6329 * to unshare ->b_data if a currently-committing transaction is
6330 * using it
6331 */
6332 BUFFER_TRACE(gd_bh, "get_write_access");
6333 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
6334 if (err)
6335 goto error_return;
6336
6337 for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
6338 BUFFER_TRACE(bitmap_bh, "clear bit");
6339 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
6340 ext4_error(sb, "bit already cleared for block %llu",
6341 (ext4_fsblk_t)(block + i));
6342 BUFFER_TRACE(bitmap_bh, "bit already cleared");
6343 } else {
6344 clusters_freed++;
6345 }
6346 }
6347
6348 err = ext4_mb_load_buddy(sb, block_group, &e4b);
6349 if (err)
6350 goto error_return;
6351
6352 /*
6353 * need to update group_info->bb_free and bitmap
6354 * with group lock held. generate_buddy looks at
6355 * them with the group lock held
6356 */
6357 ext4_lock_group(sb, block_group);
6358 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
6359 mb_free_blocks(NULL, &e4b, bit, cluster_count);
6360 free_clusters_count = clusters_freed +
6361 ext4_free_group_clusters(sb, desc);
6362 ext4_free_group_clusters_set(sb, desc, free_clusters_count);
6363 ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
6364 ext4_group_desc_csum_set(sb, block_group, desc);
6365 ext4_unlock_group(sb, block_group);
6366 percpu_counter_add(&sbi->s_freeclusters_counter,
6367 clusters_freed);
6368
6369 if (sbi->s_log_groups_per_flex) {
6370 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6371 atomic64_add(clusters_freed,
6372 &sbi_array_rcu_deref(sbi, s_flex_groups,
6373 flex_group)->free_clusters);
6374 }
6375
6376 ext4_mb_unload_buddy(&e4b);
6377
6378 /* We dirtied the bitmap block */
6379 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6380 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6381
6382 /* And the group descriptor block */
6383 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6384 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6385 if (!err)
6386 err = ret;
6387
6388 error_return:
6389 brelse(bitmap_bh);
6390 ext4_std_error(sb, err);
6391 return err;
6392 }
6393
6394 /**
6395 * ext4_trim_extent -- function to TRIM one single free extent in the group
6396 * @sb: super block for the file system
6397 * @start: starting block of the free extent in the alloc. group
6398 * @count: number of blocks to TRIM
6399 * @e4b: ext4 buddy for the group
6400 *
6401 * Trim "count" blocks starting at "start" in the "group". To assure that no
6402 * one will allocate those blocks, mark them as used in the buddy bitmap. This
6403 * must be called under the group lock.
6404 */
6405 static int ext4_trim_extent(struct super_block *sb,
6406 int start, int count, struct ext4_buddy *e4b)
6407 __releases(bitlock)
6408 __acquires(bitlock)
6409 {
6410 struct ext4_free_extent ex;
6411 ext4_group_t group = e4b->bd_group;
6412 int ret = 0;
6413
6414 trace_ext4_trim_extent(sb, group, start, count);
6415
6416 assert_spin_locked(ext4_group_lock_ptr(sb, group));
6417
6418 ex.fe_start = start;
6419 ex.fe_group = group;
6420 ex.fe_len = count;
6421
6422 /*
6423 * Mark blocks used, so no one can reuse them while
6424 * being trimmed.
6425 */
6426 mb_mark_used(e4b, &ex);
6427 ext4_unlock_group(sb, group);
6428 ret = ext4_issue_discard(sb, group, start, count, NULL);
6429 ext4_lock_group(sb, group);
6430 mb_free_blocks(NULL, e4b, start, ex.fe_len);
6431 return ret;
6432 }
6433
6434 static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
6435 ext4_group_t grp)
6436 {
6437 unsigned long nr_clusters_in_group;
6438
6439 if (grp < (ext4_get_groups_count(sb) - 1))
6440 nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
6441 else
6442 nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
6443 ext4_group_first_block_no(sb, grp))
6444 >> EXT4_CLUSTER_BITS(sb);
6445
6446 return nr_clusters_in_group - 1;
6447 }
6448
6449 static bool ext4_trim_interrupted(void)
6450 {
6451 return fatal_signal_pending(current) || freezing(current);
6452 }
6453
6454 static int ext4_try_to_trim_range(struct super_block *sb,
6455 struct ext4_buddy *e4b, ext4_grpblk_t start,
6456 ext4_grpblk_t max, ext4_grpblk_t minblocks)
6457 __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
6458 __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
6459 {
6460 ext4_grpblk_t next, count, free_count, last, origin_start;
6461 bool set_trimmed = false;
6462 void *bitmap;
6463
6464 last = ext4_last_grp_cluster(sb, e4b->bd_group);
6465 bitmap = e4b->bd_bitmap;
6466 if (start == 0 && max >= last)
6467 set_trimmed = true;
6468 origin_start = start;
6469 start = max(e4b->bd_info->bb_first_free, start);
6470 count = 0;
6471 free_count = 0;
6472
6473 while (start <= max) {
6474 start = mb_find_next_zero_bit(bitmap, max + 1, start);
6475 if (start > max)
6476 break;
6477
6478 next = mb_find_next_bit(bitmap, last + 1, start);
6479 if (origin_start == 0 && next >= last)
6480 set_trimmed = true;
6481
6482 if ((next - start) >= minblocks) {
6483 int ret = ext4_trim_extent(sb, start, next - start, e4b);
6484
6485 if (ret && ret != -EOPNOTSUPP)
6486 return count;
6487 count += next - start;
6488 }
6489 free_count += next - start;
6490 start = next + 1;
6491
6492 if (ext4_trim_interrupted())
6493 return count;
6494
6495 if (need_resched()) {
6496 ext4_unlock_group(sb, e4b->bd_group);
6497 cond_resched();
6498 ext4_lock_group(sb, e4b->bd_group);
6499 }
6500
6501 if ((e4b->bd_info->bb_free - free_count) < minblocks)
6502 break;
6503 }
6504
6505 if (set_trimmed)
6506 EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
6507
6508 return count;
6509 }
6510
6511 /**
6512 * ext4_trim_all_free -- function to trim all free space in alloc. group
6513 * @sb: super block for file system
6514 * @group: group to be trimmed
6515 * @start: first group block to examine
6516 * @max: last group block to examine
6517 * @minblocks: minimum extent block count
6518 *
6519 * ext4_trim_all_free walks through group's block bitmap searching for free
6520 * extents. When the free extent is found, mark it as used in group buddy
6521 * bitmap. Then issue a TRIM command on this extent and free the extent in
6522 * the group buddy bitmap.
6523 */
6524 static ext4_grpblk_t
6525 ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
6526 ext4_grpblk_t start, ext4_grpblk_t max,
6527 ext4_grpblk_t minblocks)
6528 {
6529 struct ext4_buddy e4b;
6530 int ret;
6531
6532 trace_ext4_trim_all_free(sb, group, start, max);
6533
6534 ret = ext4_mb_load_buddy(sb, group, &e4b);
6535 if (ret) {
6536 ext4_warning(sb, "Error %d loading buddy information for %u",
6537 ret, group);
6538 return ret;
6539 }
6540
6541 ext4_lock_group(sb, group);
6542
6543 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
6544 minblocks < EXT4_SB(sb)->s_last_trim_minblks)
6545 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
6546 else
6547 ret = 0;
6548
6549 ext4_unlock_group(sb, group);
6550 ext4_mb_unload_buddy(&e4b);
6551
6552 ext4_debug("trimmed %d blocks in the group %d\n",
6553 ret, group);
6554
6555 return ret;
6556 }
6557
6558 /**
6559 * ext4_trim_fs() -- trim ioctl handle function
6560 * @sb: superblock for filesystem
6561 * @range: fstrim_range structure
6562 *
6563 * start: First Byte to trim
6564 * len: number of Bytes to trim from start
6565 * minlen: minimum extent length in Bytes
6566 * ext4_trim_fs goes through all allocation groups containing Bytes from
6567 * start to start+len. For each such group the ext4_trim_all_free function
6568 * is invoked to trim all free space.
6569 */
6570 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
6571 {
6572 struct request_queue *q = bdev_get_queue(sb->s_bdev);
6573 struct ext4_group_info *grp;
6574 ext4_group_t group, first_group, last_group;
6575 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
6576 uint64_t start, end, minlen, trimmed = 0;
6577 ext4_fsblk_t first_data_blk =
6578 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
6579 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
6580 int ret = 0;
6581
6582 start = range->start >> sb->s_blocksize_bits;
6583 end = start + (range->len >> sb->s_blocksize_bits) - 1;
6584 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
6585 range->minlen >> sb->s_blocksize_bits);
6586
6587 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
6588 start >= max_blks ||
6589 range->len < sb->s_blocksize)
6590 return -EINVAL;
6591 /* No point in trying to trim less than the discard granularity */
6592 if (range->minlen < q->limits.discard_granularity) {
6593 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
6594 q->limits.discard_granularity >> sb->s_blocksize_bits);
6595 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
6596 goto out;
6597 }
6598 if (end >= max_blks - 1)
6599 end = max_blks - 1;
6600 if (end <= first_data_blk)
6601 goto out;
6602 if (start < first_data_blk)
6603 start = first_data_blk;
6604
6605 /* Determine first and last group to examine based on start and end */
6606 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
6607 &first_group, &first_cluster);
6608 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
6609 &last_group, &last_cluster);
6610
6611 /* end now represents the last cluster to discard in this group */
6612 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6613
6614 for (group = first_group; group <= last_group; group++) {
6615 if (ext4_trim_interrupted())
6616 break;
6617 grp = ext4_get_group_info(sb, group);
6618 if (!grp)
6619 continue;
6620 /* We only do this if the grp has never been initialized */
6621 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
6622 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
6623 if (ret)
6624 break;
6625 }
6626
6627 /*
6628 * For all the groups except the last one, last cluster will
6629 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
6630 * change it for the last group, note that last_cluster is
6631 * already computed earlier by ext4_get_group_no_and_offset()
6632 */
6633 if (group == last_group)
6634 end = last_cluster;
6635 if (grp->bb_free >= minlen) {
6636 cnt = ext4_trim_all_free(sb, group, first_cluster,
6637 end, minlen);
6638 if (cnt < 0) {
6639 ret = cnt;
6640 break;
6641 }
6642 trimmed += cnt;
6643 }
6644
6645 /*
6646 * For every group except the first one, we are sure
6647 * that the first cluster to discard will be cluster #0.
6648 */
6649 first_cluster = 0;
6650 }
6651
6652 if (!ret)
6653 EXT4_SB(sb)->s_last_trim_minblks = minlen;
6654
6655 out:
6656 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
6657 return ret;
6658 }
6659
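/*
 * Illustrative userspace sketch (not part of this file): ext4_trim_fs() is
 * reached through the FITRIM ioctl, with start/len/minlen given in bytes and
 * range.len updated to the number of bytes trimmed, as noted above.  The
 * mount point path below is just an example.
 */
#if 0
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/ext4", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = UINT64_MAX;		/* whole filesystem */
	range.minlen = 0;		/* raised to the discard granularity */
	if (ioctl(fd, FITRIM, &range)) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
#endif
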
6660 /* Iterate all the free extents in the group. */
6661 int
6662 ext4_mballoc_query_range(
6663 struct super_block *sb,
6664 ext4_group_t group,
6665 ext4_grpblk_t start,
6666 ext4_grpblk_t end,
6667 ext4_mballoc_query_range_fn formatter,
6668 void *priv)
6669 {
6670 void *bitmap;
6671 ext4_grpblk_t next;
6672 struct ext4_buddy e4b;
6673 int error;
6674
6675 error = ext4_mb_load_buddy(sb, group, &e4b);
6676 if (error)
6677 return error;
6678 bitmap = e4b.bd_bitmap;
6679
6680 ext4_lock_group(sb, group);
6681
6682 start = max(e4b.bd_info->bb_first_free, start);
6683 if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
6684 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6685
6686 while (start <= end) {
6687 start = mb_find_next_zero_bit(bitmap, end + 1, start);
6688 if (start > end)
6689 break;
6690 next = mb_find_next_bit(bitmap, end + 1, start);
6691
6692 ext4_unlock_group(sb, group);
6693 error = formatter(sb, group, start, next - start, priv);
6694 if (error)
6695 goto out_unload;
6696 ext4_lock_group(sb, group);
6697
6698 start = next + 1;
6699 }
6700
6701 ext4_unlock_group(sb, group);
6702 out_unload:
6703 ext4_mb_unload_buddy(&e4b);
6704
6705 return error;
6706 }
6707